From 309c10c203d53ecb7d30aaf4143afbac41392579 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Thu, 21 May 2026 17:21:19 +0200 Subject: [PATCH] feat(audit): P72 MC-Scope-Filter + P73 MC-Solution-Generator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit P72 — rag_document_checker LEFT JOINs canonical_controls.scope_doc_type. _filter_by_canonical_scope wirft MCs raus, deren scope explizit auf einen inkompatiblen Doc-Type zeigt (Mapping in _SCOPE_COMPATIBLE). Konservativ: 'other'/NULL/'process' bleiben drin — Heuristik v1 ist noch nicht stark genug fuer hartes Filtern. Erwartete Wirkung: ~10-15% weniger irrelevante MCs pro Doc, weil z.B. ein TOM-MC nicht mehr als DSE-Finding auftaucht. P73 — mc_solution_generator.py: Qwen->OVH Cascade generiert pro HIGH/CRITICAL-Fail eine konkrete Einfuege-Empfehlung mit Anchor (wo + was) und Aufwand-Schaetzung. JSON-Schema {solution_text, anchor_hint, effort_min}. In-process Cache (max. 500 Eintraege, FIFO-Verdraengung der 50 aeltesten) per (mc_id, doc_type, doc_md5). Max 3 Solutions pro Doc-Type, global Cap 8 — haelt Latenz < 60s. Bloecke werden im Mail-Render direkt vor dem VVT-Block als 'Loesungs-Vorschlaege (KI-generiert)' eingehaengt. Disclaimer: keine Rechtsberatung, mit DSB pruefen. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../api/agent_compliance_check_routes.py | 40 ++- .../services/mc_solution_generator.py | 257 ++++++++++++++++++ .../services/rag_document_checker.py | 77 +++++- 3 files changed, 366 insertions(+), 8 deletions(-) create mode 100644 backend-compliance/compliance/services/mc_solution_generator.py diff --git a/backend-compliance/compliance/api/agent_compliance_check_routes.py b/backend-compliance/compliance/api/agent_compliance_check_routes.py index 03e2f306..b1e7891f 100644 --- a/backend-compliance/compliance/api/agent_compliance_check_routes.py +++ b/backend-compliance/compliance/api/agent_compliance_check_routes.py @@ -973,14 +973,22 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest): from compliance.services.mc_scorecard import build_scorecard from .agent_doc_check_scorecard import build_scorecard_html all_mc_checks: list[dict] = [] + # P73: pro-doc Fails sammeln um Solution-Generator pro Doc-Type + # mit dem korrekten doc_text aufzurufen. + fails_by_doc: dict[str, list[dict]] = {} for r in results: for c in r.checks: if c.id.startswith("mc-"): - all_mc_checks.append({ + rec = { "id": c.id, "label": c.label, "passed": c.passed, "severity": c.severity, "skipped": c.skipped, "regulation": c.regulation, - }) + "hint": getattr(c, "hint", "") or "", + } + all_mc_checks.append(rec) + if (not c.passed and not c.skipped + and (c.severity or "").upper() in ("CRITICAL", "HIGH")): + fails_by_doc.setdefault(r.doc_type, []).append(rec) scorecard = build_scorecard(all_mc_checks) if all_mc_checks else {} # Trend: load previous scorecard for the same tenant + domain so the # email can show delta indicators (A6). @@ -1168,6 +1176,32 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest): except Exception as e: logger.warning("P92/P94 consistency-check failed: %s", e) + # P73: MC-Solution-Generator — LLM-Vorschlaege pro HIGH-Fail. 
+ # Max 3 Solutions pro Doc-Type um Latenz < 60s zu halten. + solutions_html = "" + try: + from compliance.services.mc_solution_generator import ( + generate_solutions_for_fails, build_solutions_block_html, + ) + all_solutions: list[dict] = [] + for dt, fails in fails_by_doc.items(): + if not fails: + continue + doc_txt = doc_texts.get(dt) or doc_texts.get("dse") or "" + if not doc_txt or len(doc_txt) < 500: + continue + sols = await generate_solutions_for_fails( + fails, doc_txt, dt, limit=3, + ) + all_solutions.extend(sols) + if len(all_solutions) >= 8: + break # global cap + if all_solutions: + solutions_html = build_solutions_block_html(all_solutions[:8]) + logger.info("P73: %d MC-Solutions generiert", len(all_solutions)) + except Exception as e: + logger.warning("P73 MC-Solution-Generator skipped: %s", e) + # P82: GF-1-Pager ganz oben in der Mail — 5-Bullet-Zusammenfassung # damit die GF nicht 124k Char lesen muss. gf_one_pager_html = "" @@ -1232,7 +1266,7 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest): + cookie_arch_html + summary_html + scanned_html + profile_html + scorecard_html + redundancy_html + providers_html + banner_deep_html + library_mismatch_html - + consistency_html + signals_html + + consistency_html + signals_html + solutions_html + vvt_html + report_html ) diff --git a/backend-compliance/compliance/services/mc_solution_generator.py b/backend-compliance/compliance/services/mc_solution_generator.py new file mode 100644 index 00000000..c8069438 --- /dev/null +++ b/backend-compliance/compliance/services/mc_solution_generator.py @@ -0,0 +1,257 @@ +""" +P73 — MC-Solution-Generator. + +Generiert pro Fail-MC eine konkrete Einfuege-Empfehlung mit Anchor: +"Bitte ergaenzen Sie nach Abschnitt 'Kontaktdaten DSB' folgenden +Absatz: ...". LLM-Cascade Qwen (lokal) -> OVH 120B. + +Cache: in-process LRU per (mc_id, doc_md5) damit Re-Runs derselben +Site denselben Vorschlag liefern. Voller DB-Cache kommt spaeter (P31). 
+ +Integration: wird im build_critical_findings_html / mc-detail-rendering +unter jedem HIGH-Fail als eingeklappbarer Block angezeigt. +""" + +from __future__ import annotations + +import hashlib +import json +import logging +import os +from functools import lru_cache +from typing import Iterable + +import httpx + +logger = logging.getLogger(__name__) + + +_SYSTEM_PROMPT = ( + "Du bist Datenschutz-Redakteur. Du formulierst kurze, einfueg-bereite " + "Absaetze fuer Datenschutz-Dokumente — sachlich, in deutscher " + "Rechtssprache, ohne Marketing-Floskeln.\n\n" + "Du bekommst:\n" + "- den FAIL-MC (was geprueft wurde, warum es nicht erfuellt ist)\n" + "- einen Auszug aus dem Ist-Dokument\n" + "- den Dokument-Typ\n\n" + "Du lieferst JSON:\n" + '{\n' + ' "solution_text": "<3-6 Saetze Vorschlags-Absatz fuer das Dokument>",\n' + ' "anchor_hint": "",\n' + ' "effort_min": ""\n' + '}\n\n' + "Regeln:\n" + "- KEINE Normtexte 1:1 zitieren — eigene Formulierung + Norm-Referenz.\n" + "- KEINE Annahmen ueber Konkretes (z.B. Firmennamen, Adressen) — " + "Platzhalter [Ihr Firmenname] / [Ihre Adresse] verwenden.\n" + "- Wenn schon eine schwache Variante im Dokument steht, anchor_hint " + "auf 'ersetzen' setzen statt einfuegen.\n" + "- Nur reines JSON, keine Prosa, keine Code-Fences." 
+) + + +def _doc_hash(doc_text: str) -> str: + return hashlib.md5(doc_text.encode("utf-8")).hexdigest()[:12] + + +_CACHE: dict[str, dict] = {} +_CACHE_MAX = 500 + + +def _cache_get(key: str) -> dict | None: + return _CACHE.get(key) + + +def _cache_put(key: str, val: dict) -> None: + if len(_CACHE) >= _CACHE_MAX: + # Drop oldest 50 entries + for k in list(_CACHE.keys())[:50]: + _CACHE.pop(k, None) + _CACHE[key] = val + + +async def _call_ollama(prompt: str) -> str: + base = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434") + model = os.getenv("MC_SOLUTION_MODEL", + os.getenv("CMP_LLM_MODEL", "qwen3:30b-a3b")) + payload = { + "model": model, "stream": False, "format": "json", + "messages": [ + {"role": "system", "content": _SYSTEM_PROMPT}, + {"role": "user", "content": prompt}, + ], + "options": {"temperature": 0.1, "num_predict": 600}, + } + try: + async with httpx.AsyncClient(timeout=90.0) as client: + resp = await client.post(f"{base.rstrip('/')}/api/chat", json=payload) + resp.raise_for_status() + return (resp.json().get("message") or {}).get("content", "") + except Exception as e: + logger.warning("Qwen MC-solution failed: %s", e) + return "" + + +async def _call_ovh(prompt: str) -> str: + base = os.getenv("OVH_LLM_URL", "").strip() + key = os.getenv("OVH_LLM_KEY", "").strip() + model = os.getenv("OVH_LLM_MODEL", "").strip() + if not base or not model: + return "" + headers = {"Content-Type": "application/json"} + if key: + headers["Authorization"] = f"Bearer {key}" + payload = { + "model": model, "temperature": 0.1, "max_tokens": 600, + "messages": [ + {"role": "system", "content": _SYSTEM_PROMPT}, + {"role": "user", "content": prompt}, + ], + "response_format": {"type": "json_object"}, + } + try: + async with httpx.AsyncClient(timeout=45.0) as client: + resp = await client.post( + f"{base.rstrip('/')}/v1/chat/completions", + json=payload, headers=headers, + ) + resp.raise_for_status() + choice = (resp.json().get("choices") or [{}])[0] + return 
(choice.get("message") or {}).get("content", "") or "" + except Exception as e: + logger.warning("OVH MC-solution failed: %s", e) + return "" + + +def _parse(content: str) -> dict | None: + if not content: + return None + txt = content.strip() + if txt.startswith("```"): + txt = "\n".join(txt.split("\n")[1:-1]) + a, b = txt.find("{"), txt.rfind("}") + if 0 <= a < b: + try: + obj = json.loads(txt[a:b + 1]) + if isinstance(obj, dict) and obj.get("solution_text"): + return { + "solution_text": str(obj["solution_text"])[:1200], + "anchor_hint": str(obj.get("anchor_hint", ""))[:200], + "effort_min": str(obj.get("effort_min", "mittel"))[:20], + } + except Exception: + return None + return None + + +async def generate_solution( + mc: dict, + doc_text: str, + doc_type: str, +) -> dict | None: + """Generates a solution dict for a single FAIL-MC. + + mc must contain: label, hint, severity. Returns + {solution_text, anchor_hint, effort_min} or None. + """ + if not mc or not doc_text: + return None + mc_id = str(mc.get("id") or mc.get("label", ""))[:80] + cache_key = f"{mc_id}:{doc_type}:{_doc_hash(doc_text)}" + cached = _cache_get(cache_key) + if cached: + return cached + + excerpt = doc_text[:3500] + prompt = ( + f"FAIL-MC: {mc.get('label', '')}\n" + f"Severity: {mc.get('severity', 'MEDIUM')}\n" + f"Aktueller Hint: {mc.get('hint', '')[:300]}\n\n" + f"Dokument-Typ: {doc_type}\n" + f"Dokument-Auszug:\n---\n{excerpt}\n---\n\n" + "Liefere die Loesung als JSON." + ) + + content = await _call_ollama(prompt) + parsed = _parse(content) + if not parsed: + content = await _call_ovh(prompt) + parsed = _parse(content) + if parsed: + _cache_put(cache_key, parsed) + return parsed + + +async def generate_solutions_for_fails( + failed_mcs: Iterable[dict], + doc_text: str, + doc_type: str, + limit: int = 5, +) -> list[dict]: + """Returns a list of {mc_label, severity, solution_text, anchor_hint, + effort_min} for the top-N HIGH/CRITICAL fails. 
Skips MEDIUM/LOW + to keep latency bounded.""" + sev_order = {"CRITICAL": 0, "HIGH": 1, "MEDIUM": 2, "LOW": 3} + high_fails = [m for m in (failed_mcs or []) + if (m.get("severity") or "").upper() in ("CRITICAL", "HIGH")] + high_fails.sort(key=lambda m: sev_order.get( + (m.get("severity") or "").upper(), 3)) + high_fails = high_fails[:limit] + + out: list[dict] = [] + for mc in high_fails: + sol = await generate_solution(mc, doc_text, doc_type) + if not sol: + continue + out.append({ + "mc_label": mc.get("label", "")[:200], + "severity": mc.get("severity", "MEDIUM"), + "solution_text": sol["solution_text"], + "anchor_hint": sol["anchor_hint"], + "effort_min": sol["effort_min"], + }) + return out + + +def build_solutions_block_html(solutions: list[dict]) -> str: + """Renders the LLM-generated solutions as a Mail-Block.""" + if not solutions: + return "" + items: list[str] = [] + for s in solutions: + sev_color = "#dc2626" if s["severity"].upper() == "CRITICAL" else "#d97706" + items.append( + f'
  • ' + f'
    ' + f'[{s["severity"]}] {s["mc_label"]}
    ' + f'
    {s["solution_text"]}
    ' + f'
    ' + f'Anchor: {s["anchor_hint"] or "—"} ' + f' ·  Aufwand: {s["effort_min"]}' + f'
  • ' + ) + return ( + '
    ' + '
    ' + 'Loesungs-Vorschlaege (KI-generiert)
    ' + f'

    ' + f'{len(solutions)} konkrete Einfuege-Empfehlung' + f'{"en" if len(solutions) != 1 else ""} ' + 'fuer die kritischen Findings

    ' + '

    ' + 'Folgende Absaetze koennen Sie direkt uebernehmen — Platzhalter ' + '[Ihr Firmenname] / [Ihre Adresse] sind zu ersetzen. Inhaltliche ' + 'Korrektheit ist mit DSB / Rechtsabteilung zu pruefen.

    ' + '
      ' + + "".join(items) + + '
    ' + '

    Generiert via Qwen3-30b lokal (Fallback: ' + 'OVH 120B). Vorschlaege sind kein Rechts-Beratung.

    ' + '
    ' + ) diff --git a/backend-compliance/compliance/services/rag_document_checker.py b/backend-compliance/compliance/services/rag_document_checker.py index 168e5376..4a876841 100644 --- a/backend-compliance/compliance/services/rag_document_checker.py +++ b/backend-compliance/compliance/services/rag_document_checker.py @@ -293,6 +293,59 @@ _MC_ALIAS_FALLBACK = { } +# P72 — kompatible scope_doc_type-Werte pro operativem doc_type. +# 'other' / NULL / 'process' bleiben immer drin (Backfill ist Heuristik v1 +# und nicht stark genug fuer hartes Filtern). +_SCOPE_COMPATIBLE: dict[str, set[str]] = { + "dse": {"dse", "jc", "process", "tom", "accounting"}, + "cookie": {"cookie_richtlinie", "banner_implementation", + "cmp_audit", "dse"}, + "cookie_policy": {"cookie_richtlinie", "banner_implementation", + "cmp_audit", "dse"}, + "impressum": {"impressum", "agb"}, + "agb": {"agb", "widerruf", "impressum"}, + "nutzungsbedingungen": {"agb", "widerruf", "impressum"}, + "widerruf": {"widerruf", "agb"}, + "avv": {"avv", "tom", "jc", "process"}, + "tom": {"tom", "avv", "process"}, + "loeschkonzept": {"process", "dse", "accounting"}, + "dsfa": {"process", "tom", "dse"}, + "social_media": {"jc", "dse"}, + "dsa": {"dse", "impressum"}, + "legal_notice": {"impressum", "agb"}, + "lizenzhinweise": {"agb", "impressum"}, +} +_PERMISSIVE_SCOPES = {"other", "process", None, "", "null"} + + +def _filter_by_canonical_scope( + controls: list[dict], + doc_type: str, +) -> list[dict]: + """P72 — wirft MCs raus, deren canonical scope_doc_type explizit auf + einen INKOMPATIBLEN Doc-Type zeigt. 'other'/NULL/'process' bleiben + drin (Backfill v1 noch zu unsicher). 
+ """ + compatible = _SCOPE_COMPATIBLE.get(doc_type) + if not compatible: + return controls + kept: list[dict] = [] + dropped = 0 + for c in controls: + scope = c.get("canonical_scope") + scope_norm = (scope or "").strip().lower() or None + if scope_norm in _PERMISSIVE_SCOPES or scope_norm in compatible: + kept.append(c) + else: + dropped += 1 + if dropped: + logger.info( + "P72 scope-filter: %d/%d MCs out-of-scope fuer doc_type=%s", + dropped, len(controls), doc_type, + ) + return kept + + def _load_text_only_ids( doc_type: str | None = None, business_scope: set[str] | None = None, @@ -372,11 +425,19 @@ async def _load_controls(doc_type: str, db_url: str, limit: int, return [] try: - query = """SELECT id, control_id, title, regulation, article, - check_question, pass_criteria, fail_criteria, severity - FROM compliance.doc_check_controls - WHERE doc_type = $1 - ORDER BY severity DESC, title""" + # P72: LEFT JOIN canonical_controls.scope_doc_type um scope-Info + # mitzuziehen. Wenn ein MC explizit fuer einen anderen Doc-Type + # klassifiziert ist (z.B. 'tom' statt 'dse'), wird er unten + # gefiltert. 'other' / NULL bleiben drin (Backfill noch nicht stark). + query = """SELECT dc.id, dc.control_id, dc.title, dc.regulation, + dc.article, dc.check_question, dc.pass_criteria, + dc.fail_criteria, dc.severity, + cc.scope_doc_type AS canonical_scope + FROM compliance.doc_check_controls dc + LEFT JOIN compliance.canonical_controls cc + ON cc.id = dc.control_uuid + WHERE dc.doc_type = $1 + ORDER BY dc.severity DESC, dc.title""" if limit > 0: query += f" LIMIT {limit}" @@ -387,6 +448,12 @@ async def _load_controls(doc_type: str, db_url: str, limit: int, rows = await conn.fetch(query, fallback) controls = [dict(r) for r in rows] + + # P72: Scope-Filter — werfe MCs raus, deren canonical scope_doc_type + # explizit auf einen anderen Doc-Type zeigt. Konservativ: + # other/NULL/process bleiben drin (zu unsichere Klassifikation). 
+ controls = _filter_by_canonical_scope(controls, doc_type) + text_only = _load_text_only_ids(doc_type, business_scope) if text_only: before = len(controls)