From 309c10c203d53ecb7d30aaf4143afbac41392579 Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBook-Pro.local>
Date: Thu, 21 May 2026 17:21:19 +0200
Subject: [PATCH] feat(audit): P72 MC-Scope-Filter + P73 MC-Solution-Generator
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

P72 — rag_document_checker LEFT JOINs canonical_controls.scope_doc_type.
_filter_by_canonical_scope wirft MCs raus deren scope explizit auf
einen inkompatiblen Doc-Type zeigt (Mapping in _SCOPE_COMPATIBLE).
Konservativ: 'other'/NULL/'process' bleiben drin — Heuristik v1 ist
noch nicht stark genug fuer hartes Filtern.

Erwartete Wirkung: ~10-15% weniger irrelevante MCs pro Doc, weil z.B.
ein TOM-MC nicht mehr als DSE-Finding auftaucht.

P73 — mc_solution_generator.py: Qwen->OVH Cascade generiert pro HIGH/
CRITICAL-Fail eine konkrete Einfuege-Empfehlung mit Anchor (wo + was)
und Aufwand-Schaetzung. JSON-Schema {solution_text, anchor_hint,
effort_min}. In-process Cache (max. 500 Eintraege, aelteste werden zuerst
verdraengt) per (mc_id, doc_type, doc_md5).

Max 3 Solutions pro Doc-Type, global Cap 8 — haelt Latenz < 60s. Bloecke
werden im Mail-Render direkt vor dem VVT-Block als 'Loesungs-Vorschlaege
(KI-generiert)' eingehaengt. Disclaimer: keine Rechtsberatung, mit DSB pruefen.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../api/agent_compliance_check_routes.py      |  40 ++-
 .../services/mc_solution_generator.py         | 257 ++++++++++++++++++
 .../services/rag_document_checker.py          |  77 +++++-
 3 files changed, 366 insertions(+), 8 deletions(-)
 create mode 100644 backend-compliance/compliance/services/mc_solution_generator.py

diff --git a/backend-compliance/compliance/api/agent_compliance_check_routes.py b/backend-compliance/compliance/api/agent_compliance_check_routes.py
index 03e2f306..b1e7891f 100644
--- a/backend-compliance/compliance/api/agent_compliance_check_routes.py
+++ b/backend-compliance/compliance/api/agent_compliance_check_routes.py
@@ -973,14 +973,22 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
         from compliance.services.mc_scorecard import build_scorecard
         from .agent_doc_check_scorecard import build_scorecard_html
         all_mc_checks: list[dict] = []
+        # P73: pro-doc Fails sammeln um Solution-Generator pro Doc-Type
+        # mit dem korrekten doc_text aufzurufen.
+        fails_by_doc: dict[str, list[dict]] = {}
         for r in results:
             for c in r.checks:
                 if c.id.startswith("mc-"):
-                    all_mc_checks.append({
+                    rec = {
                         "id": c.id, "label": c.label, "passed": c.passed,
                         "severity": c.severity, "skipped": c.skipped,
                         "regulation": c.regulation,
-                    })
+                        "hint": getattr(c, "hint", "") or "",
+                    }
+                    all_mc_checks.append(rec)
+                    if (not c.passed and not c.skipped
+                            and (c.severity or "").upper() in ("CRITICAL", "HIGH")):
+                        fails_by_doc.setdefault(r.doc_type, []).append(rec)
         scorecard = build_scorecard(all_mc_checks) if all_mc_checks else {}
         # Trend: load previous scorecard for the same tenant + domain so the
         # email can show delta indicators (A6).
@@ -1168,6 +1176,32 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
         except Exception as e:
             logger.warning("P92/P94 consistency-check failed: %s", e)
 
+        # P73: MC-Solution-Generator — LLM-Vorschlaege pro HIGH-Fail.
+        # Max 3 Solutions pro Doc-Type (global Cap 8) um Latenz < 60s zu halten.
+        solutions_html = ""
+        try:
+            from compliance.services.mc_solution_generator import (
+                generate_solutions_for_fails, build_solutions_block_html,
+            )
+            all_solutions: list[dict] = []
+            for dt, fails in fails_by_doc.items():
+                if not fails:
+                    continue
+                doc_txt = doc_texts.get(dt) or doc_texts.get("dse") or ""
+                if not doc_txt or len(doc_txt) < 500:
+                    continue
+                sols = await generate_solutions_for_fails(
+                    fails, doc_txt, dt, limit=3,
+                )
+                all_solutions.extend(sols)
+                if len(all_solutions) >= 8:
+                    break  # global cap
+            if all_solutions:
+                solutions_html = build_solutions_block_html(all_solutions[:8])
+                logger.info("P73: %d MC-Solutions generiert", len(all_solutions))
+        except Exception as e:
+            logger.warning("P73 MC-Solution-Generator skipped: %s", e)
+
         # P82: GF-1-Pager ganz oben in der Mail — 5-Bullet-Zusammenfassung
         # damit die GF nicht 124k Char lesen muss.
         gf_one_pager_html = ""
@@ -1232,7 +1266,7 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
             + cookie_arch_html + summary_html + scanned_html + profile_html
             + scorecard_html + redundancy_html
             + providers_html + banner_deep_html + library_mismatch_html
-            + consistency_html + signals_html
+            + consistency_html + signals_html + solutions_html
             + vvt_html + report_html
         )
 
diff --git a/backend-compliance/compliance/services/mc_solution_generator.py b/backend-compliance/compliance/services/mc_solution_generator.py
new file mode 100644
index 00000000..c8069438
--- /dev/null
+++ b/backend-compliance/compliance/services/mc_solution_generator.py
@@ -0,0 +1,257 @@
+"""
+P73 — MC-Solution-Generator.
+
+Generiert pro Fail-MC eine konkrete Einfuege-Empfehlung mit Anchor:
+"Bitte ergaenzen Sie nach Abschnitt 'Kontaktdaten DSB' folgenden
+Absatz: ...". LLM-Cascade Qwen (lokal) -> OVH 120B.
+
+Cache: in-process FIFO per (mc_id, doc_type, doc_md5) damit Re-Runs
+derselben Site denselben Vorschlag liefern. DB-Cache kommt spaeter (P31).
+
+Integration: wird im build_critical_findings_html / mc-detail-rendering
+unter jedem HIGH-Fail als eingeklappbarer Block angezeigt.
+"""
+
+from __future__ import annotations
+
+import hashlib
+import json
+import logging
+import os
+from functools import lru_cache
+from typing import Iterable
+
+import httpx
+
+logger = logging.getLogger(__name__)
+
+
+_SYSTEM_PROMPT = (  # system message for both the Ollama and the OVH request
+    "Du bist Datenschutz-Redakteur. Du formulierst kurze, einfueg-bereite "
+    "Absaetze fuer Datenschutz-Dokumente — sachlich, in deutscher "
+    "Rechtssprache, ohne Marketing-Floskeln.\n\n"
+    "Du bekommst:\n"
+    "- den FAIL-MC (was geprueft wurde, warum es nicht erfuellt ist)\n"
+    "- einen Auszug aus dem Ist-Dokument\n"
+    "- den Dokument-Typ\n\n"
+    "Du lieferst JSON:\n"
+    '{\n'
+    '  "solution_text": "<3-6 Saetze Vorschlags-Absatz fuer das Dokument>",\n'
+    '  "anchor_hint": "<wo einfuegen, z.B. \\"nach Abschnitt Kontaktdaten\\">",\n'
+    '  "effort_min": "<gering|mittel|hoch>"\n'
+    '}\n\n'
+    "Regeln:\n"
+    "- KEINE Normtexte 1:1 zitieren — eigene Formulierung + Norm-Referenz.\n"
+    "- KEINE Annahmen ueber Konkretes (z.B. Firmennamen, Adressen) — "
+    "Platzhalter [Ihr Firmenname] / [Ihre Adresse] verwenden.\n"
+    "- Wenn schon eine schwache Variante im Dokument steht, anchor_hint "
+    "auf 'ersetzen' setzen statt einfuegen.\n"
+    "- Nur reines JSON, keine Prosa, keine Code-Fences."
+)  # the three JSON keys requested here are the ones _parse reads back
+
+
+def _doc_hash(doc_text: str) -> str:  # first 12 hex chars of the MD5 of the UTF-8 text
+    return hashlib.md5(bytes(doc_text, "utf-8")).hexdigest()[:12]
+
+
+_CACHE: dict[str, dict] = {}  # insertion-ordered: cache key -> parsed solution
+_CACHE_MAX = 500  # entry count at which _cache_put evicts before inserting
+
+
+def _cache_get(key: str) -> dict | None:  # plain lookup, None on a miss
+    return _CACHE[key] if key in _CACHE else None
+
+
+def _cache_put(key: str, val: dict) -> None:
+    """Store val under key, first dropping the 50 oldest entries when full."""
+    if len(_CACHE) >= _CACHE_MAX:
+        for stale in tuple(_CACHE)[:50]:
+            del _CACHE[stale]
+    _CACHE[key] = val
+
+
+async def _call_ollama(prompt: str) -> str:
+    """POST prompt to the Ollama /api/chat endpoint; return reply text or ""."""
+    base = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
+    fallback_model = os.getenv("CMP_LLM_MODEL", "qwen3:30b-a3b")
+    payload = {
+        "model": os.getenv("MC_SOLUTION_MODEL", fallback_model),
+        "stream": False, "format": "json",
+        "messages": [{"role": "system", "content": _SYSTEM_PROMPT},
+                     {"role": "user", "content": prompt}],
+        "options": {"temperature": 0.1, "num_predict": 600},
+    }
+    try:
+        async with httpx.AsyncClient(timeout=90.0) as client:
+            resp = await client.post(f"{base.rstrip('/')}/api/chat", json=payload)
+            resp.raise_for_status()
+        message = resp.json().get("message") or {}
+        return message.get("content") or ""  # null content -> "", as annotated
+    except Exception as e:
+        logger.warning("Qwen MC-solution failed: %s", e)
+        return ""
+
+
+async def _call_ovh(prompt: str) -> str:
+    """Ask the OVH chat-completions endpoint; "" if unconfigured or failing."""
+    endpoint, token, model_name = (
+        os.getenv(var, "").strip()
+        for var in ("OVH_LLM_URL", "OVH_LLM_KEY", "OVH_LLM_MODEL")
+    )
+    if not (endpoint and model_name):
+        return ""
+    auth = {"Authorization": f"Bearer {token}"} if token else {}
+    request_headers = {"Content-Type": "application/json", **auth}
+    body = {
+        "model": model_name, "temperature": 0.1, "max_tokens": 600,
+        "messages": [
+            {"role": "system", "content": _SYSTEM_PROMPT},
+            {"role": "user", "content": prompt},
+        ],
+        "response_format": {"type": "json_object"},
+    }
+    target = f"{endpoint.rstrip('/')}/v1/chat/completions"
+    try:
+        async with httpx.AsyncClient(timeout=45.0) as client:
+            resp = await client.post(target, json=body, headers=request_headers)
+            resp.raise_for_status()
+        first = (resp.json().get("choices") or [{}])[0]
+        reply = (first.get("message") or {}).get("content", "")
+        return reply or ""
+    except Exception as exc:
+        logger.warning("OVH MC-solution failed: %s", exc)
+        return ""
+
+
+def _parse(content: str) -> dict | None:
+    """Extract {solution_text, anchor_hint, effort_min} from raw LLM output.
+
+    Returns None unless a JSON object with a non-empty solution_text is found.
+    """
+    # Slicing first "{" .. last "}" already skips code fences and prose, so no
+    # line-based fence strip is needed (it would drop one-line fenced payloads).
+    start, end = (content or "").find("{"), (content or "").rfind("}")
+    try:
+        obj = json.loads(content[start:end + 1]) if 0 <= start < end else None
+    except (ValueError, RecursionError):  # malformed or absurdly nested JSON
+        return None
+    if not isinstance(obj, dict) or not obj.get("solution_text"):
+        return None
+    return {
+        "solution_text": str(obj["solution_text"])[:1200],
+        "anchor_hint":   str(obj.get("anchor_hint") or "")[:200],
+        "effort_min":    str(obj.get("effort_min") or "mittel")[:20],
+    }
+
+
+async def generate_solution(
+    mc: dict,
+    doc_text: str,
+    doc_type: str,
+) -> dict | None:
+    """Generate an insertion proposal for one failed MC, using the cache.
+
+    mc supplies id/label, hint and severity; a missing or None hint is
+    treated as empty. Ollama is asked first, OVH only if that yields nothing.
+    Returns {solution_text, anchor_hint, effort_min}, or None if both fail.
+    """
+    if not mc or not doc_text:
+        return None
+    mc_id = str(mc.get("id") or mc.get("label", ""))[:80]
+    cache_key = f"{mc_id}:{doc_type}:{_doc_hash(doc_text)}"
+    cached = _cache_get(cache_key)
+    if cached:
+        return cached
+
+    # str(... or ""): a None/non-string hint must not raise and abort the batch.
+    hint = str(mc.get("hint") or "")[:300]
+    prompt = (
+        f"FAIL-MC: {mc.get('label', '')}\n"
+        f"Severity: {mc.get('severity', 'MEDIUM')}\n"
+        f"Aktueller Hint: {hint}\n\n"
+        f"Dokument-Typ: {doc_type}\n"
+        f"Dokument-Auszug:\n---\n{doc_text[:3500]}\n---\n\n"
+        "Liefere die Loesung als JSON."
+    )
+    # The OVH fallback only runs when Ollama returned nothing parseable.
+    parsed = _parse(await _call_ollama(prompt))
+    if not parsed:
+        parsed = _parse(await _call_ovh(prompt))
+    if parsed:
+        _cache_put(cache_key, parsed)
+    return parsed
+
+
+async def generate_solutions_for_fails(
+    failed_mcs: Iterable[dict],
+    doc_text: str,
+    doc_type: str,
+    limit: int = 5,
+) -> list[dict]:
+    """Build solution records for the first `limit` CRITICAL/HIGH fails.
+
+    CRITICAL entries come before HIGH ones; other severities are ignored.
+    Each record holds mc_label, severity, solution_text, anchor_hint and
+    effort_min. MCs for which no solution could be generated are left out.
+    """
+    def _level(entry: dict) -> str:
+        return (entry.get("severity") or "").upper()
+
+    candidates = list(failed_mcs or [])
+    ranked = [m for lvl in ("CRITICAL", "HIGH") for m in candidates if _level(m) == lvl]
+    results: list[dict] = []
+    for failed in ranked[:limit]:
+        solution = await generate_solution(failed, doc_text, doc_type)
+        if solution:
+            results.append({
+                "mc_label": failed.get("label", "")[:200],
+                "severity": failed.get("severity", "MEDIUM"),
+                "solution_text": solution["solution_text"],
+                "anchor_hint": solution["anchor_hint"],
+                "effort_min": solution["effort_min"],
+            })
+    return results
+
+
+def build_solutions_block_html(solutions: list[dict]) -> str:
+    """Render the solutions as a mail block; record fields are HTML-escaped."""
+    from html import escape  # function-local, module import block stays as is
+    if not solutions:
+        return ""
+    items: list[str] = []
+    for s in solutions:  # texts originate from an LLM -> escape before embedding
+        sev_color = "#dc2626" if str(s["severity"]).upper() == "CRITICAL" else "#d97706"
+        items.append(
+            f'<li style="margin-bottom:12px;font-size:11px;line-height:1.5">'
+            f'<div style="font-weight:600;color:{sev_color}">'
+            f'[{escape(str(s["severity"]))}] {escape(str(s["mc_label"]))}</div>'
+            f'<div style="background:#fff;padding:8px 10px;border:1px solid '
+            f'#cbd5e1;border-radius:4px;margin-top:4px;color:#1e293b;'
+            f'white-space:pre-wrap">{escape(str(s["solution_text"]))}</div>'
+            f'<div style="font-size:10px;color:#64748b;margin-top:3px">'
+            f'<strong>Anchor:</strong> {escape(str(s["anchor_hint"] or "—"))} '
+            f'&nbsp;·&nbsp; <strong>Aufwand:</strong> {escape(str(s["effort_min"]))}'
+            f'</div></li>'
+        )
+    return (
+        '<div style="font-family:-apple-system,BlinkMacSystemFont,sans-serif;'
+        'max-width:760px;margin:0 auto 16px;padding:14px 18px;'
+        'background:#f0f9ff;border:1px solid #bfdbfe;border-radius:8px">'
+        '<div style="font-size:11px;color:#1e40af;text-transform:uppercase;'
+        'letter-spacing:1.2px;margin-bottom:4px;font-weight:600">'
+        'Loesungs-Vorschlaege (KI-generiert)</div>'
+        f'<h3 style="margin:0 0 6px;font-size:14px;color:#1e293b">'
+        f'{len(solutions)} konkrete Einfuege-Empfehlung'
+        f'{"en" if len(solutions) != 1 else ""} fuer die kritischen Findings</h3>'
+        '<p style="margin:0 0 10px;font-size:11px;color:#475569;line-height:1.5">'
+        'Folgende Absaetze koennen Sie direkt uebernehmen — Platzhalter '
+        '[Ihr Firmenname] / [Ihre Adresse] sind zu ersetzen. Inhaltliche '
+        'Korrektheit ist mit DSB / Rechtsabteilung zu pruefen.</p>'
+        '<ul style="margin:0 0 0 18px;padding:0">'
+        + "".join(items) +
+        '</ul>'
+        '<p style="margin:8px 0 0;font-size:10px;color:#94a3b8;'
+        'font-style:italic">Generiert via Qwen3-30b lokal (Fallback: '
+        'OVH 120B). Vorschlaege sind kein Rechts-Beratung.</p>'
+        '</div>'
+    )
diff --git a/backend-compliance/compliance/services/rag_document_checker.py b/backend-compliance/compliance/services/rag_document_checker.py
index 168e5376..4a876841 100644
--- a/backend-compliance/compliance/services/rag_document_checker.py
+++ b/backend-compliance/compliance/services/rag_document_checker.py
@@ -293,6 +293,59 @@ _MC_ALIAS_FALLBACK = {
 }
 
 
+# P72 — compatible scope_doc_type values per operative doc_type.
+# 'other' / NULL / 'process' are always kept (the backfill is heuristic v1
+# and not strong enough for hard filtering).
+_SCOPE_COMPATIBLE: dict[str, set[str]] = {
+    "dse":           {"dse", "jc", "process", "tom", "accounting"},
+    "cookie":        {"cookie_richtlinie", "banner_implementation",
+                       "cmp_audit", "dse"},
+    "cookie_policy": {"cookie_richtlinie", "banner_implementation",
+                       "cmp_audit", "dse"},
+    "impressum":     {"impressum", "agb"},
+    "agb":           {"agb", "widerruf", "impressum"},
+    "nutzungsbedingungen": {"agb", "widerruf", "impressum"},
+    "widerruf":      {"widerruf", "agb"},
+    "avv":           {"avv", "tom", "jc", "process"},
+    "tom":           {"tom", "avv", "process"},
+    "loeschkonzept": {"process", "dse", "accounting"},
+    "dsfa":          {"process", "tom", "dse"},
+    "social_media":  {"jc", "dse"},
+    "dsa":           {"dse", "impressum"},
+    "legal_notice":  {"impressum", "agb"},
+    "lizenzhinweise": {"agb", "impressum"},
+}
+_PERMISSIVE_SCOPES = {"other", "process", None, "", "null"}  # scopes that never cause a drop
+
+
+def _filter_by_canonical_scope(
+    controls: list[dict],
+    doc_type: str,
+) -> list[dict]:
+    """P72 — drop MCs whose canonical scope_doc_type is incompatible.
+
+    Controls with a permissive scope ('other', 'process', 'null', empty/None)
+    or a scope listed in _SCOPE_COMPATIBLE[doc_type] are kept. A doc_type
+    without a mapping returns the input list unchanged. The number of
+    dropped controls is logged at INFO level.
+    """
+    allowed = _SCOPE_COMPATIBLE.get(doc_type)
+    if not allowed:
+        return controls
+
+    def _in_scope(control: dict) -> bool:
+        # Normalise case/whitespace; an empty scope counts as NULL.
+        label = (control.get("canonical_scope") or "").strip().lower() or None
+        return label in _PERMISSIVE_SCOPES or label in allowed
+
+    survivors = [control for control in controls if _in_scope(control)]
+    removed = len(controls) - len(survivors)
+    if removed:
+        logger.info("P72 scope-filter: %d/%d MCs out-of-scope fuer doc_type=%s",
+                    removed, len(controls), doc_type)
+    return survivors
+
+
 def _load_text_only_ids(
     doc_type: str | None = None,
     business_scope: set[str] | None = None,
@@ -372,11 +425,19 @@ async def _load_controls(doc_type: str, db_url: str, limit: int,
         return []
 
     try:
-        query = """SELECT id, control_id, title, regulation, article,
-                          check_question, pass_criteria, fail_criteria, severity
-                   FROM compliance.doc_check_controls
-                   WHERE doc_type = $1
-                   ORDER BY severity DESC, title"""
+        # P72: LEFT JOIN canonical_controls.scope_doc_type um scope-Info
+        # mitzuziehen. Wenn ein MC explizit fuer einen anderen Doc-Type
+        # klassifiziert ist (z.B. 'tom' statt 'dse'), wird er unten
+        # gefiltert. 'other' / NULL bleiben drin (Backfill noch nicht stark).
+        query = """SELECT dc.id, dc.control_id, dc.title, dc.regulation,
+                          dc.article, dc.check_question, dc.pass_criteria,
+                          dc.fail_criteria, dc.severity,
+                          cc.scope_doc_type AS canonical_scope
+                   FROM compliance.doc_check_controls dc
+                   LEFT JOIN compliance.canonical_controls cc
+                          ON cc.id = dc.control_uuid
+                   WHERE dc.doc_type = $1
+                   ORDER BY dc.severity DESC, dc.title"""
         if limit > 0:
             query += f" LIMIT {limit}"
 
@@ -387,6 +448,12 @@ async def _load_controls(doc_type: str, db_url: str, limit: int,
             rows = await conn.fetch(query, fallback)
 
         controls = [dict(r) for r in rows]
+
+        # P72: Scope-Filter — werfe MCs raus, deren canonical scope_doc_type
+        # explizit auf einen anderen Doc-Type zeigt. Konservativ:
+        # other/NULL/process bleiben drin (zu unsichere Klassifikation).
+        controls = _filter_by_canonical_scope(controls, doc_type)
+
         text_only = _load_text_only_ids(doc_type, business_scope)
         if text_only:
             before = len(controls)