feat: Browser-Matrix Stufe 1.a + 2 weitere GT-Findings + Plausibility-LLM-Härtung

Stage 1.a Browser-Matrix (Task #15) — Multi-Engine Scaffolding:
- consent-tester/Dockerfile: firefox + webkit + Xvfb deps
- playwright install chromium firefox webkit
- services/browser_profiles.py: Registry mit DEFAULT_PROFILES (Chromium-Headed/Firefox-Headed/WebKit-Headed/Mobile-Safari) + EXTRA_PROFILES (Chrome-Channel, Edge, Brave)
- services/multi_browser_scanner.py: run_matrix() orchestriert N parallele Scans + worst-of-Aggregation + 3 Sub-Scores (Pre-Consent 50%, Reject-Respekt 30%, Banner-Design 20%) + Hard-Fail-Cap auf <60% bei Pre-Consent/Reject-Verstoß
- routes_matrix.py: POST /scan-matrix Endpoint (eigenes Modul, damit main.py unter 500 LOC bleibt)

KNOWN: Stage 1.a-Shim ruft alle Profile auf demselben Chromium, echte Engine-Diversität in Stage 1.b (consent_scanner.py Param)

Coverage-Gap 3 (Task #17): 2/3 verbleibende GT-Lücken geschlossen:
- B9 impressum_multi_entity_check (IMPRESSUM-001): erkennt USt-IdNr/HR/GF-Fehlen pro Entity bei multi-entity Impressen (Elli: USt-IdNr nur bei Elli Mobility, fehlt bei VW Group Charging)
- B10 transfer_mechanism_check (TRANSFER-001): pro Non-EU-Vendor in cmp_vendors prüft DSE auf DPF/SCCs/BCRs/Einwilligung im ±400-char-Window. Findet Vendors ohne benannten Mechanismus.
- TH-RETENTION-002 (AI-Datenkategorie-Differenzierung) bleibt semantisch-tief, vorgesehen für Specialist-Agents Task #18.

Plausibility-LLM Empty-Response-Härtung (Task #16):
- BATCH_SIZE 8 → 4, EXCERPT 4000 → 1500 chars, TIMEOUT 60 → 45s
- Single-retry mit halbierter Batch wenn LLM empty content zurückgibt — qwen3:30b-a3b rejektiert manchmal ≥6-Item-Prompts unter format='json'. Falls auch Half-Batch empty: log + skip.
- Pipeline läuft jetzt nicht mehr 10min in Timeouts.

GT-Coverage Sprung: 10/13 → 11/13 (85%). 4/4 HIGH ✓, 5/6 MEDIUM ✓, 2/3 LOW ✓.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-06-06 21:42:27 +02:00
parent d0e3621192
commit e1dadc8027
10 changed files with 687 additions and 4 deletions
@@ -0,0 +1,92 @@
+"""B9 + B10 wiring — Multi-Entity-Impressum + Drittland-Mechanismus.
+
+Runs after B6/B7/B8. Adds Findings into `state["extra_findings"]`
+and re-renders the extra-block HTML.
+"""
+
+from __future__ import annotations
+
+import html
+import logging
+
+from compliance.services.impressum_multi_entity_check import (
+    check_multi_entity_impressum,
+)
+from compliance.services.transfer_mechanism_check import (
+    check_transfer_mechanism,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def run_b9b10(state: dict) -> None:
+    """Run the B9 and B10 checks and merge their findings into ``state``.
+
+    Calls ``check_multi_entity_impressum`` and ``check_transfer_mechanism``
+    (in that order). When at least one finding comes back, the findings are
+    appended to ``state["extra_findings"]`` and
+    ``state["extra_findings_html"]`` is rebuilt from the complete list.
+    ``state`` is not written to when neither check reports anything.
+    """
+    merged = state.get("extra_findings") or []
+    added: list[dict] = [
+        *check_multi_entity_impressum(state),
+        *check_transfer_mechanism(state),
+    ]
+    if added:
+        # Extend in place so an already stored findings list keeps its
+        # identity for anyone else holding a reference to it.
+        merged.extend(added)
+        state["extra_findings"] = merged
+        state["extra_findings_html"] = _render(merged)
+        logger.info("B9/B10 added %d findings (total extra=%d)",
+                    len(added), len(merged))
+
+
+def _render(findings: list[dict]) -> str:
+    """Render cross-document findings as one inline-styled HTML block.
+
+    Each finding becomes a card showing severity, ``check_id``, ``title``,
+    ``norm`` and the recommended ``action``. At most one detail line is added
+    per card, chosen from the first of these keys that is set:
+    ``entities_missing``, ``vendor`` (with ``country``), ``doc_date`` (with
+    ``age_years``), ``detected_provider``, ``evidence_dse``.
+
+    Every interpolated value is stringified and HTML-escaped, including the
+    severity label and ``age_years``; a ``country`` of ``None`` is shown as
+    ``?`` instead of raising.
+
+    Args:
+        findings: Finding dicts; missing or empty fields render as empty
+            text.
+
+    Returns:
+        The HTML fragment: a wrapper ``<div>`` with a heading and all cards.
+    """
+
+    def esc(value: object) -> str:
+        # Finding fields carry text taken from scanned documents and vendor
+        # lists, so nothing is trusted to be a string or markup-free.
+        return html.escape(str(value))
+
+    def meta_div(inner_html: str) -> str:
+        # Shared markup of the optional detail line below the norm.
+        return (
+            "<div style='font-size:12px;color:#475569;margin-top:6px;'>"
+            f"<em>{inner_html}</em>"
+            "</div>"
+        )
+
+    cards = []
+    for f in findings:
+        sev = str(f.get("severity") or "").upper()
+        color = "#dc2626" if sev == "HIGH" else (
+            "#f59e0b" if sev == "MEDIUM" else "#64748b"
+        )
+        meta = ""
+        if f.get("entities_missing"):
+            names = ", ".join(str(n) for n in f["entities_missing"])
+            meta = meta_div(f"Fehlt bei: {esc(names)}")
+        elif f.get("vendor"):
+            country = f.get("country")
+            meta = meta_div(
+                f"Vendor: {esc(f['vendor'])} "
+                f"({esc('?' if country is None else country)})"
+            )
+        elif f.get("doc_date"):
+            meta = meta_div(
+                f"Stand: {esc(f['doc_date'])} "
+                f"({esc(f.get('age_years', '?'))} J. alt)"
+            )
+        elif f.get("detected_provider"):
+            meta = meta_div(
+                f"Erkannter Provider: {esc(f['detected_provider'])}"
+            )
+        elif f.get("evidence_dse"):
+            evidence = ", ".join(str(e) for e in f["evidence_dse"])
+            meta = meta_div(f"In DSE: {esc(evidence)}")
+        cards.append(
+            f"<div style='margin:12px 0;padding:14px;background:#fff;"
+            f"border-left:3px solid {color};border-radius:4px;'>"
+            f"<div style='font-weight:600;color:{color};font-size:14px;'>"
+            f"{esc(sev)} · {esc(f.get('check_id') or '')}</div>"
+            f"<div style='font-size:14px;margin-top:4px;'>"
+            f"<strong>{esc(f.get('title') or '')}</strong></div>"
+            f"<div style='font-size:12px;color:#64748b;margin-top:2px;'>"
+            f"{esc(f.get('norm') or '')}</div>"
+            f"{meta}"
+            f"<div style='font-size:13px;margin-top:8px;background:#dcfce7;"
+            f"padding:8px 10px;border-radius:4px;'>"
+            f"<strong>→ Empfehlung:</strong> "
+            f"{esc(f.get('action') or '')}</div>"
+            "</div>"
+        )
+    return (
+        "<div style='margin:24px 0;padding:16px;border-left:4px solid #f59e0b;"
+        "background:#fffbeb;border-radius:4px;'>"
+        "<h2 style='margin:0 0 8px;color:#92400e;font-size:16px;'>"
+        "📌 Zusätzliche Cross-Doc-Befunde"
+        "</h2>"
+        + "".join(cards) +
+        "</div>"
+    )
@@ -21,6 +21,7 @@ from ._b3_wiring import run_b3
 from ._b4_wiring import run_b4
 from ._b5_wiring import run_b5
 from ._b6b7b8_wiring import run_b6b7b8
+from ._b9b10_wiring import run_b9b10
 from ._constants import _compliance_check_jobs
 from ._phase_a_resolve import run_phase_a
 from ._phase_b_profile_check import run_phase_b
@@ -63,6 +64,7 @@ async def run_compliance_check(check_id: str, req) -> None:
        run_b4(state)  # Cross-doc vendor-consistency (Elli Vertex↔Iadvize)
        run_b5(state)  # AI-Act Art. 50 transparency
        run_b6b7b8(state)  # DPO-cross-doc + Doc-Staleness + CMP-fingerprint
+        run_b9b10(state)  # Multi-Entity-Impressum + Drittland-Mechanismus
        # Phase D-3 top/mid/bot: Step 5 HTML blocks
        await run_phase_d3_top(state)
        await run_phase_d3_mid(state)
@@ -51,8 +51,13 @@ logger = logging.getLogger(__name__)

 OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
 MODEL = os.getenv("PLAUSIBILITY_LLM_MODEL", "qwen3:30b-a3b")
-BATCH_SIZE = int(os.getenv("PLAUSIBILITY_BATCH_SIZE", "8"))
-TIMEOUT = float(os.getenv("PLAUSIBILITY_TIMEOUT_S", "60.0"))
+# Reduced from 8 → 4 to fight qwen3 empty-response-on-large-prompts bug.
+# 4 items × ~500 token/item + 2000 system + 1500 excerpt = ~5500 token total,
+# well within qwen3's safe range for format='json'.
+BATCH_SIZE = int(os.getenv("PLAUSIBILITY_BATCH_SIZE", "4"))
+TIMEOUT = float(os.getenv("PLAUSIBILITY_TIMEOUT_S", "45.0"))
+# Reduced excerpt 4000 → 1500 chars (same reason).
+DOC_EXCERPT_CHARS = int(os.getenv("PLAUSIBILITY_DOC_EXCERPT", "1500"))

 # In-memory cache: (input_hash) -> result_dict. Survives one run.
 _CACHE: dict[str, dict] = {}
@@ -121,7 +126,8 @@ def _build_user_prompt(items: list[dict], doc_title: str,
    )
    return (
        f"DOKUMENT: {doc_title}\n\n"
-        f"DOKUMENT-AUSZUG (max 4000 Zeichen):\n{doc_excerpt[:4000]}\n\n"
+        f"DOKUMENT-AUSZUG (max {DOC_EXCERPT_CHARS} Zeichen):\n"
+        f"{doc_excerpt[:DOC_EXCERPT_CHARS]}\n\n"
        f"FINDINGS ZU BEWERTEN:\n{findings_block}"
    )

@@ -149,6 +155,23 @@ async def _ask_llm_batch(items: list[dict], doc_title: str,
            r.raise_for_status()
            content = (r.json().get("message") or {}).get("content", "")
            if not content:
+                # Single retry with smaller batch — qwen3 sometimes
+                # rejects ≥6-item prompts under format='json'.
+                if len(items) > 2:
+                    half = len(items) // 2
+                    logger.info(
+                        "plausibility empty → retry split %d → %dx2",
+                        len(items), half,
+                    )
+                    first = await _ask_llm_batch(
+                        items[:half], doc_title, doc_excerpt,
+                    )
+                    second = await _ask_llm_batch(
+                        items[half:], doc_title, doc_excerpt,
+                    )
+                    out.update(first)
+                    out.update(second)
+                    return out
                logger.warning("plausibility LLM returned empty content")
                return out
            try:
@@ -0,0 +1,99 @@
+"""B9 — Multi-Entity-Impressum-Check.
+
+Findings, wenn ein Impressum mehrere Entitäten (mehrere GmbH/AG/UG)
+nennt, aber Pflichtangaben nur bei einer davon vollständig sind.
+
+Konkreter Elli-Pattern (GT IMPRESSUM-001):
+  - Entity 1: "Elli Mobility GmbH ... USt-IdNr DE814424009 ..."
+  - Entity 2: "VW Group Charging GmbH ... [keine USt-IdNr] ..."
+  → USt-IdNr fehlt bei Entity 2.
+
+Heuristik:
+  1. Entitäten erkennen: jede Match auf "<Name> (GmbH|AG|UG|KG|SE)" als
+     Entity-Boundary; Text-Slice von dort bis zur nächsten Entity.
+  2. Pro Entity prüfen: USt-IdNr, Handelsregister, Vertretungsberechtigte.
+  3. Wenn Entity N ein Feld nennt, das Entity M nicht hat → MEDIUM.
+"""
+
+from __future__ import annotations
+
+import logging
+import re
+
+logger = logging.getLogger(__name__)
+
+# Entity boundary: a capitalised name followed by a legal form. The name
+# start and the two-letter forms (AG/UG/KG/SE) are matched case-sensitively
+# and the legal form must not run on into further word characters; otherwise
+# ordinary words ("Customer Service", "unsere AGB") would be read as
+# "... SE" / "... AG" entities. Longer forms stay case-insensitive.
+_ENTITY_PAT = re.compile(
+    r"([A-ZÄÖÜ][\w\-\&\s]{1,50}?\s+(?:AG|UG|KG|SE|"
+    r"(?i:GmbH|e\.V\.|GbR|OHG|Limited|Ltd|LLC))(?!\w))",
+)
+
+# VAT ID label followed by the number. Accepts the common label spellings
+# ("USt-IdNr.", "USt-ID-Nr.", "USt.-Id.", "Umsatzsteuer-Identifikationsnummer
+# gemäß § 27 a UStG", "VAT-ID") and an optional space after the "DE" prefix.
+_USTID_PAT = re.compile(
+    r"\b(?:USt\.?-?Id(?:ent)?\.?-?(?:Nr|Nummer)?\.?"
+    r"|Umsatzsteuer-?Identifikationsnummer(?:\s+gem(?:äß|\.)[^:\n]{0,60})?"
+    r"|VAT(?:-?Id)?)"
+    r"\s*[:.\s]\s*(DE\s?\d{8,10}|[A-Z]{2}\d{6,12})",
+    re.IGNORECASE,
+)
+# Commercial-register mention (HRB/HRA number or register court).
+_HR_PAT = re.compile(r"\b(?:HR[BA]|Handelsregister|Registergericht)"
+                     r"\s*[:.\s]*([\w\s\d\-/]{4,80})", re.IGNORECASE)
+# Authorised representative label, including the gendered forms,
+# "Geschäftsführung" and "Vorstand" (the usual label for an AG).
+_GF_PAT = re.compile(
+    r"(?:Geschäftsführ(?:er(?:in(?:nen)?)?|ung)|Vertretungsberechtigt\w*|"
+    r"vertreten\s+durch|Vorstand)\s*[:.\s]+",
+    re.IGNORECASE,
+)
+
+
+def _slice_entities(text: str) -> list[tuple[str, str]]:
+    """Split imprint text into one ``(entity_name, text)`` pair per entity.
+
+    Every ``_ENTITY_PAT`` match opens a slice that runs up to the next match
+    (or the end of ``text``). Slices whose names are equal after whitespace
+    normalisation and case folding are merged, so a company that is named
+    several times in the imprint is treated as a single entity instead of
+    several entities with partly missing details.
+
+    Returns:
+        ``(name, text)`` pairs in order of first appearance; the name has
+        whitespace runs collapsed to single spaces. An empty list when fewer
+        than two distinct entities are found.
+    """
+    matches = list(_ENTITY_PAT.finditer(text))
+    # casefolded name -> (display name as first seen, slices of that entity)
+    grouped: dict[str, tuple[str, list[str]]] = {}
+    for i, m in enumerate(matches):
+        end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
+        name = " ".join(m.group(1).split())
+        _, parts = grouped.setdefault(name.casefold(), (name, []))
+        parts.append(text[m.start():end])
+    if len(grouped) < 2:
+        return []
+    return [(name, "\n".join(parts)) for name, parts in grouped.values()]
+
+
+def check_multi_entity_impressum(state: dict) -> list[dict]:
+    """Report mandatory imprint details that only some entities provide.
+
+    Reads ``state["doc_texts"]["impressum"]``, splits it into per-entity
+    slices via ``_slice_entities`` and tests every slice for a VAT ID
+    (``_USTID_PAT``), a commercial-register entry (``_HR_PAT``) and an
+    authorised representative (``_GF_PAT``). For each of the three fields
+    that at least one entity has and at least one lacks, a MEDIUM finding is
+    emitted naming the entities on both sides.
+
+    Args:
+        state: Pipeline state; only ``doc_texts`` is read.
+
+    Returns:
+        Zero to three finding dicts (one per inconsistent field). Empty when
+        there is no imprint text or ``_slice_entities`` returns nothing.
+    """
+    imprint = (state.get("doc_texts") or {}).get("impressum") or ""
+    if not imprint:
+        return []
+    slices = _slice_entities(imprint)
+    if not slices:
+        return []
+    # Which of the three mandatory details each entity's slice contains.
+    features = [
+        {
+            "name": name,
+            "ust_id": bool(_USTID_PAT.search(slc)),
+            "hr": bool(_HR_PAT.search(slc)),
+            "gf": bool(_GF_PAT.search(slc)),
+        }
+        for name, slc in slices
+    ]
+    findings: list[dict] = []
+    for field, label in (
+        ("ust_id", "USt-IdNr."),
+        ("hr", "Handelsregister-Eintrag"),
+        ("gf", "Vertretungsberechtigte"),
+    ):
+        present = [e["name"] for e in features if e[field]]
+        missing = [e["name"] for e in features if not e[field]]
+        # Only a mix is an inconsistency: a detail that no entity states
+        # (or that all state) is not reported by this check.
+        if not (present and missing):
+            continue
+        findings.append({
+            "check_id": f"IMPRESSUM-MULTI-{field.upper()}",
+            "severity": "MEDIUM",
+            "severity_reason": "incomplete",
+            "title": (
+                f"{label} fehlt bei "
+                f"{len(missing)} von {len(features)} Entitäten"
+            ),
+            # The TMG was replaced by the DDG on 2024-05-14; the imprint
+            # duty now sits in § 5 DDG.
+            "norm": ("§ 5 Abs. 1 DDG, vormals § 5 TMG "
+                     "(Pflichtangabe pro Diensteanbieter)"),
+            "entities_present": present,
+            "entities_missing": missing,
+            "action": (
+                f"{label} im Impressum für "
+                f"{', '.join(missing)} ergänzen. "
+                "Pflichtangabe ist pro Diensteanbieter zu erfüllen, "
+                "nicht 'eine reicht für alle'."
+            ),
+        })
+    if findings:
+        logger.info("B9 multi-entity impressum: %d findings", len(findings))
+    return findings
@@ -0,0 +1,98 @@
+"""B10 — Drittland-Transfer-Mechanismus-Konsistenz pro Vendor.
+
+DSGVO Art. 44 ff. verlangt für Drittland-Transfers EINEN klaren
+Mechanismus: Angemessenheitsbeschluss / EU-US DPF / SCCs / BCRs /
+ausdrückliche Einwilligung. Wenn ein Vendor in cmp_vendors als
+Drittland-Verarbeiter erkannt wird, muss der DSE-Text einen
+Mechanismus pro Vendor (oder per Vendor-Kategorie) klar benennen.
+
+GT-Pattern Elli (TRANSFER-001):
+  - Google/Meta → DPF in DSE genannt ✓
+  - Salesforce → SCCs ✓
+  - Webflow als US-Sitz erwähnt aber kein Mechanismus → MEDIUM
+
+Heuristik:
+  1. Aus cmp_vendors die Drittland-Vendors filtern (third_country=True).
+  2. Im DSE-Text suchen, ob pro Vendor ein Mechanismus erwähnt ist.
+  3. Wenn ein Drittland-Vendor keinen Mechanismus hat → MEDIUM.
+"""
+
+from __future__ import annotations
+
+import logging
+
+logger = logging.getLogger(__name__)
+
+# (label, lower-case substrings) per transfer mechanism, in priority order.
+_MECHANISM_KEYWORDS = (
+    ("DPF / Data Privacy Framework",
+     ["data privacy framework", "dpf-", "eu-us dpf", "datenschutzrahmen",
+      "angemessenheitsbeschluss"]),
+    ("Standardvertragsklauseln (SCCs)",
+     ["standardvertragsklauseln", "standarddatenschutzklauseln", "scc-",
+      "scc ", "standard contractual", "art. 46 abs. 2 lit. c"]),
+    ("Binding Corporate Rules",
+     ["binding corporate rules", "bcr-", "verbindliche unternehmensregeln"]),
+    ("Ausdrückliche Einwilligung",
+     ["ausdrückliche einwilligung nach art. 49",
+      "explicit consent under art. 49"]),
+)
+
+# Acronym per mechanism label. Acronyms are matched as whole tokens (with an
+# optional plural "s") so that "SCCs", "(DPF)" or "BCR." count, while the
+# same letter run inside a word such as "Grundpflichten" does not.
+_MECHANISM_ACRONYMS = {
+    "DPF / Data Privacy Framework": "dpf",
+    "Standardvertragsklauseln (SCCs)": "scc",
+    "Binding Corporate Rules": "bcr",
+}
+
+
+def _mechanism_for_vendor(vendor_name: str, dse_text: str) -> str | None:
+    """Return the transfer mechanism the privacy policy names for a vendor.
+
+    Every whole-word mention of ``vendor_name`` in ``dse_text`` (matched
+    case-insensitively, a trailing genitive "s" is tolerated) is inspected:
+    the 400 characters before and after the start of the mention are
+    searched for the keywords and acronyms of each mechanism. Mentions are
+    tried in document order, mechanisms in ``_MECHANISM_KEYWORDS`` order.
+
+    Args:
+        vendor_name: Vendor name as it should appear in the policy text.
+        dse_text: Full privacy-policy text.
+
+    Returns:
+        The label of the first mechanism found, or ``None`` when either
+        argument is empty, the vendor is not mentioned, or no mention has a
+        mechanism nearby.
+    """
+    import re  # function-scope: ``re`` is not imported at this module's top
+
+    if not vendor_name or not dse_text:
+        return None
+    text_lc = dse_text.lower()
+    # Whole-word match so that e.g. "Meta" does not hit "Metadaten".
+    vendor_pat = re.compile(
+        r"(?<!\w)" + re.escape(vendor_name.lower()) + r"(?:'?s)?(?!\w)"
+    )
+    for hit in vendor_pat.finditer(text_lc):
+        window = text_lc[max(0, hit.start() - 400): hit.start() + 400]
+        for mech_label, kws in _MECHANISM_KEYWORDS:
+            if any(k in window for k in kws):
+                return mech_label
+            acronym = _MECHANISM_ACRONYMS.get(mech_label)
+            if acronym and re.search(
+                r"(?<!\w)" + acronym + r"s?(?!\w)", window
+            ):
+                return mech_label
+    return None
+
+
+def check_transfer_mechanism(state: dict) -> list[dict]:
+    """Flag CMP vendors for which the privacy policy names no mechanism.
+
+    Iterates ``state["cmp_vendors"]`` and, for every vendor that is not
+    exempt, asks ``_mechanism_for_vendor`` whether
+    ``state["doc_texts"]["dse"]`` names a transfer mechanism near a mention
+    of the vendor. Vendors without one yield a MEDIUM finding.
+
+    A vendor is exempt when its ``country`` code is in the exemption set
+    below, when it has no name, or when it has no country and its
+    ``third_country`` flag is explicitly ``False``. Vendors with an unknown
+    country and no such flag are still checked (reported as ``UNKNOWN``).
+    Each vendor name is evaluated once, so duplicate vendor entries do not
+    produce duplicate findings.
+
+    Args:
+        state: Pipeline state; only ``cmp_vendors`` and ``doc_texts`` are
+            read.
+
+    Returns:
+        One finding dict per vendor lacking a mechanism; empty when there
+        are no vendors or no privacy-policy text.
+    """
+    # Country codes that never yield a finding: the EU member states, the
+    # EEA states IS/LI/NO, and CH.
+    exempt = frozenset((
+        "DE", "AT", "BE", "BG", "HR", "CY", "CZ", "DK",
+        "EE", "FI", "FR", "GR", "HU", "IE", "IT", "LV",
+        "LT", "LU", "MT", "NL", "PL", "PT", "RO", "SK",
+        "SI", "ES", "SE", "IS", "LI", "NO", "CH",
+    ))
+    cmp_vendors = state.get("cmp_vendors") or []
+    dse = (state.get("doc_texts") or {}).get("dse") or ""
+    if not cmp_vendors or not dse:
+        return []
+    findings: list[dict] = []
+    seen: set[str] = set()
+    for v in cmp_vendors:
+        if not isinstance(v, dict):
+            continue
+        country = str(v.get("country") or "").upper().strip()
+        name = str(v.get("name") or "").strip()
+        if not name or country in exempt:
+            continue
+        # NOTE(review): ``third_country`` is the flag named in the module
+        # docstring; it is only honoured when no country code is available.
+        if not country and v.get("third_country") is False:
+            continue
+        key = name.casefold()
+        if key in seen:
+            continue
+        seen.add(key)
+        if _mechanism_for_vendor(name, dse) is not None:
+            continue
+        findings.append({
+            "check_id": "TRANSFER-MECH-001",
+            "vendor": name,
+            "country": country or "UNKNOWN",
+            "severity": "MEDIUM",
+            "severity_reason": "missing",
+            "title": (
+                f"Drittland-Transfer-Mechanismus für {name} "
+                f"({country or 'Drittland'}) fehlt in DSE"
+            ),
+            "norm": "DSGVO Art. 44 + Art. 46 / Art. 49",
+            "action": (
+                f"Im DSE-Abschnitt zu {name} den Transfermechanismus "
+                "angeben (DPF / SCCs / BCRs / Einwilligung) und ggf. "
+                "Vertragsdokument referenzieren."
+            ),
+        })
+    if findings:
+        logger.info("B10 transfer-mechanism: %d findings", len(findings))
+    return findings