feat(compliance-check): unlock all 1874 MCs + close gap-table items

User: 'wir haben 1800 MCs erstellt um sie zu 10% zu nutzen — das ist Schwachsinn' (English: "we created 1800 MCs only to use 10% of them — that is nonsense"). Fixed all 6 gaps from the audit.

#1 max_controls=0 (was 20):
- agent_compliance_check_routes _check_single: passes max_controls=0 to check_document_with_controls -> ALL MCs evaluated per doc_type.
- 8 doc_types now use 1874 MCs instead of 160 (10x coverage).
- Regex matching is cheap (<1s per doc); LLM-enrich cap of 10 stays.

#2 LLM-verify fixed:
- llm_verify.py was getting 0/N parsed. Causes: qwen3 thinking-mode wrapped output in <think>...</think>, /api/generate doesn't enforce JSON, prompt didn't handle code-fence wrappers.
- Now uses /api/chat with format='json' (forces valid JSON).
- _parse_batch_response strips <think> tags, accepts {results:[...]} AND bare [...], adds richer regex-fallback parse, logs raw head on total parse failure for diagnosis.

#3 Loeschkonzept checklist (new):
- doc_checks/loeschkonzept_checks.py — 9 L1 + 7 L2 checks per DIN 66398 + Art. 5(1)(e)/17/32 DSGVO: scope+responsibility, data categories, retention periods, legal basis refs (HGB/AO/BGB), deletion trigger, deletion process+technical+systems, deletion proof, exceptions + Art. 18 lock, review cycle, DSGVO references.
- runner.py registered for loeschkonzept/loeschung/loeschfristen.

#4 regulation backfill script:
- backend-compliance/scripts/backfill_mc_regulation.py — regex-detects DSGVO/TDDDG/TMG/BGB/HGB/AO/MStV/UWG/VSBG/PAngV/GwG/BDSG/EU-VO references in MC title+question+pass_criteria, UPDATEs regulation + article fields.
- Idempotent (only NULL rows), --dry-run flag, batched 200/UPDATE.
- Run inside container:
    docker exec bp-compliance-backend python3 \
      /app/scripts/backfill_mc_regulation.py

#5 MC alias-fallback:
- rag_document_checker._MC_ALIAS_FALLBACK maps doc_types without own MCs to a related set: nutzungsbedingungen->agb, social_media->dse, sub_processor/scc/tom_annex->avv, loeschfristen->loeschkonzept, eu_institution/dsb->dse.
- _load_controls retries with the alias when the primary query returns 0 rows.
- 14 additional doc_types now get MC coverage transparently.

#6 cross-domain auto-discovery:
- _autodiscover_missing builds a crawl plan: primary submitted base + up to 2 related domains sharing the owner SLD (e.g. BMW Group: bmw.de + bmwgroup.com + bmwgroup.jobs).
- Detection: regex over submitted texts for https?://...<owner>... hostnames distinct from the primary base.
- Each crawled base contributes documents + cmp_payloads to the discovery pool.

Net effect for BMW: 1874 MCs evaluated (90 from cookie alone, was 20), the Loeschkonzept mandatory disclosures (Pflichtangaben) can now be graded, the LLM overturns false regex FAILs, and Joint-Controller policies on bmwgroup.jobs (Social Media) are now discoverable. The same wins will apply to the CRA-Compliance check.
2026-05-17 13:07:50 +02:00
parent fab1e35847
commit 8a44e67293
6 changed files with 565 additions and 70 deletions
@@ -9,6 +9,7 @@ GET  /compliance/agent/compliance-check/{check_id} — poll status
 import asyncio
 import logging
 import os
+import re
 import uuid as _uuid
 from dataclasses import asdict
 from datetime import datetime, timezone
@@ -600,32 +601,65 @@ async def _autodiscover_missing(
            })
        return

-    base = max(bases, key=bases.get) + "/"
+    # Build crawl plan: primary base + any related domains mentioned in
+    # the submitted texts that share the owner's SLD. Example: BMW Group
+    # text mentions bmwgroup.com and bmwgroup.jobs in addition to bmw.de.
+    primary_base = max(bases, key=bases.get) + "/"
+    crawl_bases: list[str] = [primary_base]
+    primary_netloc = urlparse(primary_base).netloc.lower().lstrip("www.")
+    owner_token = primary_netloc.split(".")[0]  # 'bmw'
+
+    if owner_token and len(owner_token) >= 3:
+        domain_re = re.compile(
+            r"https?://([a-z0-9][a-z0-9\-]*\.)*" + re.escape(owner_token)
+            + r"[a-z0-9\-]*\.[a-z]{2,}",
+            re.IGNORECASE,
+        )
+        seen_bases = {primary_base}
+        for entry in doc_entries:
+            text = entry.get("text") or ""
+            for m in domain_re.finditer(text):
+                p = urlparse(m.group(0))
+                base = f"{p.scheme}://{p.netloc}/"
+                base_netloc = p.netloc.lower().lstrip("www.")
+                if base_netloc == primary_netloc:
+                    continue
+                if base in seen_bases:
+                    continue
+                seen_bases.add(base)
+                crawl_bases.append(base)
+                if len(crawl_bases) >= 3:
+                    break
+            if len(crawl_bases) >= 3:
+                break
+
    _update(
        check_id,
-        f"Suche fehlende Dokumente auf {urlparse(base).netloc}...",
+        f"Suche fehlende Dokumente auf {', '.join(urlparse(b).netloc for b in crawl_bases)}...",
        18,
    )

-    try:
-        async with httpx.AsyncClient(timeout=180.0) as client:
-            resp = await client.post(
-                f"{CONSENT_TESTER_URL}/dsi-discovery",
-                json={"url": base, "max_documents": 15},
-                timeout=180.0,
-            )
-            if resp.status_code != 200:
-                logger.warning("auto-discovery: HTTP %d for %s", resp.status_code, base)
-                discovered: list[dict] = []
-                disc_payloads: list[dict] = []
-            else:
-                disc_body = resp.json()
-                discovered = disc_body.get("documents", [])
-                disc_payloads = disc_body.get("cmp_payloads") or []
-    except Exception as e:
-        logger.warning("auto-discovery failed for %s: %s", base, e)
-        discovered = []
-        disc_payloads = []
+    discovered: list[dict] = []
+    disc_payloads: list[dict] = []
+    for base in crawl_bases:
+        try:
+            async with httpx.AsyncClient(timeout=180.0) as client:
+                resp = await client.post(
+                    f"{CONSENT_TESTER_URL}/dsi-discovery",
+                    json={"url": base, "max_documents": 15},
+                    timeout=180.0,
+                )
+                if resp.status_code != 200:
+                    logger.warning("auto-discovery: HTTP %d for %s",
+                                   resp.status_code, base)
+                    continue
+                body = resp.json()
+                discovered.extend(body.get("documents", []) or [])
+                disc_payloads.extend(body.get("cmp_payloads") or [])
+                logger.info("auto-discovery on %s: %d docs",
+                            base, len(body.get("documents", []) or []))
+        except Exception as e:
+            logger.warning("auto-discovery failed for %s: %s", base, e)

    # Classify each discovered doc into a canonical doc_type
    by_type: dict[str, dict] = {}
@@ -736,8 +770,12 @@ async def _check_single(

    # Master Control checks (all MCs for the doc_type; no top-N cap)
    try:
+        # max_controls=0 -> evaluate ALL MCs for this doc_type (DB has
+        # 1874 across 8 types; regex matching is cheap and dominates
+        # well under 1s per doc). Caps remain on the LLM-enrich step
+        # (top-10 FAILs) so cost stays bounded.
        mc_results = await check_document_with_controls(
-            text, doc_type, label, max_controls=20, use_agent=use_agent,
+            text, doc_type, label, max_controls=0, use_agent=use_agent,
        )
        if mc_results:
            for mc in mc_results: