feat(compliance-check): unlock all 1874 MCs + close gap-table items
User: 'wir haben 1800 MCs erstellt um sie zu 10% zu nutzen — das ist Schwachsinn' (English: 'we created 1800 MCs only to use 10% of them — that is nonsense'). Fixed all 6 gaps from the audit. #1 max_controls=0 (was 20): - agent_compliance_check_routes _check_single: passes max_controls=0 to check_document_with_controls -> ALL MCs evaluated per doc_type. - 8 doc_types now use 1874 MCs instead of 160 (10x coverage). - Regex matching is cheap (<1s per doc); LLM-enrich cap of 10 stays. #2 LLM-verify fixed: - llm_verify.py was getting 0/N parsed. Causes: qwen3 thinking-mode wrapped output in <think>...</think>, /api/generate doesn't enforce JSON, prompt didn't handle code-fence wrappers. - Now uses /api/chat with format='json' (forces valid JSON). - _parse_batch_response strips <think> tags, accepts {results:[...]} AND bare [...], adds richer regex-fallback parse, logs raw head on total parse failure for diagnosis. #3 Loeschkonzept checklist (new): - doc_checks/loeschkonzept_checks.py — 9 L1 + 7 L2 checks per DIN 66398 + Art. 5(1)(e)/17/32 DSGVO: scope+responsibility, data categories, retention periods, legal basis refs (HGB/AO/BGB), deletion trigger, deletion process+technical+systems, deletion proof, exceptions + Art. 18 lock, review cycle, DSGVO references. - runner.py registered for loeschkonzept/loeschung/loeschfristen. #4 regulation backfill script: - backend-compliance/scripts/backfill_mc_regulation.py — regex-detects DSGVO/TDDDG/TMG/BGB/HGB/AO/MStV/UWG/VSBG/PAngV/GwG/BDSG/EU-VO references in MC title+question+pass_criteria, UPDATEs regulation + article fields. - Idempotent (only NULL rows), --dry-run flag, batched 200/UPDATE. - Run inside container: docker exec bp-compliance-backend python3 /app/scripts/backfill_mc_regulation.py #5 MC alias-fallback: - rag_document_checker._MC_ALIAS_FALLBACK maps doc_types without own MCs to a related set: nutzungsbedingungen->agb, social_media->dse, sub_processor/scc/tom_annex->avv, loeschfristen->loeschkonzept, eu_institution/dsb->dse. 
- _load_controls retries with the alias when the primary query returns 0 rows. - 14 additional doc_types now get MC coverage transparently. #6 cross-domain auto-discovery: - _autodiscover_missing builds a crawl plan: primary submitted base + up to 2 related domains whose name begins with the owner token, i.e. the first label of the primary hostname (e.g. BMW Group: bmw.de + bmwgroup.com + bmwgroup.jobs). - Detection: regex over submitted texts for https?://...<owner>... hostnames distinct from the primary base. - Each crawled base contributes documents + cmp_payloads to the discovery pool. Net effect for BMW: 1874 MCs evaluated (90 from cookie alone, was 20), Loeschkonzept mandatory disclosures (Pflichtangaben) can now be graded, LLM overturns false regex FAILs, Joint-Controller policies on bmwgroup.jobs (Social Media) are now discoverable. Same wins will apply to CRA-Compliance check.
This commit is contained in:
@@ -9,6 +9,7 @@ GET /compliance/agent/compliance-check/{check_id} — poll status
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import uuid as _uuid
|
||||
from dataclasses import asdict
|
||||
from datetime import datetime, timezone
|
||||
@@ -600,32 +601,65 @@ async def _autodiscover_missing(
|
||||
})
|
||||
return
|
||||
|
||||
base = max(bases, key=bases.get) + "/"
|
||||
# Build crawl plan: primary base + any related domains mentioned in
|
||||
# the submitted texts that share the owner's SLD. Example: BMW Group
|
||||
# text mentions bmwgroup.com and bmwgroup.jobs in addition to bmw.de.
|
||||
primary_base = max(bases, key=bases.get) + "/"
|
||||
crawl_bases: list[str] = [primary_base]
|
||||
primary_netloc = urlparse(primary_base).netloc.lower().lstrip("www.")
|
||||
owner_token = primary_netloc.split(".")[0] # 'bmw'
|
||||
|
||||
if owner_token and len(owner_token) >= 3:
|
||||
domain_re = re.compile(
|
||||
r"https?://([a-z0-9][a-z0-9\-]*\.)*" + re.escape(owner_token)
|
||||
+ r"[a-z0-9\-]*\.[a-z]{2,}",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
seen_bases = {primary_base}
|
||||
for entry in doc_entries:
|
||||
text = entry.get("text") or ""
|
||||
for m in domain_re.finditer(text):
|
||||
p = urlparse(m.group(0))
|
||||
base = f"{p.scheme}://{p.netloc}/"
|
||||
base_netloc = p.netloc.lower().lstrip("www.")
|
||||
if base_netloc == primary_netloc:
|
||||
continue
|
||||
if base in seen_bases:
|
||||
continue
|
||||
seen_bases.add(base)
|
||||
crawl_bases.append(base)
|
||||
if len(crawl_bases) >= 3:
|
||||
break
|
||||
if len(crawl_bases) >= 3:
|
||||
break
|
||||
|
||||
_update(
|
||||
check_id,
|
||||
f"Suche fehlende Dokumente auf {urlparse(base).netloc}...",
|
||||
f"Suche fehlende Dokumente auf {', '.join(urlparse(b).netloc for b in crawl_bases)}...",
|
||||
18,
|
||||
)
|
||||
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=180.0) as client:
|
||||
resp = await client.post(
|
||||
f"{CONSENT_TESTER_URL}/dsi-discovery",
|
||||
json={"url": base, "max_documents": 15},
|
||||
timeout=180.0,
|
||||
)
|
||||
if resp.status_code != 200:
|
||||
logger.warning("auto-discovery: HTTP %d for %s", resp.status_code, base)
|
||||
discovered: list[dict] = []
|
||||
disc_payloads: list[dict] = []
|
||||
else:
|
||||
disc_body = resp.json()
|
||||
discovered = disc_body.get("documents", [])
|
||||
disc_payloads = disc_body.get("cmp_payloads") or []
|
||||
except Exception as e:
|
||||
logger.warning("auto-discovery failed for %s: %s", base, e)
|
||||
discovered = []
|
||||
disc_payloads = []
|
||||
discovered: list[dict] = []
|
||||
disc_payloads: list[dict] = []
|
||||
for base in crawl_bases:
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=180.0) as client:
|
||||
resp = await client.post(
|
||||
f"{CONSENT_TESTER_URL}/dsi-discovery",
|
||||
json={"url": base, "max_documents": 15},
|
||||
timeout=180.0,
|
||||
)
|
||||
if resp.status_code != 200:
|
||||
logger.warning("auto-discovery: HTTP %d for %s",
|
||||
resp.status_code, base)
|
||||
continue
|
||||
body = resp.json()
|
||||
discovered.extend(body.get("documents", []) or [])
|
||||
disc_payloads.extend(body.get("cmp_payloads") or [])
|
||||
logger.info("auto-discovery on %s: %d docs",
|
||||
base, len(body.get("documents", []) or []))
|
||||
except Exception as e:
|
||||
logger.warning("auto-discovery failed for %s: %s", base, e)
|
||||
|
||||
# Classify each discovered doc into a canonical doc_type
|
||||
by_type: dict[str, dict] = {}
|
||||
@@ -736,8 +770,12 @@ async def _check_single(
|
||||
|
||||
# Master Control checks (all MCs for this doc_type; only the LLM-enrich step stays capped)
|
||||
try:
|
||||
# max_controls=0 -> evaluate ALL MCs for this doc_type (DB has
|
||||
# 1874 across 8 types; regex matching is cheap, staying
|
||||
# well under 1s per doc). Caps remain on the LLM-enrich step
|
||||
# (top-10 FAILs) so cost stays bounded.
|
||||
mc_results = await check_document_with_controls(
|
||||
text, doc_type, label, max_controls=20, use_agent=use_agent,
|
||||
text, doc_type, label, max_controls=0, use_agent=use_agent,
|
||||
)
|
||||
if mc_results:
|
||||
for mc in mc_results:
|
||||
|
||||
Reference in New Issue
Block a user