Files
breakpilot-compliance/backend-compliance/compliance/services/doc_checks/runner.py
T
Benjamin Admin 8a44e67293 feat(compliance-check): unlock all 1874 MCs + close gap-table items
User: 'wir haben 1800 MCs erstellt um sie zu 10% zu nutzen — das ist
Schwachsinn'. Fixed all 6 gaps from the audit.

#1 max_controls=0 (was 20):
- agent_compliance_check_routes _check_single: passes max_controls=0 to
  check_document_with_controls -> ALL MCs evaluated per doc_type.
- 8 doc_types now use 1874 MCs instead of 160 (10x coverage).
- Regex matching is cheap (<1s per doc); LLM-enrich cap of 10 stays.

#2 LLM-verify fixed:
- llm_verify.py was getting 0/N parsed. Causes: qwen3 thinking-mode
  wrapped output in <think>...</think>, /api/generate doesn't enforce
  JSON, prompt didn't handle code-fence wrappers.
- Now uses /api/chat with format='json' (forces valid JSON).
- _parse_batch_response strips <think> tags, accepts {results:[...]}
  AND bare [...], adds richer regex-fallback parse, logs raw head on
  total parse failure for diagnosis.

#3 Loeschkonzept checklist (new):
- doc_checks/loeschkonzept_checks.py — 9 L1 + 7 L2 checks per DIN 66398
  + Art. 5(1)(e)/17/32 DSGVO: scope+responsibility, data categories,
  retention periods, legal basis refs (HGB/AO/BGB), deletion trigger,
  deletion process+technical+systems, deletion proof, exceptions +
  Art. 18 lock, review cycle, DSGVO references.
- runner.py registered for loeschkonzept/loeschung/loeschfristen.

#4 regulation backfill script:
- backend-compliance/scripts/backfill_mc_regulation.py — regex-detects
  DSGVO/TDDDG/TMG/BGB/HGB/AO/MStV/UWG/VSBG/PAngV/GwG/BDSG/EU-VO
  references in MC title+question+pass_criteria, UPDATEs regulation +
  article fields.
- Idempotent (only NULL rows), --dry-run flag, batched 200/UPDATE.
- Run inside container: docker exec bp-compliance-backend python3 \
    /app/scripts/backfill_mc_regulation.py

#5 MC alias-fallback:
- rag_document_checker._MC_ALIAS_FALLBACK maps doc_types without own
  MCs to a related set: nutzungsbedingungen->agb, social_media->dse,
  sub_processor/scc/tom_annex->avv, loeschfristen->loeschkonzept,
  eu_institution/dsb->dse.
- _load_controls retries with the alias when the primary query
  returns 0 rows.
- 14 additional doc_types now get MC coverage transparently.

#6 cross-domain auto-discovery:
- _autodiscover_missing builds a crawl plan: primary submitted base
  + up to 2 related domains sharing the owner SLD (e.g. BMW Group:
  bmw.de + bmwgroup.com + bmwgroup.jobs).
- Detection: regex over submitted texts for https?://...<owner>...
  hostnames distinct from the primary base.
- Each crawled base contributes documents + cmp_payloads to the
  discovery pool.

Net effect for BMW: 1874 MCs evaluated (90 from cookie alone, was
20), the Loeschkonzept mandatory items (Pflichtangaben) can now be graded,
the LLM overturns false regex FAILs, and Joint-Controller policies on
bmwgroup.jobs (Social Media) are now discoverable. The same wins will
apply to the CRA-Compliance check.
2026-05-17 13:07:50 +02:00

265 lines
11 KiB
Python
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Document check runner — two-pass L1/L2 logic.
Pass 1: Run all L1 checks ("Is it mentioned?")
Pass 2: Run L2 checks only where their L1 parent passed ("Is it correct?")
"""
import logging
import re
from .dse_checks import ART13_CHECKLIST
from .widerruf_checks import WIDERRUF_CHECKLIST
from .agb_checks import AGB_CHECKLIST
from .impressum_checks import IMPRESSUM_CHECKLIST
from .cookie_checks import COOKIE_CHECKLIST
from .social_media_checks import JOINT_CONTROLLER_CHECKLIST
from .dsfa_checks import DSFA_CHECKLIST
from .eu_institution_checks import EU_INSTITUTION_CHECKLIST
from .avv_checks import AVV_CHECKLIST
from .scc_checks import SCC_CHECKLIST
from .tom_annex_checks import TOM_ANNEX_CHECKLIST
from .sub_processor_checks import SUB_PROCESSOR_LIST_CHECKLIST
from .loeschkonzept_checks import LOESCHKONZEPT_CHECKLIST
logger = logging.getLogger(__name__)
# Alias groups: every doc_type string in a group resolves to the same
# (checklist, label) pair. Group order determines the key order of
# _CHECKLIST_MAP.
_CHECKLIST_GROUPS = (
    (("dse", "datenschutz", "privacy"),
     ART13_CHECKLIST, "Art. 13 DSGVO"),
    (("widerruf", "withdrawal", "cancellation"),
     WIDERRUF_CHECKLIST, "§355 BGB"),
    (("agb", "terms", "nutzungsbedingungen"),
     AGB_CHECKLIST, "§305ff BGB"),
    (("impressum", "imprint"),
     IMPRESSUM_CHECKLIST, "§5 TMG / §18 MStV"),
    (("cookie",),
     COOKIE_CHECKLIST, "§25 TDDDG"),
    (("social_media", "joint_controller"),
     JOINT_CONTROLLER_CHECKLIST, "Art. 26 DSGVO"),
    (("dsfa",),
     DSFA_CHECKLIST, "Art. 35 DSGVO"),
    (("eu_institution",),
     EU_INSTITUTION_CHECKLIST, "VO (EU) 2018/1725"),
    (("avv", "auftragsverarbeitung", "dpa"),
     AVV_CHECKLIST, "Art. 28 DSGVO"),
    (("scc", "standardvertragsklauseln"),
     SCC_CHECKLIST, "EU SCC 2021"),
    (("tom_annex", "tom_anlage", "tom"),
     TOM_ANNEX_CHECKLIST, "Art. 32 DSGVO"),
    (("sub_processor_list", "sub_processor", "unterauftragnehmer"),
     SUB_PROCESSOR_LIST_CHECKLIST, "Art. 28(3)(d) DSGVO"),
    (("loeschkonzept", "loeschung", "loeschfristen", "deletion_concept"),
     LOESCHKONZEPT_CHECKLIST, "Art. 5(1)(e) DSGVO / DIN 66398"),
)

# Map doc_type strings to (checklist, label)
_CHECKLIST_MAP = {
    alias: (group_checklist, group_label)
    for aliases, group_checklist, group_label in _CHECKLIST_GROUPS
    for alias in aliases
}
def _match_patterns(patterns: list[str], text_lower: str):
"""Try each regex pattern against text, return first Match or None."""
for p in patterns:
m = re.search(p, text_lower)
if m:
return m
return None
def _extract_context(text_lower: str, match) -> str:
"""Extract ~30 chars around a match for evidence display."""
if not match:
return ""
start = max(0, match.start() - 30)
end = min(len(text_lower), match.end() + 30)
return text_lower[start:end].strip()
def check_document_completeness(
    text: str,
    doc_type: str,
    doc_title: str,
    doc_url: str,
) -> list[dict]:
    """Check a legal document against its type-specific requirements.

    Two-pass approach:
      L1 — Is the mandatory field mentioned at all?
      L2 — Is it correct/complete? (only checked if the L1 parent passed)

    Args:
        text: Document text. ``None``, an empty string, or fewer than 50
            characters yields a single ``DSI-EMPTY-*`` finding.
        doc_type: Key into ``_CHECKLIST_MAP``. Unknown types fall back to
            the Art. 13 DSGVO checklist.
        doc_title: Title used in finding texts and copied into findings.
        doc_url: URL copied into every finding.

    Returns:
        A list of finding dicts. On the regular path the first entry is
        the ``DSI-SCORE-*`` summary (carrying ``all_checks``,
        ``completeness_pct`` and ``correctness_pct``), followed by one
        ``DSI-MISSING-*`` finding per failed non-INFO L1 check and one
        ``DSI-DETAIL-*`` finding per failed L2 check. The two early exits
        (empty document, short "dse" notice) return a single finding.
    """
    findings: list[dict] = []

    # Guard first: calling str methods on a None/empty text would raise
    # before the "empty document" finding could be produced.
    if not text or len(text) < 50:
        findings.append({
            "code": f"DSI-EMPTY-{doc_type.upper()}",
            "severity": "HIGH",
            "text": f"Dokument '{doc_title}' ist leer oder zu kurz fuer eine Pruefung.",
            "doc_title": doc_title, "doc_url": doc_url, "doc_type": doc_type,
        })
        return findings

    # Strip soft hyphens (U+00AD, \xad, or the literal "&shy;" entity) that
    # CMS tools insert for word-breaking — they break regex matches on
    # compound words like "Datenübertragbarkeit".
    text_clean = text.replace("\xad", "").replace("&shy;", "")
    text_lower = text_clean.lower()

    word_count = len(text.split())
    # NOTE(review): only the exact doc_type "dse" gets the short-notice
    # treatment; the aliases "datenschutz"/"privacy" map to the same
    # checklist but are fully checked — confirm whether that is intended.
    if word_count < 200 and doc_type == "dse":
        findings.append({
            "code": f"DSI-SCORE-{doc_type.upper()}",
            "severity": "LOW",
            "text": (
                f"'{doc_title}': Kurzhinweis ({word_count} Woerter) — zu kurz fuer "
                f"eine vollstaendige Art. 13 DSGVO Pruefung. Kein eigenstaendiges DSI-Dokument."
            ),
            "doc_title": doc_title, "doc_url": doc_url, "doc_type": doc_type,
            "all_checks": [],
        })
        return findings

    checklist, label = _CHECKLIST_MAP.get(
        doc_type, (ART13_CHECKLIST, "Art. 13 DSGVO")
    )
    # Checks without an explicit "level" count as L1.
    l1_checks = [c for c in checklist if c.get("level", 1) == 1]
    l2_checks = [c for c in checklist if c.get("level", 1) == 2]

    # ── Pass 1: L1 checks ────────────────────────────────────────────
    passed_l1_ids: set[str] = set()
    all_checks: list[dict] = []
    l1_present = 0
    l1_total = 0  # INFO checks are excluded from the score
    for check in l1_checks:
        is_info = check.get("severity") == "INFO"
        match = _match_patterns(check["patterns"], text_lower)
        passed = match is not None
        if passed:
            # INFO checks still unlock their L2 children when they pass.
            passed_l1_ids.add(check["id"])
        if not is_info:
            l1_total += 1
            if passed:
                l1_present += 1
            else:
                findings.append({
                    "code": f"DSI-MISSING-{check['id'].upper()}",
                    "severity": check.get("severity", "MEDIUM"),
                    "text": (
                        f"'{doc_title}': Pflichtangabe '{check['label']}' nicht gefunden. "
                        f"Erforderlich nach {label}."
                    ),
                    "doc_title": doc_title, "doc_url": doc_url,
                    "doc_type": doc_type, "check_id": check["id"],
                })
        all_checks.append({
            "id": check["id"], "label": check["label"],
            "passed": passed, "severity": check.get("severity", "MEDIUM"),
            "matched_text": _extract_context(text_lower, match),
            "level": 1, "parent": None, "skipped": False,
            "hint": check.get("hint", ""),
        })

    # ── Pass 2: L2 checks (only if parent L1 passed) ─────────────────
    l2_total = 0
    l2_passed = 0
    for check in l2_checks:
        parent = check.get("parent")
        # An L2 check without a (passed) parent is recorded but not scored.
        skipped = parent not in passed_l1_ids
        passed = False
        matched_text = ""
        if not skipped:
            l2_total += 1
            match = _match_patterns(check["patterns"], text_lower)
            passed = match is not None
            if passed:
                l2_passed += 1
                matched_text = _extract_context(text_lower, match)
            else:
                findings.append({
                    "code": f"DSI-DETAIL-{check['id'].upper()}",
                    "severity": check.get("severity", "MEDIUM"),
                    "text": (
                        f"'{doc_title}': Detailpruefung '{check['label']}' "
                        f"nicht bestanden. Empfohlen nach {label}."
                    ),
                    "doc_title": doc_title, "doc_url": doc_url,
                    "doc_type": doc_type, "check_id": check["id"],
                })
        all_checks.append({
            "id": check["id"], "label": check["label"],
            "passed": passed, "severity": check.get("severity", "MEDIUM"),
            "matched_text": matched_text,
            "level": 2, "parent": parent, "skipped": skipped,
            "hint": check.get("hint", ""),
        })

    # ── Summary ───────────────────────────────────────────────────────
    completeness_pct = round(l1_present / l1_total * 100) if l1_total else 0
    correctness_pct = round(l2_passed / l2_total * 100) if l2_total else 0
    severity = (
        "OK" if completeness_pct == 100
        else "LOW" if completeness_pct >= 80
        else "MEDIUM" if completeness_pct >= 50
        else "HIGH"
    )
    summary_text = (
        f"'{doc_title}': {l1_present}/{l1_total} Pflichtangaben vorhanden "
        f"({completeness_pct}%)."
    )
    if completeness_pct < 100:
        summary_text += f" Fehlend: {l1_total - l1_present} Angaben nach {label}."
    if l2_total > 0:
        summary_text += (
            f" Detailpruefung: {l2_passed}/{l2_total} bestanden "
            f"({correctness_pct}%)."
        )
    # The summary always comes first, ahead of the individual findings.
    findings.insert(0, {
        "code": f"DSI-SCORE-{doc_type.upper()}",
        "severity": severity,
        "text": summary_text,
        "doc_title": doc_title, "doc_url": doc_url, "doc_type": doc_type,
        "all_checks": all_checks,
        "completeness_pct": completeness_pct,
        "correctness_pct": correctness_pct,
    })
    return findings
def classify_document_type(title: str, url: str) -> str:
    """Map a document's title/URL to a legal document type by keyword.

    Title and URL are joined and lowercased, then tested against keyword
    rules in a fixed order; the first rule that hits wins. Returns
    "other" when no rule matches.
    """
    haystack = f"{title} {url}".lower()

    def mentions(*keywords: str) -> bool:
        """True when at least one keyword occurs as a substring."""
        return any(keyword in haystack for keyword in keywords)

    if mentions("datenschutzfolge", "dsfa", "risikoanalyse für nutzung"):
        return "dsfa"

    # A social-media hit only counts when the page is also a privacy notice;
    # otherwise classification continues with the rules below.
    is_social = mentions("social media", "facebook", "instagram", "linkedin", "fanpage")
    if is_social and mentions("datenschutzerkl", "datenschutz für", "datenschutzinformation"):
        return "social_media"

    ordered_rules = (
        # EU institution check BEFORE generic privacy — 2018/1725 is more specific
        ("eu_institution", ("2018/1725", "2018 1725", "regulation (eu)",
                            "verordnung (eu)", "edsb", "edps",
                            "european data protection supervisor")),
        ("dse", ("datenschutz", "privacy", "dsgvo", "data protection", "données")),
        ("widerruf", ("widerruf", "withdrawal", "rétractation", "desistimiento")),
        ("agb", ("agb", "allgemeine geschäftsbedingungen", "terms",
                 "nutzungsbedingungen", "conditions")),
        ("cookie", ("cookie", "slapuk", "evästeet", "kakor")),
        ("impressum", ("impressum", "imprint", "legal notice", "mentions légales")),
    )
    for doc_type, keywords in ordered_rules:
        if mentions(*keywords):
            return doc_type
    return "other"