8a44e67293
User: 'wir haben 1800 MCs erstellt um sie zu 10% zu nutzen — das ist Schwachsinn'. Fixed all 6 gaps from the audit. #1 max_controls=0 (was 20): - agent_compliance_check_routes _check_single: passes max_controls=0 to check_document_with_controls -> ALL MCs evaluated per doc_type. - 8 doc_types now use 1874 MCs instead of 160 (10x coverage). - Regex matching is cheap (<1s per doc); LLM-enrich cap of 10 stays. #2 LLM-verify fixed: - llm_verify.py was getting 0/N parsed. Causes: qwen3 thinking-mode wrapped output in <think>...</think>, /api/generate doesn't enforce JSON, prompt didn't handle code-fence wrappers. - Now uses /api/chat with format='json' (forces valid JSON). - _parse_batch_response strips <think> tags, accepts {results:[...]} AND bare [...], adds richer regex-fallback parse, logs raw head on total parse failure for diagnosis. #3 Loeschkonzept checklist (new): - doc_checks/loeschkonzept_checks.py — 9 L1 + 7 L2 checks per DIN 66398 + Art. 5(1)(e)/17/32 DSGVO: scope+responsibility, data categories, retention periods, legal basis refs (HGB/AO/BGB), deletion trigger, deletion process+technical+systems, deletion proof, exceptions + Art. 18 lock, review cycle, DSGVO references. - runner.py registered for loeschkonzept/loeschung/loeschfristen. #4 regulation backfill script: - backend-compliance/scripts/backfill_mc_regulation.py — regex-detects DSGVO/TDDDG/TMG/BGB/HGB/AO/MStV/UWG/VSBG/PAngV/GwG/BDSG/EU-VO references in MC title+question+pass_criteria, UPDATEs regulation + article fields. - Idempotent (only NULL rows), --dry-run flag, batched 200/UPDATE. - Run inside container: docker exec bp-compliance-backend python3 \ /app/scripts/backfill_mc_regulation.py #5 MC alias-fallback: - rag_document_checker._MC_ALIAS_FALLBACK maps doc_types without own MCs to a related set: nutzungsbedingungen->agb, social_media->dse, sub_processor/scc/tom_annex->avv, loeschfristen->loeschkonzept, eu_institution/dsb->dse. 
- _load_controls retries with the alias when the primary query returns 0 rows. - 14 additional doc_types now get MC coverage transparently. #6 cross-domain auto-discovery: - _autodiscover_missing builds a crawl plan: primary submitted base + up to 2 related domains sharing the owner SLD (e.g. BMW Group: bmw.de + bmwgroup.com + bmwgroup.jobs). - Detection: regex over submitted texts for https?://...<owner>... hostnames distinct from the primary base. - Each crawled base contributes documents + cmp_payloads to the discovery pool. Net effect for BMW: 1874 MCs evaluated (90 from cookie alone, was 20), Loeschkonzept Pflichtangaben benoten-bar, LLM overturns false regex FAILs, Joint-Controller policies on bmwgroup.jobs (Social Media) jetzt entdeckbar. Same wins will apply to CRA-Compliance check.
265 lines
11 KiB
Python
265 lines
11 KiB
Python
"""
Document check runner — two-pass L1/L2 logic.

Pass 1: Run all L1 checks ("Is it mentioned?")
Pass 2: Run L2 checks only where their L1 parent passed ("Is it correct?")
"""
|
||
|
||
import logging
|
||
import re
|
||
|
||
from .dse_checks import ART13_CHECKLIST
|
||
from .widerruf_checks import WIDERRUF_CHECKLIST
|
||
from .agb_checks import AGB_CHECKLIST
|
||
from .impressum_checks import IMPRESSUM_CHECKLIST
|
||
from .cookie_checks import COOKIE_CHECKLIST
|
||
from .social_media_checks import JOINT_CONTROLLER_CHECKLIST
|
||
from .dsfa_checks import DSFA_CHECKLIST
|
||
from .eu_institution_checks import EU_INSTITUTION_CHECKLIST
|
||
from .avv_checks import AVV_CHECKLIST
|
||
from .scc_checks import SCC_CHECKLIST
|
||
from .tom_annex_checks import TOM_ANNEX_CHECKLIST
|
||
from .sub_processor_checks import SUB_PROCESSOR_LIST_CHECKLIST
|
||
from .loeschkonzept_checks import LOESCHKONZEPT_CHECKLIST
|
||
|
||
logger = logging.getLogger(__name__)


# Registry: doc_type string -> (checklist, legal-basis label).
# Each checklist is declared once together with every doc_type alias that
# resolves to it; the comprehension expands that into a flat lookup dict.
_CHECKLIST_MAP = {
    alias: (checklist, label)
    for checklist, label, aliases in (
        (ART13_CHECKLIST, "Art. 13 DSGVO",
         ("dse", "datenschutz", "privacy")),
        (WIDERRUF_CHECKLIST, "§355 BGB",
         ("widerruf", "withdrawal", "cancellation")),
        (AGB_CHECKLIST, "§305ff BGB",
         ("agb", "terms", "nutzungsbedingungen")),
        (IMPRESSUM_CHECKLIST, "§5 TMG / §18 MStV",
         ("impressum", "imprint")),
        (COOKIE_CHECKLIST, "§25 TDDDG",
         ("cookie",)),
        (JOINT_CONTROLLER_CHECKLIST, "Art. 26 DSGVO",
         ("social_media", "joint_controller")),
        (DSFA_CHECKLIST, "Art. 35 DSGVO",
         ("dsfa",)),
        (EU_INSTITUTION_CHECKLIST, "VO (EU) 2018/1725",
         ("eu_institution",)),
        (AVV_CHECKLIST, "Art. 28 DSGVO",
         ("avv", "auftragsverarbeitung", "dpa")),
        (SCC_CHECKLIST, "EU SCC 2021",
         ("scc", "standardvertragsklauseln")),
        (TOM_ANNEX_CHECKLIST, "Art. 32 DSGVO",
         ("tom_annex", "tom_anlage", "tom")),
        (SUB_PROCESSOR_LIST_CHECKLIST, "Art. 28(3)(d) DSGVO",
         ("sub_processor_list", "sub_processor", "unterauftragnehmer")),
        (LOESCHKONZEPT_CHECKLIST, "Art. 5(1)(e) DSGVO / DIN 66398",
         ("loeschkonzept", "loeschung", "loeschfristen", "deletion_concept")),
    )
    for alias in aliases
}
|
||
|
||
|
||
def _match_patterns(patterns: list[str], text_lower: str):
    """Search *text_lower* with each regex in order.

    Returns the ``re.Match`` of the first pattern that hits, or ``None`` when
    no pattern matches (including when *patterns* is empty).
    """
    # Lazy generator: patterns after the first hit are never evaluated.
    hits = (re.search(pattern, text_lower) for pattern in patterns)
    return next((hit for hit in hits if hit), None)
|
||
|
||
|
||
def _extract_context(text_lower: str, match) -> str:
    """Return the matched span plus up to 30 characters on either side.

    The window is clamped to the bounds of *text_lower* and stripped of
    surrounding whitespace. A falsy *match* yields an empty string.
    """
    if not match:
        return ""
    hit_start, hit_end = match.span()
    window_start = max(hit_start - 30, 0)
    window_end = min(hit_end + 30, len(text_lower))
    snippet = text_lower[window_start:window_end]
    return snippet.strip()
|
||
|
||
|
||
def check_document_completeness(
    text: str,
    doc_type: str,
    doc_title: str,
    doc_url: str,
) -> list[dict]:
    """Check a legal document against its type-specific requirements.

    Two-pass approach:
      L1 — Is the mandatory field mentioned at all?
      L2 — Is it correct/complete? (only checked if L1 parent passed)

    Args:
        text: Plain text of the document. ``None``, empty or very short
            (< 50 characters) input produces a single ``DSI-EMPTY-*`` finding.
        doc_type: Key into ``_CHECKLIST_MAP``; unknown types fall back to the
            Art. 13 DSGVO checklist.
        doc_title: Title used in finding texts.
        doc_url: URL copied into every finding.

    Returns:
        A list of finding dicts. On a full run the first entry is the
        ``DSI-SCORE-*`` summary (carrying ``all_checks``, ``completeness_pct``
        and ``correctness_pct``), followed by one entry per failed L1 check
        (``DSI-MISSING-*``) and per failed L2 check (``DSI-DETAIL-*``).
    """
    findings: list[dict] = []

    # Guard first: the emptiness check must run before any str method is
    # called, otherwise a None text raises instead of being reported.
    if not text or len(text) < 50:
        findings.append({
            "code": f"DSI-EMPTY-{doc_type.upper()}",
            "severity": "HIGH",
            "text": f"Dokument '{doc_title}' ist leer oder zu kurz fuer eine Pruefung.",
            "doc_title": doc_title, "doc_url": doc_url, "doc_type": doc_type,
        })
        return findings

    # Short privacy notices are not scored — they are not standalone documents.
    word_count = len(text.split())
    if word_count < 200 and doc_type == "dse":
        findings.append({
            "code": f"DSI-SCORE-{doc_type.upper()}",
            "severity": "LOW",
            "text": (
                f"'{doc_title}': Kurzhinweis ({word_count} Woerter) — zu kurz fuer "
                f"eine vollstaendige Art. 13 DSGVO Pruefung. Kein eigenstaendiges DSI-Dokument."
            ),
            "doc_title": doc_title, "doc_url": doc_url, "doc_type": doc_type,
            "all_checks": [],
        })
        return findings

    # Strip soft hyphens (U+00AD) that CMS tools insert for word-breaking —
    # they break regex matches on compound words like "Datenübertragbarkeit".
    # The undecoded HTML entity form is removed as well.
    text_clean = text.replace("\xad", "").replace("&shy;", "")
    text_lower = text_clean.lower()

    checklist, label = _CHECKLIST_MAP.get(
        doc_type, (ART13_CHECKLIST, "Art. 13 DSGVO")
    )

    l1_checks = [c for c in checklist if c.get("level", 1) == 1]
    l2_checks = [c for c in checklist if c.get("level", 1) == 2]

    # ── Pass 1: L1 checks ────────────────────────────────────────────
    passed_l1_ids: set[str] = set()
    all_checks: list[dict] = []
    l1_present = 0
    l1_scoreable = 0  # INFO checks are reported but excluded from the score

    for check in l1_checks:
        is_info = check.get("severity") == "INFO"
        match = _match_patterns(check["patterns"], text_lower)
        passed = match is not None

        if passed:
            # INFO checks still unlock their L2 children.
            passed_l1_ids.add(check["id"])
        if not is_info:
            l1_scoreable += 1
            if passed:
                l1_present += 1
            else:
                findings.append({
                    "code": f"DSI-MISSING-{check['id'].upper()}",
                    "severity": check.get("severity", "MEDIUM"),
                    "text": (
                        f"'{doc_title}': Pflichtangabe '{check['label']}' nicht gefunden. "
                        f"Erforderlich nach {label}."
                    ),
                    "doc_title": doc_title, "doc_url": doc_url,
                    "doc_type": doc_type, "check_id": check["id"],
                })

        all_checks.append({
            "id": check["id"], "label": check["label"],
            "passed": passed, "severity": check.get("severity", "MEDIUM"),
            "matched_text": _extract_context(text_lower, match),
            "level": 1, "parent": None, "skipped": False,
            "hint": check.get("hint", ""),
        })

    # ── Pass 2: L2 checks (only if parent L1 passed) ─────────────────
    l2_total = 0
    l2_passed = 0

    for check in l2_checks:
        parent = check.get("parent")
        skipped = parent not in passed_l1_ids
        passed = False
        matched_text = ""

        if not skipped:
            l2_total += 1
            match = _match_patterns(check["patterns"], text_lower)
            passed = match is not None
            if passed:
                l2_passed += 1
                matched_text = _extract_context(text_lower, match)
            else:
                findings.append({
                    "code": f"DSI-DETAIL-{check['id'].upper()}",
                    "severity": check.get("severity", "MEDIUM"),
                    "text": (
                        f"'{doc_title}': Detailpruefung '{check['label']}' "
                        f"nicht bestanden. Empfohlen nach {label}."
                    ),
                    "doc_title": doc_title, "doc_url": doc_url,
                    "doc_type": doc_type, "check_id": check["id"],
                })

        # Skipped checks are still listed so consumers see the full checklist.
        all_checks.append({
            "id": check["id"], "label": check["label"],
            "passed": passed, "severity": check.get("severity", "MEDIUM"),
            "matched_text": matched_text,
            "level": 2, "parent": parent, "skipped": skipped,
            "hint": check.get("hint", ""),
        })

    # ── Summary ───────────────────────────────────────────────────────
    l1_total = l1_scoreable  # percentage base excludes INFO checks
    completeness_pct = round(l1_present / l1_total * 100) if l1_total else 0
    correctness_pct = round(l2_passed / l2_total * 100) if l2_total else 0

    if completeness_pct == 100:
        severity = "OK"
    elif completeness_pct >= 80:
        severity = "LOW"
    elif completeness_pct >= 50:
        severity = "MEDIUM"
    else:
        severity = "HIGH"

    summary_text = (
        f"'{doc_title}': {l1_present}/{l1_total} Pflichtangaben vorhanden "
        f"({completeness_pct}%)."
    )
    if completeness_pct < 100:
        summary_text += f" Fehlend: {l1_total - l1_present} Angaben nach {label}."
    if l2_total > 0:
        summary_text += (
            f" Detailpruefung: {l2_passed}/{l2_total} bestanden "
            f"({correctness_pct}%)."
        )

    # The summary always leads the list of findings.
    findings.insert(0, {
        "code": f"DSI-SCORE-{doc_type.upper()}",
        "severity": severity,
        "text": summary_text,
        "doc_title": doc_title, "doc_url": doc_url, "doc_type": doc_type,
        "all_checks": all_checks,
        "completeness_pct": completeness_pct,
        "correctness_pct": correctness_pct,
    })

    return findings
|
||
|
||
|
||
def classify_document_type(title: str, url: str) -> str:
    """Derive a legal document type from a document's title and URL.

    Title and URL are joined and lower-cased, then tested for keyword
    substrings in a fixed order of precedence; the first rule that matches
    determines the result. Returns ``"other"`` when no rule matches.
    """
    haystack = f"{title} {url}".lower()

    def mentions(*keywords: str) -> bool:
        """True if any keyword occurs as a substring of the title/URL text."""
        return any(keyword in haystack for keyword in keywords)

    if mentions("datenschutzfolge", "dsfa", "risikoanalyse für nutzung"):
        return "dsfa"

    # Social-media pages only count when they are also privacy notices;
    # otherwise they fall through to the generic rules below.
    if mentions("social media", "facebook", "instagram", "linkedin", "fanpage") and mentions(
        "datenschutzerkl", "datenschutz für", "datenschutzinformation"
    ):
        return "social_media"

    ordered_rules = (
        # EU institution check BEFORE generic privacy — 2018/1725 is more specific
        ("eu_institution", ("2018/1725", "2018 1725", "regulation (eu)",
                            "verordnung (eu)", "edsb", "edps",
                            "european data protection supervisor")),
        ("dse", ("datenschutz", "privacy", "dsgvo", "data protection", "données")),
        ("widerruf", ("widerruf", "withdrawal", "rétractation", "desistimiento")),
        ("agb", ("agb", "allgemeine geschäftsbedingungen", "terms",
                 "nutzungsbedingungen", "conditions")),
        ("cookie", ("cookie", "slapuk", "evästeet", "kakor")),
        ("impressum", ("impressum", "imprint", "legal notice", "mentions légales")),
    )
    for doc_type, keywords in ordered_rules:
        if mentions(*keywords):
            return doc_type
    return "other"
|