"""Document check runner — two-pass L1/L2 logic.

Pass 1: Run all L1 checks ("Is it mentioned?")
Pass 2: Run L2 checks only where their L1 parent passed ("Is it correct?")
"""

import logging
import re

from .dse_checks import ART13_CHECKLIST
from .widerruf_checks import WIDERRUF_CHECKLIST
from .agb_checks import AGB_CHECKLIST
from .impressum_checks import IMPRESSUM_CHECKLIST
from .cookie_checks import COOKIE_CHECKLIST
from .social_media_checks import JOINT_CONTROLLER_CHECKLIST
from .dsfa_checks import DSFA_CHECKLIST

logger = logging.getLogger(__name__)

# Map doc_type strings to (checklist, label).
# Several aliases point at the same checklist; the label is the legal basis
# that gets interpolated into the finding texts.
_CHECKLIST_MAP = {
    "dse": (ART13_CHECKLIST, "Art. 13 DSGVO"),
    "datenschutz": (ART13_CHECKLIST, "Art. 13 DSGVO"),
    "privacy": (ART13_CHECKLIST, "Art. 13 DSGVO"),
    "widerruf": (WIDERRUF_CHECKLIST, "§355 BGB"),
    "withdrawal": (WIDERRUF_CHECKLIST, "§355 BGB"),
    "cancellation": (WIDERRUF_CHECKLIST, "§355 BGB"),
    "agb": (AGB_CHECKLIST, "§305ff BGB"),
    "terms": (AGB_CHECKLIST, "§305ff BGB"),
    "nutzungsbedingungen": (AGB_CHECKLIST, "§305ff BGB"),
    "impressum": (IMPRESSUM_CHECKLIST, "§5 TMG / §18 MStV"),
    "imprint": (IMPRESSUM_CHECKLIST, "§5 TMG / §18 MStV"),
    "cookie": (COOKIE_CHECKLIST, "§25 TDDDG"),
    "social_media": (JOINT_CONTROLLER_CHECKLIST, "Art. 26 DSGVO"),
    "joint_controller": (JOINT_CONTROLLER_CHECKLIST, "Art. 26 DSGVO"),
    "dsfa": (DSFA_CHECKLIST, "Art. 35 DSGVO"),
}


def _match_patterns(patterns: list[str], text_lower: str) -> "re.Match[str] | None":
    """Try each regex pattern against text, return first Match or None.

    Patterns are tried in list order with ``re.search``; the first one that
    matches anywhere in ``text_lower`` wins.
    """
    for p in patterns:
        m = re.search(p, text_lower)
        if m:
            return m
    return None


def _extract_context(text_lower: str, match: "re.Match[str] | None") -> str:
    """Extract the match plus up to 30 chars on either side for evidence display.

    Returns an empty string when ``match`` is falsy. The window is clamped to
    the bounds of ``text_lower`` and stripped of surrounding whitespace.
    """
    if not match:
        return ""
    start = max(0, match.start() - 30)
    end = min(len(text_lower), match.end() + 30)
    return text_lower[start:end].strip()


def check_document_completeness(
    text: str,
    doc_type: str,
    doc_title: str,
    doc_url: str,
) -> list[dict]:
    """Check a legal document against its type-specific requirements.

    Two-pass approach:
        L1 — Is the mandatory field mentioned at all?
        L2 — Is it correct/complete? (only checked if L1 parent passed)

    Args:
        text: Full document text. Falsy values (``None``, ``""``) and texts
            shorter than 50 characters produce a single ``DSI-EMPTY-*`` finding.
        doc_type: Key into ``_CHECKLIST_MAP``; unknown types fall back to the
            Art. 13 DSGVO checklist.
        doc_title: Title used in finding texts and echoed in every finding.
        doc_url: URL echoed in every finding.

    Returns:
        A list of finding dicts. On a regular run the first entry is the
        ``DSI-SCORE-*`` summary (including ``all_checks``, ``completeness_pct``
        and ``correctness_pct``), followed by one ``DSI-MISSING-*`` finding per
        failed L1 check and one ``DSI-DETAIL-*`` finding per failed,
        non-skipped L2 check.
    """
    findings = []

    # Validate before touching the text: calling .lower() first would raise
    # AttributeError on None instead of reporting the document as empty.
    if not text or len(text) < 50:
        findings.append({
            "code": f"DSI-EMPTY-{doc_type.upper()}",
            "severity": "HIGH",
            "text": f"Dokument '{doc_title}' ist leer oder zu kurz fuer eine Pruefung.",
            "doc_title": doc_title,
            "doc_url": doc_url,
            "doc_type": doc_type,
        })
        return findings

    # All patterns are matched against the lower-cased text.
    text_lower = text.lower()

    # A very short privacy notice is reported as a low-severity short notice
    # instead of being run through the full checklist.
    word_count = len(text.split())
    if word_count < 200 and doc_type == "dse":
        findings.append({
            "code": f"DSI-SCORE-{doc_type.upper()}",
            "severity": "LOW",
            "text": (
                f"'{doc_title}': Kurzhinweis ({word_count} Woerter) — zu kurz fuer "
                f"eine vollstaendige Art. 13 DSGVO Pruefung. Kein eigenstaendiges DSI-Dokument."
            ),
            "doc_title": doc_title,
            "doc_url": doc_url,
            "doc_type": doc_type,
            "all_checks": [],
        })
        return findings

    entry = _CHECKLIST_MAP.get(doc_type, (ART13_CHECKLIST, "Art. 13 DSGVO"))
    checklist, label = entry

    # Checks without an explicit "level" count as level 1.
    l1_checks = [c for c in checklist if c.get("level", 1) == 1]
    l2_checks = [c for c in checklist if c.get("level", 1) == 2]

    # ── Pass 1: L1 checks ────────────────────────────────────────────
    passed_l1_ids: set[str] = set()
    all_checks: list[dict] = []
    l1_present = 0

    for check in l1_checks:
        match = _match_patterns(check["patterns"], text_lower)
        passed = match is not None
        if passed:
            passed_l1_ids.add(check["id"])
            l1_present += 1
        else:
            findings.append({
                "code": f"DSI-MISSING-{check['id'].upper()}",
                "severity": check.get("severity", "MEDIUM"),
                "text": (
                    f"'{doc_title}': Pflichtangabe '{check['label']}' nicht gefunden. "
                    f"Erforderlich nach {label}."
                ),
                "doc_title": doc_title,
                "doc_url": doc_url,
                "doc_type": doc_type,
                "check_id": check["id"],
            })
        all_checks.append({
            "id": check["id"],
            "label": check["label"],
            "passed": passed,
            "severity": check.get("severity", "MEDIUM"),
            "matched_text": _extract_context(text_lower, match),
            "level": 1,
            "parent": None,
            "skipped": False,
            "hint": check.get("hint", ""),
        })

    # ── Pass 2: L2 checks (only if parent L1 passed) ─────────────────
    l2_total = 0
    l2_passed = 0

    for check in l2_checks:
        parent = check.get("parent")
        # An L2 check whose parent is missing or failed is recorded as
        # skipped and does not count towards the correctness score.
        skipped = parent not in passed_l1_ids
        passed = False
        matched_text = ""
        if not skipped:
            l2_total += 1
            match = _match_patterns(check["patterns"], text_lower)
            passed = match is not None
            if passed:
                l2_passed += 1
                matched_text = _extract_context(text_lower, match)
            else:
                findings.append({
                    "code": f"DSI-DETAIL-{check['id'].upper()}",
                    "severity": check.get("severity", "MEDIUM"),
                    "text": (
                        f"'{doc_title}': Detailpruefung '{check['label']}' "
                        f"nicht bestanden. Empfohlen nach {label}."
                    ),
                    "doc_title": doc_title,
                    "doc_url": doc_url,
                    "doc_type": doc_type,
                    "check_id": check["id"],
                })
        all_checks.append({
            "id": check["id"],
            "label": check["label"],
            "passed": passed,
            "severity": check.get("severity", "MEDIUM"),
            "matched_text": matched_text,
            "level": 2,
            "parent": parent,
            "skipped": skipped,
            "hint": check.get("hint", ""),
        })

    # ── Summary ──────────────────────────────────────────────────────
    l1_total = len(l1_checks)
    # Both percentages fall back to 0 when there is nothing to divide by.
    completeness_pct = round(l1_present / l1_total * 100) if l1_total else 0
    correctness_pct = round(l2_passed / l2_total * 100) if l2_total else 0

    # Overall severity is driven by L1 completeness only.
    severity = (
        "OK" if completeness_pct == 100
        else "LOW" if completeness_pct >= 80
        else "MEDIUM" if completeness_pct >= 50
        else "HIGH"
    )

    summary_text = (
        f"'{doc_title}': {l1_present}/{l1_total} Pflichtangaben vorhanden "
        f"({completeness_pct}%)."
    )
    if completeness_pct < 100:
        summary_text += f" Fehlend: {l1_total - l1_present} Angaben nach {label}."
    if l2_total > 0:
        summary_text += (
            f" Detailpruefung: {l2_passed}/{l2_total} bestanden "
            f"({correctness_pct}%)."
        )

    # The summary always goes first, ahead of the individual findings.
    findings.insert(0, {
        "code": f"DSI-SCORE-{doc_type.upper()}",
        "severity": severity,
        "text": summary_text,
        "doc_title": doc_title,
        "doc_url": doc_url,
        "doc_type": doc_type,
        "all_checks": all_checks,
        "completeness_pct": completeness_pct,
        "correctness_pct": correctness_pct,
    })
    return findings


def classify_document_type(title: str, url: str) -> str:
    """Classify a document by its title/URL into a legal document type.

    Title and URL are joined, lower-cased and scanned for keyword substrings.
    The rules are evaluated in a fixed order and the first hit wins, so e.g.
    a title containing both a privacy keyword and "cookie" is classified as
    "dse". Returns "other" when no rule matches.
    """
    combined = f"{title} {url}".lower()

    def has_any(keywords: list[str]) -> bool:
        # Plain substring test, not word-boundary matching.
        return any(kw in combined for kw in keywords)

    if has_any(["datenschutzfolge", "dsfa", "risikoanalyse für nutzung"]):
        return "dsfa"
    # Social-media pages only count as such when they are also explicitly a
    # privacy notice; otherwise they fall through to the rules below.
    if has_any(["social media", "facebook", "instagram", "linkedin", "fanpage"]) and has_any(
        ["datenschutzerkl", "datenschutz für", "datenschutzinformation"]
    ):
        return "social_media"
    if has_any(["datenschutz", "privacy", "dsgvo", "data protection", "données"]):
        return "dse"
    if has_any(["widerruf", "withdrawal", "rétractation", "desistimiento"]):
        return "widerruf"
    if has_any(["agb", "allgemeine geschäftsbedingungen", "terms", "nutzungsbedingungen", "conditions"]):
        return "agb"
    if has_any(["cookie", "slapuk", "evästeet", "kakor"]):
        return "cookie"
    if has_any(["impressum", "imprint", "legal notice", "mentions légales"]):
        return "impressum"
    return "other"