""" DSI Document Checker — validates discovered legal documents against mandatory content requirements. Checks each document type against its specific legal requirements: - Datenschutzinformation: Art. 13/14 DSGVO (9 Pflichtangaben) - AGB: §305ff BGB - Widerrufsbelehrung: §355, §312g BGB - Cookie-Richtlinie: §25 TDDDG - Impressum: §5 TMG / §18 MStV """ import logging import re logger = logging.getLogger(__name__) # Art. 13 DSGVO mandatory fields for privacy policies ART13_CHECKLIST = [ { "id": "controller", "label": "Verantwortlicher (Art. 13(1)(a))", "patterns": [ r"verantwortlich\w*\s+(?:ist|im sinne|fuer)", r"controller", r"verantwortliche\s+stelle", r"responsible\s+(?:party|for)", ], "severity": "HIGH", }, { "id": "dpo", "label": "Datenschutzbeauftragter (Art. 13(1)(b))", "patterns": [ r"datenschutzbeauftragt", r"data\s+protection\s+officer", r"dsb", r"dpo", ], "severity": "MEDIUM", }, { "id": "purposes", "label": "Zwecke der Verarbeitung (Art. 13(1)(c))", "patterns": [ r"zweck\w*\s+(?:der|die)\s+(?:verarbeitung|datenerhebung|datenverarbeitung)", r"purpose\w*\s+(?:of|for)\s+(?:processing|data)", r"zu\s+welch\w+\s+zweck", ], "severity": "HIGH", }, { "id": "legal_basis", "label": "Rechtsgrundlage (Art. 13(1)(c))", "patterns": [ r"rechtsgrundlage", r"art\.\s*6\s*(?:abs|absatz)?\s*\.?\s*1", r"legal\s+basis", r"berechtigtes\s+interesse", ], "severity": "HIGH", }, { "id": "recipients", "label": "Empfaenger (Art. 13(1)(e))", "patterns": [ r"empf(?:ae|ä)nger", r"(?:ueber|weiter)mitt(?:el|l)ung", r"recipient", r"weitergabe\s+(?:an|von)\s+daten", r"dritte", r"third\s+part", ], "severity": "MEDIUM", }, { "id": "third_country", "label": "Drittlandtransfer (Art. 13(1)(f))", "patterns": [ r"drittland", r"dritt\s*staat", r"drittl(?:ae|ä)nder", r"third\s+countr", r"angemessenheitsbeschluss", r"standard\s*vertragsklausel", r"scc", ], "severity": "MEDIUM", }, { "id": "retention", "label": "Speicherdauer (Art. 13(2)(a))", "patterns": [ r"speicherdauer", r"aufbewahrungsfrist", r"(?:wie\s+lange|dauer)\s+(?:werden|gespeichert)", r"retention\s+period", r"l(?:oe|ö)sch(?:ung|frist|konzept)", ], "severity": "HIGH", }, { "id": "rights", "label": "Betroffenenrechte (Art. 13(2)(b))", "patterns": [ r"recht\s+auf\s+auskunft", r"recht\s+auf\s+l(?:oe|ö)schung", r"recht\s+auf\s+berichtigung", r"widerspruchsrecht", r"art\.\s*1[5-9]", r"art\.\s*2[0-2]", r"right\s+to\s+(?:access|erasure|rectification|object)", ], "severity": "HIGH", }, { "id": "complaint", "label": "Beschwerderecht (Art. 13(2)(d))", "patterns": [ r"beschwerderecht", r"aufsichtsbeh(?:oe|ö)rde", r"right\s+to\s+lodge\s+a\s+complaint", r"supervisory\s+authority", r"datenschutzbeh(?:oe|ö)rde", ], "severity": "MEDIUM", }, ] # §355 BGB requirements for cancellation/withdrawal policies WIDERRUF_CHECKLIST = [ {"id": "right_info", "label": "Belehrung ueber Widerrufsrecht", "patterns": [r"widerrufsrecht", r"right\s+of\s+withdrawal", r"recht\s+(?:zum|auf)\s+widerruf"]}, {"id": "deadline", "label": "Widerrufsfrist (14 Tage)", "patterns": [r"14\s+tage", r"vierzehn\s+tage", r"14\s+days", r"fourteen\s+days"]}, {"id": "form", "label": "Form des Widerrufs", "patterns": [r"widerrufsformular", r"muster.?widerruf", r"withdrawal\s+form", r"formular"]}, {"id": "consequences", "label": "Folgen des Widerrufs", "patterns": [r"folgen\s+des\s+widerrufs", r"consequences\s+of\s+withdrawal", r"rueckerstattung"]}, ] # AGB minimal requirements AGB_CHECKLIST = [ {"id": "scope", "label": "Geltungsbereich", "patterns": [r"geltungsbereich", r"geltung", r"scope", r"diese\s+(?:agb|bedingungen)\s+gelten"]}, {"id": "contract", "label": "Vertragsschluss", "patterns": [r"vertragsschluss", r"zustandekommen", r"contract\s+formation", r"angebot\s+und\s+annahme"]}, {"id": "liability", "label": "Haftung", "patterns": [r"haftung", r"liability", r"schadensersatz", r"haftungsbeschr(?:ae|ä)nkung"]}, {"id": "jurisdiction", "label": "Gerichtsstand / Anwendbares Recht", "patterns": [r"gerichtsstand", r"anwendbares\s+recht", r"jurisdiction", r"governing\s+law"]}, ] def check_document_completeness( text: str, doc_type: str, doc_title: str, doc_url: str, ) -> list[dict]: """Check a legal document against its type-specific requirements. Returns a list of findings (missing/present fields). """ findings = [] text_lower = text.lower() if not text or len(text) < 50: findings.append({ "code": f"DSI-EMPTY-{doc_type.upper()}", "severity": "HIGH", "text": f"Dokument '{doc_title}' ist leer oder zu kurz fuer eine Pruefung.", "doc_title": doc_title, "doc_url": doc_url, "doc_type": doc_type, }) return findings # Select checklist based on document type if doc_type in ("dse", "datenschutz", "privacy"): checklist = ART13_CHECKLIST label = "Art. 13 DSGVO" elif doc_type in ("widerruf", "withdrawal", "cancellation"): checklist = WIDERRUF_CHECKLIST label = "§355 BGB" elif doc_type in ("agb", "terms", "nutzungsbedingungen"): checklist = AGB_CHECKLIST label = "§305ff BGB" else: checklist = ART13_CHECKLIST # Default: check as DSE label = "Art. 13 DSGVO" present = 0 total = len(checklist) for check in checklist: found = any(re.search(p, text_lower) for p in check["patterns"]) if not found: findings.append({ "code": f"DSI-MISSING-{check['id'].upper()}", "severity": check.get("severity", "MEDIUM"), "text": ( f"'{doc_title}': Pflichtangabe '{check['label']}' nicht gefunden. " f"Erforderlich nach {label}." ), "doc_title": doc_title, "doc_url": doc_url, "doc_type": doc_type, "check_id": check["id"], }) else: present += 1 # Add summary finding if total > 0: pct = round(present / total * 100) if pct < 100: findings.insert(0, { "code": f"DSI-SCORE-{doc_type.upper()}", "severity": "LOW" if pct >= 80 else "MEDIUM" if pct >= 50 else "HIGH", "text": ( f"'{doc_title}': {present}/{total} Pflichtangaben vorhanden ({pct}%). " f"Fehlend: {total - present} Angaben nach {label}." ), "doc_title": doc_title, "doc_url": doc_url, "doc_type": doc_type, }) return findings def classify_document_type(title: str, url: str) -> str: """Classify a document by its title/URL into a legal document type.""" combined = f"{title} {url}".lower() if any(kw in combined for kw in ["datenschutz", "privacy", "dsgvo", "data protection", "données"]): return "dse" if any(kw in combined for kw in ["widerruf", "withdrawal", "rétractation", "desistimiento"]): return "widerruf" if any(kw in combined for kw in ["agb", "allgemeine geschäftsbedingungen", "terms", "nutzungsbedingungen", "conditions"]): return "agb" if any(kw in combined for kw in ["cookie", "slapuk", "evästeet", "kakor"]): return "cookie" if any(kw in combined for kw in ["impressum", "imprint", "legal notice", "mentions légales"]): return "impressum" return "other"