"""
Mandatory Content Checker — verifies that legally required content
is present on a website. Checks for missing documents, sections,
and mandatory information within documents.

Knows what MUST be there (not just what IS there).

Matching rules used by the content checks (DSE and Impressum):

* Text and keywords are lower-cased and German umlauts are folded to their
  ASCII transliteration (ä→ae, ö→oe, ü→ue, ß→ss), so "Löschung" and
  "Loeschung" are treated alike.
* ``keywords`` are literal strings, never regular expressions.
* Keywords that are short acronyms (at most three letters, e.g. "usa",
  "vat", "dsb") must appear as whole words; longer keywords match as
  substrings so that inflected German forms are still recognised.
* Optional ``patterns`` entries are regular expressions, applied to the
  normalised (lower-cased, folded) text.
"""

import logging
import re
from dataclasses import dataclass, field
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Only needed for the annotation of check_dse_mandatory_content; the
    # function itself just reads ``.heading`` and ``.content``.
    from compliance.services.dse_parser import DSESection

logger = logging.getLogger(__name__)


@dataclass
class MandatoryFinding:
    code: str
    severity: str  # "HIGH", "MEDIUM", "LOW"
    # "document_missing", "document_error", "section_missing", "info_missing"
    category: str
    text: str
    legal_ref: str
    expected: str  # What should be there
    suggestion: str = ""  # How to fix


# ═══════════════════════════════════════════════════════════════
# MANDATORY DOCUMENTS (must exist as pages/links on the website)
# ═══════════════════════════════════════════════════════════════

# "patterns" are regular expressions searched (case-insensitively) anywhere
# in the scanned page string.
MANDATORY_DOCUMENTS = [
    {
        "id": "impressum",
        "name": "Impressum",
        "legal_ref": "§5 TMG, §18 MStV",
        "patterns": [r"impressum", r"imprint", r"legal.?notice"],
        "severity": "HIGH",
    },
    {
        "id": "datenschutz",
        "name": "Datenschutzerklaerung",
        "legal_ref": "Art. 13/14 DSGVO",
        "patterns": [r"datenschutz", r"privacy", r"dsgvo"],
        "severity": "HIGH",
    },
    {
        "id": "agb",
        "name": "AGB / Nutzungsbedingungen",
        "legal_ref": "§305 BGB (bei Vertragsschluss)",
        "patterns": [r"agb", r"nutzungsbedingung", r"terms"],
        "severity": "MEDIUM",
    },
    {
        "id": "widerruf",
        "name": "Widerrufsbelehrung",
        "legal_ref": "§355 BGB, Art. 246a §1 EGBGB (nur Fernabsatz)",
        "patterns": [r"widerruf", r"cancellation.?policy", r"right.?of.?withdrawal"],
        "severity": "MEDIUM",
    },
]


# ═══════════════════════════════════════════════════════════════
# MANDATORY DSE SECTIONS (Art. 13 DSGVO Pflichtangaben)
# ═══════════════════════════════════════════════════════════════

MANDATORY_DSE_CONTENT = [
    {
        "id": "verantwortlicher",
        "name": "Name und Kontakt des Verantwortlichen",
        "legal_ref": "Art. 13 Abs. 1 lit. a DSGVO",
        "keywords": ["verantwortlich", "responsible", "controller", "betreiber"],
        "severity": "HIGH",
    },
    {
        "id": "dsb_kontakt",
        "name": "Kontaktdaten des Datenschutzbeauftragten",
        "legal_ref": "Art. 13 Abs. 1 lit. b DSGVO",
        "keywords": ["datenschutzbeauftragt", "data protection officer", "dsb", "dpo"],
        "severity": "HIGH",
    },
    {
        "id": "zwecke",
        "name": "Zwecke der Datenverarbeitung",
        "legal_ref": "Art. 13 Abs. 1 lit. c DSGVO",
        "keywords": ["zweck", "purpose", "verarbeitungszweck"],
        "severity": "HIGH",
    },
    {
        "id": "rechtsgrundlage",
        "name": "Rechtsgrundlagen der Verarbeitung",
        "legal_ref": "Art. 13 Abs. 1 lit. c DSGVO",
        "keywords": ["rechtsgrundlage", "legal basis", "art. 6", "art.6"],
        "severity": "HIGH",
    },
    {
        "id": "speicherdauer",
        "name": "Speicherdauer / Loeschfristen",
        "legal_ref": "Art. 13 Abs. 2 lit. a DSGVO",
        "keywords": ["speicherdauer", "aufbewahrung", "loeschung", "loeschfrist",
                     "storage period", "retention", "deletion"],
        "severity": "HIGH",
    },
    {
        "id": "betroffenenrechte",
        "name": "Betroffenenrechte (Auskunft, Loeschung, etc.)",
        "legal_ref": "Art. 13 Abs. 2 lit. b-d DSGVO",
        "keywords": ["betroffenenrecht", "auskunft", "berichtigung", "loeschung",
                     "einschraenkung", "widerspruch", "data subject rights",
                     "right to access", "right to erasure"],
        "severity": "HIGH",
    },
    {
        "id": "beschwerderecht",
        "name": "Beschwerderecht bei Aufsichtsbehoerde",
        "legal_ref": "Art. 13 Abs. 2 lit. d DSGVO",
        "keywords": ["aufsichtsbehoerde", "beschwerde", "supervisory authority",
                     "datenschutzbehoerde"],
        "severity": "MEDIUM",
    },
    {
        "id": "drittlandtransfer",
        "name": "Drittlandtransfer-Information",
        "legal_ref": "Art. 13 Abs. 1 lit. f DSGVO",
        "keywords": ["drittland", "drittst", "third countr", "usa", "transfer",
                     "standardvertragsklausel", "adequacy"],
        "severity": "MEDIUM",
    },
    {
        "id": "automatisierte_entscheidung",
        "name": "Automatisierte Entscheidungsfindung / Profiling",
        "legal_ref": "Art. 13 Abs. 2 lit. f DSGVO",
        "keywords": ["automatisiert", "profiling", "automated decision", "scoring"],
        "severity": "MEDIUM",
    },
]


# ═══════════════════════════════════════════════════════════════
# MANDATORY IMPRESSUM CONTENT (§5 TMG)
# ═══════════════════════════════════════════════════════════════

# NOTE(review): the legal_ref strings cite §5 TMG; the TMG appears to have
# been superseded by the DDG in 2024 — confirm whether the references should
# be updated before changing these runtime strings.
MANDATORY_IMPRESSUM_CONTENT = [
    {
        "id": "geschaeftsfuehrer",
        "name": "Geschaeftsfuehrer / Vertretungsberechtigter",
        "legal_ref": "§5 Abs. 1 Nr. 1 TMG",
        "keywords": ["geschaeftsfuehrer", "geschäftsführer", "ceo", "managing director",
                     "vertretungsberechtig", "vorstand"],
        "severity": "HIGH",
    },
    {
        "id": "handelsregister",
        "name": "Handelsregisternummer",
        "legal_ref": "§5 Abs. 1 Nr. 4 TMG",
        "keywords": ["handelsregister", "hrb", "hra", "amtsgericht", "registergericht",
                     "commercial register"],
        "severity": "HIGH",
    },
    {
        "id": "ust_id",
        "name": "Umsatzsteuer-Identifikationsnummer",
        "legal_ref": "§5 Abs. 1 Nr. 6 TMG",
        "keywords": ["ust-id", "ust.-id", "umsatzsteuer", "vat"],
        # "DE" + exactly nine digits. The trailing (?!\d) keeps a German IBAN
        # ("DE" + 20 digits) from being mistaken for a VAT ID.
        "patterns": [r"(?<![a-z])de\s?\d{9}(?!\d)"],
        "severity": "MEDIUM",
    },
    {
        "id": "anschrift",
        "name": "Anschrift (Strasse, PLZ, Ort)",
        "legal_ref": "§5 Abs. 1 Nr. 1 TMG",
        "keywords": ["str.", "straße", "strasse", "plz", "postleitzahl"],
        "severity": "HIGH",
    },
    {
        "id": "kontakt",
        "name": "Kontaktmoeglichkeit (Email oder Telefon)",
        "legal_ref": "§5 Abs. 1 Nr. 2 TMG",
        "keywords": ["@", "telefon", "phone", "e-mail", "email", "kontakt"],
        "severity": "HIGH",
    },
]


# Folding table for German special characters (applied after lower-casing).
_UMLAUT_FOLD = str.maketrans({"ä": "ae", "ö": "oe", "ü": "ue", "ß": "ss"})

# Purely alphabetic keywords up to this length are treated as acronyms and
# must match as whole words (otherwise "usa" hits "zusammen", "vat" hits
# "privat", "dpo" hits "endpoint").
_ACRONYM_MAX_LEN = 3


def _normalize(text: str) -> str:
    """Return *text* lower-cased with umlauts/ß folded; falsy input gives ""."""
    return (text or "").lower().translate(_UMLAUT_FOLD)


def _keyword_present(keyword: str, haystack: str) -> bool:
    """Return True if the literal *keyword* occurs in the normalised *haystack*."""
    needle = _normalize(keyword)
    if len(needle) <= _ACRONYM_MAX_LEN and needle.isalpha():
        # [^\W\d_] is "any letter": the acronym may touch digits or
        # punctuation ("HRB12345", "dsb@firma.de") but not other letters.
        pattern = rf"(?<![^\W\d_]){re.escape(needle)}(?![^\W\d_])"
        return re.search(pattern, haystack) is not None
    return needle in haystack


def _requirement_met(req: dict, haystacks: list) -> bool:
    """Return True if any keyword or pattern of *req* matches any haystack."""
    keywords = req["keywords"]
    patterns = req.get("patterns", ())
    for haystack in haystacks:
        if any(_keyword_present(kw, haystack) for kw in keywords):
            return True
        if any(re.search(rx, haystack) for rx in patterns):
            return True
    return False


def check_mandatory_documents(
    scanned_pages: list[str], page_status: dict[str, int],
) -> list[MandatoryFinding]:
    """Check if mandatory documents/pages exist on the website.

    Args:
        scanned_pages: Page identifiers (e.g. URLs) found during the scan.
        page_status: HTTP status per page; pages without an entry are
            treated as reachable (200).

    Returns:
        One finding per mandatory document that is either not present at all
        (``document_missing``) or only present as pages that answer with an
        HTTP status >= 400 (``document_error``). A document is considered
        fine as soon as at least one matching page is reachable, regardless
        of the order in which pages were scanned.
    """
    findings: list[MandatoryFinding] = []

    for doc in MANDATORY_DOCUMENTS:
        matching_pages = [
            page for page in scanned_pages
            if any(re.search(p, page, re.IGNORECASE) for p in doc["patterns"])
        ]

        if not matching_pages:
            findings.append(MandatoryFinding(
                code=f"DOC-MISSING-{doc['id'].upper()}",
                severity=doc["severity"],
                category="document_missing",
                text=f"{doc['name']} nicht auf der Website gefunden ({doc['legal_ref']})",
                legal_ref=doc["legal_ref"],
                expected=f"Link zu {doc['name']} muss von jeder Seite erreichbar sein",
            ))
            continue

        # A stale or broken duplicate must not mask a working page.
        if any(page_status.get(page, 200) < 400 for page in matching_pages):
            continue

        # Every match is broken: report the first one in scan order.
        page = matching_pages[0]
        status = page_status[page]
        findings.append(MandatoryFinding(
            code=f"DOC-ERROR-{doc['id'].upper()}",
            severity="HIGH",
            category="document_error",
            text=f"{doc['name']} existiert aber gibt HTTP {status} zurueck (Ladefehler!)",
            legal_ref=doc["legal_ref"],
            expected=doc["name"],
            suggestion=f"Seite {page} ist nicht erreichbar. Pruefen ob ein Deployment-Fehler vorliegt.",
        ))

    return findings


def check_dse_mandatory_content(
    sections: "list[DSESection]", full_text: str,
) -> list[MandatoryFinding]:
    """Check if privacy policy contains all mandatory sections per Art. 13 DSGVO.

    Args:
        sections: Parsed sections; only ``heading`` and the first 200
            characters of ``content`` are inspected.
        full_text: Complete text of the privacy policy.

    Returns:
        One ``section_missing`` finding per entry of MANDATORY_DSE_CONTENT
        whose keywords occur neither in the full text nor in any section
        heading / section start.
    """
    # Normalise everything once instead of once per requirement.
    haystacks = [_normalize(full_text)]
    for section in sections:
        haystacks.append(_normalize(section.heading))
        haystacks.append(_normalize((section.content or "")[:200]))

    findings: list[MandatoryFinding] = []
    for req in MANDATORY_DSE_CONTENT:
        if _requirement_met(req, haystacks):
            continue
        findings.append(MandatoryFinding(
            code=f"DSE-CONTENT-{req['id'].upper()}",
            severity=req["severity"],
            category="section_missing",
            text=f"Pflichtangabe fehlt: {req['name']} ({req['legal_ref']})",
            legal_ref=req["legal_ref"],
            expected=req["name"],
        ))

    return findings


def check_impressum_mandatory_content(
    impressum_text: str,
) -> list[MandatoryFinding]:
    """Check if Impressum contains all mandatory info per §5 TMG.

    Args:
        impressum_text: Plain text of the Impressum page.

    Returns:
        One ``info_missing`` finding per entry of MANDATORY_IMPRESSUM_CONTENT
        for which neither a keyword nor a pattern matches.
    """
    haystacks = [_normalize(impressum_text)]

    findings: list[MandatoryFinding] = []
    for req in MANDATORY_IMPRESSUM_CONTENT:
        if _requirement_met(req, haystacks):
            continue
        findings.append(MandatoryFinding(
            code=f"IMP-CONTENT-{req['id'].upper()}",
            severity=req["severity"],
            category="info_missing",
            text=f"Impressum: {req['name']} fehlt ({req['legal_ref']})",
            legal_ref=req["legal_ref"],
            expected=req["name"],
        ))

    return findings