""" Mandatory Content Checker — verifies that legally required content is present on a website. Checks for missing documents, sections, and mandatory information within documents. Knows what MUST be there (not just what IS there). """ import logging import re from dataclasses import dataclass, field from compliance.services.dse_parser import DSESection logger = logging.getLogger(__name__) @dataclass class MandatoryFinding: code: str severity: str # "HIGH", "MEDIUM", "LOW" category: str # "document_missing", "section_missing", "info_missing" text: str legal_ref: str expected: str # What should be there suggestion: str = "" # How to fix # ═══════════════════════════════════════════════════════════════ # MANDATORY DOCUMENTS (must exist as pages/links on the website) # ═══════════════════════════════════════════════════════════════ MANDATORY_DOCUMENTS = [ { "id": "impressum", "name": "Impressum", "legal_ref": "§5 TMG, §18 MStV", "patterns": [r"impressum", r"imprint", r"legal.?notice"], "severity": "HIGH", }, { "id": "datenschutz", "name": "Datenschutzerklaerung", "legal_ref": "Art. 13/14 DSGVO", "patterns": [r"datenschutz", r"privacy", r"dsgvo"], "severity": "HIGH", }, { "id": "agb", "name": "AGB / Nutzungsbedingungen", "legal_ref": "§305 BGB (bei Vertragsschluss)", "patterns": [r"agb", r"nutzungsbedingung", r"terms"], "severity": "MEDIUM", "only_ecommerce": True, # Nur bei Shops/Buchungsseiten }, { "id": "widerruf", "name": "Widerrufsbelehrung", "legal_ref": "§355 BGB, Art. 246a §1 EGBGB (nur Fernabsatz)", "patterns": [r"widerruf", r"cancellation.?policy", r"right.?of.?withdrawal"], "severity": "MEDIUM", "only_ecommerce": True, # Nur bei Fernabsatzvertraegen }, ] # ═══════════════════════════════════════════════════════════════ # MANDATORY DSE SECTIONS (Art. 13 DSGVO Pflichtangaben) # ═══════════════════════════════════════════════════════════════ MANDATORY_DSE_CONTENT = [ { "id": "verantwortlicher", "name": "Name und Kontakt des Verantwortlichen", "legal_ref": "Art. 13 Abs. 1 lit. a DSGVO", "keywords": ["verantwortlich", "responsible", "controller", "betreiber"], "severity": "HIGH", }, { "id": "dsb_kontakt", "name": "Kontaktdaten des Datenschutzbeauftragten", "legal_ref": "Art. 13 Abs. 1 lit. b DSGVO", "keywords": ["datenschutzbeauftragt", "data protection officer", "dsb", "dpo", "behördlichen datenschutz", "behoerdlichen datenschutz", "datenschutz@", "datenschutzbeauftragter"], "severity": "HIGH", }, { "id": "zwecke", "name": "Zwecke der Datenverarbeitung", "legal_ref": "Art. 13 Abs. 1 lit. c DSGVO", "keywords": ["zweck", "purpose", "verarbeitungszweck", "verarbeitungszwecke", "wozu", "wofuer", "zu welchem zweck", "nutzungszweck", "zweck und rechtsgrundlage", "zwecke der verarbeitung"], "severity": "HIGH", }, { "id": "rechtsgrundlage", "name": "Rechtsgrundlagen der Verarbeitung", "legal_ref": "Art. 13 Abs. 1 lit. c DSGVO", "keywords": ["rechtsgrundlage", "legal basis", "art. 6", "art.6", "berechtigtes interesse", "einwilligung", "vertragserfuellung", "vertragserfüllung", "rechtliche verpflichtung"], "severity": "HIGH", }, { "id": "speicherdauer", "name": "Speicherdauer / Loeschfristen", "legal_ref": "Art. 13 Abs. 2 lit. a DSGVO", "keywords": ["speicherdauer", "aufbewahrung", "loeschung", "loeschfrist", "storage period", "retention", "deletion"], "severity": "HIGH", }, { "id": "betroffenenrechte", "name": "Betroffenenrechte (Auskunft, Loeschung, etc.)", "legal_ref": "Art. 13 Abs. 2 lit. b-d DSGVO", "keywords": ["betroffenenrecht", "auskunft", "berichtigung", "loeschung", "einschraenkung", "widerspruch", "data subject rights", "right to access", "right to erasure"], "severity": "HIGH", }, { "id": "beschwerderecht", "name": "Beschwerderecht bei Aufsichtsbehoerde", "legal_ref": "Art. 13 Abs. 2 lit. d DSGVO", "keywords": ["aufsichtsbehoerde", "aufsichtsbehörde", "beschwerde", "supervisory authority", "datenschutzbehoerde", "landesbeauftragte", "bundesdatenschutz", "bfdi"], "severity": "MEDIUM", }, { "id": "drittlandtransfer", "name": "Drittlandtransfer-Information", "legal_ref": "Art. 13 Abs. 1 lit. f DSGVO", "keywords": ["drittland", "drittst", "third countr", "usa", "transfer", "standardvertragsklausel", "adequacy"], "severity": "MEDIUM", }, { "id": "automatisierte_entscheidung", "name": "Automatisierte Entscheidungsfindung / Profiling", "legal_ref": "Art. 13 Abs. 2 lit. f DSGVO", "keywords": ["automatisiert", "profiling", "automated decision", "scoring"], "severity": "MEDIUM", }, ] # ═══════════════════════════════════════════════════════════════ # MANDATORY IMPRESSUM CONTENT (§5 TMG) # ═══════════════════════════════════════════════════════════════ MANDATORY_IMPRESSUM_CONTENT = [ { "id": "geschaeftsfuehrer", "name": "Geschaeftsfuehrer / Vertretungsberechtigter", "legal_ref": "§5 Abs. 1 Nr. 1 TMG", "keywords": ["geschaeftsfuehrer", "geschäftsführer", "ceo", "managing director", "vertretungsberechtig", "vorstand"], "severity": "HIGH", }, { "id": "handelsregister", "name": "Handelsregisternummer", "legal_ref": "§5 Abs. 1 Nr. 4 TMG", "keywords": ["handelsregister", "hrb", "hra", "amtsgericht", "registergericht", "commercial register"], "severity": "HIGH", }, { "id": "ust_id", "name": "Umsatzsteuer-Identifikationsnummer", "legal_ref": "§5 Abs. 1 Nr. 6 TMG", "keywords": ["ust-id", "ust.-id", "umsatzsteuer", "vat", "de\\d{9}"], "severity": "MEDIUM", }, { "id": "anschrift", "name": "Anschrift (Strasse, PLZ, Ort)", "legal_ref": "§5 Abs. 1 Nr. 1 TMG", "keywords": ["str.", "straße", "strasse", "plz", "postleitzahl"], "severity": "HIGH", }, { "id": "kontakt", "name": "Kontaktmoeglichkeit (Email oder Telefon)", "legal_ref": "§5 Abs. 1 Nr. 2 TMG", "keywords": ["@", "telefon", "phone", "e-mail", "email", "kontakt"], "severity": "HIGH", }, ] ECOMMERCE_INDICATORS = [ r"warenkorb", r"cart", r"shop", r"bestell", r"order", r"checkout", r"kasse", r"buy", r"kaufen", r"add.?to.?cart", r"stripe|paypal|klarna|mollie|adyen", # Payment providers ] def _is_ecommerce(scanned_pages: list[str], html_content: str = "") -> bool: """Detect if website is an e-commerce/transactional site.""" all_text = " ".join(scanned_pages).lower() + " " + html_content.lower() return any(re.search(p, all_text) for p in ECOMMERCE_INDICATORS) def check_mandatory_documents( scanned_pages: list[str], page_status: dict[str, int], html_content: str = "", ) -> list[MandatoryFinding]: """Check if mandatory documents/pages exist on the website.""" findings = [] is_shop = _is_ecommerce(scanned_pages, html_content) for doc in MANDATORY_DOCUMENTS: # Skip e-commerce-only checks for non-shop websites if doc.get("only_ecommerce") and not is_shop: continue found = False for page in scanned_pages: if any(re.search(p, page, re.IGNORECASE) for p in doc["patterns"]): status = page_status.get(page, 200) if status < 400: found = True else: findings.append(MandatoryFinding( code=f"DOC-ERROR-{doc['id'].upper()}", severity="HIGH", category="document_error", text=f"{doc['name']} existiert aber gibt HTTP {status} zurueck (Ladefehler!)", legal_ref=doc["legal_ref"], expected=doc["name"], suggestion=f"Seite {page} ist nicht erreichbar. Pruefen ob ein Deployment-Fehler vorliegt.", )) found = True # Exists but broken break if not found: findings.append(MandatoryFinding( code=f"DOC-MISSING-{doc['id'].upper()}", severity=doc["severity"], category="document_missing", text=f"{doc['name']} nicht auf der Website gefunden ({doc['legal_ref']})", legal_ref=doc["legal_ref"], expected=f"Link zu {doc['name']} muss von jeder Seite erreichbar sein", )) return findings def check_dse_mandatory_content( sections: list[DSESection], full_text: str, ) -> list[MandatoryFinding]: """Check if privacy policy contains all mandatory sections per Art. 13 DSGVO.""" findings = [] text_lower = full_text.lower() for req in MANDATORY_DSE_CONTENT: found = any(kw in text_lower for kw in req["keywords"]) if not found: # Also check section headings found = any( any(kw in s.heading.lower() or kw in s.content.lower()[:200] for kw in req["keywords"]) for s in sections ) if not found: findings.append(MandatoryFinding( code=f"DSE-CONTENT-{req['id'].upper()}", severity=req["severity"], category="section_missing", text=f"Pflichtangabe fehlt: {req['name']} ({req['legal_ref']})", legal_ref=req["legal_ref"], expected=req["name"], )) return findings def check_impressum_mandatory_content( impressum_text: str, ) -> list[MandatoryFinding]: """Check if Impressum contains all mandatory info per §5 TMG.""" findings = [] text_lower = impressum_text.lower() for req in MANDATORY_IMPRESSUM_CONTENT: found = any(re.search(kw, text_lower) for kw in req["keywords"]) if not found: findings.append(MandatoryFinding( code=f"IMP-CONTENT-{req['id'].upper()}", severity=req["severity"], category="info_missing", text=f"Impressum: {req['name']} fehlt ({req['legal_ref']})", legal_ref=req["legal_ref"], expected=req["name"], )) return findings