Files
breakpilot-compliance/backend-compliance/compliance/services/mandatory_content_checker.py
T
Benjamin Admin 642382cbe8 feat: Mandatory Content Checker — knows what MUST be there
Three check levels:
1. Documents: Impressum, DSE, AGB, Widerrufsbelehrung must exist as pages
2. DSE content: 9 Art. 13 DSGVO mandatory sections (Verantwortlicher,
   DSB-Kontakt, Zwecke, Rechtsgrundlagen, Speicherdauer, Betroffenenrechte,
   Beschwerderecht, Drittlandtransfer, Profiling)
3. Impressum content: 5 §5 TMG mandatory fields (GF, Handelsregister,
   USt-ID, Anschrift, Kontakt)

Detects both missing documents AND missing content within documents.
Also catches HTTP errors (page exists but returns 404/500).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-29 14:23:22 +02:00

275 lines
10 KiB
Python

"""
Mandatory Content Checker — verifies that legally required content
is present on a website. Checks for missing documents, sections,
and mandatory information within documents.
Knows what MUST be there (not just what IS there).
"""
import logging
import re
from dataclasses import dataclass, field
from compliance.services.dse_parser import DSESection
logger = logging.getLogger(__name__)
@dataclass
class MandatoryFinding:
    """A single gap reported by one of the mandatory-content checks.

    All fields are plain strings; only ``suggestion`` is optional.
    """

    # Machine-readable identifier, e.g. "DOC-MISSING-IMPRESSUM".
    code: str
    # One of "HIGH", "MEDIUM", "LOW".
    severity: str
    # One of "document_missing", "document_error", "section_missing",
    # "info_missing".
    category: str
    # Human-readable description of the finding.
    text: str
    # Legal provision the finding is based on.
    legal_ref: str
    # What should be there.
    expected: str
    # How to fix; defaults to an empty string.
    suggestion: str = ""
# ═══════════════════════════════════════════════════════════════
# MANDATORY DOCUMENTS (must exist as pages/links on the website)
# ═══════════════════════════════════════════════════════════════
# Each entry describes one document that has to be present on the website.
# "patterns" are regular expressions that are searched case-insensitively
# in every scanned page entry; "severity" is used when the document is missing.
MANDATORY_DOCUMENTS = [
    # Legal notice
    {
        "id": "impressum",
        "name": "Impressum",
        "legal_ref": "§5 TMG, §18 MStV",
        "patterns": [r"impressum", r"imprint", r"legal.?notice"],
        "severity": "HIGH",
    },
    # Privacy policy
    {
        "id": "datenschutz",
        "name": "Datenschutzerklaerung",
        "legal_ref": "Art. 13/14 DSGVO",
        "patterns": [r"datenschutz", r"privacy", r"dsgvo"],
        "severity": "HIGH",
    },
    # Terms and conditions
    {
        "id": "agb",
        "name": "AGB / Nutzungsbedingungen",
        "legal_ref": "§305 BGB (bei Vertragsschluss)",
        "patterns": [r"agb", r"nutzungsbedingung", r"terms"],
        "severity": "MEDIUM",
    },
    # Withdrawal / cancellation policy
    {
        "id": "widerruf",
        "name": "Widerrufsbelehrung",
        "legal_ref": "§355 BGB, Art. 246a §1 EGBGB (nur Fernabsatz)",
        "patterns": [
            r"widerruf",
            r"cancellation.?policy",
            r"right.?of.?withdrawal",
        ],
        "severity": "MEDIUM",
    },
]
# ═══════════════════════════════════════════════════════════════
# MANDATORY DSE SECTIONS (Art. 13 DSGVO Pflichtangaben)
# ═══════════════════════════════════════════════════════════════
# One entry per mandatory item of a privacy policy. "keywords" are lowercase
# strings that are looked up in the lowercased policy text; a requirement
# counts as covered as soon as one keyword is found.
#
# German terms are listed both transliterated ("loeschung") and with real
# umlauts ("löschung"): published policies normally use umlauts, so the
# transliterated spelling alone would never match them.
MANDATORY_DSE_CONTENT = [
    {
        "id": "verantwortlicher",
        "name": "Name und Kontakt des Verantwortlichen",
        "legal_ref": "Art. 13 Abs. 1 lit. a DSGVO",
        "keywords": ["verantwortlich", "responsible", "controller", "betreiber"],
        "severity": "HIGH",
    },
    {
        "id": "dsb_kontakt",
        "name": "Kontaktdaten des Datenschutzbeauftragten",
        "legal_ref": "Art. 13 Abs. 1 lit. b DSGVO",
        "keywords": ["datenschutzbeauftragt", "data protection officer", "dsb", "dpo"],
        "severity": "HIGH",
    },
    {
        "id": "zwecke",
        "name": "Zwecke der Datenverarbeitung",
        "legal_ref": "Art. 13 Abs. 1 lit. c DSGVO",
        "keywords": ["zweck", "purpose", "verarbeitungszweck"],
        "severity": "HIGH",
    },
    {
        "id": "rechtsgrundlage",
        "name": "Rechtsgrundlagen der Verarbeitung",
        "legal_ref": "Art. 13 Abs. 1 lit. c DSGVO",
        "keywords": ["rechtsgrundlage", "legal basis", "art. 6", "art.6"],
        "severity": "HIGH",
    },
    {
        "id": "speicherdauer",
        "name": "Speicherdauer / Loeschfristen",
        "legal_ref": "Art. 13 Abs. 2 lit. a DSGVO",
        "keywords": [
            "speicherdauer", "aufbewahrung", "loeschung", "loeschfrist",
            "storage period", "retention", "deletion",
            # umlaut spellings
            "löschung", "löschfrist",
        ],
        "severity": "HIGH",
    },
    {
        "id": "betroffenenrechte",
        "name": "Betroffenenrechte (Auskunft, Loeschung, etc.)",
        "legal_ref": "Art. 13 Abs. 2 lit. b-d DSGVO",
        "keywords": [
            "betroffenenrecht", "auskunft", "berichtigung", "loeschung",
            "einschraenkung", "widerspruch", "data subject rights",
            "right to access", "right to erasure",
            # umlaut spellings
            "löschung", "einschränkung",
        ],
        "severity": "HIGH",
    },
    {
        "id": "beschwerderecht",
        "name": "Beschwerderecht bei Aufsichtsbehoerde",
        "legal_ref": "Art. 13 Abs. 2 lit. d DSGVO",
        "keywords": [
            "aufsichtsbehoerde", "beschwerde", "supervisory authority",
            "datenschutzbehoerde",
            # umlaut spellings
            "aufsichtsbehörde", "datenschutzbehörde",
        ],
        "severity": "MEDIUM",
    },
    {
        "id": "drittlandtransfer",
        "name": "Drittlandtransfer-Information",
        "legal_ref": "Art. 13 Abs. 1 lit. f DSGVO",
        "keywords": [
            "drittland", "drittst", "third countr", "usa", "transfer",
            "standardvertragsklausel", "adequacy",
        ],
        "severity": "MEDIUM",
    },
    {
        "id": "automatisierte_entscheidung",
        "name": "Automatisierte Entscheidungsfindung / Profiling",
        "legal_ref": "Art. 13 Abs. 2 lit. f DSGVO",
        "keywords": ["automatisiert", "profiling", "automated decision", "scoring"],
        "severity": "MEDIUM",
    },
]
# ═══════════════════════════════════════════════════════════════
# MANDATORY IMPRESSUM CONTENT (§5 TMG)
# ═══════════════════════════════════════════════════════════════
# One entry per mandatory item of the Impressum. "keywords" are regular
# expressions that are applied with re.search to the lowercased Impressum
# text, so they must be lowercase and regex metacharacters must be escaped.
# Short abbreviations are anchored with \b so that they do not match inside
# unrelated words (e.g. "vat" in "privat", "hrb" in "fahrbahn").
MANDATORY_IMPRESSUM_CONTENT = [
    {
        "id": "geschaeftsfuehrer",
        "name": "Geschaeftsfuehrer / Vertretungsberechtigter",
        "legal_ref": "§5 Abs. 1 Nr. 1 TMG",
        "keywords": [
            r"geschaeftsfuehrer", r"geschäftsführer", r"ceo", r"managing director",
            r"vertretungsberechtig", r"vorstand",
        ],
        "severity": "HIGH",
    },
    {
        "id": "handelsregister",
        "name": "Handelsregisternummer",
        "legal_ref": "§5 Abs. 1 Nr. 4 TMG",
        "keywords": [
            # Leading \b only: register numbers are often written without a
            # space ("HRB12345").
            r"handelsregister", r"\bhrb", r"\bhra", r"amtsgericht", r"registergericht",
            r"commercial register",
        ],
        "severity": "HIGH",
    },
    {
        "id": "ust_id",
        "name": "Umsatzsteuer-Identifikationsnummer",
        "legal_ref": "§5 Abs. 1 Nr. 6 TMG",
        # The last pattern matches the number itself: "de" followed by nine digits.
        "keywords": [r"ust-id", r"ust\.-id", r"umsatzsteuer", r"\bvat\b", r"de\d{9}"],
        "severity": "MEDIUM",
    },
    {
        "id": "anschrift",
        "name": "Anschrift (Strasse, PLZ, Ort)",
        "legal_ref": "§5 Abs. 1 Nr. 1 TMG",
        # r"str\b" covers the abbreviation with or without a trailing dot
        # ("Hauptstr. 5", "Hauptstr 5") without matching "str" inside a word.
        "keywords": [r"str\b", r"straße", r"strasse", r"plz", r"postleitzahl"],
        "severity": "HIGH",
    },
    {
        "id": "kontakt",
        "name": "Kontaktmoeglichkeit (Email oder Telefon)",
        "legal_ref": "§5 Abs. 1 Nr. 2 TMG",
        "keywords": [r"@", r"telefon", r"phone", r"e-mail", r"email", r"kontakt"],
        "severity": "HIGH",
    },
]
def check_mandatory_documents(
    scanned_pages: list[str], page_status: dict[str, int],
) -> list[MandatoryFinding]:
    """Check if mandatory documents/pages exist on the website.

    For every entry in MANDATORY_DOCUMENTS all scanned pages whose name
    matches one of the entry's patterns (case-insensitive) are collected.

    * No matching page: a "document_missing" finding with the entry's
      severity is reported.
    * At least one matching page with an HTTP status below 400: the document
      counts as present and nothing is reported.
    * Matching pages exist but all of them return status >= 400: a single
      "document_error" finding (severity HIGH) is reported for the first
      such page.

    The result does not depend on the order of ``scanned_pages``: a broken
    duplicate (e.g. an outdated URL) listed before the working page does not
    produce an error finding.

    Args:
        scanned_pages: Identifiers of the scanned pages (presumably URLs or
            paths; they are only pattern-matched and used as lookup keys).
        page_status: HTTP status per page. Pages without an entry are
            treated as status 200.

    Returns:
        The findings in the order of MANDATORY_DOCUMENTS; empty if every
        mandatory document was found and is reachable.
    """
    findings: list[MandatoryFinding] = []
    for doc in MANDATORY_DOCUMENTS:
        matching_pages = [
            page
            for page in scanned_pages
            if any(re.search(pattern, page, re.IGNORECASE) for pattern in doc["patterns"])
        ]

        if not matching_pages:
            findings.append(MandatoryFinding(
                code=f"DOC-MISSING-{doc['id'].upper()}",
                severity=doc["severity"],
                category="document_missing",
                text=f"{doc['name']} nicht auf der Website gefunden ({doc['legal_ref']})",
                legal_ref=doc["legal_ref"],
                expected=f"Link zu {doc['name']} muss von jeder Seite erreichbar sein",
            ))
            continue

        # Unknown pages default to 200, i.e. they count as reachable.
        statuses = [(page, page_status.get(page, 200)) for page in matching_pages]
        if any(status < 400 for _, status in statuses):
            continue

        # The document exists but every candidate page is broken; report the
        # first one so the finding points at a concrete page.
        page, status = statuses[0]
        findings.append(MandatoryFinding(
            code=f"DOC-ERROR-{doc['id'].upper()}",
            severity="HIGH",
            category="document_error",
            text=f"{doc['name']} existiert aber gibt HTTP {status} zurueck (Ladefehler!)",
            legal_ref=doc["legal_ref"],
            expected=doc["name"],
            suggestion=f"Seite {page} ist nicht erreichbar. Pruefen ob ein Deployment-Fehler vorliegt.",
        ))
    return findings
# Keywords up to this length are treated as abbreviations and have to occur
# as whole words. As plain substrings they match inside ordinary words
# ("usa" is contained in "zusammen"), which would mark a requirement as
# fulfilled for almost any German text.
_ABBREVIATION_MAX_LEN = 3


def _keyword_in_text(keyword: str, text: str) -> bool:
    """Return True if *keyword* occurs in the already lowercased *text*."""
    if len(keyword) <= _ABBREVIATION_MAX_LEN:
        return re.search(rf"\b{re.escape(keyword)}\b", text) is not None
    # Longer keywords are often word stems ("datenschutzbeauftragt",
    # "third countr"), so they are matched as substrings.
    return keyword in text


def check_dse_mandatory_content(
    sections: list[DSESection], full_text: str,
) -> list[MandatoryFinding]:
    """Check if privacy policy contains all mandatory sections per Art. 13 DSGVO.

    A requirement from MANDATORY_DSE_CONTENT counts as covered when one of
    its keywords is found in the lowercased ``full_text``. If that fails, the
    heading and the first 200 characters of the content of every section are
    searched as well. Keywords of at most three characters must match as
    whole words; longer keywords match as substrings.

    Args:
        sections: Parsed sections of the privacy policy; only ``heading`` and
            ``content`` are read.
        full_text: Complete text of the privacy policy.

    Returns:
        One "section_missing" finding per requirement that was not found, in
        the order of MANDATORY_DSE_CONTENT.
    """
    findings = []
    text_lower = full_text.lower()
    for req in MANDATORY_DSE_CONTENT:
        keywords = req["keywords"]
        found = any(_keyword_in_text(kw, text_lower) for kw in keywords)
        if not found:
            # Also check section headings and the beginning of each section.
            found = any(
                _keyword_in_text(kw, s.heading.lower())
                or _keyword_in_text(kw, s.content.lower()[:200])
                for s in sections
                for kw in keywords
            )
        if not found:
            findings.append(MandatoryFinding(
                code=f"DSE-CONTENT-{req['id'].upper()}",
                severity=req["severity"],
                category="section_missing",
                text=f"Pflichtangabe fehlt: {req['name']} ({req['legal_ref']})",
                legal_ref=req["legal_ref"],
                expected=req["name"],
            ))
    return findings
def check_impressum_mandatory_content(
    impressum_text: str,
) -> list[MandatoryFinding]:
    """Report every §5 TMG item that cannot be found in the Impressum text.

    The text is lowercased once; each keyword of a requirement in
    MANDATORY_IMPRESSUM_CONTENT is then applied to it as a regular
    expression. A requirement is covered as soon as one keyword matches.

    Args:
        impressum_text: Text of the Impressum page.

    Returns:
        One "info_missing" finding per uncovered requirement, in the order
        of MANDATORY_IMPRESSUM_CONTENT.
    """
    haystack = impressum_text.lower()

    def is_covered(requirement: dict) -> bool:
        for pattern in requirement["keywords"]:
            if re.search(pattern, haystack):
                return True
        return False

    return [
        MandatoryFinding(
            code=f"IMP-CONTENT-{requirement['id'].upper()}",
            severity=requirement["severity"],
            category="info_missing",
            text=f"Impressum: {requirement['name']} fehlt ({requirement['legal_ref']})",
            legal_ref=requirement["legal_ref"],
            expected=requirement["name"],
        )
        for requirement in MANDATORY_IMPRESSUM_CONTENT
        if not is_covered(requirement)
    ]