Files
breakpilot-compliance/backend-compliance/compliance/services/mandatory_content_checker.py
T
Benjamin Admin fff47cc52e fix: 4 bugs from IHK Konstanz scan validation
1. DSE-Matcher: Google/YouTube false match — now requires 2+ word match
   for provider-name fallback, not just "Google" matching YouTube section
2. AGB/Widerrufsbelehrung: only_ecommerce flag — skips for non-shop
   websites (detected via payment providers, cart keywords)
3. DSE-internal link following — scanner now discovers links WITHIN the
   privacy policy and scans those too (finds regional DSE sub-pages)
4. Expanded keyword synonyms for DSE mandatory checks:
   - "Zweck und Rechtsgrundlage" now matches "zwecke"
   - "behoerdlichen datenschutzbeauftragt" matches DSB
   - "aufsichtsbehörde" with umlaut matches

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-29 17:57:19 +02:00

303 lines
12 KiB
Python

"""
Mandatory Content Checker — verifies that legally required content
is present on a website. Checks for missing documents, sections,
and mandatory information within documents.
Knows what MUST be there (not just what IS there).
"""
import logging
import re
from dataclasses import dataclass, field
from compliance.services.dse_parser import DSESection
logger = logging.getLogger(__name__)
@dataclass
class MandatoryFinding:
    """One gap reported by the mandatory-content checks in this module.

    A plain record: the ``check_*`` functions create instances and return
    them in lists. Only ``suggestion`` is optional.
    """

    # Machine-readable identifier, e.g. "DOC-MISSING-IMPRESSUM".
    code: str
    # "HIGH", "MEDIUM", "LOW"
    severity: str
    # "document_missing", "document_error", "section_missing", "info_missing"
    category: str
    # Human-readable description of the finding.
    text: str
    # Legal provision the requirement is based on.
    legal_ref: str
    # What should be there.
    expected: str
    # How to fix (empty when no suggestion is available).
    suggestion: str = ""
# ═══════════════════════════════════════════════════════════════
# MANDATORY DOCUMENTS (must exist as pages/links on the website)
# ═══════════════════════════════════════════════════════════════
# Each entry describes one document that must be reachable on the website:
#   id             – short key, upper-cased into the finding code
#   name           – display name used in finding texts
#   legal_ref      – legal basis quoted in the finding
#   patterns       – regular expressions searched case-insensitively in every
#                    scanned page string; one hit marks the page as a candidate
#   severity       – severity of the "document missing" finding
#   only_ecommerce – optional; when true the check is skipped for sites that
#                    are not detected as shops
MANDATORY_DOCUMENTS = [
    {
        "id": "impressum",
        "name": "Impressum",
        # NOTE(review): the TMG was superseded by the DDG in May 2024 —
        # confirm whether this citation should read "§5 DDG".
        "legal_ref": "§5 TMG, §18 MStV",
        "patterns": [r"impressum", r"imprint", r"legal.?notice"],
        "severity": "HIGH",
    },
    {
        "id": "datenschutz",
        "name": "Datenschutzerklaerung",
        "legal_ref": "Art. 13/14 DSGVO",
        "patterns": [r"datenschutz", r"privacy", r"dsgvo"],
        "severity": "HIGH",
    },
    {
        "id": "agb",
        "name": "AGB / Nutzungsbedingungen",
        "legal_ref": "§305 BGB (bei Vertragsschluss)",
        # "agb" must not be embedded in a longer word: a bare substring match
        # also hits URLs such as ".../tragbare-lautsprecher" or ".../tagblatt".
        # Letter look-arounds (instead of \b) keep "unsere_agb" and "agb2"
        # matching.
        "patterns": [r"(?<![a-z])agb(?![a-z])", r"nutzungsbedingung", r"terms"],
        "severity": "MEDIUM",
        "only_ecommerce": True,  # Only for shops / booking sites
    },
    {
        "id": "widerruf",
        "name": "Widerrufsbelehrung",
        "legal_ref": "§355 BGB, Art. 246a §1 EGBGB (nur Fernabsatz)",
        "patterns": [r"widerruf", r"cancellation.?policy", r"right.?of.?withdrawal"],
        "severity": "MEDIUM",
        "only_ecommerce": True,  # Only for distance-selling contracts
    },
]
# ═══════════════════════════════════════════════════════════════
# MANDATORY DSE SECTIONS (Art. 13 DSGVO Pflichtangaben)
# ═══════════════════════════════════════════════════════════════
# Each entry describes one piece of information a privacy policy must contain:
#   id        – short key, upper-cased into the finding code
#   name      – display name used in the finding text
#   legal_ref – legal basis quoted in the finding
#   keywords  – lower-case search terms; one hit counts as "present"
#   severity  – severity of the finding when no keyword is found
#
# German terms are listed in both spellings (umlaut and ASCII transliteration)
# because real policy text uses umlauts, while some sources deliver the
# transliterated form.
MANDATORY_DSE_CONTENT = [
    {
        "id": "verantwortlicher",
        "name": "Name und Kontakt des Verantwortlichen",
        "legal_ref": "Art. 13 Abs. 1 lit. a DSGVO",
        "keywords": ["verantwortlich", "responsible", "controller", "betreiber"],
        "severity": "HIGH",
    },
    {
        "id": "dsb_kontakt",
        "name": "Kontaktdaten des Datenschutzbeauftragten",
        "legal_ref": "Art. 13 Abs. 1 lit. b DSGVO",
        "keywords": ["datenschutzbeauftragt", "data protection officer", "dsb", "dpo",
                     "behördlichen datenschutz", "behoerdlichen datenschutz",
                     "datenschutz@", "datenschutzbeauftragter"],
        "severity": "HIGH",
    },
    {
        "id": "zwecke",
        "name": "Zwecke der Datenverarbeitung",
        "legal_ref": "Art. 13 Abs. 1 lit. c DSGVO",
        "keywords": ["zweck", "purpose", "verarbeitungszweck", "verarbeitungszwecke",
                     "wozu", "wofuer", "wofür", "zu welchem zweck", "nutzungszweck",
                     "zweck und rechtsgrundlage", "zwecke der verarbeitung"],
        "severity": "HIGH",
    },
    {
        "id": "rechtsgrundlage",
        "name": "Rechtsgrundlagen der Verarbeitung",
        "legal_ref": "Art. 13 Abs. 1 lit. c DSGVO",
        "keywords": ["rechtsgrundlage", "legal basis", "art. 6", "art.6",
                     "berechtigtes interesse", "einwilligung", "vertragserfuellung",
                     "vertragserfüllung", "rechtliche verpflichtung"],
        "severity": "HIGH",
    },
    {
        "id": "speicherdauer",
        "name": "Speicherdauer / Loeschfristen",
        "legal_ref": "Art. 13 Abs. 2 lit. a DSGVO",
        "keywords": ["speicherdauer", "aufbewahrung", "loeschung", "löschung",
                     "loeschfrist", "löschfrist",
                     "storage period", "retention", "deletion"],
        "severity": "HIGH",
    },
    {
        "id": "betroffenenrechte",
        "name": "Betroffenenrechte (Auskunft, Loeschung, etc.)",
        "legal_ref": "Art. 13 Abs. 2 lit. b-d DSGVO",
        "keywords": ["betroffenenrecht", "auskunft", "berichtigung", "loeschung",
                     "löschung", "einschraenkung", "einschränkung", "widerspruch",
                     "data subject rights", "right to access", "right to erasure"],
        "severity": "HIGH",
    },
    {
        "id": "beschwerderecht",
        "name": "Beschwerderecht bei Aufsichtsbehoerde",
        "legal_ref": "Art. 13 Abs. 2 lit. d DSGVO",
        "keywords": ["aufsichtsbehoerde", "aufsichtsbehörde", "beschwerde",
                     "supervisory authority", "datenschutzbehoerde",
                     "datenschutzbehörde",
                     "landesbeauftragte", "bundesdatenschutz", "bfdi"],
        "severity": "MEDIUM",
    },
    {
        "id": "drittlandtransfer",
        "name": "Drittlandtransfer-Information",
        "legal_ref": "Art. 13 Abs. 1 lit. f DSGVO",
        "keywords": ["drittland", "drittst", "third countr", "usa", "transfer",
                     "standardvertragsklausel", "adequacy"],
        "severity": "MEDIUM",
    },
    {
        "id": "automatisierte_entscheidung",
        "name": "Automatisierte Entscheidungsfindung / Profiling",
        "legal_ref": "Art. 13 Abs. 2 lit. f DSGVO",
        "keywords": ["automatisiert", "profiling", "automated decision", "scoring"],
        "severity": "MEDIUM",
    },
]
# ═══════════════════════════════════════════════════════════════
# MANDATORY IMPRESSUM CONTENT (§5 TMG)
# ═══════════════════════════════════════════════════════════════
# Each entry describes one piece of information an Impressum must contain:
#   id        – short key, upper-cased into the finding code
#   name      – display name used in the finding text
#   legal_ref – legal basis quoted in the finding
#   keywords  – REGULAR EXPRESSIONS searched in the lower-cased Impressum text;
#               literal dots must be escaped and short abbreviations anchored,
#               otherwise they match inside unrelated words
#   severity  – severity of the finding when no keyword matches
#
# NOTE(review): the TMG was superseded by the DDG in May 2024 — confirm whether
# the legal_ref citations below should be updated.
MANDATORY_IMPRESSUM_CONTENT = [
    {
        "id": "geschaeftsfuehrer",
        "name": "Geschaeftsfuehrer / Vertretungsberechtigter",
        "legal_ref": "§5 Abs. 1 Nr. 1 TMG",
        "keywords": [r"geschaeftsfuehrer", r"geschäftsführer", r"\bceo\b",
                     r"managing director", r"vertretungsberechtig", r"vorstand"],
        "severity": "HIGH",
    },
    {
        "id": "handelsregister",
        "name": "Handelsregisternummer",
        "legal_ref": "§5 Abs. 1 Nr. 4 TMG",
        # "hra"/"hrb" only count as standalone abbreviations (optionally
        # followed directly by the number), not inside "lehramt" or "fahrbahn".
        "keywords": [r"handelsregister", r"\bhr[ab](?![a-z])", r"amtsgericht",
                     r"registergericht", r"commercial register"],
        "severity": "HIGH",
    },
    {
        "id": "ust_id",
        "name": "Umsatzsteuer-Identifikationsnummer",
        "legal_ref": "§5 Abs. 1 Nr. 6 TMG",
        # \bvat\b avoids a hit on "privat". The number pattern accepts
        # "de123456789" and "de 123456789" but not the start of an IBAN.
        "keywords": [r"ust-id", r"ust\.-id", r"umsatzsteuer", r"\bvat\b",
                     r"\bde\s?\d{9}\b"],
        "severity": "MEDIUM",
    },
    {
        "id": "anschrift",
        "name": "Anschrift (Strasse, PLZ, Ort)",
        "legal_ref": "§5 Abs. 1 Nr. 1 TMG",
        # The dot in "str." is escaped so that "struktur"/"strategie" no longer
        # count as an address. The last pattern recognises a five-digit postal
        # code followed by a place name, covering street names that do not end
        # in "Str./Straße" (e.g. "Musterweg 5, 78462 Konstanz").
        "keywords": [r"str\.", r"straße", r"strasse", r"plz", r"postleitzahl",
                     r"\b\d{5}\s+[a-zäöüß]"],
        "severity": "HIGH",
    },
    {
        "id": "kontakt",
        "name": "Kontaktmoeglichkeit (Email oder Telefon)",
        "legal_ref": "§5 Abs. 1 Nr. 2 TMG",
        "keywords": [r"@", r"telefon", r"phone", r"e-mail", r"email", r"kontakt"],
        "severity": "HIGH",
    },
]
# Regular expressions that hint at a shop / transactional website. They are
# searched in lower-cased page strings and HTML, so short words must be
# anchored: a bare "order" also matches "border" and "erforderlich", "kasse"
# matches "Sparkasse" and "Kassel", "shop" matches "workshop", "cart" matches
# "à la carte" and "stripe" matches the CSS class "table-striped" — which made
# practically every page look like a shop.
ECOMMERCE_INDICATORS = [
    r"warenkorb",
    r"(?<!\w)carts?(?!\w)",
    r"(?<!work)shop",  # keeps "onlineshop", "blumenshop", "shopping"
    r"bestell",
    r"(?<!\w)order",  # keeps "orders", "ordering", "pre-order"
    r"checkout",
    r"(?<!\w)kasse(?!\w)",
    r"buy",
    r"kaufen",
    r"add.?to.?cart",
    r"(?<!\w)stripe(?!\w)|paypal|klarna|mollie|adyen",  # Payment providers
]


def _is_ecommerce(scanned_pages: list[str], html_content: str = "") -> bool:
    """Detect if website is an e-commerce/transactional site.

    Args:
        scanned_pages: Page strings collected by the scanner; they are joined
            and searched as one text.
        html_content: Optional page content that is searched as well.

    Returns:
        True if at least one pattern from ECOMMERCE_INDICATORS occurs in the
        lower-cased combination of both inputs, otherwise False.
    """
    # ``or ""`` tolerates callers that pass None instead of omitting the value.
    haystack = " ".join(scanned_pages).lower() + " " + (html_content or "").lower()
    return any(re.search(pattern, haystack) for pattern in ECOMMERCE_INDICATORS)
def check_mandatory_documents(
    scanned_pages: list[str], page_status: dict[str, int],
    html_content: str = "",
) -> list[MandatoryFinding]:
    """Check if mandatory documents/pages exist on the website.

    For every entry in MANDATORY_DOCUMENTS the scanned pages are searched
    (case-insensitively) for the entry's patterns. Entries flagged
    ``only_ecommerce`` are skipped when the site is not detected as a shop.

    Args:
        scanned_pages: Page strings collected by the scanner.
        page_status: HTTP status per page; pages without an entry are treated
            as loaded successfully (status 200).
        html_content: Optional page content, used only for shop detection.

    Returns:
        One finding per problematic document:
        ``DOC-MISSING-<ID>`` when no scanned page matches the document, or
        ``DOC-ERROR-<ID>`` (always HIGH) when pages match but none of them
        loads with a status below 400.
    """
    findings = []
    is_shop = _is_ecommerce(scanned_pages, html_content)

    for doc in MANDATORY_DOCUMENTS:
        # Skip e-commerce-only checks for non-shop websites
        if doc.get("only_ecommerce") and not is_shop:
            continue

        candidates = [
            page for page in scanned_pages
            if any(re.search(p, page, re.IGNORECASE) for p in doc["patterns"])
        ]

        if not candidates:
            findings.append(MandatoryFinding(
                code=f"DOC-MISSING-{doc['id'].upper()}",
                severity=doc["severity"],
                category="document_missing",
                text=f"{doc['name']} nicht auf der Website gefunden ({doc['legal_ref']})",
                legal_ref=doc["legal_ref"],
                expected=f"Link zu {doc['name']} muss von jeder Seite erreichbar sein",
            ))
            continue

        # Look at ALL matching pages before judging: a single dead link
        # (e.g. an outdated URL) must not be reported as a load error when a
        # working page for the same document was scanned as well.
        if any(page_status.get(page, 200) < 400 for page in candidates):
            continue

        # Every candidate failed to load: the document exists but is broken.
        # Report the first candidate so there is one finding per document.
        page = candidates[0]
        status = page_status.get(page, 200)
        findings.append(MandatoryFinding(
            code=f"DOC-ERROR-{doc['id'].upper()}",
            severity="HIGH",
            category="document_error",
            text=f"{doc['name']} existiert aber gibt HTTP {status} zurueck (Ladefehler!)",
            legal_ref=doc["legal_ref"],
            expected=doc["name"],
            suggestion=f"Seite {page} ist nicht erreichbar. Pruefen ob ein Deployment-Fehler vorliegt.",
        ))

    return findings
# Maps German umlauts and ß to their ASCII digraphs.
_UMLAUT_FOLD = str.maketrans({"ä": "ae", "ö": "oe", "ü": "ue", "ß": "ss"})


def _fold_german(text: str) -> str:
    """Lower-case *text* and transliterate umlauts/ß (ä→ae, ö→oe, ü→ue, ß→ss)."""
    return text.lower().translate(_UMLAUT_FOLD)


def _keyword_in(keyword: str, folded_text: str) -> bool:
    """Return True if the (already folded) *keyword* occurs in *folded_text*.

    Keywords of up to three characters ("usa", "dsb", "dpo") must stand alone
    as a word; as plain substrings they would match inside everyday words such
    as "zusammenhang". Longer keywords keep matching as substrings so that
    stems like "zweck" still hit "verarbeitungszwecke".
    """
    if len(keyword) <= 3:
        pattern = rf"(?<!\w){re.escape(keyword)}(?!\w)"
        return re.search(pattern, folded_text) is not None
    return keyword in folded_text


def check_dse_mandatory_content(
    sections: list[DSESection], full_text: str,
) -> list[MandatoryFinding]:
    """Check if privacy policy contains all mandatory sections per Art. 13 DSGVO.

    Text and keywords are both lower-cased and umlaut-folded before comparing,
    so "Löschung" in the policy matches the keyword "loeschung" and vice versa.

    Args:
        sections: Parsed policy sections; their ``heading`` and the first 200
            characters of their ``content`` are searched as a fallback.
        full_text: Complete text of the privacy policy.

    Returns:
        One ``DSE-CONTENT-<ID>`` finding (category ``section_missing``) for
        every entry of MANDATORY_DSE_CONTENT none of whose keywords is found.
    """
    findings = []
    folded_text = _fold_german(full_text)
    # Built on first use only: most requirements already match on full_text.
    folded_sections = None

    for req in MANDATORY_DSE_CONTENT:
        keywords = [_fold_german(kw) for kw in req["keywords"]]
        found = any(_keyword_in(kw, folded_text) for kw in keywords)

        if not found:
            # Also check section headings and the start of each section
            if folded_sections is None:
                folded_sections = [
                    (_fold_german(s.heading), _fold_german(s.content[:200]))
                    for s in sections
                ]
            found = any(
                _keyword_in(kw, heading) or _keyword_in(kw, intro)
                for heading, intro in folded_sections
                for kw in keywords
            )

        if not found:
            findings.append(MandatoryFinding(
                code=f"DSE-CONTENT-{req['id'].upper()}",
                severity=req["severity"],
                category="section_missing",
                text=f"Pflichtangabe fehlt: {req['name']} ({req['legal_ref']})",
                legal_ref=req["legal_ref"],
                expected=req["name"],
            ))

    return findings
def check_impressum_mandatory_content(
    impressum_text: str,
) -> list[MandatoryFinding]:
    """Report mandatory Impressum details (§5 TMG) that cannot be found.

    Every keyword of an entry in MANDATORY_IMPRESSUM_CONTENT is used as a
    regular expression and searched in the lower-cased Impressum text; an
    entry counts as present as soon as one keyword matches.

    Args:
        impressum_text: Text of the Impressum page.

    Returns:
        One ``IMP-CONTENT-<ID>`` finding (category ``info_missing``) per entry
        without a match, in the order of MANDATORY_IMPRESSUM_CONTENT.
    """
    haystack = impressum_text.lower()

    # Entries for which no keyword pattern matches anywhere in the text.
    unmatched = [
        entry for entry in MANDATORY_IMPRESSUM_CONTENT
        if not any(re.search(pattern, haystack) for pattern in entry["keywords"])
    ]

    return [
        MandatoryFinding(
            code=f"IMP-CONTENT-{entry['id'].upper()}",
            severity=entry["severity"],
            category="info_missing",
            text=f"Impressum: {entry['name']} fehlt ({entry['legal_ref']})",
            legal_ref=entry["legal_ref"],
            expected=entry["name"],
        )
        for entry in unmatched
    ]