Files
breakpilot-compliance/backend-compliance/compliance/services/mandatory_content_checker.py
T
Benjamin Admin 275bdf9848
Build + Deploy / build-admin-compliance (push) Successful in 1m49s
Build + Deploy / build-backend-compliance (push) Successful in 2m57s
Build + Deploy / build-ai-sdk (push) Successful in 50s
Build + Deploy / build-developer-portal (push) Successful in 1m2s
Build + Deploy / build-tts (push) Successful in 1m23s
Build + Deploy / build-document-crawler (push) Successful in 39s
Build + Deploy / build-dsms-gateway (push) Successful in 23s
Build + Deploy / build-dsms-node (push) Successful in 10s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / loc-budget (push) Failing after 21s
CI / secret-scan (push) Has been skipped
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Successful in 2m31s
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / test-go (push) Failing after 41s
CI / test-python-backend (push) Successful in 40s
CI / test-python-document-crawler (push) Successful in 25s
CI / test-python-dsms-gateway (push) Successful in 20s
CI / validate-canonical-controls (push) Successful in 13s
Build + Deploy / trigger-orca (push) Successful in 2m46s
fix: Add missing service modules required by agent_scan_routes
These files existed on the feature branch but were never cherry-picked
to main, causing ModuleNotFoundError on import:
- dse_parser.py — parses DSE HTML into structured sections
- dse_matcher.py — matches detected services against DSE sections
- mandatory_content_checker.py — checks Art. 13 DSGVO mandatory fields
- legal_basis_validator.py — validates legal basis (lit. a-f)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-04 23:23:02 +02:00

303 lines
12 KiB
Python

"""
Mandatory Content Checker — verifies that legally required content
is present on a website. Checks for missing documents, sections,
and mandatory information within documents.
Knows what MUST be there (not just what IS there).
"""
import logging
import re
from dataclasses import dataclass, field
from compliance.services.dse_parser import DSESection
logger = logging.getLogger(__name__)
@dataclass
class MandatoryFinding:
    """A single gap reported by one of the mandatory-content checks in this module."""

    # Machine-readable identifier. The checks in this module build it from a
    # prefix and the upper-cased requirement id, e.g. "DOC-MISSING-IMPRESSUM".
    code: str
    severity: str  # "HIGH", "MEDIUM", "LOW"
    # "document_missing", "document_error", "section_missing", "info_missing"
    category: str
    text: str  # Human-readable description of the gap
    legal_ref: str  # Legal provision the requirement is based on
    expected: str  # What should be there
    suggestion: str = ""  # How to fix
# ═══════════════════════════════════════════════════════════════
# MANDATORY DOCUMENTS (must exist as pages/links on the website)
# ═══════════════════════════════════════════════════════════════
# Pages a website is expected to provide. Each entry carries:
#   id             - short key; upper-cased into finding codes
#                    (e.g. DOC-MISSING-IMPRESSUM)
#   name           - display name used in finding texts
#   legal_ref      - provision the requirement is based on
#   patterns       - regexes searched case-insensitively in every scanned page entry
#   severity       - severity of the finding raised when the document is not found
#   only_ecommerce - optional; when truthy the entry is skipped unless
#                    _is_ecommerce() classifies the site as transactional
# NOTE(review): "§5 TMG" — the TMG appears to have been superseded by the DDG
# in 2024; confirm whether the legal_ref strings should be updated.
MANDATORY_DOCUMENTS = [
    {
        "id": "impressum",
        "name": "Impressum",
        "legal_ref": "§5 TMG, §18 MStV",
        "patterns": [r"impressum", r"imprint", r"legal.?notice"],
        "severity": "HIGH",
    },
    {
        "id": "datenschutz",
        "name": "Datenschutzerklaerung",
        "legal_ref": "Art. 13/14 DSGVO",
        "patterns": [r"datenschutz", r"privacy", r"dsgvo"],
        "severity": "HIGH",
    },
    {
        "id": "agb",
        "name": "AGB / Nutzungsbedingungen",
        "legal_ref": "§305 BGB (bei Vertragsschluss)",
        "patterns": [r"agb", r"nutzungsbedingung", r"terms"],
        "severity": "MEDIUM",
        "only_ecommerce": True,  # Only for shops / booking sites
    },
    {
        "id": "widerruf",
        "name": "Widerrufsbelehrung",
        "legal_ref": "§355 BGB, Art. 246a §1 EGBGB (nur Fernabsatz)",
        "patterns": [r"widerruf", r"cancellation.?policy", r"right.?of.?withdrawal"],
        "severity": "MEDIUM",
        "only_ecommerce": True,  # Only for distance-selling contracts
    },
]
# ═══════════════════════════════════════════════════════════════
# MANDATORY DSE SECTIONS (Art. 13 DSGVO Pflichtangaben)
# ═══════════════════════════════════════════════════════════════
# Mandatory disclosures of a privacy policy. Each entry carries:
#   id        - short key; upper-cased into the finding code (DSE-CONTENT-<ID>)
#   name      - display name used in the finding text
#   legal_ref - provision the requirement is based on
#   keywords  - lowercase search terms; check_dse_mandatory_content() treats the
#               requirement as covered when any of them occurs in the policy
#               text or at the start of a parsed section
#   severity  - severity of the finding raised when no keyword is present
MANDATORY_DSE_CONTENT = [
    {
        "id": "verantwortlicher",
        "name": "Name und Kontakt des Verantwortlichen",
        "legal_ref": "Art. 13 Abs. 1 lit. a DSGVO",
        "keywords": ["verantwortlich", "responsible", "controller", "betreiber"],
        "severity": "HIGH",
    },
    {
        "id": "dsb_kontakt",
        "name": "Kontaktdaten des Datenschutzbeauftragten",
        "legal_ref": "Art. 13 Abs. 1 lit. b DSGVO",
        "keywords": ["datenschutzbeauftragt", "data protection officer", "dsb", "dpo",
                     "behördlichen datenschutz", "behoerdlichen datenschutz",
                     "datenschutz@", "datenschutzbeauftragter"],
        "severity": "HIGH",
    },
    {
        "id": "zwecke",
        "name": "Zwecke der Datenverarbeitung",
        "legal_ref": "Art. 13 Abs. 1 lit. c DSGVO",
        "keywords": ["zweck", "purpose", "verarbeitungszweck", "verarbeitungszwecke",
                     "wozu", "wofuer", "zu welchem zweck", "nutzungszweck",
                     "zweck und rechtsgrundlage", "zwecke der verarbeitung"],
        "severity": "HIGH",
    },
    {
        "id": "rechtsgrundlage",
        "name": "Rechtsgrundlagen der Verarbeitung",
        "legal_ref": "Art. 13 Abs. 1 lit. c DSGVO",
        "keywords": ["rechtsgrundlage", "legal basis", "art. 6", "art.6",
                     "berechtigtes interesse", "einwilligung", "vertragserfuellung",
                     "vertragserfüllung", "rechtliche verpflichtung"],
        "severity": "HIGH",
    },
    {
        "id": "speicherdauer",
        "name": "Speicherdauer / Loeschfristen",
        "legal_ref": "Art. 13 Abs. 2 lit. a DSGVO",
        "keywords": ["speicherdauer", "aufbewahrung", "loeschung", "loeschfrist",
                     "storage period", "retention", "deletion"],
        "severity": "HIGH",
    },
    {
        "id": "betroffenenrechte",
        "name": "Betroffenenrechte (Auskunft, Loeschung, etc.)",
        "legal_ref": "Art. 13 Abs. 2 lit. b-d DSGVO",
        "keywords": ["betroffenenrecht", "auskunft", "berichtigung", "loeschung",
                     "einschraenkung", "widerspruch", "data subject rights",
                     "right to access", "right to erasure"],
        "severity": "HIGH",
    },
    {
        "id": "beschwerderecht",
        "name": "Beschwerderecht bei Aufsichtsbehoerde",
        "legal_ref": "Art. 13 Abs. 2 lit. d DSGVO",
        "keywords": ["aufsichtsbehoerde", "aufsichtsbehörde", "beschwerde",
                     "supervisory authority", "datenschutzbehoerde",
                     "landesbeauftragte", "bundesdatenschutz", "bfdi"],
        "severity": "MEDIUM",
    },
    {
        "id": "drittlandtransfer",
        "name": "Drittlandtransfer-Information",
        "legal_ref": "Art. 13 Abs. 1 lit. f DSGVO",
        "keywords": ["drittland", "drittst", "third countr", "usa", "transfer",
                     "standardvertragsklausel", "adequacy"],
        "severity": "MEDIUM",
    },
    {
        "id": "automatisierte_entscheidung",
        "name": "Automatisierte Entscheidungsfindung / Profiling",
        "legal_ref": "Art. 13 Abs. 2 lit. f DSGVO",
        "keywords": ["automatisiert", "profiling", "automated decision", "scoring"],
        "severity": "MEDIUM",
    },
]
# ═══════════════════════════════════════════════════════════════
# MANDATORY IMPRESSUM CONTENT (§5 TMG)
# ═══════════════════════════════════════════════════════════════
# Mandatory Impressum details. Each entry carries:
#   id        - short key; upper-cased into the finding code (IMP-CONTENT-<ID>)
#   name      - display name used in the finding text
#   legal_ref - provision the requirement is based on
#   keywords  - regular expressions; check_impressum_mandatory_content() runs
#               each one with re.search() against the lowercased Impressum
#               text, so patterns must be lowercase and literal dots escaped
#   severity  - severity of the finding raised when no keyword matches
# NOTE(review): "§5 TMG" — the TMG appears to have been superseded by the DDG
# in 2024; confirm whether the legal_ref strings should be updated.
MANDATORY_IMPRESSUM_CONTENT = [
    {
        "id": "geschaeftsfuehrer",
        "name": "Geschaeftsfuehrer / Vertretungsberechtigter",
        "legal_ref": "§5 Abs. 1 Nr. 1 TMG",
        "keywords": ["geschaeftsfuehrer", "geschäftsführer", "ceo", "managing director",
                     "vertretungsberechtig", "vorstand"],
        "severity": "HIGH",
    },
    {
        "id": "handelsregister",
        "name": "Handelsregisternummer",
        "legal_ref": "§5 Abs. 1 Nr. 4 TMG",
        "keywords": ["handelsregister", "hrb", "hra", "amtsgericht", "registergericht",
                     "commercial register"],
        "severity": "HIGH",
    },
    {
        "id": "ust_id",
        "name": "Umsatzsteuer-Identifikationsnummer",
        "legal_ref": "§5 Abs. 1 Nr. 6 TMG",
        "keywords": [
            "ust-id",
            r"ust\.-id",  # literal dot; unescaped it matched any character
            "umsatzsteuer",
            r"(?<![a-z])vat",  # "vat" as a word start, not the tail of "privat"
            r"de\d{9}",  # "de" followed by nine digits
        ],
        "severity": "MEDIUM",
    },
    {
        "id": "anschrift",
        "name": "Anschrift (Strasse, PLZ, Ort)",
        "legal_ref": "§5 Abs. 1 Nr. 1 TMG",
        "keywords": [
            r"str\.",  # abbreviation only; unescaped it also matched "stri", "stro", ...
            "straße",
            "strasse",
            "plz",
            "postleitzahl",
        ],
        "severity": "HIGH",
    },
    {
        "id": "kontakt",
        "name": "Kontaktmoeglichkeit (Email oder Telefon)",
        "legal_ref": "§5 Abs. 1 Nr. 2 TMG",
        "keywords": ["@", "telefon", "phone", "e-mail", "email", "kontakt"],
        "severity": "HIGH",
    },
]
# Regex fragments hinting at a shop / booking / checkout flow. They are searched
# in lowercased text, so they are written in lowercase. Fragments that are also
# the tail or head of common unrelated words are anchored: "(?<![a-z])" demands
# that no letter precedes the match, "(?![a-z])" that none follows.
ECOMMERCE_INDICATORS = [
    r"warenkorb",
    r"cart",
    r"(?<![a-z])(?:online|web|e)?shop",  # "shop", "onlineshop"; not "workshop"
    r"bestell",
    r"(?<![a-z])order",  # not CSS "border"
    r"checkout",
    r"(?<![a-z])kasse",  # not "krankenkasse" / "sparkasse"
    r"buy",
    r"kaufen",
    r"add.?to.?cart",
    r"stripe(?![a-z])|paypal|klarna|mollie|adyen",  # Payment providers; not "striped"
]


def _is_ecommerce(scanned_pages: list[str], html_content: str = "") -> bool:
    """Detect if website is an e-commerce/transactional site.

    This is a keyword heuristic: the scanned page entries and the HTML are
    lowercased, concatenated, and searched for any ECOMMERCE_INDICATORS pattern.

    Args:
        scanned_pages: Page entries (e.g. URLs) collected by the scan.
        html_content: Optional raw HTML to search in addition to the pages.

    Returns:
        True if at least one indicator pattern matches, otherwise False.
    """
    # Tolerate missing inputs instead of failing on None.
    haystack = " ".join(scanned_pages or []).lower() + " " + (html_content or "").lower()
    return any(re.search(pattern, haystack) for pattern in ECOMMERCE_INDICATORS)
def check_mandatory_documents(
    scanned_pages: list[str], page_status: dict[str, int],
    html_content: str = "",
) -> list[MandatoryFinding]:
    """Check if mandatory documents/pages exist on the website.

    For every entry in MANDATORY_DOCUMENTS the scanned pages are searched
    case-insensitively for the entry's patterns. The outcome per document is:

    * at least one matching page is reachable (status < 400): no finding;
    * matching pages exist but all of them return an error status: one
      ``DOC-ERROR-<ID>`` finding for the first such page;
    * no page matches: one ``DOC-MISSING-<ID>`` finding.

    Entries flagged ``only_ecommerce`` are skipped unless the site is
    classified as transactional by ``_is_ecommerce``.

    Args:
        scanned_pages: Page entries (e.g. URLs) collected by the scan.
        page_status: HTTP status per page entry; pages without an entry are
            treated as reachable (200).
        html_content: Optional HTML passed on to the e-commerce detection.

    Returns:
        The findings for missing or unreachable mandatory documents.
    """
    findings: list[MandatoryFinding] = []
    is_shop = _is_ecommerce(scanned_pages, html_content)

    for doc in MANDATORY_DOCUMENTS:
        # Skip e-commerce-only checks for non-shop websites
        if doc.get("only_ecommerce") and not is_shop:
            continue

        matches = [
            (page, page_status.get(page, 200))
            for page in scanned_pages
            if any(re.search(p, page, re.IGNORECASE) for p in doc["patterns"])
        ]

        if not matches:
            findings.append(MandatoryFinding(
                code=f"DOC-MISSING-{doc['id'].upper()}",
                severity=doc["severity"],
                category="document_missing",
                text=f"{doc['name']} nicht auf der Website gefunden ({doc['legal_ref']})",
                legal_ref=doc["legal_ref"],
                expected=f"Link zu {doc['name']} muss von jeder Seite erreichbar sein",
            ))
            continue

        # One reachable match is enough: a broken alias (e.g. an old URL) must
        # not mask a working page that appears elsewhere in the scan.
        if any(status < 400 for _, status in matches):
            continue

        # The document exists but every matching page is broken; report the first.
        page, status = matches[0]
        findings.append(MandatoryFinding(
            code=f"DOC-ERROR-{doc['id'].upper()}",
            severity="HIGH",
            category="document_error",
            text=f"{doc['name']} existiert aber gibt HTTP {status} zurueck (Ladefehler!)",
            legal_ref=doc["legal_ref"],
            expected=doc["name"],
            suggestion=f"Seite {page} ist nicht erreichbar. Pruefen ob ein Deployment-Fehler vorliegt.",
        ))

    return findings
# Maps German umlauts and sharp s to their ASCII digraphs so that
# "löschung" and "loeschung" compare equal.
_GERMAN_FOLD_TABLE = str.maketrans({"ä": "ae", "ö": "oe", "ü": "ue", "ß": "ss"})


def _fold_german(text: str) -> str:
    """Lowercase *text* and transliterate ä/ö/ü/ß to ae/oe/ue/ss."""
    return text.lower().translate(_GERMAN_FOLD_TABLE)


def _dse_keyword_present(keyword: str, folded_text: str) -> bool:
    """Return True if *keyword* occurs in *folded_text* (already folded).

    Purely alphabetic keywords of up to three characters are abbreviations
    and must match as whole words; as substrings they hit unrelated words
    (e.g. "usa" inside "zusammenhang"). Longer keywords match as substrings
    so that word stems keep working.
    """
    needle = _fold_german(keyword)
    if len(needle) <= 3 and needle.isalpha():
        return re.search(rf"\b{re.escape(needle)}\b", folded_text) is not None
    return needle in folded_text


def check_dse_mandatory_content(
    sections: list[DSESection], full_text: str,
) -> list[MandatoryFinding]:
    """Check if privacy policy contains all mandatory sections per Art. 13 DSGVO.

    A requirement from MANDATORY_DSE_CONTENT counts as covered when one of its
    keywords occurs in ``full_text`` or, failing that, in the heading or the
    first 200 characters of any parsed section. Text and keywords are compared
    case-insensitively and with umlauts folded, so both "Löschung" and
    "Loeschung" satisfy the keyword "loeschung".

    Args:
        sections: Parsed sections of the privacy policy; ``heading`` and
            ``content`` of each section are read.
        full_text: Complete text of the privacy policy.

    Returns:
        One ``DSE-CONTENT-<ID>`` finding per requirement without a keyword hit.
    """
    findings = []
    folded_text = _fold_german(full_text)
    # Built lazily: sections are only inspected once a requirement is not
    # satisfied by the full text.
    folded_sections = None

    for req in MANDATORY_DSE_CONTENT:
        keywords = req["keywords"]
        found = any(_dse_keyword_present(kw, folded_text) for kw in keywords)

        if not found:
            # Also check section headings and the opening of each section
            if folded_sections is None:
                folded_sections = [
                    (_fold_german(s.heading), _fold_german(s.content.lower()[:200]))
                    for s in sections
                ]
            found = any(
                _dse_keyword_present(kw, heading) or _dse_keyword_present(kw, intro)
                for heading, intro in folded_sections
                for kw in keywords
            )

        if not found:
            findings.append(MandatoryFinding(
                code=f"DSE-CONTENT-{req['id'].upper()}",
                severity=req["severity"],
                category="section_missing",
                text=f"Pflichtangabe fehlt: {req['name']} ({req['legal_ref']})",
                legal_ref=req["legal_ref"],
                expected=req["name"],
            ))

    return findings
def check_impressum_mandatory_content(
    impressum_text: str,
) -> list[MandatoryFinding]:
    """Report every mandatory Impressum detail (§5 TMG) that cannot be located.

    Each requirement in MANDATORY_IMPRESSUM_CONTENT lists keywords that are
    applied as regular expressions to the lowercased Impressum text. A
    requirement without a single hit yields one ``IMP-CONTENT-<ID>`` finding.

    Args:
        impressum_text: Text of the Impressum page.

    Returns:
        The findings, in the order of MANDATORY_IMPRESSUM_CONTENT.
    """
    haystack = impressum_text.lower()

    def _is_covered(requirement: dict) -> bool:
        # Stop at the first keyword pattern that matches.
        for pattern in requirement["keywords"]:
            if re.search(pattern, haystack):
                return True
        return False

    return [
        MandatoryFinding(
            code=f"IMP-CONTENT-{requirement['id'].upper()}",
            severity=requirement["severity"],
            category="info_missing",
            text=f"Impressum: {requirement['name']} fehlt ({requirement['legal_ref']})",
            legal_ref=requirement["legal_ref"],
            expected=requirement["name"],
        )
        for requirement in MANDATORY_IMPRESSUM_CONTENT
        if not _is_covered(requirement)
    ]