Files
breakpilot-compliance/backend-compliance/compliance/services/cross_domain_doc_check.py
T
Benjamin Admin d208a2bde2
CI / validate-canonical-controls (push) Successful in 11s
CI / loc-budget (push) Successful in 13s
CI / go-lint (push) Has been skipped
CI / test-go (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / detect-changes (push) Successful in 7s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / build-sha-integrity (push) Failing after 4s
CI / python-lint (push) Has been skipped
CI / nodejs-build (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-python-backend (push) Successful in 30s
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
feat: Mail-Restrukturierung + B22 Cross-Domain-Doc-Detector
User-Feedback BMW v5: "740 Cookies verschwunden auf 31, Übersicht
verloren". Drei Anpassungen:

Mail-Restrukturierung (_executive_summary.py + _compose.py):
  - render_executive_summary(): Top-of-mail TL;DR mit
    Compliance-Score (gross + farbig), Top-3-Findings nach
    Severity, Cookie-Statistik (deklariert/Browser/Drittland),
    Severity-Verteilungs-Chips.
  - collapsible(): wrapt jeden Block in <details>/<summary>.
    Mailpit + alle modernen Mail-Clients rendern das nativ.
  - _compose.py: alle 18+ B-Blöcke + per_doc + per_theme +
    legacy_html in Akkordeons. NUR Critical-Findings + Sofort-
    massnahmen sind immer offen — Reviewer sieht ~15 Zeilen
    Übersicht und klappt selektiv auf.
  - Cookie-Inventar (742) hat jetzt eigene Sektion ganz oben
    (Akkordeon "🍪 Cookie-Inventar"), Vendor-Karten parallel.

B22 Cross-Domain-Legal-Doc-Detector (cross_domain_doc_check.py):
  Real-Beispiel User-Feedback: Elli's AGB liegt auf docs.logpay.de
  statt elli.eco. Detektor erkennt SLD-Mismatch:
  - HIGH bei agb / widerruf (vertragsrelevant)
  - MEDIUM bei dse / nutzungsbedingungen
  - INFO bei cookie / impressum (Best-Practice)
  Norm: DSGVO Art. 28 (AVV-Pflicht für Hosting) + Art. 13 Abs. 1
  lit. e (Empfänger) + § 312i BGB (Cool-URLs).
  9/9 Tests grün inkl. Elli/LogPay Pattern.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-06-08 11:35:55 +02:00

152 lines
5.0 KiB
Python

"""B22 — Cross-Domain-Legal-Doc-Detector.
Erkennt: vertragsrelevante Dokumente (AGB, DSE, Widerrufsbelehrung,
Nutzungsbedingungen) liegen auf einer anderen Second-Level-Domain als
die Site selbst. Beispiel Elli/LogPay: AGB von Elli (elli.eco) liegt
auf docs.logpay.de.
Norm-Argument:
- DSGVO Art. 28: das Hosten von Vertragsdokumenten durch einen
Dritten ist Auftragsverarbeitung — AVV-Pflicht.
- DSGVO Art. 13 Abs. 1 lit. e: Empfänger / Auftragsverarbeiter
müssen in der DSE benannt sein.
- Vertragsrechtlich: AGB-Verbindlichkeit wackelig wenn der
Dokumenten-Host wechselt — was passiert wenn der externe Host
den Pfad ändert (Cool-URLs-Problem § 312i BGB).
Severity:
- HIGH bei AGB / Widerrufsbelehrung (vertragsrelevant)
- MEDIUM bei DSE / Nutzungsbedingungen
- INFO bei Cookie-Policy / Impressum (eher Best-Practice)
"""
from __future__ import annotations
import logging
from urllib.parse import urlparse
logger = logging.getLogger(__name__)
_COMPOUND_TLDS = {
"co.uk", "co.jp", "co.nz", "co.kr", "co.za", "co.in",
"com.au", "com.br", "com.mx", "com.tr", "com.sg",
}
_SEVERITY_BY_DOC = {
"agb": "HIGH",
"widerruf": "HIGH",
"dse": "MEDIUM",
"nutzungsbedingungen": "MEDIUM",
"cookie": "INFO",
"impressum": "INFO",
"social_media": "INFO",
}
def _sld(host: str) -> str:
"""Extract the second-level domain. Handles compound TLDs."""
if not host:
return ""
host = host.lower().lstrip("www.")
parts = host.split(".")
if len(parts) < 2:
return host
if len(parts) >= 3 and ".".join(parts[-2:]) in _COMPOUND_TLDS:
return parts[-3]
return parts[-2]
def _site_origin_sld(state: dict) -> str:
"""Find the primary site SLD by counting most common host in
submitted URLs."""
counter: dict[str, int] = {}
for e in (state.get("doc_entries") or []):
url = (e.get("url") or "").strip()
if not url or "://" not in url:
continue
# Skip auto-discovered docs (they may already be cross-domain
# by design — we want the USER's stated origin).
if e.get("auto_discovered"):
continue
try:
host = urlparse(url).netloc
sld = _sld(host)
if sld:
counter[sld] = counter.get(sld, 0) + 1
except Exception:
continue
if not counter:
# Fallback: use any URL
for e in (state.get("doc_entries") or []):
url = (e.get("url") or "").strip()
if url and "://" in url:
return _sld(urlparse(url).netloc)
return ""
return max(counter, key=counter.get)
def check_cross_domain_docs(state: dict) -> list[dict]:
    """Emit one finding per doc entry hosted on a different SLD than the site.

    The site's primary SLD comes from ``_site_origin_sld(state)``. Every
    entry in ``state["doc_entries"]`` whose URL resolves to a different,
    non-empty SLD produces a ``CROSS-DOMAIN-DOC-001`` finding. Its
    severity is looked up in ``_SEVERITY_BY_DOC`` by the entry's
    lower-cased ``doc_type`` ("MEDIUM" when the type is unknown).

    Returns an empty list when no primary SLD can be determined or no
    entry is cross-domain. Entries without an absolute URL, or with a URL
    that ``urlparse`` rejects, are skipped.
    """
    primary = _site_origin_sld(state)
    if not primary:
        return []

    # Human-readable labels for the finding title; built once instead of
    # on every loop iteration.
    doc_labels = {
        "agb": "Allgemeine Geschäftsbedingungen",
        "widerruf": "Widerrufsbelehrung",
        "dse": "Datenschutzerklärung",
        "nutzungsbedingungen": "Nutzungsbedingungen",
        "cookie": "Cookie-Richtlinie",
        "impressum": "Impressum",
        "social_media": "Social-Media-Hinweise",
    }

    findings: list[dict] = []
    for entry in state.get("doc_entries") or []:
        url = (entry.get("url") or "").strip()
        doc_type = (entry.get("doc_type") or "").lower()
        if not url or "://" not in url:
            continue
        try:
            host = urlparse(url).netloc
        except ValueError:
            # Malformed netloc — nothing to compare against.
            continue
        url_sld = _sld(host)
        if not url_sld or url_sld == primary:
            continue

        # Cross-domain hosting detected.
        severity = _SEVERITY_BY_DOC.get(doc_type, "MEDIUM")
        # Unknown types fall back to their upper-cased type name.
        doc_label = doc_labels.get(doc_type, doc_type.upper())
        findings.append({
            "check_id": "CROSS-DOMAIN-DOC-001",
            "severity": severity,
            "severity_reason": "third_party_hosted",
            "doc_type": doc_type,
            "site_sld": primary,
            "host_sld": url_sld,
            "url": url,
            "title": (
                f"{doc_label} liegt auf Drittanbieter-Domain "
                f"({host}) statt {primary}"
            ),
            "norm": (
                "DSGVO Art. 28 (AVV) + Art. 13 Abs. 1 lit. e (Empfänger) + "
                "§ 312i BGB (Cool-URLs / Vertragspflicht)"
            ),
            "evidence": (
                f"Site-Origin: {primary} · "
                f"Dokument gehostet auf: {host} · "
                f"URL: {url[:120]}"
            ),
            "recommended_action": (
                f"Entweder das Dokument auf eigene Domain ({primary}) "
                # This segment needs the f-prefix: without it the literal
                # text "{host}" was emitted instead of the host name.
                f"migrieren ODER (a) den externen Host {host} als "
                "Auftragsverarbeiter in der DSE benennen, (b) AVV "
                "abschließen, (c) sicherstellen dass URL-Stabilität "
                "vertraglich garantiert ist (§ 312i BGB Cool-URL-Pflicht)."
            ),
        })

    if findings:
        logger.info("B22 cross-domain: %d finding(s)", len(findings))
    return findings