28a078ccb4
CI / detect-changes (push) Successful in 10s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / validate-canonical-controls (push) Successful in 16s
CI / loc-budget (push) Failing after 17s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Has been skipped
CI / test-go (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-backend (push) Successful in 41s
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
Neuer Service cookie_policy_architecture.detect_architecture(...) prueft
vier Diagnose-Punkte der Cookie-Policy einer Website:
1. Layer-Trennung: single (BMW-Pattern: Banner + Info in EINER URL)
| separate (Best Practice: getrennte Layer)
2. Versionierung: "Stand vom DD.MM.JJJJ" / "Version X.Y" / ...
3. Dynamic content: CMP-Capture auf Doc-URL oder Marker-Texte
4. Vendor-Count im Text: Indikator ob Liste statisch drinsteht
Risiko-Ampel:
- gruen: separate + versioned + statisch
- gelb : single+unversioned (BMW) ODER separate+unversioned
- rot : weder noch (Pflicht-Info fehlt)
Wire-in im Compliance-Check-Worker: nach Exec-Summary-Block wird der
Architecture-Block gerendert (build_architecture_html) mit konkreter
Empfehlung. Bei BMW-Pattern: "Snapshot der dynamischen Vendor-Tabelle
als versioniertes PDF im Archiv."
Hintergrund: BMW hat eine HTML-Seite die GLEICHZEITIG Banner-Re-Trigger
und Cookie-Richtlinie ist. Mindestanforderung nach §25 TDDDG + Art. 13
DSGVO erfuellt, aber bei einer Aufsichtsbehoerden-Pruefung kann nicht
belegt werden welche Vendor-Liste an einem bestimmten Stichtag aktiv
war. Das ist kein Verstoss aber best-practice-Luecke.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
256 lines
9.3 KiB
Python
256 lines
9.3 KiB
Python
"""
|
|
Cookie policy architecture detection.

Detects four diagnostic points for the legal assessment of a website's
cookie policy. Background: GDPR (DSGVO) and TDDDG require TWO layers
(a banner for consent + a cookie policy for information), but leave open
whether these are implemented in one or in two HTML documents.

BMW pattern: a single HTML page is BOTH the banner re-trigger and the
cookie policy. The minimum requirement is met, but no versioned audit
trail is possible -> "gelb" (yellow) risk.

Output format:
{
    "layer_separation": "single" | "separate" | "unknown",
    "versioned": bool,
    "dynamic_content": bool,
    "vendor_count_in_text": int,
    "risk_label": "gruen" | "gelb" | "rot",
    "recommendation": str,
    "signals": [{"src": ..., "detail": ...}],
    "doc_url_normalized": str,
}
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
from urllib.parse import urlparse
|
|
|
|
|
|
# Regexes for version/date stamps such as "Stand vom DD.MM.JJJJ",
# "Stand: DD.MM.JJJJ" or "Version X.Y". All patterns are written in lowercase;
# _check_versioned() searches them in a text its caller has lowercased.
_VERSION_PATTERNS = [
    # "stand", optional ":" / "-", optional "vom", then a numeric D.M.YYYY date
    r"stand\s*[:\-]?\s*(?:vom\s+)?\d{1,2}\.\s*\d{1,2}\.\s*\d{4}",
    r"stand\s*[:\-]?\s*\d{1,2}\.\s*\w+\s+\d{4}",  # "Stand: 1. Mai 2026"
    # "letzte aktualisierung/aenderung/änderung" followed by a day number and "."
    r"letzte\s+(?:aktualisierung|aenderung|änderung)\s*[:\-]?\s*\d{1,2}\.",
    # "version" followed by a number with an optional ".minor" part
    r"version\s*[:\-]?\s*\d+(?:\.\d+)?",
    # "stand der information…" / "stand der cookie…" followed by a day number and "."
    r"stand\s+der\s+(?:information|cookie)\w*\s*[:\-]?\s*\d{1,2}\.",
    # "gueltig ab" / "gültig ab" followed by a numeric D.M.YYYY date
    r"(?:gueltig|gültig)\s+ab\s+\d{1,2}\.\s*\d{1,2}\.\s*\d{4}",
]
|
|
|
|
# Hints that the content is generated dynamically. Plain lowercase substrings;
# _check_dynamic() tests them with `in` against the text and returns the first
# one found. Umlaut phrases are listed in both spellings (ä / ae).
_DYNAMIC_MARKERS = [
    "wird automatisch aktualisiert",
    "wird dynamisch generiert",
    "wird laufend angepasst",
    "cookie-einstellungen ändern",
    "cookie-einstellungen aendern",
    "cookie-praeferenzen verwalten",
    "cookie-präferenzen verwalten",
    "consent aktualisieren",
    "einwilligung verwalten",
    "einwilligungs-einstellungen",
]
|
|
|
|
# CMP trigger markers (container/button texts that typically re-open the
# banner). Plain lowercase substrings; _check_banner_trigger() tests them with
# `in` against the text and returns the first one found.
_BANNER_TRIGGER_MARKERS = [
    "cookie-einstellungen öffnen",
    "cookie einstellungen öffnen",
    "ihre cookie-präferenzen",
    "ihre cookie praeferenzen",
    "consent banner",
    "datenschutz-einstellungen",
    "cookie-banner anzeigen",
]
|
|
|
|
|
|
def _normalize_url(u: str) -> str:
|
|
if not u:
|
|
return ""
|
|
if "://" not in u:
|
|
u = "https://" + u
|
|
p = urlparse(u)
|
|
path = p.path.rstrip("/").lower()
|
|
host = p.netloc.lower().replace("www.", "")
|
|
return f"{host}{path}"
|
|
|
|
|
|
def _check_versioned(text_lower: str) -> tuple[bool, str | None]:
    """Look for a version or date stamp in the policy text.

    Each regex in ``_VERSION_PATTERNS`` is tried in list order and the search
    stops at the first hit. No case folding happens here; the text is used as
    passed in.

    Returns:
        ``(True, snippet)`` where ``snippet`` is the matched text cut to at
        most 80 characters, or ``(False, None)`` when no pattern matches.
    """
    # Lazy generator: later patterns are not evaluated once one has matched.
    attempts = (re.search(pattern, text_lower) for pattern in _VERSION_PATTERNS)
    hit = next((match for match in attempts if match), None)
    if hit is None:
        return False, None
    return True, hit.group(0)[:80]
|
|
|
|
|
|
def _check_dynamic(text_lower: str) -> tuple[bool, str | None]:
    """Report the first ``_DYNAMIC_MARKERS`` entry contained in the text.

    Returns:
        ``(True, marker)`` for the first marker (in list order) that occurs as
        a substring of ``text_lower``, otherwise ``(False, None)``.
    """
    found = next(
        (candidate for candidate in _DYNAMIC_MARKERS if candidate in text_lower),
        None,
    )
    if found is None:
        return False, None
    return True, found
|
|
|
|
|
|
def _check_banner_trigger(text_lower: str) -> tuple[bool, str | None]:
    """Report the first ``_BANNER_TRIGGER_MARKERS`` entry contained in the text.

    Returns:
        ``(True, marker)`` for the first marker (in list order) that occurs as
        a substring of ``text_lower``, otherwise ``(False, None)``.
    """
    found = next(
        (candidate for candidate in _BANNER_TRIGGER_MARKERS if candidate in text_lower),
        None,
    )
    if found is None:
        return False, None
    return True, found
|
|
|
|
|
|
def _count_vendor_signals(text_lower: str) -> int:
|
|
"""Zaehle wieviele Vendor-Namen im Text — Indikator ob die Liste statisch
|
|
drinsteht oder dynamisch nachgeladen wird."""
|
|
vendor_signals = [
|
|
"google", "meta", "facebook", "adobe", "microsoft", "linkedin",
|
|
"tiktok", "amazon", "hotjar", "cloudflare", "stripe", "salesforce",
|
|
"hubspot", "mailchimp", "pinterest", "snapchat", "youtube", "vimeo",
|
|
]
|
|
return sum(1 for v in vendor_signals if v in text_lower)
|
|
|
|
|
|
def detect_architecture(
    doc_url: str,
    doc_text: str,
    cmp_payloads: list[dict] | None = None,
    homepage_cmp_payloads: list[dict] | None = None,
) -> dict:
    """Assess the layer architecture of a cookie policy document.

    Args:
        doc_url: URL of the detected cookie policy document.
        doc_text: Full text of the cookie policy (None is treated as empty).
        cmp_payloads: CMP captures that occurred DURING the document crawl.
        homepage_cmp_payloads: CMP captures from the initial homepage crawl.

    Returns:
        A dict with the keys ``layer_separation`` ("single" | "separate" |
        "unknown"), ``versioned``, ``dynamic_content``,
        ``vendor_count_in_text``, ``risk_label`` ("gruen" | "gelb" | "rot"),
        ``recommendation``, ``signals`` (list of ``{"src", "detail"}`` dicts)
        and ``doc_url_normalized``.
    """
    haystack = (doc_text or "").lower()
    findings: list[dict] = []

    # 1. Single vs. separate layer. Either a CMP capture on the document URL
    #    or a banner-trigger text on the page marks the document as "single".
    cmp_seen_on_doc = bool(cmp_payloads)
    has_trigger, trigger_marker = _check_banner_trigger(haystack)
    if cmp_seen_on_doc or has_trigger:
        layer = "single"
        if cmp_seen_on_doc and has_trigger:
            findings.append({
                "src": "cmp+marker",
                "detail": f"CMP feuerte auf Doc-URL + Marker '{trigger_marker}'",
            })
        elif cmp_seen_on_doc:
            findings.append({"src": "cmp", "detail": "CMP-Payload waehrend Doc-Crawl"})
        else:
            findings.append({"src": "marker", "detail": f"Trigger-Marker: '{trigger_marker}'"})
    elif homepage_cmp_payloads:
        # The CMP fired on the homepage only, not on the document itself.
        layer = "separate"
        findings.append({
            "src": "topology",
            "detail": "Banner triggert nur auf Homepage, Cookie-Doc ist eigene Seite",
        })
    else:
        layer = "unknown"

    # 2. Versioning
    versioned, version_marker = _check_versioned(haystack)
    if versioned:
        findings.append({"src": "version", "detail": f"Marker: '{version_marker}'"})

    # 3. Dynamic content: a text marker or a CMP capture on the document URL.
    #    Only a text marker produces a signal entry.
    marker_found, dyn_marker = _check_dynamic(haystack)
    dynamic = marker_found or cmp_seen_on_doc
    if dyn_marker:
        findings.append({"src": "dynamic", "detail": dyn_marker})

    # 4. Vendor count (indicator of whether the list is static in the text)
    vendor_count = _count_vendor_signals(haystack)

    # Risk rating, grouped by layer outcome.
    if layer == "single":
        risk = "gelb"
        if versioned:
            rec = (
                "Single-Layer mit Versionierung — gute Mindestloesung. "
                "Best Practice waere zusaetzlich eine getrennte statische "
                "Vendor-Tabelle die Crawler indexieren koennen."
            )
        else:
            rec = (
                "BMW-Pattern erkannt: Single-Layer-CMP (Banner-Trigger + "
                "Info-Layer in einer URL). Mindestanforderung erfuellt, aber "
                "OHNE Versionierung. Bei einer Aufsichtsbehoerden-Pruefung "
                "kann nicht belegt werden welche Vendor-Liste an einem "
                "bestimmten Stichtag aktiv war. Empfehlung: monatlicher "
                "Snapshot der dynamischen Vendor-Tabelle als versioniertes "
                "PDF im Archiv."
            )
    elif layer == "separate":
        if versioned:
            risk = "gruen"
            rec = (
                "Best Practice umgesetzt: separater Banner + versionierte "
                "Cookie-Richtlinie."
            )
        else:
            risk = "gelb"
            rec = (
                "Separate Cookie-Richtlinie vorhanden, aber ohne Versionierung. "
                "Snapshot-Archiv empfohlen."
            )
    elif vendor_count < 3:
        # Layer is "unknown" and hardly any vendor names were found.
        risk = "rot"
        rec = (
            "Cookie-Richtlinie konnte nicht eindeutig identifiziert oder ist "
            "unzureichend. Pruefen Sie ob die Pflicht-Information nach "
            "Art. 13 DSGVO + §25 TDDDG ueberhaupt erreichbar ist."
        )
    else:
        # Layer is "unknown" but the text names at least three vendors.
        risk = "gelb"
        rec = "Cookie-Policy-Architektur uneindeutig — manuelle Pruefung empfohlen."

    return {
        "layer_separation": layer,
        "versioned": versioned,
        "dynamic_content": dynamic,
        "vendor_count_in_text": vendor_count,
        "risk_label": risk,
        "recommendation": rec,
        "signals": findings,
        "doc_url_normalized": _normalize_url(doc_url),
    }
|
|
|
|
|
|
def build_architecture_html(arch: dict) -> str:
    """Render the architecture block for the executive summary.

    Args:
        arch: Result dict with the keys ``risk_label``, ``layer_separation``,
            ``versioned``, ``dynamic_content``, ``vendor_count_in_text`` and
            ``recommendation``. Missing keys and unexpected values fall back
            to neutral defaults instead of raising.

    Returns:
        An HTML fragment (inline-styled ``<div>``), or ``""`` when ``arch``
        is empty/None.
    """
    # Local import: this block is the only user of html.escape.
    from html import escape

    if not arch:
        return ""

    risk_colors = {
        "gruen": ("#16a34a", "#dcfce7", "#166534"),
        "gelb": ("#d97706", "#fef3c7", "#92400e"),
        "rot": ("#dc2626", "#fee2e2", "#991b1b"),
    }
    # Unknown or missing risk labels get a neutral grey palette.
    border, bg, fg = risk_colors.get(
        arch.get("risk_label"), ("#94a3b8", "#f1f5f9", "#475569")
    )

    layer_labels = {
        "single": "Single-Layer (kombiniert)",
        "separate": "Separate Layer (Best Practice)",
        "unknown": "Nicht eindeutig",
    }
    # Same tolerance as for risk_label: an unexpected value must not raise.
    layer_label = layer_labels.get(arch.get("layer_separation"), "Nicht eindeutig")
    versioned_lbl = "ja" if arch.get("versioned") else "nein"
    dynamic_lbl = "ja (CMP-generiert)" if arch.get("dynamic_content") else "statisch"

    # Free-form values are escaped so that markup characters in them cannot
    # break the surrounding HTML.
    vendor_count = escape(str(arch.get("vendor_count_in_text", 0)), quote=False)
    recommendation = escape(str(arch.get("recommendation", "")), quote=False)

    return (
        f'<div style="font-family:-apple-system,BlinkMacSystemFont,sans-serif;'
        f'max-width:700px;margin:0 auto 14px;padding:12px 16px;'
        f'background:{bg};border:1px solid {border};border-radius:8px;color:{fg}">'
        f'<div style="font-size:11px;text-transform:uppercase;letter-spacing:1px;'
        f'font-weight:600;margin-bottom:6px">Cookie-Policy-Architektur</div>'
        f'<table style="width:100%;font-size:12px;margin:0">'
        f'<tr><td style="padding:2px 0;width:50%">Layer-Trennung</td>'
        f'<td><strong>{layer_label}</strong></td></tr>'
        f'<tr><td style="padding:2px 0">Versionierung</td>'
        f'<td><strong>{versioned_lbl}</strong></td></tr>'
        f'<tr><td style="padding:2px 0">Vendor-Liste</td>'
        f'<td><strong>{dynamic_lbl}</strong></td></tr>'
        f'<tr><td style="padding:2px 0">Vendor-Namen im Text</td>'
        f'<td><strong>{vendor_count}</strong></td></tr>'
        f'</table>'
        f'<div style="font-size:11px;margin-top:8px;padding-top:8px;'
        f'border-top:1px solid {border};font-style:italic">'
        f'{recommendation}</div>'
        f'</div>'
    )
|