6dc427a754
CI / detect-changes (push) Successful in 10s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / validate-canonical-controls (push) Successful in 14s
CI / loc-budget (push) Failing after 17s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Has been skipped
CI / test-go (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-backend (push) Successful in 42s
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
VW-404-Fix: submitted_types zaehlt jetzt nur Doc-Types mit >= 200 Zeichen echtem Text. Eine eingegebene URL die 404/Mini-Text liefert (VW cookie- richtlinie.html) wird als 'missing' behandelt, sodass Auto-Discovery alternative URLs auf der Homepage probiert. In-place-Update statt Duplicate-Entry, rejected_url wird fuer Audit-Transparenz aufgehoben. P52 LLM-Cascade Merge: vendor_llm_extractor laeuft jetzt bei < 5 Vendors (nicht nur bei 0), und die Ergebnisse werden MIT existing cmp_vendors gemerged statt zu ueberschreiben. VW-typische Setups (Generic CMP + 0 cmp_payloads) bekommen damit den Text-basierten Vendor-Layer dazu. P51 — banner_consistency_checks erweitert: * check_banner_copyability: scannt banner_html nach user-select:none / oncopy=return false / onselectstart. MEDIUM Finding wenn Banner-Text nicht kopierbar (Art. 7 (2) DSGVO). * check_consent_history: prueft auf 'Meine Einwilligungen' / Consent- Historie / Datenschutz-Cockpit. MEDIUM wenn keine sichtbare Historie (Art. 7 (3) — Widerruf muss so einfach wie Erteilung sein). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
459 lines
17 KiB
Python
459 lines
17 KiB
Python
"""
|
|
P92 + P94 — Banner-Konsistenz-Checks (Post-hoc auf banner_result).
|
|
|
|
P92 — CMP-Tool-Verfuegbarkeit:
|
|
Wenn "Anpassen"/"Einstellungen" angeklickt wurde und das Tool laedt
|
|
nicht (Network-Error, Timeout, weisse Seite, fehlende
|
|
consent-Elemente nach Klick), ist das ein HIGH-Verstoss — der
|
|
Nutzer hat formal die Moeglichkeit zur granularen Wahl, aber sie
|
|
funktioniert nicht.
|
|
|
|
P94 — Banner-Init-vs-Cookie-Footer-Konsistenz:
|
|
Cookie-Liste im Initial-Banner-Settings darf nicht von der Liste
|
|
im permanenten Cookie-Richtlinien-Dokument abweichen. Wenn Banner
|
|
12 Cookies nennt, die Cookie-Doc aber 47, ist mindestens eine der
|
|
beiden Quellen unvollstaendig → MEDIUM-Finding.
|
|
|
|
Beide liefern dict mit shape:
|
|
{"severity": "HIGH"|"MEDIUM", "code": str, "label": str, "detail": str}
|
|
oder None, wenn der Check nicht greift.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import re
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
_ANPASSEN_KEYS = (
|
|
"anpassen", "einstellungen", "customize", "preferences",
|
|
"settings", "individuelle", "auswahl", "manage",
|
|
)
|
|
|
|
|
|
def _phases(banner_result: dict) -> dict:
|
|
if not isinstance(banner_result, dict):
|
|
return {}
|
|
return banner_result.get("phases") or {}
|
|
|
|
|
|
def check_cmp_tool_availability(banner_result: dict) -> dict | None:
    """P92 — settings tool was offered in the banner but is broken or empty.

    Looks at the ``settings`` (or ``after_settings_click``) phase and
    collects failure signals: a reported error, a timeout flag, a nearly
    empty text without categories, or no toggle-like markup without
    categories.

    Returns a HIGH finding dict when at least one signal is present,
    otherwise ``None``. ``None`` is also returned when there is no settings
    phase or when the initial banner text contains none of
    ``_ANPASSEN_KEYS``.
    """
    all_phases = _phases(banner_result)
    settings = all_phases.get("settings") or all_phases.get("after_settings_click")
    if not isinstance(settings, dict):
        return None

    first_phase = all_phases.get("initial") or all_phases.get("before_accept") or {}
    first_text = (first_phase.get("banner_text") or "").lower()
    offers_settings = False
    for keyword in _ANPASSEN_KEYS:
        if keyword in first_text:
            offers_settings = True
            break
    if not offers_settings:
        # No "customize" control in the initial banner at all: per the
        # original author's note that case belongs to check P100 and must
        # not be reported twice here.
        return None

    load_error = settings.get("error") or settings.get("status_error")
    visible_text = (settings.get("banner_text") or "").strip()
    categories_present = any((
        settings.get("categories"),
        settings.get("category_tests"),
        settings.get("structured_checks") or [],
    ))
    markup = settings.get("banner_html") or ""
    toggles_present = (
        re.search(r"checkbox|toggle|switch|aria-checked", markup, re.I)
        is not None
    )
    timeout_hit = bool(settings.get("timeout"))

    problems: list[str] = []
    if load_error:
        problems.append(f'Fehler: {str(load_error)[:120]}')
    if timeout_hit:
        problems.append('Zeitueberschreitung beim Laden')
    if not categories_present:
        # Both remaining signals only count when no categories were found.
        if len(visible_text) < 80:
            problems.append(
                f'Settings-Bereich nur {len(visible_text)} Zeichen, '
                'keine Kategorien sichtbar'
            )
        if not toggles_present:
            problems.append(
                'Keine Checkboxen / Toggles im Settings-Bereich'
            )

    if not problems:
        return None

    finding = {
        "severity": "HIGH",
        "code": "cmp_tool_unavailable",
        "label": 'Cookie-Einstellungen ueber "Anpassen" formal vorhanden, '
                 'Tool laed aber nicht oder ist leer',
        "detail": " | ".join(problems),
        "legal_basis": "Art. 7 (3) DSGVO + EDPB 03/2022 — die Moeglichkeit "
                       "zur granularen Auswahl muss tatsaechlich funktionieren.",
    }
    return finding
|
|
|
|
|
|
def _normalize_cookie_names(items) -> set[str]:
|
|
out: set[str] = set()
|
|
if not items:
|
|
return out
|
|
for it in items:
|
|
if isinstance(it, str):
|
|
name = it.strip()
|
|
elif isinstance(it, dict):
|
|
name = (it.get("name") or it.get("cookie") or it.get("id") or "").strip()
|
|
else:
|
|
continue
|
|
if name and len(name) <= 120:
|
|
out.add(name.lower())
|
|
return out
|
|
|
|
|
|
def _is_identifier_like(token: str) -> bool:
    """Return True when *token* is shaped like a technical name, not a prose word."""
    # Underscore, digit, camelCase hump or ALL-CAPS (e.g. "CONSENT").
    return token.isupper() or re.search(r"[_0-9]|[a-z][A-Z]", token) is not None


def check_init_banner_vs_cookie_doc(
    banner_result: dict,
    cookie_doc_text: str | None,
) -> dict | None:
    """P94 — compare the banner's cookie list with the cookie policy text.

    Banner cookie names are read from the ``cookies`` entries of the
    ``settings`` and ``initial`` / ``before_accept`` phases. Cookie names
    on the policy side are guessed heuristically from *cookie_doc_text*.

    Returns ``None`` when the policy text is missing or shorter than 500
    characters, when fewer than 5 policy cookies or no banner cookies were
    found, or when the difference is small (fewer than 5 names only in the
    policy and fewer than 3 only in the banner). Otherwise returns a
    finding dict with severity MEDIUM, or HIGH when at least 15 names are
    only in the policy and at least 5 only in the banner.
    """
    if not cookie_doc_text or len(cookie_doc_text) < 500:
        return None

    phases = _phases(banner_result)
    settings_ph = phases.get("settings") or {}
    initial_ph = phases.get("initial") or phases.get("before_accept") or {}
    banner_cookies = (
        _normalize_cookie_names(settings_ph.get("cookies") or [])
        | _normalize_cookie_names(initial_ph.get("cookies") or [])
    )

    # Candidate tokens: 4-41 characters, no whitespace, starting with a
    # letter or underscore.
    candidates = set(re.findall(
        r"\b([A-Za-z_][A-Za-z0-9_\-\.]{3,40})\b", cookie_doc_text
    ))
    known_markers = (
        "_ga", "_gid", "_gcl", "_fbp", "uc_", "ot_",
        "cookieconsent", "sessionid", "csrf", "ajs_", "amp_",
        "datadome", "incap_", "_pk_", "wp-", "yt-",
    )
    topic_words = ("cookie", "consent", "track", "session")

    doc_cookies: set[str] = set()
    for candidate in candidates:
        lowered = candidate.lower()
        if any(marker in lowered for marker in known_markers):
            doc_cookies.add(lowered)
        elif (
            re.match(r"^[a-z][a-z0-9_]{3,30}$", lowered)
            and any(word in lowered for word in topic_words)
            # Plain prose words such as "Cookies", "Session" or "Tracking"
            # used to be counted as cookie names and inflated the
            # "only in document" side; require an identifier-like shape.
            and _is_identifier_like(candidate)
        ):
            doc_cookies.add(lowered)

    if len(doc_cookies) < 5 or not banner_cookies:
        return None  # Too little data for a meaningful statement.

    only_in_doc = doc_cookies - banner_cookies

    # A banner cookie that literally appears in the policy text is not
    # "only in the banner", even if its name did not pass the heuristic
    # filter above (e.g. "NID", "IDE").
    doc_tokens = {
        token.strip(".-").lower()
        for token in re.findall(r"[A-Za-z0-9_.\-]+", cookie_doc_text)
    }
    only_in_banner = banner_cookies - doc_cookies - doc_tokens

    if len(only_in_doc) < 5 and len(only_in_banner) < 3:
        return None  # Tolerable deviation.

    severity = "MEDIUM"
    # HIGH when both sides deviate massively.
    if len(only_in_doc) >= 15 and len(only_in_banner) >= 5:
        severity = "HIGH"

    return {
        "severity": severity,
        "code": "banner_cookie_doc_mismatch",
        "label": (
            f"Cookie-Liste im Banner-Einstellungen ({len(banner_cookies)}) "
            f"weicht von Cookie-Richtlinie ({len(doc_cookies)}) ab"
        ),
        "detail": (
            f"Nur im Cookie-Dokument: {len(only_in_doc)} Cookies (Beispiele: "
            f"{', '.join(sorted(only_in_doc)[:5])}). "
            f"Nur im Banner: {len(only_in_banner)} Cookies. "
            "Empfehlung: eine der beiden Quellen als Single-Source-of-Truth "
            "definieren und die andere automatisch generieren."
        ),
        "legal_basis": (
            "Art. 13(1)(c) DSGVO + Art. 12 DSGVO — Informationen ueber die "
            "Verarbeitung muessen vollstaendig und konsistent sein."
        ),
    }
|
|
|
|
|
|
_VENDOR_LIST_SIGNALS = (
|
|
"google analytics", "google ads", "facebook pixel", "meta pixel",
|
|
"hotjar", "matomo", "etracker", "salesforce", "hubspot",
|
|
"linkedin insight", "twitter conversion", "tiktok pixel",
|
|
"criteo", "the trade desk", "doubleclick",
|
|
)
|
|
|
|
|
|
def _vendors_mentioned_in_text(text: str) -> set[str]:
    """Return the ``_VENDOR_LIST_SIGNALS`` entries that occur in *text*.

    Matching is a case-insensitive substring test; a falsy *text* yields
    an empty set.
    """
    found: set[str] = set()
    if text:
        haystack = text.lower()
        for vendor in _VENDOR_LIST_SIGNALS:
            if vendor in haystack:
                found.add(vendor)
    return found
|
|
|
|
|
|
def check_three_source_vendor_consistency(
    doc_texts: dict[str, str] | None,
    cmp_vendors: list | None,
) -> dict | None:
    """P33 — three-way consistency: privacy policy vs cookie doc vs banner.

    Known vendors (``_VENDOR_LIST_SIGNALS``) are looked up in
    ``doc_texts["dse"]``, in ``doc_texts["cookie"]`` and in the names of
    *cmp_vendors*. A vendor that is present in at least one source but
    absent from another source that has data is reported.

    Returns a MEDIUM finding dict listing the affected vendors, or
    ``None`` when *doc_texts* is empty, fewer than two sources contain any
    known vendor, or no inconsistency was found.
    """
    if not doc_texts:
        return None
    dse_v = _vendors_mentioned_in_text(doc_texts.get("dse") or "")
    cookie_v = _vendors_mentioned_in_text(doc_texts.get("cookie") or "")

    banner_v: set[str] = set()
    for entry in (cmp_vendors or []):
        # Accept both ``{"name": ...}`` dicts and bare strings; anything
        # else carries no usable name.
        raw_name = entry.get("name") if isinstance(entry, dict) else entry
        if not isinstance(raw_name, str):
            continue
        name = raw_name.strip().lower()
        if not name:
            # "" is a substring of every signal, so an unnamed vendor used
            # to mark ALL known vendors as present in the banner.
            continue
        for sig in _VENDOR_LIST_SIGNALS:
            if sig in name or name in sig:
                banner_v.add(sig)

    sources_with_data = sum(1 for s in (dse_v, cookie_v, banner_v) if s)
    if sources_with_data < 2:
        return None

    # Vendors present in at least one source but not in every source that
    # has data.
    issues: list[str] = []
    for vendor in sorted(dse_v | cookie_v | banner_v):
        missing_in = []
        if dse_v and vendor not in dse_v:
            missing_in.append("DSE")
        if cookie_v and vendor not in cookie_v:
            missing_in.append("Cookie-Doc")
        if banner_v and vendor not in banner_v:
            missing_in.append("Banner-Liste")
        if missing_in and len(missing_in) < sources_with_data:
            issues.append(f'{vendor} (fehlt in: {", ".join(missing_in)})')

    if not issues:
        return None

    return {
        "severity": "MEDIUM",
        "code": "three_source_vendor_inconsistency",
        "label": (
            f"{len(issues)} Vendor{'en' if len(issues) != 1 else ''} "
            "nicht konsistent zwischen DSE, Cookie-Richtlinie und Banner"
        ),
        "detail": (
            "Folgende Vendors sind nicht in allen Quellen genannt: "
            + "; ".join(issues[:8])
            + (" ..." if len(issues) > 8 else "")
            + ". Empfehlung: zentrale Vendor-Liste pflegen und in alle "
            "drei Dokumenttypen propagieren."
        ),
        "legal_basis": "Art. 13(1)(c)+(e) DSGVO + EDPB 5/2020 — die "
                       "Empfaenger / Drittlandtransfers muessen ueber alle "
                       "Touch-Points konsistent kommuniziert werden.",
    }
|
|
|
|
|
|
def check_banner_vs_cmp_partner_count(
    banner_result: dict,
    cmp_vendors: list | None,
) -> dict | None:
    """P75 — banner text claims N partners, CMP payload lists far more.

    The first "<number> partner/drittanbieter/vendor/anbieter/
    dienstleister" mention in the first 5000 characters of the initial
    banner text is compared with ``len(cmp_vendors)``.

    Returns a HIGH finding dict when the claimed number is below 60 % of
    the vendor count. Returns ``None`` when there are fewer than 20
    vendors, no banner text, no number in the text, or the number is
    plausible.
    """
    cmp_count = len(cmp_vendors or [])
    if cmp_count < 20:
        return None

    phases = _phases(banner_result)
    initial_ph = phases.get("initial") or phases.get("before_accept") or {}
    banner_text = (initial_ph.get("banner_text") or "")[:5000]
    if not banner_text:
        return None

    # The first alternative accepts grouped thousands ("1.234", "1,234",
    # "1'234"). Without it "1.234 Partner" was read as 234 and produced a
    # false finding.
    m = re.search(
        r"\b(\d{1,3}(?:[.,']\d{3})+|\d{1,4})\s*(?:partner|drittanbieter|vendor|"
        r"anbieter|dienstleister)",
        banner_text,
        re.I,
    )
    if not m:
        return None
    claimed = int(re.sub(r"[.,']", "", m.group(1)))
    if claimed >= cmp_count * 0.6:
        return None  # The number in the banner is plausible.

    return {
        "severity": "HIGH",
        "code": "banner_understates_vendor_count",
        "label": (
            f"Banner-Text nennt {claimed} Partner, CMP-Payload listet "
            f"{cmp_count} Vendors"
        ),
        "detail": (
            f"Die im Banner-Text genannte Zahl ({claimed}) unterschaetzt die "
            f"tatsaechliche Anzahl der Empfaenger ({cmp_count}) deutlich. "
            "Empfehlung: Banner-Text auf die echte Vendor-Zahl heben oder "
            "die Vendor-Liste reduzieren."
        ),
        "legal_basis": (
            "Art. 13(1)(e) DSGVO + EDPB 5/2020 — die Empfaenger / "
            "Empfaengerkategorien muessen vollstaendig und nicht "
            "verharmlosend angegeben sein."
        ),
    }
|
|
|
|
|
|
def check_banner_copyability(banner_result: dict) -> dict | None:
    """P51a — the banner text must be copyable.

    Scans the first 50000 characters of the initial banner HTML
    (``initial`` or ``before_accept`` phase) for CSS or inline handlers
    that block selecting or copying text.

    Returns a MEDIUM finding dict naming up to three detected signals, or
    ``None`` when *banner_result* is not a dict, there is no banner HTML,
    or nothing was detected.
    """
    if not isinstance(banner_result, dict):
        return None
    phases = banner_result.get("phases") or {}
    if not isinstance(phases, dict):
        return None
    initial = phases.get("initial") or phases.get("before_accept") or {}
    if not isinstance(initial, dict):
        return None
    html = (initial.get("banner_html") or "")[:50000].lower()
    if not html:
        return None

    # (label shown in the finding, pattern). Regexes tolerate whitespace
    # around ":" / "=" and both quote styles, which plain substring tests
    # missed. The lookbehind keeps the unprefixed property from also
    # firing on "-webkit-user-select" etc.
    blocked_signals = (
        ("user-select:none", r"(?<![\w-])user-select\s*:\s*none"),
        ("-webkit-user-select:none", r"-webkit-user-select\s*:\s*none"),
        ("-moz-user-select:none", r"-moz-user-select\s*:\s*none"),
        ("-ms-user-select:none", r"-ms-user-select\s*:\s*none"),
        ("pointer-events:none", r"pointer-events\s*:\s*none"),
        ('oncopy="return false',
         r"""oncopy\s*=\s*["']?\s*return\s+false"""),
        ('onselectstart="return false',
         r"""onselectstart\s*=\s*["']?\s*return\s+false"""),
    )
    hits = [label for label, pattern in blocked_signals
            if re.search(pattern, html)]
    if not hits:
        return None

    return {
        "severity": "MEDIUM",
        "code": "banner_not_copyable",
        "label": "Banner-Text laesst sich nicht kopieren "
                 "(user-select:none / oncopy disabled)",
        "detail": (
            f'Im Banner-HTML gefunden: {", ".join(hits[:3])}. Der Nutzer '
            "kann den Banner-Text nicht in eine Mail / Doku einfuegen, was "
            "die spaetere Pruefung erschwert. Empfehlung: das CSS entfernen "
            "oder explizit auf 'auto' setzen."
        ),
        "legal_basis": "Art. 7 (1)+(2) DSGVO + EDPB 5/2020 — Einwilligungen "
                       "muessen in verstaendlicher und zugaenglicher Form "
                       "erteilt werden; eine spaetere Pruefung darf nicht "
                       "technisch erschwert werden.",
    }
|
|
|
|
|
|
def check_consent_history(banner_result: dict) -> dict | None:
    """P51b — there must be a way to view one's own consent history.

    Concatenates the banner text (first 5000 characters) and banner HTML
    (first 20000 characters) of every dict-valued phase and searches it
    for consent-history wording.

    Returns a MEDIUM finding dict when no such wording is present, and
    ``None`` when a history signal is found, *banner_result* is not a
    dict, or no banner text or HTML was captured at all.
    """
    if not isinstance(banner_result, dict):
        return None
    phases = banner_result.get("phases") or {}
    if not isinstance(phases, dict):
        return None

    blob_parts: list[str] = []
    for ph in phases.values():
        if isinstance(ph, dict):
            blob_parts.append((ph.get("banner_text") or "")[:5000])
            blob_parts.append((ph.get("banner_html") or "")[:20000])
    # Strip before the emptiness test: joining only empty parts yields
    # " ", which is truthy and used to trigger a finding although nothing
    # had been captured.
    blob = " ".join(blob_parts).lower().strip()
    if not blob:
        return None

    history_signals = [
        "meine einwilligung", "consent-historie", "consent history",
        "einwilligungshistorie", "einwilligungs-historie",
        "ihre einwilligungen", "datenschutz-cockpit",
        "privacy dashboard", "einwilligungs-protokoll",
        "consent record", "consent log",
    ]
    if any(s in blob for s in history_signals):
        return None

    return {
        "severity": "MEDIUM",
        "code": "consent_history_missing",
        "label": "Keine sichtbare Consent-Historie / 'Meine Einwilligungen'-Ansicht",
        "detail": (
            "Im Banner und in den verlinkten Footer-Bereichen ist keine "
            "Moeglichkeit erkennbar, die eigene Einwilligungs-Historie "
            "einzusehen oder zu exportieren. Empfehlung: einen "
            "'Meine Einwilligungen'-Bereich verlinken (Borlabs / Cookiebot / "
            "Usercentrics bieten dafuer fertige Komponenten)."
        ),
        "legal_basis": "Art. 7 (3) DSGVO + EDPB 5/2020 — der Widerruf muss "
                       "ebenso einfach sein wie die Erteilung, was eine "
                       "Sichtbarmachung der eigenen Einwilligungen voraussetzt.",
    }
|
|
|
|
|
|
def run_all(banner_result: dict, cookie_doc_text: str | None = None,
            cmp_vendors: list | None = None,
            doc_texts: dict[str, str] | None = None) -> list[dict]:
    """Run all six consistency checks and return their findings in order.

    Each check runs in isolation: an exception raised by one check is
    logged as a warning and does not stop the remaining checks. Checks
    that return a falsy result contribute nothing.
    """
    # (log template used on failure, zero-argument call of the check)
    checks = (
        ("P92 cmp_tool_availability failed: %s",
         lambda: check_cmp_tool_availability(banner_result)),
        ("P94 init_vs_cookie_doc failed: %s",
         lambda: check_init_banner_vs_cookie_doc(banner_result, cookie_doc_text)),
        ("P75 banner_vs_cmp_count failed: %s",
         lambda: check_banner_vs_cmp_partner_count(banner_result, cmp_vendors)),
        ("P33 three_source_vendor failed: %s",
         lambda: check_three_source_vendor_consistency(doc_texts, cmp_vendors)),
        ("P51a copyability failed: %s",
         lambda: check_banner_copyability(banner_result)),
        ("P51b consent_history failed: %s",
         lambda: check_consent_history(banner_result)),
    )

    collected: list[dict] = []
    for failure_template, run_check in checks:
        try:
            finding = run_check()
        except Exception as exc:
            logger.warning(failure_template, exc)
            continue
        if finding:
            collected.append(finding)
    return collected
|
|
|
|
|
|
def build_consistency_block_html(findings: list[dict]) -> str:
    """Render *findings* as a self-contained, inline-styled HTML block.

    Returns ``""`` for an empty list. Each finding becomes one list item
    showing ``[severity] label``, ``detail`` and ``legal_basis``; HIGH
    findings are coloured red, everything else amber. A missing severity
    is shown as MEDIUM.

    All finding texts are HTML-escaped, because they can contain content
    taken from crawled pages (error messages, detected markup).
    """
    # Function-scope import so this block does not depend on the module's
    # import section.
    from html import escape

    if not findings:
        return ""

    def _cell(value) -> str:
        # quote=False: the values are placed in element content only, so
        # quotation marks in labels stay as they are.
        return escape(str(value or ""), quote=False)

    items: list[str] = []
    for finding in findings:
        sev = finding.get("severity", "MEDIUM")
        sev_color = "#dc2626" if sev == "HIGH" else "#d97706"
        items.append(
            f'<li style="margin-bottom:10px;font-size:11px;line-height:1.5">'
            f'<strong style="color:{sev_color}">[{_cell(sev)}] '
            f'{_cell(finding.get("label"))}</strong>'
            f'<div style="color:#475569;margin-top:3px">'
            f'{_cell(finding.get("detail"))}</div>'
            f'<div style="color:#94a3b8;margin-top:2px;font-style:italic">'
            f'{_cell(finding.get("legal_basis"))}</div>'
            f'</li>'
        )

    plural = "s" if len(findings) != 1 else ""
    return (
        '<div style="font-family:-apple-system,BlinkMacSystemFont,sans-serif;'
        'max-width:760px;margin:0 auto 16px;padding:14px 18px;'
        'background:#fef3c7;border:1px solid #fcd34d;border-radius:8px">'
        '<div style="font-size:11px;color:#92400e;text-transform:uppercase;'
        'letter-spacing:1.2px;margin-bottom:4px;font-weight:600">'
        'Banner-Konsistenz-Pruefung</div>'
        '<h3 style="margin:0 0 6px;font-size:14px;color:#1e293b">'
        f'{len(findings)} Konsistenz-Finding{plural} '
        'zwischen Banner-UI und Cookie-Richtlinie</h3>'
        '<ul style="margin:8px 0 0 18px;padding:0">'
        + "".join(items)
        + '</ul></div>'
    )
|