57c0f940a2
CI / detect-changes (push) Successful in 11s
CI / branch-name (push) Has been skipped
CI / nodejs-build (push) Successful in 2m19s
CI / test-go (push) Has been skipped
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / validate-canonical-controls (push) Successful in 16s
CI / loc-budget (push) Failing after 15s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-backend (push) Successful in 37s
P56 Anti-Auditing-Detection als constructive Compliance-Finding (Audit-API-
Empfehlung statt Anklage, weil Mercedes berechtigt Bots blockiert)
P57 Phase G vendor_details Union mit cmp_vendors -> 42 Anbieter sichtbar
P58 Anti-Audit-Detection robuster (Script-Domain-Check + Settings-spezifisch)
P59 Cookie-Behavior-Validator (4 Layer, 3-Tier-Severity: MEDIUM=Kategorie-
Mismatch / HIGH=Zweck-Mismatch / CRITICAL=beide=Vorsatz-Indiz)
+ Open Cookie Database (CC0) als Library-Seed (2264 Cookies)
P59b Cookie-Behavior in Banner-Check verdrahtet + Mail-Block (BUGFIX:
SessionLocal selbst oeffnen, db war im Background-Task nicht im Scope)
Mail-Polish nach Mercedes-Review:
P63 Banner-Footer-Links auch im wb7-link/role=link erkennen (Shadow-DOM-
Walker label-based statt nur <a href>)
P64 Re-Access-Severity: MEDIUM statt HIGH, wenn Footer "Einstellungen" oder
Mercedes-typisch existiert; OEM-Footer-Detection (wb7-footer)
P65 Text-Truncation: Word-Boundary statt Zeichen-Cut (kein "einfa"-Bruch
mehr in Sofortmassnahmen)
P66 GF-Aktionen: Service-Zweck vs Cookie-Zweck explizit erklaert
(haeufige Verwechslung Marketing/GF: "Akamai-Beschreibung" != Cookie-
Zweck pro DSK-OH 2024)
P67 Stirring-Finding mit "Verlust-Framing"-Erklaerung + Alt-vs-Neutral-
Beispiel, statt nur EDPB-Fachbegriff
Compliance-Advisor FAQ (admin agent-core/soul):
+ CNIL/EDPB Top-Bussgelder (Google 100M, Meta 60M, Amazon 35M)
+ Deutsche Praezedenz (LG Muenchen Google Fonts, EuGH Planet49, BGH I ZR 7/16)
+ 4 Risiko-Pfade (Bussgeld/Abmahnung/Sammelklage/NOYB) + Berechnungs-Methodik
Document-Generator Templates: AGB-DE (142), Impressum (140), Widerrufs-
formular-Anlage (143), DSR-Process-Dedup (139), Cookie-Library (144).
Architektur: doc_action_mappings.py + banner_dom_walkers.py +
cookie_behavior_validator.py + vendor_detail_extractor.py rausgezogen,
um die 500-LOC-Caps in agent_doc_check_report.py und
banner_text_checker.py einzuhalten.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
304 lines
13 KiB
Python
304 lines
13 KiB
Python
"""
|
|
P59 — Cookie-Behavior-Validator.

Four layers:
  A) Open Cookie Database lookup (declared category/purpose vs. library entry)
  B) Network traffic analysis (cookie sent to third-party hosts)
  C) Value pattern (hash/UUID/Base64/PII heuristics on cookies declared
     "essential" or "functional")
  D) Cross-site frequency (from library metadata; not implemented yet)

Returns a list of finding dicts, each with a severity and a DSGVO legal
reference (e.g. Art. 5(1)(b)).
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import re
|
|
from typing import Iterable
|
|
|
|
from sqlalchemy import text
|
|
from sqlalchemy.orm import Session
|
|
|
|
logger = logging.getLogger(__name__)


# --- Value-shape heuristics consumed by Layer C ---

# 32 to 64 hexadecimal characters, any case.
_HASH_PATTERN = re.compile(r"^[a-f0-9]{32,64}$", flags=re.I)

# Hexadecimal groups in the 8-4-4-4-12 UUID layout, any case.
_UUID_PATTERN = re.compile(
    r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$",
    flags=re.I,
)

# 40 or more characters taken from the standard Base64 alphabet.
_BASE64_LONG = re.compile(r"^[A-Za-z0-9+/=]{40,}$")

# Substrings looked for (case-insensitively) inside cookie values.
_PII_KEYS = ("email", "@", "user_id", "userid", "username", "phone")


# --- Purpose keyword bags for Layer A2 (purpose-text match) ---
# Each category maps to lower-case substrings (English and German stems);
# a purpose text is assigned every category for which one substring occurs.
_PURPOSE_KEYWORDS = {
    "marketing": {
        "advertis",
        "audience",
        "behavioral",
        "behaviour",
        "campaign",
        "conversion",
        "fingerprint",
        "interest",
        "marketing",
        "personali",
        "pixel",
        "profile",
        "profiling",
        "promotion",
        "remarket",
        "retargeting",
        "targeting",
        "tracker",
        "tracking",
    },
    "statistics": {
        "analyse",
        "analytic",
        "analyz",
        "auswert",
        "measure",
        "measurement",
        "metric",
        "monitoring",
        "performance",
        "reichweite",
        "statistic",
        "telemetr",
        "usage",
    },
    "essential": {
        "anmeld",
        "authentic",
        "cookie consent",
        "cookie-einwilligung",
        "csrf",
        "lastverteil",
        "load balanc",
        "login",
        "logout",
        "security",
        "session",
        "sicherheit",
        "sitzung",
        "technisch notwendig",
        "xsrf",
    },
    "functional": {
        "cart",
        "darkmode",
        "darstellung",
        "design",
        "favorit",
        "language",
        "layout",
        "merkliste",
        "praeferen",
        "preference",
        "sprache",
        "theme",
        "warenkorb",
        "wishlist",
    },
    "social_media": {
        "embed",
        "facebook",
        "instagram",
        "linkedin",
        "share",
        "social",
        "teilen",
        "twitter",
        "youtube",
    },
}
|
|
|
|
|
|
def _classify_purpose_text(text_value: str) -> set[str]:
    """Collect every category whose keyword bag matches the purpose text.

    Matching is a case-insensitive substring test against the entries of
    ``_PURPOSE_KEYWORDS``; a text may therefore map to several categories.
    An empty or missing text yields an empty set.
    """
    if not text_value:
        return set()
    haystack = text_value.lower()
    return {
        category
        for category, keywords in _PURPOSE_KEYWORDS.items()
        if any(keyword in haystack for keyword in keywords)
    }
|
|
|
|
|
|
def _lookup_library(db: Session, cookie_name: str,
                    cookie_domain: str) -> dict | None:
    """Layer A: fetch the best-ranked library row for a cookie name.

    Rows of ``compliance.cookie_library`` with exactly this ``cookie_name``
    are ranked as follows:

    1. ``domain_pattern`` equals the cookie domain,
    2. the cookie domain matches ``domain_pattern`` via ILIKE, with ``*``
       translated to ``%``,
    3. any other row carrying that name,

    and, within a rank, by descending ``confidence``. Only the top row is used.

    Args:
        db: Session used to run the query.
        cookie_name: Cookie name to look up (exact match).
        cookie_domain: Domain the cookie was set for; a falsy value is
            sent to the query as an empty string.

    Returns:
        A dict with the keys ``actual_category``, ``purpose_en``,
        ``purpose_de``, ``vendor_name``, ``data_receivers`` (``[]`` when
        the column is empty), ``source_name``, ``source_url`` and
        ``confidence`` (float, ``0.0`` when the column is empty), or
        ``None`` when no row has that name.
    """
    query = text("""
        SELECT actual_category, purpose_en, purpose_de, vendor_name,
               data_receivers, source_name, source_url, confidence
        FROM compliance.cookie_library
        WHERE cookie_name = :name
        ORDER BY
          CASE WHEN domain_pattern = :domain THEN 0
               WHEN :domain ILIKE replace(domain_pattern, '*', '%') THEN 1
               ELSE 2 END,
          confidence DESC
        LIMIT 1
    """)
    params = {"name": cookie_name, "domain": cookie_domain or ""}

    best = db.execute(query, params).fetchone()
    if not best:
        return None

    # Column order follows the SELECT list above.
    (category, purpose_en, purpose_de, vendor,
     receivers, source_name, source_url, confidence) = best
    return {
        "actual_category": category,
        "purpose_en": purpose_en,
        "purpose_de": purpose_de,
        "vendor_name": vendor,
        "data_receivers": receivers or [],
        "source_name": source_name,
        "source_url": source_url,
        "confidence": float(confidence or 0),
    }
|
|
|
|
|
|
def _value_pattern_flag(value: str | None, declared_category: str) -> str | None:
    """Layer C: describe a tracking-typical shape found in a cookie value.

    Only cookies declared "essential" or "functional" are inspected, and
    only when the stripped value has at least 16 characters. The checks
    run in this order and the first hit wins: UUID, hex hash, long Base64
    string, PII marker substring.

    Returns:
        A short German description of the detected pattern, or ``None``
        when nothing matched or the cookie is out of scope.
    """
    if declared_category not in ("essential", "functional") or not value:
        return None

    candidate = value.strip()
    size = len(candidate)
    if size < 16:
        return None

    if _UUID_PATTERN.match(candidate):
        return "UUID (Persistent Identifier)"
    if _HASH_PATTERN.match(candidate):
        return f"Hash-Wert ({size} Hex-Zeichen — typisch User-ID)"
    if _BASE64_LONG.match(candidate):
        return f"Base64-Long ({size} Zeichen — typisch Tracking-Payload)"

    lowered = candidate.lower()
    marker = next((kw for kw in _PII_KEYS if kw in lowered), None)
    if marker is None:
        return None
    return f"PII-Marker '{marker}' im Wert"
|
|
|
|
|
|
def _category_label(cat: str) -> str:
    """Translate an internal category key into its German display label.

    Keys without a known label are returned unchanged.
    """
    german_labels = {
        "essential": "technisch notwendig",
        "functional": "funktional",
        "statistics": "Analyse/Statistik",
        "marketing": "Marketing/Werbung",
        "social_media": "Social Media",
        "unknown": "unbekannt",
    }
    try:
        return german_labels[cat]
    except KeyError:
        # Unmapped key: show it as-is rather than failing.
        return cat
|
|
|
|
|
|
# Declared categories this module checks for tracking behaviour, and the
# library categories that count as a contradiction to such a declaration.
_CONSENT_FREE_CATEGORIES = ("essential", "functional")
_CONSENT_BOUND_CATEGORIES = ("marketing", "statistics", "social_media")


def _request_host(req: dict) -> str:
    """Return the lower-cased hostname a captured request went to.

    The ``host`` entry is preferred; ``url`` is the fallback. Scheme,
    credentials, port, path and a trailing dot are removed so the result
    can be compared against a domain. Returns ``""`` when no hostname can
    be derived (missing/non-string value, relative URL, malformed netloc).
    """
    # Local import so this block does not depend on a file-level import.
    from urllib.parse import urlsplit

    raw = req.get("host") or req.get("url") or ""
    if not isinstance(raw, str):
        return ""
    raw = raw.strip().lower()
    if not raw:
        return ""
    # urlsplit() only fills ``hostname`` when a netloc is present, so bare
    # hosts such as "cdn.example.com:443" need a leading "//".
    has_scheme = re.match(r"[a-z][a-z0-9+.\-]*://", raw) is not None
    candidate = raw if has_scheme or raw.startswith("//") else f"//{raw}"
    try:
        host = urlsplit(candidate).hostname
    except ValueError:
        # e.g. unbalanced IPv6 brackets
        return ""
    return (host or "").rstrip(".")


def _is_third_party_host(host: str, first_party_domain: str) -> bool:
    """Tell whether *host* lies outside *first_party_domain*.

    A host is first-party when it equals the domain or ends with
    ``"." + domain``; matching on the label boundary keeps
    ``evilexample.com`` from passing as ``example.com``. When either
    argument is empty the answer is ``False`` (no basis for a verdict).
    """
    if not host or not first_party_domain:
        return False
    if host == first_party_domain:
        return False
    return not host.endswith("." + first_party_domain)


def _index_cookie_receivers(network_requests: list[dict]) -> dict[str, set[str]]:
    """Map each cookie name to the set of hosts it was sent to.

    Entries that are not dicts, have no resolvable host, or whose
    ``cookies_sent`` is not a list/tuple/set are skipped; so are cookie
    names that are not non-empty strings.
    """
    receivers: dict[str, set[str]] = {}
    for req in network_requests:
        if not isinstance(req, dict):
            continue
        host = _request_host(req)
        if not host:
            continue
        sent = req.get("cookies_sent") or ()
        # A bare string is ambiguous (single name vs. raw header), and
        # iterating it would register single characters as cookie names.
        if not isinstance(sent, (list, tuple, set, frozenset)):
            continue
        for cookie_name in sent:
            if isinstance(cookie_name, str) and cookie_name:
                receivers.setdefault(cookie_name, set()).add(host)
    return receivers


def _library_mismatch_finding(name: str, declared: str,
                              declared_purpose: str, lib: dict) -> dict | None:
    """Layer A: compare a cookie's declaration with its library entry.

    Two independent checks are made, both only when the library category
    is marketing, statistics or social_media:

    * A1, category mismatch: the cookie is declared essential/functional.
    * A2, purpose mismatch: the declared purpose text hits essential or
      functional keywords but none of the library category.

    Returns:
        One finding dict: CRITICAL when both checks hit, HIGH for A2
        only, MEDIUM for A1 only; ``None`` when neither hits.
    """
    actual_cat = lib["actual_category"]
    if actual_cat not in _CONSENT_BOUND_CATEGORIES:
        # Both checks require a consent-bound library category.
        return None

    actual_label = _category_label(actual_cat)
    declared_label = _category_label(declared)
    source_name = lib["source_name"]
    purpose_en = lib["purpose_en"] or ""
    source = lib["source_url"] or source_name

    category_mismatch = declared in _CONSENT_FREE_CATEGORIES

    purpose_explain = ""
    if declared_purpose:
        declared_cats = _classify_purpose_text(declared_purpose)
        if (actual_cat not in declared_cats
                and declared_cats.intersection(_CONSENT_FREE_CATEGORIES)):
            # Sorted so the finding text is identical from run to run.
            hinted = ", ".join(
                _category_label(cat) for cat in sorted(declared_cats)
            )
            purpose_explain = (
                f"Beschriebener Zweck deutet auf {hinted}, "
                f"das Cookie wird aber tatsaechlich fuer "
                f"{actual_label} eingesetzt"
            )
    purpose_mismatch = bool(purpose_explain)

    if category_mismatch and purpose_mismatch:
        # CRITICAL: category and purpose both contradict the library.
        return {
            "layer": "A1+A2",
            "cookie_name": name,
            "severity": "CRITICAL",
            "type": "DUAL_MISMATCH_INTENT",
            "text": (
                f"Cookie '{name}' weist DOPPELTE Diskrepanz auf: "
                f"deklarierte Kategorie '{declared_label}' UND "
                f"deklarierter Zweck stimmen NICHT mit dem realen Verhalten "
                f"('{actual_label}') ueberein. "
                f"{purpose_explain}. {source_name}-Quelle: "
                f"{purpose_en[:120]}. "
                f"Doppel-Mismatch indiziert Vorsatz nach DSK Beschluss 2024-02 "
                f"(Cookie gezielt verschleiert) — siehe Bussgeld-Risiko Art. 83 "
                f"DSGVO bei wissentlicher Taeuschung. Konstruktive Annahme: "
                f"haeufig Marketing-/Agentur-Versehen ohne DSB-Kontrolle."
            ),
            "legal_ref": "Art. 5(1)(a)+(b) DSGVO + DSK Beschluss 2024-02",
            "source": source,
        }

    if purpose_mismatch:
        # HIGH: the purpose text alone contradicts the library.
        return {
            "layer": "A2",
            "cookie_name": name,
            "severity": "HIGH",
            "type": "PURPOSE_TEXT_MISMATCH",
            "text": (
                f"Cookie '{name}': {purpose_explain}. {source_name}: "
                f"{purpose_en[:140]}. Deutet auf fehlende "
                f"Detail-Pruefung des Cookie-Verhaltens — Beschreibung sollte "
                f"das tatsaechliche Verhalten reflektieren (Art. 13 DSGVO + "
                f"Transparenz)."
            ),
            "legal_ref": "Art. 13(1)(c) DSGVO (Zweck-Angabe muss korrekt sein)",
            "source": source,
        }

    if category_mismatch:
        # MEDIUM: only the category tag is off.
        purpose_note = f" — {purpose_en[:120]}" if purpose_en else ""
        return {
            "layer": "A1",
            "cookie_name": name,
            "severity": "MEDIUM",
            "type": "CATEGORY_MISMATCH",
            "text": (
                f"Cookie '{name}' ist als '{declared_label}' "
                f"kategorisiert. {source_name} klassifiziert ihn als "
                f"'{actual_label}'"
                + purpose_note
                + f". Vermutlich Konfigurations-Versehen im Consent-Tool "
                f"(haeufig bei Migrations zwischen CMP-Anbietern). "
                f"Korrektur: Cookie auf '{actual_label}'"
                f" umstellen, Consent neu einholen."
            ),
            "legal_ref": "Art. 5(1)(b) DSGVO (Zweckbindung)",
            "source": source,
        }

    return None


def validate_cookie_behavior(
    db: Session,
    cookies_set: Iterable[dict],
    network_requests: list[dict] | None = None,
    first_party_domain: str = "",
) -> list[dict]:
    """Run the implemented layers (A, B, C) and return the findings.

    Args:
        db: Session handed to the library lookup (Layer A).
        cookies_set: Cookie dicts. Keys read here: ``name`` (entries
            without one are skipped), ``domain``, ``value``,
            ``declared_category`` (e.g. ``'essential'``) and
            ``declared_purpose``; all but ``name`` are optional.
        network_requests: Captured requests as dicts with ``host`` or
            ``url`` plus ``cookies_sent`` (a list of cookie names).
            Malformed entries are ignored.
        first_party_domain: Domain of the audited site. When empty,
            Layer B cannot tell first from third party and reports nothing.

    Returns:
        Finding dicts with ``layer``, ``cookie_name``, ``severity``,
        ``type``, ``text`` and ``legal_ref``; Layer A findings also carry
        ``source``. Layer D (cross-site frequency) is not implemented.
    """
    findings: list[dict] = []
    fp_domain = (first_party_domain or "").lower().lstrip(".")

    # Pre-index the capture: which hosts received which cookie?
    receivers_by_cookie = _index_cookie_receivers(network_requests or [])

    for cookie in cookies_set or []:
        name = (cookie.get("name") or "").strip()
        if not name:
            continue
        declared = (cookie.get("declared_category") or "").lower()
        domain = (cookie.get("domain") or "").lstrip(".").lower()
        declared_purpose = (cookie.get("declared_purpose") or "").strip()

        # Layer A: library lookup with three severity tiers.
        lib = _lookup_library(db, name, domain)
        if lib and lib["actual_category"] != "unknown":
            library_finding = _library_mismatch_finding(
                name, declared, declared_purpose, lib
            )
            if library_finding is not None:
                findings.append(library_finding)

        # Layer B: cookie observed on requests to hosts outside the site.
        third_party = sorted(
            host
            for host in receivers_by_cookie.get(name, ())
            if _is_third_party_host(host, fp_domain)
        )
        if third_party and declared in _CONSENT_FREE_CATEGORIES:
            findings.append({
                "layer": "B",
                "cookie_name": name,
                "severity": "HIGH",
                "type": "THIRD_PARTY_DESPITE_ESSENTIAL",
                "text": (
                    f"Cookie '{name}' ist als '{_category_label(declared)}' "
                    f"deklariert, der Wert wird aber an {len(third_party)} "
                    f"externe(n) Empfaenger uebertragen: "
                    f"{', '.join(third_party)[:200]}. "
                    f"Damit liegt eine Drittlandstransfer-/Drittanbieter-Verarbeitung "
                    f"vor, die nicht durch die deklarierte Zweckbestimmung gedeckt ist."
                ),
                "legal_ref": "Art. 5(1)(b) Zweckbindung + Art. 13(1)(f) DSGVO",
            })

        # Layer C: tracking-typical value shape.
        flag = _value_pattern_flag(cookie.get("value"), declared)
        if flag:
            findings.append({
                "layer": "C",
                "cookie_name": name,
                "severity": "MEDIUM",
                "type": "TRACKING_PATTERN_DESPITE_ESSENTIAL",
                "text": (
                    f"Cookie '{name}' ist als '{_category_label(declared)}' "
                    f"deklariert, enthaelt aber: {flag}. Werte mit Tracking-Charakter "
                    f"sind in nicht einwilligungsbeduerftigen Kategorien fragwuerdig."
                ),
                "legal_ref": "Art. 5(1)(b) DSGVO + DSK-OH Telemedien 2024",
            })

        # Layer D: cross-site frequency (later; needs metadata import).

    return findings
|