"""P59 — Cookie-Behavior-Validator.

Four layers:
  A) Open Cookie Database lookup (declared category vs. library category)
  B) Network traffic analysis (cookie value sent to third-party domains)
  C) Value pattern (hash/UUID/PII heuristics on cookies declared as
     "essential" or "functional")
  D) Cross-site frequency (from library metadata, when available;
     not implemented yet)

Returns a list of findings with severity + Art. 5(1)(b) DSGVO reference.
"""
from __future__ import annotations

import logging
import re
from typing import Iterable
from urllib.parse import urlsplit

from sqlalchemy import text
from sqlalchemy.orm import Session

logger = logging.getLogger(__name__)

# --- Patterns for layer C ---
_HASH_PATTERN = re.compile(r"^[a-f0-9]{32,64}$", re.IGNORECASE)
_UUID_PATTERN = re.compile(
    r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$",
    re.IGNORECASE,
)
_BASE64_LONG = re.compile(r"^[A-Za-z0-9+/=]{40,}$")
_PII_KEYS = ("email", "@", "user_id", "userid", "username", "phone")

# Categories that are declared as not requiring consent, and the categories
# that contradict such a declaration when the library reports them.
_CONSENT_FREE_CATEGORIES = ("essential", "functional")
_TRACKING_CATEGORIES = ("marketing", "statistics", "social_media")

# --- Purpose keyword bags for layer A2 (purpose match) ---
_PURPOSE_KEYWORDS = {
    "marketing": {
        "tracking", "tracker", "targeting", "profiling", "profile",
        "advertis", "marketing", "remarket", "retargeting", "conversion",
        "audience", "behavioral", "behaviour", "personali", "interest",
        "campaign", "promotion", "pixel", "fingerprint",
    },
    "statistics": {
        "analytic", "analyse", "analyz", "measure", "measurement",
        "metric", "statistic", "performance", "telemetr", "monitoring",
        "usage", "reichweite", "auswert",
    },
    "essential": {
        "session", "sitzung", "authentic", "anmeld", "login", "logout",
        "security", "sicherheit", "csrf", "xsrf", "cookie consent",
        "cookie-einwilligung", "technisch notwendig", "load balanc",
        "lastverteil",
    },
    "functional": {
        "preference", "praeferen", "language", "sprache", "layout",
        "design", "cart", "warenkorb", "wishlist", "merkliste", "favorit",
        "theme", "darkmode", "darstellung",
    },
    "social_media": {
        "social", "facebook", "twitter", "linkedin", "instagram",
        "youtube", "embed", "share", "teilen",
    },
}


def _classify_purpose_text(text_value: str) -> set[str]:
    """Return the set of categories whose keywords appear in the purpose text.

    Matching is a case-insensitive substring test against
    ``_PURPOSE_KEYWORDS``; an empty/None text yields an empty set.
    """
    if not text_value:
        return set()
    lowered = text_value.lower()
    return {
        category
        for category, keywords in _PURPOSE_KEYWORDS.items()
        if any(keyword in lowered for keyword in keywords)
    }


def _lookup_library(db: Session, cookie_name: str, cookie_domain: str) -> dict | None:
    """Layer A: find the best cookie-library match for a cookie name.

    Rows are filtered by cookie name only. Among them, an exact domain match
    ranks first, a wildcard match (``*`` in ``domain_pattern``) second and
    any other row for that name last; ties are broken by confidence.

    Returns:
        A dict with the library columns, or ``None`` when no row has that
        cookie name.
    """
    cur = db.execute(
        text("""
            SELECT actual_category, purpose_en, purpose_de, vendor_name,
                   data_receivers, source_name, source_url, confidence
            FROM compliance.cookie_library
            WHERE cookie_name = :name
            ORDER BY
                CASE
                    WHEN domain_pattern = :domain THEN 0
                    WHEN :domain ILIKE replace(domain_pattern, '*', '%') THEN 1
                    ELSE 2
                END,
                confidence DESC
            LIMIT 1
        """),
        {"name": cookie_name, "domain": cookie_domain or ""},
    )
    row = cur.fetchone()
    if not row:
        return None
    return {
        "actual_category": row[0],
        "purpose_en": row[1],
        "purpose_de": row[2],
        "vendor_name": row[3],
        "data_receivers": row[4] or [],
        "source_name": row[5],
        "source_url": row[6],
        "confidence": float(row[7] or 0),
    }


def _value_pattern_flag(value: str | None, declared_category: str) -> str | None:
    """Layer C: detect tracking-typical value patterns.

    Only cookies declared as "essential" or "functional" are inspected, and
    only values of at least 16 characters (after stripping); this minimum
    length also applies to the PII-marker check.

    Returns:
        A human-readable description of the first matching pattern, or
        ``None`` when nothing suspicious was found.
    """
    if not value or declared_category not in _CONSENT_FREE_CATEGORIES:
        return None
    v = value.strip()
    if len(v) < 16:
        return None
    if _UUID_PATTERN.match(v):
        return "UUID (Persistent Identifier)"
    if _HASH_PATTERN.match(v):
        return f"Hash-Wert ({len(v)} Hex-Zeichen — typisch User-ID)"
    if _BASE64_LONG.match(v):
        return f"Base64-Long ({len(v)} Zeichen — typisch Tracking-Payload)"
    vlow = v.lower()
    for kw in _PII_KEYS:
        if kw in vlow:
            return f"PII-Marker '{kw}' im Wert"
    return None


def _category_label(cat: str) -> str:
    """Return the German display label for a category (the key itself if unknown)."""
    return {
        "essential": "technisch notwendig",
        "functional": "funktional",
        "statistics": "Analyse/Statistik",
        "marketing": "Marketing/Werbung",
        "social_media": "Social Media",
        "unknown": "unbekannt",
    }.get(cat, cat)


def _request_host(req: dict) -> str:
    """Return the lower-cased hostname a request went to ('' if unknown).

    ``host`` is preferred; ``url`` is the fallback. Either may be a bare
    hostname, ``host:port`` or a full URL, so the value is run through
    ``urlsplit`` to drop scheme, port, credentials and path.
    """
    raw = str(req.get("host") or req.get("url") or "").strip().lower()
    if not raw:
        return ""
    # urlsplit only recognises a network location after "//".
    candidate = raw if ("://" in raw or raw.startswith("//")) else f"//{raw}"
    try:
        hostname = urlsplit(candidate).hostname
    except ValueError:
        # Unparseable (e.g. malformed IPv6 literal): keep the raw value so
        # the receiver is still reported rather than dropped.
        return raw
    return (hostname or raw).rstrip(".")


def _is_third_party(host: str, first_party: str) -> bool:
    """Return True when *host* is neither *first_party* nor one of its subdomains.

    A plain suffix test is not enough: "evilexample.com" ends with
    "example.com" but is a different site, so the match must be exact or on
    a "." boundary. Without a known first-party domain nothing is flagged.
    """
    if not host or not first_party:
        return False
    return host != first_party and not host.endswith("." + first_party)


def _index_receivers(network_requests: Iterable[dict]) -> dict[str, set[str]]:
    """Map each cookie name to the set of hosts it was sent to.

    Malformed request entries are skipped (best effort) and logged at debug
    level; only the entry's type is logged, not its content.
    """
    receivers_by_cookie: dict[str, set[str]] = {}
    for req in network_requests:
        try:
            host = _request_host(req)
            if not host:
                continue
            for cname in req.get("cookies_sent") or []:
                receivers_by_cookie.setdefault(cname, set()).add(host)
        except (AttributeError, TypeError) as exc:
            logger.debug(
                "Skipping malformed network request of type %s: %s",
                type(req).__name__, exc,
            )
    return receivers_by_cookie


def _library_finding(
    name: str, declared: str, declared_purpose: str, lib: dict | None,
) -> dict | None:
    """Layer A: compare declaration against the library, with 3-tier severity.

    A1 (category mismatch) fires only when the cookie is declared
    essential/functional but the library says marketing/statistics/social
    media. A2 (purpose-text mismatch) fires when the declared purpose text
    only points to other categories, including a "harmless" one, while the
    library reports a tracking category. Both -> CRITICAL, A2 only -> HIGH,
    A1 only -> MEDIUM.
    """
    if not lib or lib["actual_category"] == "unknown":
        return None

    actual_cat = lib["actual_category"]
    actual_is_tracking = actual_cat in _TRACKING_CATEGORIES
    actual_label = _category_label(actual_cat)
    declared_label = _category_label(declared)
    purpose_en = lib["purpose_en"] or ""
    source = lib["source_url"] or lib["source_name"]

    # Layer A1: category mismatch.
    category_mismatch = bool(
        declared
        and actual_cat != declared
        and declared in _CONSENT_FREE_CATEGORIES
        and actual_is_tracking
    )

    # Layer A2: purpose-text mismatch.
    purpose_mismatch = False
    purpose_explain = ""
    if declared_purpose and actual_is_tracking:
        declared_cats = _classify_purpose_text(declared_purpose)
        if (
            declared_cats
            and actual_cat not in declared_cats
            and declared_cats & set(_CONSENT_FREE_CATEGORIES)
        ):
            purpose_mismatch = True
            # sorted(): set order varies between runs; keep the text stable.
            declared_labels = ", ".join(
                _category_label(cat) for cat in sorted(declared_cats)
            )
            purpose_explain = (
                f"Beschriebener Zweck deutet auf "
                f"{declared_labels}, "
                f"das Cookie wird aber tatsaechlich fuer "
                f"{actual_label} eingesetzt"
            )

    if category_mismatch and purpose_mismatch:
        # CRITICAL — indication of intent.
        return {
            "layer": "A1+A2",
            "cookie_name": name,
            "severity": "CRITICAL",
            "type": "DUAL_MISMATCH_INTENT",
            "text": (
                f"Cookie '{name}' weist DOPPELTE Diskrepanz auf: "
                f"deklarierte Kategorie '{declared_label}' UND "
                f"deklarierter Zweck stimmen NICHT mit dem realen Verhalten "
                f"('{actual_label}') ueberein. "
                f"{purpose_explain}. {lib['source_name']}-Quelle: "
                f"{purpose_en[:120]}. "
                f"Doppel-Mismatch indiziert Vorsatz nach DSK Beschluss 2024-02 "
                f"(Cookie gezielt verschleiert) — siehe Bussgeld-Risiko Art. 83 "
                f"DSGVO bei wissentlicher Taeuschung. Konstruktive Annahme: "
                f"haeufig Marketing-/Agentur-Versehen ohne DSB-Kontrolle."
            ),
            "legal_ref": "Art. 5(1)(a)+(b) DSGVO + DSK Beschluss 2024-02",
            "source": source,
        }

    if purpose_mismatch:
        # HIGH — purpose is wrong (lack of knowledge or intent).
        return {
            "layer": "A2",
            "cookie_name": name,
            "severity": "HIGH",
            "type": "PURPOSE_TEXT_MISMATCH",
            "text": (
                f"Cookie '{name}': {purpose_explain}. {lib['source_name']}: "
                f"{purpose_en[:140]}. Deutet auf fehlende "
                f"Detail-Pruefung des Cookie-Verhaltens — Beschreibung sollte "
                f"das tatsaechliche Verhalten reflektieren (Art. 13 DSGVO + "
                f"Transparenz)."
            ),
            "legal_ref": "Art. 13(1)(c) DSGVO (Zweck-Angabe muss korrekt sein)",
            "source": source,
        }

    if category_mismatch:
        # MEDIUM — wrong category tag, may be a simple oversight.
        return {
            "layer": "A1",
            "cookie_name": name,
            "severity": "MEDIUM",
            "type": "CATEGORY_MISMATCH",
            "text": (
                f"Cookie '{name}' ist als '{declared_label}' "
                f"kategorisiert. {lib['source_name']} klassifiziert ihn als "
                f"'{actual_label}'"
                + (f" — {purpose_en[:120]}" if purpose_en else "")
                + f". Vermutlich Konfigurations-Versehen im Consent-Tool "
                f"(haeufig bei Migrations zwischen CMP-Anbietern). "
                f"Korrektur: Cookie auf '{actual_label}'"
                f" umstellen, Consent neu einholen."
            ),
            "legal_ref": "Art. 5(1)(b) DSGVO (Zweckbindung)",
            "source": source,
        }

    return None


def _network_finding(
    name: str, declared: str, receivers: set[str], fp_domain: str,
) -> dict | None:
    """Layer B: flag essential/functional cookies sent to third-party hosts."""
    if declared not in _CONSENT_FREE_CATEGORIES:
        return None
    third_party = sorted(r for r in receivers if _is_third_party(r, fp_domain))
    if not third_party:
        return None
    return {
        "layer": "B",
        "cookie_name": name,
        "severity": "HIGH",
        "type": "THIRD_PARTY_DESPITE_ESSENTIAL",
        "text": (
            f"Cookie '{name}' ist als '{_category_label(declared)}' "
            f"deklariert, der Wert wird aber an {len(third_party)} "
            f"externe(n) Empfaenger uebertragen: "
            f"{', '.join(third_party)[:200]}. "
            f"Damit liegt eine Drittlandstransfer-/Drittanbieter-Verarbeitung "
            f"vor, die nicht durch die deklarierte Zweckbestimmung gedeckt ist."
        ),
        "legal_ref": "Art. 5(1)(b) Zweckbindung + Art. 13(1)(f) DSGVO",
    }


def _value_finding(name: str, declared: str, value: str | None) -> dict | None:
    """Layer C: flag tracking-typical values in essential/functional cookies."""
    flag = _value_pattern_flag(value, declared)
    if not flag:
        return None
    return {
        "layer": "C",
        "cookie_name": name,
        "severity": "MEDIUM",
        "type": "TRACKING_PATTERN_DESPITE_ESSENTIAL",
        "text": (
            f"Cookie '{name}' ist als '{_category_label(declared)}' "
            f"deklariert, enthaelt aber: {flag}. Werte mit Tracking-Charakter "
            f"sind in nicht einwilligungsbeduerftigen Kategorien fragwuerdig."
        ),
        "legal_ref": "Art. 5(1)(b) DSGVO + DSK-OH Telemedien 2024",
    }


def validate_cookie_behavior(
    db: Session,
    cookies_set: Iterable[dict],
    network_requests: list[dict] | None = None,
    first_party_domain: str = "",
) -> list[dict]:
    """Run the validation layers and return a list of finding dicts.

    Args:
        db: Session used for the cookie-library lookup (layer A).
        cookies_set: Cookie dicts. Keys read here: ``name``, ``domain``
            (optional), ``value`` (optional), ``declared_category``
            (e.g. 'essential') and ``declared_purpose`` (optional).
            Entries without a name are skipped.
        network_requests: Optional request dicts with ``host`` (or ``url``)
            and ``cookies_sent`` (iterable of cookie names). Malformed
            entries are skipped.
        first_party_domain: Domain of the audited site. Hosts equal to it or
            below it count as first-party; when empty, layer B reports
            nothing. NOTE(review): no public-suffix handling — passing
            "www.example.com" makes "cdn.example.com" third-party.

    Returns:
        Finding dicts with ``layer``, ``cookie_name``, ``severity``,
        ``type``, ``text``, ``legal_ref`` (layer A findings also carry
        ``source``), ordered per cookie as A, B, C.
    """
    findings: list[dict] = []
    fp_domain = (first_party_domain or "").lower().lstrip(".")

    # Pre-index network traffic: which receivers got which cookie?
    receivers_by_cookie = _index_receivers(network_requests or [])

    for c in cookies_set or []:
        name = (c.get("name") or "").strip()
        if not name:
            continue
        declared = (c.get("declared_category") or "").strip().lower()
        domain = (c.get("domain") or "").lstrip(".").lower()
        declared_purpose = (c.get("declared_purpose") or "").strip()

        per_cookie = (
            # Layer A: library lookup + 3-tier severity.
            _library_finding(
                name, declared, declared_purpose,
                _lookup_library(db, name, domain),
            ),
            # Layer B: network traffic.
            _network_finding(
                name, declared, receivers_by_cookie.get(name, set()), fp_domain,
            ),
            # Layer C: value pattern.
            _value_finding(name, declared, c.get("value")),
        )
        findings.extend(f for f in per_cookie if f is not None)

        # Layer D: cross-site frequency (later — needs metadata import).

    return findings