"""B19 — Cookie-Coherence-Check. Pro Cookie aus state["cmp_vendors"]: Lookup in 3-Layer-DB und Vergleich der DEKLARATION (was die Site behauptet) mit der TRUTH (was die Open Cookie Database / BreakPilot-KB sagt). Emittiert Findings für die Salesforce-as-essential Falsch-Klassifikation. Finding-Typen: - MARKETING_AS_ESSENTIAL: actual=marketing, declared=essential/functional - LIFETIME_TOO_LONG_FOR_ESSENTIAL: declared=essential, lifetime >90d - PSEUDO_PURPOSE: purpose ist Floskel ("Siehe dazugehörige Datenverarbeitung", "Sehen Sie unter ...") - DUPLICATE_VENDOR: derselbe Vendor in mehreren Kategorien - UNKNOWN_VENDOR_NO_LIBRARY: Cookie nicht in cookie_library, nicht in OCD → muss menschlich klassifiziert werden - MISSING_COUNTRY: vendor_country leer in Deklaration - MISSING_RETENTION: declared duration leer Jedes Finding kommt mit `recommended_action` — konkretes was-zu-tun. """ from __future__ import annotations import logging import re from collections import defaultdict from .cookie_library_lookup import lookup as kb_lookup logger = logging.getLogger(__name__) _PSEUDO_PURPOSE_PATTERNS = ( "siehe dazugehörige datenverarbeitung", "siehe dazugehoerige datenverarbeitung", "siehe oben", "see related", "see corresponding", "wird unter", "see above", "see vendor", "wie oben beschrieben", ) def _is_essential_category(decl: str) -> bool: s = (decl or "").lower() return any(t in s for t in ( "essential", "essenziell", "essentiell", "necessary", "erforderlich", "technisch notwendig", "strictly necessary", "notwendig", "required", )) def _is_marketing_category(actual: str) -> bool: return (actual or "").lower() in ( "marketing", "advertising", "social_media", ) def _parse_lifetime_to_days(text: str) -> float | None: if not text: return None try: from .retention_comparator import parse_duration_to_days days, kind = parse_duration_to_days(text) if kind == "session": return 0.0 if kind in ("persistent", "unknown"): return None return days except Exception: return None def _is_pseudo_purpose(purpose: str) -> bool: if not purpose: return True s = purpose.lower().strip() if any(p in s for p in _PSEUDO_PURPOSE_PATTERNS): return True # Less than 4 words counts as "no real purpose given" if len(re.findall(r"\w+", s)) < 4: return True return False def _norm_vendor(name: str) -> str: s = (name or "").lower().strip() s = re.sub(r"\binc\.?$|\bllc\.?$|\bsas\.?$|\bgmbh\.?$|" r"\bag\.?$|\bb\.v\.?$|\bs\.a\.?$", "", s) s = s.replace(",", " ").strip() return re.sub(r"\s+", " ", s) def check_cookie_coherence(state: dict) -> list[dict]: """Iterate cmp_vendors + cookies, emit B19 findings.""" cmp_vendors = state.get("cmp_vendors") or [] if not cmp_vendors: return [] findings: list[dict] = [] # Track vendor → set of declared categories (DUPLICATE_VENDOR-Detector) vendor_categories: dict[str, set[str]] = defaultdict(set) for v in cmp_vendors: vendor_name = (v.get("name") or "").strip() vendor_country = (v.get("country") or "").strip() vendor_category = (v.get("category") or "").strip().lower() if vendor_name and vendor_category: vendor_categories[_norm_vendor(vendor_name)].add(vendor_category) for c in (v.get("cookies") or []): cname = (c.get("name") or "").strip() if not cname: continue declared_cat = (c.get("category") or vendor_category).lower() declared_purpose = (c.get("purpose") or v.get("purpose") or "").strip() declared_lifetime = (c.get("duration") or c.get("persistence") or c.get("expiry") or "").strip() declared_days = _parse_lifetime_to_days(declared_lifetime) kb = kb_lookup(cname) actual = (kb.get("actual_category") or kb.get("consensus_category") or "").lower() layer = kb.get("_layer") # FINDING 1: MARKETING-AS-ESSENTIAL if actual and _is_marketing_category(actual): if _is_essential_category(declared_cat): findings.append({ "check_id": "COOKIE-COHERENCE-MAE-001", "severity": "HIGH", "severity_reason": "misclassified", "cookie_name": cname, "vendor": vendor_name, "declared_category": declared_cat, "actual_category": actual, "kb_source": layer, "title": ( f"Marketing-Cookie '{cname}' ({vendor_name}) " "als technisch notwendig deklariert" ), "norm": ( "DSGVO Art. 6 Abs. 1 lit. a + § 25 Abs. 1 TDDDG" ), "evidence": ( f"Open Cookie Database / BreakPilot-KB " f"klassifiziert '{cname}' als '{actual}'. " f"Site deklariert als '{declared_cat}' — " "Einwilligung wird umgangen." ), "recommended_action": ( f"Cookie '{cname}' aus Kategorie " f"'{declared_cat}' entfernen und in " f"'Marketing/Werbung' einsortieren. " "Banner-Toggle für diesen Cookie pflichtig." ), }) # FINDING 2: LIFETIME-TOO-LONG-FOR-ESSENTIAL if (_is_essential_category(declared_cat) and declared_days is not None and declared_days > 90): findings.append({ "check_id": "COOKIE-COHERENCE-LIFE-001", "severity": "MEDIUM", "severity_reason": "implausible", "cookie_name": cname, "vendor": vendor_name, "declared_category": declared_cat, "declared_lifetime": declared_lifetime, "lifetime_days": declared_days, "title": ( f"Essential-Cookie '{cname}' mit Lifetime " f"{int(declared_days)} Tage — Plausibilität " "fragwürdig" ), "norm": "DSGVO Art. 5 Abs. 1 lit. c (Datenminimierung)", "evidence": ( f"Cookie deklariert als '{declared_cat}' " f"({vendor_name}) hat Speicherdauer " f"'{declared_lifetime}'. Echte technisch-" "notwendige Cookies sind typischerweise " "Session-Cookies oder max. 30 Tage." ), "recommended_action": ( "Speicherdauer reduzieren (Session oder <30 Tage) " "ODER Kategorie korrekt setzen (functional / " "marketing) wenn Lifetime tatsächlich nötig ist." ), }) # FINDING 3: PSEUDO_PURPOSE if _is_pseudo_purpose(c.get("purpose") or ""): # Suppress if vendor-level purpose is substantial AND # cookie just inherits (we don't double-count). if not (v.get("purpose") and len(re.findall(r"\w+", v["purpose"])) >= 6): findings.append({ "check_id": "COOKIE-COHERENCE-PURP-001", "severity": "LOW", "severity_reason": "incomplete", "cookie_name": cname, "vendor": vendor_name, "title": ( f"Cookie '{cname}' ohne konkreten Zweck — " "nur generischer Verweis / Floskel" ), "norm": "DSGVO Art. 13 Abs. 1 lit. c", "evidence": ( f"Zweck: '{(c.get('purpose') or '')[:120]}'" ), "recommended_action": ( f"Konkreten Zweck für '{cname}' angeben " "(was wird damit konkret gespeichert / " "verarbeitet) — nicht nur Vendor-Verweis." ), }) # FINDING 4: MISSING_COUNTRY if not vendor_country and actual: findings.append({ "check_id": "COOKIE-COHERENCE-CTRY-001", "severity": "LOW", "severity_reason": "missing", "cookie_name": cname, "vendor": vendor_name, "title": ( f"Sitzland für '{cname}' ({vendor_name}) fehlt" ), "norm": "DSGVO Art. 13 Abs. 1 lit. f (Drittlandtransfer)", "evidence": "vendor_country leer in Deklaration", "recommended_action": ( f"Sitzland von {vendor_name} ergänzen. " f"KB-Hinweis: laut Bibliothek " f"{kb.get('vendor_country') or '?'}" ), }) # FINDING 5: UNKNOWN_VENDOR if layer == "unknown": findings.append({ "check_id": "COOKIE-COHERENCE-UNK-001", "severity": "LOW", "severity_reason": "unknown", "cookie_name": cname, "vendor": vendor_name, "title": ( f"Cookie '{cname}' nicht in Open Cookie Database / " "BreakPilot-KB" ), "norm": "Auto-Learning-Kandidat", "evidence": ( "Keine Reference-Klassifikation verfügbar. " "Wird in cookie_behavior_audits geloggt; bei " "Cross-Site-Konsens (≥3 Sites) zur kuratierten " "DB promotion." ), "recommended_action": ( "Manuell prüfen + ggf. zu BreakPilot-KB hinzufügen." ), }) # FINDING 6: DUPLICATE_VENDOR (across categories) for vnorm, cats in vendor_categories.items(): if len(cats) > 1: # Filter empty real_cats = {c for c in cats if c} if len(real_cats) > 1: findings.append({ "check_id": "COOKIE-COHERENCE-DUP-001", "severity": "MEDIUM", "severity_reason": "split_stack", "vendor": vnorm, "categories": sorted(real_cats), "title": ( f"Vendor '{vnorm}' in {len(real_cats)} " "Kategorien gleichzeitig deklariert" ), "norm": "DSGVO Art. 13 Abs. 1 lit. c (Klarheit)", "evidence": ( f"Vendor erscheint in: " f"{', '.join(sorted(real_cats))}. Aufspaltung " "schmuggelt oft Marketing-Funktionen unter " "'erforderlich'." ), "recommended_action": ( f"Vendor '{vnorm}' auf EINE Kategorie " "konsolidieren (höchste Schutzkategorie wählen — " "wenn Marketing-Funktionen dabei sind: " "vollständig zu Marketing)." ), }) if findings: logger.info("B19 cookie-coherence: %d finding(s)", len(findings)) return findings