c908fcd5eb
Adressiert das BMW-Beispiel (740 Cookies, Salesforce als "essential"
mit 1-Jahres-Lifetime, Pseudo-Zwecke wie "Siehe dazugehörige
Datenverarbeitung"). User-Konzept "Regulation als Code".
Step 1 — cookie_library_lookup.py (3 Layer):
1. Override = cookie_knowledge_db.py + extended (74) für
Schrems-II / EUGH / EU-Alternative — BreakPilot-juristische-IP.
2. Truth-Base = compliance.cookie_library (2287 aus Open Cookie
Database, CC0). actual_category als Wahrheit.
3. Auto-Learning = cookie_behavior_audits — Cross-Site-Konsens
wenn ≥3 Sites denselben Cookie melden.
Match: exact > prefix (mit Separator-Check) > wildcard. Kurze
Library-Namen ("c", "ID") brauchen exact-match — verhindert
False-Positive auf "completely_unknown". Trailing-Underscore
in OCD ("guest_uuid_essential_") wird als implicit-wildcard
interpretiert.
Step 2 — cookie_coherence_check.py (B19, 6 Finding-Typen):
- MARKETING_AS_ESSENTIAL (HIGH): KB sagt actual=marketing, Site
deklariert essential/erforderlich → Einwilligung wird umgangen
- LIFETIME_TOO_LONG_FOR_ESSENTIAL (MED): essential + >90d
- PSEUDO_PURPOSE (LOW): "Siehe dazugehörige Datenverarbeitung"
/ <4 Wörter (suppressed wenn Vendor-Purpose substantial ist)
- MISSING_COUNTRY (LOW): vendor_country leer trotz KB-Hit
- UNKNOWN_VENDOR (LOW): nicht in KB → Auto-Learning-Kandidat
- DUPLICATE_VENDOR (MED): selber Vendor in N Kategorien =
Stack-Aufspaltung um Marketing unter "essential" zu schmuggeln
Jedes Finding mit recommended_action ("Cookie X aus 'erforderlich'
raus und in 'Marketing' setzen").
Step 3 — cookie_observation_logger.py:
Loggt nach jedem Audit alle (cookie, site, declared_purpose) in
compliance.cookie_behavior_audits → Basis für Cross-Site-Konsens
in Layer 3.
Step 4 — cookie_csv_exporter.py:
cookies-full-{check_id}.csv mit 21 Spalten (Name, Vendor decl/KB,
Cat decl/KB, Lifetime decl/KB, Country, Opt-Out, 8x FIND_* flags,
recommended_action). UTF-8 BOM für Excel.
ZIP-Attachment: erweitert audit_walk_zip_builder um extra_files=
parameter; phase_e ruft mit cookies-full-...csv auf.
Step 5 — mail_render_v2/_vendor_cards.py:
Statt 740 Cookie-Rows: Aggregation pro Vendor mit Cookie-Count +
Issue-Count + 1-2 Beispiel-Cookies + Issue-Type-Tags. Top 30
Vendoren in der Mail, Rest nur in CSV. Sortiert nach Issue-Score.
Step 6 — render_info_box_rechtsrahmen():
Generic Header-Info-Box mit Art. 13 DSGVO + § 25 TDDDG + Art. 5
+ § 5 UWG + § 30/130 OWiG. Immer angezeigt, kein explizites
Finding-Mapping (User-Mündigkeit).
Orchestrator + _compose: run_b19 + render_vendor_cards +
render_info_box_rechtsrahmen ins V2-Layout.
Tests: 28/28 grün (15 lookup + 13 coherence).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
300 lines
12 KiB
Python
300 lines
12 KiB
Python
"""B19 — Cookie-Coherence-Check.
|
|
|
|
Pro Cookie aus state["cmp_vendors"]: Lookup in 3-Layer-DB und
|
|
Vergleich der DEKLARATION (was die Site behauptet) mit der TRUTH
|
|
(was die Open Cookie Database / BreakPilot-KB sagt). Emittiert
|
|
Findings für die Salesforce-as-essential Falsch-Klassifikation.
|
|
|
|
Finding-Typen:
|
|
- MARKETING_AS_ESSENTIAL: actual=marketing, declared=essential/erforderlich
|
|
- LIFETIME_TOO_LONG_FOR_ESSENTIAL: declared=essential, lifetime >90d
|
|
- PSEUDO_PURPOSE: purpose ist Floskel ("Siehe dazugehörige
|
|
Datenverarbeitung", "Sehen Sie unter ...")
|
|
- DUPLICATE_VENDOR: derselbe Vendor in mehreren Kategorien
|
|
- UNKNOWN_VENDOR_NO_LIBRARY: Cookie nicht in cookie_library, nicht
|
|
in OCD → muss menschlich klassifiziert werden
|
|
- MISSING_COUNTRY: vendor_country leer in Deklaration
|
|
- MISSING_RETENTION: declared duration leer (in check_cookie_coherence derzeit nicht implementiert)
|
|
|
|
Jedes Finding kommt mit `recommended_action` — konkretes was-zu-tun.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import re
|
|
from collections import defaultdict
|
|
|
|
from .cookie_library_lookup import lookup as kb_lookup
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# Boilerplate fragments that mark a declared cookie purpose as a mere
# cross-reference instead of a real purpose statement.  All entries are
# lower-case because _is_pseudo_purpose() lower-cases the purpose text and
# then tests each entry as a substring.
_PSEUDO_PURPOSE_PATTERNS = (
    "siehe dazugehörige datenverarbeitung",
    "siehe dazugehoerige datenverarbeitung",
    "siehe oben",
    "see related",
    "see corresponding",
    "wird unter",
    "see above",
    "see vendor",
    "wie oben beschrieben",
)
|
|
|
|
|
|
def _is_essential_category(decl: str) -> bool:
|
|
s = (decl or "").lower()
|
|
return any(t in s for t in (
|
|
"essential", "essenziell", "essentiell", "necessary",
|
|
"erforderlich", "technisch notwendig", "strictly necessary",
|
|
"notwendig", "required",
|
|
))
|
|
|
|
|
|
def _is_marketing_category(actual: str) -> bool:
|
|
return (actual or "").lower() in (
|
|
"marketing", "advertising", "social_media",
|
|
)
|
|
|
|
|
|
def _parse_lifetime_to_days(text: str) -> float | None:
|
|
if not text:
|
|
return None
|
|
try:
|
|
from .retention_comparator import parse_duration_to_days
|
|
days, kind = parse_duration_to_days(text)
|
|
if kind == "session":
|
|
return 0.0
|
|
if kind in ("persistent", "unknown"):
|
|
return None
|
|
return days
|
|
except Exception:
|
|
return None
|
|
|
|
|
|
def _is_pseudo_purpose(purpose: str) -> bool:
|
|
if not purpose:
|
|
return True
|
|
s = purpose.lower().strip()
|
|
if any(p in s for p in _PSEUDO_PURPOSE_PATTERNS):
|
|
return True
|
|
# Less than 4 words counts as "no real purpose given"
|
|
if len(re.findall(r"\w+", s)) < 4:
|
|
return True
|
|
return False
|
|
|
|
|
|
def _norm_vendor(name: str) -> str:
|
|
s = (name or "").lower().strip()
|
|
s = re.sub(r"\binc\.?$|\bllc\.?$|\bsas\.?$|\bgmbh\.?$|"
|
|
r"\bag\.?$|\bb\.v\.?$|\bs\.a\.?$", "", s)
|
|
s = s.replace(",", " ").strip()
|
|
return re.sub(r"\s+", " ", s)
|
|
|
|
|
|
def _b19_marketing_as_essential(
    cname: str, vendor_name: str, declared_cat: str, actual: str,
    layer: object,
) -> dict:
    """Build the HIGH finding for a marketing cookie declared as essential."""
    return {
        "check_id": "COOKIE-COHERENCE-MAE-001",
        "severity": "HIGH",
        "severity_reason": "misclassified",
        "cookie_name": cname,
        "vendor": vendor_name,
        "declared_category": declared_cat,
        "actual_category": actual,
        "kb_source": layer,
        "title": (
            f"Marketing-Cookie '{cname}' ({vendor_name}) "
            "als technisch notwendig deklariert"
        ),
        "norm": (
            "DSGVO Art. 6 Abs. 1 lit. a + § 25 Abs. 1 TDDDG"
        ),
        "evidence": (
            "Open Cookie Database / BreakPilot-KB "
            f"klassifiziert '{cname}' als '{actual}'. "
            f"Site deklariert als '{declared_cat}' — "
            "Einwilligung wird umgangen."
        ),
        "recommended_action": (
            f"Cookie '{cname}' aus Kategorie "
            f"'{declared_cat}' entfernen und in "
            "'Marketing/Werbung' einsortieren. "
            "Banner-Toggle für diesen Cookie pflichtig."
        ),
    }


def _b19_lifetime_too_long(
    cname: str, vendor_name: str, declared_cat: str,
    declared_lifetime: str, declared_days: float,
) -> dict:
    """Build the MEDIUM finding for an essential cookie living > 90 days."""
    return {
        "check_id": "COOKIE-COHERENCE-LIFE-001",
        "severity": "MEDIUM",
        "severity_reason": "implausible",
        "cookie_name": cname,
        "vendor": vendor_name,
        "declared_category": declared_cat,
        "declared_lifetime": declared_lifetime,
        "lifetime_days": declared_days,
        "title": (
            f"Essential-Cookie '{cname}' mit Lifetime "
            f"{int(declared_days)} Tage — Plausibilität "
            "fragwürdig"
        ),
        "norm": "DSGVO Art. 5 Abs. 1 lit. c (Datenminimierung)",
        "evidence": (
            f"Cookie deklariert als '{declared_cat}' "
            f"({vendor_name}) hat Speicherdauer "
            f"'{declared_lifetime}'. Echte technisch-"
            "notwendige Cookies sind typischerweise "
            "Session-Cookies oder max. 30 Tage."
        ),
        "recommended_action": (
            "Speicherdauer reduzieren (Session oder <30 Tage) "
            "ODER Kategorie korrekt setzen (functional / "
            "marketing) wenn Lifetime tatsächlich nötig ist."
        ),
    }


def _b19_pseudo_purpose(
    cname: str, vendor_name: str, cookie_purpose: str,
) -> dict:
    """Build the LOW finding for a cookie whose purpose is only a phrase."""
    return {
        "check_id": "COOKIE-COHERENCE-PURP-001",
        "severity": "LOW",
        "severity_reason": "incomplete",
        "cookie_name": cname,
        "vendor": vendor_name,
        "title": (
            f"Cookie '{cname}' ohne konkreten Zweck — "
            "nur generischer Verweis / Floskel"
        ),
        "norm": "DSGVO Art. 13 Abs. 1 lit. c",
        # Evidence is capped at 120 characters of the declared purpose.
        "evidence": f"Zweck: '{cookie_purpose[:120]}'",
        "recommended_action": (
            f"Konkreten Zweck für '{cname}' angeben "
            "(was wird damit konkret gespeichert / "
            "verarbeitet) — nicht nur Vendor-Verweis."
        ),
    }


def _b19_missing_country(
    cname: str, vendor_name: str, kb_country: object,
) -> dict:
    """Build the LOW finding for a vendor declared without a country."""
    return {
        "check_id": "COOKIE-COHERENCE-CTRY-001",
        "severity": "LOW",
        "severity_reason": "missing",
        "cookie_name": cname,
        "vendor": vendor_name,
        "title": (
            f"Sitzland für '{cname}' ({vendor_name}) fehlt"
        ),
        "norm": "DSGVO Art. 13 Abs. 1 lit. f (Drittlandtransfer)",
        "evidence": "vendor_country leer in Deklaration",
        "recommended_action": (
            f"Sitzland von {vendor_name} ergänzen. "
            "KB-Hinweis: laut Bibliothek "
            f"{kb_country or '?'}"
        ),
    }


def _b19_unknown_vendor(cname: str, vendor_name: str) -> dict:
    """Build the LOW finding for a cookie the lookup reports as unknown."""
    return {
        "check_id": "COOKIE-COHERENCE-UNK-001",
        "severity": "LOW",
        "severity_reason": "unknown",
        "cookie_name": cname,
        "vendor": vendor_name,
        "title": (
            f"Cookie '{cname}' nicht in Open Cookie Database / "
            "BreakPilot-KB"
        ),
        "norm": "Auto-Learning-Kandidat",
        "evidence": (
            "Keine Reference-Klassifikation verfügbar. "
            "Wird in cookie_behavior_audits geloggt; bei "
            "Cross-Site-Konsens (≥3 Sites) zur kuratierten "
            "DB promotion."
        ),
        "recommended_action": (
            "Manuell prüfen + ggf. zu BreakPilot-KB hinzufügen."
        ),
    }


def _b19_duplicate_vendor(vnorm: str, cats: set) -> dict:
    """Build the MEDIUM finding for a vendor declared in several categories."""
    ordered = sorted(cats)
    joined = ", ".join(ordered)
    return {
        "check_id": "COOKIE-COHERENCE-DUP-001",
        "severity": "MEDIUM",
        "severity_reason": "split_stack",
        "vendor": vnorm,
        "categories": ordered,
        "title": (
            f"Vendor '{vnorm}' in {len(ordered)} "
            "Kategorien gleichzeitig deklariert"
        ),
        "norm": "DSGVO Art. 13 Abs. 1 lit. c (Klarheit)",
        "evidence": (
            "Vendor erscheint in: "
            f"{joined}. Aufspaltung "
            "schmuggelt oft Marketing-Funktionen unter "
            "'erforderlich'."
        ),
        "recommended_action": (
            f"Vendor '{vnorm}' auf EINE Kategorie "
            "konsolidieren (höchste Schutzkategorie wählen — "
            "wenn Marketing-Funktionen dabei sind: "
            "vollständig zu Marketing)."
        ),
    }


def check_cookie_coherence(state: dict) -> list[dict]:
    """Compare declared cookie metadata with the knowledge base (B19).

    Reads ``state["cmp_vendors"]``, a list of vendor dicts.  Per vendor the
    keys ``name``, ``country``, ``category``, ``purpose`` and ``cookies``
    are read; per cookie the keys ``name``, ``category``, ``purpose`` and
    the first non-empty of ``duration`` / ``persistence`` / ``expiry``.
    Each named cookie is looked up via ``kb_lookup`` and may produce, in
    this order:

    1. MARKETING_AS_ESSENTIAL - the knowledge base says marketing, the site
       declares an essential category.
    2. LIFETIME_TOO_LONG_FOR_ESSENTIAL - essential category with a parsed
       lifetime above 90 days.
    3. PSEUDO_PURPOSE - the cookie purpose is empty or boilerplate and the
       vendor purpose has fewer than six words.
    4. MISSING_COUNTRY - the vendor country is empty although the knowledge
       base knows a category for the cookie.
    5. UNKNOWN_VENDOR - the lookup reports layer ``"unknown"``.

    After all vendors are processed, one DUPLICATE_VENDOR finding is added
    for every normalised vendor name declared under more than one
    vendor-level category.

    Args:
        state: Audit state; a missing or empty ``"cmp_vendors"`` entry
            yields an empty result.

    Returns:
        List of finding dicts, each with a ``recommended_action``.
    """
    cmp_vendors = state.get("cmp_vendors") or []
    if not cmp_vendors:
        return []

    findings: list[dict] = []
    # Normalised vendor name -> declared vendor-level categories; feeds the
    # DUPLICATE_VENDOR detector after the main loop.
    vendor_categories: dict[str, set[str]] = defaultdict(set)

    for v in cmp_vendors:
        vendor_name = (v.get("name") or "").strip()
        vendor_country = (v.get("country") or "").strip()
        vendor_category = (v.get("category") or "").strip().lower()
        if vendor_name and vendor_category:
            vendor_categories[_norm_vendor(vendor_name)].add(vendor_category)

        # A substantial vendor-level purpose (>= 6 words) suppresses the
        # PSEUDO_PURPOSE finding for all cookies of this vendor.  It does
        # not depend on the cookie, so it is computed once per vendor.
        vendor_purpose = v.get("purpose")
        vendor_purpose_substantial = (
            isinstance(vendor_purpose, str)
            and len(re.findall(r"\w+", vendor_purpose)) >= 6
        )

        for c in (v.get("cookies") or []):
            cname = (c.get("name") or "").strip()
            if not cname:
                continue
            # The cookie-level category wins over the vendor-level one.
            declared_cat = (c.get("category") or vendor_category).lower()
            declared_lifetime = (c.get("duration") or c.get("persistence")
                                 or c.get("expiry") or "").strip()
            declared_days = _parse_lifetime_to_days(declared_lifetime)
            cookie_purpose = c.get("purpose") or ""

            # Tolerate a lookup that returns None instead of a dict.
            kb = kb_lookup(cname) or {}
            actual = (kb.get("actual_category")
                      or kb.get("consensus_category") or "").lower()
            layer = kb.get("_layer")

            declared_essential = _is_essential_category(declared_cat)

            # FINDING 1: MARKETING-AS-ESSENTIAL
            if declared_essential and _is_marketing_category(actual):
                findings.append(_b19_marketing_as_essential(
                    cname, vendor_name, declared_cat, actual, layer))

            # FINDING 2: LIFETIME-TOO-LONG-FOR-ESSENTIAL
            if (declared_essential
                    and declared_days is not None
                    and declared_days > 90):
                findings.append(_b19_lifetime_too_long(
                    cname, vendor_name, declared_cat,
                    declared_lifetime, declared_days))

            # FINDING 3: PSEUDO_PURPOSE
            if (not vendor_purpose_substantial
                    and _is_pseudo_purpose(cookie_purpose)):
                findings.append(_b19_pseudo_purpose(
                    cname, vendor_name, cookie_purpose))

            # FINDING 4: MISSING_COUNTRY (only when the KB knows the cookie)
            if not vendor_country and actual:
                findings.append(_b19_missing_country(
                    cname, vendor_name, kb.get("vendor_country")))

            # FINDING 5: UNKNOWN_VENDOR
            if layer == "unknown":
                findings.append(_b19_unknown_vendor(cname, vendor_name))

    # FINDING 6: DUPLICATE_VENDOR (across categories).  Only non-empty
    # categories are ever added above, so no extra filtering is needed.
    for vnorm, cats in vendor_categories.items():
        if len(cats) > 1:
            findings.append(_b19_duplicate_vendor(vnorm, cats))

    if findings:
        logger.info("B19 cookie-coherence: %d finding(s)", len(findings))
    return findings
|