327e6a8984
BMW4 zeigte 1037 UNK-Findings — die Mail wurde damit unleserlich. Drei pragmatische Anpassungen: 1. UNK severity: LOW → INFO. Mail-Renderer zeigt jetzt nur HIGH/MEDIUM/LOW; INFO bleibt im API-Payload + CSV. 2. UNK wird NICHT emittiert wenn Vendor=First-Party-Owner (z.B. "BMW AG" auf bmw.de). Heuristik _is_first_party_owner vergleicht Vendor-Name gegen Domain-SLD. 3. auto_learning threshold ≥3 Sites → ≥1 Site. Second-time-Audit einer Site hat ihre eigenen Cookies bereits gelernt → kein UNK mehr. Single-site Auto-Learning ist absichtlich konservativ (Annotation, kein Truth). Effekt: erwartete Reduktion bei BMW von 1037 UNK → ~50-100 (nur unbekannte 3rd-party-Vendoren). Mail wird lesbar, MAE- Findings (Salesforce-as-essential) bleiben prominent sichtbar. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
335 lines
13 KiB
Python
335 lines
13 KiB
Python
"""B19 — Cookie-Coherence-Check.
|
|
|
|
For each cookie in state["cmp_vendors"]: look it up in the 3-layer DB and
compare the DECLARATION (what the site claims) with the TRUTH (what the
Open Cookie Database / BreakPilot KB says). Emits findings for the
"Salesforce-as-essential" misclassification.

Finding types:
  - MARKETING_AS_ESSENTIAL: actual=marketing, declared=essential/functional
  - LIFETIME_TOO_LONG_FOR_ESSENTIAL: declared=essential, lifetime >90d
  - PSEUDO_PURPOSE: purpose is boilerplate ("Siehe dazugehörige
    Datenverarbeitung", "Sehen Sie unter ...")
  - DUPLICATE_VENDOR: the same vendor declared in several categories
  - UNKNOWN_VENDOR_NO_LIBRARY: cookie neither in cookie_library nor in
    OCD → has to be classified by a human
  - MISSING_COUNTRY: vendor_country empty in the declaration
  - MISSING_RETENTION: declared duration empty
    (NOTE: listed here, but check_cookie_coherence in this module emits
    no finding of this type)

Every finding carries a `recommended_action` — a concrete to-do.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import re
|
|
from collections import defaultdict
|
|
|
|
from .cookie_library_lookup import lookup as kb_lookup
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
_PSEUDO_PURPOSE_PATTERNS = (
|
|
"siehe dazugehörige datenverarbeitung",
|
|
"siehe dazugehoerige datenverarbeitung",
|
|
"siehe oben",
|
|
"see related",
|
|
"see corresponding",
|
|
"wird unter",
|
|
"see above",
|
|
"see vendor",
|
|
"wie oben beschrieben",
|
|
)
|
|
|
|
|
|
def _is_essential_category(decl: str) -> bool:
|
|
s = (decl or "").lower()
|
|
return any(t in s for t in (
|
|
"essential", "essenziell", "essentiell", "necessary",
|
|
"erforderlich", "technisch notwendig", "strictly necessary",
|
|
"notwendig", "required",
|
|
))
|
|
|
|
|
|
def _is_marketing_category(actual: str) -> bool:
|
|
return (actual or "").lower() in (
|
|
"marketing", "advertising", "social_media",
|
|
)
|
|
|
|
|
|
def _parse_lifetime_to_days(text: str) -> float | None:
|
|
if not text:
|
|
return None
|
|
try:
|
|
from .retention_comparator import parse_duration_to_days
|
|
days, kind = parse_duration_to_days(text)
|
|
if kind == "session":
|
|
return 0.0
|
|
if kind in ("persistent", "unknown"):
|
|
return None
|
|
return days
|
|
except Exception:
|
|
return None
|
|
|
|
|
|
def _is_pseudo_purpose(purpose: str) -> bool:
|
|
if not purpose:
|
|
return True
|
|
s = purpose.lower().strip()
|
|
if any(p in s for p in _PSEUDO_PURPOSE_PATTERNS):
|
|
return True
|
|
# Less than 4 words counts as "no real purpose given"
|
|
if len(re.findall(r"\w+", s)) < 4:
|
|
return True
|
|
return False
|
|
|
|
|
|
def _is_first_party_owner(vendor: str, state: dict) -> bool:
|
|
"""Heuristik: Vendor ist der Site-Betreiber selbst — first-party.
|
|
|
|
Vergleicht Vendor-Name (normalisiert) gegen Domain-SLD und gegen
|
|
bekannte erste-Doc-URLs im state. "BMW AG" matcht bmw.de;
|
|
"Volkswagen Group Charging" matcht elli.eco.
|
|
"""
|
|
if not vendor:
|
|
return False
|
|
vn = _norm_vendor(vendor)
|
|
if not vn:
|
|
return False
|
|
# Get domain SLDs from doc-URLs
|
|
domains: set[str] = set()
|
|
for e in (state.get("doc_entries") or []):
|
|
url = (e.get("url") or "").strip().lower()
|
|
if "://" in url:
|
|
host = url.split("://", 1)[1].split("/", 1)[0]
|
|
host = host.lstrip("www.")
|
|
parts = host.split(".")
|
|
if parts:
|
|
domains.add(parts[0])
|
|
if len(parts) >= 2:
|
|
domains.add(parts[-2])
|
|
for d in domains:
|
|
if d and len(d) >= 3 and d in vn:
|
|
return True
|
|
return False
|
|
|
|
|
|
def _norm_vendor(name: str) -> str:
|
|
s = (name or "").lower().strip()
|
|
s = re.sub(r"\binc\.?$|\bllc\.?$|\bsas\.?$|\bgmbh\.?$|"
|
|
r"\bag\.?$|\bb\.v\.?$|\bs\.a\.?$", "", s)
|
|
s = s.replace(",", " ").strip()
|
|
return re.sub(r"\s+", " ", s)
|
|
|
|
|
|
def check_cookie_coherence(state: dict) -> list[dict]:
    """Compare declared cookie metadata with the knowledge base (check B19).

    Walks ``state["cmp_vendors"]`` (a list of vendor dicts with the optional
    keys ``name``, ``country``, ``category``, ``purpose`` and ``cookies``)
    and, for every named cookie, looks the cookie name up via ``kb_lookup``.
    The following findings may be emitted:

    * ``COOKIE-COHERENCE-MAE-001``  (HIGH)   — KB says marketing, site
      declares an essential category.
    * ``COOKIE-COHERENCE-LIFE-001`` (MEDIUM) — essential cookie with a
      declared lifetime of more than 90 days.
    * ``COOKIE-COHERENCE-PURP-001`` (LOW)    — cookie purpose is empty or
      boilerplate and the vendor-level purpose has fewer than 6 words.
    * ``COOKIE-COHERENCE-CTRY-001`` (LOW)    — vendor country missing for a
      cookie the KB has a category for.
    * ``COOKIE-COHERENCE-UNK-001``  (INFO)   — cookie unknown to the KB and
      vendor is not the site operator itself.
    * ``COOKIE-COHERENCE-DUP-001``  (MEDIUM) — one (normalised) vendor
      declared under more than one category; emitted once per vendor.

    Args:
        state: Audit state. ``cmp_vendors`` is read here; ``doc_entries`` is
            read indirectly by ``_is_first_party_owner``.

    Returns:
        A list of finding dicts (possibly empty). Every finding carries a
        ``recommended_action``.
    """
    cmp_vendors = state.get("cmp_vendors") or []
    if not cmp_vendors:
        return []

    findings: list[dict] = []
    # Normalised vendor name -> set of declared categories; feeds the
    # DUPLICATE_VENDOR detector after the main loop.
    vendor_categories: dict[str, set[str]] = defaultdict(set)

    for v in cmp_vendors:
        vendor_name = (v.get("name") or "").strip()
        vendor_country = (v.get("country") or "").strip()
        vendor_category = (v.get("category") or "").strip().lower()
        if vendor_name and vendor_category:
            vendor_categories[_norm_vendor(vendor_name)].add(vendor_category)

        # The first-party verdict depends only on the vendor and `state`, not
        # on the individual cookie. It is resolved lazily (only if an unknown
        # cookie shows up) and then reused for the vendor's other cookies.
        first_party: bool | None = None

        for c in (v.get("cookies") or []):
            cname = (c.get("name") or "").strip()
            if not cname:
                continue
            # Cookie-level category wins; fall back to the vendor's category.
            declared_cat = (c.get("category") or vendor_category).lower()
            declared_lifetime = (c.get("duration") or c.get("persistence")
                                 or c.get("expiry") or "").strip()
            declared_days = _parse_lifetime_to_days(declared_lifetime)
            cookie_purpose = c.get("purpose") or ""

            kb = kb_lookup(cname)
            actual = (kb.get("actual_category")
                      or kb.get("consensus_category") or "").lower()
            layer = kb.get("_layer")

            # FINDING 1: MARKETING-AS-ESSENTIAL
            if (actual and _is_marketing_category(actual)
                    and _is_essential_category(declared_cat)):
                findings.append({
                    "check_id": "COOKIE-COHERENCE-MAE-001",
                    "severity": "HIGH",
                    "severity_reason": "misclassified",
                    "cookie_name": cname,
                    "vendor": vendor_name,
                    "declared_category": declared_cat,
                    "actual_category": actual,
                    "kb_source": layer,
                    "title": (
                        f"Marketing-Cookie '{cname}' ({vendor_name}) "
                        "als technisch notwendig deklariert"
                    ),
                    "norm": (
                        "DSGVO Art. 6 Abs. 1 lit. a + § 25 Abs. 1 TDDDG"
                    ),
                    "evidence": (
                        "Open Cookie Database / BreakPilot-KB "
                        f"klassifiziert '{cname}' als '{actual}'. "
                        f"Site deklariert als '{declared_cat}' — "
                        "Einwilligung wird umgangen."
                    ),
                    "recommended_action": (
                        f"Cookie '{cname}' aus Kategorie "
                        f"'{declared_cat}' entfernen und in "
                        "'Marketing/Werbung' einsortieren. "
                        "Banner-Toggle für diesen Cookie pflichtig."
                    ),
                })

            # FINDING 2: LIFETIME-TOO-LONG-FOR-ESSENTIAL
            if (_is_essential_category(declared_cat)
                    and declared_days is not None
                    and declared_days > 90):
                findings.append({
                    "check_id": "COOKIE-COHERENCE-LIFE-001",
                    "severity": "MEDIUM",
                    "severity_reason": "implausible",
                    "cookie_name": cname,
                    "vendor": vendor_name,
                    "declared_category": declared_cat,
                    "declared_lifetime": declared_lifetime,
                    "lifetime_days": declared_days,
                    "title": (
                        f"Essential-Cookie '{cname}' mit Lifetime "
                        f"{int(declared_days)} Tage — Plausibilität "
                        "fragwürdig"
                    ),
                    "norm": "DSGVO Art. 5 Abs. 1 lit. c (Datenminimierung)",
                    "evidence": (
                        f"Cookie deklariert als '{declared_cat}' "
                        f"({vendor_name}) hat Speicherdauer "
                        f"'{declared_lifetime}'. Echte technisch-"
                        "notwendige Cookies sind typischerweise "
                        "Session-Cookies oder max. 30 Tage."
                    ),
                    "recommended_action": (
                        "Speicherdauer reduzieren (Session oder <30 Tage) "
                        "ODER Kategorie korrekt setzen (functional / "
                        "marketing) wenn Lifetime tatsächlich nötig ist."
                    ),
                })

            # FINDING 3: PSEUDO_PURPOSE
            # Suppressed when the vendor-level purpose is substantial
            # (>= 6 words): the cookie merely inherits it, so it is not
            # counted a second time.
            if _is_pseudo_purpose(cookie_purpose) and not (
                v.get("purpose")
                and len(re.findall(r"\w+", v["purpose"])) >= 6
            ):
                findings.append({
                    "check_id": "COOKIE-COHERENCE-PURP-001",
                    "severity": "LOW",
                    "severity_reason": "incomplete",
                    "cookie_name": cname,
                    "vendor": vendor_name,
                    "title": (
                        f"Cookie '{cname}' ohne konkreten Zweck — "
                        "nur generischer Verweis / Floskel"
                    ),
                    "norm": "DSGVO Art. 13 Abs. 1 lit. c",
                    "evidence": (
                        f"Zweck: '{cookie_purpose[:120]}'"
                    ),
                    "recommended_action": (
                        f"Konkreten Zweck für '{cname}' angeben "
                        "(was wird damit konkret gespeichert / "
                        "verarbeitet) — nicht nur Vendor-Verweis."
                    ),
                })

            # FINDING 4: MISSING_COUNTRY (only for cookies the KB has a
            # category for)
            if not vendor_country and actual:
                findings.append({
                    "check_id": "COOKIE-COHERENCE-CTRY-001",
                    "severity": "LOW",
                    "severity_reason": "missing",
                    "cookie_name": cname,
                    "vendor": vendor_name,
                    "title": (
                        f"Sitzland für '{cname}' ({vendor_name}) fehlt"
                    ),
                    "norm": "DSGVO Art. 13 Abs. 1 lit. f (Drittlandtransfer)",
                    "evidence": "vendor_country leer in Deklaration",
                    "recommended_action": (
                        f"Sitzland von {vendor_name} ergänzen. "
                        "KB-Hinweis: laut Bibliothek "
                        f"{kb.get('vendor_country') or '?'}"
                    ),
                })

            # FINDING 5: UNKNOWN_VENDOR — only emitted for third-party
            # vendors. Cookies of the site operator itself are not a
            # finding; the operator defines those.
            if layer == "unknown":
                if first_party is None:
                    first_party = _is_first_party_owner(vendor_name, state)
                if not first_party:
                    findings.append({
                        "check_id": "COOKIE-COHERENCE-UNK-001",
                        "severity": "INFO",
                        "severity_reason": "auto_learning",
                        "cookie_name": cname,
                        "vendor": vendor_name,
                        "title": (
                            f"Cookie '{cname}' nicht in Open Cookie Database / "
                            "BreakPilot-KB"
                        ),
                        "norm": "Auto-Learning-Kandidat",
                        "evidence": (
                            "Keine Reference-Klassifikation verfügbar. "
                            "Wird in cookie_behavior_audits geloggt; bei "
                            "wiederholter Beobachtung (Cross-Site-Konsens) "
                            "automatisch zur DB promotion."
                        ),
                        "recommended_action": (
                            "Manuell prüfen + ggf. zu BreakPilot-KB hinzufügen."
                        ),
                    })

    # FINDING 6: DUPLICATE_VENDOR (across categories), one per vendor
    for vnorm, cats in vendor_categories.items():
        # Defensive: ignore empty category strings before counting.
        real_cats = {cat for cat in cats if cat}
        if len(real_cats) <= 1:
            continue
        findings.append({
            "check_id": "COOKIE-COHERENCE-DUP-001",
            "severity": "MEDIUM",
            "severity_reason": "split_stack",
            "vendor": vnorm,
            "categories": sorted(real_cats),
            "title": (
                f"Vendor '{vnorm}' in {len(real_cats)} "
                "Kategorien gleichzeitig deklariert"
            ),
            "norm": "DSGVO Art. 13 Abs. 1 lit. c (Klarheit)",
            "evidence": (
                "Vendor erscheint in: "
                f"{', '.join(sorted(real_cats))}. Aufspaltung "
                "schmuggelt oft Marketing-Funktionen unter "
                "'erforderlich'."
            ),
            "recommended_action": (
                f"Vendor '{vnorm}' auf EINE Kategorie "
                "konsolidieren (höchste Schutzkategorie wählen — "
                "wenn Marketing-Funktionen dabei sind: "
                "vollständig zu Marketing)."
            ),
        })

    if findings:
        logger.info("B19 cookie-coherence: %d finding(s)", len(findings))
    return findings
|