Files
breakpilot-compliance/backend-compliance/compliance/services/cookie_to_vendor_fallback.py
T
Benjamin Admin 138d9068c4
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / detect-changes (push) Successful in 11s
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / validate-canonical-controls (push) Successful in 16s
CI / loc-budget (push) Failing after 18s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Has been skipped
CI / test-go (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-backend (push) Successful in 41s
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
fix(audit): VW-Cookie-Tabelle — Library-Fallback + Pattern-Extract verstaerkt
VW-Lehre: cmp_vendors=6 (alle LLM-grob) wurde als ausreichend gewertet,
obwohl die echte Cookie-Tabelle 30+ Eintraege hat. 3 Fixes:

1. fallback_vendors_for_run skip-Schwelle: existing_vendor_count >= 3
   war zu niedrig. Jetzt nur skip wenn < 5 Cookies UND >= 5 Vendors
   schon vorhanden.

2. Library-Fallback wird jetzt aufgerufen bei < 20 cmp_vendors (statt
   < 3). VW-typische Setups (6 LLM-grob + 30 aus Library) bekommen
   damit eine vollstaendige Vendor-Liste.

3. _extract_cookie_names_from_doc: regex-Pattern-Extract aus dem
   Cookie-Doc-Text selbst — sucht nach 'NAME Tracking Cookies (Marketing)'
   etc. Findet Cookie-Namen die NICHT im Browser-Jar landen (z.B. nur
   nach Consent geladen werden). Diese werden zusaetzlich durch die
   Library matched.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-21 18:32:07 +02:00

148 lines
4.8 KiB
Python

"""
Cookie-zu-Vendor-Fallback (P52 Lite).
Wenn weder cmp_payloads noch vendor_llm_extract Vendors lieferten,
matchen wir die im after_accept gesehenen Cookies gegen die
compliance.cookie_library und bauen Vendor-Records aus den Library-
Eintraegen (cookie_name → vendor_name, actual_category).
Typisches Szenario: VW nutzt ein Custom-CMP (cookiemgmt-Wrapper),
kein bekanntes IAB-Tool. cmp_payloads = leer, aber after_accept.cookies
hat 28 Eintraege. Diese 28 Cookies sind in der Library = ~15-20 Vendors.
"""
from __future__ import annotations
import logging
import re
from typing import Iterable
from sqlalchemy import text
from sqlalchemy.orm import Session
logger = logging.getLogger(__name__)
def _collect_cookie_names(banner_result: dict | None) -> set[str]:
names: set[str] = set()
if not isinstance(banner_result, dict):
return names
for ph in (banner_result.get("phases") or {}).values():
if not isinstance(ph, dict):
continue
for ck in (ph.get("cookies") or []):
if isinstance(ck, str):
names.add(ck.strip())
elif isinstance(ck, dict):
n = (ck.get("name") or "").strip()
if n:
names.add(n)
return {n for n in names if n and len(n) <= 120}
def lookup_vendors_from_library(
    db: Session,
    cookie_names: Iterable[str],
) -> list[dict]:
    """Resolve cookie names to vendor records via ``compliance.cookie_library``.

    Falsy names are dropped; the remaining names are lower-cased and matched
    against ``LOWER(cookie_name)`` in the library table. Rows without a
    vendor name are ignored. All matching cookies of one vendor are grouped
    into a single record whose ``category`` is taken from the first row seen
    for that vendor.

    Args:
        db: SQLAlchemy session used to run the lookup query.
        cookie_names: Cookie names to look up.

    Returns:
        One vendor record per distinct vendor, each marked with
        ``source="library_fallback"``; an empty list when no usable names
        were passed.
    """
    lowered = [name.lower() for name in cookie_names if name]
    if not lowered:
        return []

    query = text(
        "\n"
        "SELECT cookie_name, actual_category, vendor_name\n"
        "FROM compliance.cookie_library\n"
        "WHERE LOWER(cookie_name) = ANY(:lc)\n"
    )
    result = db.execute(query, {"lc": lowered})

    vendors: dict[str, dict] = {}
    for cookie_name, category, vendor_name in result.fetchall():
        if not vendor_name:
            continue
        if vendor_name not in vendors:
            vendors[vendor_name] = {
                "name": vendor_name,
                "country": "",
                "purpose": "",
                "category": category or "",
                "opt_out_url": "",
                "privacy_policy_url": "",
                "persistence": "",
                "cookies": [],
                "source": "library_fallback",
            }
        vendors[vendor_name]["cookies"].append({
            "name": cookie_name,
            "purpose": "",
            "expiry": "",
            "is_third_party": True,
        })
    return list(vendors.values())
def fallback_vendors_for_run(
    db: Session,
    banner_result: dict | None,
    existing_vendor_count: int,
    cookie_doc_text: str | None = None,
) -> list[dict]:
    """Return extra vendor records to merge with the run's cmp_vendors.

    Lesson from the VW audit: a handful of coarse, LLM-extracted vendors is
    not sufficient when the real cookie table lists 30+ entries, so the
    library lookup also runs when some vendors already exist.

    Candidate cookie names come from the banner phases and, when
    ``cookie_doc_text`` is non-empty, additionally from table-like rows found
    in that text (a name followed by a category keyword).

    The lookup is skipped, and an empty list returned, when there are no
    candidate names at all, or when there are fewer than 5 candidate names
    while at least 5 vendors are already present.

    Args:
        db: SQLAlchemy session passed through to the library lookup.
        banner_result: Banner scan result holding the per-phase cookies.
        existing_vendor_count: Number of vendors already found for the run.
        cookie_doc_text: Optional text of the cookie document.

    Returns:
        The vendor records produced by the library lookup (possibly empty).
    """
    candidates = _collect_cookie_names(banner_result)
    if cookie_doc_text:
        # Names listed in the document may never reach the browser jar
        # (e.g. cookies that are only set after consent).
        candidates = candidates | _extract_cookie_names_from_doc(cookie_doc_text)

    few_cookies = len(candidates) < 5
    enough_vendors = existing_vendor_count >= 5
    if not candidates or (few_cookies and enough_vendors):
        return []

    found = lookup_vendors_from_library(db, candidates)
    if not found:
        return found

    logger.info(
        "Cookie-Library-Fallback: %d Vendors aus %d Cookies "
        "(existing cmp_vendors=%d)",
        len(found), len(candidates), existing_vendor_count,
    )
    return found
_TABLE_ROW_RE = re.compile(
r"\b([A-Za-z_][A-Za-z0-9_\-\.]{2,40})\s+"
r"(?:Tracking Cookies|Session Cookies|Funktional|Marketing|"
r"Analytics|Performance|Notwendig|Strictly\s+Necessary|"
r"Statistik|Werbung|Targeting|Personalisierung)",
re.I,
)
def _extract_cookie_names_from_doc(text: str) -> set[str]:
"""Pattern-basiertes Erkennen von Cookie-Tabellen-Zeilen.
VW-Cookie-Tabelle hat Form:
'IDE Tracking Cookies (Marketing) Dieser Cookie ... 13 Monate'
Das fangen wir mit einem Cookie-Name-vor-Category-Pattern.
"""
out: set[str] = set()
for m in _TABLE_ROW_RE.finditer(text):
name = m.group(1).strip()
# Filter offensichtliche Noise (Pronomen, Verben)
nl = name.lower()
if nl in ("dieser", "diese", "ein", "der", "die", "das",
"session", "permanent", "funktional", "notwendig",
"marketing", "analytics", "werbung", "anbieter",
"google", "facebook", "tracking", "cookie", "cookies"):
continue
if len(name) >= 3:
out.add(name)
return out