From 9f06911ff97a7b0aa77e8b45575b0dd3a2c56631 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Thu, 21 May 2026 17:00:49 +0200 Subject: [PATCH] feat(audit): Cookie-Library-Fallback fuer VW-Pattern (kein bekanntes CMP) Wenn nach Standard-Extract + Phase-G + LLM-Cascade weiterhin < 3 cmp_vendors aber >= 5 Cookies im after_accept stehen (typisch: Custom-CMP wie VW 'cookiemgmt'), matcht der Fallback die Cookie-Namen gegen die compliance.cookie_library und rekonstruiert Vendor-Records aus den Library-Eintraegen. Hintergrund: VW Run de2a029e zeigt 4 Vendors trotz 28 after_accept-Cookies. cmp_payloads ist 0 (kein bekanntes IAB-Tool erkannt) und die hinterlegte Cookie-URL liefert 404. Die DSE ist mit 34k zwar substanziell, listet aber keine Vendor-Tabelle. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../api/agent_compliance_check_routes.py | 29 +++++ .../services/cookie_to_vendor_fallback.py | 100 ++++++++++++++++++ 2 files changed, 129 insertions(+) create mode 100644 backend-compliance/compliance/services/cookie_to_vendor_fallback.py diff --git a/backend-compliance/compliance/api/agent_compliance_check_routes.py b/backend-compliance/compliance/api/agent_compliance_check_routes.py index 8d0f2935..6c05f815 100644 --- a/backend-compliance/compliance/api/agent_compliance_check_routes.py +++ b/backend-compliance/compliance/api/agent_compliance_check_routes.py @@ -727,6 +727,35 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest): logger.info("P57: added %d new vendors from Phase G (total: %d)", added, len(cmp_vendors)) + # Cookie-Library-Fallback (P52 Lite): wenn weiterhin wenige + # Vendors aber viele after_accept-Cookies, aus Library auflösen. 
+ if banner_result and len(cmp_vendors) < 3: + try: + from compliance.services.cookie_to_vendor_fallback import ( + fallback_vendors_for_run, + ) + from database import SessionLocal as _SLfb + _fb_db = _SLfb() + try: + extra = fallback_vendors_for_run( + _fb_db, banner_result, len(cmp_vendors), + ) + if extra: + existing_names = {(v.get("name") or "").strip().lower() + for v in cmp_vendors} + for v in extra: + if v["name"].lower() in existing_names: + continue + cmp_vendors.append(v) + logger.info( + "Cookie-Library-Fallback: cmp_vendors %d -> %d", + len(cmp_vendors) - len(extra), len(cmp_vendors), + ) + finally: + _fb_db.close() + except Exception as e: + logger.warning("Cookie-Library-Fallback skipped: %s", e) + # P50: enrich vendors with per-vendor detail-modal-extracts # (description, opt-out URL, privacy URL, cookies). Detail # comes from Phase G Info-button-click-through in /scan. diff --git a/backend-compliance/compliance/services/cookie_to_vendor_fallback.py b/backend-compliance/compliance/services/cookie_to_vendor_fallback.py new file mode 100644 index 00000000..322acabc --- /dev/null +++ b/backend-compliance/compliance/services/cookie_to_vendor_fallback.py @@ -0,0 +1,100 @@ +""" +Cookie-zu-Vendor-Fallback (P52 Lite). + +Wenn weder cmp_payloads noch vendor_llm_extract Vendors lieferten, +matchen wir die im after_accept gesehenen Cookies gegen die +compliance.cookie_library und bauen Vendor-Records aus den Library- +Eintraegen (cookie_name → vendor_name, actual_category). + +Typisches Szenario: VW nutzt ein Custom-CMP (cookiemgmt-Wrapper), +kein bekanntes IAB-Tool. cmp_payloads = leer, aber after_accept.cookies +hat 28 Eintraege. Diese 28 Cookies sind in der Library = ~15-20 Vendors. 
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Iterable
+
+from sqlalchemy import text
+from sqlalchemy.orm import Session
+
+logger = logging.getLogger(__name__)
+
+# Cookie names longer than this are ignored when collecting names.
+_MAX_COOKIE_NAME_LENGTH = 120
+
+
+def _collect_cookie_names(banner_result: dict | None) -> set[str]:
+    """Return the distinct, stripped cookie names from all scan phases.
+
+    Args:
+        banner_result: Scan result whose ``"phases"`` mapping holds one
+            dict per phase, each with an optional ``"cookies"`` list.
+            Cookies may be plain strings or dicts with a ``"name"`` key.
+
+    Returns:
+        The set of non-empty names of at most ``_MAX_COOKIE_NAME_LENGTH``
+        characters. A missing or non-dict ``banner_result``/``"phases"``,
+        non-dict phases and cookies without a string name are skipped
+        instead of raising.
+    """
+    if not isinstance(banner_result, dict):
+        return set()
+    phases = banner_result.get("phases")
+    if not isinstance(phases, dict):
+        return set()
+    names: set[str] = set()
+    for phase in phases.values():
+        if not isinstance(phase, dict):
+            continue
+        for cookie in phase.get("cookies") or []:
+            raw = cookie.get("name") if isinstance(cookie, dict) else cookie
+            # A non-string name (e.g. a number) is skipped; calling
+            # .strip() on it would raise.
+            if not isinstance(raw, str):
+                continue
+            name = raw.strip()
+            if name and len(name) <= _MAX_COOKIE_NAME_LENGTH:
+                names.add(name)
+    return names
+
+
+def lookup_vendors_from_library(
+    db: Session,
+    cookie_names: Iterable[str],
+) -> list[dict]:
+    """Resolve cookie names to vendor records via compliance.cookie_library.
+
+    Cookie names are matched case-insensitively. Library rows are grouped
+    per vendor name, ignoring case and surrounding whitespace, and every
+    cookie is listed once per vendor.
+
+    Args:
+        db: Session used to run the lookup query.
+        cookie_names: Cookie names to resolve; falsy entries are ignored.
+
+    Returns:
+        One dict per vendor with the keys ``name``, ``country``,
+        ``purpose``, ``category``, ``opt_out_url``, ``privacy_policy_url``,
+        ``persistence``, ``cookies`` and ``source`` (always
+        ``"library_fallback"``). Only ``name``, ``category`` and
+        ``cookies`` are filled from the library; the other text fields are
+        empty strings. Returns ``[]`` when no usable name is given.
+    """
+    # De-duplicate the lower-cased names so the bound array stays small.
+    wanted = sorted({n.lower() for n in cookie_names if n})
+    if not wanted:
+        return []
+    rows = db.execute(text(
+        """
+        SELECT cookie_name, actual_category, vendor_name
+        FROM compliance.cookie_library
+        WHERE LOWER(cookie_name) = ANY(:lc)
+        """
+    ), {"lc": wanted}).fetchall()
+    by_vendor: dict[str, dict] = {}
+    seen_cookies: set[tuple[str, str]] = set()
+    for cname, cat, vendor in rows:
+        label = (vendor or "").strip()
+        if not label:
+            continue
+        # Group on a normalised key so "Google" and "google " end up in
+        # one record; the first spelling seen becomes the display name.
+        vendor_key = label.lower()
+        entry = by_vendor.setdefault(vendor_key, {
+            "name": label,
+            "country": "",
+            "purpose": "",
+            "category": cat or "",
+            "opt_out_url": "",
+            "privacy_policy_url": "",
+            "persistence": "",
+            "cookies": [],
+            "source": "library_fallback",
+        })
+        # The first row of a vendor may lack a category; use a later one.
+        if not entry["category"] and cat:
+            entry["category"] = cat
+        # Matching on LOWER(cookie_name) can return several case variants
+        # of the same cookie; list each cookie once per vendor.
+        cookie_key = (vendor_key, cname.lower())
+        if cookie_key in seen_cookies:
+            continue
+        seen_cookies.add(cookie_key)
+        # NOTE(review): the query selects no first-/third-party column, so
+        # the flag is set unconditionally here.
+        entry["cookies"].append({
+            "name": cname, "purpose": "", "expiry": "",
+            "is_third_party": True,
+        })
+    return list(by_vendor.values())
+
+
+def fallback_vendors_for_run(
+    db: Session,
+    banner_result: dict | None,
+    existing_vendor_count: int,
+    *,
+    vendor_threshold: int = 3,
+    min_cookies: int = 5,
+) -> list[dict]:
+    """Return extra vendor records to merge with the run's cmp_vendors.
+
+    The library is only queried when the run looks under-detected: fewer
+    than ``vendor_threshold`` vendors are known and at least
+    ``min_cookies`` distinct cookie names were collected.
+
+    Args:
+        db: Session used for the library lookup.
+        banner_result: Scan result to collect cookie names from.
+        existing_vendor_count: Number of vendors already found for the run.
+        vendor_threshold: Skip the fallback at or above this vendor count.
+        min_cookies: Skip the fallback below this many cookie names.
+
+    Returns:
+        Vendor records built from the library, or ``[]`` when the fallback
+        does not apply or nothing matched.
+    """
+    if existing_vendor_count >= vendor_threshold:
+        return []
+    names = _collect_cookie_names(banner_result)
+    if len(names) < min_cookies:
+        return []
+    vendors = lookup_vendors_from_library(db, names)
+    if vendors:
+        logger.info(
+            "Cookie-Library-Fallback: %d Vendors aus %d Cookies (vorher %d)",
+            len(vendors), len(names), existing_vendor_count,
+        )
+    return vendors