"""Cookie-to-vendor fallback (P52 Lite).

When neither cmp_payloads nor vendor_llm_extract produced vendors, the
cookies seen in the banner phases (e.g. after_accept) are matched against
compliance.cookie_library and vendor records are built from the library
entries (cookie_name → vendor_name, actual_category).

Typical scenario: VW uses a custom CMP (cookiemgmt wrapper), not a known
IAB tool. cmp_payloads is empty, but after_accept.cookies has 28 entries.
Those 28 cookies are in the library = ~15-20 vendors.
"""
from __future__ import annotations

import logging
import re
from typing import Iterable

from sqlalchemy import text
from sqlalchemy.orm import Session

logger = logging.getLogger(__name__)

# Cookie names longer than this are discarded by _collect_cookie_names.
_MAX_COOKIE_NAME_LEN = 120

# The library lookup is skipped only when fewer than _MIN_COOKIE_NAMES names
# are known AND at least _ENOUGH_VENDORS vendors already exist for the run.
_MIN_COOKIE_NAMES = 5
_ENOUGH_VENDORS = 5

# Tokens that match the "name" slot of _TABLE_ROW_RE but are obviously not
# cookie names (articles, pronouns, category words, vendor names).
_NOISE_WORDS = frozenset({
    "dieser", "diese", "ein", "der", "die", "das",
    "session", "permanent", "funktional", "notwendig",
    "marketing", "analytics", "werbung", "anbieter",
    "google", "facebook", "tracking", "cookie", "cookies",
})

# A name-like token (3-41 chars) followed by whitespace and a category word.
_TABLE_ROW_RE = re.compile(
    r"\b([A-Za-z_][A-Za-z0-9_\-\.]{2,40})\s+"
    r"(?:Tracking Cookies|Session Cookies|Funktional|Marketing|"
    r"Analytics|Performance|Notwendig|Strictly\s+Necessary|"
    r"Statistik|Werbung|Targeting|Personalisierung)",
    re.I,
)


def _collect_cookie_names(banner_result: dict | None) -> set[str]:
    """Collect the distinct cookie names from every phase of *banner_result*.

    Reads ``banner_result["phases"][<phase>]["cookies"]``; each cookie may be
    a plain string or a dict with a ``"name"`` key. Names are stripped, and
    empty names or names longer than ``_MAX_COOKIE_NAME_LEN`` are dropped.

    Malformed input (non-dict result, non-dict ``phases`` or phase, cookie
    entries or names that are not strings) is skipped instead of raising.
    """
    if not isinstance(banner_result, dict):
        return set()
    phases = banner_result.get("phases")
    if not isinstance(phases, dict):
        return set()

    names: set[str] = set()
    for phase in phases.values():
        if not isinstance(phase, dict):
            continue
        for cookie in phase.get("cookies") or []:
            raw = cookie.get("name") if isinstance(cookie, dict) else cookie
            # Guard against None / numeric names, which have no .strip().
            if not isinstance(raw, str):
                continue
            name = raw.strip()
            if name and len(name) <= _MAX_COOKIE_NAME_LEN:
                names.add(name)
    return names


def lookup_vendors_from_library(
    db: Session,
    cookie_names: Iterable[str],
) -> list[dict]:
    """Resolve cookie names to vendor records via compliance.cookie_library.

    Args:
        db: Session used to run the lookup query.
        cookie_names: Cookie names to look up; matching is case-insensitive.
            Empty and non-string items are ignored.

    Returns:
        One record per vendor found, each with ``source`` set to
        ``"library_fallback"`` and a ``cookies`` list holding the matched
        library cookie names. Rows without a vendor name are skipped.
        Returns ``[]`` without querying when there is nothing to look up.
    """
    # De-duplicate the lowercased keys; sorting keeps the bind value stable.
    lookup_keys = sorted(
        {n.lower() for n in cookie_names if isinstance(n, str) and n}
    )
    if not lookup_keys:
        return []

    rows = db.execute(
        text(
            """
            SELECT cookie_name, actual_category, vendor_name
            FROM compliance.cookie_library
            WHERE LOWER(cookie_name) = ANY(:lc)
            """
        ),
        {"lc": lookup_keys},
    ).fetchall()

    by_vendor: dict[str, dict] = {}
    seen: set[tuple[str, str]] = set()
    for cookie_name, category, vendor in rows:
        if not vendor:
            continue
        entry = by_vendor.get(vendor)
        if entry is None:
            entry = by_vendor[vendor] = {
                "name": vendor,
                "country": "",
                "purpose": "",
                "category": category or "",
                "opt_out_url": "",
                "privacy_policy_url": "",
                "persistence": "",
                "cookies": [],
                "source": "library_fallback",
            }
        elif not entry["category"] and category:
            # The first row had no category; take the first one that does.
            entry["category"] = category

        # Repeated library rows must not yield duplicate cookie entries.
        key = (vendor, cookie_name)
        if key in seen:
            continue
        seen.add(key)
        entry["cookies"].append({
            "name": cookie_name,
            "purpose": "",
            "expiry": "",
            "is_third_party": True,
        })
    return list(by_vendor.values())


def fallback_vendors_for_run(
    db: Session,
    banner_result: dict | None,
    existing_vendor_count: int,
    cookie_doc_text: str | None = None,
) -> list[dict]:
    """Return extra vendor records to merge with the run's cmp_vendors.

    Lesson from VW: cmp_vendors=6 (all coarse LLM results) is NOT enough,
    the real cookie table has 30+ entries. The lookup therefore also runs
    for mid-tier vendor counts.

    Cookie names come from the banner phases and, when *cookie_doc_text* is
    given, from table-like rows in that text (a name followed by a category
    word such as "Tracking Cookies" or "Funktional").

    The lookup is skipped only when no cookie names are known, or when
    fewer than ``_MIN_COOKIE_NAMES`` names are known and
    *existing_vendor_count* is already at least ``_ENOUGH_VENDORS``.

    Args:
        db: Session passed through to the library lookup.
        banner_result: Banner run result containing a ``phases`` mapping.
        existing_vendor_count: Number of vendors already found for the run.
        cookie_doc_text: Optional cookie documentation text to mine for
            additional cookie names.

    Returns:
        Vendor records from the library, or ``[]`` when skipped or when
        nothing matched.
    """
    names = _collect_cookie_names(banner_result)
    if cookie_doc_text:
        names |= _extract_cookie_names_from_doc(cookie_doc_text)

    if not names:
        return []
    if (
        len(names) < _MIN_COOKIE_NAMES
        and existing_vendor_count >= _ENOUGH_VENDORS
    ):
        return []

    vendors = lookup_vendors_from_library(db, names)
    if vendors:
        logger.info(
            "Cookie-Library-Fallback: %d Vendors aus %d Cookies "
            "(existing cmp_vendors=%d)",
            len(vendors), len(names), existing_vendor_count,
        )
    return vendors


def _extract_cookie_names_from_doc(text: str) -> set[str]:
    """Find cookie names in cookie-table rows using a name-before-category pattern.

    The VW cookie table has rows of the form
    'IDE Tracking Cookies (Marketing) Dieser Cookie ... 13 Monate',
    which the pattern picks up as the name 'IDE'.

    Candidates whose lowercase form is in ``_NOISE_WORDS`` are dropped.
    The pattern already limits names to 3-41 non-whitespace characters,
    so no further length check or stripping is needed.

    Note: the ``text`` parameter shadows the module-level SQLAlchemy
    ``text`` import inside this function only; the function does not use
    the import.
    """
    candidates = (m.group(1) for m in _TABLE_ROW_RE.finditer(text))
    return {name for name in candidates if name.lower() not in _NOISE_WORDS}