""" Vendor-Deduplizierung und Garbage-Filter. Normalisiert Vendor-Namen (Google + Google DoubleClick + DoubleClick/Google Marketing → eine Eintragung) und entfernt Garbage-Eintraege die fälschlich als Vendor erkannt wurden ('click to select a dealership', 'Mehrere OEMs', URL-Fragmente, etc.). Wird nach allen Vendor-Sources (LLM, Library, Pattern, Phase-G) angewandt bevor die VVT-Tabelle gerendert wird. """ from __future__ import annotations import logging import re logger = logging.getLogger(__name__) # Aliase: alle Schreibweisen → kanonischer Name _VENDOR_ALIASES: dict[str, str] = { # Google-Familie "google": "Google", "google llc": "Google", "google inc": "Google", "google marketing platform": "Google", "google ads": "Google", "google adsense": "Google", "google analytics": "Google Analytics", "google tag manager": "Google Tag Manager", "google doubleclick": "Google", "doubleclick": "Google", "doubleclick/google marketing": "Google", "doubleclick by google": "Google", # Adobe-Familie "adobe": "Adobe", "adobe inc": "Adobe", "adobe systems": "Adobe", "adobe analytics": "Adobe Analytics", "adobe audience manager": "Adobe Audience Manager", "adobe experience cloud": "Adobe Experience Cloud", "adobe target": "Adobe Target", "adobe advertising cloud (everest)": "Adobe Advertising Cloud", # Trade Desk "the trade desk": "The Trade Desk", "tradedesk": "The Trade Desk", "the tradedesk": "The Trade Desk", "trade desk": "The Trade Desk", # Meta "meta": "Meta / Facebook", "meta platforms": "Meta / Facebook", "facebook": "Meta / Facebook", "meta / facebook": "Meta / Facebook", # AdForm "adform": "AdForm", "adform dsp": "AdForm", # Microsoft "microsoft": "Microsoft", "microsoft bing": "Microsoft Bing", "linkedin": "LinkedIn (Microsoft)", "linkedin corporation": "LinkedIn (Microsoft)", # CMP "onetrust": "OneTrust", "cookiebot": "Cookiebot", "usercentrics": "Usercentrics", "borlabs": "Borlabs", "borlabs / cookie-cmp": "Borlabs", # Salesforce "salesforce": "Salesforce", "salesforce liveagent": "Salesforce", "liveagent": "Salesforce", # Cloudflare "cloudflare": "Cloudflare", } # Garbage-Patterns: wenn der Vendor-Name darauf matched → wegfiltern _GARBAGE_PATTERNS = ( re.compile(r"^click to ", re.I), re.compile(r"^mehrere oems", re.I), re.compile(r"^breakpilot[-_ ]?snapshot", re.I), re.compile(r"^https?://", re.I), # URLs re.compile(r"^https?$", re.I), re.compile(r"^javascript:", re.I), re.compile(r"^undefined$|^null$|^none$", re.I), re.compile(r"^[\d\W]+$"), # nur Zahlen/Symbole re.compile(r"^.{1,2}$"), # Ein-/Zwei-Zeichen-"Namen" re.compile(r"^(ein|der|die|das|von|und|aber|oder)$", re.I), re.compile(r"^cookie$|^cookies$", re.I), ) def _is_garbage(name: str) -> bool: if not name or len(name.strip()) < 2: return True if len(name) > 120: return True return any(p.search(name) for p in _GARBAGE_PATTERNS) def _canonical_name(name: str) -> str: nl = name.strip().lower() if nl in _VENDOR_ALIASES: return _VENDOR_ALIASES[nl] # Sub-token-Match: 'doubleclick by google' → enthaelt 'doubleclick' for alias, canonical in _VENDOR_ALIASES.items(): if alias in nl and len(alias) >= 6: return canonical return name.strip() def normalize_vendors(vendors: list[dict]) -> list[dict]: """Filtert Garbage + dedupliziert anhand kanonischer Aliase. Mergt cookies-Listen wenn der gleiche Vendor mehrfach erscheint (z.B. aus LLM + Library + Phase-G). Behaelt Metadaten des Eintrags mit der laengsten cookies-Liste. """ if not vendors: return [] by_canon: dict[str, dict] = {} dropped_garbage = 0 merged = 0 for v in vendors: if not isinstance(v, dict): continue raw_name = (v.get("name") or "").strip() if _is_garbage(raw_name): dropped_garbage += 1 continue canon = _canonical_name(raw_name) if canon in by_canon: # Merge: cookies vereinen, source-Tags joinen ex = by_canon[canon] ex_cookies = ex.get("cookies") or [] new_cookies = v.get("cookies") or [] seen_ck = {(c.get("name") or "").lower() for c in ex_cookies if isinstance(c, dict)} for c in new_cookies: if isinstance(c, dict): nm = (c.get("name") or "").strip().lower() if nm and nm not in seen_ck: ex_cookies.append(c) seen_ck.add(nm) ex["cookies"] = ex_cookies # Source-Tag merging (semicolon-separated) ex_src = (ex.get("source") or "").split(";") new_src = v.get("source") or "" if new_src and new_src not in ex_src: ex_src.append(new_src) ex["source"] = ";".join([s for s in ex_src if s]) # Bessere Metadaten uebernehmen (falls leer) for k in ("country", "opt_out_url", "privacy_policy_url", "purpose", "category", "persistence"): if not ex.get(k) and v.get(k): ex[k] = v[k] merged += 1 else: v["name"] = canon by_canon[canon] = v if dropped_garbage or merged: logger.info( "Vendor-Normalizer: %d garbage dropped, %d duplicate merges, " "%d unique vendors (input: %d)", dropped_garbage, merged, len(by_canon), len(vendors), ) return list(by_canon.values())