breakpilot-compliance/backend-compliance/compliance/services/vendor_normalizer.py

"""
Vendor-Deduplizierung und Garbage-Filter.

Normalisiert Vendor-Namen (Google + Google DoubleClick + DoubleClick/Google
Marketing → eine Eintragung) und entfernt Garbage-Eintraege die fälschlich
als Vendor erkannt wurden ('click to select a dealership', 'Mehrere OEMs',
URL-Fragmente, etc.).

Wird nach allen Vendor-Sources (LLM, Library, Pattern, Phase-G) angewandt
bevor die VVT-Tabelle gerendert wird.
"""

from __future__ import annotations

import logging
import re

# Module-level logger; normalize_vendors reports its drop/merge counts through it.
logger = logging.getLogger(__name__)


# Aliases: every known spelling → canonical vendor name.
#
# Keys must be lowercase: _canonical_name lowercases the input before looking
# it up here. Aliases shorter than 6 characters (e.g. "adobe", "meta") only
# ever match exactly; longer ones also take part in the substring fallback of
# _canonical_name. Insertion order can influence which alias wins that
# fallback, so add or reorder entries deliberately.
_VENDOR_ALIASES: dict[str, str] = {
    # Google family
    "google": "Google",
    "google llc": "Google",
    "google inc": "Google",
    "google marketing platform": "Google",
    "google ads": "Google",
    "google adsense": "Google",
    "google analytics": "Google Analytics",
    "google tag manager": "Google Tag Manager",
    "google doubleclick": "Google",
    "doubleclick": "Google",
    "doubleclick/google marketing": "Google",
    "doubleclick by google": "Google",
    # Adobe family
    "adobe": "Adobe",
    "adobe inc": "Adobe",
    "adobe systems": "Adobe",
    "adobe analytics": "Adobe Analytics",
    "adobe audience manager": "Adobe Audience Manager",
    "adobe experience cloud": "Adobe Experience Cloud",
    "adobe target": "Adobe Target",
    "adobe advertising cloud (everest)": "Adobe Advertising Cloud",
    # The Trade Desk
    "the trade desk": "The Trade Desk",
    "tradedesk": "The Trade Desk",
    "the tradedesk": "The Trade Desk",
    "trade desk": "The Trade Desk",
    # Meta / Facebook
    "meta": "Meta / Facebook",
    "meta platforms": "Meta / Facebook",
    "facebook": "Meta / Facebook",
    "meta / facebook": "Meta / Facebook",
    # AdForm
    "adform": "AdForm",
    "adform dsp": "AdForm",
    # Microsoft (LinkedIn is kept as its own canonical entry)
    "microsoft": "Microsoft",
    "microsoft bing": "Microsoft Bing",
    "linkedin": "LinkedIn (Microsoft)",
    "linkedin corporation": "LinkedIn (Microsoft)",
    # Consent-management platforms (CMP)
    "onetrust": "OneTrust",
    "cookiebot": "Cookiebot",
    "usercentrics": "Usercentrics",
    "borlabs": "Borlabs",
    "borlabs / cookie-cmp": "Borlabs",
    # Salesforce
    "salesforce": "Salesforce",
    "salesforce liveagent": "Salesforce",
    "liveagent": "Salesforce",
    # Cloudflare
    "cloudflare": "Cloudflare",
}


# Garbage-Patterns: wenn der Vendor-Name darauf matched → wegfiltern
_GARBAGE_PATTERNS = (
    re.compile(r"^click to ", re.I),
    re.compile(r"^mehrere oems", re.I),
    re.compile(r"^breakpilot[-_ ]?snapshot", re.I),
    re.compile(r"^https?://", re.I),  # URLs
    re.compile(r"^https?$", re.I),
    re.compile(r"^javascript:", re.I),
    re.compile(r"^undefined$|^null$|^none$", re.I),
    re.compile(r"^[\d\W]+$"),  # nur Zahlen/Symbole
    re.compile(r"^.{1,2}$"),    # Ein-/Zwei-Zeichen-"Namen"
    re.compile(r"^(ein|der|die|das|von|und|aber|oder)$", re.I),
    re.compile(r"^cookie$|^cookies$", re.I),
)


def _is_garbage(name: str) -> bool:
    if not name or len(name.strip()) < 2:
        return True
    if len(name) > 120:
        return True
    return any(p.search(name) for p in _GARBAGE_PATTERNS)


def _canonical_name(name: str) -> str:
    nl = name.strip().lower()
    if nl in _VENDOR_ALIASES:
        return _VENDOR_ALIASES[nl]
    # Sub-token-Match: 'doubleclick by google' → enthaelt 'doubleclick'
    for alias, canonical in _VENDOR_ALIASES.items():
        if alias in nl and len(alias) >= 6:
            return canonical
    return name.strip()


def _merge_cookies(existing: object, incoming: object) -> list:
    """Return a new cookie list: *existing* plus not-yet-seen cookies from *incoming*.

    Cookies are compared by their trimmed, lowercased ``"name"``. Incoming
    items that are not dicts or have no name are skipped. Neither input list
    is modified.
    """
    def cookie_key(cookie: dict) -> str:
        return str(cookie.get("name") or "").strip().lower()

    result = list(existing) if isinstance(existing, (list, tuple)) else []
    seen = {cookie_key(c) for c in result if isinstance(c, dict)}
    if not isinstance(incoming, (list, tuple)):
        return result
    for cookie in incoming:
        if not isinstance(cookie, dict):
            continue
        key = cookie_key(cookie)
        if key and key not in seen:
            result.append(cookie)
            seen.add(key)
    return result


def _merge_sources(existing: object, incoming: object) -> str:
    """Union two semicolon-separated source-tag strings, preserving first-seen order."""
    tags: list[str] = []
    for raw in (existing, incoming):
        if not isinstance(raw, str):
            continue
        # Split both sides so "llm;library" merged into "llm" adds only "library".
        for tag in raw.split(";"):
            tag = tag.strip()
            if tag and tag not in tags:
                tags.append(tag)
    return ";".join(tags)


def normalize_vendors(vendors: list[dict]) -> list[dict]:
    """Drop garbage vendor entries and merge duplicates by canonical name.

    Each vendor's ``"name"`` is checked with ``_is_garbage`` and mapped with
    ``_canonical_name``. Entries that resolve to the same canonical name
    (compared case-insensitively) are merged into the first one seen:

    * ``"cookies"`` lists are unioned by cookie name,
    * ``"source"`` tags are unioned into one semicolon-separated string,
    * the metadata fields ``country``, ``opt_out_url``,
      ``privacy_policy_url``, ``purpose``, ``category`` and ``persistence``
      are filled from later duplicates only where the first entry has no
      value.

    The input dicts and their cookie lists are not modified; the result
    contains shallow copies.

    Args:
        vendors: Vendor dicts, typically collected from several detection
            sources. Items that are not dicts are ignored.

    Returns:
        One dict per unique vendor, in order of first appearance, with
        ``"name"`` set to the canonical name.
    """
    if not vendors:
        return []
    by_canon: dict[str, dict] = {}
    dropped_garbage = 0
    merged = 0
    for v in vendors:
        if not isinstance(v, dict):
            continue
        raw = v.get("name")
        # A missing or non-string name becomes "" and is dropped as garbage
        # instead of crashing the whole run.
        raw_name = raw.strip() if isinstance(raw, str) else ""
        if _is_garbage(raw_name):
            dropped_garbage += 1
            continue
        canon = _canonical_name(raw_name)
        # Case-insensitive key so unknown vendors such as "Acme" / "ACME"
        # are deduplicated just like the aliased ones.
        key = canon.casefold()
        ex = by_canon.get(key)
        if ex is None:
            # Shallow copy: the caller's dict must not be renamed in place.
            entry = dict(v)
            entry["name"] = canon
            by_canon[key] = entry
            continue
        ex["cookies"] = _merge_cookies(ex.get("cookies"), v.get("cookies"))
        ex["source"] = _merge_sources(ex.get("source"), v.get("source"))
        # Fill in metadata the first entry is missing.
        for k in ("country", "opt_out_url", "privacy_policy_url",
                  "purpose", "category", "persistence"):
            if not ex.get(k) and v.get(k):
                ex[k] = v[k]
        merged += 1
    if dropped_garbage or merged:
        logger.info(
            "Vendor-Normalizer: %d garbage dropped, %d duplicate merges, "
            "%d unique vendors (input: %d)",
            dropped_garbage, merged, len(by_canon), len(vendors),
        )
    return list(by_canon.values())