Files
breakpilot-compliance/backend-compliance/compliance/services/vendor_normalizer.py
T
Benjamin Admin 081e4f057a
CI / detect-changes (push) Successful in 12s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / test-go (push) Failing after 55s
CI / iace-gt-coverage (push) Successful in 25s
CI / test-python-backend (push) Successful in 44s
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
CI / validate-canonical-controls (push) Successful in 16s
CI / loc-budget (push) Failing after 18s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Successful in 2m43s
feat(audit): Cookie-Compliance-Audit (3-Quellen-Vergleich) + Vendor-Dedup + Block-Parser
ZENTRALER USP: cookie_compliance_audit.py vergleicht 3 Quellen
* DEKLARIERT in Cookie-Richtlinie (parse_cookie_table + parse_flat)
* TATSAECHLICH im Browser geladen (banner_result.phases.after_accept)
* LIBRARY-Metadaten (cookie_library lookup)

Liefert 3 Listen mit Compliance-Verdict:
* compliant (deklariert UND geladen) — gruener Block
* undeclared_in_browser (geladen NICHT deklariert) — ROTER HIGH-Block
  → Art. 13(1)(c) DSGVO + § 25 TDDDG Verstoss
* declared_not_loaded (deklariert NICHT geladen) — gelber Hinweis
  → Tabelle moeglicherweise veraltet

parse_cookie_table erweitert um Block-Format (5 Zeilen pro Cookie wie
beim User-Copy aus VW). Findet 35+ Cookies aus Copy-Paste statt 0.

vendor_normalizer.py: 50+ Aliases (Google-Familie, Adobe-Familie,
Trade Desk, AdForm, ...) + Garbage-Filter (URLs, leere Strings,
'click to select', 'Mehrere OEMs'). Mergt cookies-Listen beim Dedup.

_guess_vendor erweitert: Adobe-Familie (s_ecid/AMCV/demdex/mbox/...),
Trade Desk (TDID/TDCPM/TTDOptOut), AdForm (uid/cid/otsid),
Salesforce LiveAgent, etracker, Akamai, EDAA.

audit_quality_checks: vendor-thin-Threshold jetzt dynamisch nach
Cookie-Doc-Wörter (3k→10 / 6k→20 / 10k→30 / 15k+→40).

VW-Test-Fixture: tests/fixtures/cookie_gt/vw_cookie_richtlinie.txt
(36-Cookie-Sample fuer Regression-Tests).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-21 23:36:45 +02:00

168 lines
5.6 KiB
Python

"""
Vendor-Deduplizierung und Garbage-Filter.
Normalisiert Vendor-Namen (Google + Google DoubleClick + DoubleClick/Google
Marketing → ein Eintrag) und entfernt Garbage-Einträge, die fälschlich
als Vendor erkannt wurden ('click to select a dealership', 'Mehrere OEMs',
URL-Fragmente etc.).
Wird nach allen Vendor-Sources (LLM, Library, Pattern, Phase-G) angewandt,
bevor die VVT-Tabelle gerendert wird.
"""
from __future__ import annotations
import logging
import re
logger = logging.getLogger(__name__)
# Alias table: every known spelling (lower-cased) -> canonical vendor name.
# The table is assembled from one sub-table per vendor family. The families
# are merged in their original order and share no keys, so the iteration
# order of the final table is the same as that of a single flat literal.
_GOOGLE_FAMILY = {
    "google": "Google",
    "google llc": "Google",
    "google inc": "Google",
    "google marketing platform": "Google",
    "google ads": "Google",
    "google adsense": "Google",
    "google analytics": "Google Analytics",
    "google tag manager": "Google Tag Manager",
    "google doubleclick": "Google",
    "doubleclick": "Google",
    "doubleclick/google marketing": "Google",
    "doubleclick by google": "Google",
}
_ADOBE_FAMILY = {
    "adobe": "Adobe",
    "adobe inc": "Adobe",
    "adobe systems": "Adobe",
    "adobe analytics": "Adobe Analytics",
    "adobe audience manager": "Adobe Audience Manager",
    "adobe experience cloud": "Adobe Experience Cloud",
    "adobe target": "Adobe Target",
    "adobe advertising cloud (everest)": "Adobe Advertising Cloud",
}
_TRADE_DESK_FAMILY = {
    "the trade desk": "The Trade Desk",
    "tradedesk": "The Trade Desk",
    "the tradedesk": "The Trade Desk",
    "trade desk": "The Trade Desk",
}
_META_FAMILY = {
    "meta": "Meta / Facebook",
    "meta platforms": "Meta / Facebook",
    "facebook": "Meta / Facebook",
    "meta / facebook": "Meta / Facebook",
}
_ADFORM_FAMILY = {
    "adform": "AdForm",
    "adform dsp": "AdForm",
}
_MICROSOFT_FAMILY = {
    "microsoft": "Microsoft",
    "microsoft bing": "Microsoft Bing",
    "linkedin": "LinkedIn (Microsoft)",
    "linkedin corporation": "LinkedIn (Microsoft)",
}
# Consent-management platforms (CMP)
_CMP_FAMILY = {
    "onetrust": "OneTrust",
    "cookiebot": "Cookiebot",
    "usercentrics": "Usercentrics",
    "borlabs": "Borlabs",
    "borlabs / cookie-cmp": "Borlabs",
}
_SALESFORCE_FAMILY = {
    "salesforce": "Salesforce",
    "salesforce liveagent": "Salesforce",
    "liveagent": "Salesforce",
}
_CLOUDFLARE_FAMILY = {
    "cloudflare": "Cloudflare",
}
_VENDOR_ALIASES: dict[str, str] = {
    **_GOOGLE_FAMILY,
    **_ADOBE_FAMILY,
    **_TRADE_DESK_FAMILY,
    **_META_FAMILY,
    **_ADFORM_FAMILY,
    **_MICROSOFT_FAMILY,
    **_CMP_FAMILY,
    **_SALESFORCE_FAMILY,
    **_CLOUDFLARE_FAMILY,
}
# Garbage filter: a vendor name that matches any of these expressions is
# filtered out. Each entry is (expression, flags); flags=0 compiles exactly
# like calling re.compile() without a flags argument.
_GARBAGE_PATTERNS = tuple(
    re.compile(expression, flags)
    for expression, flags in (
        (r"^click to ", re.I),
        (r"^mehrere oems", re.I),
        (r"^breakpilot[-_ ]?snapshot", re.I),
        (r"^https?://", re.I),  # URLs
        (r"^https?$", re.I),
        (r"^javascript:", re.I),
        (r"^undefined$|^null$|^none$", re.I),
        (r"^[\d\W]+$", 0),  # digits/symbols only
        (r"^.{1,2}$", 0),  # one- or two-character "names"
        (r"^(ein|der|die|das|von|und|aber|oder)$", re.I),
        (r"^cookie$|^cookies$", re.I),
    )
)
def _is_garbage(name: str) -> bool:
    """Return True when *name* cannot be used as a vendor name.

    A name is rejected when it is missing or not a string, when its trimmed
    form is shorter than 2 or longer than 120 characters, or when the trimmed
    form matches one of ``_GARBAGE_PATTERNS``.

    Args:
        name: Raw vendor name as delivered by a vendor source.

    Returns:
        True if the entry should be dropped, False if it looks like a vendor.
    """
    if not isinstance(name, str):
        # None, numbers or nested structures are never a vendor name; reject
        # them instead of failing on a missing .strip().
        return True
    # Trim once and judge the trimmed text everywhere. The patterns are
    # anchored with ^ / $, so surrounding whitespace would otherwise let
    # URLs, stop-words and similar junk slip past the filter.
    text = name.strip()
    if len(text) < 2 or len(text) > 120:
        return True
    return any(pattern.search(text) for pattern in _GARBAGE_PATTERNS)
def _canonical_name(name: str) -> str:
    """Map a vendor spelling to its canonical name.

    Resolution order:

    1. exact, case-insensitive match against an alias key;
    2. exact, case-insensitive match against a canonical name, so that a name
       that is already canonical (e.g. 'LinkedIn (Microsoft)') is returned
       unchanged and the function is idempotent;
    3. sub-token match: the longest alias of at least 6 characters contained
       in the name wins, so 'Google Analytics 4' resolves to
       'Google Analytics' rather than to the shorter alias 'google'.

    Args:
        name: Vendor name; surrounding whitespace is ignored.

    Returns:
        The canonical name, or the trimmed input when nothing matches.
    """
    cleaned = name.strip()
    key = cleaned.lower()

    exact = _VENDOR_ALIASES.get(key)
    if exact is not None:
        return exact

    # Already canonical? Without this check a canonical name that contains a
    # different family's alias would be rewritten by the sub-token scan.
    for canonical in _VENDOR_ALIASES.values():
        if canonical.lower() == key:
            return canonical

    # Sub-token match. Aliases shorter than 6 characters are skipped because
    # they are too likely to occur inside unrelated names.
    contained = [
        alias for alias in _VENDOR_ALIASES
        if len(alias) >= 6 and alias in key
    ]
    if contained:
        # max() returns the first maximal element, so equally long aliases
        # are resolved in table order.
        return _VENDOR_ALIASES[max(contained, key=len)]
    return cleaned
def _merge_vendor(existing: dict, incoming: dict) -> None:
    """Fold the duplicate *incoming* into *existing* (modifies *existing* only)."""
    # Cookies: union by trimmed, case-insensitive cookie name. Work on a copy
    # so the list object that came in with the caller's data is never changed.
    cookies = list(existing.get("cookies") or [])
    seen = {
        (c.get("name") or "").strip().lower()
        for c in cookies
        if isinstance(c, dict)
    }
    for cookie in incoming.get("cookies") or []:
        if not isinstance(cookie, dict):
            continue
        cookie_name = (cookie.get("name") or "").strip().lower()
        if cookie_name and cookie_name not in seen:
            cookies.append(cookie)
            seen.add(cookie_name)
    existing["cookies"] = cookies

    # Source tags are kept as one semicolon-separated string. The incoming
    # value is split as well, because it may already carry several tags.
    tags = [t for t in (existing.get("source") or "").split(";") if t]
    for tag in (incoming.get("source") or "").split(";"):
        if tag and tag not in tags:
            tags.append(tag)
    existing["source"] = ";".join(tags)

    # Metadata: only fill fields that are still empty on the kept entry.
    for field in ("country", "opt_out_url", "privacy_policy_url",
                  "purpose", "category", "persistence"):
        if not existing.get(field) and incoming.get(field):
            existing[field] = incoming[field]


def normalize_vendors(vendors: list[dict]) -> list[dict]:
    """Drop garbage entries and merge duplicates by canonical vendor name.

    Entries that are not dicts are skipped. Entries whose name is missing,
    not a string, or rejected by ``_is_garbage`` are dropped. The remaining
    entries are grouped by ``_canonical_name``, compared case-insensitively.

    When the same vendor appears several times (e.g. from LLM + library +
    Phase-G), the first entry seen is kept; later duplicates contribute their
    cookies (deduplicated by cookie name), their source tags and any metadata
    field that is still empty on the kept entry.

    Args:
        vendors: Vendor dicts. Keys read here: ``name``, ``cookies``,
            ``source``, ``country``, ``opt_out_url``, ``privacy_policy_url``,
            ``purpose``, ``category``, ``persistence``.

    Returns:
        One dict per unique vendor, in first-seen order, with ``name`` set to
        the canonical name. The returned dicts are shallow copies; the input
        dicts and their cookies lists are not modified.
    """
    if not vendors:
        return []
    by_canon: dict[str, dict] = {}
    dropped_garbage = 0
    merged = 0
    for v in vendors:
        if not isinstance(v, dict):
            continue
        raw = v.get("name")
        # A non-string name cannot be normalised; "" is rejected as garbage.
        raw_name = raw.strip() if isinstance(raw, str) else ""
        if _is_garbage(raw_name):
            dropped_garbage += 1
            continue
        canon = _canonical_name(raw_name)
        # Case-insensitive key, so vendors without an alias entry
        # ('Hotjar' / 'hotjar') are still recognised as duplicates.
        key = canon.casefold()
        existing = by_canon.get(key)
        if existing is None:
            # Shallow copy: renaming must not touch the caller's dict.
            entry = dict(v)
            entry["name"] = canon
            by_canon[key] = entry
            continue
        _merge_vendor(existing, v)
        merged += 1
    if dropped_garbage or merged:
        logger.info(
            "Vendor-Normalizer: %d garbage dropped, %d duplicate merges, "
            "%d unique vendors (input: %d)",
            dropped_garbage, merged, len(by_canon), len(vendors),
        )
    return list(by_canon.values())