081e4f057a
CI / detect-changes (push) Successful in 12s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / test-go (push) Failing after 55s
CI / iace-gt-coverage (push) Successful in 25s
CI / test-python-backend (push) Successful in 44s
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
CI / validate-canonical-controls (push) Successful in 16s
CI / loc-budget (push) Failing after 18s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Successful in 2m43s
ZENTRALER USP: cookie_compliance_audit.py vergleicht 3 Quellen * DEKLARIERT in Cookie-Richtlinie (parse_cookie_table + parse_flat) * TATSAECHLICH im Browser geladen (banner_result.phases.after_accept) * LIBRARY-Metadaten (cookie_library lookup) Liefert 3 Listen mit Compliance-Verdict: * compliant (deklariert UND geladen) — gruener Block * undeclared_in_browser (geladen NICHT deklariert) — ROTER HIGH-Block → Art. 13(1)(c) DSGVO + § 25 TDDDG Verstoss * declared_not_loaded (deklariert NICHT geladen) — gelber Hinweis → Tabelle moeglicherweise veraltet parse_cookie_table erweitert um Block-Format (5 Zeilen pro Cookie wie beim User-Copy aus VW). Findet 35+ Cookies aus Copy-Paste statt 0. vendor_normalizer.py: 50+ Aliases (Google-Familie, Adobe-Familie, Trade Desk, AdForm, ...) + Garbage-Filter (URLs, leere Strings, 'click to select', 'Mehrere OEMs'). Mergt cookies-Listen beim Dedup. _guess_vendor erweitert: Adobe-Familie (s_ecid/AMCV/demdex/mbox/...), Trade Desk (TDID/TDCPM/TTDOptOut), AdForm (uid/cid/otsid), Salesforce LiveAgent, etracker, Akamai, EDAA. audit_quality_checks: vendor-thin-Threshold jetzt dynamisch nach Cookie-Doc-Wörter (3k→10 / 6k→20 / 10k→30 / 15k+→40). VW-Test-Fixture: tests/fixtures/cookie_gt/vw_cookie_richtlinie.txt (36-Cookie-Sample fuer Regression-Tests). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
168 lines
5.6 KiB
Python
168 lines
5.6 KiB
Python
"""
Vendor deduplication and garbage filter.

Normalizes vendor names (Google + Google DoubleClick + DoubleClick/Google
Marketing → a single entry) and removes garbage entries that were wrongly
detected as vendors ('click to select a dealership', 'Mehrere OEMs',
URL fragments, etc.).

Applied after all vendor sources (LLM, library, pattern, Phase-G) and
before the VVT table is rendered.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import re
|
|
|
|
# Module-level logger; normalize_vendors() reports its drop/merge counts here.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# Alias table: every known spelling (lower-case) -> canonical vendor name.
# Insertion order is kept exactly as established, because lookups may scan
# the table front to back; that is why the "Google" spellings appear in two
# separate groups.
_VENDOR_ALIASES: dict[str, str] = {
    # Google family
    **dict.fromkeys(
        (
            "google",
            "google llc",
            "google inc",
            "google marketing platform",
            "google ads",
            "google adsense",
        ),
        "Google",
    ),
    "google analytics": "Google Analytics",
    "google tag manager": "Google Tag Manager",
    **dict.fromkeys(
        (
            "google doubleclick",
            "doubleclick",
            "doubleclick/google marketing",
            "doubleclick by google",
        ),
        "Google",
    ),
    # Adobe family
    **dict.fromkeys(("adobe", "adobe inc", "adobe systems"), "Adobe"),
    "adobe analytics": "Adobe Analytics",
    "adobe audience manager": "Adobe Audience Manager",
    "adobe experience cloud": "Adobe Experience Cloud",
    "adobe target": "Adobe Target",
    "adobe advertising cloud (everest)": "Adobe Advertising Cloud",
    # Trade Desk
    **dict.fromkeys(
        ("the trade desk", "tradedesk", "the tradedesk", "trade desk"),
        "The Trade Desk",
    ),
    # Meta
    **dict.fromkeys(
        ("meta", "meta platforms", "facebook", "meta / facebook"),
        "Meta / Facebook",
    ),
    # AdForm
    **dict.fromkeys(("adform", "adform dsp"), "AdForm"),
    # Microsoft
    "microsoft": "Microsoft",
    "microsoft bing": "Microsoft Bing",
    **dict.fromkeys(
        ("linkedin", "linkedin corporation"),
        "LinkedIn (Microsoft)",
    ),
    # CMP
    "onetrust": "OneTrust",
    "cookiebot": "Cookiebot",
    "usercentrics": "Usercentrics",
    **dict.fromkeys(("borlabs", "borlabs / cookie-cmp"), "Borlabs"),
    # Salesforce
    **dict.fromkeys(
        ("salesforce", "salesforce liveagent", "liveagent"),
        "Salesforce",
    ),
    # Cloudflare
    "cloudflare": "Cloudflare",
}
|
|
|
|
|
|
# Garbage patterns: a vendor name matching any of these is filtered out.
# Each entry is (expression, flags); every expression is anchored at the
# start of the name.
_GARBAGE_PATTERNS = tuple(
    re.compile(expression, flags)
    for expression, flags in (
        (r"^click to ", re.I),
        (r"^mehrere oems", re.I),
        (r"^breakpilot[-_ ]?snapshot", re.I),
        (r"^https?://", re.I),  # URLs
        (r"^https?$", re.I),
        (r"^javascript:", re.I),
        (r"^undefined$|^null$|^none$", re.I),
        (r"^[\d\W]+$", 0),  # digits/symbols only
        (r"^.{1,2}$", 0),  # one- or two-character "names"
        (r"^(ein|der|die|das|von|und|aber|oder)$", re.I),
        (r"^cookie$|^cookies$", re.I),
    )
)
|
|
|
|
|
|
def _is_garbage(name: str) -> bool:
|
|
if not name or len(name.strip()) < 2:
|
|
return True
|
|
if len(name) > 120:
|
|
return True
|
|
return any(p.search(name) for p in _GARBAGE_PATTERNS)
|
|
|
|
|
|
def _canonical_name(name: str) -> str:
|
|
nl = name.strip().lower()
|
|
if nl in _VENDOR_ALIASES:
|
|
return _VENDOR_ALIASES[nl]
|
|
# Sub-token-Match: 'doubleclick by google' → enthaelt 'doubleclick'
|
|
for alias, canonical in _VENDOR_ALIASES.items():
|
|
if alias in nl and len(alias) >= 6:
|
|
return canonical
|
|
return name.strip()
|
|
|
|
|
|
def _cookie_key(cookie: object) -> str:
    """Return the trimmed, lower-cased cookie name, or '' if there is none."""
    if not isinstance(cookie, dict):
        return ""
    raw = cookie.get("name")
    return raw.strip().lower() if isinstance(raw, str) else ""


def _merge_cookies(existing: list, incoming: list) -> list:
    """Return a new list: *existing* plus the not-yet-seen cookies of *incoming*.

    Cookies are compared by trimmed, case-insensitive name. Incoming entries
    that are not dicts or have no usable name are skipped.
    """
    combined = list(existing)
    seen = {_cookie_key(cookie) for cookie in combined}
    for cookie in incoming:
        key = _cookie_key(cookie)
        if key and key not in seen:
            combined.append(cookie)
            seen.add(key)
    return combined


def _merge_sources(existing: object, incoming: object) -> str:
    """Join the ';'-separated source tags of both values without duplicates.

    Both sides are split on ';', so an already merged value such as
    'llm;library' does not get appended a second time. Non-string values
    are ignored.
    """
    tags: list[str] = []
    for value in (existing, incoming):
        if not isinstance(value, str):
            continue
        for tag in value.split(";"):
            tag = tag.strip()
            if tag and tag not in tags:
                tags.append(tag)
    return ";".join(tags)


def normalize_vendors(vendors: list[dict]) -> list[dict]:
    """Drop garbage vendors and merge duplicates by canonical name.

    Entries that are not dicts are ignored; entries whose ``name`` is
    missing, not a string, or rejected by ``_is_garbage`` are dropped.
    The remaining entries are grouped by ``_canonical_name``. The first
    entry of a group is kept (as a shallow copy carrying the canonical
    name); every later entry of the same group contributes

    * cookies whose name is not present yet,
    * its source tag(s), joined with ';',
    * values for country, opt_out_url, privacy_policy_url, purpose,
      category and persistence where the kept entry has none.

    The input list, its dicts and their cookies lists are not modified.

    Args:
        vendors: Vendor dicts collected from all sources.

    Returns:
        One dict per canonical vendor, in order of first appearance.
    """
    if not vendors:
        return []

    by_canon: dict[str, dict] = {}
    dropped_garbage = 0
    merged = 0

    for vendor in vendors:
        if not isinstance(vendor, dict):
            continue

        raw_name = vendor.get("name")
        # A non-string name cannot be normalized; treat it like an empty one.
        raw_name = raw_name.strip() if isinstance(raw_name, str) else ""
        if _is_garbage(raw_name):
            dropped_garbage += 1
            continue

        canon = _canonical_name(raw_name)
        entry = by_canon.get(canon)
        if entry is None:
            # Shallow copy so the caller's dict keeps its original name.
            entry = dict(vendor)
            entry["name"] = canon
            by_canon[canon] = entry
            continue

        entry["cookies"] = _merge_cookies(
            entry.get("cookies") or [],
            vendor.get("cookies") or [],
        )
        entry["source"] = _merge_sources(entry.get("source"), vendor.get("source"))
        # Fill metadata gaps only; values already present are never replaced.
        for field in ("country", "opt_out_url", "privacy_policy_url",
                      "purpose", "category", "persistence"):
            if not entry.get(field) and vendor.get(field):
                entry[field] = vendor[field]
        merged += 1

    if dropped_garbage or merged:
        logger.info(
            "Vendor-Normalizer: %d garbage dropped, %d duplicate merges, "
            "%d unique vendors (input: %d)",
            dropped_garbage, merged, len(by_canon), len(vendors),
        )
    return list(by_canon.values())
|