feat(vvt): V2 — vendor extractors for Cookiebot/Usercentrics/Didomi/TrustArc
Backend vendor_extractor.py gets 4 new per-CMP dispatchers, mirroring the JSON schemas observed in each platform:
- Cookiebot: 'Categories[*].Cookies[*]' with Vendor/Host, expiry, purpose
- Usercentrics: 'services[*]' with cookieMaxAgeSeconds, processingCompanyCountry
- Didomi: 'app.vendors[*]' with country + policyUrl
- TrustArc: 'vendors[*]' + per-category 'Cookies' with provider

All 6 named CMPs (ePaaS, OneTrust, Cookiebot, Usercentrics, Didomi, TrustArc) plus the generic-shape fallback are now mapped — every site hitting Phase B of the cascade gets a structured vendor list, scored opt-out links, and a VVT-Tabelle in the email.
This commit is contained in:
@@ -58,6 +58,14 @@ def extract_vendors_from_payloads(payloads: list[dict]) -> list[dict]:
|
||||
vendors = _extract_epaas(data)
|
||||
elif kind == "onetrust":
|
||||
vendors = _extract_onetrust(data)
|
||||
elif kind == "cookiebot":
|
||||
vendors = _extract_cookiebot(data)
|
||||
elif kind == "usercentrics":
|
||||
vendors = _extract_usercentrics(data)
|
||||
elif kind == "didomi":
|
||||
vendors = _extract_didomi(data)
|
||||
elif kind == "trustarc":
|
||||
vendors = _extract_trustarc(data)
|
||||
else:
|
||||
# Generic fallback: walk data for vendor-like dicts
|
||||
vendors = _extract_generic(data)
|
||||
@@ -153,6 +161,141 @@ def _extract_onetrust(d: dict) -> list[dict]:
|
||||
return list(out_by_name.values())
|
||||
|
||||
|
||||
# ── Cookiebot ───────────────────────────────────────────────────────
|
||||
|
||||
def _extract_cookiebot(d: dict) -> list[dict]:
    """Group Cookiebot cookie entries into one vendor record per provider.

    Walks ``Categories[*].Cookies[*]`` and accepts both the PascalCase keys
    and their lowercase/camelCase variants. The provider of a cookie is its
    Vendor, falling back to its Host; cookies with neither are skipped.

    Args:
        d: Decoded Cookiebot payload.

    Returns:
        One vendor dict per distinct provider, in first-seen order. The
        vendor-level purpose, category and privacy-policy URL come from the
        first cookie seen for that provider; later cookies are only appended
        to its ``cookies`` list.
    """
    out: dict[str, dict] = {}
    for cat in d.get("Categories") or d.get("categories") or []:
        category = cat.get("Name") or cat.get("name") or ""
        for c in cat.get("Cookies") or cat.get("cookies") or []:
            provider = (c.get("Vendor") or c.get("vendor")
                        or c.get("Host") or c.get("host") or "").strip()
            if not provider:
                continue
            # Resolve the purpose once so the cookie entry and the vendor
            # record honour the same key variants.
            purpose = c.get("Purpose") or c.get("purpose")
            cookie = {
                "name": c.get("Name") or c.get("name") or "",
                "purpose": _clean(purpose),
                "expiry": _clean(c.get("Expires") or c.get("expires")),
                # Accept the camelCase flag too, matching the dual-case
                # handling of every other field in this extractor.
                "is_third_party": bool(c.get("IsThirdParty")
                                       or c.get("isThirdParty")),
            }
            if provider in out:
                out[provider]["cookies"].append(cookie)
            else:
                out[provider] = {
                    "name": provider,
                    "country": "",
                    # Fall back to the category name when the cookie carries
                    # no purpose text of its own.
                    "purpose": _clean(purpose or category),
                    "category": category,
                    "opt_out_url": "",
                    # Stripped for consistency with the other extractors.
                    "privacy_policy_url": (c.get("PrivacyPolicyUrl")
                                           or c.get("policyUrl")
                                           or "").strip(),
                    "persistence": "",
                    "cookies": [cookie],
                }
    return list(out.values())
|
||||
|
||||
|
||||
# ── Usercentrics ────────────────────────────────────────────────────
|
||||
|
||||
def _extract_usercentrics(d: dict) -> list[dict]:
    """Build vendor records from a Usercentrics service list.

    The service list is taken from the first non-empty of ``services``,
    ``dataProcessingServices`` or ``settings.services``. Entries without a
    ``name`` / ``dataProcessor`` are skipped.

    Args:
        d: Decoded Usercentrics payload.

    Returns:
        One vendor dict per named service, in payload order. ``persistence``
        is the cookie max age in whole days when ``cookieMaxAgeSeconds`` is a
        positive integer, otherwise the cleaned
        ``retentionPeriodDescription``. ``cookies`` is always empty.
    """
    out: list[dict] = []
    services = (d.get("services") or d.get("dataProcessingServices")
                or (d.get("settings") or {}).get("services") or [])
    for s in services:
        name = s.get("name") or s.get("dataProcessor") or ""
        if not name:
            continue

        max_age = s.get("cookieMaxAgeSeconds")
        persistence = ""
        # bool is a subclass of int; a JSON ``true`` must not be read as a
        # one-second lifetime.
        if (isinstance(max_age, int) and not isinstance(max_age, bool)
                and max_age > 0):
            # Floor division: lifetimes under one day render as "0 Tage".
            persistence = f"{max_age // 86400} Tage"

        # ``urls`` may be present but null (or not an object). dict.get's
        # default only covers a missing key, so normalise explicitly.
        urls = s.get("urls")
        if not isinstance(urls, dict):
            urls = {}

        out.append({
            "name": name,
            "country": (s.get("processingCompanyCountry")
                        or s.get("country") or "").strip(),
            "purpose": _clean(s.get("dataPurpose") or s.get("description")),
            "category": (s.get("categorySlug") or s.get("category") or "").strip(),
            "opt_out_url": (s.get("optOutUrl") or "").strip(),
            "privacy_policy_url": (s.get("policyOfProcessorUrl")
                                   or urls.get("privacyPolicy")
                                   or "").strip(),
            "persistence": persistence or _clean(s.get("retentionPeriodDescription")),
            "cookies": [],
        })
    return out
|
||||
|
||||
|
||||
# ── Didomi ──────────────────────────────────────────────────────────
|
||||
|
||||
def _extract_didomi(d: dict) -> list[dict]:
    """Build vendor records from a Didomi payload.

    Vendors are read from ``app.vendors`` (with the payload itself standing
    in for ``app`` when that key is absent), falling back to the top-level
    ``vendors``. Entries without a ``name`` are skipped.

    Args:
        d: Decoded Didomi payload.

    Returns:
        One vendor dict per named entry, in payload order. ``persistence``
        is always empty and ``cookies`` is always an empty list.
    """
    app_section = d.get("app", d) or {}
    vendor_entries = app_section.get("vendors") or d.get("vendors") or []

    records: list[dict] = []
    for entry in vendor_entries:
        vendor_name = entry.get("name") or ""
        if vendor_name:
            country = (entry.get("country") or "").strip()
            purpose = _clean(entry.get("description") or entry.get("purpose"))
            category = (entry.get("category") or "").strip()
            opt_out = (entry.get("optOutUrl") or "").strip()
            policy = (entry.get("policyUrl") or entry.get("policy_url")
                      or "").strip()
            records.append({
                "name": vendor_name,
                "country": country,
                "purpose": purpose,
                "category": category,
                "opt_out_url": opt_out,
                "privacy_policy_url": policy,
                "persistence": "",
                "cookies": [],
            })
    return records
|
||||
|
||||
|
||||
# ── TrustArc ────────────────────────────────────────────────────────
|
||||
|
||||
def _extract_trustarc(d: dict) -> list[dict]:
    """Build vendor records from a TrustArc payload.

    Two passes over the payload, both accepting lowercase and PascalCase
    keys:

    1. ``vendors[*]`` seeds one record per named vendor (a later entry with
       the same name replaces an earlier one).
    2. ``categories[*].cookies[*]`` attaches each cookie to the record whose
       name equals the cookie's provider, creating a minimal record (name,
       category and the cookie only) when none exists yet. Cookies without
       a provider are skipped.

    Args:
        d: Decoded TrustArc payload.

    Returns:
        The vendor records in first-insertion order.
    """
    records: dict[str, dict] = {}

    # Pass 1: explicit vendor list.
    for vendor in d.get("vendors") or d.get("Vendors") or []:
        vendor_name = vendor.get("name") or vendor.get("Name") or ""
        if vendor_name:
            records[vendor_name] = {
                "name": vendor_name,
                "country": (vendor.get("country") or "").strip(),
                "purpose": _clean(vendor.get("description")
                                  or vendor.get("Description")),
                "category": (vendor.get("category") or "").strip(),
                "opt_out_url": (vendor.get("optOutUrl") or "").strip(),
                "privacy_policy_url": (vendor.get("policyUrl") or "").strip(),
                "persistence": "",
                "cookies": [],
            }

    # Pass 2: cookies listed under each category.
    for group in d.get("categories") or d.get("Categories") or []:
        group_name = group.get("name") or group.get("Name") or ""
        for item in group.get("cookies") or group.get("Cookies") or []:
            owner = item.get("provider") or item.get("Provider") or ""
            if not owner:
                continue
            entry = {
                "name": item.get("name") or item.get("Name") or "",
                "purpose": _clean(item.get("purpose") or item.get("Purpose")),
                "expiry": _clean(item.get("expires") or item.get("Expires")),
                # Every cookie listed here is recorded as third-party.
                "is_third_party": True,
            }
            if owner not in records:
                # Provider was not in the vendor list: start a minimal record.
                records[owner] = {
                    "name": owner,
                    "country": "",
                    "purpose": "",
                    "category": group_name,
                    "opt_out_url": "",
                    "privacy_policy_url": "",
                    "persistence": "",
                    "cookies": [],
                }
            records[owner]["cookies"].append(entry)

    return list(records.values())
|
||||
|
||||
|
||||
# ── Generic fallback (other CMPs / heuristic captures) ──────────────
|
||||
|
||||
def _extract_generic(d: dict) -> list[dict]:
|
||||
|
||||
Reference in New Issue
Block a user