fix(vvt): correct ePaaS schema mapping + category-aware scoring

The first BMW VVT table rendered all 24 providers at 20% score because
the ePaaS extractor was reading the wrong field names. Actual schema is
nested: providers[].processings[].persistences[], NOT providers[] alone.

Correct ePaaS schema (verified against bmw.com/epaas/.../de_DE.epaas.json):
  Provider:    {id, name, description, processings[]}
  Processing:  {id, name, description, categoryId, optOutLink,
                privacyPolicyLink, persistences[]}
  Persistence: {id, name, domain, type, expiry, description}

Two structural changes:

1. One row per processing (not provider). BMW has 26 providers but ~91
   processings spread across them (Adobe alone has ACMProcessing,
   AdobeAnalytics, AdobeCampaign, AdobeTargetAnalytics, AdobeTargetPers.).
   The cookie widget displays each processing separately — VVT now
   mirrors that. Display name format: 'Provider Name — Processing Name'.

2. Read optOutLink/privacyPolicyLink from PROCESSING (where they live),
   not provider. Persistences flatten to cookies[] with name + expiry +
   description.

Plus category mapping:
  advertising -> marketing
  strictlyNecessary -> necessary
  statistics -> statistics
  functional -> functional

Category-aware scoring (cookie_link_validator.score_vendors):
- 'necessary' (technisch erforderliche Cookies, §25 Abs. 2 TDDDG): no opt-out
  required, no country required. Score weight shifts to purpose +
  cookie disclosure (essential cookies must list names + expiry).
- All other categories: opt-out URL still mandatory; missing opt-out
  flags 'no_opt_out_url' and zeros that block of points.

Expected BMW result after this fix:
- ~91 rows (Adobe Analytics, Adform Retargeting, Akamai Infrastructure,
  AWS, ..., plus ~60 strictlyNecessary processings)
- Marketing rows with present opt-out → ~75-90%
- Necessary rows with cookie+expiry → ~85-95%
- Rows missing fields → still flagged
This commit is contained in:
Benjamin Admin
2026-05-17 11:19:31 +02:00
parent 189918b043
commit 6c7d4c7552
2 changed files with 98 additions and 58 deletions
@@ -173,8 +173,16 @@ async def validate_vendor_urls(vendors: list[dict]) -> list[dict]:
def score_vendors(vendors: list[dict]) -> list[dict]:
"""Compute per-vendor compliance score (0-100) and flags. Mutates."""
"""Compute per-vendor compliance score (0-100) and flags. Mutates.
Category-aware: 'necessary' (technisch erforderliche Cookies) do NOT
require an opt-out — §25 Abs. 2 TDDDG. Penalising them for that would
be wrong; instead we require precise purpose + cookie disclosure.
"""
for v in vendors:
is_necessary = (v.get("category") or "").lower() in (
"necessary", "strictlynecessary",
)
score = 0
max_score = 0
flags: list[str] = []
@@ -186,50 +194,56 @@ def score_vendors(vendors: list[dict]) -> list[dict]:
else:
flags.append("no_name")
# Purpose — 15
max_score += 15
# Purpose — 20
max_score += 20
if v.get("purpose"):
score += 15
score += 20
else:
flags.append("no_purpose")
# Country (3rd-country transfer relevance) — 10
# Country (3rd-country transfer relevance) — only relevant for
# consent-based categories (otherwise irrelevant flag noise)
if not is_necessary:
max_score += 10
if v.get("country"):
score += 10
else:
flags.append("no_country")
# Opt-Out URL present + reachable — 25
# Opt-Out URL — only for consent-based categories (§25 TDDDG)
if not is_necessary:
max_score += 25
if not v.get("opt_out_url"):
flags.append("no_opt_out_url")
elif v.get("opt_out_ok") is False:
flags.append("broken_opt_out")
score += 5 # at least they tried
score += 5
else:
score += 25
# Privacy policy URL present + reachable — 15
max_score += 15
# Privacy policy URL — relevant for all, but weight lower for necessary
weight = 10 if is_necessary else 15
max_score += weight
if not v.get("privacy_policy_url"):
flags.append("no_privacy_url")
elif v.get("privacy_ok") is False:
flags.append("broken_privacy_url")
score += 5
score += weight // 3
else:
score += 15
score += weight
# Cookies disclosed (names + expiry) — 15
max_score += 15
# Cookies disclosed (names + expiry) — higher weight for necessary
# (since that's mostly what they offer in lieu of opt-out)
weight = 50 if is_necessary else 15
max_score += weight
cookies = v.get("cookies") or []
if cookies:
named = sum(1 for c in cookies if c.get("name"))
with_expiry = sum(1 for c in cookies if c.get("expiry"))
if named >= 1 and with_expiry >= 1:
score += 15
score += weight
elif named >= 1:
score += 8
score += weight // 2
flags.append("cookies_no_expiry")
else:
flags.append("cookies_no_names")
@@ -90,39 +90,65 @@ def extract_vendors_from_payloads(payloads: list[dict]) -> list[dict]:
# ── ePaaS (BMW Group) ───────────────────────────────────────────────
def _extract_epaas(d: dict) -> list[dict]:
out: list[dict] = []
providers = d.get("providers", []) or []
cookies_by_provider: dict[str, list[dict]] = {}
# Maps ePaaS categoryId -> canonical category used by the VVT scorer.
_EPAAS_CATEGORY_MAP = {
"advertising": "marketing",
"marketing": "marketing",
"strictlyNecessary": "necessary",
"necessary": "necessary",
"statistics": "statistics",
"functional": "functional",
}
for c in d.get("cookies", []) or []:
pid = str(c.get("providerId") or c.get("provider") or c.get("vendor") or "")
if pid:
cookies_by_provider.setdefault(pid, []).append({
def _extract_epaas(d: dict) -> list[dict]:
    """Convert an ePaaS payload into one vendor row per *processing*.

    ePaaS schema (BMW), nested three levels deep:
        providers[].processings[].persistences[]
        provider:    {id, name, description}
        processing:  {id, name, description, categoryId, optOutLink,
                      privacyPolicyLink, persistences}
        persistence: {id, name, domain, type, expiry, description}

    Each processing is a separate displayable unit in the cookie widget
    (Adobe Analytics, Adobe Campaign, Adobe Target Personalisation, ...),
    so the VVT table mirrors that one-to-one. The provider name is used
    as a prefix so the data-controller entity stays visible; the display
    name format is 'Provider Name — Processing Name'.

    Args:
        d: Parsed ePaaS JSON payload. A missing or null ``providers``,
            ``processings`` or ``persistences`` list is treated as empty.

    Returns:
        A list of vendor dicts with the keys ``name``, ``country``,
        ``purpose``, ``category``, ``opt_out_url``, ``privacy_policy_url``,
        ``persistence`` and ``cookies``.
    """
    out: list[dict] = []
    for provider in d.get("providers", []) or []:
        provider_name = provider.get("name") or provider.get("id") or ""
        provider_desc = _clean(provider.get("description"))

        for processing in provider.get("processings", []) or []:
            # Fall back from processing name -> processing id -> provider.
            name = (processing.get("name") or processing.get("id")
                    or provider_name)
            purpose = _clean(processing.get("description")
                             or processing.get("name") or provider_desc)

            # Unknown category ids pass through unchanged; a missing or
            # null categoryId becomes the empty string.
            cat_raw = processing.get("categoryId") or ""
            category = _EPAAS_CATEGORY_MAP.get(cat_raw, cat_raw)

            # Persistences flatten to cookies[]. The persistence schema
            # listed in the docstring carries no third-party flag, so the
            # value stays hard-coded.
            cookies: list[dict] = [
                {
                    "name": c.get("name") or c.get("id") or "",
                    "purpose": _clean(c.get("description")),
                    "expiry": _clean(c.get("expiry")),
                    "is_third_party": True,
                }
                for c in processing.get("persistences", []) or []
            ]

            # Only prefix with the provider when that adds information;
            # never emit a dangling separator when either side is empty.
            if provider_name and name and name != provider_name:
                display_name = f"{provider_name} — {name}"
            else:
                display_name = provider_name or name

            out.append({
                "name": display_name,
                "country": "",  # ePaaS doesn't surface vendor country
                "purpose": purpose,
                "category": category,
                "opt_out_url": (processing.get("optOutLink") or "").strip(),
                "privacy_policy_url": (processing.get("privacyPolicyLink")
                                       or "").strip(),
                "persistence": "",
                "cookies": cookies,
            })
    return out