From 6c7d4c7552b73f421f7c78097748c2ac64493d8d Mon Sep 17 00:00:00 2001
From: Benjamin Admin
Date: Sun, 17 May 2026 11:19:31 +0200
Subject: [PATCH] fix(vvt): correct ePaaS schema mapping + category-aware scoring
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The first BMW VVT table rendered all 24 providers at 20% score because
the ePaaS extractor was reading the wrong field names. Actual schema is
nested: providers[].processings[].persistences[], NOT providers[] alone.

Correct ePaaS schema (verified against bmw.com/epaas/.../de_DE.epaas.json):
  Provider:    {id, name, description, processings[]}
  Processing:  {id, name, description, categoryId, optOutLink,
                privacyPolicyLink, persistences[]}
  Persistence: {id, name, domain, type, expiry, description}

Two structural changes:

1. One row per processing (not provider). BMW has 26 providers but ~91
   processings spread across them (Adobe alone has ACMProcessing,
   AdobeAnalytics, AdobeCampaign, AdobeTargetAnalytics, AdobeTargetPers.).
   The cookie widget displays each processing separately — VVT now
   mirrors that. Display name format: 'Provider Name — Processing Name'.

2. Read optOutLink/privacyPolicyLink from PROCESSING (where they live),
   not provider. Persistences flatten to cookies[] with name + expiry +
   description.

Plus category mapping:
  advertising       -> marketing
  strictlyNecessary -> necessary
  statistics        -> statistics
  functional        -> functional

Category-aware scoring (cookie_link_validator.score_vendors):
- 'necessary' (technisch erforderliche, §25 Abs. 2 TDDDG): no opt-out
  required, no country required. Score weight shifts to purpose + cookie
  disclosure (essential cookies must list names + expiry).
- All other categories: opt-out URL still mandatory; missing opt-out
  flags 'no_opt_out_url' and zeros that block of points.

Expected BMW result after this fix: - ~91 rows (Adobe Analytics, Adform Retargeting, Akamai Infrastructure, AWS, ..., plus ~60 strictlyNecessary processings) - Marketing rows with present opt-out → ~75-90% - Necessary rows with cookie+expiry → ~85-95% - Rows missing fields → still flagged --- .../services/cookie_link_validator.py | 68 ++++++++------ .../compliance/services/vendor_extractor.py | 88 ++++++++++++------- 2 files changed, 98 insertions(+), 58 deletions(-) diff --git a/backend-compliance/compliance/services/cookie_link_validator.py b/backend-compliance/compliance/services/cookie_link_validator.py index a049d4e4..42888b30 100644 --- a/backend-compliance/compliance/services/cookie_link_validator.py +++ b/backend-compliance/compliance/services/cookie_link_validator.py @@ -173,8 +173,16 @@ async def validate_vendor_urls(vendors: list[dict]) -> list[dict]: def score_vendors(vendors: list[dict]) -> list[dict]: - """Compute per-vendor compliance score (0-100) and flags. Mutates.""" + """Compute per-vendor compliance score (0-100) and flags. Mutates. + + Category-aware: 'necessary' (technisch erforderliche Cookies) do NOT + require an opt-out — §25 Abs. 2 TDDDG. Penalising them for that would + be wrong; instead we require precise purpose + cookie disclosure. 
+ """ for v in vendors: + is_necessary = (v.get("category") or "").lower() in ( + "necessary", "strictlynecessary", + ) score = 0 max_score = 0 flags: list[str] = [] @@ -186,50 +194,56 @@ def score_vendors(vendors: list[dict]) -> list[dict]: else: flags.append("no_name") - # Purpose — 15 - max_score += 15 + # Purpose — 20 + max_score += 20 if v.get("purpose"): - score += 15 + score += 20 else: flags.append("no_purpose") - # Country (3rd-country transfer relevance) — 10 - max_score += 10 - if v.get("country"): - score += 10 - else: - flags.append("no_country") + # Country (3rd-country transfer relevance) — only relevant for + # consent-based categories (otherwise irrelevant flag noise) + if not is_necessary: + max_score += 10 + if v.get("country"): + score += 10 + else: + flags.append("no_country") - # Opt-Out URL present + reachable — 25 - max_score += 25 - if not v.get("opt_out_url"): - flags.append("no_opt_out_url") - elif v.get("opt_out_ok") is False: - flags.append("broken_opt_out") - score += 5 # at least they tried - else: - score += 25 + # Opt-Out URL — only for consent-based categories (§25 TDDDG) + if not is_necessary: + max_score += 25 + if not v.get("opt_out_url"): + flags.append("no_opt_out_url") + elif v.get("opt_out_ok") is False: + flags.append("broken_opt_out") + score += 5 + else: + score += 25 - # Privacy policy URL present + reachable — 15 - max_score += 15 + # Privacy policy URL — relevant for all, but weight lower for necessary + weight = 10 if is_necessary else 15 + max_score += weight if not v.get("privacy_policy_url"): flags.append("no_privacy_url") elif v.get("privacy_ok") is False: flags.append("broken_privacy_url") - score += 5 + score += weight // 3 else: - score += 15 + score += weight - # Cookies disclosed (names + expiry) — 15 - max_score += 15 + # Cookies disclosed (names + expiry) — higher weight for necessary + # (since that's mostly what they offer in lieu of opt-out) + weight = 50 if is_necessary else 15 + max_score += weight 
cookies = v.get("cookies") or [] if cookies: named = sum(1 for c in cookies if c.get("name")) with_expiry = sum(1 for c in cookies if c.get("expiry")) if named >= 1 and with_expiry >= 1: - score += 15 + score += weight elif named >= 1: - score += 8 + score += weight // 2 flags.append("cookies_no_expiry") else: flags.append("cookies_no_names") diff --git a/backend-compliance/compliance/services/vendor_extractor.py b/backend-compliance/compliance/services/vendor_extractor.py index f6ad7003..cabe83fc 100644 --- a/backend-compliance/compliance/services/vendor_extractor.py +++ b/backend-compliance/compliance/services/vendor_extractor.py @@ -90,41 +90,67 @@ def extract_vendors_from_payloads(payloads: list[dict]) -> list[dict]: # ── ePaaS (BMW Group) ─────────────────────────────────────────────── +# Maps ePaaS categoryId -> canonical category used by the VVT scorer. +_EPAAS_CATEGORY_MAP = { + "advertising": "marketing", + "marketing": "marketing", + "strictlyNecessary": "necessary", + "necessary": "necessary", + "statistics": "statistics", + "functional": "functional", +} + + def _extract_epaas(d: dict) -> list[dict]: + """Convert ePaaS payload into one row per *processing* (not provider). + + ePaaS schema (BMW): + providers[].processings[].persistences[] + provider: {id, name, description} + processing: {id, name, description, categoryId, optOutLink, + privacyPolicyLink, persistences} + persistence: {id, name, domain, type, expiry, description} + + Each processing is a separate displayable unit in the cookie widget + (Adobe Analytics, Adobe Campaign, Adobe Target Personalisation, …) — + matching the website layout one-to-one in the VVT table. Provider name + becomes the prefix so the data-controller entity is visible. 
+ """ out: list[dict] = [] - providers = d.get("providers", []) or [] - cookies_by_provider: dict[str, list[dict]] = {} + for provider in d.get("providers", []) or []: + provider_name = provider.get("name") or provider.get("id") or "" + provider_desc = _clean(provider.get("description")) + for processing in provider.get("processings", []) or []: + name = (processing.get("name") or processing.get("id") + or provider_name) + purpose = _clean(processing.get("description") + or processing.get("name") or provider_desc) + cat_raw = processing.get("categoryId", "") + category = _EPAAS_CATEGORY_MAP.get(cat_raw, cat_raw or "") - for c in d.get("cookies", []) or []: - pid = str(c.get("providerId") or c.get("provider") or c.get("vendor") or "") - if pid: - cookies_by_provider.setdefault(pid, []).append({ - "name": c.get("name") or c.get("id") or "", - "purpose": _clean(c.get("purpose") or c.get("description")), - "expiry": _clean(c.get("expiry") or c.get("retention") or c.get("persistence")), - "is_third_party": bool(c.get("isThirdParty") or c.get("third_party")), + cookies: list[dict] = [] + for c in processing.get("persistences", []) or []: + cookies.append({ + "name": c.get("name") or c.get("id") or "", + "purpose": _clean(c.get("description")), + "expiry": _clean(c.get("expiry")), + "is_third_party": True, + }) + + display_name = (f"{provider_name} — {name}" + if name and name != provider_name + else (provider_name or name)) + out.append({ + "name": display_name, + "country": "", # ePaaS doesn't surface vendor country + "purpose": purpose, + "category": category, + "opt_out_url": (processing.get("optOutLink") or "").strip(), + "privacy_policy_url": (processing.get("privacyPolicyLink") + or "").strip(), + "persistence": "", + "cookies": cookies, }) - - for p in providers: - pid = str(p.get("id") or p.get("vendorId") or p.get("name") or "") - cookies = cookies_by_provider.get(pid, []) or [{ - "name": c.get("name", ""), - "purpose": _clean(c.get("purpose")), - "expiry": 
_clean(c.get("expiry") or c.get("persistence")), - "is_third_party": True, - } for c in (p.get("cookies", []) or [])] - out.append({ - "name": p.get("name") or pid or "", - "country": (p.get("country") or "").strip(), - "purpose": _clean(p.get("purpose")), - "category": (p.get("category") or "").strip(), - "opt_out_url": (p.get("optOutUrl") or p.get("optoutUrl") - or p.get("opt_out_url") or "").strip(), - "privacy_policy_url": (p.get("policyUrl") or p.get("policy_url") - or p.get("privacyPolicyUrl") or "").strip(), - "persistence": _clean(p.get("persistencePurposeDescription")), - "cookies": cookies, - }) return out