ea4dbb223f
When a known CMP (ePaaS, OneTrust) renders the cookie policy, we now
extract structured vendor records, probe their opt-out + privacy URLs,
score each vendor (0-100), and append a 'VVT-Vorschlag' table to the
compliance email — one row per vendor, sortable by compliance score.
consent-tester:
- DSIDiscoveryResult.cmp_payloads: surfaces raw CMP JSON to callers
- DSIDiscoveryResponse: new cmp_payloads field
- discover_dsi_documents sets cmp_payloads from cmp_capture
- cmp_library/{epaas,onetrust}.py: new extract_vendors(d) returning
list[VendorRecord]
backend:
- _fetch_text() now returns (text, cmp_payloads) tuple
- doc_entries store cmp_payloads per doc (mostly cookie)
- _autodiscover_missing forwards homepage payloads to the cookie entry
- New module vendor_extractor.py: dispatches ePaaS/OneTrust/generic
schemas; dedupes vendors across multiple payloads
- cookie_link_validator.py extended with validate_vendor_urls(vendors)
and score_vendors(vendors) — 0-100 score per vendor based on name,
purpose, country, opt-out reachable, privacy URL reachable, cookies
with names + expiry
- agent_doc_check_extras.build_vvt_table_html: renders the table
- Route appends VVT HTML after the provider list, before the
document-by-document report
- Response JSON gains cmp_vendors for future frontend rendering
Example for BMW: ~30 ePaaS providers → table with Name | Kategorie |
Sitz | Cookies | Opt-Out (✓/✗) | Privacy (✓/✗) | Score. Sorted by
score ascending so the worst-compliant vendors are at the top.
120 lines
4.7 KiB
Python
120 lines
4.7 KiB
Python
"""BMW Group ePaaS (Enterprise Privacy as a Service).
|
|
|
|
URL: /epaas/prod/policypage/<tenant>/<config>/<locale>.epaas.json
|
|
Schema: policyPageMetadata + categories + providers
|
|
"""
|
|
|
|
import html
import re
|
|
|
|
# Case-insensitive pattern for ePaaS policy-page JSON URLs; the ".epaas.json"
# suffix must be followed by either a query string ("?") or the end of input.
MATCHER = re.compile(r"/epaas/prod/policypage/.+\.epaas\.json(\?|$)", re.I)
|
|
|
|
# Any HTML/XML tag; each match is replaced by a single space.
_TAG_RE = re.compile(r"<[^>]+>")
# Runs of whitespace (including non-breaking spaces) to collapse into one space.
_WS_RE = re.compile(r"\s+")


def _clean(text: str) -> str:
    """Return *text* as plain text: tags removed, entities decoded, whitespace collapsed.

    Steps, in order:

    1. Every HTML tag is replaced by a space (so adjacent words do not fuse).
    2. HTML character references are decoded with ``html.unescape``. This
       covers ``&nbsp;``, ``&amp;``, ``&lt;``, ``&gt;``, ``&quot;`` and
       ``&#39;`` as well as every other named or numeric reference
       (``&ouml;``, ``&#8211;`` ...), and decodes each reference exactly once.
    3. Whitespace runs are collapsed to one space and the result is stripped.

    Tags are removed *before* decoding, so an escaped ``&lt;b&gt;`` survives
    as the literal text ``<b>`` instead of being treated as markup.
    """
    without_tags = _TAG_RE.sub(" ", text)
    decoded = html.unescape(without_tags)
    # "\s" matches U+00A0 for str patterns, so decoded &nbsp; collapses too.
    return _WS_RE.sub(" ", decoded).strip()
|
|
|
|
|
|
def reconstruct(d: dict) -> str:
    """Rebuild a plain-text, Markdown-style cookie policy from ePaaS policy JSON.

    The output consists of, in order:

    * a fixed ``# Cookie-Richtlinie`` title,
    * the non-empty ``policyPageMetadata`` text fields (heading, subHeading,
      prologue, dataController, epilogue), each cleaned of HTML,
    * a ``## Cookie-Kategorien`` section with one ``###`` entry per category,
    * a ``## Anbieter (N)`` section with one bullet per provider,
    * the metadata retention information (``expiresAfter`` and
      ``persistencePurposeText``) when present.

    Args:
        d: Parsed ePaaS policy JSON. Missing or ``null`` sections are
            treated as empty.

    Returns:
        The reconstructed policy as a single newline-joined string.

    Malformed input is tolerated: a non-dict ``policyPageMetadata``, non-list
    ``categories``/``providers``, and non-dict entries inside those lists are
    ignored, and non-string provider fields are coerced with ``str`` instead
    of raising ``AttributeError``.
    """

    def text(value) -> str:
        # JSON scalars may be numbers/bools; None and other falsy values -> "".
        return str(value or "").strip()

    def dict_entries(value) -> list[dict]:
        # Keep only well-formed (dict) entries of a list-valued section.
        if not isinstance(value, list):
            return []
        return [entry for entry in value if isinstance(entry, dict)]

    meta = d.get("policyPageMetadata")
    if not isinstance(meta, dict):
        meta = {}

    parts: list[str] = ["# Cookie-Richtlinie"]

    for key in ("heading", "subHeading", "prologue", "dataController", "epilogue"):
        val = meta.get(key)
        if val:
            parts.append("")
            parts.append(_clean(str(val)))

    cats = dict_entries(d.get("categories"))
    if cats:
        parts.append("")
        parts.append("## Cookie-Kategorien")
        for cat in cats:
            name = cat.get("name") or cat.get("id") or ""
            desc = cat.get("description") or cat.get("descriptionHtml") or ""
            parts.append("")
            parts.append(f"### {name}")
            parts.append(_clean(str(desc)))

    providers = dict_entries(d.get("providers"))
    if providers:
        parts.append("")
        parts.append(f"## Anbieter ({len(providers)})")
        for prov in providers:
            name = prov.get("name") or prov.get("id") or ""
            purpose = text(prov.get("purpose"))
            country = text(prov.get("country"))
            persistence = text(prov.get("persistencePurposeDescription"))
            line = f"- {name}"
            if purpose:
                line += f" — Zweck: {purpose}"
            if country:
                line += f" — Sitz: {country}"
            if persistence:
                # Retention descriptions can be long; keep the bullet compact.
                line += f" — Speicherdauer: {persistence[:120]}"
            parts.append(line)

    if meta.get("expiresAfter"):
        parts.append("")
        parts.append(f"Speicherdauer: {meta['expiresAfter']}")
    # NOTE(review): source indentation was lost; this check is treated as
    # independent of the expiresAfter check above — confirm.
    if meta.get("persistencePurposeText"):
        parts.append(_clean(str(meta["persistencePurposeText"])))

    return "\n".join(parts)
|
|
|
|
|
|
def extract_vendors(d: dict) -> list[dict]:
    """Return structured vendor records from ePaaS policy JSON.

    Schema returned (per vendor):
        {name, country, purpose, category, opt_out_url, privacy_policy_url,
         persistence, cookies: [{name, purpose, expiry, is_third_party}]}

    Cookies are taken from one of two places:

    * a top-level ``cookies`` array whose entries reference their provider
      via ``providerId`` / ``provider`` / ``vendorId`` / ``vendor``; a
      provider is matched if *any* of its ``id``, ``vendorId`` or ``name``
      equals that reference (checked in that order);
    * otherwise the provider's own inline ``cookies`` list, whose entries
      are always flagged ``is_third_party=True``.

    Args:
        d: Parsed ePaaS policy JSON. Missing or ``null`` sections are
            treated as empty.

    Returns:
        One dict per well-formed provider, in input order.

    Malformed input is tolerated: non-list sections and non-dict entries are
    skipped, non-string scalar fields are coerced with ``str``, and ``null``
    cookie fields become ``""`` rather than the literal text ``"None"``.
    """

    def text(value) -> str:
        # JSON scalars may be numbers/bools; None and other falsy values -> "".
        return str(value or "").strip()

    def dict_entries(value) -> list[dict]:
        # Keep only well-formed (dict) entries of a list-valued section.
        if not isinstance(value, list):
            return []
        return [entry for entry in value if isinstance(entry, dict)]

    def cookie_record(raw: dict, third_party: bool) -> dict:
        # Single normaliser for both cookie sources so they cannot drift apart.
        return {
            "name": raw.get("name") or raw.get("id") or "",
            "purpose": _clean(str(raw.get("purpose") or raw.get("description") or "")),
            "expiry": _clean(str(raw.get("expiry") or raw.get("retention")
                                 or raw.get("persistence") or "")),
            "is_third_party": third_party,
        }

    # ePaaS sometimes stores cookies in a separate 'cookies' array referenced
    # by provider id or name. If so, group them by that reference.
    cookies_by_provider: dict[str, list[dict]] = {}
    for raw in dict_entries(d.get("cookies")):
        ref = (raw.get("providerId") or raw.get("provider")
               or raw.get("vendorId") or raw.get("vendor") or "")
        if not ref:
            continue
        flagged = bool(raw.get("isThirdParty") or raw.get("third_party"))
        cookies_by_provider.setdefault(str(ref), []).append(
            cookie_record(raw, flagged)
        )

    out: list[dict] = []
    for prov in dict_entries(d.get("providers")):
        # Every identifier a top-level cookie might have used for this provider.
        keys = [str(k) for k in (prov.get("id"), prov.get("vendorId"), prov.get("name")) if k]
        pid = keys[0] if keys else ""

        cookies = next(
            (cookies_by_provider[k] for k in keys if cookies_by_provider.get(k)),
            None,
        )
        if cookies is None:
            # No grouped cookies: fall back to the provider's inline list.
            cookies = [cookie_record(raw, True)
                       for raw in dict_entries(prov.get("cookies"))]

        out.append({
            "name": prov.get("name") or pid or "",
            "country": text(prov.get("country")),
            "purpose": _clean(str(prov.get("purpose") or "")),
            "category": text(prov.get("category")),
            "opt_out_url": text(prov.get("optOutUrl") or prov.get("optoutUrl")
                                or prov.get("opt_out_url")),
            "privacy_policy_url": text(prov.get("policyUrl") or prov.get("policy_url")
                                       or prov.get("privacyPolicyUrl")),
            "persistence": _clean(str(prov.get("persistencePurposeDescription") or "")),
            "cookies": cookies,
        })
    return out
|