"""BMW Group ePaaS (Enterprise Privacy as a Service). URL: /epaas/prod/policypage///.epaas.json Schema: policyPageMetadata + categories + providers """ import re MATCHER = re.compile(r"/epaas/prod/policypage/.+\.epaas\.json(\?|$)", re.I) _TAG_RE = re.compile(r"<[^>]+>") _WS_RE = re.compile(r"\s+") def _clean(text: str) -> str: no_tags = _TAG_RE.sub(" ", text) no_tags = (no_tags .replace(" ", " ").replace("&", "&") .replace("<", "<").replace(">", ">") .replace(""", '"').replace("'", "'")) return _WS_RE.sub(" ", no_tags).strip() def reconstruct(d: dict) -> str: meta = d.get("policyPageMetadata", {}) or {} parts: list[str] = ["# Cookie-Richtlinie"] for key in ("heading", "subHeading", "prologue", "dataController", "epilogue"): val = meta.get(key) if val: parts.append("") parts.append(_clean(str(val))) cats = d.get("categories", []) or [] if cats: parts.append("") parts.append("## Cookie-Kategorien") for c in cats: name = c.get("name") or c.get("id") or "" desc = c.get("description") or c.get("descriptionHtml") or "" parts.append("") parts.append(f"### {name}") parts.append(_clean(str(desc))) providers = d.get("providers", []) or [] if providers: parts.append("") parts.append(f"## Anbieter ({len(providers)})") for p in providers: name = p.get("name") or p.get("id") or "" purpose = (p.get("purpose") or "").strip() country = (p.get("country") or "").strip() persistence = (p.get("persistencePurposeDescription") or "").strip() line = f"- {name}" if purpose: line += f" — Zweck: {purpose}" if country: line += f" — Sitz: {country}" if persistence: line += f" — Speicherdauer: {persistence[:120]}" parts.append(line) if meta.get("expiresAfter"): parts.append("") parts.append(f"Speicherdauer: {meta['expiresAfter']}") if meta.get("persistencePurposeText"): parts.append(_clean(str(meta["persistencePurposeText"]))) return "\n".join(parts) def extract_vendors(d: dict) -> list[dict]: """Return structured vendor records from ePaaS policy JSON. Schema returned (per vendor): {name, country, purpose, category, opt_out_url, privacy_policy_url, persistence, cookies: [{name, purpose, expiry, is_third_party}]} """ out: list[dict] = [] providers = d.get("providers", []) or [] cookies_by_provider: dict[str, list[dict]] = {} # ePaaS sometimes stores cookies in a separate 'cookies' array referenced # by providerId. If so, group them by provider. cookies_list = d.get("cookies", []) or [] for c in cookies_list: pid = (c.get("providerId") or c.get("provider") or c.get("vendorId") or c.get("vendor") or "") if pid: cookies_by_provider.setdefault(str(pid), []).append({ "name": c.get("name") or c.get("id") or "", "purpose": _clean(str(c.get("purpose") or c.get("description") or "")), "expiry": _clean(str(c.get("expiry") or c.get("retention") or c.get("persistence") or "")), "is_third_party": bool(c.get("isThirdParty") or c.get("third_party")), }) for p in providers: pid = str(p.get("id") or p.get("vendorId") or p.get("name") or "") cookies = (cookies_by_provider.get(pid, []) or [{"name": c.get("name", ""), "purpose": _clean(str(c.get("purpose", ""))), "expiry": _clean(str(c.get("expiry") or c.get("persistence") or "")), "is_third_party": True} for c in (p.get("cookies", []) or [])]) out.append({ "name": p.get("name") or pid or "", "country": (p.get("country") or "").strip(), "purpose": _clean(str(p.get("purpose") or "")), "category": (p.get("category") or "").strip(), "opt_out_url": (p.get("optOutUrl") or p.get("optoutUrl") or p.get("opt_out_url") or "").strip(), "privacy_policy_url": (p.get("policyUrl") or p.get("policy_url") or p.get("privacyPolicyUrl") or "").strip(), "persistence": _clean(str(p.get("persistencePurposeDescription") or "")), "cookies": cookies, }) return out