"""OneTrust Cookie Consent. URL: cdn.cookielaw.org/consent//.json OR cdn.cookielaw.org/consent//.json Schema: Groups[] with GroupName, GroupDescription, Cookies[] """ import re MATCHER = re.compile(r"cdn\.cookielaw\.org/consent/[^/]+/[^/]+\.json", re.I) _TAG_RE = re.compile(r"<[^>]+>") _WS_RE = re.compile(r"\s+") def _clean(text: str) -> str: no_tags = _TAG_RE.sub(" ", text) no_tags = no_tags.replace(" ", " ").replace("&", "&") return _WS_RE.sub(" ", no_tags).strip() def reconstruct(d: dict) -> str: parts: list[str] = ["# Cookie-Richtlinie (OneTrust)"] # Optional preamble fields for key in ("Description", "PolicyText", "PolicyDescription"): val = d.get(key) if val: parts.append("") parts.append(_clean(str(val))) groups = d.get("Groups") or d.get("groups") or [] for g in groups: name = g.get("GroupName") or g.get("name") or "" desc = g.get("GroupDescription") or g.get("description") or "" parts.append("") parts.append(f"## {name}") if desc: parts.append(_clean(str(desc))) cookies = g.get("Cookies") or g.get("cookies") or [] for c in cookies[:50]: cn = c.get("Name") or c.get("name") or "" cp = c.get("Provider") or c.get("provider") or "" cd = c.get("description") or c.get("Description") or "" ce = c.get("Length") or c.get("expires") or "" line = f"- {cn}" if cp: line += f" ({cp})" if cd: line += f" — {cd[:120]}" if ce: line += f" — Speicherdauer: {ce}" parts.append(line) return "\n".join(parts) def extract_vendors(d: dict) -> list[dict]: """Return structured vendor records from OneTrust JSON. OneTrust groups cookies into 'Groups' (Strictly Necessary, Analytics, Marketing, etc). Within each group, cookies are listed with Provider, Host, Length (retention) and optional Privacy/Opt-Out URLs. """ out: list[dict] = [] seen: set[str] = set() groups = d.get("Groups") or d.get("groups") or [] for g in groups: category = g.get("GroupName") or g.get("name") or "" for c in g.get("Cookies") or g.get("cookies") or []: provider = (c.get("Provider") or c.get("provider") or c.get("Host") or c.get("host") or "").strip() if not provider: continue cookie_entry = { "name": c.get("Name") or c.get("name") or "", "purpose": _clean(str(c.get("description") or c.get("Description") or "")), "expiry": _clean(str(c.get("Length") or c.get("expires") or "")), "is_third_party": (c.get("IsThirdParty") or c.get("isThirdParty") or False), } if provider in seen: # Append cookie to existing vendor for entry in out: if entry["name"] == provider: entry["cookies"].append(cookie_entry) break else: seen.add(provider) out.append({ "name": provider, "country": "", "purpose": _clean(str(c.get("description") or g.get("GroupDescription") or "")), "category": category, "opt_out_url": "", "privacy_policy_url": (c.get("PolicyUrl") or c.get("policyUrl") or ""), "persistence": "", "cookies": [cookie_entry], }) return out _TAG_RE = __import__("re").compile(r"<[^>]+>") _WS_RE = __import__("re").compile(r"\s+") def _clean(text: str) -> str: no_tags = _TAG_RE.sub(" ", text or "") return _WS_RE.sub(" ", no_tags).strip()