7e426c31f1
cmp_extractor.py refactored to a thin coordinator (123 LOC, was 223). It discovers all CMP modules via cmp_library/_registry.py:load_all() at import time. Restart consent-tester to pick up new modules.

New cmp_library/ folder:
- _registry.py: auto-discovers all modules with MATCHER + reconstruct()
- epaas.py: BMW Group ePaaS (extracted from cmp_extractor)
- onetrust.py: cdn.cookielaw.org Groups/Cookies schema
- cookiebot.py: consent.cookiebot.com Categories schema
- usercentrics.py: api.usercentrics.eu services schema
- didomi.py: sdk.privacy-center.org notice + vendors + purposes
- trustarc.py: consent.trustarc.com categories + vendors

Each module provides:
- MATCHER: re.Pattern matching the CMP JSON endpoint URL
- reconstruct(d: dict) -> str: builds German Markdown cookie-policy text

Phase E (self-improving) will write auto_*.py files into the same folder; _registry already picks those up via pkgutil.iter_modules.
57 lines
1.7 KiB
Python
57 lines
1.7 KiB
Python
"""OneTrust Cookie Consent.
|
|
|
|
URL: cdn.cookielaw.org/consent/<id>/<id>.json
|
|
OR cdn.cookielaw.org/consent/<id>/<lang>.json
|
|
Schema: Groups[] with GroupName, GroupDescription, Cookies[]
|
|
"""
|
|
|
|
import re
|
|
|
|
# URL pattern of the OneTrust consent JSON endpoint
# (cdn.cookielaw.org/consent/<segment>/<segment>.json), matched
# case-insensitively. The pattern is unanchored, so it is meant to be
# searched for inside a full URL.
MATCHER = re.compile(
    r"cdn\.cookielaw\.org/consent/[^/]+/[^/]+\.json",
    flags=re.IGNORECASE,
)
|
|
|
|
# Matches a single HTML/XML tag such as "<p>" or "</b>".
_TAG_RE = re.compile(r"<[^>]+>")
# Matches a run of whitespace (including Unicode whitespace such as U+00A0).
_WS_RE = re.compile(r"\s+")


def _clean(text: str) -> str:
    """Return *text* as plain, single-line text.

    Tags are replaced by a space, HTML character references (``&amp;``,
    ``&nbsp;``, ``&lt;``, ``&#8211;`` ...) are decoded, and every run of
    whitespace is collapsed into one space with the ends trimmed.

    Args:
        text: Raw string that may contain HTML markup and entities.

    Returns:
        The cleaned text; an empty string if nothing but markup/whitespace
        was present.
    """
    # Local import keeps this block self-contained; `html` is stdlib.
    import html

    # Strip tags *before* decoding so that escaped angle brackets
    # ("&lt;b&gt;") survive as literal text instead of being eaten as tags.
    no_tags = _TAG_RE.sub(" ", text)
    # html.unescape covers all named and numeric references; a decoded
    # non-breaking space is whitespace and is collapsed by _WS_RE below.
    decoded = html.unescape(no_tags)
    return _WS_RE.sub(" ", decoded).strip()
|
|
|
|
|
|
# Upper bounds that keep the reconstructed policy text compact.
_MAX_COOKIES_PER_GROUP = 50
_MAX_COOKIE_DESC_CHARS = 120


def _first_truthy(mapping: dict, *keys: str):
    """Return the first truthy value found under *keys*, or "" if none."""
    for key in keys:
        value = mapping.get(key)
        if value:
            return value
    return ""


def _dict_entries(value) -> list:
    """Return the dict items of *value* when it is a list/tuple, else []."""
    if not isinstance(value, (list, tuple)):
        return []
    return [item for item in value if isinstance(item, dict)]


def _cookie_line(cookie: dict) -> str:
    """Format one cookie entry as a Markdown bullet line."""
    name = _first_truthy(cookie, "Name", "name")
    provider = _first_truthy(cookie, "Provider", "provider")
    raw_desc = _first_truthy(cookie, "description", "Description")
    expiry = _first_truthy(cookie, "Length", "expires")

    # Sanitise before truncating: the description may be a non-string or
    # contain markup/newlines that would break the single-line bullet.
    desc = _clean(str(raw_desc)) if raw_desc else ""

    line = f"- {name}"
    if provider:
        line += f" ({provider})"
    if desc:
        line += f" — {desc[:_MAX_COOKIE_DESC_CHARS]}"
    if expiry:
        line += f" — Speicherdauer: {expiry}"
    return line


def reconstruct(d: dict) -> str:
    """Build a German Markdown cookie-policy text from a consent JSON dict.

    The output starts with a fixed top-level heading, followed by any of the
    optional preamble fields ("Description", "PolicyText",
    "PolicyDescription"), then one "##" section per group with its cleaned
    description and up to 50 cookie bullet lines.

    Keys are looked up in their capitalised form first and then in a
    lower-case fallback ("Groups"/"groups", "GroupName"/"name", ...).
    Malformed input is tolerated: a groups/cookies value that is not a
    list (or tuple), and entries that are not dicts, are skipped instead of
    raising.

    Args:
        d: Parsed JSON object of the consent configuration.

    Returns:
        The Markdown text, lines joined with "\\n".
    """
    parts: list[str] = ["# Cookie-Richtlinie (OneTrust)"]

    # Optional preamble fields
    for key in ("Description", "PolicyText", "PolicyDescription"):
        val = d.get(key)
        if val:
            parts.append("")
            parts.append(_clean(str(val)))

    for group in _dict_entries(d.get("Groups") or d.get("groups")):
        name = _first_truthy(group, "GroupName", "name")
        desc = _first_truthy(group, "GroupDescription", "description")
        parts.append("")
        parts.append(f"## {name}")
        if desc:
            parts.append(_clean(str(desc)))

        cookies = _dict_entries(group.get("Cookies") or group.get("cookies"))
        parts.extend(
            _cookie_line(cookie) for cookie in cookies[:_MAX_COOKIES_PER_GROUP]
        )

    return "\n".join(parts)
|