feat(consent-tester): Phase B — named CMP library + plugin architecture

cmp_extractor.py refactored to thin coordinator (123 LOC, was 223).
Discovers all CMP modules via cmp_library/_registry.py:load_all() at
import time. Restart consent-tester to pick up new modules.

New cmp_library/ folder:
- _registry.py: auto-discovers all modules with MATCHER + reconstruct()
- epaas.py:     BMW Group ePaaS (extracted from cmp_extractor)
- onetrust.py:  cdn.cookielaw.org Groups/Cookies schema
- cookiebot.py: consent.cookiebot.com Categories schema
- usercentrics.py: api.usercentrics.eu services schema
- didomi.py:    sdk.privacy-center.org notice + vendors + purposes
- trustarc.py:  consent.trustarc.com categories + vendors

Each module:
- MATCHER: re.Pattern matching the CMP JSON endpoint URL
- reconstruct(d: dict) -> str: builds German Markdown cookie-policy text

Phase E (self-improving) will write auto_*.py files into the same folder;
_registry already picks those up via pkgutil.iter_modules.
This commit is contained in:
Benjamin Admin
2026-05-16 22:59:48 +02:00
parent 4f19310130
commit 7e426c31f1
9 changed files with 427 additions and 144 deletions
+44 -144
View File
@@ -1,42 +1,35 @@
"""
CMP Extractor — capture Cookie-Policy data from Consent Management Platforms.
CMP Extractor — thin coordinator.
Many sites (BMW, Daimler, big enterprise) do NOT render their cookie policy as
static HTML. Instead, a JS widget loads structured data from a JSON endpoint
(BMW: ePaaS; OneTrust: /consent/<id>.json; Cookiebot: /uc.js; Usercentrics:
/settings/<id>.json) and renders it client-side after consent is given.
Captures CMP (Consent Management Platform) JSON payloads from network responses
during page navigation. Two-stage cascade:
This module sniffs network responses while Playwright loads the page and, if
a CMP JSON is captured, reconstructs the cookie policy text. That bypasses the
"the rendered HTML container is empty" problem entirely.
1. Named CMP library (services/cmp_library/) — best quality, hand-written
reconstructors per vendor (ePaaS, OneTrust, Cookiebot, Usercentrics,
Didomi, TrustArc, + auto-promoted entries from Phase E).
2. Generic JSON cookie-policy heuristic (cmp_heuristic.py) — catches
unknown CMPs that still expose a recognizable JSON shape.
Currently supported:
- ePaaS (BMW Group): policypage/.../<locale>.epaas.json
- OneTrust (placeholder): cdn.cookielaw.org/consent/<id>/<id>.json
Add more CMPs by extending `_MATCHERS` + a corresponding `_reconstruct_<cmp>`.
A single CMP page may emit multiple payloads (e.g. consentcontroller +
policypage); all are captured and concatenated by reconstruct_cookie_policy().
"""
from __future__ import annotations
import json
import logging
import re
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Callable
if TYPE_CHECKING:
from playwright.async_api import Page, Response
from services.cmp_library._registry import load_all as _load_registry
logger = logging.getLogger(__name__)
# URL patterns that identify a CMP policy JSON. Order matters — first match wins.
_MATCHERS: list[tuple[str, re.Pattern[str]]] = [
# BMW ePaaS: /epaas/prod/policypage/<tenant>/<config>/<locale>.epaas.json
# Use a tolerant pattern: any number of segments before .epaas.json
("epaas", re.compile(r"/epaas/prod/policypage/.+\.epaas\.json(\?|$)", re.I)),
("onetrust", re.compile(r"cdn\.cookielaw\.org/consent/[^/]+/[^/]+\.json", re.I)),
]
# Loaded once at import time. Restart consent-tester to pick up new modules.
_REGISTRY: list[tuple[str, object, Callable[[dict], str]]] = _load_registry()
logger.info("CMP library loaded: %d named matchers", len(_REGISTRY))
class CMPCapture:
@@ -51,60 +44,67 @@ class CMPCapture:
async def _on_response(self, response: Response) -> None:
try:
url = response.url
if response.status != 200:
return
url = response.url
# 1) Named CMP matchers (highest quality)
for cmp_name, pattern in _MATCHERS:
if pattern.search(url):
# Stage 1: Named CMP matchers (highest quality)
for cmp_name, matcher, _reconstruct in _REGISTRY:
if matcher.search(url): # type: ignore[attr-defined]
data = await _parse_json_response(response)
if data is None:
return
self.payloads.append((cmp_name, data))
logger.info("CMP captured: %s (%s, ~%dKB)",
cmp_name, url[:120], len(json.dumps(data)) // 1024)
logger.info(
"CMP captured: %s (%s, ~%dKB)",
cmp_name, url[:120], len(json.dumps(data)) // 1024,
)
return
# 2) Generic shape-based heuristic for unknown CMPs.
# Only consider JSON responses ≥1KB (skip small config blobs).
# Stage 2: Generic heuristic for unknown CMPs.
# Pre-filter: skip noisy/irrelevant endpoints to avoid spamming.
content_type = (response.headers.get("content-type") or "").lower()
if "json" not in content_type:
return
# Cheap pre-filter: skip noisy paths (analytics, fonts, etc.)
url_lower = url.lower()
if any(skip in url_lower for skip in (
"/api/config", "/beacon", "/track", "/analytics",
"/fonts/", "/log/", "/heartbeat", "/.well-known/",
"/intake/", "/collect", "/ping", "/metrics",
)):
return
data = await _parse_json_response(response)
if data is None:
return
from services.cmp_heuristic import looks_like_cookie_policy
if looks_like_cookie_policy(data):
self.payloads.append(("_heuristic", data))
logger.info("CMP captured: _heuristic (%s, ~%dKB)",
url[:120], len(json.dumps(data)) // 1024)
logger.info(
"CMP captured: _heuristic (%s, ~%dKB)",
url[:120], len(json.dumps(data)) // 1024,
)
except Exception as e:
logger.debug("CMP listener error: %s", e)
def reconstruct_cookie_policy(self) -> str:
"""Build a single Cookie-Policy text from all captured payloads.
Returns empty string if nothing was captured or reconstruction fails.
Named CMPs take precedence over the generic heuristic (richer output).
"""
"""Build a single Cookie-Policy text from all captured payloads."""
from services.cmp_heuristic import reconstruct_generic
# Build a quick lookup so we can dispatch by name without re-loading
# the registry on every call.
by_name = {name: fn for name, _matcher, fn in _REGISTRY}
parts: list[str] = []
for cmp_name, data in self.payloads:
try:
if cmp_name == "epaas":
parts.append(_reconstruct_epaas(data))
elif cmp_name == "onetrust":
parts.append(_reconstruct_onetrust(data))
elif cmp_name == "_heuristic":
if cmp_name == "_heuristic":
parts.append(reconstruct_generic(data))
elif cmp_name in by_name:
parts.append(by_name[cmp_name](data))
else:
# Unknown name (perhaps a hot-loaded module that was
# since removed) — fall back to generic.
parts.append(reconstruct_generic(data))
except Exception as e:
logger.warning("CMP %s reconstruction failed: %s", cmp_name, e)
@@ -121,103 +121,3 @@ async def _parse_json_response(response: Response) -> dict | None:
return json.loads(body.decode("utf-8", errors="ignore"))
except Exception:
return None
def _reconstruct_epaas(d: dict) -> str:
    """Render a BMW ePaaS policy payload as German Markdown cookie-policy text.

    Keys read from *d* (schema as observed 2026-05):
    - ``policyPageMetadata``: free-text fields ``heading``, ``subHeading``,
      ``prologue``, ``dataController``, ``epilogue``, plus ``expiresAfter``
      and ``persistencePurposeText``.
    - ``categories``: entries with ``name``/``id`` and
      ``description``/``descriptionHtml``.
    - ``providers``: entries with ``name``/``id``, ``purpose``, ``country``
      and ``persistencePurposeDescription``.

    Missing or empty keys are skipped. Returns the lines joined with
    newlines; the ``# Cookie-Richtlinie`` title is always present.
    """
    metadata = d.get("policyPageMetadata", {}) or {}
    lines: list[str] = ["# Cookie-Richtlinie"]

    # Free-text sections, in fixed order, each preceded by a blank line.
    intro_fields = ("heading", "subHeading", "prologue", "dataController", "epilogue")
    for text in (metadata.get(field) for field in intro_fields):
        if text:
            lines += ["", _clean_html(str(text))]

    categories = d.get("categories", []) or []
    if categories:
        lines += ["", "## Cookie-Kategorien"]
    for category in categories:
        title = category.get("name") or category.get("id") or ""
        body = category.get("description") or category.get("descriptionHtml") or ""
        lines += ["", f"### {title}", _clean_html(str(body))]

    vendors = d.get("providers", []) or []
    if vendors:
        lines += ["", f"## Anbieter ({len(vendors)})"]
    for vendor in vendors:
        label = vendor.get("name") or vendor.get("id") or ""
        purpose = (vendor.get("purpose") or "").strip()
        country = (vendor.get("country") or "").strip()
        persistence = (vendor.get("persistencePurposeDescription") or "").strip()

        # One bullet per provider; optional details are appended in order.
        segments = [f"- {label}"]
        if purpose:
            segments.append(f" — Zweck: {purpose}")
        if country:
            segments.append(f" — Sitz: {country}")
        if persistence:
            # Retention text is truncated to 120 characters.
            segments.append(f" — Speicherdauer: {persistence[:120]}")
        lines.append("".join(segments))

    expires_after = metadata.get("expiresAfter")
    if expires_after:
        lines += ["", f"Speicherdauer: {expires_after}"]
    purpose_text = metadata.get("persistencePurposeText")
    if purpose_text:
        lines.append(_clean_html(str(purpose_text)))

    return "\n".join(lines)
def _reconstruct_onetrust(d: dict) -> str:
    """Build a Markdown cookie-policy text from a OneTrust consent JSON.

    The schema varies between deployments, so both capitalised and
    lower-case key spellings are tried:
    - ``Groups``/``groups``: entries with ``GroupName``/``name``,
      ``GroupDescription``/``description`` and ``Cookies``/``cookies``.
    - each cookie: ``Name``/``name``, ``Provider``/``provider``,
      ``description``/``Description``, ``Length``/``expires``.

    Each group becomes a ``##`` section followed by one bullet per cookie
    (at most 50 per group). Returns the lines joined with newlines; the
    title line is always present, even when no groups are found.
    """
    parts: list[str] = ["# Cookie-Richtlinie (OneTrust)"]
    groups = d.get("Groups") or d.get("groups") or []
    for g in groups:
        name = g.get("GroupName") or g.get("name") or ""
        desc = g.get("GroupDescription") or g.get("description") or ""
        parts.append("")
        parts.append(f"## {name}")
        parts.append(_clean_html(str(desc)))
        cookies = g.get("Cookies") or g.get("cookies") or []
        for c in cookies[:50]:  # cap the list so one group cannot flood the output
            cn = c.get("Name") or c.get("name") or ""
            cp = c.get("Provider") or c.get("provider") or ""
            cd = c.get("description") or c.get("Description") or ""
            ce = c.get("Length") or c.get("expires") or ""
            line = f"- {cn}"
            if cp:
                line += f" ({cp})"
            if cd:
                # Separate the description from the name/provider; str() keeps
                # the truncation safe if the payload carries a non-string value.
                line += f" — {str(cd)[:120]}"
            if ce:
                line += f" — Speicherdauer: {ce}"
            parts.append(line)
    return "\n".join(parts)
# Matches any HTML/XML tag, e.g. "<p>" or "</strong>".
_TAG_RE = re.compile(r"<[^>]+>")
# Matches a run of whitespace (includes the non-breaking space from &nbsp;).
_WS_RE = re.compile(r"\s+")


def _clean_html(text: str) -> str:
    """Strip HTML tags, decode character references and collapse whitespace.

    Tags are removed first, so escaped markup such as ``&lt;b&gt;`` survives
    as the literal text ``<b>``. Character references are then decoded in a
    single pass, which avoids double-unescaping (``&amp;lt;`` becomes
    ``&lt;``, not ``<``) and covers all named and numeric references, e.g.
    ``&ouml;`` or ``&#39;``.

    Returns the cleaned text with whitespace runs reduced to one space and
    leading/trailing whitespace removed.
    """
    import html  # local import: keeps this block independent of the file header

    without_tags = _TAG_RE.sub(" ", text)
    decoded = html.unescape(without_tags)
    return _WS_RE.sub(" ", decoded).strip()