Files
breakpilot-compliance/consent-tester/services/cmp_extractor.py
T
Benjamin Admin 7e426c31f1 feat(consent-tester): Phase B — named CMP library + plugin architecture
cmp_extractor.py refactored to thin coordinator (123 LOC, was 223).
Discovers all CMP modules via cmp_library/_registry.py:load_all() at
import time. Restart consent-tester to pick up new modules.

New cmp_library/ folder:
- _registry.py: auto-discovers all modules with MATCHER + reconstruct()
- epaas.py:     BMW Group ePaaS (extracted from cmp_extractor)
- onetrust.py:  cdn.cookielaw.org Groups/Cookies schema
- cookiebot.py: consent.cookiebot.com Categories schema
- usercentrics.py: api.usercentrics.eu services schema
- didomi.py:    sdk.privacy-center.org notice + vendors + purposes
- trustarc.py:  consent.trustarc.com categories + vendors

Each module:
- MATCHER: re.Pattern matching the CMP JSON endpoint URL
- reconstruct(d: dict) -> str: builds German Markdown cookie-policy text

Phase E (self-improving) will write auto_*.py files into the same folder;
_registry already picks those up via pkgutil.iter_modules.
2026-05-16 22:59:48 +02:00

124 lines
4.8 KiB
Python

"""
CMP Extractor — thin coordinator.
Captures CMP (Consent Management Platform) JSON payloads from network responses
during page navigation. Two-stage cascade:
1. Named CMP library (services/cmp_library/) — best quality, hand-written
reconstructors per vendor (ePaaS, OneTrust, Cookiebot, Usercentrics,
Didomi, TrustArc, + auto-promoted entries from Phase E).
2. Generic JSON cookie-policy heuristic (cmp_heuristic.py) — catches
unknown CMPs that still expose a recognizable JSON shape.
A single CMP page may emit multiple payloads (e.g. consentcontroller +
policypage); all are captured and concatenated by reconstruct_cookie_policy().
"""
from __future__ import annotations
import json
import logging
from typing import TYPE_CHECKING, Callable
if TYPE_CHECKING:
from playwright.async_api import Page, Response
from services.cmp_library._registry import load_all as _load_registry
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Loaded once at import time. Restart consent-tester to pick up new modules.
# Each entry is a (cmp_name, matcher, reconstruct) triple: `matcher.search(url)`
# is used to recognise a CMP endpoint URL, and `reconstruct(parsed_json)` turns
# a captured payload into cookie-policy text.
_REGISTRY: list[tuple[str, object, Callable[[dict], str]]] = _load_registry()
# Import-time side effect: report how many named matchers were discovered.
logger.info("CMP library loaded: %d named matchers", len(_REGISTRY))
class CMPCapture:
    """Collect CMP-related JSON payloads seen on a page's network responses.

    Usage: create an instance, call :meth:`attach` with the page before
    navigating, then call :meth:`reconstruct_cookie_policy` once navigation
    is done. Captured payloads are kept in ``self.payloads`` in the order
    they were seen.
    """

    # URL fragments (matched case-insensitively) that mark endpoints the
    # generic heuristic should never look at. Kept as a class constant so the
    # tuple is not rebuilt for every response.
    _NOISE_URL_FRAGMENTS: tuple[str, ...] = (
        "/api/config", "/beacon", "/track", "/analytics",
        "/fonts/", "/log/", "/heartbeat", "/.well-known/",
        "/intake/", "/collect", "/ping", "/metrics",
    )

    def __init__(self) -> None:
        # [(cmp_name, parsed_json), ...]; cmp_name is a registry name or
        # the literal "_heuristic" for payloads found by the generic stage.
        self.payloads: list[tuple[str, dict]] = []

    def attach(self, page: Page) -> None:
        """Hook the page's response event. Must be called BEFORE page.goto()."""
        page.on("response", self._on_response)

    @classmethod
    def _is_noise_url(cls, url: str) -> bool:
        """Return True if *url* contains any known noisy endpoint fragment."""
        lowered = url.lower()
        return any(fragment in lowered for fragment in cls._NOISE_URL_FRAGMENTS)

    def _record(self, cmp_name: str, url: str, data: dict) -> None:
        """Store one captured payload and log its origin and approximate size."""
        self.payloads.append((cmp_name, data))
        # The size is only needed for the log line; skip re-serialising a
        # potentially large payload when INFO logging is disabled.
        if logger.isEnabledFor(logging.INFO):
            logger.info(
                "CMP captured: %s (%s, ~%dKB)",
                cmp_name, url[:120], len(json.dumps(data)) // 1024,
            )

    async def _on_response(self, response: Response) -> None:
        """Response-event handler: capture the payload if it looks like CMP data.

        Only responses with status 200 are considered. Any exception is
        logged at debug level and swallowed, so a failing capture never
        propagates out of the event handler.
        """
        try:
            if response.status != 200:
                return
            url = response.url

            # Stage 1: Named CMP matchers (highest quality). The first
            # matcher that recognises the URL decides the outcome; no
            # further matcher or the heuristic is tried afterwards.
            for cmp_name, matcher, _reconstruct in _REGISTRY:
                if matcher.search(url):  # type: ignore[attr-defined]
                    data = await _parse_json_response(response)
                    if data is not None:
                        self._record(cmp_name, url, data)
                    return

            # Stage 2: Generic heuristic for unknown CMPs.
            # Pre-filter: only JSON responses, and skip noisy/irrelevant
            # endpoints to avoid spamming.
            content_type = (response.headers.get("content-type") or "").lower()
            if "json" not in content_type or self._is_noise_url(url):
                return
            data = await _parse_json_response(response)
            if data is None:
                return
            from services.cmp_heuristic import looks_like_cookie_policy
            if looks_like_cookie_policy(data):
                self._record("_heuristic", url, data)
        except Exception as e:
            # Deliberately best-effort: the listener must not raise.
            logger.debug("CMP listener error: %s", e)

    def reconstruct_cookie_policy(self) -> str:
        """Build a single Cookie-Policy text from all captured payloads.

        Each payload is passed to the reconstructor registered under its
        name; "_heuristic" payloads and payloads whose name is not in the
        registry go through the generic reconstructor. A reconstructor that
        raises is logged as a warning and its payload is skipped.

        Returns:
            The non-empty reconstructed parts joined by blank lines, or an
            empty string when nothing was captured or reconstructed.
        """
        from services.cmp_heuristic import reconstruct_generic

        # Build a quick lookup so we can dispatch by name without re-loading
        # the registry on every call.
        by_name = {name: fn for name, _matcher, fn in _REGISTRY}
        parts: list[str] = []
        for cmp_name, data in self.payloads:
            if cmp_name == "_heuristic":
                rebuild = reconstruct_generic
            else:
                # Unknown name (perhaps a hot-loaded module that was since
                # removed) — fall back to generic.
                rebuild = by_name.get(cmp_name, reconstruct_generic)
            try:
                parts.append(rebuild(data))
            except Exception as e:
                logger.warning("CMP %s reconstruction failed: %s", cmp_name, e)
        return "\n\n".join(p for p in parts if p)
async def _parse_json_response(response: Response) -> dict | None:
    """Best-effort JSON parse from a Playwright Response.

    First tries ``response.json()``. If that raises, the raw body is decoded
    leniently and parsed with ``json.loads``. The lenient path decodes with
    ``utf-8-sig`` so that a leading UTF-8 byte-order mark is stripped;
    ``json.loads`` rejects a ``str`` that starts with a BOM, so a plain
    ``utf-8`` decode would fail on such payloads.

    Returns:
        The parsed JSON value as produced by the parser, or ``None`` when
        both attempts fail. The value is returned as-is and is not checked
        to be a dict.
    """
    try:
        return await response.json()
    except Exception:
        try:
            raw = await response.body()
            # errors="ignore" drops undecodable bytes; utf-8-sig additionally
            # removes a leading BOM and is identical to utf-8 otherwise.
            return json.loads(raw.decode("utf-8-sig", errors="ignore"))
        except Exception:
            return None