feat(consent-tester): Phase A — generic JSON cookie-policy heuristic
New module cmp_heuristic.py with:
- looks_like_cookie_policy(data): shape-based classifier (top-level keys cookies/categories/providers/vendors/purposes/cookieList/etc. plus at least 2 name+description objects, or IAB TCF v2 vendors[] + purposes[]).
- reconstruct_generic(data): walks the JSON, extracts name + description fields plus standalone prologue/dataController/persistence fields, and emits flat German Markdown text (max 5000 words, deduplicated).
cmp_extractor.py is wired so that AFTER the named CMP matchers (epaas, onetrust) do not match a response URL, every JSON response on the page is tested against the heuristic. If it matches, the payload is captured as kind '_heuristic' and reconstructed via the generic walker.
This is Phase A of the 4-stage cascade (B-D follow). Unknown CMPs that return JSON now work without hand-coding each one.
Pre-filter: skips response URLs containing /api/config, /beacon, /track, /analytics, /fonts/, /log/, /heartbeat, /.well-known/ to avoid running the heuristic on every response during a Playwright load.
This commit is contained in:
@@ -52,24 +52,40 @@ class CMPCapture:
|
||||
async def _on_response(self, response: Response) -> None:
|
||||
try:
|
||||
url = response.url
|
||||
if response.status != 200:
|
||||
return
|
||||
|
||||
# 1) Named CMP matchers (highest quality)
|
||||
for cmp_name, pattern in _MATCHERS:
|
||||
if pattern.search(url):
|
||||
if response.status != 200:
|
||||
logger.info("CMP %s response %s (%d) — skipped",
|
||||
cmp_name, url[:120], response.status)
|
||||
data = await _parse_json_response(response)
|
||||
if data is None:
|
||||
return
|
||||
try:
|
||||
data = await response.json()
|
||||
except Exception:
|
||||
body = await response.body()
|
||||
try:
|
||||
data = json.loads(body.decode("utf-8", errors="ignore"))
|
||||
except Exception:
|
||||
return
|
||||
self.payloads.append((cmp_name, data))
|
||||
logger.info("CMP captured: %s (%s, ~%dKB)",
|
||||
cmp_name, url[:120], len(json.dumps(data)) // 1024)
|
||||
return
|
||||
|
||||
# 2) Generic shape-based heuristic for unknown CMPs.
|
||||
# Only consider responses whose Content-Type mentions JSON; no size threshold is enforced here.
|
||||
content_type = (response.headers.get("content-type") or "").lower()
|
||||
if "json" not in content_type:
|
||||
return
|
||||
# Cheap pre-filter: skip noisy paths (analytics, fonts, etc.)
|
||||
url_lower = url.lower()
|
||||
if any(skip in url_lower for skip in (
|
||||
"/api/config", "/beacon", "/track", "/analytics",
|
||||
"/fonts/", "/log/", "/heartbeat", "/.well-known/",
|
||||
)):
|
||||
return
|
||||
data = await _parse_json_response(response)
|
||||
if data is None:
|
||||
return
|
||||
from services.cmp_heuristic import looks_like_cookie_policy
|
||||
if looks_like_cookie_policy(data):
|
||||
self.payloads.append(("_heuristic", data))
|
||||
logger.info("CMP captured: _heuristic (%s, ~%dKB)",
|
||||
url[:120], len(json.dumps(data)) // 1024)
|
||||
except Exception as e:
|
||||
logger.debug("CMP listener error: %s", e)
|
||||
|
||||
@@ -77,7 +93,10 @@ class CMPCapture:
|
||||
"""Build a single Cookie-Policy text from all captured payloads.
|
||||
|
||||
Returns empty string if nothing was captured or reconstruction fails.
|
||||
Named CMPs take precedence over the generic heuristic (richer output).
|
||||
"""
|
||||
from services.cmp_heuristic import reconstruct_generic
|
||||
|
||||
parts: list[str] = []
|
||||
for cmp_name, data in self.payloads:
|
||||
try:
|
||||
@@ -85,11 +104,25 @@ class CMPCapture:
|
||||
parts.append(_reconstruct_epaas(data))
|
||||
elif cmp_name == "onetrust":
|
||||
parts.append(_reconstruct_onetrust(data))
|
||||
elif cmp_name == "_heuristic":
|
||||
parts.append(reconstruct_generic(data))
|
||||
except Exception as e:
|
||||
logger.warning("CMP %s reconstruction failed: %s", cmp_name, e)
|
||||
return "\n\n".join(p for p in parts if p)
|
||||
|
||||
|
||||
async def _parse_json_response(response: Response) -> dict | None:
    """Best-effort JSON parse from a Playwright Response.

    Tries ``response.json()`` first. If that raises, falls back to reading
    the raw body, decoding it as UTF-8 (undecodable bytes are dropped and a
    leading byte-order mark is stripped) and parsing it with ``json.loads``.

    Returns:
        The parsed JSON value, or ``None`` when both attempts fail.
        Exceptions from either attempt are swallowed on purpose.
    """
    try:
        return await response.json()
    except Exception:
        # Deliberately broad: any failure falls through to the raw-body parse.
        pass

    try:
        raw = await response.body()
        # "utf-8-sig" removes a leading BOM; json.loads rejects a str that
        # still starts with one, which would make this fallback pointless.
        return json.loads(raw.decode("utf-8-sig", errors="ignore"))
    except Exception:
        return None
|
||||
|
||||
|
||||
def _reconstruct_epaas(d: dict) -> str:
|
||||
"""Build a German Cookie-Policy from BMW ePaaS policy JSON.
|
||||
|
||||
|
||||
Reference in New Issue
Block a user