fix(consent-tester): capture CMP JSON to extract dynamically-loaded cookie policies
BMW (and other big enterprise sites) do NOT render cookie policies as static HTML. Their widget loads structured data from a JSON endpoint (BMW: ePaaS at /epaas/prod/policypage/.../<locale>.epaas.json) and renders it client-side after consent. Our DOM extraction therefore only captured site navigation (603 words of header/footer chrome), not the actual policy.

New module consent-tester/services/cmp_extractor.py:
- CMPCapture: response listener that catches policy JSON during navigation
- Reconstructors for ePaaS (BMW) + OneTrust placeholder
- Returns Cookie-Richtlinie text built from policyPageMetadata + categories + providers (BMW: 1673 words reconstructed vs. 603 words of noise)

dsi_discovery.py:
- Attach CMPCapture before page.goto
- After self-extraction: if the rendered DOM has < 300 words AND CMP captured a payload, prefer the CMP-reconstructed text whenever it is longer than the DOM text. This bypasses the empty '.cookie-policy' div problem entirely.
This commit is contained in:
@@ -24,6 +24,7 @@ from urllib.parse import urlparse, urljoin
|
||||
from playwright.async_api import Page
|
||||
|
||||
from services.dsi_helpers import goto_resilient, try_dismiss_consent_banner, is_pdf_redirect
|
||||
from services.cmp_extractor import CMPCapture
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -221,6 +222,11 @@ async def discover_dsi_documents(
|
||||
seen_urls: set[str] = set()
|
||||
seen_titles: set[str] = set()
|
||||
|
||||
# CMP capture must be wired BEFORE navigation so we catch the JSON requests
|
||||
# that fire as soon as the consent widget initializes (e.g. BMW ePaaS).
|
||||
cmp_capture = CMPCapture()
|
||||
cmp_capture.attach(page)
|
||||
|
||||
try:
|
||||
# Step 1: Load the page (with networkidle → domcontentloaded fallback)
|
||||
await goto_resilient(page, url, timeout=60000)
|
||||
@@ -302,6 +308,22 @@ async def discover_dsi_documents(
|
||||
self_wc = len(self_text.split())
|
||||
logger.info("Self-extraction via iframe for %s: %d words", url, self_wc)
|
||||
|
||||
# If the rendered DOM is still short, the page is likely a
|
||||
# JS-injected CMP widget (BMW ePaaS, OneTrust Cookie List).
|
||||
# Use the JSON we captured from network responses instead —
|
||||
# that's the structured source the widget would have rendered.
|
||||
# We also prefer CMP data over thin DOM extraction (< 300 words)
|
||||
# because thin DOM = mostly site navigation, not policy.
|
||||
if self_wc < 300 and cmp_capture.payloads:
|
||||
cmp_text = cmp_capture.reconstruct_cookie_policy()
|
||||
cmp_wc = len(cmp_text.split()) if cmp_text else 0
|
||||
if cmp_wc > self_wc:
|
||||
self_text = cmp_text
|
||||
self_wc = cmp_wc
|
||||
logger.info("Self-extraction via CMP capture for %s: %d words "
|
||||
"(%d CMP payloads)", url, self_wc,
|
||||
len(cmp_capture.payloads))
|
||||
|
||||
if self_wc >= 100:
|
||||
page_title = await page.title() or url
|
||||
result.documents.append(DiscoveredDSI(
|
||||
|
||||
Reference in New Issue
Block a user