fix(consent-tester): prefer CMP-JSON over thin DOM extraction

Previous threshold (DOM < 300 words) missed the BMW case where Playwright
extracted 346 words of pure site navigation. The CMP JSON had 1673 words
of real policy content but was discarded.

New heuristic: prefer CMP when ANY of:
  - DOM < 300 words (existing)
  - CMP text >= 1000 words (authoritative at scale)
  - CMP text is more than 1.5x as long as the DOM text (word count)
This commit is contained in:
Benjamin Admin
2026-05-16 20:56:11 +02:00
parent 1792c6f896
commit 17a93bc694
+21 -11
View File
@@ -308,21 +308,31 @@ async def discover_dsi_documents(
self_wc = len(self_text.split())
logger.info("Self-extraction via iframe for %s: %d words", url, self_wc)
# If the rendered DOM is still short, the page is likely a
# JS-injected CMP widget (BMW ePaaS, OneTrust Cookie List).
# Use the JSON we captured from network responses instead —
# that's the structured source the widget would have rendered.
# We also prefer CMP data over thin DOM extraction (< 300 words)
# because thin DOM = mostly site navigation, not policy.
if self_wc < 300 and cmp_capture.payloads:
# If a CMP JSON was captured (BMW ePaaS, OneTrust, etc.) it is
# the authoritative source for the cookie policy — far more
# reliable than the rendered DOM, which usually only contains
# site chrome (navigation/footer) when the policy widget hasn't
# finished rendering yet.
#
# Prefer the CMP-reconstructed text when ANY of:
# - DOM extraction was very short (< 300 words)
# - CMP text is more than 1.5x as long as the DOM text
# - CMP text exceeds 1000 words (always authoritative at scale)
if cmp_capture.payloads:
cmp_text = cmp_capture.reconstruct_cookie_policy()
cmp_wc = len(cmp_text.split()) if cmp_text else 0
if cmp_wc > self_wc:
if cmp_wc > 0 and (
self_wc < 300
or cmp_wc >= 1000
or cmp_wc > self_wc * 1.5
):
logger.info(
"Self-extraction via CMP capture for %s: %d words "
"(replacing %d-word DOM extraction, %d CMP payloads)",
url, cmp_wc, self_wc, len(cmp_capture.payloads),
)
self_text = cmp_text
self_wc = cmp_wc
logger.info("Self-extraction via CMP capture for %s: %d words "
"(%d CMP payloads)", url, self_wc,
len(cmp_capture.payloads))
if self_wc >= 100:
page_title = await page.title() or url