fix(consent-tester): prefer CMP-JSON over thin DOM extraction
Previous threshold (DOM < 300 words) missed the BMW case where Playwright extracted 346 words of pure site navigation. The CMP JSON had 1673 words of real policy content but was discarded.

New heuristic: prefer CMP when ANY of:
- DOM < 300 words (existing)
- CMP text >= 1000 words (authoritative at scale)
- CMP text > 1.5x longer than DOM
This commit is contained in:
@@ -308,21 +308,31 @@ async def discover_dsi_documents(
|
||||
self_wc = len(self_text.split())
|
||||
logger.info("Self-extraction via iframe for %s: %d words", url, self_wc)
|
||||
|
||||
# If the rendered DOM is still short, the page is likely a
|
||||
# JS-injected CMP widget (BMW ePaaS, OneTrust Cookie List).
|
||||
# Use the JSON we captured from network responses instead —
|
||||
# that's the structured source the widget would have rendered.
|
||||
# We also prefer CMP data over thin DOM extraction (< 300 words)
|
||||
# because thin DOM = mostly site navigation, not policy.
|
||||
if self_wc < 300 and cmp_capture.payloads:
|
||||
# If a CMP JSON was captured (BMW ePaaS, OneTrust, etc.) it is
|
||||
# the authoritative source for the cookie policy — far more
|
||||
# reliable than the rendered DOM, which usually only contains
|
||||
# site chrome (navigation/footer) when the policy widget hasn't
|
||||
# finished rendering yet.
|
||||
#
|
||||
# Prefer the CMP-reconstructed text when ANY of:
|
||||
# - DOM extraction was very short (< 300 words)
|
||||
# - CMP text is more than 1.5x longer than DOM
|
||||
# - CMP text exceeds 1000 words (always authoritative at scale)
|
||||
if cmp_capture.payloads:
|
||||
cmp_text = cmp_capture.reconstruct_cookie_policy()
|
||||
cmp_wc = len(cmp_text.split()) if cmp_text else 0
|
||||
if cmp_wc > self_wc:
|
||||
if cmp_wc > 0 and (
|
||||
self_wc < 300
|
||||
or cmp_wc >= 1000
|
||||
or cmp_wc > self_wc * 1.5
|
||||
):
|
||||
logger.info(
|
||||
"Self-extraction via CMP capture for %s: %d words "
|
||||
"(replacing %d-word DOM extraction, %d CMP payloads)",
|
||||
url, cmp_wc, self_wc, len(cmp_capture.payloads),
|
||||
)
|
||||
self_text = cmp_text
|
||||
self_wc = cmp_wc
|
||||
logger.info("Self-extraction via CMP capture for %s: %d words "
|
||||
"(%d CMP payloads)", url, self_wc,
|
||||
len(cmp_capture.payloads))
|
||||
|
||||
if self_wc >= 100:
|
||||
page_title = await page.title() or url
|
||||
|
||||
Reference in New Issue
Block a user