fix(consent-tester): prefer CMP-JSON over thin DOM extraction
The previous threshold (DOM < 300 words) missed the BMW case, where Playwright extracted 346 words of pure site navigation. The CMP JSON contained 1673 words of real policy content but was discarded.

New heuristic: prefer the CMP text when ANY of the following holds:
- DOM < 300 words (existing)
- CMP text >= 1000 words (authoritative at scale)
- CMP text is more than 1.5x as long as the DOM text
This commit is contained in:
@@ -308,21 +308,31 @@ async def discover_dsi_documents(
|
|||||||
self_wc = len(self_text.split())
|
self_wc = len(self_text.split())
|
||||||
logger.info("Self-extraction via iframe for %s: %d words", url, self_wc)
|
logger.info("Self-extraction via iframe for %s: %d words", url, self_wc)
|
||||||
|
|
||||||
# If the rendered DOM is still short, the page is likely a
|
# If a CMP JSON was captured (BMW ePaaS, OneTrust, etc.) it is
|
||||||
# JS-injected CMP widget (BMW ePaaS, OneTrust Cookie List).
|
# the authoritative source for the cookie policy — far more
|
||||||
# Use the JSON we captured from network responses instead —
|
# reliable than the rendered DOM, which usually only contains
|
||||||
# that's the structured source the widget would have rendered.
|
# site chrome (navigation/footer) when the policy widget hasn't
|
||||||
# We also prefer CMP data over thin DOM extraction (< 300 words)
|
# finished rendering yet.
|
||||||
# because thin DOM = mostly site navigation, not policy.
|
#
|
||||||
if self_wc < 300 and cmp_capture.payloads:
|
# Prefer the CMP-reconstructed text when ANY of:
|
||||||
|
# - DOM extraction was very short (< 300 words)
|
||||||
|
# - CMP text is more than 1.5x as long as the DOM text
|
||||||
|
# - CMP text exceeds 1000 words (always authoritative at scale)
|
||||||
|
if cmp_capture.payloads:
|
||||||
cmp_text = cmp_capture.reconstruct_cookie_policy()
|
cmp_text = cmp_capture.reconstruct_cookie_policy()
|
||||||
cmp_wc = len(cmp_text.split()) if cmp_text else 0
|
cmp_wc = len(cmp_text.split()) if cmp_text else 0
|
||||||
if cmp_wc > self_wc:
|
if cmp_wc > 0 and (
|
||||||
|
self_wc < 300
|
||||||
|
or cmp_wc >= 1000
|
||||||
|
or cmp_wc > self_wc * 1.5
|
||||||
|
):
|
||||||
|
logger.info(
|
||||||
|
"Self-extraction via CMP capture for %s: %d words "
|
||||||
|
"(replacing %d-word DOM extraction, %d CMP payloads)",
|
||||||
|
url, cmp_wc, self_wc, len(cmp_capture.payloads),
|
||||||
|
)
|
||||||
self_text = cmp_text
|
self_text = cmp_text
|
||||||
self_wc = cmp_wc
|
self_wc = cmp_wc
|
||||||
logger.info("Self-extraction via CMP capture for %s: %d words "
|
|
||||||
"(%d CMP payloads)", url, self_wc,
|
|
||||||
len(cmp_capture.payloads))
|
|
||||||
|
|
||||||
if self_wc >= 100:
|
if self_wc >= 100:
|
||||||
page_title = await page.title() or url
|
page_title = await page.title() or url
|
||||||
|
|||||||
Reference in New Issue
Block a user