diff --git a/consent-tester/services/dsi_discovery.py b/consent-tester/services/dsi_discovery.py index 304c2425..8b971e72 100644 --- a/consent-tester/services/dsi_discovery.py +++ b/consent-tester/services/dsi_discovery.py @@ -308,21 +308,31 @@ async def discover_dsi_documents( self_wc = len(self_text.split()) logger.info("Self-extraction via iframe for %s: %d words", url, self_wc) - # If the rendered DOM is still short, the page is likely a - # JS-injected CMP widget (BMW ePaaS, OneTrust Cookie List). - # Use the JSON we captured from network responses instead — - # that's the structured source the widget would have rendered. - # We also prefer CMP data over thin DOM extraction (< 300 words) - # because thin DOM = mostly site navigation, not policy. - if self_wc < 300 and cmp_capture.payloads: + # If a CMP JSON was captured (BMW ePaaS, OneTrust, etc.) it is + # the authoritative source for the cookie policy — far more + # reliable than the rendered DOM, which usually only contains + # site chrome (navigation/footer) when the policy widget hasn't + # finished rendering yet. + # + # Prefer the CMP-reconstructed text when ANY of: + # - DOM extraction was very short (< 300 words) + # - CMP text is at least 1.5x longer than DOM + # - CMP text exceeds 1000 words (always authoritative at scale) + if cmp_capture.payloads: cmp_text = cmp_capture.reconstruct_cookie_policy() cmp_wc = len(cmp_text.split()) if cmp_text else 0 - if cmp_wc > self_wc: + if cmp_wc > 0 and ( + self_wc < 300 + or cmp_wc >= 1000 + or cmp_wc > self_wc * 1.5 + ): + logger.info( + "Self-extraction via CMP capture for %s: %d words " + "(replacing %d-word DOM extraction, %d CMP payloads)", + url, cmp_wc, self_wc, len(cmp_capture.payloads), + ) self_text = cmp_text self_wc = cmp_wc - logger.info("Self-extraction via CMP capture for %s: %d words " - "(%d CMP payloads)", url, self_wc, - len(cmp_capture.payloads)) if self_wc >= 100: page_title = await page.title() or url