From 189918b0438cb591bcc458a1f218ec09fbeb5c7c Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Sun, 17 May 2026 10:50:19 +0200 Subject: [PATCH] fix(cmp): stricter heuristic + only replace DOM when CMP is strictly larger MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two bugs observed in the BMW test run: 1. Generic JSON heuristic captured /de-de/login/bmw/api/flyout/data (4KB, user login fly-out data) and reconstruct_generic produced 56 words of noise. The CMP-prefer logic then 'replaced' the 185-word imprint DOM extraction with those 56 words because self_wc(185) < 300 — even though cmp_wc(56) < self_wc(185). 2. The strict prefilter list was too short. Login/auth/cart endpoints often have category-shaped JSON without being cookie policies. Fixes: - dsi_discovery: replace DOM with CMP only when cmp_wc > self_wc AND meets one of the existing conditions. Tiny captures can no longer silently destroy a bigger DOM extraction. - cmp_extractor: skip non-cookie URLs (/login, /auth, /user, /session, /cart, /checkout, /search, /recommendation, /flyout, /menu, /nav, /translation, /i18n, /locale, /feature-flag). - cmp_extractor: require ≥5KB payload size — real CMP policies are always larger (BMW ePaaS is ~393KB). Tiny matches drop out before reconstruction. 
--- consent-tester/services/cmp_extractor.py | 13 ++++++++++++- consent-tester/services/dsi_discovery.py | 8 +++++++- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/consent-tester/services/cmp_extractor.py b/consent-tester/services/cmp_extractor.py index d2ca2bcc..7f8ef0ad 100644 --- a/consent-tester/services/cmp_extractor.py +++ b/consent-tester/services/cmp_extractor.py @@ -71,18 +71,29 @@ class CMPCapture: "/api/config", "/beacon", "/track", "/analytics", "/fonts/", "/log/", "/heartbeat", "/.well-known/", "/intake/", "/collect", "/ping", "/metrics", + "/login", "/auth", "/user", "/session", "/cart", "/checkout", + "/search", "/recommendation", "/flyout", "/menu", "/nav", + "/translation", "/i18n", "/locale", "/feature-flag", )): return data = await _parse_json_response(response) if data is None: return + # Skip tiny payloads — real CMP cookie policies are ≥5KB. + # A 4KB JSON of cookie-shaped data is almost never the policy. + try: + size_kb = len(json.dumps(data)) // 1024 + except Exception: + size_kb = 0 + if size_kb < 5: + return from services.cmp_heuristic import looks_like_cookie_policy if looks_like_cookie_policy(data): self.payloads.append(("_heuristic", data)) logger.info( "CMP captured: _heuristic (%s, ~%dKB)", - url[:120], len(json.dumps(data)) // 1024, + url[:120], size_kb, ) except Exception as e: logger.debug("CMP listener error: %s", e) diff --git a/consent-tester/services/dsi_discovery.py b/consent-tester/services/dsi_discovery.py index b06db3fb..668c33e1 100644 --- a/consent-tester/services/dsi_discovery.py +++ b/consent-tester/services/dsi_discovery.py @@ -349,7 +349,13 @@ async def discover_dsi_documents( if cmp_capture.payloads: cmp_text = cmp_capture.reconstruct_cookie_policy() cmp_wc = len(cmp_text.split()) if cmp_text else 0 - if cmp_wc > 0 and ( + # Replace DOM with CMP only when CMP is *strictly larger* + # AND meets at least one of: DOM was very thin, CMP is + # substantial, or CMP is significantly longer than DOM. 
+ # The strict-larger guard prevents a tiny heuristic match + # (e.g. an unrelated /api/data JSON) from clobbering a + # bigger DOM extraction. + if cmp_wc > self_wc and ( self_wc < 300 or cmp_wc >= 1000 or cmp_wc > self_wc * 1.5