From 75174273f479b2bc2879f8a465db86b806ed09a5 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Fri, 22 May 2026 10:28:57 +0200 Subject: [PATCH] diag(cmp): log skipped CMP candidates with top-keys for Phase 0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit VW & andere unbekannte CMPs liefern 603-Wort-Bug: kein Named-Matcher greift, generische Heuristik filtert oder size_kb < 5 → cmp_cookie_text bleibt leer → Backend faellt auf 603-Wort DOM-Navigation zurueck. Neuer INFO-Log fuer jede JSON-Response >=3KB die als CMP-Kandidat ueberlebt, aber Heuristik ODER Size-Schwelle nicht passt. Top-Keys + URL + Size — beim naechsten VW-Run sofort sichtbar, welcher Endpoint ein Named-Pattern braucht. Co-Authored-By: Claude Opus 4.7 (1M context) --- consent-tester/services/cmp_extractor.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/consent-tester/services/cmp_extractor.py b/consent-tester/services/cmp_extractor.py index 7f8ef0ad..5030ff4f 100644 --- a/consent-tester/services/cmp_extractor.py +++ b/consent-tester/services/cmp_extractor.py @@ -80,21 +80,33 @@ class CMPCapture: data = await _parse_json_response(response) if data is None: return - # Skip tiny payloads — real CMP cookie policies are ≥5KB. - # A 4KB JSON of cookie-shaped data is almost never the policy. try: size_kb = len(json.dumps(data)) // 1024 except Exception: size_kb = 0 - if size_kb < 5: - return from services.cmp_heuristic import looks_like_cookie_policy - if looks_like_cookie_policy(data): + matched = looks_like_cookie_policy(data) + if matched and size_kb >= 5: self.payloads.append(("_heuristic", data)) logger.info( "CMP captured: _heuristic (%s, ~%dKB)", url[:120], size_kb, ) + elif size_kb >= 3: + # Phase-0-Diagnose-Log: JSON-Response die als CMP-Kandidat + # ueberlebt hat, aber heuristic OR size-threshold abgelehnt + # wurde. Zeigt beim naechsten VW/BMW/... Run welche Endpoints + # uebersehen werden — schneller Pattern-Add ohne raten. + top_keys = [] + if isinstance(data, dict): + top_keys = list(data.keys())[:8] + elif isinstance(data, list) and data and isinstance(data[0], dict): + top_keys = list(data[0].keys())[:8] + logger.info( + "CMP candidate skipped: url=%s size=%dKB heuristic=%s " + "top_keys=%s", + url[:120], size_kb, matched, top_keys, + ) except Exception as e: logger.debug("CMP listener error: %s", e)