diff --git a/consent-tester/services/cmp_extractor.py b/consent-tester/services/cmp_extractor.py index 7f8ef0ad..5030ff4f 100644 --- a/consent-tester/services/cmp_extractor.py +++ b/consent-tester/services/cmp_extractor.py @@ -80,21 +80,33 @@ class CMPCapture: data = await _parse_json_response(response) if data is None: return - # Skip tiny payloads — real CMP cookie policies are ≥5KB. - # A 4KB JSON of cookie-shaped data is almost never the policy. try: size_kb = len(json.dumps(data)) // 1024 except Exception: size_kb = 0 - if size_kb < 5: - return from services.cmp_heuristic import looks_like_cookie_policy - if looks_like_cookie_policy(data): + matched = looks_like_cookie_policy(data) + if matched and size_kb >= 5: self.payloads.append(("_heuristic", data)) logger.info( "CMP captured: _heuristic (%s, ~%dKB)", url[:120], size_kb, ) + elif size_kb >= 3: + # Phase-0-Diagnose-Log: JSON-Response die als CMP-Kandidat + # ueberlebt hat, aber heuristic OR size-threshold abgelehnt + # wurde. Zeigt beim naechsten VW/BMW/... Run welche Endpoints + # uebersehen werden — schneller Pattern-Add ohne raten. + top_keys = [] + if isinstance(data, dict): + top_keys = list(data.keys())[:8] + elif isinstance(data, list) and data and isinstance(data[0], dict): + top_keys = list(data[0].keys())[:8] + logger.info( + "CMP candidate skipped: url=%s size=%dKB heuristic=%s " + "top_keys=%s", + url[:120], size_kb, matched, top_keys, + ) except Exception as e: logger.debug("CMP listener error: %s", e)