Files
breakpilot-compliance/backend-compliance/tests/test_cookie_inventory_fuzzy.py
T
Benjamin Admin 0b29d1fada fix(cookie-inventory): fuzzy prefix-match + BMW-GT-File
BMW-Mail zeigte 738 deklariert / 31 Browser / **0 OK** — alle
Browser-Cookies landeten als UNDOC, alle deklarierten als ORPH.
Ursache: exact-string-match scheitert bei Suffix-Cookies.

_norm_for_match() + _matches() Helper:
  - Strippt Wildcards (`*`, `.*`, `<id>`, `{var}`) + Lower-Case
  - Erhält führende Underscores (`__cf_bm`, `_ga` sind meaningful)
  - Prefix-Match in BEIDE Richtungen, min 3 Chars (kein "_"-Garbage)

build_cookie_inventory():
  - Für jeden Browser-Cookie: längster Prefix-Match in declared wählen
  - browser-to-decl Index + decl-match-Index für O(N×M) → O(N+M)
  - matched browser-keys werden aus all_keys entfernt → kein
    Double-Count (vorher: ORPH + UNDOC parallel)

Realistischer BMW-Match-Test:
  declared=[_ga, _gid, __cf_bm, AMP_TOKEN, _fbp, intercom-session,
            _pk_id.*, OptanonConsent]
  browser= [_ga_K8YL3M9T, _gid_xyz, __cf_bm_actual_hash,
            AMP_TOKEN_runtime, _fbp_123, intercom-session-2026,
            _pk_id.5.7d8, OptanonConsent]
  → 8 OK (vorher 0)

BMW-GT-File (zeroclaw/docs/ground-truth/bmw_de_2026-06-07.json):
  - OneTrust CMP + 14 erwartete Vendoren
  - Cookie-Count-Ranges (browser 80-250, deklariert 300-800)
  - 7 expected findings inkl. neuem COOKIE-INVENTORY-MATCH-001 als
    Benchmark gegen den Fuzzy-Match-Bug

Tests: 14/14 grün (4 _norm_for_match + 5 _matches + 5
build_cookie_inventory inkl. realistic_bmw_pattern).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-06-07 21:29:21 +02:00

110 lines
3.7 KiB
Python

"""Tests for the cookie-inventory fuzzy matcher.

Regression: the BMW mail report showed 0 OK even though 31 browser cookies
and 738 declared cookies were present. Cause: exact string matching fails
for `_ga` (declared) vs `_ga_K8YL3M9T` (browser).
"""
from compliance.services.mail_render_v2._cookie_inventory import (
_matches,
_norm_for_match,
build_cookie_inventory,
)
class TestNormForMatch:
    """Pin the normalisation applied to cookie names before matching."""

    def test_strip_wildcard(self):
        """A trailing glob star is removed."""
        normalized = _norm_for_match("_ga*")
        assert normalized == "_ga"

    def test_strip_regex_wildcard(self):
        """A trailing regex-style ``.*`` is removed."""
        normalized = _norm_for_match("_pk_id.*")
        assert normalized == "_pk_id"

    def test_strip_placeholder(self):
        """An ``<id>`` placeholder is removed; the preceding underscore stays."""
        normalized = _norm_for_match("session_<id>")
        assert normalized == "session_"

    def test_lowercase(self):
        """Names are lower-cased and leading underscores are kept."""
        normalized = _norm_for_match("__CF_BM")
        assert normalized == "__cf_bm"
class TestMatches:
    """Pin the prefix-matching rules of ``_matches`` on normalised names."""

    def test_exact(self):
        """Identical names match."""
        name = "_ga"
        assert _matches(name, name)

    def test_declared_prefix_of_browser(self):
        """A declared name that is a prefix of the browser name matches."""
        declared = "_ga"
        browser = "_ga_k8yl3m9t"
        assert _matches(declared, browser)

    def test_browser_prefix_of_declared(self):
        """A browser name that is a prefix of the declared name matches."""
        declared = "__cf_bm_session"
        browser = "__cf_bm"
        assert _matches(declared, browser)

    def test_short_prefix_rejected(self):
        """A two-character prefix is rejected because it would match too much."""
        too_short = "_g"
        browser = "_ga_k8yl3m9t"
        matched = _matches(too_short, browser)
        assert not matched

    def test_unrelated(self):
        """Names without a common prefix relation do not match."""
        matched = _matches("_ga", "intercom-session")
        assert not matched
class TestBuildInventory:
    """Check the OK / ORPH / UNDOC counters returned by ``build_cookie_inventory``."""

    def _make_state(self, declared_cookies, browser_cookies):
        """Build a minimal state dict.

        The state holds one vendor that declares ``declared_cookies`` and a
        banner result that lists ``browser_cookies``; the cookie audit is empty.
        """
        declared_entries = [{"name": n} for n in declared_cookies]
        browser_entries = [{"name": n} for n in browser_cookies]
        vendor = {
            "name": "Test",
            "country": "DE",
            "source": "dse",
            "cookies": declared_entries,
        }
        return {
            "cmp_vendors": [vendor],
            "banner_result": {"cookies_detailed": browser_entries},
            "cookie_audit": {},
        }

    def _summarize(self, declared_cookies, browser_cookies):
        """Run the inventory builder on a fresh state and return its summary."""
        state = self._make_state(declared_cookies, browser_cookies)
        # The builder returns (rows, summary); these tests only inspect summary.
        _rows, summary = build_cookie_inventory(state)
        return summary

    def test_no_match_no_ok(self):
        """Unrelated names give one orphaned and one undocumented cookie."""
        summary = self._summarize(["foo"], ["bar"])
        assert summary["ok"] == 0
        assert summary["orph"] == 1
        assert summary["undoc"] == 1

    def test_exact_match_yields_ok(self):
        """Identical declared and browser names count once as OK."""
        summary = self._summarize(["_ga"], ["_ga"])
        assert summary["ok"] == 1
        assert summary["orph"] == 0
        assert summary["undoc"] == 0

    def test_prefix_match_yields_ok_no_double_count(self):
        """Realistic BMW case: declared ``_ga`` vs browser ``_ga_K8YL3M9T``."""
        summary = self._summarize(["_ga"], ["_ga_K8YL3M9T"])
        assert summary["ok"] == 1, "fuzzy prefix-match should land in OK"
        assert summary["orph"] == 0, "declared must not double-count as ORPH"
        assert summary["undoc"] == 0, (
            "browser cookie must fold into declared row, not appear separately"
        )

    def test_wildcard_match(self):
        """A declared wildcard pattern matches a concrete browser cookie."""
        summary = self._summarize(["_pk_id.*"], ["_pk_id.5"])
        assert summary["ok"] == 1

    def test_realistic_bmw_pattern(self):
        """Eight declared names each match a suffixed browser cookie."""
        # Declared side: common cookie names as listed by the vendor.
        decl = [
            "_ga",
            "_gid",
            "__cf_bm",
            "AMP_TOKEN",
            "_fbp",
            "intercom-session",
            "_pk_id.*",
            "OptanonConsent",
        ]
        # Browser side: the same cookies carrying runtime suffixes.
        bro = [
            "_ga_K8YL3M9T",
            "_gid_xyz",
            "__cf_bm_actual_hash",
            "AMP_TOKEN_runtime",
            "_fbp_123",
            "intercom-session-2026",
            "_pk_id.5.7d8",
            "OptanonConsent",
        ]
        summary = self._summarize(decl, bro)
        # All 8 browser cookies should fold into the 8 declared rows.
        assert summary["ok"] == 8, f"expected 8 OK, got {summary}"
        assert summary["orph"] == 0
        assert summary["undoc"] == 0