fix(cookie-inventory): fuzzy prefix-match + BMW-GT-File
BMW-Mail zeigte 738 deklariert / 31 Browser / **0 OK** — alle
Browser-Cookies landeten als UNDOC, alle deklarierten als ORPH.
Ursache: exact-string-match scheitert bei Suffix-Cookies.
_norm_for_match() + _matches() Helper:
- Strippt Wildcards (`*`, `.*`, `<id>`, `{var}`) + Lower-Case
- Erhält führende Underscores (`__cf_bm`, `_ga` sind meaningful)
- Prefix-Match in BEIDE Richtungen, min 3 Chars (kein "_"-Garbage)
build_cookie_inventory():
- Für jeden Browser-Cookie: längster Prefix-Match in declared wählen
- browser-to-decl Index + decl-match-Index: Aufbau einmalig per verschachtelter Schleife (O(N×M)), danach O(1)-Lookup pro Zeile
- matched browser-keys werden aus all_keys entfernt → kein
Double-Count (vorher: ORPH + UNDOC parallel)
Realistischer BMW-Match-Test:
declared=[_ga, _gid, __cf_bm, AMP_TOKEN, _fbp, intercom-session,
_pk_id.*, OptanonConsent]
browser= [_ga_K8YL3M9T, _gid_xyz, __cf_bm_actual_hash,
AMP_TOKEN_runtime, _fbp_123, intercom-session-2026,
_pk_id.5.7d8, OptanonConsent]
→ 8 OK (vorher 0)
BMW-GT-File (zeroclaw/docs/ground-truth/bmw_de_2026-06-07.json):
- OneTrust CMP + 14 erwartete Vendoren
- Cookie-Count-Ranges (browser 80-250, deklariert 300-800)
- 7 expected findings inkl. neuem COOKIE-INVENTORY-MATCH-001 als
Benchmark gegen den Fuzzy-Match-Bug
Tests: 14/14 grün (4 _norm_for_match + 5 _matches + 5
build_cookie_inventory inkl. realistic_bmw_pattern).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -40,6 +40,54 @@ def _norm(s: str | None) -> str:
|
|||||||
return (s or "").strip().lower()
|
return (s or "").strip().lower()
|
||||||
|
|
||||||
|
|
||||||
|
def _norm_for_match(s: str) -> str:
    """Return a normalised cookie name suitable for fuzzy prefix matching.

    Declared names in DSE tables often carry pattern noise: wildcards
    (`_ga*`, `_ga.*`, `_pk_id.*`), ellipses, placeholders (`<random>`,
    `{var}`) and trailing dots. Browser cookies often carry a runtime
    suffix (`_ga_K8YL3M9T`, `__cf_bm_session_hash`). Removing the noise
    leaves the stable part of the name so a prefix comparison works.

    IMPORTANT: leading `_`/`__` are MEANINGFUL (`__cf_bm`, `_ga`) and are
    never stripped.

    Args:
        s: Raw cookie name as declared or as seen in the browser.

    Returns:
        The lower-cased name with placeholders, wildcards, ellipses and a
        single trailing dot (optionally followed by `$`) removed.
    """
    out = _norm(s)
    # Placeholders are removed first: a wildcard inside one (`<*>`) would
    # otherwise leave an empty `<>` behind, and a dot in front of one
    # (`_pk_id.<id>`) must still be visible to the trailing-dot step.
    out = re.sub(r"<[^>]+>", "", out)
    out = re.sub(r"\{[^}]+\}", "", out)
    # Removing every `*` also covers the regex-style `.*` suffix; the dot
    # it leaves behind is handled by the trailing-dot step below.
    out = out.replace("*", "").replace("…", "")
    # Trim before anchoring on the end so whitespace left by a removed
    # placeholder cannot hide the trailing dot.
    out = out.strip()
    out = re.sub(r"\.\$?$", "", out)
    return out.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def _matches(decl_key: str, browser_key: str) -> bool:
|
||||||
|
"""Fuzzy match between a declared cookie name and a browser cookie.
|
||||||
|
|
||||||
|
Rules (in priority order):
|
||||||
|
1. exact match after normalisation
|
||||||
|
2. declared is a PREFIX of browser (declared "_ga" matches
|
||||||
|
browser "_ga_k8yl3m9t")
|
||||||
|
3. browser is a PREFIX of declared (rare: declared has a
|
||||||
|
specific variant, browser only generic — e.g. declared
|
||||||
|
"__cf_bm_session" with browser "__cf_bm")
|
||||||
|
"""
|
||||||
|
if not decl_key or not browser_key:
|
||||||
|
return False
|
||||||
|
if decl_key == browser_key:
|
||||||
|
return True
|
||||||
|
# Only allow prefix-match for prefixes ≥ 3 chars to avoid garbage
|
||||||
|
# (e.g. declared "_" matching everything).
|
||||||
|
if len(decl_key) >= 3 and browser_key.startswith(decl_key):
|
||||||
|
return True
|
||||||
|
if len(browser_key) >= 3 and decl_key.startswith(browser_key):
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
# Late import of `re`: the regex calls in _norm_for_match() above resolve it at call time.
|
||||||
|
import re # noqa: E402
|
||||||
|
|
||||||
|
|
||||||
def _missing(value: str | None) -> bool:
|
def _missing(value: str | None) -> bool:
|
||||||
if value is None:
|
if value is None:
|
||||||
return True
|
return True
|
||||||
@@ -190,7 +238,30 @@ def build_cookie_inventory(state: dict) -> tuple[list[dict], dict]:
|
|||||||
for c in (cookie_audit.get("compliant") or [])
|
for c in (cookie_audit.get("compliant") or [])
|
||||||
}
|
}
|
||||||
|
|
||||||
all_keys = set(declared.keys()) | set(browser.keys())
|
# Build fuzzy-match-Index: declared-key (normalised) → list of
|
||||||
|
# browser-keys that match. Browser-key only matches ONE declared
|
||||||
|
# entry (the longest prefix match wins) so we don't double-count.
|
||||||
|
decl_match_index: dict[str, list[str]] = {k: [] for k in declared}
|
||||||
|
browser_to_decl: dict[str, str] = {}
|
||||||
|
for bkey in browser:
|
||||||
|
bnorm = _norm_for_match(bkey)
|
||||||
|
best = ""
|
||||||
|
best_len = -1
|
||||||
|
for dkey in declared:
|
||||||
|
dnorm = _norm_for_match(dkey)
|
||||||
|
if _matches(dnorm, bnorm) and len(dnorm) > best_len:
|
||||||
|
best = dkey
|
||||||
|
best_len = len(dnorm)
|
||||||
|
if best:
|
||||||
|
decl_match_index[best].append(bkey)
|
||||||
|
browser_to_decl[bkey] = best
|
||||||
|
|
||||||
|
# all_keys = declared + browser, but browser-keys that fuzzy-match
|
||||||
|
# an existing declared entry are FOLDED into the declared row
|
||||||
|
# (avoid double-counting them as both ORPH and UNDOC).
|
||||||
|
matched_browser_keys = set(browser_to_decl.keys())
|
||||||
|
all_keys = (set(declared.keys())
|
||||||
|
| (set(browser.keys()) - matched_browser_keys))
|
||||||
rows: list[dict] = []
|
rows: list[dict] = []
|
||||||
for key in sorted(all_keys):
|
for key in sorted(all_keys):
|
||||||
d = declared.get(key) or {}
|
d = declared.get(key) or {}
|
||||||
@@ -200,7 +271,7 @@ def build_cookie_inventory(state: dict) -> tuple[list[dict], dict]:
|
|||||||
or b.get("domain") or "").strip() or ""
|
or b.get("domain") or "").strip() or ""
|
||||||
country = d.get("country", "")
|
country = d.get("country", "")
|
||||||
country_display, is_third, adq = _country_third(country)
|
country_display, is_third, adq = _country_third(country)
|
||||||
in_browser = key in browser
|
in_browser = (key in browser) or bool(decl_match_index.get(key))
|
||||||
is_declared = key in declared
|
is_declared = key in declared
|
||||||
status, sev = _build_status(
|
status, sev = _build_status(
|
||||||
is_declared, in_browser, undeclared_set, compliant_set, key,
|
is_declared, in_browser, undeclared_set, compliant_set, key,
|
||||||
|
|||||||
@@ -0,0 +1,109 @@
|
|||||||
|
"""Tests for the Cookie-Inventory fuzzy-matcher.
|
||||||
|
|
||||||
|
Regression: BMW-Mail zeigte 0 OK obwohl 31 Browser-Cookies + 738
|
||||||
|
deklarierte vorhanden waren. Ursache: exact-string-match scheitert
|
||||||
|
bei `_ga` (declared) vs `_ga_K8YL3M9T` (browser).
|
||||||
|
"""
|
||||||
|
|
||||||
|
from compliance.services.mail_render_v2._cookie_inventory import (
|
||||||
|
_matches,
|
||||||
|
_norm_for_match,
|
||||||
|
build_cookie_inventory,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestNormForMatch:
    """Checks the name clean-up applied before fuzzy matching."""

    def test_strip_wildcard(self):
        # A bare trailing `*` is dropped.
        normalised = _norm_for_match("_ga*")
        assert normalised == "_ga"

    def test_strip_regex_wildcard(self):
        # A regex-style `.*` suffix is dropped including its dot.
        normalised = _norm_for_match("_pk_id.*")
        assert normalised == "_pk_id"

    def test_strip_placeholder(self):
        # An angle-bracket placeholder is removed; the separator stays.
        normalised = _norm_for_match("session_<id>")
        assert normalised == "session_"

    def test_lowercase(self):
        # Upper-case input is folded; leading underscores are kept.
        normalised = _norm_for_match("__CF_BM")
        assert normalised == "__cf_bm"
|
||||||
|
|
||||||
|
|
||||||
|
class TestMatches:
    """Checks the exact / prefix matching rules of `_matches`."""

    def test_exact(self):
        declared, seen = "_ga", "_ga"
        assert _matches(declared, seen)

    def test_declared_prefix_of_browser(self):
        # The declared name is the leading part of the browser cookie.
        declared, seen = "_ga", "_ga_k8yl3m9t"
        assert _matches(declared, seen)

    def test_browser_prefix_of_declared(self):
        # The browser cookie is the leading part of the declared name.
        declared, seen = "__cf_bm_session", "__cf_bm"
        assert _matches(declared, seen)

    def test_short_prefix_rejected(self):
        # A two-character prefix is too unspecific to count as a match.
        declared, seen = "_g", "_ga_k8yl3m9t"
        assert not _matches(declared, seen)

    def test_unrelated(self):
        declared, seen = "_ga", "intercom-session"
        assert not _matches(declared, seen)
|
||||||
|
|
||||||
|
|
||||||
|
class TestBuildInventory:
    """End-to-end checks of `build_cookie_inventory` summary counters."""

    def _make_state(self, declared_cookies, browser_cookies):
        """Build a minimal state dict: one vendor plus the browser cookies."""
        declared_entries = [{"name": n} for n in declared_cookies]
        browser_entries = [{"name": n} for n in browser_cookies]
        vendor = {
            "name": "Test",
            "country": "DE",
            "source": "dse",
            "cookies": declared_entries,
        }
        return {
            "cmp_vendors": [vendor],
            "banner_result": {"cookies_detailed": browser_entries},
            "cookie_audit": {},
        }

    def test_no_match_no_ok(self):
        state = self._make_state(["foo"], ["bar"])
        _, summary = build_cookie_inventory(state)
        assert summary["ok"] == 0
        assert summary["orph"] == 1
        assert summary["undoc"] == 1

    def test_exact_match_yields_ok(self):
        state = self._make_state(["_ga"], ["_ga"])
        _, summary = build_cookie_inventory(state)
        assert summary["ok"] == 1
        assert summary["orph"] == 0
        assert summary["undoc"] == 0

    def test_prefix_match_yields_ok_no_double_count(self):
        # Declared name is generic, browser cookie has a runtime suffix.
        state = self._make_state(["_ga"], ["_ga_K8YL3M9T"])
        _, summary = build_cookie_inventory(state)
        assert summary["ok"] == 1, "fuzzy prefix-match should land in OK"
        assert summary["orph"] == 0, "declared must not double-count as ORPH"
        assert summary["undoc"] == 0, (
            "browser cookie must fold into declared row, not appear separately"
        )

    def test_wildcard_match(self):
        state = self._make_state(["_pk_id.*"], ["_pk_id.5"])
        _, summary = build_cookie_inventory(state)
        assert summary["ok"] == 1

    def test_realistic_bmw_pattern(self):
        # Declared side: common cookie names as a DSE table lists them.
        decl = [
            "_ga",
            "_gid",
            "__cf_bm",
            "AMP_TOKEN",
            "_fbp",
            "intercom-session",
            "_pk_id.*",
            "OptanonConsent",
        ]
        # Browser side: the same cookies carrying runtime suffixes.
        bro = [
            "_ga_K8YL3M9T",
            "_gid_xyz",
            "__cf_bm_actual_hash",
            "AMP_TOKEN_runtime",
            "_fbp_123",
            "intercom-session-2026",
            "_pk_id.5.7d8",
            "OptanonConsent",
        ]
        state = self._make_state(decl, bro)
        _, summary = build_cookie_inventory(state)
        # Each of the 8 browser cookies is expected to fold into one of
        # the 8 declared rows.
        assert summary["ok"] == 8, f"expected 8 OK, got {summary}"
        assert summary["orph"] == 0
        assert summary["undoc"] == 0
|
||||||
@@ -0,0 +1,107 @@
|
|||||||
|
{
|
||||||
|
"site": "bmw.de",
|
||||||
|
"crawled_at": "2026-06-07",
|
||||||
|
"crawler": "BreakPilot-Compliance Audit-Run + Web-Recherche",
|
||||||
|
"notes": [
|
||||||
|
"BMW Group DE-Site — Konzern-Stack: BMW, MINI, BMW M, BMW i, Connected Drive, Financial Services, Performance.",
|
||||||
|
"Verantwortlicher: Bayerische Motoren Werke Aktiengesellschaft (München).",
|
||||||
|
"CMP: OneTrust (häufigster Stack im Konzern-Auto-Segment).",
|
||||||
|
"DSE listet typischerweise mehrere hundert Cookies (alle Marken/Regionen aggregiert).",
|
||||||
|
"Connected-Drive-AI-Assistant — prüfen, ob ein Hinweis nach AI-Act Art. 50 im Chat-UI angezeigt wird."
|
||||||
|
],
|
||||||
|
"expected_url_layout": {
|
||||||
|
"impressum": "/de/footer/footer-section/imprint.html",
|
||||||
|
"dse": "/de/footer/datenschutz-cookies/datenschutz-bmw-website.html",
|
||||||
|
"cookie": "/de/footer/datenschutz-cookies/cookie-richtlinie-de.html",
|
||||||
|
"agb_or_nutzungsbedingungen": "/de/footer/footer-section/terms-of-use.html",
|
||||||
|
"widerrufsbelehrung": "unbekannt — bei Online-Shop-Komponenten (M Performance Parts Onlineshop) erforderlich"
|
||||||
|
},
|
||||||
|
"expected_vendors_in_dse": [
|
||||||
|
{"name": "OneTrust", "country": "US", "category": "CMP"},
|
||||||
|
{"name": "Google Analytics", "country": "US", "category": "Analytics"},
|
||||||
|
{"name": "Google Tag Manager", "country": "US", "category": "Tag-Mgmt"},
|
||||||
|
{"name": "Google Ads / DoubleClick", "country": "US", "category": "Marketing"},
|
||||||
|
{"name": "Meta Pixel", "country": "US", "category": "Marketing"},
|
||||||
|
{"name": "Adobe Analytics", "country": "US", "category": "Analytics"},
|
||||||
|
{"name": "Adobe Target", "country": "US", "category": "Personalisierung"},
|
||||||
|
{"name": "Salesforce Marketing Cloud", "country": "US", "category": "CRM/Marketing"},
|
||||||
|
{"name": "Sitecore", "country": "US", "category": "CMS"},
|
||||||
|
{"name": "Cloudflare", "country": "US", "category": "CDN/Bot"},
|
||||||
|
{"name": "Microsoft Clarity", "country": "US", "category": "Session-Replay"},
|
||||||
|
{"name": "LinkedIn Insight Tag", "country": "US/IE", "category": "Marketing"},
|
||||||
|
{"name": "YouTube", "country": "US", "category": "Embed/Marketing"},
|
||||||
|
{"name": "BMW Connected Drive AI", "country": "DE", "category": "AI-Assistant (vermutet)"}
|
||||||
|
],
|
||||||
|
"expected_cookie_count_ranges": {
|
||||||
|
"im_browser_nach_accept": "80–250 (BMW.de allein, ohne Sub-Domains)",
|
||||||
|
"deklariert_in_dse": "300–800 (Konzern-DSE deckt mehrere Marken)",
|
||||||
|
"match_quote_OK_in_browser": ">85% — Standard-Cookies (_ga, __cf_bm, OptanonConsent) müssen matchen",
|
||||||
|
"third_country_cookies": "60–90% (US-Vendoren dominieren)"
|
||||||
|
},
|
||||||
|
"expected_findings": [
|
||||||
|
{
|
||||||
|
"id": "AI-ACT-TRANSPARENCY-001",
|
||||||
|
"severity": "HIGH",
|
||||||
|
"title": "AI-Act Art. 50 Pre-Interaction-Disclosure für Connected-Drive-AI nicht prüfbar ohne Live-Test",
|
||||||
|
"evidence": "BMW Connected Drive nutzt AI-Assistenten. DSE nennt KI-Einsatz, aber Pre-Chat-Disclosure am Widget muss live verifiziert werden.",
|
||||||
|
"expected_pass": "UNKNOWN-LIKELY-PARTIAL"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "TH-RETENTION-001",
|
||||||
|
"severity": "MEDIUM",
|
||||||
|
"title": "Aufbewahrungsdauer pro Cookie unvollständig — Konzern-DSE listet viele ohne Speicherdauer",
|
||||||
|
"evidence": "Bei einer Cookie-Liste von 300+ Cookies fehlt erfahrungsgemäß bei 40-60% die explizite Speicherdauer (Art. 13 Abs. 2 lit. a DSGVO).",
|
||||||
|
"expected_pass": "PARTIAL"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "TRANSFER-001",
|
||||||
|
"severity": "MEDIUM",
|
||||||
|
"title": "US-Transfer-Mechanismus pro Vendor inkonsistent benannt",
|
||||||
|
"evidence": "Google/Meta meist auf DPF, Salesforce auf SCCs, Cloudflare implizit. Detailgrad pro Vendor uneinheitlich (typisches Großkonzern-Pattern).",
|
||||||
|
"expected_pass": "PARTIAL"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "IMPRESSUM-001",
|
||||||
|
"severity": "LOW",
|
||||||
|
"title": "Konzern-Impressum vermutlich vollständig — single legal entity (BMW AG)",
|
||||||
|
"evidence": "BMW AG ist Hauptverantwortlicher. Konzern-Konstellation: HRB München, USt-IdNr, Vorstand (mehrere Personen) — Multi-Entity-Bug-Trigger nicht erwartet.",
|
||||||
|
"expected_pass": "PASS"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "URL-STRUCTURE-001",
|
||||||
|
"severity": "LOW",
|
||||||
|
"title": "Vermutlich Standard-Slug-Drift (Standard-Slugs wie /impressum 404)",
|
||||||
|
"evidence": "BMW nutzt Subpaths unter /footer/. /impressum direkt → wahrscheinlich 404 oder Redirect.",
|
||||||
|
"expected_pass": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "COOKIE-INVENTORY-MATCH-001",
|
||||||
|
"severity": "HIGH",
|
||||||
|
"title": "Match-Quote zwischen DSE-Cookies und Browser-Cookies muss >85% sein",
|
||||||
|
"evidence": "Engine muss Standard-Cookies wie _ga (declared) ↔ _ga_K8YL3M9T (browser), __cf_bm ↔ __cf_bm_<hash> per Prefix-Match folden. <85% = Fuzzy-Match-Bug.",
|
||||||
|
"expected_pass": "BENCHMARK"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "COOKIE-CONSENT-UX-001",
|
||||||
|
"severity": "MEDIUM",
|
||||||
|
"title": "Mobile-Reachability für Consent-Reopen via OneTrust",
|
||||||
|
"evidence": "OneTrust-Footer-Link 'Cookie-Einstellungen' muss Tap-Target ≥ 44×44 px haben (Apple HIG / WCAG 2.5.5).",
|
||||||
|
"expected_pass": "UNKNOWN"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"expected_b17_walk_behaviour": {
|
||||||
|
"footer_links_min": 6,
|
||||||
|
"accordion_expansion_on_dse": "wahrscheinlich >5 (BMW DSE hat Akkordeons für Cookie-Tabellen)",
|
||||||
|
"banner_tour_clicks": "10-30 (OneTrust hat viele Tab/Category-Toggles)"
|
||||||
|
},
|
||||||
|
"summary_for_breakpilot_audit_comparison": {
|
||||||
|
"high_severity_findings_count": 2,
|
||||||
|
"medium_severity_findings_count": 3,
|
||||||
|
"low_severity_findings_count": 2,
|
||||||
|
"must_detect_to_pass_benchmark": [
|
||||||
|
"AI-ACT-TRANSPARENCY-001",
|
||||||
|
"URL-STRUCTURE-001",
|
||||||
|
"COOKIE-INVENTORY-MATCH-001"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user