diff --git a/backend-compliance/compliance/services/mail_render_v2/_cookie_inventory.py b/backend-compliance/compliance/services/mail_render_v2/_cookie_inventory.py index ceca7eee..4e232057 100644 --- a/backend-compliance/compliance/services/mail_render_v2/_cookie_inventory.py +++ b/backend-compliance/compliance/services/mail_render_v2/_cookie_inventory.py @@ -40,6 +40,54 @@ def _norm(s: str | None) -> str: return (s or "").strip().lower() +def _norm_for_match(s: str) -> str: + """Normalised name for fuzzy matching. + + Common patterns in DSE-tables: wildcards (`_ga*`, `_ga.*`, `_pk_id.*`, + ``), trailing dots, brackets. Browser cookies often have + a runtime suffix (`_ga_K8YL3M9T`, `__cf_bm_session_hash`). We strip + trailing wildcards / suffix-noise so the prefix-match below works. + + IMPORTANT: leading `_`/`__` are MEANINGFUL (`__cf_bm`, `_ga`) and + must NOT be stripped. + """ + out = _norm(s) + out = out.replace("*", "").replace("…", "") + out = re.sub(r"\.\*$", "", out) + out = re.sub(r"\.\$?$", "", out) + out = re.sub(r"<[^>]+>", "", out) + out = re.sub(r"\{[^}]+\}", "", out) + return out.strip() + + +def _matches(decl_key: str, browser_key: str) -> bool: + """Fuzzy match between a declared cookie name and a browser cookie. + + Rules (in priority order): + 1. exact match after normalisation + 2. declared is a PREFIX of browser (declared "_ga" matches + browser "_ga_k8yl3m9t") + 3. browser is a PREFIX of declared (rare: declared has a + specific variant, browser only generic — e.g. declared + "__cf_bm_session" with browser "__cf_bm") + """ + if not decl_key or not browser_key: + return False + if decl_key == browser_key: + return True + # Only allow prefix-match for prefixes ≥ 3 chars to avoid garbage + # (e.g. declared "_" matching everything). 
+ if len(decl_key) >= 3 and browser_key.startswith(decl_key): + return True + if len(browser_key) >= 3 and decl_key.startswith(browser_key): + return True + return False + + +# Need re-import for the regex use above +import re # noqa: E402 + + def _missing(value: str | None) -> bool: if value is None: return True @@ -190,7 +238,30 @@ def build_cookie_inventory(state: dict) -> tuple[list[dict], dict]: for c in (cookie_audit.get("compliant") or []) } - all_keys = set(declared.keys()) | set(browser.keys()) + # Build fuzzy-match-Index: declared-key (normalised) → list of + # browser-keys that match. Browser-key only matches ONE declared + # entry (the longest prefix match wins) so we don't double-count. + decl_match_index: dict[str, list[str]] = {k: [] for k in declared} + browser_to_decl: dict[str, str] = {} + for bkey in browser: + bnorm = _norm_for_match(bkey) + best = "" + best_len = -1 + for dkey in declared: + dnorm = _norm_for_match(dkey) + if _matches(dnorm, bnorm) and len(dnorm) > best_len: + best = dkey + best_len = len(dnorm) + if best: + decl_match_index[best].append(bkey) + browser_to_decl[bkey] = best + + # all_keys = declared + browser, but browser-keys that fuzzy-match + # an existing declared entry are FOLDED into the declared row + # (avoid double-counting them as both ORPH and UNDOC). 
+ matched_browser_keys = set(browser_to_decl.keys()) + all_keys = (set(declared.keys()) + | (set(browser.keys()) - matched_browser_keys)) rows: list[dict] = [] for key in sorted(all_keys): d = declared.get(key) or {} @@ -200,7 +271,7 @@ def build_cookie_inventory(state: dict) -> tuple[list[dict], dict]: or b.get("domain") or "").strip() or "" country = d.get("country", "") country_display, is_third, adq = _country_third(country) - in_browser = key in browser + in_browser = (key in browser) or bool(decl_match_index.get(key)) is_declared = key in declared status, sev = _build_status( is_declared, in_browser, undeclared_set, compliant_set, key, diff --git a/backend-compliance/tests/test_cookie_inventory_fuzzy.py b/backend-compliance/tests/test_cookie_inventory_fuzzy.py new file mode 100644 index 00000000..da9b3c8a --- /dev/null +++ b/backend-compliance/tests/test_cookie_inventory_fuzzy.py @@ -0,0 +1,109 @@ +"""Tests for the Cookie-Inventory fuzzy-matcher. + +Regression: BMW-Mail zeigte 0 OK obwohl 31 Browser-Cookies + 738 +deklarierte vorhanden waren. Ursache: exact-string-match scheitert +bei `_ga` (declared) vs `_ga_K8YL3M9T` (browser). 
"""Tests for the cookie-inventory fuzzy matcher.

Regression: the BMW mail showed 0 OK although 31 browser cookies and 738
declared cookies were present. Cause: exact string matching fails for
`_ga` (declared) vs. `_ga_K8YL3M9T` (browser).
"""

from compliance.services.mail_render_v2._cookie_inventory import (
    _matches,
    _norm_for_match,
    build_cookie_inventory,
)


class TestNormForMatch:
    """Normalisation of declared / browser cookie names."""

    def test_strip_wildcard(self):
        assert _norm_for_match("_ga*") == "_ga"

    def test_strip_regex_wildcard(self):
        assert _norm_for_match("_pk_id.*") == "_pk_id"

    def test_strip_placeholder(self):
        # The input must actually contain a placeholder, otherwise the
        # assertion passes without exercising the stripping logic.
        assert _norm_for_match("session_<id>") == "session_"
        assert _norm_for_match("session_{id}") == "session_"

    def test_lowercase(self):
        assert _norm_for_match("__CF_BM") == "__cf_bm"


class TestMatches:
    """Prefix rules applied to already-normalised names."""

    def test_exact(self):
        assert _matches("_ga", "_ga")

    def test_declared_prefix_of_browser(self):
        # declared "_ga" matches browser "_ga_k8yl3m9t"
        assert _matches("_ga", "_ga_k8yl3m9t")

    def test_browser_prefix_of_declared(self):
        # browser "__cf_bm" matches declared "__cf_bm_session"
        assert _matches("__cf_bm_session", "__cf_bm")

    def test_short_prefix_rejected(self):
        # A 2-char prefix would match too much.
        assert not _matches("_g", "_ga_k8yl3m9t")

    def test_unrelated(self):
        assert not _matches("_ga", "intercom-session")


class TestBuildInventory:
    """End-to-end checks of the OK / ORPH / UNDOC summary counters."""

    def _make_state(self, declared_cookies, browser_cookies):
        # Minimal state: one vendor declaring the cookies, plus the
        # cookies observed in the browser.
        return {
            "cmp_vendors": [{
                "name": "Test", "country": "DE", "source": "dse",
                "cookies": [{"name": n} for n in declared_cookies],
            }],
            "banner_result": {
                "cookies_detailed": [{"name": n} for n in browser_cookies],
            },
            "cookie_audit": {},
        }

    def test_no_match_no_ok(self):
        _rows, summary = build_cookie_inventory(
            self._make_state(["foo"], ["bar"]),
        )
        assert summary["ok"] == 0
        assert summary["orph"] == 1
        assert summary["undoc"] == 1

    def test_exact_match_yields_ok(self):
        _rows, summary = build_cookie_inventory(
            self._make_state(["_ga"], ["_ga"]),
        )
        assert summary["ok"] == 1
        assert summary["orph"] == 0
        assert summary["undoc"] == 0

    def test_prefix_match_yields_ok_no_double_count(self):
        # Realistic BMW case: declared "_ga", browser "_ga_K8YL3M9T"
        _rows, summary = build_cookie_inventory(
            self._make_state(["_ga"], ["_ga_K8YL3M9T"]),
        )
        assert summary["ok"] == 1, "fuzzy prefix-match should land in OK"
        assert summary["orph"] == 0, "declared must not double-count as ORPH"
        assert summary["undoc"] == 0, (
            "browser cookie must fold into declared row, not appear separately"
        )

    def test_wildcard_match(self):
        _rows, summary = build_cookie_inventory(
            self._make_state(["_pk_id.*"], ["_pk_id.5"]),
        )
        assert summary["ok"] == 1

    def test_realistic_bmw_pattern(self):
        # Declared: long list with common cookies
        decl = ["_ga", "_gid", "__cf_bm", "AMP_TOKEN", "_fbp",
                "intercom-session", "_pk_id.*", "OptanonConsent"]
        # Browser: actual cookies with runtime suffixes
        bro = ["_ga_K8YL3M9T", "_gid_xyz", "__cf_bm_actual_hash",
               "AMP_TOKEN_runtime", "_fbp_123",
               "intercom-session-2026", "_pk_id.5.7d8", "OptanonConsent"]
        _rows, summary = build_cookie_inventory(
            self._make_state(decl, bro),
        )
        # All 8 browser cookies should fold into the 8 declared rows.
        assert summary["ok"] == 8, f"expected 8 OK, got {summary}"
        assert summary["orph"] == 0
        assert summary["undoc"] == 0
+ ], + "expected_url_layout": { + "impressum": "/de/footer/footer-section/imprint.html", + "dse": "/de/footer/datenschutz-cookies/datenschutz-bmw-website.html", + "cookie": "/de/footer/datenschutz-cookies/cookie-richtlinie-de.html", + "agb_or_nutzungsbedingungen": "/de/footer/footer-section/terms-of-use.html", + "widerrufsbelehrung": "unbekannt — bei Online-Shop-Komponenten (M Performance Parts Onlineshop) erforderlich" + }, + "expected_vendors_in_dse": [ + {"name": "OneTrust", "country": "US", "category": "CMP"}, + {"name": "Google Analytics", "country": "US", "category": "Analytics"}, + {"name": "Google Tag Manager", "country": "US", "category": "Tag-Mgmt"}, + {"name": "Google Ads / DoubleClick", "country": "US", "category": "Marketing"}, + {"name": "Meta Pixel", "country": "US", "category": "Marketing"}, + {"name": "Adobe Analytics", "country": "US", "category": "Analytics"}, + {"name": "Adobe Target", "country": "US", "category": "Personalisierung"}, + {"name": "Salesforce Marketing Cloud", "country": "US", "category": "CRM/Marketing"}, + {"name": "Sitecore", "country": "US", "category": "CMS"}, + {"name": "Cloudflare", "country": "US", "category": "CDN/Bot"}, + {"name": "Microsoft Clarity", "country": "US", "category": "Session-Replay"}, + {"name": "LinkedIn Insight Tag", "country": "US/IE", "category": "Marketing"}, + {"name": "YouTube", "country": "US", "category": "Embed/Marketing"}, + {"name": "BMW Connected Drive AI", "country": "DE", "category": "AI-Assistant (vermutet)"} + ], + "expected_cookie_count_ranges": { + "im_browser_nach_accept": "80–250 (BMW.de allein, ohne Sub-Domains)", + "deklariert_in_dse": "300–800 (Konzern-DSE deckt mehrere Marken)", + "match_quote_OK_in_browser": ">85% — Standard-Cookies (_ga, __cf_bm, OptanonConsent) müssen matchen", + "third_country_cookies": "60–90% (US-Vendoren dominieren)" + }, + "expected_findings": [ + { + "id": "AI-ACT-TRANSPARENCY-001", + "severity": "HIGH", + "title": "AI-Act Art. 
50 Pre-Interaction-Disclosure für Connected-Drive-AI nicht prüfbar ohne Live-Test", + "evidence": "BMW Connected Drive nutzt AI-Assistenten. DSE nennt KI-Einsatz, aber Pre-Chat-Disclosure am Widget muss live verifiziert werden.", + "expected_pass": "UNKNOWN-LIKELY-PARTIAL" + }, + { + "id": "TH-RETENTION-001", + "severity": "MEDIUM", + "title": "Aufbewahrungsdauer pro Cookie unvollständig — Konzern-DSE listet viele ohne Speicherdauer", + "evidence": "Bei einer Cookie-Liste von 300+ Cookies fehlt erfahrungsgemäß bei 40-60% die explizite Speicherdauer (Art. 13 Abs. 2 lit. a DSGVO).", + "expected_pass": "PARTIAL" + }, + { + "id": "TRANSFER-001", + "severity": "MEDIUM", + "title": "US-Transfer-Mechanismus pro Vendor inkonsistent benannt", + "evidence": "Google/Meta meist auf DPF, Salesforce auf SCCs, Cloudflare implizit. Detailgrad pro Vendor uneinheitlich (typisches Großkonzern-Pattern).", + "expected_pass": "PARTIAL" + }, + { + "id": "IMPRESSUM-001", + "severity": "LOW", + "title": "Konzern-Impressum vermutlich vollständig — single legal entity (BMW AG)", + "evidence": "BMW AG ist Hauptverantwortlicher. Konzern-Konstellation: HRB München, USt-IdNr, Vorstand (mehrere Personen) — Multi-Entity-Bug-Trigger nicht erwartet.", + "expected_pass": "PASS" + }, + { + "id": "URL-STRUCTURE-001", + "severity": "LOW", + "title": "Vermutlich Standard-Slug-Drift (Standard-Slugs wie /impressum 404)", + "evidence": "BMW nutzt Subpaths unter /footer/. /impressum direkt → wahrscheinlich 404 oder Redirect.", + "expected_pass": false + }, + { + "id": "COOKIE-INVENTORY-MATCH-001", + "severity": "HIGH", + "title": "Match-Quote zwischen DSE-Cookies und Browser-Cookies muss >85% sein", + "evidence": "Engine muss Standard-Cookies wie _ga (declared) ↔ _ga_K8YL3M9T (browser), __cf_bm ↔ __cf_bm_ per Prefix-Match folden. 
<85% = Fuzzy-Match-Bug.", + "expected_pass": "BENCHMARK" + }, + { + "id": "COOKIE-CONSENT-UX-001", + "severity": "MEDIUM", + "title": "Mobile-Reachability für Consent-Reopen via OneTrust", + "evidence": "OneTrust-Footer-Link 'Cookie-Einstellungen' muss Tap-Target ≥ 44×44 px haben (Apple HIG / WCAG 2.5.5).", + "expected_pass": "UNKNOWN" + } + ], + "expected_b17_walk_behaviour": { + "footer_links_min": 6, + "accordion_expansion_on_dse": "wahrscheinlich >5 (BMW DSE hat Akkordeons für Cookie-Tabellen)", + "banner_tour_clicks": "10-30 (OneTrust hat viele Tab/Category-Toggles)" + }, + "summary_for_breakpilot_audit_comparison": { + "high_severity_findings_count": 2, + "medium_severity_findings_count": 3, + "low_severity_findings_count": 2, + "must_detect_to_pass_benchmark": [ + "AI-ACT-TRANSPARENCY-001", + "URL-STRUCTURE-001", + "COOKIE-INVENTORY-MATCH-001" + ] + } +}