feat(cookie): präfix-bewusster Library-Match (Runtime-Suffixe)

load_big_library matchte nur EXAKT → nur ~27% der BMW-Cookies trafen die
Open-Cookie-DB, weil Per-Instanz-Suffixe abweichen (_ga_GTM-XYZ, AMCVS_###@
AdobeOrg, _pk_id.5.7d8). Jetzt: Library einmal laden, Namen entwildcarden,
über _candidate_keys (exact + Präfix an Trennzeichen, Mindestlänge 3 gegen
Über-Match) matchen. Reuse der bewährten _strip_wildcards-Logik.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-06-11 15:24:45 +02:00
parent 0f443b6a9c
commit 9dfdaae8e4
2 changed files with 77 additions and 10 deletions
@@ -46,23 +46,70 @@ _CONTROL_MAP = {
_HINWEIS_TYPES = {"third_country", "eu_alternative"} _HINWEIS_TYPES = {"third_country", "eu_alternative"}
# Separator characters at which a runtime suffix may be cut off
# (e.g. '_ga_GTM-XYZ' → '_ga', 'AMCVS_1234@AdobeOrg' → 'AMCVS').
_SEP_RE = re.compile(r"[_\-.:$%@\[]")
def _candidate_keys(name: str) -> list[str]:
    """Build the ordered list of library-match candidates for a cookie name.

    The first candidate is the full name after ``_strip_wildcards``; the
    following ones are progressively shorter prefixes, each obtained by
    cutting the previous candidate at its last separator (see ``_SEP_RE``)
    and trimming trailing separator characters. This is meant to catch
    per-instance suffixes (GTM container, @AdobeOrg, hash ids).

    The search stops as soon as a prefix would be shorter than 3 characters,
    so short generic names are not offered as prefix candidates.
    """
    from compliance.services.cookie_library_lookup import _strip_wildcards

    full = _strip_wildcards(name)
    candidates: list[str] = [full] if full else []
    prefix = full
    while True:
        # Walk all separator matches; only the right-most one is of interest.
        last_sep = None
        for last_sep in _SEP_RE.finditer(prefix):
            pass
        if last_sep is None:
            return candidates
        prefix = prefix[: last_sep.start()].rstrip("_-.:$%@")
        if len(prefix) < 3 or prefix in candidates:
            return candidates
        candidates.append(prefix)
def _match_lib(name: str, lib_bases: dict) -> dict | None:
    """Return the library row for the first candidate key of *name* found.

    Candidates come from ``_candidate_keys``: the full (wildcard-stripped)
    name first, followed by progressively shorter prefixes.

    The full name is looked up regardless of its length, so a short cookie
    name still hits on an exact match. Only prefix candidates must be at
    least 3 characters long, which keeps e.g. ``id_charger`` from matching a
    generic ``id`` entry.

    Args:
        name: Cookie name to resolve.
        lib_bases: Mapping of wildcard-stripped library base name to row.

    Returns:
        The mapped row of the first matching candidate, or ``None`` when no
        candidate is present in *lib_bases*. Does not touch the database.
    """
    for position, key in enumerate(_candidate_keys(name)):
        # Position 0 is the exact name; later entries are truncated prefixes
        # and are the only ones subject to the minimum-length guard.
        if position > 0 and len(key) < 3:
            continue
        if key in lib_bases:
            return lib_bases[key]
    return None
def load_big_library(db, names: list[str]) -> dict:
    """Prefix-aware lookup against the Open Cookie Database.

    Loads all rows of ``compliance.cookie_library`` with a single query,
    reduces every library name to its wildcard-stripped base and resolves
    each requested cookie through ``_match_lib`` (exact name first, then
    runtime-suffix prefixes).

    Args:
        db: Session-like object; only
            ``db.execute(...).mappings().fetchall()`` is used here.
        names: Observed cookie names; falsy entries are ignored.

    Returns:
        Mapping of the ORIGINAL cookie name (lower-cased) to the matched
        library row as a plain dict, so callers can keep using
        ``big_lib.get(name.lower())``. Names without a match are absent.
    """
    from compliance.services.cookie_library_lookup import _strip_wildcards

    # Sorted so the result dict has a reproducible key order.
    wanted = sorted({n.lower() for n in names if n})
    if not wanted:
        return {}

    rows = db.execute(
        text(
            "SELECT lower(cookie_name) AS n, actual_category, "
            "typical_max_age_seconds, vendor_name, purpose_de, purpose_en, "
            "is_pii FROM compliance.cookie_library"
        )
    ).mappings().fetchall()

    # The query has no ORDER BY, and "first row wins" decides which row
    # represents a shared base. Sorting by name makes that choice independent
    # of the database's row order. A name that is a prefix of another sorts
    # first; NOTE(review): this favours a plain entry over its wildcard
    # variant only if _strip_wildcards trims trailing characters — confirm.
    lib_bases: dict[str, dict] = {}
    for row in sorted(rows, key=lambda r: r["n"] or ""):
        raw_name = row["n"]
        if not raw_name:
            # NULL/empty cookie_name cannot be matched; skip it.
            continue
        base = _strip_wildcards(raw_name)
        if base and base not in lib_bases:
            lib_bases[base] = dict(row)

    matched: dict[str, dict] = {}
    for low in wanted:
        hit = _match_lib(low, lib_bases)
        if hit is not None:
            matched[low] = hit
    return matched
_NECESSARY_CATS = { _NECESSARY_CATS = {
"necessary", "notwendig", "essential", "essenziell", "necessary", "notwendig", "essential", "essenziell",
@@ -2,7 +2,27 @@
from __future__ import annotations from __future__ import annotations
from compliance.services.cookie_library_check import analyze_cookies from compliance.services.cookie_library_check import (
_candidate_keys,
_match_lib,
analyze_cookies,
)
def test_candidate_keys_strips_runtime_suffix():
    """The stable base name must be among the candidates of a suffixed cookie."""
    expectations = (
        ("_ga_GTM-ABC123", "_ga"),
        ("AMCVS_1234@AdobeOrg", "amcvs"),
        ("_pk_id.5.7d8f", "_pk_id"),
    )
    for cookie_name, expected_key in expectations:
        assert expected_key in _candidate_keys(cookie_name)
def test_match_lib_prefix_and_exact():
    """Prefix and exact names resolve; unknown and too-short bases do not."""
    library = {
        "_ga": {"actual_category": "statistics"},
        "phpsessid": {"actual_category": "essential"},
    }

    prefix_hit = _match_lib("_ga_GTM-XYZ", library)
    assert prefix_hit["actual_category"] == "statistics"

    exact_hit = _match_lib("PHPSESSID".lower(), library)
    assert exact_hit["actual_category"] == "essential"

    assert _match_lib("totally_unknown_xyz", library) is None

    # A short generic base must NOT over-match.
    short_only = {"id": {"x": 1}}
    assert _match_lib("id_charger", short_only) is None
def test_tracker_declared_necessary_is_high_finding(): def test_tracker_declared_necessary_is_high_finding():