feat(cookie): präfix-bewusster Library-Match (Runtime-Suffixe)

load_big_library matchte nur EXAKT → nur ~27% der BMW-Cookies trafen die
Open-Cookie-DB, weil Per-Instanz-Suffixe abweichen (_ga_GTM-XYZ, AMCVS_###@
AdobeOrg, _pk_id.5.7d8). Jetzt: Library einmal laden, Namen entwildcarden,
über _candidate_keys (exact + Präfix an Trennzeichen, Mindestlänge 3 gegen
Über-Match) matchen. Reuse der bewährten _strip_wildcards-Logik.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-06-11 15:24:45 +02:00
parent 0f443b6a9c
commit 9dfdaae8e4
2 changed files with 77 additions and 10 deletions
@@ -46,23 +46,70 @@ _CONTROL_MAP = {
# NOTE(review): presumably the finding types that are reported as notices
# ("Hinweis") only — their use is outside this view; confirm against callers.
_HINWEIS_TYPES = {"third_country", "eu_alternative"}
# Separator characters at which a runtime suffix may be cut off
# (e.g. '_ga_GTM-XYZ' → '_ga', 'AMCVS_1234@AdobeOrg' → 'AMCVS').
_SEP_RE = re.compile(r"[_\-.:$%@\[]")
def _candidate_keys(name: str) -> list[str]:
    """Return the library-match candidates for a cookie name.

    The first candidate is the wildcard-stripped name itself (if non-empty).
    It is followed by progressively shorter prefixes, each obtained by cutting
    the current value at its last separator (see ``_SEP_RE``) and trimming
    trailing separator characters. This catches per-instance suffixes such as
    GTM container ids, ``@AdobeOrg`` or hash ids.

    Prefixes shorter than 3 characters are never produced, so short generic
    names do not over-match; the walk stops at the first such prefix.
    """
    from compliance.services.cookie_library_lookup import _strip_wildcards

    stem = _strip_wildcards(name)
    candidates: list[str] = [stem] if stem else []
    prefix = stem
    while True:
        # Walk the matches to find the right-most separator in the prefix.
        last_sep = None
        for last_sep in _SEP_RE.finditer(prefix):
            pass
        if last_sep is None:
            return candidates
        prefix = prefix[: last_sep.start()].rstrip("_-.:$%@")
        # Too short (or already known): every further prefix would be
        # shorter still, so there is nothing left to add.
        if len(prefix) < 3 or prefix in candidates:
            return candidates
        candidates.append(prefix)
def _match_lib(name: str, lib_bases: dict) -> dict | None:
    """Return the library entry for the first matching candidate key.

    Candidates come from ``_candidate_keys(name)`` in order (full name first,
    then shorter prefixes). Keys shorter than 3 characters are ignored.
    ``lib_bases`` maps wildcard-stripped library names to their rows.
    Returns ``None`` when no candidate is present in ``lib_bases``.
    Pure function, hence easy to test.
    """
    return next(
        (
            lib_bases[key]
            for key in _candidate_keys(name)
            if len(key) >= 3 and key in lib_bases
        ),
        None,
    )
def load_big_library(db, names: list[str]) -> dict:
    """Prefix-aware lookup against the Open Cookie Database.

    Loads ``compliance.cookie_library`` once, reduces every library cookie
    name to its wildcard-free base via ``_strip_wildcards`` and resolves each
    requested cookie through ``_match_lib`` (exact base first, then prefixes
    cut at separators, to absorb runtime suffixes).

    Args:
        db: Database session/connection; only ``db.execute(...)`` is used.
        names: Observed cookie names. Falsy entries are ignored and
            duplicates (case-insensitively) are collapsed.

    Returns:
        Mapping of the lower-cased ORIGINAL cookie name to the matched
        library row as a plain dict, so callers can keep using
        ``big_lib.get(name.lower())``. Names without a match are absent.
    """
    from compliance.services.cookie_library_lookup import _strip_wildcards

    uniq = {n.lower() for n in names if n}
    if not uniq:
        return {}

    # The whole library is fetched on purpose: prefix matching cannot be
    # expressed as an exact ``= ANY(:names)`` filter on the cookie name.
    rows = db.execute(
        text(
            "SELECT lower(cookie_name) AS n, actual_category, "
            "typical_max_age_seconds, vendor_name, purpose_de, purpose_en, "
            "is_pii FROM compliance.cookie_library"
        )
    ).mappings().fetchall()

    # Index the library by wildcard-free base name. The first row seen for a
    # base wins.
    # NOTE(review): the query has no ORDER BY, so which row wins on a base
    # collision follows the database's return order — confirm this is fine.
    lib_bases: dict[str, dict] = {}
    for row in rows:
        base = _strip_wildcards(row["n"])
        if base and base not in lib_bases:
            lib_bases[base] = dict(row)

    # Sorted iteration keeps the key order of the result deterministic.
    out: dict[str, dict] = {}
    for low in sorted(uniq):
        hit = _match_lib(low, lib_bases)
        if hit is not None:
            out[low] = hit
    return out
_NECESSARY_CATS = {
"necessary", "notwendig", "essential", "essenziell",