feat(cookie): präfix-bewusster Library-Match (Runtime-Suffixe)
load_big_library matchte nur EXAKT → nur ~27% der BMW-Cookies trafen die Open-Cookie-DB, weil Per-Instanz-Suffixe abweichen (_ga_GTM-XYZ, AMCVS_###@AdobeOrg, _pk_id.5.7d8). Jetzt: Library einmal laden, Namen entwildcarden, über _candidate_keys (exact + Präfix an Trennzeichen, Mindestlänge 3 gegen Über-Match) matchen. Reuse der bewährten _strip_wildcards-Logik. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -46,23 +46,70 @@ _CONTROL_MAP = {
|
||||
# Type identifiers grouped under "Hinweis" (German for notice/hint).
# NOTE(review): presumably entries of these types are reported as
# informational notices rather than violations — confirm against the code
# that reads this set.
_HINWEIS_TYPES = {"third_country", "eu_alternative"}
|
||||
|
||||
|
||||
# Trennzeichen, an denen ein Runtime-Suffix abgeschnitten werden darf
|
||||
# (z.B. '_ga_GTM-XYZ' → '_ga', 'AMCVS_1234@AdobeOrg' → 'AMCVS').
|
||||
# Matches one separator character at which a cookie name may be truncated:
# "_", "-", ".", ":", "$", "%", "@" or "[".
_SEP_RE = re.compile(r"[_\-.:$%@\[]")
|
||||
|
||||
|
||||
def _candidate_keys(name: str) -> list[str]:
    """Return library-lookup candidates for a cookie name, most specific first.

    The first candidate is the full name after ``_strip_wildcards`` (kept
    whenever it is non-empty, regardless of length). Further candidates are
    progressively shorter prefixes, obtained by cutting at the last separator
    matched by ``_SEP_RE`` and trimming trailing separator characters. This
    catches per-instance runtime suffixes such as GTM container ids,
    ``@AdobeOrg`` or hash ids.

    Prefix generation stops as soon as a prefix would be shorter than three
    characters, so short generic stems are never offered as prefix matches.

    NOTE(review): ``_strip_wildcards`` is defined in another module; it
    presumably also lower-cases the name — confirm there.
    """
    from compliance.services.cookie_library_lookup import _strip_wildcards

    stem = _strip_wildcards(name)
    candidates: list[str] = [stem] if stem else []

    prefix = stem
    while True:
        sep_offsets = [m.start() for m in _SEP_RE.finditer(prefix)]
        if not sep_offsets:
            # No separator left: nothing further to cut off.
            return candidates
        # Cut at the right-most separator and drop any separators left dangling.
        prefix = prefix[:sep_offsets[-1]].rstrip("_-.:$%@")
        if len(prefix) < 3 or prefix in candidates:
            # Too short to be a meaningful stem (or already recorded).
            return candidates
        candidates.append(prefix)
|
||||
|
||||
|
||||
def _match_lib(name: str, lib_bases: dict) -> dict | None:
    """Return the library row for the first candidate key found in *lib_bases*.

    Candidates come from ``_candidate_keys``: the full (de-wildcarded) name
    first, followed by progressively shorter prefixes.

    The minimum length of three characters exists to stop short generic
    *prefixes* from over-matching (e.g. ``id_charger`` must not hit a library
    entry ``id``). It is therefore applied to prefix candidates only. An exact
    hit on the full name is accepted whatever its length; otherwise cookies
    with short names that are listed verbatim in the library could never be
    matched.

    Args:
        name: Cookie name to look up.
        lib_bases: Mapping of de-wildcarded library name -> library row.

    Returns:
        The matching library row, or ``None`` when no candidate is present.
        Performs no database access, so it can be unit-tested with plain dicts.
    """
    for position, key in enumerate(_candidate_keys(name)):
        is_exact = position == 0  # first candidate is always the full name
        if (is_exact or len(key) >= 3) and key in lib_bases:
            return lib_bases[key]
    return None
|
||||
|
||||
|
||||
def load_big_library(db, names: list[str]) -> dict:
    """Prefix-aware lookup of cookie names in the Open Cookie Database.

    Reads ``compliance.cookie_library`` (~2287 cookies) once, reduces every
    library name to its base via ``_strip_wildcards`` and matches each
    requested cookie through ``_match_lib`` (exact name first, then
    runtime-suffix prefixes).

    Args:
        db: Database session/connection exposing ``execute``.
        names: Cookie names as observed; falsy entries are ignored.

    Returns:
        Mapping of the ORIGINAL cookie name (lower-cased) -> library row
        (as a plain dict), so callers can keep using
        ``big_lib.get(name.lower())``. Names without a match are omitted;
        an empty dict is returned when no usable names are given.
    """
    from compliance.services.cookie_library_lookup import _strip_wildcards

    uniq = {(n or "").lower() for n in names if n}
    if not uniq:
        # Nothing to look up: skip the database round trip entirely.
        return {}

    # The whole library is loaded because prefix matching cannot be
    # expressed as an exact ``= ANY(:names)`` filter.
    rows = db.execute(
        text(
            "SELECT lower(cookie_name) AS n, actual_category, "
            "typical_max_age_seconds, vendor_name, purpose_de, purpose_en, "
            "is_pii FROM compliance.cookie_library"
        )
    ).mappings().fetchall()

    # Index rows by de-wildcarded base name; the first row seen for a base wins.
    lib_bases: dict[str, dict] = {}
    for r in rows:
        base = _strip_wildcards(r["n"])
        if base and base not in lib_bases:
            lib_bases[base] = dict(r)

    out: dict[str, dict] = {}
    for low in uniq:
        hit = _match_lib(low, lib_bases)
        if hit:
            out[low] = hit
    return out
|
||||
|
||||
_NECESSARY_CATS = {
|
||||
"necessary", "notwendig", "essential", "essenziell",
|
||||
|
||||
@@ -2,7 +2,27 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from compliance.services.cookie_library_check import analyze_cookies
|
||||
from compliance.services.cookie_library_check import (
|
||||
_candidate_keys,
|
||||
_match_lib,
|
||||
analyze_cookies,
|
||||
)
|
||||
|
||||
|
||||
def test_candidate_keys_strips_runtime_suffix():
    """The stable base name must be among the candidates once the
    per-instance runtime suffix has been cut off."""
    expectations = (
        ("_ga_GTM-ABC123", "_ga"),
        ("AMCVS_1234@AdobeOrg", "amcvs"),
        ("_pk_id.5.7d8f", "_pk_id"),
    )
    for cookie_name, expected_key in expectations:
        assert expected_key in _candidate_keys(cookie_name)
|
||||
|
||||
|
||||
def test_match_lib_prefix_and_exact():
    """Prefix and exact names resolve to their library rows; unknown names
    and too-short prefixes resolve to ``None``."""
    library = {
        "_ga": {"actual_category": "statistics"},
        "phpsessid": {"actual_category": "essential"},
    }

    prefix_hit = _match_lib("_ga_GTM-XYZ", library)
    assert prefix_hit["actual_category"] == "statistics"

    exact_hit = _match_lib("PHPSESSID".lower(), library)
    assert exact_hit["actual_category"] == "essential"

    assert _match_lib("totally_unknown_xyz", library) is None

    # A short generic base must NOT over-match.
    short_only = {"id": {"x": 1}}
    assert _match_lib("id_charger", short_only) is None
|
||||
|
||||
|
||||
def test_tracker_declared_necessary_is_high_finding():
|
||||
|
||||
Reference in New Issue
Block a user