feat(cookie): präfix-bewusster Library-Match (Runtime-Suffixe)
load_big_library matchte nur EXAKT → nur ~27% der BMW-Cookies trafen die Open-Cookie-DB, weil Per-Instanz-Suffixe abweichen (_ga_GTM-XYZ, AMCVS_###@AdobeOrg, _pk_id.5.7d8). Jetzt: Library einmal laden, Namen entwildcarden, über _candidate_keys (exact + Präfix an Trennzeichen, Mindestlänge 3 gegen Über-Match) matchen. Reuse der bewährten _strip_wildcards-Logik. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -46,23 +46,70 @@ _CONTROL_MAP = {
|
|||||||
_HINWEIS_TYPES = {"third_country", "eu_alternative"}
|
_HINWEIS_TYPES = {"third_country", "eu_alternative"}
|
||||||
|
|
||||||
|
|
||||||
|
# Separator characters at which a runtime suffix may be cut off
|
||||||
|
# (e.g. '_ga_GTM-XYZ' → '_ga', 'AMCVS_1234@AdobeOrg' → 'AMCVS').
|
||||||
|
_SEP_RE = re.compile(r"[_\-.:$%@\[]")
|
||||||
|
|
||||||
|
|
||||||
|
def _candidate_keys(name: str) -> list[str]:
    """Build the ordered list of library-lookup candidates for a cookie name.

    The first candidate is the full name as normalised by ``_strip_wildcards``
    (added only if it is non-empty). After it come progressively shorter
    prefixes, each obtained by cutting at the right-most remaining separator
    matched by ``_SEP_RE`` and dropping trailing separator characters. This
    catches per-instance suffixes (GTM container ids, ``@AdobeOrg``, hash ids).

    Prefix generation stops at the first prefix shorter than three characters,
    so short generic prefixes are never offered as candidates.
    """
    from compliance.services.cookie_library_lookup import _strip_wildcards

    full = _strip_wildcards(name)
    candidates: list[str] = [full] if full else []

    # Every prefix is a prefix of `full`, so the separators are located once
    # and walked from right to left instead of re-scanning each prefix.
    sep_positions = [m.start() for m in _SEP_RE.finditer(full)]
    remaining = len(full)
    for pos in reversed(sep_positions):
        if pos >= remaining:
            # This separator was already removed by the previous rstrip.
            continue
        # NOTE(review): "[" is a separator in _SEP_RE but is not part of this
        # strip set — confirm whether that is intended.
        prefix = full[:pos].rstrip("_-.:$%@")
        if len(prefix) < 3:
            break
        candidates.append(prefix)
        remaining = len(prefix)
    return candidates
|
||||||
|
|
||||||
|
|
||||||
|
def _match_lib(name: str, lib_bases: dict) -> dict | None:
    """Return the library row for the first candidate of *name* in *lib_bases*.

    Candidates come from ``_candidate_keys``: the full normalised name first,
    then progressively shorter separator-delimited prefixes.

    The full name is matched without a length restriction, so short cookie
    names (one or two characters) can still hit their exact library entry.
    The minimum length of three applies to prefix candidates only; it exists
    to keep a short generic base such as ``id`` from matching ``id_charger``.

    Args:
        name: Cookie name to look up.
        lib_bases: Mapping of normalised library base name to library row.

    Returns:
        The matching library row, or ``None`` if no candidate is present.
    """
    candidates = _candidate_keys(name)
    if not candidates:
        return None

    # _candidate_keys puts the full (normalised) name first; everything after
    # it is a shortened prefix.
    full_name, *prefixes = candidates
    if full_name in lib_bases:
        return lib_bases[full_name]

    for prefix in prefixes:
        # Defensive re-check of the prefix minimum length.
        if len(prefix) >= 3 and prefix in lib_bases:
            return lib_bases[prefix]
    return None
|
||||||
|
|
||||||
|
|
||||||
def load_big_library(db, names: list[str]) -> dict:
    """Prefix-aware lookup of cookie names in ``compliance.cookie_library``.

    Loads the whole library once, reduces every library name to its base via
    ``_strip_wildcards`` and matches each requested cookie through
    ``_match_lib`` (exact name first, then runtime-suffix prefixes).

    Args:
        db: Database session/connection exposing ``execute``.
        names: Cookie names as observed; falsy entries are ignored.

    Returns:
        Mapping of the ORIGINAL cookie name (lower-cased) to the matched
        library row as a plain dict, so callers can keep using
        ``big_lib.get(name.lower())``. Cookies without a match are omitted.
    """
    from compliance.services.cookie_library_lookup import _strip_wildcards

    uniq = {(n or "").lower() for n in (names or ()) if n}
    if not uniq:
        return {}

    rows = db.execute(
        text(
            "SELECT lower(cookie_name) AS n, actual_category, "
            "typical_max_age_seconds, vendor_name, purpose_de, purpose_en, "
            "is_pii FROM compliance.cookie_library"
        )
    ).mappings().fetchall()

    # The query has no ORDER BY, so resolve base collisions deterministically:
    # entries whose name already equals its base (no wildcard involved) come
    # first, then alphabetical order; the first entry per base wins.
    ranked = []
    for row in rows:
        lib_name = row["n"]
        if not lib_name:
            # lower(NULL) yields NULL; such a row cannot be matched by name.
            continue
        base = _strip_wildcards(lib_name)
        if base:
            ranked.append((base != lib_name, lib_name, base, row))
    ranked.sort(key=lambda item: item[:2])

    lib_bases: dict[str, dict] = {}
    for _is_wildcard, _lib_name, base, row in ranked:
        if base not in lib_bases:
            lib_bases[base] = dict(row)

    # Iterate in sorted order so the result's key order is reproducible.
    out: dict[str, dict] = {}
    for low in sorted(uniq):
        hit = _match_lib(low, lib_bases)
        if hit:
            out[low] = hit
    return out
|
||||||
|
|
||||||
_NECESSARY_CATS = {
|
_NECESSARY_CATS = {
|
||||||
"necessary", "notwendig", "essential", "essenziell",
|
"necessary", "notwendig", "essential", "essenziell",
|
||||||
|
|||||||
@@ -2,7 +2,27 @@
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from compliance.services.cookie_library_check import analyze_cookies
|
from compliance.services.cookie_library_check import (
|
||||||
|
_candidate_keys,
|
||||||
|
_match_lib,
|
||||||
|
analyze_cookies,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_candidate_keys_strips_runtime_suffix():
    """Runtime suffixes are cut off so the vendor base name is a candidate."""
    expectations = (
        ("_ga", "_ga_GTM-ABC123"),
        ("amcvs", "AMCVS_1234@AdobeOrg"),
        ("_pk_id", "_pk_id.5.7d8f"),
    )
    for expected_base, cookie_name in expectations:
        assert expected_base in _candidate_keys(cookie_name)
|
||||||
|
|
||||||
|
|
||||||
|
def test_match_lib_prefix_and_exact():
    """Prefix and exact names hit the library; unknown and short bases do not."""
    lib = {
        "_ga": {"actual_category": "statistics"},
        "phpsessid": {"actual_category": "essential"},
    }

    prefix_hit = _match_lib("_ga_GTM-XYZ", lib)
    assert prefix_hit["actual_category"] == "statistics"

    exact_hit = _match_lib("PHPSESSID".lower(), lib)
    assert exact_hit["actual_category"] == "essential"

    assert _match_lib("totally_unknown_xyz", lib) is None

    # A short generic base must NOT over-match a longer cookie name.
    short_base_lib = {"id": {"x": 1}}
    assert _match_lib("id_charger", short_base_lib) is None
|
||||||
|
|
||||||
|
|
||||||
def test_tracker_declared_necessary_is_high_finding():
|
def test_tracker_declared_necessary_is_high_finding():
|
||||||
|
|||||||
Reference in New Issue
Block a user