diff --git a/backend-compliance/compliance/services/cookie_library_check.py b/backend-compliance/compliance/services/cookie_library_check.py
index bba0a359..b097baf6 100644
--- a/backend-compliance/compliance/services/cookie_library_check.py
+++ b/backend-compliance/compliance/services/cookie_library_check.py
@@ -46,23 +46,87 @@ _CONTROL_MAP = {
 _HINWEIS_TYPES = {"third_country", "eu_alternative"}
 
 
+# Separators at which a per-instance runtime suffix may be cut off
+# (e.g. '_ga_GTM-XYZ' → '_ga', 'AMCVS_1234@AdobeOrg' → 'AMCVS').
+_SEP_RE = re.compile(r"[_\-.:$%@\[]")
+
+
+def _candidate_keys(name: str) -> list[str]:
+    """Return library match candidates for a cookie name, most specific first.
+
+    The first candidate is the wildcard-stripped name itself; further
+    candidates are successively shorter prefixes cut at the last separator
+    (see _SEP_RE). This catches per-instance suffixes (GTM container,
+    @AdobeOrg, hash IDs). Prefixes shorter than 3 characters are never
+    produced, so short generic names are not over-matched.
+    """
+    from compliance.services.cookie_library_lookup import _strip_wildcards
+    base = _strip_wildcards(name)
+    keys: list[str] = []
+    if base:
+        keys.append(base)
+    cur = base
+    while True:
+        seps = list(_SEP_RE.finditer(cur))
+        if not seps:
+            break
+        cur = cur[:seps[-1].start()].rstrip("_-.:$%@")
+        if len(cur) >= 3 and cur not in keys:
+            keys.append(cur)
+        else:
+            break
+    return keys
+
+
+def _match_lib(name: str, lib_bases: dict) -> dict | None:
+    """Return the library row for the first candidate key found in lib_bases.
+
+    Candidates come from _candidate_keys, so the full (wildcard-stripped)
+    name is tried first and may be shorter than 3 characters; the
+    minimum length applies to derived prefixes only and is enforced
+    there. Pure and testable.
+    """
+    for k in _candidate_keys(name):
+        if k in lib_bases:
+            return lib_bases[k]
+    return None
+
+
 def load_big_library(db, names: list[str]) -> dict:
-    """Batch-Lookup der grossen Open-Cookie-Database (compliance.cookie_library,
-    ~2287 Cookies) fuer die gegebenen Namen. Breite Abdeckung: Kategorie,
-    Retention, Vendor."""
-    uniq = sorted({(n or "").lower() for n in names if n})
+    """Prefix-aware lookup against the Open Cookie Database
+    (compliance.cookie_library, ~2287 rows).
+
+    Loads the library once, normalises each library name with
+    _strip_wildcards and matches every requested cookie via _candidate_keys
+    (exact name first, then runtime-suffix prefixes). The result maps the
+    ORIGINAL cookie name (lowercased) to its library row, so callers can
+    keep using big_lib.get(name.lower()).
+    """
+    from compliance.services.cookie_library_lookup import _strip_wildcards
+    uniq = {(n or "").lower() for n in names if n}
     if not uniq:
         return {}
     rows = db.execute(
         text(
             "SELECT lower(cookie_name) AS n, actual_category, "
             "typical_max_age_seconds, vendor_name, purpose_de, purpose_en, "
             "is_pii FROM compliance.cookie_library "
-            "WHERE lower(cookie_name) = ANY(:names)"
-        ),
-        {"names": uniq},
+            "ORDER BY n"
+        )
     ).mappings().fetchall()
-    return {r["n"]: dict(r) for r in rows}
+    # Rows are ordered by name, so when several library names reduce to
+    # the same base the winner is deterministic (first one wins).
+    lib_bases: dict[str, dict] = {}
+    for r in rows:
+        base = _strip_wildcards(r["n"])
+        if base and base not in lib_bases:
+            lib_bases[base] = dict(r)
+    out: dict[str, dict] = {}
+    for low in uniq:
+        hit = _match_lib(low, lib_bases)
+        if hit:
+            out[low] = hit
+    return out
 
 _NECESSARY_CATS = {
     "necessary", "notwendig", "essential", "essenziell",
diff --git a/backend-compliance/compliance/tests/test_cookie_library_check.py b/backend-compliance/compliance/tests/test_cookie_library_check.py
index f58b9e48..c5cfa562 100644
--- a/backend-compliance/compliance/tests/test_cookie_library_check.py
+++ b/backend-compliance/compliance/tests/test_cookie_library_check.py
@@ -2,7 +2,27 @@
 
 from __future__ import annotations
 
-from compliance.services.cookie_library_check import analyze_cookies
+from compliance.services.cookie_library_check import (
+    _candidate_keys,
+    _match_lib,
+    analyze_cookies,
+)
+
+
+def test_candidate_keys_strips_runtime_suffix():
+    assert "_ga" in _candidate_keys("_ga_GTM-ABC123")
+    assert "amcvs" in _candidate_keys("AMCVS_1234@AdobeOrg")
+    assert "_pk_id" in _candidate_keys("_pk_id.5.7d8f")
+
+
+def test_match_lib_prefix_and_exact():
+    lib = {"_ga": {"actual_category": "statistics"},
+           "phpsessid": {"actual_category": "essential"}}
+    assert _match_lib("_ga_GTM-XYZ", lib)["actual_category"] == "statistics"
+    assert _match_lib("PHPSESSID".lower(), lib)["actual_category"] == "essential"
+    assert _match_lib("totally_unknown_xyz", lib) is None
+    # a short generic base must NOT over-match
+    assert _match_lib("id_charger", {"id": {"x": 1}}) is None
 
 
 def test_tracker_declared_necessary_is_high_finding():