From 4434e3827bd1f6e0efaeaf17d4257337ef11d7dc Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Thu, 21 May 2026 21:33:58 +0200 Subject: [PATCH] =?UTF-8?q?fix(audit):=20parse=5Fflat=5Fcookie=5Ftext=20?= =?UTF-8?q?=E2=80=94=20Anchor-Pattern=20fuer=20VW-textContent?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit VW Cookie-Doc-textContent verkettet HTML-Tabellen-Zellen OHNE Whitespace: 'Permanent/Protokoll_fbcTracking Cookies (Marketing)...' Neues Pattern hat 2 Anker: * Davor: typisches End-Token einer vorherigen Zelle (Permanent/Protokoll, Session Cookie, Persistent Cookie, TagePersistent, ...) * Danach: Kategorie-Token (Tracking Cookies, Funktionscookie, Marketing, Analytics, Necessary) Dazwischen: Cookie-Name (3-50 Zeichen, alphanum/_/-) VW-Test (snapshot 4a465783): findet jetzt 40 unique Cookie-Namen, aggregiert zu 6 Vendors (Google, DoubleClick, Cloudflare, Borlabs, Meta, Unbekannter Anbieter mit 22 VW-internen Cookies). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../services/cookies_table_parser.py | 55 +++++++++---------- 1 file changed, 27 insertions(+), 28 deletions(-) diff --git a/backend-compliance/compliance/services/cookies_table_parser.py b/backend-compliance/compliance/services/cookies_table_parser.py index 66f0a576..2d47ec21 100644 --- a/backend-compliance/compliance/services/cookies_table_parser.py +++ b/backend-compliance/compliance/services/cookies_table_parser.py @@ -189,35 +189,41 @@ def parse_cookie_table(text: str) -> list[dict]: return out +# textContent-Output von HTML-Tabellen verkettet Zellen ohne Whitespace +# (z.B. VW: "Permanent/Protokoll_fbcTracking Cookies (Marketing)..."). Wir +# erkennen Cookie-Eintraege ueber 2 Anker: +# - Davor: typisches End-Token einer vorherigen Tabellen-Zelle +# (Speicherdauer-Suffix wie Permanent/Protokoll, Session Cookie, ...) +# - Danach: Kategorie-Token (Tracking Cookies, Funktionscookie, ...) 
+# Dazwischen: der Cookie-Name (2-41 Zeichen, alphanum/underscore/dash/dot). _FLAT_ROW_RE = re.compile( - r"\b([A-Za-z_][A-Za-z0-9_\-\.]{1,40})\s+" - r"((?:Tracking|Session|Funktional|Marketing|Analytics|Performance|" - r"Notwendig|Strictly\s+Necessary|Statistik|Personalisierung)" - r"[A-Za-zäöüÄÖÜß \-\(\)]*?Cookies?[^A-Z]{0,400}?)" - r"(?:(\d+)\s*(Sekunde|Minute|Stunde|Tag|Woche|Monat|Jahr|day|month|year)|" - r"\b(Session|Permanent)\b)", - re.I | re.S, + r"(?:Permanent/Protokoll|Session Cookie|Persistent Cookie|" + r"TagePersistent|TageSitzungs-Cookie|TageSession Cookie|" + r"MinutenPersistent|MinutenSession Cookie|StundenPersistent|" + r"MonatePersistent|JahrePersistent)" + r"([A-Za-z_][A-Za-z0-9_\-\.]{1,40}?)" + r"(?=Tracking Cookies|Session Cookies|Funktionscookie|Funktional|" + r"Marketing|Analytics|Necessary)", + re.I, ) def parse_flat_cookie_text(text: str) -> list[dict]: """Variante fuer Sites wie VW die ihre Cookie-Tabelle als flachen - Text liefern (Cookie-Name + Kategorie + Beschreibung + Dauer in - einem Block hintereinander, ohne klare Trenner). + Text liefern (textContent-Output ohne Whitespace zwischen Zellen). - Regex sucht nach 'NAME [Tracking|Session|Funktional...] Cookies - ... [13 Monate|Session|Permanent]' und behandelt jeden Match als - eine Tabellen-Zeile. + Regex anchored auf vorherige Speicherdauer-Suffixe + folgende + Kategorie-Token → extrahiert den Cookie-Namen dazwischen. 
""" if not text or len(text) < 500: return [] - matches = list(_FLAT_ROW_RE.finditer(text)) - if len(matches) < 3: + names = _FLAT_ROW_RE.findall(text) + if len(names) < 3: return [] by_vendor: dict[str, dict] = {} seen_names: set[str] = set() - for m in matches: - name = m.group(1).strip() + for raw in names: + name = raw.strip() nl = name.lower() if nl in seen_names: continue @@ -226,30 +232,23 @@ def parse_flat_cookie_text(text: str) -> list[dict]: "marketing", "analytics", "werbung", "anbieter", "tracking", "cookie", "cookies", "und", "von", "einer", "ist", "alle", "noch", "auch", "name", - "art", "zweck", "dauer"): + "art", "zweck", "dauer", "test"): continue if len(name) < 3 or len(name) > 60: continue seen_names.add(nl) - category = _normalize_category(m.group(2) or "") - persistence = "" - if m.group(3): - persistence = f"{m.group(3)} {m.group(4)}" - elif m.group(5): - persistence = m.group(5) - purpose = (m.group(2) or "").strip()[:300] vendor = _guess_vendor(name) or "Unbekannter Anbieter" entry = by_vendor.setdefault(vendor, { "name": vendor, "country": "", - "purpose": purpose, "category": category, + "purpose": "", "category": "", "opt_out_url": "", "privacy_policy_url": "", - "persistence": persistence, + "persistence": "", "cookies": [], "source": "flat_pattern", }) entry["cookies"].append({ - "name": name, "purpose": purpose[:200], - "expiry": persistence, "is_third_party": True, + "name": name, "purpose": "", + "expiry": "", "is_third_party": True, }) out = list(by_vendor.values()) logger.info("parse_flat_cookie_text: %d vendors / %d cookies",