diff --git a/backend-compliance/compliance/services/cookies_table_parser.py b/backend-compliance/compliance/services/cookies_table_parser.py index 66f0a576..2d47ec21 100644 --- a/backend-compliance/compliance/services/cookies_table_parser.py +++ b/backend-compliance/compliance/services/cookies_table_parser.py @@ -189,35 +189,41 @@ def parse_cookie_table(text: str) -> list[dict]: return out +# textContent-Output von HTML-Tabellen verkettet Zellen ohne Whitespace +# (z.B. VW: "Permanent/Protokoll_fbcTracking Cookies (Marketing)..."). Wir +# erkennen Cookie-Eintraege ueber 2 Anker: +# - Davor: typisches End-Token einer vorherigen Tabellen-Zelle +# (Speicherdauer-Suffix wie Permanent/Protokoll, Session Cookie, ...) +# - Danach: Kategorie-Token (Tracking Cookies, Funktionscookie, ...) +# Dazwischen: der Cookie-Name (3-50 Zeichen, alphanum/underscore/dash). _FLAT_ROW_RE = re.compile( - r"\b([A-Za-z_][A-Za-z0-9_\-\.]{1,40})\s+" - r"((?:Tracking|Session|Funktional|Marketing|Analytics|Performance|" - r"Notwendig|Strictly\s+Necessary|Statistik|Personalisierung)" - r"[A-Za-zäöüÄÖÜß \-\(\)]*?Cookies?[^A-Z]{0,400}?)" - r"(?:(\d+)\s*(Sekunde|Minute|Stunde|Tag|Woche|Monat|Jahr|day|month|year)|" - r"\b(Session|Permanent)\b)", - re.I | re.S, + r"(?:Permanent/Protokoll|Session Cookie|Persistent Cookie|" + r"TagePersistent|TageSitzungs-Cookie|TageSession Cookie|" + r"MinutenPersistent|MinutenSession Cookie|StundenPersistent|" + r"MonatePersistent|JahrePersistent)" + r"([A-Za-z_][A-Za-z0-9_\-\.]{1,40}?)" + r"(?=Tracking Cookies|Session Cookies|Funktionscookie|Funktional|" + r"Marketing|Analytics|Necessary)", + re.I, ) def parse_flat_cookie_text(text: str) -> list[dict]: """Variante fuer Sites wie VW die ihre Cookie-Tabelle als flachen - Text liefern (Cookie-Name + Kategorie + Beschreibung + Dauer in - einem Block hintereinander, ohne klare Trenner). + Text liefern (textContent-Output ohne Whitespace zwischen Zellen). - Regex sucht nach 'NAME [Tracking|Session|Funktional...] Cookies - ... [13 Monate|Session|Permanent]' und behandelt jeden Match als - eine Tabellen-Zeile. + Regex anchored auf vorherige Speicherdauer-Suffixe + folgende + Kategorie-Token → extrahiert den Cookie-Namen dazwischen. """ if not text or len(text) < 500: return [] - matches = list(_FLAT_ROW_RE.finditer(text)) - if len(matches) < 3: + names = _FLAT_ROW_RE.findall(text) + if len(names) < 3: return [] by_vendor: dict[str, dict] = {} seen_names: set[str] = set() - for m in matches: - name = m.group(1).strip() + for raw in names: + name = raw.strip() nl = name.lower() if nl in seen_names: continue @@ -226,30 +232,23 @@ def parse_flat_cookie_text(text: str) -> list[dict]: "marketing", "analytics", "werbung", "anbieter", "tracking", "cookie", "cookies", "und", "von", "einer", "ist", "alle", "noch", "auch", "name", - "art", "zweck", "dauer"): + "art", "zweck", "dauer", "test"): continue if len(name) < 3 or len(name) > 60: continue seen_names.add(nl) - category = _normalize_category(m.group(2) or "") - persistence = "" - if m.group(3): - persistence = f"{m.group(3)} {m.group(4)}" - elif m.group(5): - persistence = m.group(5) - purpose = (m.group(2) or "").strip()[:300] vendor = _guess_vendor(name) or "Unbekannter Anbieter" entry = by_vendor.setdefault(vendor, { "name": vendor, "country": "", - "purpose": purpose, "category": category, + "purpose": "", "category": "", "opt_out_url": "", "privacy_policy_url": "", - "persistence": persistence, + "persistence": "", "cookies": [], "source": "flat_pattern", }) entry["cookies"].append({ - "name": name, "purpose": purpose[:200], - "expiry": persistence, "is_third_party": True, + "name": name, "purpose": "", + "expiry": "", "is_third_party": True, }) out = list(by_vendor.values()) logger.info("parse_flat_cookie_text: %d vendors / %d cookies",