fix(audit): parse_flat_cookie_text — Anchor-Pattern fuer VW-textContent
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / detect-changes (push) Successful in 10s
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / validate-canonical-controls (push) Successful in 17s
CI / loc-budget (push) Failing after 17s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Has been skipped
CI / test-go (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-backend (push) Successful in 40s
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / detect-changes (push) Successful in 10s
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / validate-canonical-controls (push) Successful in 17s
CI / loc-budget (push) Failing after 17s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Has been skipped
CI / test-go (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-backend (push) Successful in 40s
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
VW Cookie-Doc-textContent verkettet HTML-Tabellen-Zellen OHNE Whitespace:
'Permanent/Protokoll_fbcTracking Cookies (Marketing)...'

Neues Pattern hat 2 Anker:
* Davor: typisches End-Token einer vorherigen Zelle (Permanent/Protokoll, Session Cookie, Persistent Cookie, TagePersistent, ...)
* Danach: Kategorie-Token (Tracking Cookies, Funktionscookie, Marketing, Analytics, Necessary)

Dazwischen: Cookie-Name (laut Regex 2-41 Zeichen, alphanum/_/-/.; Namen unter 3 Zeichen werden anschliessend verworfen)

VW-Test (snapshot 4a465783): findet jetzt 40 unique Cookie-Namen, aggregiert zu 6 Vendors (Google, DoubleClick, Cloudflare, Borlabs, Meta, Unbekannter Anbieter mit 22 VW-internen Cookies).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -189,35 +189,41 @@ def parse_cookie_table(text: str) -> list[dict]:
|
||||
return out
|
||||
|
||||
|
||||
# textContent-Output von HTML-Tabellen verkettet Zellen ohne Whitespace
|
||||
# (z.B. VW: "Permanent/Protokoll_fbcTracking Cookies (Marketing)..."). Wir
|
||||
# erkennen Cookie-Eintraege ueber 2 Anker:
|
||||
# - Davor: typisches End-Token einer vorherigen Tabellen-Zelle
|
||||
# (Speicherdauer-Suffix wie Permanent/Protokoll, Session Cookie, ...)
|
||||
# - Danach: Kategorie-Token (Tracking Cookies, Funktionscookie, ...)
|
||||
# Dazwischen: der Cookie-Name (laut Regex 2-41 Zeichen, alphanum/underscore/
# dash/dot; Namen unter 3 Zeichen werden im Parser anschliessend verworfen).
|
||||
_FLAT_ROW_RE = re.compile(
|
||||
r"\b([A-Za-z_][A-Za-z0-9_\-\.]{1,40})\s+"
|
||||
r"((?:Tracking|Session|Funktional|Marketing|Analytics|Performance|"
|
||||
r"Notwendig|Strictly\s+Necessary|Statistik|Personalisierung)"
|
||||
r"[A-Za-zäöüÄÖÜß \-\(\)]*?Cookies?[^A-Z]{0,400}?)"
|
||||
r"(?:(\d+)\s*(Sekunde|Minute|Stunde|Tag|Woche|Monat|Jahr|day|month|year)|"
|
||||
r"\b(Session|Permanent)\b)",
|
||||
re.I | re.S,
|
||||
r"(?:Permanent/Protokoll|Session Cookie|Persistent Cookie|"
|
||||
r"TagePersistent|TageSitzungs-Cookie|TageSession Cookie|"
|
||||
r"MinutenPersistent|MinutenSession Cookie|StundenPersistent|"
|
||||
r"MonatePersistent|JahrePersistent)"
|
||||
r"([A-Za-z_][A-Za-z0-9_\-\.]{1,40}?)"
|
||||
r"(?=Tracking Cookies|Session Cookies|Funktionscookie|Funktional|"
|
||||
r"Marketing|Analytics|Necessary)",
|
||||
re.I,
|
||||
)
|
||||
|
||||
|
||||
def parse_flat_cookie_text(text: str) -> list[dict]:
|
||||
"""Variante fuer Sites wie VW die ihre Cookie-Tabelle als flachen
|
||||
Text liefern (Cookie-Name + Kategorie + Beschreibung + Dauer in
|
||||
einem Block hintereinander, ohne klare Trenner).
|
||||
Text liefern (textContent-Output ohne Whitespace zwischen Zellen).
|
||||
|
||||
Regex sucht nach 'NAME [Tracking|Session|Funktional...] Cookies
|
||||
... [13 Monate|Session|Permanent]' und behandelt jeden Match als
|
||||
eine Tabellen-Zeile.
|
||||
Regex anchored auf vorherige Speicherdauer-Suffixe + folgende
|
||||
Kategorie-Token → extrahiert den Cookie-Namen dazwischen.
|
||||
"""
|
||||
if not text or len(text) < 500:
|
||||
return []
|
||||
matches = list(_FLAT_ROW_RE.finditer(text))
|
||||
if len(matches) < 3:
|
||||
names = _FLAT_ROW_RE.findall(text)
|
||||
if len(names) < 3:
|
||||
return []
|
||||
by_vendor: dict[str, dict] = {}
|
||||
seen_names: set[str] = set()
|
||||
for m in matches:
|
||||
name = m.group(1).strip()
|
||||
for raw in names:
|
||||
name = raw.strip()
|
||||
nl = name.lower()
|
||||
if nl in seen_names:
|
||||
continue
|
||||
@@ -226,30 +232,23 @@ def parse_flat_cookie_text(text: str) -> list[dict]:
|
||||
"marketing", "analytics", "werbung", "anbieter",
|
||||
"tracking", "cookie", "cookies", "und", "von",
|
||||
"einer", "ist", "alle", "noch", "auch", "name",
|
||||
"art", "zweck", "dauer"):
|
||||
"art", "zweck", "dauer", "test"):
|
||||
continue
|
||||
if len(name) < 3 or len(name) > 60:
|
||||
continue
|
||||
seen_names.add(nl)
|
||||
category = _normalize_category(m.group(2) or "")
|
||||
persistence = ""
|
||||
if m.group(3):
|
||||
persistence = f"{m.group(3)} {m.group(4)}"
|
||||
elif m.group(5):
|
||||
persistence = m.group(5)
|
||||
purpose = (m.group(2) or "").strip()[:300]
|
||||
vendor = _guess_vendor(name) or "Unbekannter Anbieter"
|
||||
entry = by_vendor.setdefault(vendor, {
|
||||
"name": vendor, "country": "",
|
||||
"purpose": purpose, "category": category,
|
||||
"purpose": "", "category": "",
|
||||
"opt_out_url": "", "privacy_policy_url": "",
|
||||
"persistence": persistence,
|
||||
"persistence": "",
|
||||
"cookies": [],
|
||||
"source": "flat_pattern",
|
||||
})
|
||||
entry["cookies"].append({
|
||||
"name": name, "purpose": purpose[:200],
|
||||
"expiry": persistence, "is_third_party": True,
|
||||
"name": name, "purpose": "",
|
||||
"expiry": "", "is_third_party": True,
|
||||
})
|
||||
out = list(by_vendor.values())
|
||||
logger.info("parse_flat_cookie_text: %d vendors / %d cookies",
|
||||
|
||||
Reference in New Issue
Block a user