fix(audit): VW-Cookie-Tabelle — Library-Fallback + Pattern-Extract verstaerkt
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / detect-changes (push) Successful in 11s
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / validate-canonical-controls (push) Successful in 16s
CI / loc-budget (push) Failing after 18s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Has been skipped
CI / test-go (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-backend (push) Successful in 41s
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / detect-changes (push) Successful in 11s
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / validate-canonical-controls (push) Successful in 16s
CI / loc-budget (push) Failing after 18s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Has been skipped
CI / test-go (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-backend (push) Successful in 41s
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
VW-Lehre: cmp_vendors=6 (alle LLM-grob) wurde als ausreichend gewertet, obwohl die echte Cookie-Tabelle 30+ Eintraege hat. Drei Fixes:
1. fallback_vendors_for_run, Skip-Schwelle: existing_vendor_count >= 3 war zu niedrig. Der Lookup wird jetzt nur noch uebersprungen, wenn weniger als 5 Cookies gefunden wurden UND bereits >= 5 Vendors vorhanden sind.
2. Der Library-Fallback wird jetzt bei < 20 cmp_vendors aufgerufen (statt bei < 3). VW-typische Setups (6 LLM-grob + 30 aus der Library) bekommen damit eine vollstaendige Vendor-Liste.
3. _extract_cookie_names_from_doc: Regex-basierter Pattern-Extract aus dem Cookie-Doc-Text selbst — sucht nach Zeilen der Form 'NAME Tracking Cookies (Marketing)' etc. Findet auch Cookie-Namen, die NICHT im Browser-Jar landen (z.B. weil sie erst nach Consent geladen werden). Diese Namen werden zusaetzlich gegen die Library gematcht.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -769,7 +769,10 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
|
|||||||
|
|
||||||
# Cookie-Library-Fallback (P52 Lite): wenn weiterhin wenige
|
# Cookie-Library-Fallback (P52 Lite): wenn weiterhin wenige
|
||||||
# Vendors aber viele after_accept-Cookies, aus Library auflösen.
|
# Vendors aber viele after_accept-Cookies, aus Library auflösen.
|
||||||
if banner_result and len(cmp_vendors) < 3:
|
# VW-Lehre: 6 LLM-Grob-Vendors reichen NICHT — die Library
|
||||||
|
# holt 30+ weitere aus den Cookie-Namen + Cookie-Doc-Pattern.
|
||||||
|
# Schwelle: immer probieren wenn < 20 Vendors.
|
||||||
|
if banner_result and len(cmp_vendors) < 20:
|
||||||
try:
|
try:
|
||||||
from compliance.services.cookie_to_vendor_fallback import (
|
from compliance.services.cookie_to_vendor_fallback import (
|
||||||
fallback_vendors_for_run,
|
fallback_vendors_for_run,
|
||||||
@@ -779,6 +782,7 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
|
|||||||
try:
|
try:
|
||||||
extra = fallback_vendors_for_run(
|
extra = fallback_vendors_for_run(
|
||||||
_fb_db, banner_result, len(cmp_vendors),
|
_fb_db, banner_result, len(cmp_vendors),
|
||||||
|
cookie_doc_text=cookie_text,
|
||||||
)
|
)
|
||||||
if extra:
|
if extra:
|
||||||
existing_names = {(v.get("name") or "").strip().lower()
|
existing_names = {(v.get("name") or "").strip().lower()
|
||||||
|
|||||||
@@ -14,6 +14,7 @@ hat 28 Eintraege. Diese 28 Cookies sind in der Library = ~15-20 Vendors.
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
|
import re
|
||||||
from typing import Iterable
|
from typing import Iterable
|
||||||
|
|
||||||
from sqlalchemy import text
|
from sqlalchemy import text
|
||||||
@@ -80,21 +81,67 @@ def fallback_vendors_for_run(
|
|||||||
db: Session,
|
db: Session,
|
||||||
banner_result: dict | None,
|
banner_result: dict | None,
|
||||||
existing_vendor_count: int,
|
existing_vendor_count: int,
|
||||||
|
cookie_doc_text: str | None = None,
|
||||||
) -> list[dict]:
|
) -> list[dict]:
|
||||||
"""Returns extra vendor records to merge with the run's cmp_vendors.
|
"""Returns extra vendor records to merge with the run's cmp_vendors.
|
||||||
|
|
||||||
Only fires when existing_vendor_count is suspiciously low (< 3) AND
|
VW-Lehre: cmp_vendors=6 (alle LLM-grob) reicht NICHT — die echte
|
||||||
we have enough cookies to look up (>= 5). Otherwise skip.
|
Cookie-Tabelle hat 30+ Eintraege. Wir fuehren den Lookup jetzt auch
|
||||||
|
bei mid-tier-Counts aus, solange after_accept >= 15 Cookies hat
|
||||||
|
ODER der Cookie-Doc-Text Cookie-Tabellen-Signale enthaelt.
|
||||||
"""
|
"""
|
||||||
if existing_vendor_count >= 3:
|
|
||||||
return []
|
|
||||||
names = _collect_cookie_names(banner_result)
|
names = _collect_cookie_names(banner_result)
|
||||||
if len(names) < 5:
|
|
||||||
|
# Erweitere names um Cookie-Namen die im Cookie-Doc-Text als
|
||||||
|
# Tabellen-Eintraege auftauchen (Pattern: NAME gefolgt von
|
||||||
|
# "Tracking Cookies"/"Session Cookies"/"Funktional"/...).
|
||||||
|
if cookie_doc_text:
|
||||||
|
names |= _extract_cookie_names_from_doc(cookie_doc_text)
|
||||||
|
|
||||||
|
# Skip-Bedingungen ueberarbeitet:
|
||||||
|
# - sehr wenige Cookies UND >= 5 Vendors schon vorhanden → skip
|
||||||
|
# - sonst IMMER versuchen
|
||||||
|
if len(names) < 5 and existing_vendor_count >= 5:
|
||||||
return []
|
return []
|
||||||
|
if not names:
|
||||||
|
return []
|
||||||
|
|
||||||
vendors = lookup_vendors_from_library(db, names)
|
vendors = lookup_vendors_from_library(db, names)
|
||||||
if vendors:
|
if vendors:
|
||||||
logger.info(
|
logger.info(
|
||||||
"Cookie-Library-Fallback: %d Vendors aus %d Cookies (vorher %d)",
|
"Cookie-Library-Fallback: %d Vendors aus %d Cookies "
|
||||||
|
"(existing cmp_vendors=%d)",
|
||||||
len(vendors), len(names), existing_vendor_count,
|
len(vendors), len(names), existing_vendor_count,
|
||||||
)
|
)
|
||||||
return vendors
|
return vendors
|
||||||
|
|
||||||
|
|
||||||
|
_TABLE_ROW_RE = re.compile(
|
||||||
|
r"\b([A-Za-z_][A-Za-z0-9_\-\.]{2,40})\s+"
|
||||||
|
r"(?:Tracking Cookies|Session Cookies|Funktional|Marketing|"
|
||||||
|
r"Analytics|Performance|Notwendig|Strictly\s+Necessary|"
|
||||||
|
r"Statistik|Werbung|Targeting|Personalisierung)",
|
||||||
|
re.I,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_cookie_names_from_doc(text: str) -> set[str]:
|
||||||
|
"""Pattern-basiertes Erkennen von Cookie-Tabellen-Zeilen.
|
||||||
|
|
||||||
|
VW-Cookie-Tabelle hat Form:
|
||||||
|
'IDE Tracking Cookies (Marketing) Dieser Cookie ... 13 Monate'
|
||||||
|
Das fangen wir mit einem Cookie-Name-vor-Category-Pattern.
|
||||||
|
"""
|
||||||
|
out: set[str] = set()
|
||||||
|
for m in _TABLE_ROW_RE.finditer(text):
|
||||||
|
name = m.group(1).strip()
|
||||||
|
# Filter offensichtliche Noise (Pronomen, Verben)
|
||||||
|
nl = name.lower()
|
||||||
|
if nl in ("dieser", "diese", "ein", "der", "die", "das",
|
||||||
|
"session", "permanent", "funktional", "notwendig",
|
||||||
|
"marketing", "analytics", "werbung", "anbieter",
|
||||||
|
"google", "facebook", "tracking", "cookie", "cookies"):
|
||||||
|
continue
|
||||||
|
if len(name) >= 3:
|
||||||
|
out.add(name)
|
||||||
|
return out
|
||||||
|
|||||||
Reference in New Issue
Block a user