feat(audit): Screenshot+Tesseract-OCR Cookie-Extract als Vendor-Quelle C
Statt fragiler text-Regex + LLM-Cascade-Workarounds: deterministische
Pipeline. consent-tester macht Full-Page-Screenshot der Cookie-Richtlinie
(akzeptiert Banner, klappt Accordions, brennt Timestamp ein). Das Backend
laesst Tesseract-OCR (deu, PSM 4) darueber laufen; ein anchor-basierter
Parser extrahiert {name, category, purpose, duration, type} pro Cookie.
VW-Smoke-Test:
- Vorher (parse_flat): 60 cookies / 16 vendors
- Jetzt (Tesseract): 79 cookies / 14 vendor-records (~79% GT-coverage)
Architektur:
- consent-tester: page_screenshot.py + /capture-evidence Endpoint
- backend: cookie_screenshot_ocr.py mit Tesseract-pipeline
- pipeline: nach parse_flat als komplementaere Stufe C
- Dockerfile: tesseract-ocr + deutsches Sprachpaket
- requirements: pytesseract
KEINE Textkorrektur auf Cookie-Namen (awsalb bleibt awsalb).
Der Timestamp im Screenshot dient als juristischer Beweis dafuer, was wir
zum Scan-Zeitpunkt tatsaechlich auf der Site gesehen haben.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -948,6 +948,83 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
|
||||
except Exception as e:
|
||||
logger.warning("crawled-table-parse failed: %s", e)
|
||||
|
||||
# C — Screenshot + Vision-OCR der Cookie-Richtlinie.
|
||||
# Liefert deterministisch die echte Cookie-Tabelle aus dem
|
||||
# gerenderten DOM (Banner akzeptiert, Accordions ausgeklappt,
|
||||
# Timestamp eingebrannt). Komplementaer zu parse_flat: wenn
|
||||
# parse_flat versagt (textContent ohne Whitespace, ungewohntes
|
||||
# Spalten-Layout, andere Sprache), greift die Vision-Extraktion
|
||||
# immer noch zu — sie liest die Tabelle wie ein Mensch.
|
||||
cookie_url_for_shot = ""
|
||||
for _e in doc_entries:
|
||||
if _e.get("doc_type") == "cookie" and _e.get("url"):
|
||||
cookie_url_for_shot = _e["url"]; break
|
||||
cookie_evidence_screenshot: dict | None = None
|
||||
if cookie_url_for_shot:
|
||||
try:
|
||||
from compliance.services.cookie_screenshot_ocr import (
|
||||
capture_cookie_screenshot,
|
||||
extract_cookies_via_vision,
|
||||
cookies_to_vendor_records,
|
||||
)
|
||||
from compliance.services.cookies_table_parser import (
|
||||
_guess_vendor as _gv,
|
||||
)
|
||||
_update(check_id,
|
||||
"Cookie-Tabelle wird fotografiert + OCR-extrahiert...",
|
||||
93)
|
||||
cap = await capture_cookie_screenshot(
|
||||
cookie_url_for_shot, check_id=check_id,
|
||||
)
|
||||
if cap.get("png_b64"):
|
||||
cookie_evidence_screenshot = cap # fuer ZIP-Anhang
|
||||
vis_cookies = await extract_cookies_via_vision(
|
||||
cap["png_b64"],
|
||||
)
|
||||
if vis_cookies:
|
||||
vis_vendors = cookies_to_vendor_records(
|
||||
vis_cookies, guess_vendor_fn=_gv,
|
||||
)
|
||||
existing = {
|
||||
(v.get("name") or "").strip().lower()
|
||||
for v in cmp_vendors
|
||||
}
|
||||
added_v = 0
|
||||
for v in vis_vendors:
|
||||
nm = (v.get("name") or "").strip()
|
||||
if not nm:
|
||||
continue
|
||||
if nm.lower() in existing:
|
||||
# merge cookies into existing record
|
||||
for ex in cmp_vendors:
|
||||
if (ex.get("name") or "").strip().lower() == nm.lower():
|
||||
ex_names = {
|
||||
(c.get("name") or "").lower()
|
||||
for c in (ex.get("cookies") or [])
|
||||
}
|
||||
for c in (v.get("cookies") or []):
|
||||
if c["name"].lower() not in ex_names:
|
||||
ex.setdefault("cookies", []).append(c)
|
||||
ex_names.add(c["name"].lower())
|
||||
cur_src = ex.get("source", "")
|
||||
if "vision_ocr" not in cur_src:
|
||||
ex["source"] = (cur_src + ";vision_ocr").strip(";")
|
||||
break
|
||||
continue
|
||||
cmp_vendors.append(v)
|
||||
existing.add(nm.lower())
|
||||
added_v += 1
|
||||
logger.info(
|
||||
"C Vision-OCR: +%d Vendors / %d Cookies "
|
||||
"(total: %d)",
|
||||
added_v, len(vis_cookies), len(cmp_vendors),
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
"Vision-OCR pipeline failed: %s (%s)",
|
||||
str(e) or "(no msg)", type(e).__name__,
|
||||
)
|
||||
|
||||
# User-pasted Cookie-Tabelle (deterministisch, kein LLM):
|
||||
# die hat IMMER Vorrang weil 100% genau.
|
||||
if pasted_table_vendors:
|
||||
|
||||
Reference in New Issue
Block a user