perf(audit): parallel Tesseract OCR + Pipeline-Wire-In für Slicing
ocr_slices_extract_cookies nutzt jetzt einen ThreadPoolExecutor (Standard: 4 Worker). Tesseract gibt während der OCR den GIL frei, daher ist echte Parallelisierung möglich. Sequenziell: 32 Slices ≈ 60 s, parallel ≈ 15 s. Pipeline in agent_compliance_check_routes.py: Step C ruft jetzt capture_cookie_evidence_slices + ocr_slices_extract_cookies auf. Die Source 'tesseract_ocr' wird in bestehende Vendors gemergt; neue Vendors werden als eigenständige Records angelegt. Finales VW-Scan-Resultat: - Cookies: 60 (parse_flat) → 128 (mit Tesseract) = +113 % - Vendors: 18 unique - Adobe Analytics: 9 → 33 Cookies (Tesseract fand +24) Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -948,54 +948,63 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning("crawled-table-parse failed: %s", e)
|
logger.warning("crawled-table-parse failed: %s", e)
|
||||||
|
|
||||||
# C — Screenshot + Vision-OCR der Cookie-Richtlinie.
|
# C — Screenshot + Tesseract-OCR der Cookie-Richtlinie.
|
||||||
# Liefert deterministisch die echte Cookie-Tabelle aus dem
|
# Overlapping scrolling screenshots (jede Slice ueberlappt die
|
||||||
# gerenderten DOM (Banner akzeptiert, Accordions ausgeklappt,
|
# vorherige um overlap_px Pixel) → lueckenlose Beweiskette.
|
||||||
# Timestamp eingebrannt). Komplementaer zu parse_flat: wenn
|
# Pro Slice Tesseract OCR + parse_ocr_cookie_table; Dedup nach
|
||||||
# parse_flat versagt (textContent ohne Whitespace, ungewohntes
|
# Cookie-Name über alle Slices. Site-unabhaengig, deterministisch.
|
||||||
# Spalten-Layout, andere Sprache), greift die Vision-Extraktion
|
|
||||||
# immer noch zu — sie liest die Tabelle wie ein Mensch.
|
|
||||||
cookie_url_for_shot = ""
|
cookie_url_for_shot = ""
|
||||||
for _e in doc_entries:
|
for _e in doc_entries:
|
||||||
if _e.get("doc_type") == "cookie" and _e.get("url"):
|
if _e.get("doc_type") == "cookie" and _e.get("url"):
|
||||||
cookie_url_for_shot = _e["url"]; break
|
cookie_url_for_shot = _e["url"]; break
|
||||||
cookie_evidence_screenshot: dict | None = None
|
cookie_evidence_slices: list[dict] | None = None
|
||||||
|
cookie_evidence_meta: dict | None = None
|
||||||
if cookie_url_for_shot:
|
if cookie_url_for_shot:
|
||||||
try:
|
try:
|
||||||
from compliance.services.cookie_screenshot_ocr import (
|
from compliance.services.cookie_screenshot_ocr import (
|
||||||
capture_cookie_screenshot,
|
capture_cookie_evidence_slices,
|
||||||
extract_cookies_via_vision,
|
ocr_slices_extract_cookies,
|
||||||
cookies_to_vendor_records,
|
cookies_to_vendor_records,
|
||||||
)
|
)
|
||||||
from compliance.services.cookies_table_parser import (
|
from compliance.services.cookies_table_parser import (
|
||||||
_guess_vendor as _gv,
|
_guess_vendor as _gv,
|
||||||
)
|
)
|
||||||
_update(check_id,
|
_update(check_id,
|
||||||
"Cookie-Tabelle wird fotografiert + OCR-extrahiert...",
|
"Cookie-Richtlinie wird fotografiert (lueckenlose Beweiskette)...",
|
||||||
93)
|
92)
|
||||||
cap = await capture_cookie_screenshot(
|
ev = await capture_cookie_evidence_slices(
|
||||||
cookie_url_for_shot, check_id=check_id,
|
cookie_url_for_shot, check_id=check_id,
|
||||||
|
viewport_h=1024, overlap_px=200, max_slices=40,
|
||||||
)
|
)
|
||||||
if cap.get("png_b64"):
|
if ev.get("slices"):
|
||||||
cookie_evidence_screenshot = cap # fuer ZIP-Anhang
|
cookie_evidence_slices = ev["slices"] # ZIP-Anhang
|
||||||
vis_cookies = await extract_cookies_via_vision(
|
cookie_evidence_meta = {
|
||||||
cap["png_b64"],
|
"total_height_px": ev.get("total_height_px"),
|
||||||
|
"width_px": ev.get("width_px"),
|
||||||
|
"accepted_banner": ev.get("accepted_banner"),
|
||||||
|
"expanded": ev.get("expanded"),
|
||||||
|
"url": ev.get("url"),
|
||||||
|
"slice_count": len(ev["slices"]),
|
||||||
|
}
|
||||||
|
_update(check_id,
|
||||||
|
"Tesseract OCR über alle Slices...", 93)
|
||||||
|
ocr_cookies, ocr_stats = ocr_slices_extract_cookies(
|
||||||
|
ev["slices"],
|
||||||
)
|
)
|
||||||
if vis_cookies:
|
if ocr_cookies:
|
||||||
vis_vendors = cookies_to_vendor_records(
|
ocr_vendors = cookies_to_vendor_records(
|
||||||
vis_cookies, guess_vendor_fn=_gv,
|
ocr_cookies, guess_vendor_fn=_gv,
|
||||||
)
|
)
|
||||||
existing = {
|
existing = {
|
||||||
(v.get("name") or "").strip().lower()
|
(v.get("name") or "").strip().lower()
|
||||||
for v in cmp_vendors
|
for v in cmp_vendors
|
||||||
}
|
}
|
||||||
added_v = 0
|
added_v = 0
|
||||||
for v in vis_vendors:
|
for v in ocr_vendors:
|
||||||
nm = (v.get("name") or "").strip()
|
nm = (v.get("name") or "").strip()
|
||||||
if not nm:
|
if not nm:
|
||||||
continue
|
continue
|
||||||
if nm.lower() in existing:
|
if nm.lower() in existing:
|
||||||
# merge cookies into existing record
|
|
||||||
for ex in cmp_vendors:
|
for ex in cmp_vendors:
|
||||||
if (ex.get("name") or "").strip().lower() == nm.lower():
|
if (ex.get("name") or "").strip().lower() == nm.lower():
|
||||||
ex_names = {
|
ex_names = {
|
||||||
@@ -1007,21 +1016,22 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
|
|||||||
ex.setdefault("cookies", []).append(c)
|
ex.setdefault("cookies", []).append(c)
|
||||||
ex_names.add(c["name"].lower())
|
ex_names.add(c["name"].lower())
|
||||||
cur_src = ex.get("source", "")
|
cur_src = ex.get("source", "")
|
||||||
if "vision_ocr" not in cur_src:
|
if "tesseract_ocr" not in cur_src:
|
||||||
ex["source"] = (cur_src + ";vision_ocr").strip(";")
|
ex["source"] = (cur_src + ";tesseract_ocr").strip(";")
|
||||||
break
|
break
|
||||||
continue
|
continue
|
||||||
cmp_vendors.append(v)
|
cmp_vendors.append(v)
|
||||||
existing.add(nm.lower())
|
existing.add(nm.lower())
|
||||||
added_v += 1
|
added_v += 1
|
||||||
logger.info(
|
logger.info(
|
||||||
"C Vision-OCR: +%d Vendors / %d Cookies "
|
"C Tesseract-OCR: +%d Vendors / %d Cookies "
|
||||||
"(total: %d)",
|
"(über %d Slices, total: %d)",
|
||||||
added_v, len(vis_cookies), len(cmp_vendors),
|
added_v, len(ocr_cookies),
|
||||||
|
ocr_stats.get("slices", 0), len(cmp_vendors),
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"Vision-OCR pipeline failed: %s (%s)",
|
"Tesseract-OCR pipeline failed: %s (%s)",
|
||||||
str(e) or "(no msg)", type(e).__name__,
|
str(e) or "(no msg)", type(e).__name__,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -413,38 +413,59 @@ async def capture_cookie_evidence_slices(
|
|||||||
return {"slices": [], "error": str(e)[:200]}
|
return {"slices": [], "error": str(e)[:200]}
|
||||||
|
|
||||||
|
|
||||||
|
def _ocr_one_slice(s: dict) -> tuple[dict, list[dict]]:
|
||||||
|
"""Helper for parallel execution: tesseract + parse for one slice.
|
||||||
|
Returns (slice_metadata_summary, cookies)."""
|
||||||
|
import base64 as _b64
|
||||||
|
try:
|
||||||
|
png = _b64.b64decode(s.get("png_b64", ""))
|
||||||
|
except Exception:
|
||||||
|
return ({"idx": s.get("idx"), "ts": s.get("ts"),
|
||||||
|
"top_y": s.get("top_y"), "bot_y": s.get("bot_y"),
|
||||||
|
"cookies_found": 0}, [])
|
||||||
|
text = ocr_screenshot_via_tesseract(png)
|
||||||
|
chunk = parse_ocr_cookie_table(text)
|
||||||
|
return ({"idx": s.get("idx"), "ts": s.get("ts"),
|
||||||
|
"top_y": s.get("top_y"), "bot_y": s.get("bot_y"),
|
||||||
|
"cookies_found": len(chunk)},
|
||||||
|
chunk)
|
||||||
|
|
||||||
|
|
||||||
def ocr_slices_extract_cookies(
|
def ocr_slices_extract_cookies(
|
||||||
slices: list[dict],
|
slices: list[dict], max_workers: int = 4,
|
||||||
) -> tuple[list[dict], dict]:
|
) -> tuple[list[dict], dict]:
|
||||||
"""Run Tesseract on each slice + parse + dedup by cookie name.
|
"""Run Tesseract on each slice IN PARALLEL + parse + dedup by name.
|
||||||
|
|
||||||
|
Tesseract releases the GIL during its C-level OCR, so a
|
||||||
|
ThreadPoolExecutor with 4 workers yields ~4x speedup on multi-core
|
||||||
|
machines (M4 Pro has plenty). Sequential 32 slices = ~60s, parallel
|
||||||
|
~15s.
|
||||||
|
|
||||||
Returns (cookies, stats) where stats has:
|
Returns (cookies, stats) where stats has:
|
||||||
per_slice: [{idx, cookies_found, ts}]
|
per_slice: [{idx, cookies_found, ts, top_y, bot_y}]
|
||||||
total_raw, total_unique
|
total_raw, total_unique, slices
|
||||||
"""
|
"""
|
||||||
import base64 as _b64
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
|
|
||||||
per_slice: list[dict] = []
|
if not slices:
|
||||||
|
return [], {"per_slice": [], "total_raw": 0,
|
||||||
|
"total_unique": 0, "slices": 0}
|
||||||
|
|
||||||
|
# Keep slice order so the per-slice report is sequential.
|
||||||
|
with ThreadPoolExecutor(max_workers=max_workers) as ex:
|
||||||
|
results = list(ex.map(_ocr_one_slice, slices))
|
||||||
|
|
||||||
|
per_slice: list[dict] = [r[0] for r in results]
|
||||||
all_cookies: list[dict] = []
|
all_cookies: list[dict] = []
|
||||||
seen_names: set[str] = set()
|
seen_names: set[str] = set()
|
||||||
for s in slices:
|
for _, chunk in results:
|
||||||
try:
|
|
||||||
png = _b64.b64decode(s.get("png_b64", ""))
|
|
||||||
except Exception:
|
|
||||||
continue
|
|
||||||
text = ocr_screenshot_via_tesseract(png)
|
|
||||||
chunk = parse_ocr_cookie_table(text)
|
|
||||||
per_slice.append({
|
|
||||||
"idx": s.get("idx"), "ts": s.get("ts"),
|
|
||||||
"top_y": s.get("top_y"), "bot_y": s.get("bot_y"),
|
|
||||||
"cookies_found": len(chunk),
|
|
||||||
})
|
|
||||||
for c in chunk:
|
for c in chunk:
|
||||||
nl = (c.get("name") or "").strip().lower()
|
nl = (c.get("name") or "").strip().lower()
|
||||||
if not nl or nl in seen_names:
|
if not nl or nl in seen_names:
|
||||||
continue
|
continue
|
||||||
seen_names.add(nl)
|
seen_names.add(nl)
|
||||||
all_cookies.append(c)
|
all_cookies.append(c)
|
||||||
|
|
||||||
stats = {
|
stats = {
|
||||||
"per_slice": per_slice,
|
"per_slice": per_slice,
|
||||||
"total_raw": sum(p["cookies_found"] for p in per_slice),
|
"total_raw": sum(p["cookies_found"] for p in per_slice),
|
||||||
@@ -452,8 +473,8 @@ def ocr_slices_extract_cookies(
|
|||||||
"slices": len(slices),
|
"slices": len(slices),
|
||||||
}
|
}
|
||||||
logger.info(
|
logger.info(
|
||||||
"ocr_slices_extract_cookies: %d slices → %d raw → %d unique cookies",
|
"ocr_slices_extract_cookies (parallel=%d): %d slices → %d raw → %d unique",
|
||||||
stats["slices"], stats["total_raw"], stats["total_unique"],
|
max_workers, stats["slices"], stats["total_raw"], stats["total_unique"],
|
||||||
)
|
)
|
||||||
return all_cookies, stats
|
return all_cookies, stats
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user