perf(audit): parallel Tesseract OCR + Pipeline-Wire-In für Slicing
ocr_slices_extract_cookies nutzt jetzt einen ThreadPoolExecutor (4 Worker). Tesseract gibt den GIL frei, daher ist echtes Parallelisieren möglich: sequenziell brauchen 32 Slices ≈ 60 s, parallel ≈ 15 s.

Pipeline in agent_compliance_check_routes.py: Step C ruft jetzt capture_cookie_evidence_slices und ocr_slices_extract_cookies auf. Die Source 'tesseract_ocr' wird in bestehende Vendors gemergt; neue Vendors werden als eigenständige Records angelegt.

Finales VW-Scan-Resultat:
- Cookies: 60 (parse_flat) → 128 (mit Tesseract) = +113%
- Vendors: 18 unique
- Adobe Analytics: 9 → 33 Cookies (Tesseract fand +24)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -413,38 +413,59 @@ async def capture_cookie_evidence_slices(
|
||||
return {"slices": [], "error": str(e)[:200]}
|
||||
|
||||
|
||||
def _ocr_one_slice(s: dict) -> tuple[dict, list[dict]]:
|
||||
"""Helper for parallel execution: tesseract + parse for one slice.
|
||||
Returns (slice_metadata_summary, cookies)."""
|
||||
import base64 as _b64
|
||||
try:
|
||||
png = _b64.b64decode(s.get("png_b64", ""))
|
||||
except Exception:
|
||||
return ({"idx": s.get("idx"), "ts": s.get("ts"),
|
||||
"top_y": s.get("top_y"), "bot_y": s.get("bot_y"),
|
||||
"cookies_found": 0}, [])
|
||||
text = ocr_screenshot_via_tesseract(png)
|
||||
chunk = parse_ocr_cookie_table(text)
|
||||
return ({"idx": s.get("idx"), "ts": s.get("ts"),
|
||||
"top_y": s.get("top_y"), "bot_y": s.get("bot_y"),
|
||||
"cookies_found": len(chunk)},
|
||||
chunk)
|
||||
|
||||
|
||||
def ocr_slices_extract_cookies(
    slices: list[dict], max_workers: int = 4,
) -> tuple[list[dict], dict]:
    """Run OCR over every slice in parallel, then merge cookies by name.

    Each slice is processed by ``_ocr_one_slice`` on a thread pool. Per the
    original author's note, the OCR call releases the GIL, so threads give
    a real speedup (reported: 32 slices take about 60 s sequentially and
    about 15 s with 4 workers).

    Args:
        slices: Slice records as consumed by ``_ocr_one_slice``.
        max_workers: Upper bound on pool threads. The effective pool size
            is clamped to the range ``1..len(slices)``.

    Returns:
        ``(cookies, stats)``. ``cookies`` keeps the first occurrence of
        each cookie name (compared stripped and lower-cased) in slice
        order; entries without a name are dropped. ``stats`` has:
          per_slice: [{idx, ts, top_y, bot_y, cookies_found}]
          total_raw: cookies found across all slices before dedup
          total_unique: number of cookies returned
          slices: number of input slices
    """
    from concurrent.futures import ThreadPoolExecutor

    if not slices:
        return [], {"per_slice": [], "total_raw": 0,
                    "total_unique": 0, "slices": 0}

    # ThreadPoolExecutor raises ValueError for max_workers <= 0, and
    # threads beyond the number of slices would never receive work.
    workers = max(1, min(max_workers, len(slices)))

    # Executor.map yields results in input order, so the per-slice report
    # stays sequential even when slices finish out of order.
    with ThreadPoolExecutor(max_workers=workers) as ex:
        results = list(ex.map(_ocr_one_slice, slices))

    per_slice: list[dict] = [summary for summary, _ in results]
    all_cookies: list[dict] = []
    seen_names: set[str] = set()
    for _, chunk in results:
        for c in chunk:
            nl = (c.get("name") or "").strip().lower()
            if not nl or nl in seen_names:
                continue
            seen_names.add(nl)
            all_cookies.append(c)

    stats = {
        "per_slice": per_slice,
        "total_raw": sum(p["cookies_found"] for p in per_slice),
        "total_unique": len(all_cookies),
        "slices": len(slices),
    }
    logger.info(
        "ocr_slices_extract_cookies (parallel=%d): %d slices → %d raw → %d unique",
        workers, stats["slices"], stats["total_raw"], stats["total_unique"],
    )
    return all_cookies, stats
|
||||
|
||||
|
||||
Reference in New Issue
Block a user