refactor: split cookie_screenshot_ocr.py (642 → 290 + 353 LOC)
CI / detect-changes (push) Successful in 7s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / build-sha-integrity (push) Failing after 4s
CI / validate-canonical-controls (push) Successful in 11s
CI / loc-budget (push) Failing after 14s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Successful in 2m19s
CI / test-go (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-backend (push) Successful in 29s
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped

CI hard-cap 500 LOC. cookie_screenshot_ocr.py war auf 642 gewachsen,
also gesplittet:

  - cookie_screenshot_ocr_engines.py (353 LOC, NEU)
    OCR-Engine-Funktionen: _slice_screenshot, Vision-LLM (qwen2.5vl),
    PaddleOCR, Tesseract, parse_ocr_cookie_table, parse_vision_response,
    Konstanten VISION_MODEL/OLLAMA_URL/VISION_PROMPT.

  - cookie_screenshot_ocr.py (290 LOC, REWRITE)
    Orchestration: capture_cookie_evidence_slices, _ocr_one_slice,
    ocr_slices_extract_cookies, capture_cookie_screenshot,
    extract_cookies_via_vision, cookies_to_vendor_records.
    Re-Exports der Engine-Funktionen für Backward-Kompat.

Einziger externer Importer (_phase_d1_vendors_raw.py) braucht keinen
Code-Change — Public-API stabil.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-06-06 23:35:33 +02:00
parent ff796fb480
commit 02879a2c3a
9 changed files with 1790 additions and 384 deletions
@@ -0,0 +1,353 @@
"""OCR-Engine-Funktionen für cookie_screenshot_ocr (Phase-1 Split).
Aus dem Hauptmodul ausgelagert, damit es unter dem 500-LOC-Hard-Cap bleibt:
- PIL-basiertes _slice_screenshot (zerteilt PNG in subimages)
- Vision-LLM-OCR (ollama qwen2.5vl:32b)
- PaddleOCR fallback
- Tesseract OCR (Hauptpfad)
- Anchor-basierter Parser parse_ocr_cookie_table
- parse_vision_response (JSON-Toleranz für Vision-Output)
"""
from __future__ import annotations
import base64 as _b64
import json
import logging
import os
import re
import httpx
# Logger named after this module.
logger = logging.getLogger(__name__)

# Vision model tag and Ollama base URL; both can be overridden through the
# environment, otherwise the defaults below apply.
VISION_MODEL = os.environ.get("COOKIE_VISION_MODEL", "qwen2.5vl:32b")
OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://host.docker.internal:11434")

# PaddleOCR instance, created on first use by ocr_screenshot_via_paddle.
_PADDLE_OCR = None
# ── 1. Screenshot-Slicing für Vision-Models ────────────────────────
def _slice_screenshot(png_bytes: bytes, slice_h: int = 1500,
max_slices: int = 25) -> list[str]:
"""Cut a tall full-page screenshot into 1280×slice_h slices and return
each as base64-encoded PNG. Vision models choke on 25k-tall images
(resampled down to ~1024 → unreadable text); slicing keeps DPI."""
if not png_bytes:
return []
try:
from PIL import Image
from io import BytesIO
except ImportError:
return []
img = Image.open(BytesIO(png_bytes)).convert("RGB")
w, h = img.size
n = min((h + slice_h - 1) // slice_h, max_slices)
out: list[str] = []
for i in range(n):
top = i * slice_h
bot = min((i + 1) * slice_h, h)
chunk = img.crop((0, top, w, bot))
buf = BytesIO()
chunk.save(buf, format="PNG", optimize=True)
out.append(_b64.b64encode(buf.getvalue()).decode("ascii"))
return out
# ── 2. Vision-LLM-OCR ──────────────────────────────────────────────
async def _call_vision_on_slice(b64_png: str,
                                timeout_s: float = 240.0) -> str:
    """Send one base64 PNG slice to the vision model and return its reply.

    The model is asked for plain pipe-separated table rows, not JSON. The
    request goes to ``<OLLAMA_URL>/api/chat`` with ``VISION_MODEL``.

    Args:
        b64_png: Base64-encoded PNG of one screenshot slice.
        timeout_s: Timeout handed to the HTTP client, in seconds.

    Returns:
        The ``message.content`` text of the response, or "" when the
        request fails or the response carries no content. Failures are
        only logged at debug level.
    """
    instruction = "".join((
        "Du siehst einen Bildausschnitt einer Cookie-Richtlinien-Tabelle. ",
        "Liste ALLE Tabellen-Zeilen wortwoertlich auf, eine Zeile pro ",
        "Cookie. Jede Zeile soll enthalten: Cookie-Name, Kategorie, ",
        "Zweck, Speicherdauer, Art (Permanent/Session). ",
        "Format: '<Name> | <Kategorie> | <Zweck> | <Dauer> | <Art>'. ",
        "KEINE Cookies erfinden, nur was im Bild steht. Nur die Tabellen-",
        "Zeilen, keine Erklaerungen.",
    ))
    user_message = {
        "role": "user", "content": instruction, "images": [b64_png],
    }
    request_body = {
        "model": VISION_MODEL,
        "stream": False,
        "messages": [user_message],
        "options": {"temperature": 0.05, "num_predict": 4000},
    }
    try:
        async with httpx.AsyncClient(timeout=timeout_s) as client:
            endpoint = f"{OLLAMA_URL.rstrip('/')}/api/chat"
            response = await client.post(endpoint, json=request_body)
            response.raise_for_status()
            message = response.json().get("message") or {}
            return message.get("content", "") or ""
    except Exception as exc:
        logger.debug("vision slice failed: %s", exc)
        return ""
async def ocr_screenshot_via_vision_slices(png_bytes: bytes,
                                           max_slices: int = 20) -> str:
    """OCR a screenshot by slicing it and querying the vision model per slice.

    Slices are 1500 px high and processed one after another; the non-empty
    replies are joined with newlines.

    Args:
        png_bytes: Encoded full-page screenshot.
        max_slices: Maximum number of slices sent to the model.

    Returns:
        The concatenated model output, or "" when no slice was produced.
    """
    encoded = _slice_screenshot(png_bytes, slice_h=1500,
                                max_slices=max_slices)
    if not encoded:
        return ""
    total = len(encoded)
    logger.info("Vision-slicing: %d slices → vision-OCR (model=%s)",
                total, VISION_MODEL)
    replies: list[str] = []
    for number, image_b64 in enumerate(encoded, start=1):
        reply = await _call_vision_on_slice(image_b64)
        if reply:
            replies.append(reply)
        # Logged for every slice so empty replies stay visible in the log.
        logger.info("Vision-slice %d/%d: %d chars", number, total,
                    len(reply))
    combined = "\n".join(replies)
    logger.info("Vision-OCR slicing total: %d chars from %d slices",
                len(combined), total)
    return combined
# ── 3. PaddleOCR (fallback) ────────────────────────────────────────
def _paddle_line_text(line) -> str:
    """Return the recognised text of one PaddleOCR result entry.

    Entries with at least two items are read as ``[box, (text, score)]``
    or ``[box, text]``; anything else is stringified as a whole. The result
    is always a ``str`` so the caller can join it safely ("" if no text).
    """
    if isinstance(line, (list, tuple)) and len(line) >= 2:
        payload = line[1]
        if isinstance(payload, (list, tuple)):
            value = payload[0] if payload else ""
        else:
            value = payload
    else:
        value = line
    return "" if value is None else str(value)


def ocr_screenshot_via_paddle(png_bytes: bytes) -> str:
    """Run PaddleOCR over a full-page screenshot and return its text.

    The image is processed in full-width slices of 3000 px height. The
    PaddleOCR instance is created on first use and kept in the module
    global ``_PADDLE_OCR``. A slice that fails is logged and skipped.

    Args:
        png_bytes: Encoded full-page screenshot.

    Returns:
        Recognised lines joined with newlines; "" when the input is empty,
        a dependency (numpy / PIL / paddleocr) is missing, the image cannot
        be decoded, or PaddleOCR cannot be initialised.
    """
    global _PADDLE_OCR
    if not png_bytes:
        return ""
    try:
        # numpy is imported here as well so that a missing install takes
        # the same soft-fail path as PIL / paddleocr.
        import numpy as np
        from PIL import Image
        from io import BytesIO
        from paddleocr import PaddleOCR
    except ImportError as e:
        logger.warning("PaddleOCR / PIL not available: %s", e)
        return ""
    try:
        img = Image.open(BytesIO(png_bytes)).convert("RGB")
    except Exception as e:
        logger.warning("PIL open failed: %s", e)
        return ""
    w, h = img.size
    slice_h = 3000
    n_slices = (h + slice_h - 1) // slice_h
    logger.info("PaddleOCR: %dx%d screenshot → %d slices of %d high",
                w, h, n_slices, slice_h)
    if _PADDLE_OCR is None:
        try:
            _PADDLE_OCR = PaddleOCR(use_angle_cls=False, lang="german",
                                    show_log=False)
        except Exception as e:
            logger.warning("PaddleOCR init failed: %s", e)
            return ""
    parts: list[str] = []
    for i in range(n_slices):
        top = i * slice_h
        bot = min(top + slice_h, h)
        arr = np.array(img.crop((0, top, w, bot)))
        try:
            result = _PADDLE_OCR.ocr(arr, cls=False)
        except Exception as e:
            logger.warning("PaddleOCR slice %d failed: %s", i, e)
            continue
        # Pages and lines may be None/empty when nothing was detected.
        for page in result or ():
            for line in page or ():
                if not line:
                    continue
                txt = _paddle_line_text(line)
                if txt:
                    parts.append(txt)
    full_text = "\n".join(parts)
    logger.info("PaddleOCR: extracted %d lines / %d chars from %d slices",
                len(parts), len(full_text), n_slices)
    return full_text
# ── 4. Tesseract OCR (Hauptpfad) ───────────────────────────────────
def ocr_screenshot_via_tesseract(png_bytes: bytes,
                                 lang: str = "deu",
                                 psm: int = 4) -> str:
    """OCR a full-page screenshot with Tesseract and flatten the whitespace.

    Args:
        png_bytes: Encoded full-page screenshot.
        lang: Tesseract language code.
        psm: Tesseract page segmentation mode, passed as ``--psm``; 4 means
            a single column of text of variable sizes (cookie tables).

    Returns:
        The recognised text after three whitespace passes (collapse
        space/tab runs, turn a newline that is not followed by a blank line
        into a space, collapse remaining whitespace runs). "" when the
        input is empty, pytesseract/PIL are missing, or OCR fails.
    """
    if not png_bytes:
        return ""
    try:
        import pytesseract
        from PIL import Image
        from io import BytesIO
        import re as _re
    except ImportError as e:
        logger.warning("tesseract/PIL not available: %s", e)
        return ""
    # Applied strictly in this order; later passes rely on earlier ones.
    whitespace_passes = (
        (r"[ \t]+", " "),
        (r"\n(?!\s*\n)", " "),
        (r"\s{2,}", " "),
    )
    try:
        image = Image.open(BytesIO(png_bytes)).convert("RGB")
        text = pytesseract.image_to_string(image, lang=lang,
                                           config=f"--psm {psm}")
        for regex, replacement in whitespace_passes:
            text = _re.sub(regex, replacement, text)
        width, height = image.size[0], image.size[1]
        logger.info(
            "Tesseract OCR: %d chars / %d words (image %dx%d)",
            len(text), len(text.split()), width, height,
        )
        return text
    except Exception as e:
        logger.warning("Tesseract OCR failed: %s (%s)",
                       str(e) or "(no msg)", type(e).__name__)
        return ""
# ── 5. Anchor-basierter Parser ─────────────────────────────────────
# Category labels that anchor a table row: a cookie name is only accepted
# when one of these labels follows it directly.
_CATEGORY_ANCHORS = (
    r"Funktionscookie", r"Trackingcookie",
    r"Tracking Cookies?", r"Session Cookies?",
    r"Funktional", r"Marketing", r"Analytics", r"Necessary",
    r"Werbung", r"Personalisierung", r"Statistik",
    r"Notwendig", r"Erforderlich",
)
# One category label plus an optional parenthesised remark.
_CATEGORY_PATTERN = ("(?:" + "|".join(_CATEGORY_ANCHORS)
                     + r")(?:\s*\([^)]*\))?")
# Cookie name, optionally with a "<placeholder>" suffix. A leading
# underscore is allowed so "_ga"-style names keep their first character.
_COOKIE_NAME_RE = (
    r"(?:[A-Za-z_][\w\-.]{2,60}|[A-Za-z_][\w\-.]{2,60}<[^>]+>)"
)
_DURATION_PATTERN = (
    r"\d+(?:[.,]\s*)?\s*"
    r"(?:Tage|Jahre?|Monate?|Minuten|Stunden|Sekunden)\.?"
)
# Start of a table row: "<name> <category>".
_ROW_START_RE = re.compile(
    rf"(?P<name>{_COOKIE_NAME_RE})\s+(?P<category>{_CATEGORY_PATTERN})",
    re.IGNORECASE,
)
_DURATION_RE = re.compile(_DURATION_PATTERN, re.IGNORECASE)
# End of a table row: the cookie type, optionally preceded by a duration.
_ROW_END_RE = re.compile(
    rf"(?:(?P<duration>{_DURATION_PATTERN})\s*)?"
    r"(?P<type>Permanent/Protokoll|Session\s*Cookie|Persistent\s*Cookie)",
    re.IGNORECASE,
)
# How far behind the category a row's purpose/duration/type may extend.
_MAX_ROW_TAIL = 300
# Table headers and filler words that can precede a category label.
_NON_NAME_WORDS = frozenset((
    "name", "art", "zweck", "dauer", "kategorie", "anbieter",
    "cookie", "cookies", "name des cookies",
    "this", "dieser", "diese", "alle", "und", "von", "der",
    "die", "das", "ein", "eine", "session", "permanent",
    "category",
))


def _is_plausible_cookie_name(name: str) -> bool:
    """Tell cookie identifiers apart from ordinary words before a category.

    A name qualifies when it contains one of ``_-.<>``, is written entirely
    in capitals, or mixes cases beyond a leading capital letter.
    """
    if len(name) < 3 or name.lower() in _NON_NAME_WORDS:
        return False
    if not any(ch.isalnum() for ch in name):
        return False  # punctuation only, e.g. an OCR'd ruler line
    has_marker = any(ch in "_-.<>" for ch in name)
    is_caps = name.upper() == name
    is_camel = (any(ch.isupper() for ch in name[1:])
                and any(ch.islower() for ch in name))
    return has_marker or is_caps or is_camel


def _split_row_tail(tail: str) -> tuple[str, str, str]:
    """Split the text behind a category into (purpose, duration, type).

    The first cookie type in ``tail`` ends the row; a duration directly in
    front of it is captured too. Without a type, the last duration in
    ``tail`` is used. Everything before the match is the purpose.
    """
    end = _ROW_END_RE.search(tail)
    if end:
        return (tail[:end.start()].strip(),
                (end.group("duration") or "").strip(),
                end.group("type").strip())
    last_duration = None
    for last_duration in _DURATION_RE.finditer(tail):
        pass
    if last_duration is not None:
        return (tail[:last_duration.start()].strip(),
                last_duration.group(0).strip(), "")
    return tail.strip(), "", ""


def parse_ocr_cookie_table(text: str) -> list[dict]:
    """Extract cookie records from flattened Tesseract OCR text.

    A row starts where a plausible cookie name is directly followed by a
    known category label. The text up to the next row start (at most
    ``_MAX_ROW_TAIL`` characters) is split into purpose, duration and type.
    Cookie names are taken verbatim from the OCR text, never corrected.

    Args:
        text: OCR output; inputs shorter than 200 characters are treated
            as "no table" and yield an empty list.

    Returns:
        One dict per distinct name (case-insensitive, first occurrence
        wins) with the keys ``name``, ``category``, ``purpose``,
        ``duration``, ``type`` and ``vendor`` (always "").
    """
    if not text or len(text) < 200:
        return []
    # Only plausible names delimit rows; phrases such as "für Marketing"
    # inside a purpose text must not cut the previous row short.
    rows = [m for m in _ROW_START_RE.finditer(text)
            if _is_plausible_cookie_name(m.group("name").strip())]
    seen_names: set[str] = set()
    out: list[dict] = []
    for idx, m in enumerate(rows):
        name = m.group("name").strip()
        key = name.lower()
        if key in seen_names:
            continue
        seen_names.add(key)
        tail_end = m.end() + _MAX_ROW_TAIL
        if idx + 1 < len(rows):
            tail_end = min(tail_end, rows[idx + 1].start())
        purpose, duration, cookie_type = _split_row_tail(
            text[m.end():tail_end])
        out.append({
            "name": name[:80],
            "category": m.group("category").strip()[:60],
            "purpose": purpose[:200],
            "duration": duration[:60],
            "type": cookie_type[:30],
            "vendor": "",
        })
    logger.info("parse_ocr_cookie_table: %d unique cookies extracted",
                len(out))
    return out
# ── 6. Vision-Response-Parser ──────────────────────────────────────
# Prompt (German) that asks the vision model for ALL cookies visible in a
# cookie-policy screenshot, answered as one bare JSON object of the form
# {"cookies": [{name, category, purpose, duration, type, vendor}, ...]}.
# NOTE(review): not referenced in this part of the file; presumably sent by
# the orchestration module and parsed by parse_vision_response — confirm.
VISION_PROMPT = (
    "Du analysierst einen Screenshot einer Cookie-Richtlinie. Auf der Seite "
    "ist eine Tabelle mit Cookies aufgelistet. Spalten sind ueblicherweise: "
    "Name des Cookies, Kategorie (z.B. 'Funktional', 'Marketing', "
    "'Analytics'), Verwendungszweck, Speicherdauer, Art des Cookies "
    "(z.B. 'Permanent', 'Session').\n\n"
    "Extrahiere ALLE Cookies aus dem Bild. Wenn die Tabelle abgeschnitten "
    "ist, extrahiere alles was sichtbar ist. KEINE Cookies erfinden, KEINE "
    "Halluzinationen.\n\n"
    "Antworte als reines JSON-Objekt im Format:\n"
    '{"cookies": [\n'
    ' {"name": "<Cookie-Name exakt>", "category": "<Kategorie>", '
    '"purpose": "<Kurzfassung Zweck max 120 chars>", '
    '"duration": "<Speicherdauer mit Einheit>", '
    '"type": "<Permanent|Session|...>", '
    '"vendor": "<Anbieter falls bekannt, sonst leer>"}\n'
    "]}\n\n"
    "Nur JSON, kein Erklaerungstext, keine Code-Fences."
)
def _as_text(value) -> str:
    """Coerce one JSON field to stripped text.

    Strings are stripped, numbers are stringified (models often answer
    ``"duration": 730``); null, booleans and containers become "".
    """
    if isinstance(value, str):
        return value.strip()
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        return str(value)
    return ""


def parse_vision_response(content: str) -> list[dict]:
    """Parse the vision model's JSON answer into cookie records, leniently.

    Tolerates a surrounding Markdown code fence and prose before/after the
    JSON object: the text between the first "{" and the last "}" is parsed.
    Field values of unexpected JSON types are coerced instead of raising.

    Args:
        content: Raw model output.

    Returns:
        Up to 300 dicts with the keys ``name``, ``category``, ``purpose``,
        ``duration``, ``type`` and ``vendor``, read from the object's
        "cookies" (or "Cookies") list. Entries that are not objects, or
        whose name is missing, shorter than 2 or longer than 80 characters,
        or punctuation only, are skipped. Returns an empty list when no
        JSON object can be parsed.
    """
    if not content:
        return []
    txt = content.strip()
    if txt.startswith("```"):
        # Drop the opening fence line and, if present, the closing one.
        lines = txt.split("\n")
        if lines[-1].strip().startswith("```"):
            txt = "\n".join(lines[1:-1])
        else:
            txt = "\n".join(lines[1:])
    a, b = txt.find("{"), txt.rfind("}")
    if not (0 <= a < b):
        return []
    try:
        obj = json.loads(txt[a:b + 1])
    except json.JSONDecodeError:
        return []
    if not isinstance(obj, dict):
        return []
    arr = obj.get("cookies") or obj.get("Cookies") or []
    if not isinstance(arr, list):
        return []
    out: list[dict] = []
    for item in arr[:300]:
        if not isinstance(item, dict):
            continue
        name = _as_text(item.get("name"))
        if not 2 <= len(name) <= 80:
            continue
        if re.fullmatch(r"[\s\-_.]+", name):
            continue  # separator rows such as "---"
        out.append({
            "name": name,
            "category": _as_text(item.get("category"))[:60],
            "purpose": _as_text(item.get("purpose"))[:200],
            "duration": _as_text(item.get("duration"))[:60],
            "type": _as_text(item.get("type"))[:30],
            "vendor": _as_text(item.get("vendor"))[:80],
        })
    return out