refactor: split cookie_screenshot_ocr.py (642 → 290 + 353 LOC)

Der CI-Hard-Cap liegt bei 500 LOC. cookie_screenshot_ocr.py war auf 642 LOC gewachsen und wurde deshalb aufgeteilt:

- cookie_screenshot_ocr_engines.py (353 LOC, NEU)
  OCR-Engine-Funktionen: _slice_screenshot, Vision-LLM (qwen2.5vl), PaddleOCR, Tesseract, parse_ocr_cookie_table, parse_vision_response, Konstanten VISION_MODEL/OLLAMA_URL/VISION_PROMPT.
- cookie_screenshot_ocr.py (290 LOC, REWRITE)
  Orchestration: capture_cookie_evidence_slices, _ocr_one_slice, ocr_slices_extract_cookies, capture_cookie_screenshot, extract_cookies_via_vision, cookies_to_vendor_records.
  Re-Exports der Engine-Funktionen für Backward-Kompat.

Einziger externer Importer (_phase_d1_vendors_raw.py) braucht keinen Code-Change — Public-API stabil.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-06-06 23:35:33 +02:00
parent ff796fb480
commit 02879a2c3a
9 changed files with 1790 additions and 384 deletions
@@ -0,0 +1,353 @@
+"""OCR-Engine-Funktionen für cookie_screenshot_ocr (Phase-1 Split).
+
+Aus dem Hauptmodul ausgelagert, damit es unter dem 500-LOC-Hard-Cap bleibt:
+  - PIL-basiertes _slice_screenshot (zerteilt PNG in subimages)
+  - Vision-LLM-OCR (ollama qwen2.5vl:32b)
+  - PaddleOCR fallback
+  - Tesseract OCR (Hauptpfad)
+  - Anchor-basierter Parser parse_ocr_cookie_table
+  - parse_vision_response (JSON-Toleranz für Vision-Output)
+"""
+
+from __future__ import annotations
+
+import base64 as _b64
+import json
+import logging
+import os
+import re
+
+import httpx
+
+# Module-level logger shared by every OCR engine in this file.
+logger = logging.getLogger(__name__)
+
+
+# Ollama model tag used for vision OCR; COOKIE_VISION_MODEL overrides it.
+VISION_MODEL = os.environ.get("COOKIE_VISION_MODEL", "qwen2.5vl:32b")
+# Base URL of the Ollama server; OLLAMA_URL overrides it.
+OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://host.docker.internal:11434")
+
+# PaddleOCR engine cache; filled on first use by ocr_screenshot_via_paddle.
+_PADDLE_OCR = None
+
+
+# ── 1. Screenshot-Slicing für Vision-Models ────────────────────────
+
+def _slice_screenshot(png_bytes: bytes, slice_h: int = 1500,
+                      max_slices: int = 25) -> list[str]:
+    """Cut a tall full-page screenshot into full-width horizontal strips.
+
+    Vision models downsample very tall images until the text becomes
+    unreadable; strips of at most ``slice_h`` pixels keep the glyphs at
+    their original resolution.
+
+    Args:
+        png_bytes: Encoded screenshot image.
+        slice_h: Maximum height of one strip in pixels.
+        max_slices: Upper bound on the number of strips; image content
+            below the last strip is dropped.
+
+    Returns:
+        Base64-encoded (ASCII) PNG strips, ordered top to bottom. Empty
+        when the input is empty, a limit is not positive, PIL is not
+        installed, or the image cannot be decoded.
+    """
+    if not png_bytes or slice_h <= 0 or max_slices <= 0:
+        return []
+    try:
+        from PIL import Image
+        from io import BytesIO
+    except ImportError:
+        return []
+    # A corrupt or truncated screenshot must not crash the OCR pipeline;
+    # degrade to "no slices" so callers fall through to their empty path.
+    try:
+        img = Image.open(BytesIO(png_bytes)).convert("RGB")
+    except Exception as e:
+        logger.warning("PIL open failed: %s", e)
+        return []
+    w, h = img.size
+    # ceil(h / slice_h), capped at max_slices
+    n = min((h + slice_h - 1) // slice_h, max_slices)
+    out: list[str] = []
+    for top in range(0, n * slice_h, slice_h):
+        chunk = img.crop((0, top, w, min(top + slice_h, h)))
+        buf = BytesIO()
+        chunk.save(buf, format="PNG", optimize=True)
+        out.append(_b64.b64encode(buf.getvalue()).decode("ascii"))
+    return out
+
+
+# ── 2. Vision-LLM-OCR ──────────────────────────────────────────────
+
+async def _call_vision_on_slice(b64_png: str,
+                                  timeout_s: float = 240.0) -> str:
+    """Send one base64 PNG slice to the Ollama chat endpoint.
+
+    The (German) prompt asks the model to list every cookie-table row as
+    a pipe-separated plain-text line rather than JSON. The reply text is
+    returned as-is for downstream parsing.
+
+    Any failure (transport error, non-2xx status, undecodable body) is
+    logged at debug level and results in an empty string.
+    """
+    instruction = (
+        "Du siehst einen Bildausschnitt einer Cookie-Richtlinien-Tabelle. "
+        "Liste ALLE Tabellen-Zeilen wortwoertlich auf, eine Zeile pro "
+        "Cookie. Jede Zeile soll enthalten: Cookie-Name, Kategorie, "
+        "Zweck, Speicherdauer, Art (Permanent/Session). "
+        "Format: '<Name> | <Kategorie> | <Zweck> | <Dauer> | <Art>'. "
+        "KEINE Cookies erfinden, nur was im Bild steht. Nur die Tabellen-"
+        "Zeilen, keine Erklaerungen."
+    )
+    user_message = {
+        "role": "user", "content": instruction, "images": [b64_png],
+    }
+    request_body = {
+        "model": VISION_MODEL,
+        "stream": False,
+        "messages": [user_message],
+        "options": {"temperature": 0.05, "num_predict": 4000},
+    }
+    try:
+        endpoint = f"{OLLAMA_URL.rstrip('/')}/api/chat"
+        async with httpx.AsyncClient(timeout=timeout_s) as client:
+            resp = await client.post(endpoint, json=request_body)
+            resp.raise_for_status()
+        reply = resp.json().get("message") or {}
+        return reply.get("content", "") or ""
+    except Exception as exc:
+        logger.debug("vision slice failed: %s", exc)
+        return ""
+
+
+async def ocr_screenshot_via_vision_slices(png_bytes: bytes,
+                                            max_slices: int = 20) -> str:
+    """Vision-OCR a screenshot strip by strip and join the replies.
+
+    The image is cut into strips of 1500 px height (at most
+    ``max_slices``); each strip is sent to the vision model one after
+    the other. Non-empty replies are joined with newlines. Returns ""
+    when slicing yields nothing.
+    """
+    encoded = _slice_screenshot(png_bytes, slice_h=1500,
+                                max_slices=max_slices)
+    if not encoded:
+        return ""
+    total = len(encoded)
+    logger.info("Vision-slicing: %d slices → vision-OCR (model=%s)",
+                 total, VISION_MODEL)
+    replies: list[str] = []
+    for position, b64_png in enumerate(encoded, start=1):
+        reply = await _call_vision_on_slice(b64_png)
+        if reply:
+            replies.append(reply)
+        logger.info("Vision-slice %d/%d: %d chars", position, total,
+                     len(reply))
+    joined = "\n".join(replies)
+    logger.info("Vision-OCR slicing total: %d chars from %d slices",
+                 len(joined), total)
+    return joined
+
+
+# ── 3. PaddleOCR (fallback) ────────────────────────────────────────
+
+def ocr_screenshot_via_paddle(png_bytes: bytes) -> str:
+    """Run PaddleOCR over a full-page screenshot and return its text.
+
+    The image is processed in full-width strips of 3000 px height. The
+    PaddleOCR engine is created on first use and cached in the
+    module-level ``_PADDLE_OCR``.
+
+    Returns:
+        Recognised text lines joined with newlines. Empty when the input
+        is empty, a dependency (PIL, numpy, paddleocr) is missing, the
+        image cannot be decoded, or engine initialisation fails. A strip
+        whose OCR call raises is skipped with a warning.
+    """
+    if not png_bytes:
+        return ""
+    try:
+        from PIL import Image
+        from io import BytesIO
+        # numpy is imported inside the guard so a missing dependency
+        # degrades to "" like the others instead of raising mid-run.
+        import numpy as np
+        from paddleocr import PaddleOCR
+    except ImportError as e:
+        logger.warning("PaddleOCR / PIL not available: %s", e)
+        return ""
+
+    try:
+        img = Image.open(BytesIO(png_bytes)).convert("RGB")
+    except Exception as e:
+        logger.warning("PIL open failed: %s", e)
+        return ""
+
+    w, h = img.size
+    slice_h = 3000
+    n_slices = (h + slice_h - 1) // slice_h  # ceil(h / slice_h)
+    logger.info("PaddleOCR: %dx%d screenshot → %d slices of %d high",
+                 w, h, n_slices, slice_h)
+
+    global _PADDLE_OCR
+    if _PADDLE_OCR is None:
+        try:
+            _PADDLE_OCR = PaddleOCR(use_angle_cls=False, lang="german",
+                                     show_log=False)
+        except Exception as e:
+            logger.warning("PaddleOCR init failed: %s", e)
+            return ""
+
+    parts: list[str] = []
+    for i in range(n_slices):
+        top = i * slice_h
+        crop = img.crop((0, top, w, min(top + slice_h, h)))
+        arr = np.array(crop)
+        try:
+            result = _PADDLE_OCR.ocr(arr, cls=False)
+        except Exception as e:
+            logger.warning("PaddleOCR slice %d failed: %s", i, e)
+            continue
+        for page in result or ():
+            for line in page or ():
+                if not line:
+                    continue
+                try:
+                    txt = _paddle_line_text(line)
+                except Exception as e:
+                    # Best effort: one malformed entry must not lose the
+                    # rest of the page, but leave a trace for debugging.
+                    logger.debug("PaddleOCR slice %d: unreadable entry: %s",
+                                 i, e)
+                    continue
+                if txt:
+                    parts.append(txt)
+
+    full_text = "\n".join(parts)
+    logger.info("PaddleOCR: extracted %d lines / %d chars from %d slices",
+                 len(parts), len(full_text), n_slices)
+    return full_text
+
+
+def _paddle_line_text(line):
+    """Pull the text out of one PaddleOCR result entry.
+
+    A list entry with at least two items is read as ``[box, (text, ...)]``
+    (or ``[box, text]``); any other shape is stringified as a whole.
+    """
+    if isinstance(line, list) and len(line) >= 2:
+        payload = line[1]
+        if isinstance(payload, (list, tuple)):
+            return payload[0]
+        return str(payload)
+    return str(line)
+
+
+# ── 4. Tesseract OCR (Hauptpfad) ───────────────────────────────────
+
+def ocr_screenshot_via_tesseract(png_bytes: bytes,
+                                  lang: str = "deu",
+                                  psm: int = 4) -> str:
+    """OCR a full-page screenshot with Tesseract and flatten whitespace.
+
+    ``psm`` is passed to Tesseract as ``--psm``; the default 4 treats the
+    page as a single column of variably sized text, which suits cookie
+    tables. The raw output is whitespace-normalised in three passes
+    before it is returned.
+
+    Returns "" for empty input, missing pytesseract/PIL, or any error
+    during decoding or recognition (logged as a warning).
+    """
+    if not png_bytes:
+        return ""
+    try:
+        import pytesseract
+        from PIL import Image
+        from io import BytesIO
+    except ImportError as exc:
+        logger.warning("tesseract/PIL not available: %s", exc)
+        return ""
+    # Applied in order: squeeze blanks/tabs, fold single newlines into
+    # spaces, then collapse any remaining whitespace run.
+    passes = (
+        (r"[ \t]+", " "),
+        (r"\n(?!\s*\n)", " "),
+        (r"\s{2,}", " "),
+    )
+    try:
+        image = Image.open(BytesIO(png_bytes)).convert("RGB")
+        text = pytesseract.image_to_string(image, lang=lang,
+                                            config=f"--psm {psm}")
+        for regex, replacement in passes:
+            text = re.sub(regex, replacement, text)
+        width, height = image.size[0], image.size[1]
+        logger.info(
+            "Tesseract OCR: %d chars / %d words (image %dx%d)",
+            len(text), len(text.split()), width, height,
+        )
+        return text
+    except Exception as exc:
+        reason = str(exc) or "(no msg)"
+        logger.warning("Tesseract OCR failed: %s (%s)",
+                        reason, type(exc).__name__)
+        return ""
+
+
+# ── 5. Anchor-basierter Parser ─────────────────────────────────────
+
+_CATEGORY_ANCHORS = (
+    r"Funktionscookie", r"Trackingcookie",
+    r"Tracking Cookies?", r"Session Cookies?",
+    r"Funktional", r"Marketing", r"Analytics", r"Necessary",
+    r"Werbung", r"Personalisierung", r"Statistik",
+    r"Notwendig", r"Erforderlich",
+)
+# A category keyword, optionally followed by a parenthesised remark.
+_CATEGORY_PATTERN = ("(?:" + "|".join(_CATEGORY_ANCHORS)
+                      + r")(?:\s*\([^)]*\))?")
+# Cookie name, optionally with a "<placeholder>" suffix. A leading
+# underscore is allowed so names such as "_ga" are matched in full.
+_COOKIE_NAME_RE = (
+    r"(?:[A-Za-z_][\w\-.]{2,60}|[A-Za-z_][\w\-.]{2,60}<[^>]+>)"
+)
+
+# "<name> <category>" marks the start of a table row in the OCR text.
+_ROW_ANCHOR_RE = re.compile(
+    rf"(?P<name>{_COOKIE_NAME_RE})\s+(?P<category>{_CATEGORY_PATTERN})",
+    re.IGNORECASE,
+)
+_DURATION_RE = re.compile(
+    r"\d+(?:[.,]\s*)?\s*"
+    r"(?:Tage|Jahre?|Monate?|Minuten|Stunden|Sekunden)\.?",
+    re.IGNORECASE,
+)
+_COOKIE_TYPE_RE = re.compile(
+    r"Permanent/Protokoll|Session\s*Cookie|Persistent\s*Cookie",
+    re.IGNORECASE,
+)
+# Column headers and filler words that are never cookie names.
+_NON_COOKIE_WORDS = frozenset((
+    "name", "art", "zweck", "dauer", "kategorie", "anbieter",
+    "cookie", "cookies", "name des cookies",
+    "this", "dieser", "diese", "alle", "und", "von", "der",
+    "die", "das", "ein", "eine", "session", "permanent",
+    "category",
+))
+# How far behind the category the rest of a row is searched.
+_ROW_TAIL_MAX = 300
+
+
+def parse_ocr_cookie_table(text: str) -> list[dict]:
+    """Extract cookie records from flattened OCR text of a cookie table.
+
+    A row starts at every "<name> <category>" anchor whose name looks
+    like a cookie identifier. The text between that anchor and the next
+    one (at most 300 characters) is searched for a storage duration and
+    a cookie type; whatever precedes them is taken as the purpose.
+
+    Cookie names are never corrected or normalised; only the first
+    occurrence of a name (case-insensitive) is kept.
+
+    Args:
+        text: OCR output; inputs shorter than 200 characters are treated
+            as "no table found".
+
+    Returns:
+        One dict per cookie with the keys ``name``, ``category``,
+        ``purpose``, ``duration``, ``type`` and ``vendor`` (always "").
+    """
+    if not text or len(text) < 200:
+        return []
+    rows = [m for m in _ROW_ANCHOR_RE.finditer(text)
+            if _looks_like_cookie_name(m.group("name"))]
+    seen_names: set[str] = set()
+    out: list[dict] = []
+    for idx, row in enumerate(rows):
+        name = row.group("name")
+        key = name.lower()
+        if key in seen_names:
+            continue
+        seen_names.add(key)
+        # A row's free text ends where the next row begins.
+        next_start = rows[idx + 1].start() if idx + 1 < len(rows) else len(text)
+        tail = text[row.end():min(next_start, row.end() + _ROW_TAIL_MAX)]
+        purpose, duration, cookie_type = _split_row_tail(tail)
+        out.append({
+            "name": name[:80],
+            "category": row.group("category").strip()[:60],
+            "purpose": purpose[:200],
+            "duration": duration[:60],
+            "type": cookie_type[:30],
+            "vendor": "",
+        })
+    logger.info("parse_ocr_cookie_table: %d unique cookies extracted",
+                 len(out))
+    return out
+
+
+def _looks_like_cookie_name(name: str) -> bool:
+    """Tell identifier-like tokens apart from ordinary words and headers."""
+    if len(name) < 3 or name.lower() in _NON_COOKIE_WORDS:
+        return False
+    if not any(ch.isalnum() for ch in name):
+        return False  # pure separator runs such as "___" or "..."
+    has_marker = any(ch in "_-.<>" for ch in name)
+    is_caps = name.upper() == name
+    is_camel = (any(ch.isupper() for ch in name[1:])
+                and any(ch.islower() for ch in name))
+    return has_marker or is_caps or is_camel
+
+
+def _split_row_tail(tail: str) -> tuple[str, str, str]:
+    """Split the text after a row anchor into (purpose, duration, type)."""
+    duration = _DURATION_RE.search(tail)
+    cookie_type = _COOKIE_TYPE_RE.search(tail)
+    found = [m for m in (duration, cookie_type) if m]
+    purpose_end = min((m.start() for m in found), default=len(tail))
+    return (
+        tail[:purpose_end].strip(),
+        duration.group(0).strip() if duration else "",
+        cookie_type.group(0).strip() if cookie_type else "",
+    )
+
+
+# ── 6. Vision-Response-Parser ──────────────────────────────────────
+
+# German prompt that asks a vision model to return every cookie visible in
+# a cookie-policy screenshot as one JSON object {"cookies": [...]}. The
+# requested per-cookie keys (name, category, purpose, duration, type,
+# vendor) are the ones parse_vision_response reads.
+# NOTE(review): not referenced inside this module; presumably consumed by
+# the orchestration module — confirm.
+VISION_PROMPT = (
+    "Du analysierst einen Screenshot einer Cookie-Richtlinie. Auf der Seite "
+    "ist eine Tabelle mit Cookies aufgelistet. Spalten sind ueblicherweise: "
+    "Name des Cookies, Kategorie (z.B. 'Funktional', 'Marketing', "
+    "'Analytics'), Verwendungszweck, Speicherdauer, Art des Cookies "
+    "(z.B. 'Permanent', 'Session').\n\n"
+    "Extrahiere ALLE Cookies aus dem Bild. Wenn die Tabelle abgeschnitten "
+    "ist, extrahiere alles was sichtbar ist. KEINE Cookies erfinden, KEINE "
+    "Halluzinationen.\n\n"
+    "Antworte als reines JSON-Objekt im Format:\n"
+    '{"cookies": [\n'
+    '  {"name": "<Cookie-Name exakt>", "category": "<Kategorie>", '
+    '"purpose": "<Kurzfassung Zweck max 120 chars>", '
+    '"duration": "<Speicherdauer mit Einheit>", '
+    '"type": "<Permanent|Session|...>", '
+    '"vendor": "<Anbieter falls bekannt, sonst leer>"}\n'
+    "]}\n\n"
+    "Nur JSON, kein Erklaerungstext, keine Code-Fences."
+)
+
+
+def parse_vision_response(content: str) -> list[dict]:
+    """Parse the vision model's JSON answer into cookie records.
+
+    Tolerates a surrounding Markdown code fence and prose before or
+    after the JSON object: everything from the first "{" to the last
+    "}" is decoded. The object must carry a list under "cookies" (or
+    "Cookies"); at most 300 entries are read.
+
+    Entries are skipped when they are not objects, or when the name is
+    shorter than 2 or longer than 80 characters or consists only of
+    whitespace / "-" / "_" / ".". Field values that are not strings
+    (the model sometimes emits numbers or null) are converted instead
+    of aborting the whole parse.
+
+    Returns:
+        Dicts with the keys ``name``, ``category``, ``purpose``,
+        ``duration``, ``type`` and ``vendor``; [] when nothing usable
+        can be decoded.
+    """
+    if not content:
+        return []
+    txt = content.strip()
+    if txt.startswith("```"):
+        # Drop the opening fence line and, if present, the closing one.
+        lines = txt.split("\n")
+        if lines and lines[-1].strip().startswith("```"):
+            txt = "\n".join(lines[1:-1])
+        else:
+            txt = "\n".join(lines[1:])
+    a, b = txt.find("{"), txt.rfind("}")
+    if not (0 <= a < b):
+        return []
+    try:
+        obj = json.loads(txt[a:b + 1])
+    except json.JSONDecodeError:
+        return []
+    if not isinstance(obj, dict):
+        return []
+    arr = obj.get("cookies") or obj.get("Cookies") or []
+    if not isinstance(arr, list):
+        return []
+    out: list[dict] = []
+    for item in arr[:300]:
+        if not isinstance(item, dict):
+            continue
+        name = _vision_field_text(item.get("name"))
+        if not name or len(name) < 2 or len(name) > 80:
+            continue
+        if re.fullmatch(r"[\s\-_.]+", name):
+            continue
+        out.append({
+            "name": name[:80],
+            "category": _vision_field_text(item.get("category"))[:60],
+            "purpose": _vision_field_text(item.get("purpose"))[:200],
+            "duration": _vision_field_text(item.get("duration"))[:60],
+            "type": _vision_field_text(item.get("type"))[:30],
+            "vendor": _vision_field_text(item.get("vendor"))[:80],
+        })
+    return out
+
+
+def _vision_field_text(value) -> str:
+    """Coerce one JSON field value to stripped text.
+
+    Strings are stripped, numbers are stringified, and everything else
+    (null, booleans, nested lists/objects) becomes "".
+    """
+    if isinstance(value, str):
+        return value.strip()
+    if isinstance(value, bool):
+        return ""
+    if isinstance(value, (int, float)):
+        return str(value)
+    return ""