"""OCR-Engine-Funktionen für cookie_screenshot_ocr (Phase-1 Split). Aus dem Hauptmodul ausgelagert, damit es unter dem 500-LOC-Hard-Cap bleibt: - PIL-basiertes _slice_screenshot (zerteilt PNG in subimages) - Vision-LLM-OCR (ollama qwen2.5vl:32b) - PaddleOCR fallback - Tesseract OCR (Hauptpfad) - Anchor-basierter Parser parse_ocr_cookie_table - _parse_vision_response (JSON-Toleranz für Vision-Output) """ from __future__ import annotations import base64 as _b64 import json import logging import os import re import httpx logger = logging.getLogger(__name__) VISION_MODEL = os.getenv("COOKIE_VISION_MODEL", "qwen2.5vl:32b") OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434") _PADDLE_OCR = None # lazy-initialised PaddleOCR instance # ── 1. Screenshot-Slicing für Vision-Models ──────────────────────── def _slice_screenshot(png_bytes: bytes, slice_h: int = 1500, max_slices: int = 25) -> list[str]: """Cut a tall full-page screenshot into 1280×slice_h slices and return each as base64-encoded PNG. Vision models choke on 25k-tall images (resampled down to ~1024 → unreadable text); slicing keeps DPI.""" if not png_bytes: return [] try: from PIL import Image from io import BytesIO except ImportError: return [] img = Image.open(BytesIO(png_bytes)).convert("RGB") w, h = img.size n = min((h + slice_h - 1) // slice_h, max_slices) out: list[str] = [] for i in range(n): top = i * slice_h bot = min((i + 1) * slice_h, h) chunk = img.crop((0, top, w, bot)) buf = BytesIO() chunk.save(buf, format="PNG", optimize=True) out.append(_b64.b64encode(buf.getvalue()).decode("ascii")) return out # ── 2. Vision-LLM-OCR ────────────────────────────────────────────── async def _call_vision_on_slice(b64_png: str, timeout_s: float = 240.0) -> str: """Ask the vision model to dump all cookie-row text from one slice as raw text (NOT JSON). We parse it downstream with parse_flat regex.""" prompt = ( "Du siehst einen Bildausschnitt einer Cookie-Richtlinien-Tabelle. " "Liste ALLE Tabellen-Zeilen wortwoertlich auf, eine Zeile pro " "Cookie. Jede Zeile soll enthalten: Cookie-Name, Kategorie, " "Zweck, Speicherdauer, Art (Permanent/Session). " "Format: ' | | | | '. " "KEINE Cookies erfinden, nur was im Bild steht. Nur die Tabellen-" "Zeilen, keine Erklaerungen." ) payload = { "model": VISION_MODEL, "stream": False, "messages": [{ "role": "user", "content": prompt, "images": [b64_png], }], "options": {"temperature": 0.05, "num_predict": 4000}, } try: async with httpx.AsyncClient(timeout=timeout_s) as c: r = await c.post(f"{OLLAMA_URL.rstrip('/')}/api/chat", json=payload) r.raise_for_status() return (r.json().get("message") or {}).get("content", "") or "" except Exception as e: logger.debug("vision slice failed: %s", e) return "" async def ocr_screenshot_via_vision_slices(png_bytes: bytes, max_slices: int = 20) -> str: """Slice + vision-OCR each slice + concatenate.""" slices = _slice_screenshot(png_bytes, slice_h=1500, max_slices=max_slices) if not slices: return "" logger.info("Vision-slicing: %d slices → vision-OCR (model=%s)", len(slices), VISION_MODEL) parts: list[str] = [] for i, s in enumerate(slices): txt = await _call_vision_on_slice(s) if txt: parts.append(txt) logger.info("Vision-slice %d/%d: %d chars", i + 1, len(slices), len(txt)) full = "\n".join(parts) logger.info("Vision-OCR slicing total: %d chars from %d slices", len(full), len(slices)) return full # ── 3. PaddleOCR (fallback) ──────────────────────────────────────── def ocr_screenshot_via_paddle(png_bytes: bytes) -> str: """Run PaddleOCR over the full-page screenshot, returning the concatenated text. Splits tall screenshots into 1280x3000 slices.""" if not png_bytes: return "" try: from PIL import Image from io import BytesIO from paddleocr import PaddleOCR except ImportError as e: logger.warning("PaddleOCR / PIL not available: %s", e) return "" try: img = Image.open(BytesIO(png_bytes)).convert("RGB") except Exception as e: logger.warning("PIL open failed: %s", e) return "" w, h = img.size slice_h = 3000 n_slices = (h + slice_h - 1) // slice_h logger.info("PaddleOCR: %dx%d screenshot → %d slices of %d high", w, h, n_slices, slice_h) global _PADDLE_OCR if _PADDLE_OCR is None: try: _PADDLE_OCR = PaddleOCR(use_angle_cls=False, lang="german", show_log=False) except Exception as e: logger.warning("PaddleOCR init failed: %s", e) return "" parts: list[str] = [] import numpy as np for i in range(n_slices): top = i * slice_h bot = min((i + 1) * slice_h, h) crop = img.crop((0, top, w, bot)) arr = np.array(crop) try: result = _PADDLE_OCR.ocr(arr, cls=False) except Exception as e: logger.warning("PaddleOCR slice %d failed: %s", i, e) continue if not result: continue for page in result: if not page: continue for line in page: if not line: continue try: if isinstance(line, list) and len(line) >= 2: txt = (line[1][0] if isinstance(line[1], (list, tuple)) else str(line[1])) else: txt = str(line) if txt: parts.append(txt) except Exception: continue full_text = "\n".join(parts) logger.info("PaddleOCR: extracted %d lines / %d chars from %d slices", len(parts), len(full_text), n_slices) return full_text # ── 4. Tesseract OCR (Hauptpfad) ─────────────────────────────────── def ocr_screenshot_via_tesseract(png_bytes: bytes, lang: str = "deu", psm: int = 4) -> str: """Run Tesseract OCR on a full-page screenshot. psm=4 = single column of text of variable sizes (cookie-tables).""" if not png_bytes: return "" try: import pytesseract from PIL import Image from io import BytesIO import re as _re except ImportError as e: logger.warning("tesseract/PIL not available: %s", e) return "" try: img = Image.open(BytesIO(png_bytes)).convert("RGB") raw = pytesseract.image_to_string(img, lang=lang, config=f"--psm {psm}") norm = _re.sub(r"[ \t]+", " ", raw) norm = _re.sub(r"\n(?!\s*\n)", " ", norm) norm = _re.sub(r"\s{2,}", " ", norm) logger.info( "Tesseract OCR: %d chars / %d words (image %dx%d)", len(norm), len(norm.split()), img.size[0], img.size[1], ) return norm except Exception as e: logger.warning("Tesseract OCR failed: %s (%s)", str(e) or "(no msg)", type(e).__name__) return "" # ── 5. Anchor-basierter Parser ───────────────────────────────────── _CATEGORY_ANCHORS = ( r"Funktionscookie", r"Trackingcookie", r"Tracking Cookies?", r"Session Cookies?", r"Funktional", r"Marketing", r"Analytics", r"Necessary", r"Werbung", r"Personalisierung", r"Statistik", r"Notwendig", r"Erforderlich", ) _CATEGORY_PATTERN = ("(?:" + "|".join(_CATEGORY_ANCHORS) + r")(?:\s*\([^)]*\))?") _COOKIE_NAME_RE = ( r"(?:[A-Za-z][\w\-.]{2,60}|[A-Za-z][\w\-.]{2,60}<[^>]+>)" ) def parse_ocr_cookie_table(text: str) -> list[dict]: """Extract cookie-records from Tesseract-OCR text. KEINE Cookie-Namens- Korrektur — `awsalb` bleibt `awsalb`.""" if not text or len(text) < 200: return [] pattern = re.compile( rf"(?P{_COOKIE_NAME_RE})\s+" rf"(?P{_CATEGORY_PATTERN})" rf"(?P[^A-Z]{{0,300}}?)" rf"(?:(?P\d+(?:[.,]\s*)?\s*" rf"(?:Tage|Jahre?|Monate?|Minuten|Stunden|Sekunden)\.?)?\s*" rf"(?PPermanent/Protokoll|Session\s*Cookie|" rf"Persistent\s*Cookie|Persistent\s*cookie))?", re.IGNORECASE | re.DOTALL, ) seen_names: set[str] = set() out: list[dict] = [] for m in pattern.finditer(text): name = (m.group("name") or "").strip() if not name or len(name) < 3: continue nl = name.lower() if nl in seen_names: continue if nl in ("name", "art", "zweck", "dauer", "kategorie", "anbieter", "cookie", "cookies", "name des cookies", "this", "dieser", "diese", "alle", "und", "von", "der", "die", "das", "ein", "eine", "session", "permanent", "category"): continue has_marker = any(c in name for c in "_-.<>") is_caps = name.upper() == name and len(name) >= 3 is_camel = (any(c.isupper() for c in name[1:]) and any(c.islower() for c in name)) if not (has_marker or is_caps or is_camel): continue seen_names.add(nl) out.append({ "name": name[:80], "category": (m.group("category") or "").strip()[:60], "purpose": (m.group("rest") or "").strip()[:200], "duration": (m.group("duration") or "").strip()[:60], "type": (m.group("type") or "").strip()[:30], "vendor": "", }) logger.info("parse_ocr_cookie_table: %d unique cookies extracted", len(out)) return out # ── 6. Vision-Response-Parser ────────────────────────────────────── VISION_PROMPT = ( "Du analysierst einen Screenshot einer Cookie-Richtlinie. Auf der Seite " "ist eine Tabelle mit Cookies aufgelistet. Spalten sind ueblicherweise: " "Name des Cookies, Kategorie (z.B. 'Funktional', 'Marketing', " "'Analytics'), Verwendungszweck, Speicherdauer, Art des Cookies " "(z.B. 'Permanent', 'Session').\n\n" "Extrahiere ALLE Cookies aus dem Bild. Wenn die Tabelle abgeschnitten " "ist, extrahiere alles was sichtbar ist. KEINE Cookies erfinden, KEINE " "Halluzinationen.\n\n" "Antworte als reines JSON-Objekt im Format:\n" '{"cookies": [\n' ' {"name": "", "category": "", ' '"purpose": "", ' '"duration": "", ' '"type": "", ' '"vendor": ""}\n' "]}\n\n" "Nur JSON, kein Erklaerungstext, keine Code-Fences." ) def parse_vision_response(content: str) -> list[dict]: """Be lenient: code fences, leading prose, partial JSON.""" if not content: return [] txt = content.strip() if txt.startswith("```"): lines = txt.split("\n") if lines and lines[-1].strip().startswith("```"): txt = "\n".join(lines[1:-1]) else: txt = "\n".join(lines[1:]) a, b = txt.find("{"), txt.rfind("}") if not (0 <= a < b): return [] try: obj = json.loads(txt[a:b + 1]) except json.JSONDecodeError: return [] if not isinstance(obj, dict): return [] arr = obj.get("cookies") or obj.get("Cookies") or [] if not isinstance(arr, list): return [] out: list[dict] = [] for item in arr[:300]: if not isinstance(item, dict): continue name = (item.get("name") or "").strip() if not name or len(name) < 2 or len(name) > 80: continue if re.fullmatch(r"[\s\-_.]+", name): continue out.append({ "name": name[:80], "category": (item.get("category") or "").strip()[:60], "purpose": (item.get("purpose") or "").strip()[:200], "duration": (item.get("duration") or "").strip()[:60], "type": (item.get("type") or "").strip()[:30], "vendor": (item.get("vendor") or "").strip()[:80], }) return out