# breakpilot-compliance/backend-compliance/compliance/services/cookie_screenshot_ocr_engines.py

"""OCR engine functions for cookie_screenshot_ocr (phase-1 split).

Moved out of the main module so that it stays below the 500-LOC hard cap:
  - PIL-based _slice_screenshot (cuts a PNG into sub-images)
  - Vision-LLM OCR (ollama qwen2.5vl:32b)
  - PaddleOCR fallback
  - Tesseract OCR (main path)
  - Anchor-based parser parse_ocr_cookie_table
  - parse_vision_response (tolerant JSON parsing of the vision output)
"""

from __future__ import annotations

import base64 as _b64
import json
import logging
import os
import re

import httpx

# Module-wide logger used by every OCR engine in this file.
logger = logging.getLogger(__name__)


# Vision model tag and Ollama base URL. Both are read from the environment
# once at import time; the literals are the fallbacks when a variable is unset.
VISION_MODEL = os.environ.get("COOKIE_VISION_MODEL", "qwen2.5vl:32b")
OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://host.docker.internal:11434")

# Shared PaddleOCR instance; stays None until ocr_screenshot_via_paddle
# creates it on first use.
_PADDLE_OCR = None


# ── 1. Screenshot-Slicing für Vision-Models ────────────────────────

def _slice_screenshot(png_bytes: bytes, slice_h: int = 1500,
                      max_slices: int = 25) -> list[str]:
    """Cut a tall full-page screenshot into horizontal bands.

    Vision models choke on 25k-tall images (resampled down to ~1024 →
    unreadable text); slicing keeps the text at its original resolution.

    Args:
        png_bytes: Encoded screenshot image.
        slice_h: Height of each band in pixels; the last band may be shorter.
            Each band keeps the full image width.
        max_slices: Upper bound on the number of bands returned, counted from
            the top of the image.

    Returns:
        The bands, top to bottom, each as a base64-encoded PNG string. An
        empty list when the input is empty, the slice parameters are not
        positive, Pillow is missing, or the image cannot be decoded.
    """
    if not png_bytes or slice_h <= 0 or max_slices <= 0:
        return []
    try:
        from PIL import Image
        from io import BytesIO
    except ImportError as e:
        logger.warning("PIL not available: %s", e)
        return []
    try:
        # convert() forces the pixel data to load, so truncated files fail
        # here rather than later inside crop()/save().
        img = Image.open(BytesIO(png_bytes)).convert("RGB")
    except Exception as e:
        # Pillow signals broken input with several unrelated exception types;
        # treat all of them as "no slices", like the PaddleOCR engine does.
        logger.warning("PIL open failed: %s", e)
        return []
    width, height = img.size
    count = min((height + slice_h - 1) // slice_h, max_slices)
    out: list[str] = []
    for index in range(count):
        top = index * slice_h
        bottom = min(top + slice_h, height)
        buf = BytesIO()
        img.crop((0, top, width, bottom)).save(buf, format="PNG",
                                               optimize=True)
        out.append(_b64.b64encode(buf.getvalue()).decode("ascii"))
    return out


# ── 2. Vision-LLM-OCR ──────────────────────────────────────────────

async def _call_vision_on_slice(b64_png: str,
                                  timeout_s: float = 240.0) -> str:
    """Ask the vision model to dump all cookie-row text from one slice.

    The model is asked for raw pipe-separated text (NOT JSON); this function
    returns the answer unparsed.

    Args:
        b64_png: One screenshot slice as base64-encoded PNG.
        timeout_s: Timeout handed to the HTTP client for the request.

    Returns:
        The model's answer text, or "" when the request fails, the server
        answers with an error status, or the reply carries no text content.
    """
    prompt = (
        "Du siehst einen Bildausschnitt einer Cookie-Richtlinien-Tabelle. "
        "Liste ALLE Tabellen-Zeilen wortwoertlich auf, eine Zeile pro "
        "Cookie. Jede Zeile soll enthalten: Cookie-Name, Kategorie, "
        "Zweck, Speicherdauer, Art (Permanent/Session). "
        "Format: '<Name> | <Kategorie> | <Zweck> | <Dauer> | <Art>'. "
        "KEINE Cookies erfinden, nur was im Bild steht. Nur die Tabellen-"
        "Zeilen, keine Erklaerungen."
    )
    payload = {
        "model": VISION_MODEL,
        "stream": False,
        "messages": [{
            "role": "user", "content": prompt, "images": [b64_png],
        }],
        "options": {"temperature": 0.05, "num_predict": 4000},
    }
    try:
        async with httpx.AsyncClient(timeout=timeout_s) as c:
            r = await c.post(f"{OLLAMA_URL.rstrip('/')}/api/chat",
                              json=payload)
            r.raise_for_status()
        content = (r.json().get("message") or {}).get("content", "")
        # Guard the declared return type against a non-text "content" value.
        return content if isinstance(content, str) else ""
    except Exception as e:
        # Best-effort: one failed slice must not abort the whole OCR run.
        # Timeout errors often carry an empty message, so the exception type
        # is logged as well, and at WARNING so the failure is visible.
        logger.warning("vision slice failed: %s (%s)",
                        str(e) or "(no msg)", type(e).__name__)
        return ""


async def ocr_screenshot_via_vision_slices(png_bytes: bytes,
                                            max_slices: int = 20) -> str:
    """Run vision-model OCR over a screenshot, slice by slice.

    The screenshot is cut into bands of 1500 px height (at most
    ``max_slices``), each band is sent to the vision model one after the
    other, and the non-empty answers are joined with newlines in slice order.
    Returns "" when no slices could be produced.
    """
    encoded = _slice_screenshot(png_bytes, slice_h=1500,
                                max_slices=max_slices)
    if not encoded:
        return ""
    total = len(encoded)
    logger.info("Vision-slicing: %d slices → vision-OCR (model=%s)",
                 total, VISION_MODEL)
    answers: list[str] = []
    for position, band in enumerate(encoded, start=1):
        answer = await _call_vision_on_slice(band)
        if answer:
            answers.append(answer)
        logger.info("Vision-slice %d/%d: %d chars", position, total,
                     len(answer))
    combined = "\n".join(answers)
    logger.info("Vision-OCR slicing total: %d chars from %d slices",
                 len(combined), total)
    return combined


# ── 3. PaddleOCR (fallback) ────────────────────────────────────────

def _paddle_line_text(line: object) -> str:
    """Return the recognised text of one PaddleOCR result line.

    A structured line is a sequence whose second element is either a
    ``(text, confidence)`` pair or the text itself. Anything else is
    stringified as-is.
    """
    if isinstance(line, (list, tuple)) and len(line) >= 2:
        payload = line[1]
        if isinstance(payload, (list, tuple)):
            return str(payload[0]) if payload else ""
        return str(payload)
    return str(line)


def ocr_screenshot_via_paddle(png_bytes: bytes) -> str:
    """Run PaddleOCR over the full-page screenshot.

    The image is processed in horizontal bands of 3000 px height (full image
    width). The PaddleOCR instance is created on first use and kept in the
    module-level ``_PADDLE_OCR``.

    Args:
        png_bytes: Encoded screenshot image.

    Returns:
        All recognised text lines joined with newlines, or "" when the input
        is empty, a dependency is missing, the image cannot be opened, or
        PaddleOCR cannot be initialised. A band whose OCR call raises is
        skipped and logged.
    """
    if not png_bytes:
        return ""
    try:
        # numpy is imported here as well so that a missing install is
        # reported like the other optional dependencies instead of raising.
        import numpy as np
        from PIL import Image
        from io import BytesIO
        from paddleocr import PaddleOCR
    except ImportError as e:
        logger.warning("PaddleOCR / PIL not available: %s", e)
        return ""

    try:
        img = Image.open(BytesIO(png_bytes)).convert("RGB")
    except Exception as e:
        logger.warning("PIL open failed: %s", e)
        return ""

    w, h = img.size
    slice_h = 3000
    n_slices = (h + slice_h - 1) // slice_h
    logger.info("PaddleOCR: %dx%d screenshot → %d slices of %d high",
                 w, h, n_slices, slice_h)

    global _PADDLE_OCR
    if _PADDLE_OCR is None:
        try:
            _PADDLE_OCR = PaddleOCR(use_angle_cls=False, lang="german",
                                     show_log=False)
        except Exception as e:
            logger.warning("PaddleOCR init failed: %s", e)
            return ""

    parts: list[str] = []
    for i in range(n_slices):
        top = i * slice_h
        bot = min(top + slice_h, h)
        arr = np.array(img.crop((0, top, w, bot)))
        try:
            result = _PADDLE_OCR.ocr(arr, cls=False)
        except Exception as e:
            logger.warning("PaddleOCR slice %d failed: %s", i, e)
            continue
        # The result is iterated as pages of lines; empty entries at any
        # level are skipped.
        for page in result or ():
            for line in page or ():
                if not line:
                    continue
                txt = _paddle_line_text(line)
                if txt:
                    parts.append(txt)

    full_text = "\n".join(parts)
    logger.info("PaddleOCR: extracted %d lines / %d chars from %d slices",
                 len(parts), len(full_text), n_slices)
    return full_text


# ── 4. Tesseract OCR (Hauptpfad) ───────────────────────────────────

def ocr_screenshot_via_tesseract(png_bytes: bytes,
                                  lang: str = "deu",
                                  psm: int = 4) -> str:
    """OCR a full-page screenshot with Tesseract and flatten the whitespace.

    ``psm`` is passed through as Tesseract's ``--psm`` option; the default 4
    means a single column of text of variable sizes (cookie tables).
    Returns "" for empty input, when pytesseract or PIL cannot be imported,
    or when opening the image or running OCR raises.
    """
    if not png_bytes:
        return ""
    try:
        import pytesseract
        from PIL import Image
        from io import BytesIO
        import re as _re
    except ImportError as e:
        logger.warning("tesseract/PIL not available: %s", e)
        return ""

    # Applied in this order: squeeze blanks/tabs, turn line breaks that are
    # not followed by a blank line into spaces, then squeeze whitespace runs.
    whitespace_steps = (
        (r"[ \t]+", " "),
        (r"\n(?!\s*\n)", " "),
        (r"\s{2,}", " "),
    )
    try:
        page = Image.open(BytesIO(png_bytes)).convert("RGB")
        text = pytesseract.image_to_string(page, lang=lang,
                                            config=f"--psm {psm}")
        for regex, replacement in whitespace_steps:
            text = _re.sub(regex, replacement, text)
        width, height = page.size
        logger.info(
            "Tesseract OCR: %d chars / %d words (image %dx%d)",
            len(text), len(text.split()), width, height,
        )
        return text
    except Exception as err:
        detail = str(err) or "(no msg)"
        logger.warning("Tesseract OCR failed: %s (%s)",
                        detail, type(err).__name__)
        return ""


# ── 5. Anchor-basierter Parser ─────────────────────────────────────

# Category words that anchor a table row: a row is recognised as
# "<cookie name> <category>".
_CATEGORY_ANCHORS = (
    r"Funktionscookie", r"Trackingcookie",
    r"Tracking Cookies?", r"Session Cookies?",
    r"Funktional", r"Marketing", r"Analytics", r"Necessary",
    r"Werbung", r"Personalisierung", r"Statistik",
    r"Notwendig", r"Erforderlich",
)
# One category anchor, optionally followed by a parenthesised remark.
_CATEGORY_PATTERN = ("(?:" + "|".join(_CATEGORY_ANCHORS)
                      + r")(?:\s*\([^)]*\))?")
# Cookie-name token, optionally with a "<...>" placeholder suffix. A leading
# underscore is allowed so that names such as `_ga` keep their first character.
_COOKIE_NAME_RE = (
    r"(?:[A-Za-z_][\w\-.]{2,60}|[A-Za-z_][\w\-.]{2,60}<[^>]+>)"
)

# Start of a table row: cookie name followed by a category.
_OCR_ROW_RE = re.compile(
    rf"(?P<name>{_COOKIE_NAME_RE})\s+(?P<category>{_CATEGORY_PATTERN})",
    re.IGNORECASE,
)
# End of a table row: optional storage duration directly followed by the
# cookie type.
_OCR_TAIL_RE = re.compile(
    r"(?:(?P<duration>\d+(?:[.,]\s*)?\s*"
    r"(?:Tage|Jahre?|Monate?|Minuten|Stunden|Sekunden)\.?)\s*)?"
    r"(?P<type>Permanent/Protokoll|Session\s*Cookie|Persistent\s*Cookie)",
    re.IGNORECASE,
)
# Maximum number of characters allowed between the category and the row tail.
_MAX_PURPOSE_SPAN = 300
# Lower-cased tokens that precede a category word in headers or running text
# but are never cookie names.
_NON_COOKIE_WORDS = frozenset((
    "name", "art", "zweck", "dauer", "kategorie", "anbieter",
    "cookie", "cookies", "name des cookies",
    "this", "dieser", "diese", "alle", "und", "von", "der",
    "die", "das", "ein", "eine", "session", "permanent",
    "category",
))


def _looks_like_cookie_name(name: str) -> bool:
    """Tell whether a token in front of a category word is a cookie name.

    Ordinary words are rejected: a name must contain a letter and either a
    marker character (``_-.<>``), be written in capitals, or be mixed-case
    beyond its first character.
    """
    if len(name) < 3 or name.lower() in _NON_COOKIE_WORDS:
        return False
    if not any(c.isalpha() for c in name):
        return False
    has_marker = any(c in "_-.<>" for c in name)
    is_caps = name.upper() == name
    is_camel = (any(c.isupper() for c in name[1:])
                and any(c.islower() for c in name))
    return has_marker or is_caps or is_camel


def parse_ocr_cookie_table(text: str) -> list[dict]:
    """Extract cookie records from OCR text of a cookie table.

    Rows are located by a cookie name followed by a category anchor. For each
    row, the text up to the next row is searched for the row tail (optional
    duration + cookie type). When a tail is found within
    ``_MAX_PURPOSE_SPAN`` characters, the text in front of it becomes the
    purpose; otherwise purpose, duration and type stay empty. Cookie names
    are NOT corrected: they are emitted exactly as they appear in ``text``.

    Args:
        text: OCR output; inputs shorter than 200 characters are ignored.

    Returns:
        One dict per distinct cookie name (case-insensitive, first occurrence
        wins) with the keys name, category, purpose, duration, type, vendor.
        ``vendor`` is always "".
    """
    if not text or len(text) < 200:
        return []
    rows = [m for m in _OCR_ROW_RE.finditer(text)
            if _looks_like_cookie_name(m.group("name"))]
    seen_names: set[str] = set()
    out: list[dict] = []
    for idx, row in enumerate(rows):
        name = row.group("name")
        key = name.lower()
        if key in seen_names:
            continue
        seen_names.add(key)
        # A row's details may not reach into the next recognised row, so the
        # tail search is bounded by that row's start.
        window_end = (rows[idx + 1].start() if idx + 1 < len(rows)
                      else len(text))
        tail = _OCR_TAIL_RE.search(text, row.end(), window_end)
        if tail is not None and tail.start() - row.end() > _MAX_PURPOSE_SPAN:
            tail = None
        if tail is None:
            purpose = duration = cookie_type = ""
        else:
            purpose = text[row.end():tail.start()].strip()
            duration = (tail.group("duration") or "").strip()
            cookie_type = tail.group("type").strip()
        out.append({
            "name": name[:80],
            "category": row.group("category").strip()[:60],
            "purpose": purpose[:200],
            "duration": duration[:60],
            "type": cookie_type[:30],
            "vendor": "",
        })
    logger.info("parse_ocr_cookie_table: %d unique cookies extracted",
                 len(out))
    return out


# ── 6. Vision-Response-Parser ──────────────────────────────────────

# German prompt asking a vision model to return every cookie visible in a
# cookie-policy screenshot as one JSON object of the form {"cookies": [...]}.
# The record keys it requests (name, category, purpose, duration, type,
# vendor) are the keys parse_vision_response reads.
VISION_PROMPT = (
    "Du analysierst einen Screenshot einer Cookie-Richtlinie. Auf der Seite "
    "ist eine Tabelle mit Cookies aufgelistet. Spalten sind ueblicherweise: "
    "Name des Cookies, Kategorie (z.B. 'Funktional', 'Marketing', "
    "'Analytics'), Verwendungszweck, Speicherdauer, Art des Cookies "
    "(z.B. 'Permanent', 'Session').\n\n"
    "Extrahiere ALLE Cookies aus dem Bild. Wenn die Tabelle abgeschnitten "
    "ist, extrahiere alles was sichtbar ist. KEINE Cookies erfinden, KEINE "
    "Halluzinationen.\n\n"
    "Antworte als reines JSON-Objekt im Format:\n"
    '{"cookies": [\n'
    '  {"name": "<Cookie-Name exakt>", "category": "<Kategorie>", '
    '"purpose": "<Kurzfassung Zweck max 120 chars>", '
    '"duration": "<Speicherdauer mit Einheit>", '
    '"type": "<Permanent|Session|...>", '
    '"vendor": "<Anbieter falls bekannt, sonst leer>"}\n'
    "]}\n\n"
    "Nur JSON, kein Erklaerungstext, keine Code-Fences."
)


def _vision_field(value: object) -> str:
    """Coerce one JSON field of a vision-model cookie record to stripped text.

    Strings are stripped and numbers are stringified. Every other value
    (null, booleans, lists, objects, anything falsy) becomes "", so malformed
    model output can never raise here.
    """
    if not value:
        return ""
    if isinstance(value, str):
        return value.strip()
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        return str(value)
    return ""


def parse_vision_response(content: str) -> list[dict]:
    """Leniently parse the vision model's JSON answer into cookie records.

    Tolerated: a surrounding Markdown code fence, prose before or after the
    JSON object, the key ``Cookies`` instead of ``cookies``, non-dict list
    items, and non-string field values. Text that is not valid JSON between
    its first ``{`` and last ``}`` (for example a truncated answer) yields
    an empty list.

    Args:
        content: Raw text answer of the model.

    Returns:
        Up to 300 records with the keys name, category, purpose, duration,
        type, vendor, all stripped and length-limited. Records whose name is
        missing, shorter than 2 or longer than 80 characters, or made up only
        of whitespace/``-_.`` are dropped.
    """
    if not content:
        return []
    txt = content.strip()
    if txt.startswith("```"):
        # Drop the opening fence line and, when present, the closing one.
        lines = txt.split("\n")
        if lines and lines[-1].strip().startswith("```"):
            txt = "\n".join(lines[1:-1])
        else:
            txt = "\n".join(lines[1:])
    a, b = txt.find("{"), txt.rfind("}")
    if not (0 <= a < b):
        return []
    try:
        obj = json.loads(txt[a:b + 1])
    except json.JSONDecodeError:
        return []
    if not isinstance(obj, dict):
        return []
    arr = obj.get("cookies") or obj.get("Cookies") or []
    if not isinstance(arr, list):
        return []
    out: list[dict] = []
    for item in arr[:300]:
        if not isinstance(item, dict):
            continue
        name = _vision_field(item.get("name"))
        if not 2 <= len(name) <= 80:
            continue
        if re.fullmatch(r"[\s\-_.]+", name):
            continue
        out.append({
            "name": name,
            "category": _vision_field(item.get("category"))[:60],
            "purpose": _vision_field(item.get("purpose"))[:200],
            "duration": _vision_field(item.get("duration"))[:60],
            "type": _vision_field(item.get("type"))[:30],
            "vendor": _vision_field(item.get("vendor"))[:80],
        })
    return out