breakpilot-compliance/backend-compliance/compliance/services/cookie_screenshot_ocr.py

"""Screenshot-basierte Cookie-Extraktion (Orchestration).

Pipeline:
1. consent-tester macht Full-Page-Screenshot (Banner akzeptiert,
   Accordions ausgeklappt, Timestamp eingebrannt) → PNG b64
2. Tesseract OCR (lang=deu, psm=4) → Rohtext mit Tabellen-Reihen
3. parse_ocr_cookie_table(text) → strukturierte Liste

Phase-1-Split (2026-06-06): Engine-Funktionen
(_slice_screenshot / vision-OCR / paddle / tesseract / parse) leben
jetzt in `cookie_screenshot_ocr_engines.py`. Re-Exports halten die
Public-API stabil — externe Importer (`_phase_d1_vendors_raw.py`)
brauchen keinen Code-Change.
"""

from __future__ import annotations

import base64 as _b64
import logging
import os

import httpx

from .cookie_screenshot_ocr_engines import (  # noqa: F401  (re-exports)
    OLLAMA_URL,
    VISION_MODEL,
    VISION_PROMPT,
    _PADDLE_OCR,
    _call_vision_on_slice,
    _slice_screenshot,
    ocr_screenshot_via_paddle,
    ocr_screenshot_via_tesseract,
    ocr_screenshot_via_vision_slices,
    parse_ocr_cookie_table,
    parse_vision_response,
)

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# Base URL of the consent-tester service; the capture functions in this
# module POST to its `/capture-evidence` endpoint. Read once at import time
# from the CONSENT_TESTER_URL environment variable, with a default host
# name as fallback.
CONSENT_TESTER_URL = os.getenv(
    "CONSENT_TESTER_URL", "http://bp-compliance-consent-tester:8094"
)


# Backward-compat alias: some callers may still import the old private name
# `_parse_vision_response`; it is bound to the re-exported public function.
_parse_vision_response = parse_vision_response


async def capture_cookie_evidence_slices(
    cookie_url: str, check_id: str = "",
    viewport_h: int = 1024, overlap_px: int = 200, max_slices: int = 40,
    timeout_s: float = 180.0,
) -> dict:
    """Capture a full-page screenshot and slice it (with overlap) in-memory.

    Why not scroll-based slicing in Playwright? VW's cookie-page uses
    scroll-snap / fixed-position elements that defeat window.scrollTo —
    all viewport screenshots came back identical (header overlay only).
    A full-page screenshot bypasses scrolling entirely, and we slice the
    PNG bytes locally via PIL to get the same overlapping evidence chain.

    Args:
        cookie_url: Page to capture; a falsy value short-circuits with an
            error dict and no HTTP call.
        check_id: Forwarded unchanged in the consent-tester request body.
        viewport_h: Height in pixels of each slice.
        overlap_px: Pixels shared by consecutive slices; the vertical step
            is ``viewport_h - overlap_px``, clamped to at least 1.
        max_slices: Upper bound on the number of slices produced.
        timeout_s: Timeout (seconds) for the consent-tester HTTP call.

    Returns:
        On success, a dict with ``slices`` (one dict per slice holding
        ``idx``, ``ts``, ``top_y``, ``bot_y``, a truncated ``sha256``,
        ``png_b64`` and ``png_size``) plus ``total_height_px``,
        ``width_px``, ``accepted_banner``, ``expanded``, ``url`` and
        ``captured_at``. On any failure, ``{"slices": [], "error": msg}``
        with a non-empty message; HTTP, decoding and imaging errors are
        logged and reported this way rather than raised.
    """
    if not cookie_url:
        return {"slices": [], "error": "no url"}
    try:
        async with httpx.AsyncClient(timeout=timeout_s) as c:
            r = await c.post(
                f"{CONSENT_TESTER_URL}/capture-evidence",
                json={"url": cookie_url, "check_id": check_id},
                timeout=timeout_s,
            )
            r.raise_for_status()
            data = r.json()
    except Exception as e:
        logger.warning("capture full-page evidence failed: %s", e)
        return {"slices": [], "error": (str(e) or type(e).__name__)[:200]}

    # A JSON body that is not an object (list, string, null) would make the
    # `.get` calls below raise outside any try-block.
    if not isinstance(data, dict):
        logger.warning("capture full-page evidence failed: %s",
                       "unexpected response payload")
        return {"slices": [], "error": "unexpected response payload"}

    png_b64 = data.get("png_b64", "")
    if not png_b64:
        # `or` keeps the error non-empty when the service sends a null/blank
        # "error" field alongside a missing screenshot.
        return {"slices": [], "error": data.get("error") or "no png"}

    try:
        # Imported lazily so a missing Pillow install is reported through
        # the error dict below instead of breaking module import.
        from PIL import Image
        from io import BytesIO
        import hashlib as _hl
        png = _b64.b64decode(png_b64)
        img = Image.open(BytesIO(png)).convert("RGB")
        w, h = img.size
        step = max(1, viewport_h - overlap_px)
        captured_at = data.get("captured_at", "")
        slices: list[dict] = []
        idx = 0
        y = 0
        while y < h and idx < max_slices:
            top = y
            bot = min(y + viewport_h, h)
            chunk = img.crop((0, top, w, bot))
            buf = BytesIO()
            chunk.save(buf, format="PNG", optimize=True)
            png_chunk = buf.getvalue()
            slices.append({
                "idx": idx,
                "ts": captured_at,
                "top_y": top, "bot_y": bot,
                "sha256": _hl.sha256(png_chunk).hexdigest()[:16],
                "png_b64": _b64.b64encode(png_chunk).decode("ascii"),
                "png_size": len(png_chunk),
            })
            if bot >= h:
                # This slice already reaches the page bottom. Stepping on
                # would only emit slivers fully contained in it (pure
                # duplicates that cost OCR time and add no evidence).
                break
            y += step
            idx += 1
        logger.info(
            "Evidence-slices (PIL-cut): %s → %d slices (image %dx%d, "
            "viewport=%d, overlap=%d)",
            cookie_url, len(slices), w, h, viewport_h, overlap_px,
        )
        return {
            "slices": slices,
            "total_height_px": h,
            "width_px": w,
            "accepted_banner": data.get("accepted_banner"),
            "expanded": data.get("expanded"),
            "url": data.get("url", cookie_url),
            "captured_at": captured_at,
        }
    except Exception as e:
        logger.warning("PIL-slice failed: %s (%s)",
                        str(e) or "(no msg)", type(e).__name__)
        # Some exceptions carry no message; fall back to the type name so
        # callers that test `result.get("error")` still see a failure.
        return {"slices": [], "error": (str(e) or type(e).__name__)[:200]}


def _ocr_one_slice(s: dict) -> tuple[dict, list[dict]]:
    """Helper for parallel execution: tesseract + parse for one slice.

    Args:
        s: Slice dict; ``png_b64`` is decoded and OCR'd, while ``idx``,
            ``ts``, ``top_y`` and ``bot_y`` are copied into the result
            metadata (``None`` when absent).

    Returns:
        ``(meta, cookies)`` where ``meta`` carries the copied slice fields
        plus ``cookies_found`` and ``cookies`` is the parsed list. A slice
        whose image is missing, empty or not valid base64 yields
        ``cookies_found == 0`` and an empty list without running OCR.
    """
    meta = {"idx": s.get("idx"), "ts": s.get("ts"),
            "top_y": s.get("top_y"), "bot_y": s.get("bot_y"),
            "cookies_found": 0}
    try:
        png = _b64.b64decode(s.get("png_b64") or "")
    except Exception:
        # Best effort: an undecodable slice counts as "no cookies" so one
        # bad slice does not abort the whole batch.
        return meta, []
    if not png:
        # Nothing to recognise: do not hand empty bytes to the OCR engine.
        return meta, []
    text = ocr_screenshot_via_tesseract(png)
    chunk = parse_ocr_cookie_table(text)
    meta["cookies_found"] = len(chunk)
    return meta, chunk


def ocr_slices_extract_cookies(
    slices: list[dict], max_workers: int = 4,
) -> tuple[list[dict], dict]:
    """OCR every slice on a thread pool and merge the cookies, unique by name.

    Each slice is handed to ``_ocr_one_slice``. The parsed cookies are then
    concatenated in slice order, keeping only the first occurrence of each
    name (compared stripped and lower-cased); entries without a name are
    dropped.

    The thread pool follows the original author's note that Tesseract does
    its work outside the GIL (reported: 32 slices take ~60s sequentially
    vs. ~15s with 4 workers) — not verified here.

    Args:
        slices: Slice dicts as produced by the evidence-slice capture.
        max_workers: Size of the thread pool.

    Returns:
        ``(cookies, stats)`` where ``stats`` holds ``per_slice`` (metadata
        per slice), ``total_raw``, ``total_unique`` and ``slices``.
    """
    if not slices:
        return [], {"per_slice": [], "total_raw": 0,
                    "total_unique": 0, "slices": 0}

    from concurrent.futures import ThreadPoolExecutor

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        outcomes = list(pool.map(_ocr_one_slice, slices))

    per_slice: list[dict] = [meta for meta, _found in outcomes]

    # Insertion-ordered dict doubles as "seen" set and ordered result list.
    first_by_name: dict[str, dict] = {}
    for _meta, found in outcomes:
        for cookie in found:
            key = (cookie.get("name") or "").strip().lower()
            if key and key not in first_by_name:
                first_by_name[key] = cookie
    all_cookies: list[dict] = list(first_by_name.values())

    raw_count = sum(meta["cookies_found"] for meta in per_slice)
    stats = {
        "per_slice": per_slice,
        "total_raw": raw_count,
        "total_unique": len(all_cookies),
        "slices": len(slices),
    }
    logger.info(
        "ocr_slices_extract_cookies (parallel=%d): %d slices → %d raw → %d unique",
        max_workers, stats["slices"], stats["total_raw"],
        stats["total_unique"],
    )
    return all_cookies, stats


async def capture_cookie_screenshot(
    cookie_url: str, check_id: str = "", timeout_s: float = 60.0,
) -> dict:
    """Ask the consent-tester for a full-page screenshot of ``cookie_url``.

    Args:
        cookie_url: Page to capture; a falsy value returns an error dict
            without any HTTP call.
        check_id: Forwarded unchanged in the request body.
        timeout_s: Timeout (seconds) for the HTTP call.

    Returns:
        The decoded JSON response of ``/capture-evidence`` on success,
        otherwise ``{"png_b64": "", "error": msg}``. Failures are logged
        and reported through that dict rather than raised.
    """
    if not cookie_url:
        return {"png_b64": "", "error": "no url"}

    try:
        endpoint = f"{CONSENT_TESTER_URL}/capture-evidence"
        request_body = {"url": cookie_url, "check_id": check_id}
        async with httpx.AsyncClient(timeout=timeout_s) as client:
            response = await client.post(
                endpoint, json=request_body, timeout=timeout_s,
            )
            response.raise_for_status()
            shot = response.json()
            # Summarise what the service reported about the capture.
            logger.info(
                "Evidence-Screenshot: %s -> %d bytes (%dx%d, expanded=%d, accepted=%s)",
                cookie_url,
                shot.get("png_size", 0),
                shot.get("width_px", 0),
                shot.get("height_px", 0),
                shot.get("expanded", 0),
                shot.get("accepted_banner"),
            )
            return shot
    except Exception as exc:
        logger.warning("capture_cookie_screenshot failed for %s: %s",
                       cookie_url, exc)
        failure = {"png_b64": "", "error": str(exc)[:200]}
        return failure


async def extract_cookies_via_vision(
    png_b64: str, timeout_s: float = 240.0,
) -> list[dict]:
    """Send the screenshot plus extraction prompt to the Ollama vision model.

    Args:
        png_b64: Base64-encoded screenshot; a falsy value returns ``[]``
            without any HTTP call.
        timeout_s: Timeout (seconds) for the chat request.

    Returns:
        The cookies parsed from the model's reply by
        ``parse_vision_response``; ``[]`` when the call or the parsing
        fails (the failure is logged, not raised).
    """
    if not png_b64:
        return []

    user_message = {
        "role": "user",
        "content": VISION_PROMPT,
        "images": [png_b64],
    }
    request_body = {
        "model": VISION_MODEL,
        "stream": False,
        "format": "json",
        "messages": [user_message],
        "options": {"temperature": 0.05, "num_predict": 8000},
    }

    try:
        async with httpx.AsyncClient(timeout=timeout_s) as client:
            response = await client.post(
                f"{OLLAMA_URL.rstrip('/')}/api/chat",
                json=request_body,
            )
            response.raise_for_status()
        # The body is read after the client has been closed.
        message = response.json().get("message") or {}
        content = message.get("content", "") or ""
        cookies = parse_vision_response(content)
        logger.info(
            "Vision-OCR extracted %d cookies (model=%s, response_len=%d)",
            len(cookies), VISION_MODEL, len(content),
        )
        return cookies
    except Exception as exc:
        reason = str(exc) or "(no msg)"
        logger.warning(
            "Vision-OCR call failed: %s (%s) model=%s",
            reason, type(exc).__name__, VISION_MODEL,
        )
        return []


def cookies_to_vendor_records(
    cookies: list[dict], guess_vendor_fn=None,
) -> list[dict]:
    """Aggregate OCR-extracted cookies into vendor records compatible with
    the cmp_vendors schema.

    Cookies are grouped by their stripped ``vendor`` value. When that is
    empty, ``guess_vendor_fn(name)`` is consulted (any exception it raises
    is treated as "no guess"), and cookies that still have no vendor land
    under ``"Unbekannter Anbieter"``. A vendor record's ``category`` and
    ``persistence`` come from the first cookie seen for that vendor.

    Cookies without a usable ``name`` (missing, ``None`` or empty) are
    skipped: they cannot be identified, and indexing ``c["name"]`` on them
    would otherwise raise ``KeyError`` and abort the whole aggregation.

    Args:
        cookies: Cookie dicts with ``name`` and optional ``vendor``,
            ``category``, ``duration``, ``purpose`` and ``type`` keys.
            ``None`` is treated as an empty list.
        guess_vendor_fn: Optional callable mapping a cookie name to a
            vendor name (falsy result means "unknown").

    Returns:
        One record per vendor, in order of first appearance, each with a
        ``cookies`` list of per-cookie entries.
    """
    by_vendor: dict[str, dict] = {}
    for c in cookies or ():
        name = c.get("name")
        if not name:
            continue

        v_name = (c.get("vendor") or "").strip()
        if not v_name and guess_vendor_fn:
            try:
                v_name = guess_vendor_fn(name) or ""
            except Exception:
                # Guessing is best effort; fall through to the placeholder.
                v_name = ""
        if not v_name:
            v_name = "Unbekannter Anbieter"

        v = by_vendor.get(v_name)
        if v is None:
            v = by_vendor[v_name] = {
                "name": v_name,
                "country": "",
                "purpose": "",
                "category": c.get("category", ""),
                "opt_out_url": "",
                "privacy_policy_url": "",
                "persistence": c.get("duration", ""),
                "cookies": [],
                "source": "vision_ocr",
            }
        v["cookies"].append({
            "name": name,
            "purpose": c.get("purpose", ""),
            "expiry": c.get("duration", ""),
            # NOTE(review): hard-coded for every cookie; first-party cookies
            # are not distinguished here — confirm this is intended.
            "is_third_party": True,
            "declared_category": c.get("category", ""),
            "type": c.get("type", ""),
        })
    return list(by_vendor.values())