Files
breakpilot-lehrer/klausur-service/backend/handwriting_htr_api.py
Benjamin Admin 2e0f8632f8
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m49s
CI / test-python-agent-core (push) Successful in 14s
CI / test-nodejs-website (push) Successful in 15s
feat(klausur): Handschrift entfernen + Klausur-HTR implementiert
Feature 1: Handschrift entfernen via OCR-Pipeline Session
- services/handwriting_detection.py: _detect_pencil() + target_ink Parameter
  ("all" | "colored" | "pencil") für gezielte Tinten-Erkennung
- ocr_pipeline_session_store.py: clean_png + handwriting_removal_meta Spalten
  (idempotentes ALTER TABLE in init_ocr_pipeline_tables)
- ocr_pipeline_api.py: POST /sessions/{id}/remove-handwriting Endpoint
  + "clean" zu valid_types für Image-Serving hinzugefügt

Feature 2: Klausur-HTR (Hochwertige Handschriftenerkennung)
- handwriting_htr_api.py: Neuer Router /api/v1/htr/recognize + /recognize-session
  Primary: qwen2.5vl:32b via Ollama, Fallback: trocr-large-handwritten
- services/trocr_service.py: size Parameter (base | large) für get_trocr_model()
  + run_trocr_ocr() - unterstützt jetzt trocr-large-handwritten
- main.py: HTR Router registriert

Config:
- docker-compose.yml: OLLAMA_HTR_MODEL, HTR_FALLBACK_MODEL
- .env.example: HTR Env-Vars dokumentiert

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-03 12:04:26 +01:00

277 lines
9.1 KiB
Python

"""
Handwriting HTR API - Hochwertige Handschriftenerkennung (HTR) fuer Klausurkorrekturen.
Endpoints:
- POST /api/v1/htr/recognize - Bild hochladen → handgeschriebener Text
- POST /api/v1/htr/recognize-session - OCR-Pipeline Session als Quelle nutzen
Modell-Strategie:
1. qwen2.5vl:32b via Ollama (primaer, hoechste Qualitaet als VLM)
2. microsoft/trocr-large-handwritten (Fallback, offline, kein Ollama)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal auf dem Mac Mini.
"""
import io
import os
import logging
import time
import base64
from typing import Optional
import cv2
import numpy as np
from fastapi import APIRouter, HTTPException, Query, UploadFile, File
from pydantic import BaseModel
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api/v1/htr", tags=["HTR"])
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434")
OLLAMA_HTR_MODEL = os.getenv("OLLAMA_HTR_MODEL", "qwen2.5vl:32b")
HTR_FALLBACK_MODEL = os.getenv("HTR_FALLBACK_MODEL", "trocr-large")
# ---------------------------------------------------------------------------
# Pydantic Models
# ---------------------------------------------------------------------------
class HTRSessionRequest(BaseModel):
    """Request body for POST /recognize-session.

    Selects an OCR-Pipeline session as the image source for HTR.
    """
    # ID of an existing OCR-Pipeline session (404 if unknown).
    session_id: str
    # Backend selection: "auto" | "qwen2.5vl" | "trocr-large"
    model: str = "auto"
    # Prefer clean_png (image after the handwriting-removal step) as source.
    use_clean: bool = True
# ---------------------------------------------------------------------------
# Preprocessing
# ---------------------------------------------------------------------------
def _preprocess_for_htr(img_bgr: np.ndarray) -> np.ndarray:
    """Boost contrast (CLAHE) and upscale small inputs to help HTR accuracy.

    Takes a BGR image; returns a single-channel (grayscale) enhanced image.
    """
    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    enhanced = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)).apply(gray)
    height, width = enhanced.shape
    shortest_side = min(height, width)
    # Upscale so the shorter side reaches 800 px; cubic keeps stroke edges smooth.
    if shortest_side < 800:
        factor = 800 / shortest_side
        enhanced = cv2.resize(
            enhanced,
            None,
            fx=factor,
            fy=factor,
            interpolation=cv2.INTER_CUBIC,
        )
    return enhanced
def _bgr_to_png_bytes(img_bgr: np.ndarray) -> bytes:
    """Encode a BGR ndarray as PNG and return the raw bytes.

    Raises RuntimeError when OpenCV cannot encode the image.
    """
    ok, encoded = cv2.imencode(".png", img_bgr)
    if ok:
        return encoded.tobytes()
    raise RuntimeError("Failed to encode image to PNG")
def _preprocess_image_bytes(image_bytes: bytes) -> bytes:
    """Decode raw image bytes, apply HTR preprocessing, re-encode as PNG.

    Raises ValueError when the payload is not a decodable image.
    """
    decoded = cv2.imdecode(np.frombuffer(image_bytes, dtype=np.uint8), cv2.IMREAD_COLOR)
    if decoded is None:
        raise ValueError("Could not decode image")
    gray_enhanced = _preprocess_for_htr(decoded)
    # The PNG encoder path expects 3 channels: lift grayscale back to BGR.
    return _bgr_to_png_bytes(cv2.cvtColor(gray_enhanced, cv2.COLOR_GRAY2BGR))
# ---------------------------------------------------------------------------
# Backend: Ollama qwen2.5vl
# ---------------------------------------------------------------------------
async def _recognize_with_qwen_vl(image_bytes: bytes, language: str) -> Optional[str]:
    """
    Send an image to the Ollama VLM (OLLAMA_HTR_MODEL, default qwen2.5vl:32b) for HTR.

    Parameters:
        image_bytes: encoded image (PNG/JPEG) of the handwriting to transcribe.
        language: "de" | "en" | "de+en" — steers the prompt's language hint;
            anything else falls back to German.

    Returns the extracted text (stripped), or None on any error so the caller
    can fall back to the offline TrOCR backend.
    """
    import httpx
    lang_hint = {
        "de": "Deutsch",
        "en": "Englisch",
        "de+en": "Deutsch und Englisch",
    }.get(language, "Deutsch")
    # BUG FIX: lang_hint was computed but never interpolated — the prompt
    # literally read "... handgeschriebenen Text auf python", so the model
    # never received the language hint. Interpolate it here.
    prompt = (
        f"Du bist ein OCR-Experte fuer handgeschriebenen Text auf {lang_hint}. "
        "Lies den Text im Bild exakt ab — korrigiere KEINE Rechtschreibfehler. "
        "Antworte NUR mit dem erkannten Text, ohne Erklaerungen."
    )
    img_b64 = base64.b64encode(image_bytes).decode("utf-8")
    payload = {
        "model": OLLAMA_HTR_MODEL,
        "prompt": prompt,
        "images": [img_b64],
        "stream": False,
    }
    try:
        async with httpx.AsyncClient(timeout=120.0) as client:
            resp = await client.post(f"{OLLAMA_BASE_URL}/api/generate", json=payload)
            resp.raise_for_status()
            data = resp.json()
            return data.get("response", "").strip()
    except Exception as e:
        # Best effort: log and signal failure; _do_recognize handles fallback.
        logger.warning(f"Ollama qwen2.5vl HTR failed: {e}")
        return None
# ---------------------------------------------------------------------------
# Backend: TrOCR-large fallback
# ---------------------------------------------------------------------------
async def _recognize_with_trocr_large(image_bytes: bytes) -> Optional[str]:
    """
    Offline fallback: microsoft/trocr-large-handwritten via trocr_service.py.

    Returns the recognized text (stripped), or None when TrOCR is
    unavailable or recognition fails.
    """
    try:
        from services.trocr_service import run_trocr_ocr, _check_trocr_available
        if not _check_trocr_available():
            logger.warning("TrOCR not available for HTR fallback")
            return None
        # Confidence is reported by the service but unused here.
        text, _confidence = await run_trocr_ocr(image_bytes, handwritten=True, size="large")
        if not text:
            return None
        return text.strip()
    except Exception as e:
        logger.warning(f"TrOCR-large HTR failed: {e}")
        return None
# ---------------------------------------------------------------------------
# Core recognition logic
# ---------------------------------------------------------------------------
async def _do_recognize(
    image_bytes: bytes,
    model: str = "auto",
    preprocess: bool = True,
    language: str = "de",
) -> dict:
    """
    Core HTR logic: preprocess → try Ollama qwen2.5vl → fall back to TrOCR-large.

    Parameters:
        image_bytes: encoded source image.
        model: "auto" | "qwen2.5vl" | "trocr-large" (validated by callers).
        preprocess: apply CLAHE + upscale before recognition (best effort).
        language: "de" | "en" | "de+en" hint forwarded to the VLM prompt.

    Returns a dict with text, model_used, processing_time_ms, language,
    preprocessed. text is "" when every backend failed.
    """
    t0 = time.monotonic()
    if preprocess:
        try:
            image_bytes = _preprocess_image_bytes(image_bytes)
        except Exception as e:
            # Best effort: recognition still runs on the raw image.
            logger.warning(f"HTR preprocessing failed, using raw image: {e}")
    text: Optional[str] = None
    model_used: str = "none"
    # FIX: the old use_trocr expression tested `text is None` BEFORE any
    # backend ran (trivially True), which made it dead logic. Effective
    # behavior — kept here explicitly: TrOCR runs whenever no text was
    # obtained, including when the caller pinned model="qwen2.5vl".
    if model in ("auto", "qwen2.5vl"):
        text = await _recognize_with_qwen_vl(image_bytes, language)
        if text is not None:
            model_used = f"qwen2.5vl ({OLLAMA_HTR_MODEL})"
    if text is None and model in ("auto", "qwen2.5vl", "trocr-large"):
        text = await _recognize_with_trocr_large(image_bytes)
        if text is not None:
            model_used = "trocr-large-handwritten"
    if text is None:
        text = ""
        model_used = "none (all backends failed)"
    elapsed_ms = int((time.monotonic() - t0) * 1000)
    return {
        "text": text,
        "model_used": model_used,
        "processing_time_ms": elapsed_ms,
        "language": language,
        "preprocessed": preprocess,
    }
# ---------------------------------------------------------------------------
# Endpoints
# ---------------------------------------------------------------------------
@router.post("/recognize")
async def recognize_handwriting(
    file: UploadFile = File(...),
    model: str = Query("auto", description="auto | qwen2.5vl | trocr-large"),
    preprocess: bool = Query(True, description="Apply CLAHE + upscale before recognition"),
    language: str = Query("de", description="de | en | de+en"),
):
    """
    Upload an image and get back the handwritten text as plain text.

    Tries qwen2.5vl:32b via Ollama first, falls back to TrOCR-large-handwritten.
    """
    # Validate query parameters up front; reject with 400 on bad values.
    if model not in ("auto", "qwen2.5vl", "trocr-large"):
        raise HTTPException(status_code=400, detail="model must be one of: auto, qwen2.5vl, trocr-large")
    if language not in ("de", "en", "de+en"):
        raise HTTPException(status_code=400, detail="language must be one of: de, en, de+en")
    image_bytes = await file.read()
    if not image_bytes:
        raise HTTPException(status_code=400, detail="Empty file")
    return await _do_recognize(
        image_bytes,
        model=model,
        preprocess=preprocess,
        language=language,
    )
@router.post("/recognize-session")
async def recognize_from_session(req: HTRSessionRequest):
    """
    Use an OCR-Pipeline session as the image source for HTR.

    With use_clean=true the clean image (after the handwriting-removal step)
    is preferred — useful for HTR on isolated handwriting regions. Falls back
    to the deskewed image, then the original upload.
    """
    from ocr_pipeline_session_store import get_session_db, get_session_image
    session = await get_session_db(req.session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {req.session_id} not found")
    # Source priority: clean (optional) → deskewed → original.
    sources = ["deskewed", "original"]
    if req.use_clean:
        sources.insert(0, "clean")
    image_bytes: Optional[bytes] = None
    source_used: str = ""
    for source in sources:
        image_bytes = await get_session_image(req.session_id, source)
        if image_bytes:
            source_used = source
            break
    if not image_bytes:
        raise HTTPException(status_code=404, detail="No image available in session")
    result = await _do_recognize(image_bytes, model=req.model)
    result["session_id"] = req.session_id
    result["source_image"] = source_used
    return result