feat(klausur): Handschrift entfernen + Klausur-HTR implementiert

Feature 1: Handschrift entfernen via OCR-Pipeline Session
- services/handwriting_detection.py: _detect_pencil() + target_ink Parameter ("all" | "colored" | "pencil") für gezielte Tinten-Erkennung
- ocr_pipeline_session_store.py: clean_png + handwriting_removal_meta Spalten (idempotentes ALTER TABLE in init_ocr_pipeline_tables)
- ocr_pipeline_api.py: POST /sessions/{id}/remove-handwriting Endpoint + "clean" zu valid_types für Image-Serving hinzugefügt

Feature 2: Klausur-HTR (Hochwertige Handschriftenerkennung)
- handwriting_htr_api.py: Neuer Router /api/v1/htr/recognize + /recognize-session
  Primary: qwen2.5vl:32b via Ollama, Fallback: trocr-large-handwritten
- services/trocr_service.py: size Parameter (base | large) für get_trocr_model() + run_trocr_ocr() - unterstützt jetzt trocr-large-handwritten
- main.py: HTR Router registriert

Config:
- docker-compose.yml: OLLAMA_HTR_MODEL, HTR_FALLBACK_MODEL
- .env.example: HTR Env-Vars dokumentiert

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-03 12:04:26 +01:00
parent 606bef0591
commit 2e0f8632f8
8 changed files with 529 additions and 56 deletions
--- a/klausur-service/backend/handwriting_htr_api.py
+++ b/klausur-service/backend/handwriting_htr_api.py
@@ -0,0 +1,276 @@
+"""
+Handwriting HTR API - Hochwertige Handschriftenerkennung (HTR) fuer Klausurkorrekturen.
+
+Endpoints:
+- POST /api/v1/htr/recognize          - Bild hochladen → handgeschriebener Text
+- POST /api/v1/htr/recognize-session  - OCR-Pipeline Session als Quelle nutzen
+
+Modell-Strategie:
+  1. qwen2.5vl:32b via Ollama (primaer, hoechste Qualitaet als VLM)
+  2. microsoft/trocr-large-handwritten (Fallback, offline, kein Ollama)
+
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal auf dem Mac Mini.
+"""
+
+import io
+import os
+import logging
+import time
+import base64
+from typing import Optional
+
+import cv2
+import numpy as np
+from fastapi import APIRouter, HTTPException, Query, UploadFile, File
+from pydantic import BaseModel
+
+# Module-level logger; the recognition backends log their failures as warnings.
+logger = logging.getLogger(__name__)
+
+# All HTR endpoints defined in this module are mounted under /api/v1/htr.
+router = APIRouter(prefix="/api/v1/htr", tags=["HTR"])
+
+# Base URL of the Ollama server; "/api/generate" is appended for each request.
+OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434")
+# Ollama model tag sent in the request payload and echoed in "model_used".
+OLLAMA_HTR_MODEL = os.getenv("OLLAMA_HTR_MODEL", "qwen2.5vl:32b")
+# NOTE(review): read from the environment but not referenced anywhere else in
+# this module; the TrOCR fallback always requests size="large".
+HTR_FALLBACK_MODEL = os.getenv("HTR_FALLBACK_MODEL", "trocr-large")
+
+
+# ---------------------------------------------------------------------------
+# Pydantic Models
+# ---------------------------------------------------------------------------
+
+class HTRSessionRequest(BaseModel):
+    # Request body of POST /recognize-session.
+    # Identifier of the OCR-pipeline session whose stored image is recognized.
+    session_id: str
+    # Backend selector. The field is a plain str, so any string is accepted
+    # at the schema level; the values listed below are the ones the
+    # recognition logic acts on.
+    model: str = "auto"       # "auto" | "qwen2.5vl" | "trocr-large"
+    use_clean: bool = True    # Prefer clean_png (after handwriting removal)
+
+
+# ---------------------------------------------------------------------------
+# Preprocessing
+# ---------------------------------------------------------------------------
+
+def _preprocess_for_htr(
+    img_bgr: np.ndarray,
+    min_side: int = 800,
+    clip_limit: float = 2.0,
+    tile_grid_size: tuple = (8, 8),
+) -> np.ndarray:
+    """
+    CLAHE contrast enhancement + upscale to improve HTR accuracy.
+
+    Args:
+        img_bgr: Input image. A 3-channel BGR array is the normal case;
+            2-D grayscale, single-channel (h, w, 1) and 4-channel BGRA
+            arrays are accepted as well.
+        min_side: Images whose shorter side is below this many pixels are
+            upscaled (aspect ratio preserved) until it reaches this size.
+        clip_limit: CLAHE clip limit.
+        tile_grid_size: CLAHE tile grid size as (columns, rows).
+
+    Returns:
+        Single-channel (grayscale) enhanced image.
+
+    Raises:
+        ValueError: If the input image is None or has no pixels.
+    """
+    if img_bgr is None or img_bgr.size == 0:
+        raise ValueError("Cannot preprocess an empty image")
+
+    # Reduce to a single channel, whatever channel layout the caller supplied.
+    if img_bgr.ndim == 2:
+        gray = img_bgr
+    elif img_bgr.ndim == 3 and img_bgr.shape[2] == 1:
+        # Slicing yields a strided view; hand OpenCV a contiguous copy.
+        gray = np.ascontiguousarray(img_bgr[:, :, 0])
+    elif img_bgr.ndim == 3 and img_bgr.shape[2] == 4:
+        gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGRA2GRAY)
+    else:
+        gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
+
+    clahe = cv2.createCLAHE(clipLimit=clip_limit, tileGridSize=tile_grid_size)
+    enhanced = clahe.apply(gray)
+
+    # Upscale if image is too small
+    h, w = enhanced.shape
+    shorter_side = min(h, w)
+    if shorter_side < min_side:
+        scale = min_side / shorter_side
+        enhanced = cv2.resize(
+            enhanced, None, fx=scale, fy=scale,
+            interpolation=cv2.INTER_CUBIC
+        )
+
+    return enhanced
+
+
+def _bgr_to_png_bytes(img_bgr: np.ndarray) -> bytes:
+    """
+    Encode an image array as PNG and return the encoded bytes.
+
+    Raises:
+        RuntimeError: If OpenCV reports that encoding did not succeed.
+    """
+    ok, encoded = cv2.imencode(".png", img_bgr)
+    if ok:
+        return encoded.tobytes()
+    raise RuntimeError("Failed to encode image to PNG")
+
+
+def _preprocess_image_bytes(image_bytes: bytes) -> bytes:
+    """
+    Decode an encoded image, run the HTR preprocessing on it and re-encode
+    the result as PNG.
+
+    Raises:
+        ValueError: If OpenCV cannot decode the given bytes as an image.
+    """
+    raw = np.frombuffer(image_bytes, dtype=np.uint8)
+    decoded = cv2.imdecode(raw, cv2.IMREAD_COLOR)
+    if decoded is None:
+        raise ValueError("Could not decode image")
+
+    # The preprocessing step returns a grayscale image; expand it back to
+    # three channels before handing it to the PNG encoder.
+    gray = _preprocess_for_htr(decoded)
+    return _bgr_to_png_bytes(cv2.cvtColor(gray, cv2.COLOR_GRAY2BGR))
+
+
+# ---------------------------------------------------------------------------
+# Backend: Ollama qwen2.5vl
+# ---------------------------------------------------------------------------
+
+async def _recognize_with_qwen_vl(image_bytes: bytes, language: str) -> Optional[str]:
+    """
+    Ask the configured Ollama vision model to transcribe the handwriting.
+
+    The image is sent base64-encoded to the Ollama "/api/generate" endpoint
+    together with a German prompt that names the expected language.
+
+    Returns:
+        The stripped "response" field of the JSON reply (an empty string when
+        the field is absent), or None when the request or its decoding raises.
+    """
+    import httpx
+
+    language_names = {
+        "de": "Deutsch",
+        "en": "Englisch",
+        "de+en": "Deutsch und Englisch",
+    }
+    # Unknown language codes fall back to German.
+    lang_hint = language_names.get(language, "Deutsch")
+
+    prompt = (
+        f"Du bist ein OCR-Experte fuer handgeschriebenen Text auf {lang_hint}. "
+        "Lies den Text im Bild exakt ab — korrigiere KEINE Rechtschreibfehler. "
+        "Antworte NUR mit dem erkannten Text, ohne Erklaerungen."
+    )
+
+    request_body = {
+        "model": OLLAMA_HTR_MODEL,
+        "prompt": prompt,
+        "images": [base64.b64encode(image_bytes).decode("utf-8")],
+        "stream": False,
+    }
+
+    try:
+        async with httpx.AsyncClient(timeout=120.0) as client:
+            reply = await client.post(f"{OLLAMA_BASE_URL}/api/generate", json=request_body)
+            reply.raise_for_status()
+            return reply.json().get("response", "").strip()
+    except Exception as e:
+        # Best effort: any failure is logged and reported to the caller as None.
+        logger.warning(f"Ollama qwen2.5vl HTR failed: {e}")
+        return None
+
+
+# ---------------------------------------------------------------------------
+# Backend: TrOCR-large fallback
+# ---------------------------------------------------------------------------
+
+async def _recognize_with_trocr_large(image_bytes: bytes) -> Optional[str]:
+    """
+    Use microsoft/trocr-large-handwritten via trocr_service.py.
+
+    Returns:
+        The stripped recognized text, or None when TrOCR is unavailable, when
+        it raises, or when it produces no text (empty or whitespace-only).
+    """
+    try:
+        from services.trocr_service import run_trocr_ocr, _check_trocr_available
+        if not _check_trocr_available():
+            logger.warning("TrOCR not available for HTR fallback")
+            return None
+
+        # run_trocr_ocr yields a (text, confidence) pair; the confidence is
+        # not part of the HTR response and is discarded.
+        text, _confidence = await run_trocr_ocr(image_bytes, handwritten=True, size="large")
+
+        # Whitespace-only output is treated like empty output, so the caller
+        # does not report it as a successful recognition.
+        cleaned = text.strip() if text else ""
+        return cleaned or None
+    except Exception as e:
+        # Best effort: any failure is logged and reported to the caller as None.
+        logger.warning(f"TrOCR-large HTR failed: {e}")
+        return None
+
+
+# ---------------------------------------------------------------------------
+# Core recognition logic
+# ---------------------------------------------------------------------------
+
+async def _do_recognize(
+    image_bytes: bytes,
+    model: str = "auto",
+    preprocess: bool = True,
+    language: str = "de",
+) -> dict:
+    """
+    Core HTR logic: preprocess → try Ollama → fallback to TrOCR-large.
+
+    Backend selection by ``model``:
+      - "auto" / "qwen2.5vl": Ollama first, TrOCR-large if Ollama yields None.
+      - "trocr-large": TrOCR-large only.
+      - any other value: no backend is tried.
+
+    Args:
+        image_bytes: Encoded image to recognize.
+        model: Backend selector, see above.
+        preprocess: Whether to attempt CLAHE + upscale before recognition.
+            A preprocessing failure is logged and the raw image is used.
+        language: Language code passed to the Ollama backend.
+
+    Returns:
+        Dict with the keys "text", "model_used", "processing_time_ms",
+        "language" and "preprocessed". "preprocessed" is True only when
+        preprocessing was requested and actually succeeded.
+    """
+    t0 = time.monotonic()
+
+    preprocessed = False
+    if preprocess:
+        try:
+            image_bytes = _preprocess_image_bytes(image_bytes)
+            preprocessed = True
+        except Exception as e:
+            # Best effort: recognition continues on the unmodified image.
+            logger.warning(f"HTR preprocessing failed, using raw image: {e}")
+
+    use_qwen = model in ("auto", "qwen2.5vl")
+    # TrOCR runs when explicitly requested and as the fallback of the Ollama path.
+    use_trocr = use_qwen or model == "trocr-large"
+
+    text: Optional[str] = None
+    model_used: str = "none"
+
+    if use_qwen:
+        text = await _recognize_with_qwen_vl(image_bytes, language)
+        if text is not None:
+            model_used = f"qwen2.5vl ({OLLAMA_HTR_MODEL})"
+
+    if text is None and use_trocr:
+        text = await _recognize_with_trocr_large(image_bytes)
+        if text is not None:
+            model_used = "trocr-large-handwritten"
+
+    if text is None:
+        text = ""
+        model_used = "none (all backends failed)"
+
+    elapsed_ms = int((time.monotonic() - t0) * 1000)
+
+    return {
+        "text": text,
+        "model_used": model_used,
+        "processing_time_ms": elapsed_ms,
+        "language": language,
+        "preprocessed": preprocessed,
+    }
+
+
+# ---------------------------------------------------------------------------
+# Endpoints
+# ---------------------------------------------------------------------------
+
+@router.post("/recognize")
+async def recognize_handwriting(
+    file: UploadFile = File(...),
+    model: str = Query("auto", description="auto | qwen2.5vl | trocr-large"),
+    preprocess: bool = Query(True, description="Apply CLAHE + upscale before recognition"),
+    language: str = Query("de", description="de | en | de+en"),
+):
+    """
+    Recognize the handwritten text in an uploaded image.
+
+    The Ollama vision model is tried first; TrOCR-large-handwritten serves as
+    the fallback. Responds with HTTP 400 for an unknown model, an unknown
+    language or an empty upload.
+    """
+    allowed_models = ("auto", "qwen2.5vl", "trocr-large")
+    allowed_languages = ("de", "en", "de+en")
+
+    if model not in allowed_models:
+        raise HTTPException(status_code=400, detail="model must be one of: auto, qwen2.5vl, trocr-large")
+    if language not in allowed_languages:
+        raise HTTPException(status_code=400, detail="language must be one of: de, en, de+en")
+
+    uploaded = await file.read()
+    if not uploaded:
+        raise HTTPException(status_code=400, detail="Empty file")
+
+    result = await _do_recognize(uploaded, model=model, preprocess=preprocess, language=language)
+    return result
+
+
+@router.post("/recognize-session")
+async def recognize_from_session(req: HTRSessionRequest):
+    """
+    Use an OCR-Pipeline session as image source for HTR.
+
+    The session images are tried in the order "clean" (only when
+    use_clean is true), "deskewed", "original"; the first one that exists
+    is recognized with default preprocessing and language settings.
+
+    Returns the recognition result extended by "session_id" and
+    "source_image" (the image kind that was used).
+
+    Responds with HTTP 400 for an unknown model and with HTTP 404 when the
+    session does not exist or holds none of the candidate images.
+    """
+    from ocr_pipeline_session_store import get_session_db, get_session_image
+
+    # Same check as POST /recognize: an unknown model would otherwise skip
+    # every backend and come back as a 200 with "all backends failed".
+    if req.model not in ("auto", "qwen2.5vl", "trocr-large"):
+        raise HTTPException(status_code=400, detail="model must be one of: auto, qwen2.5vl, trocr-large")
+
+    session = await get_session_db(req.session_id)
+    if not session:
+        raise HTTPException(status_code=404, detail=f"Session {req.session_id} not found")
+
+    # Candidate source images in order of preference.
+    candidates = ["deskewed", "original"]
+    if req.use_clean:
+        candidates.insert(0, "clean")
+
+    image_bytes: Optional[bytes] = None
+    source_used: str = ""
+    for image_kind in candidates:
+        image_bytes = await get_session_image(req.session_id, image_kind)
+        if image_bytes:
+            source_used = image_kind
+            break
+
+    if not image_bytes:
+        raise HTTPException(status_code=404, detail="No image available in session")
+
+    result = await _do_recognize(image_bytes, model=req.model)
+    result["session_id"] = req.session_id
+    result["source_image"] = source_used
+    return result