""" Handwriting HTR API - Hochwertige Handschriftenerkennung (HTR) fuer Klausurkorrekturen. Endpoints: - POST /api/v1/htr/recognize - Bild hochladen → handgeschriebener Text - POST /api/v1/htr/recognize-session - OCR-Pipeline Session als Quelle nutzen Modell-Strategie: 1. qwen2.5vl:32b via Ollama (primaer, hoechste Qualitaet als VLM) 2. microsoft/trocr-large-handwritten (Fallback, offline, kein Ollama) DATENSCHUTZ: Alle Verarbeitung erfolgt lokal auf dem Mac Mini. """ import io import os import logging import time import base64 from typing import Optional import cv2 import numpy as np from fastapi import APIRouter, HTTPException, Query, UploadFile, File from pydantic import BaseModel logger = logging.getLogger(__name__) router = APIRouter(prefix="/api/v1/htr", tags=["HTR"]) OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434") OLLAMA_HTR_MODEL = os.getenv("OLLAMA_HTR_MODEL", "qwen2.5vl:32b") HTR_FALLBACK_MODEL = os.getenv("HTR_FALLBACK_MODEL", "trocr-large") # --------------------------------------------------------------------------- # Pydantic Models # --------------------------------------------------------------------------- class HTRSessionRequest(BaseModel): session_id: str model: str = "auto" # "auto" | "qwen2.5vl" | "trocr-large" use_clean: bool = True # Prefer clean_png (after handwriting removal) # --------------------------------------------------------------------------- # Preprocessing # --------------------------------------------------------------------------- def _preprocess_for_htr(img_bgr: np.ndarray) -> np.ndarray: """ CLAHE contrast enhancement + upscale to improve HTR accuracy. Returns grayscale enhanced image. """ gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY) clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)) enhanced = clahe.apply(gray) # Upscale if image is too small h, w = enhanced.shape if min(h, w) < 800: scale = 800 / min(h, w) enhanced = cv2.resize( enhanced, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC ) return enhanced def _bgr_to_png_bytes(img_bgr: np.ndarray) -> bytes: """Convert BGR ndarray to PNG bytes.""" success, buf = cv2.imencode(".png", img_bgr) if not success: raise RuntimeError("Failed to encode image to PNG") return buf.tobytes() def _preprocess_image_bytes(image_bytes: bytes) -> bytes: """Load image, apply HTR preprocessing, return PNG bytes.""" arr = np.frombuffer(image_bytes, dtype=np.uint8) img_bgr = cv2.imdecode(arr, cv2.IMREAD_COLOR) if img_bgr is None: raise ValueError("Could not decode image") enhanced = _preprocess_for_htr(img_bgr) # Convert grayscale back to BGR for encoding enhanced_bgr = cv2.cvtColor(enhanced, cv2.COLOR_GRAY2BGR) return _bgr_to_png_bytes(enhanced_bgr) # --------------------------------------------------------------------------- # Backend: Ollama qwen2.5vl # --------------------------------------------------------------------------- async def _recognize_with_qwen_vl(image_bytes: bytes, language: str) -> Optional[str]: """ Send image to Ollama qwen2.5vl:32b for HTR. Returns extracted text or None on error. """ import httpx lang_hint = { "de": "Deutsch", "en": "Englisch", "de+en": "Deutsch und Englisch", }.get(language, "Deutsch") prompt = ( f"Du bist ein OCR-Experte fuer handgeschriebenen Text auf {lang_hint}. " "Lies den Text im Bild exakt ab — korrigiere KEINE Rechtschreibfehler. " "Antworte NUR mit dem erkannten Text, ohne Erklaerungen." ) img_b64 = base64.b64encode(image_bytes).decode("utf-8") payload = { "model": OLLAMA_HTR_MODEL, "prompt": prompt, "images": [img_b64], "stream": False, } try: async with httpx.AsyncClient(timeout=120.0) as client: resp = await client.post(f"{OLLAMA_BASE_URL}/api/generate", json=payload) resp.raise_for_status() data = resp.json() return data.get("response", "").strip() except Exception as e: logger.warning(f"Ollama qwen2.5vl HTR failed: {e}") return None # --------------------------------------------------------------------------- # Backend: TrOCR-large fallback # --------------------------------------------------------------------------- async def _recognize_with_trocr_large(image_bytes: bytes) -> Optional[str]: """ Use microsoft/trocr-large-handwritten via trocr_service.py. Returns extracted text or None on error. """ try: from services.trocr_service import run_trocr_ocr, _check_trocr_available if not _check_trocr_available(): logger.warning("TrOCR not available for HTR fallback") return None text, confidence = await run_trocr_ocr(image_bytes, handwritten=True, size="large") return text.strip() if text else None except Exception as e: logger.warning(f"TrOCR-large HTR failed: {e}") return None # --------------------------------------------------------------------------- # Core recognition logic # --------------------------------------------------------------------------- async def _do_recognize( image_bytes: bytes, model: str = "auto", preprocess: bool = True, language: str = "de", ) -> dict: """ Core HTR logic: preprocess → try Ollama → fallback to TrOCR-large. Returns dict with text, model_used, processing_time_ms. """ t0 = time.monotonic() if preprocess: try: image_bytes = _preprocess_image_bytes(image_bytes) except Exception as e: logger.warning(f"HTR preprocessing failed, using raw image: {e}") text: Optional[str] = None model_used: str = "none" use_qwen = model in ("auto", "qwen2.5vl") use_trocr = model in ("auto", "trocr-large") or (use_qwen and text is None) if use_qwen: text = await _recognize_with_qwen_vl(image_bytes, language) if text is not None: model_used = f"qwen2.5vl ({OLLAMA_HTR_MODEL})" if text is None and (use_trocr or model == "trocr-large"): text = await _recognize_with_trocr_large(image_bytes) if text is not None: model_used = "trocr-large-handwritten" if text is None: text = "" model_used = "none (all backends failed)" elapsed_ms = int((time.monotonic() - t0) * 1000) return { "text": text, "model_used": model_used, "processing_time_ms": elapsed_ms, "language": language, "preprocessed": preprocess, } # --------------------------------------------------------------------------- # Endpoints # --------------------------------------------------------------------------- @router.post("/recognize") async def recognize_handwriting( file: UploadFile = File(...), model: str = Query("auto", description="auto | qwen2.5vl | trocr-large"), preprocess: bool = Query(True, description="Apply CLAHE + upscale before recognition"), language: str = Query("de", description="de | en | de+en"), ): """ Upload an image and get back the handwritten text as plain text. Tries qwen2.5vl:32b via Ollama first, falls back to TrOCR-large-handwritten. """ if model not in ("auto", "qwen2.5vl", "trocr-large"): raise HTTPException(status_code=400, detail="model must be one of: auto, qwen2.5vl, trocr-large") if language not in ("de", "en", "de+en"): raise HTTPException(status_code=400, detail="language must be one of: de, en, de+en") image_bytes = await file.read() if not image_bytes: raise HTTPException(status_code=400, detail="Empty file") return await _do_recognize(image_bytes, model=model, preprocess=preprocess, language=language) @router.post("/recognize-session") async def recognize_from_session(req: HTRSessionRequest): """ Use an OCR-Pipeline session as image source for HTR. Set use_clean=true to prefer the clean image (after handwriting removal step). This is useful when you want to do HTR on isolated handwriting regions. """ from ocr_pipeline_session_store import get_session_db, get_session_image session = await get_session_db(req.session_id) if not session: raise HTTPException(status_code=404, detail=f"Session {req.session_id} not found") # Choose source image image_bytes: Optional[bytes] = None source_used: str = "" if req.use_clean: image_bytes = await get_session_image(req.session_id, "clean") if image_bytes: source_used = "clean" if not image_bytes: image_bytes = await get_session_image(req.session_id, "deskewed") if image_bytes: source_used = "deskewed" if not image_bytes: image_bytes = await get_session_image(req.session_id, "original") source_used = "original" if not image_bytes: raise HTTPException(status_code=404, detail="No image available in session") result = await _do_recognize(image_bytes, model=req.model) result["session_id"] = req.session_id result["source_image"] = source_used return result