feat(klausur): Handschrift entfernen + Klausur-HTR implementiert
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m49s
CI / test-python-agent-core (push) Successful in 14s
CI / test-nodejs-website (push) Successful in 15s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m49s
CI / test-python-agent-core (push) Successful in 14s
CI / test-nodejs-website (push) Successful in 15s
Feature 1: Handschrift entfernen via OCR-Pipeline Session
- services/handwriting_detection.py: _detect_pencil() + target_ink Parameter
("all" | "colored" | "pencil") für gezielte Tinten-Erkennung
- ocr_pipeline_session_store.py: clean_png + handwriting_removal_meta Spalten
(idempotentes ALTER TABLE in init_ocr_pipeline_tables)
- ocr_pipeline_api.py: POST /sessions/{id}/remove-handwriting Endpoint
+ "clean" zu valid_types für Image-Serving hinzugefügt
Feature 2: Klausur-HTR (Hochwertige Handschriftenerkennung)
- handwriting_htr_api.py: Neuer Router /api/v1/htr/recognize + /recognize-session
Primary: qwen2.5vl:32b via Ollama, Fallback: trocr-large-handwritten
- services/trocr_service.py: size Parameter (base | large) für get_trocr_model()
+ run_trocr_ocr() - unterstützt jetzt trocr-large-handwritten
- main.py: HTR Router registriert
Config:
- docker-compose.yml: OLLAMA_HTR_MODEL, HTR_FALLBACK_MODEL
- .env.example: HTR Env-Vars dokumentiert
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
276
klausur-service/backend/handwriting_htr_api.py
Normal file
276
klausur-service/backend/handwriting_htr_api.py
Normal file
@@ -0,0 +1,276 @@
|
||||
"""
|
||||
Handwriting HTR API - Hochwertige Handschriftenerkennung (HTR) fuer Klausurkorrekturen.
|
||||
|
||||
Endpoints:
|
||||
- POST /api/v1/htr/recognize - Bild hochladen → handgeschriebener Text
|
||||
- POST /api/v1/htr/recognize-session - OCR-Pipeline Session als Quelle nutzen
|
||||
|
||||
Modell-Strategie:
|
||||
1. qwen2.5vl:32b via Ollama (primaer, hoechste Qualitaet als VLM)
|
||||
2. microsoft/trocr-large-handwritten (Fallback, offline, kein Ollama)
|
||||
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal auf dem Mac Mini.
|
||||
"""
|
||||
|
||||
import io
|
||||
import os
|
||||
import logging
|
||||
import time
|
||||
import base64
|
||||
from typing import Optional
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
from fastapi import APIRouter, HTTPException, Query, UploadFile, File
|
||||
from pydantic import BaseModel
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
router = APIRouter(prefix="/api/v1/htr", tags=["HTR"])
|
||||
|
||||
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434")
|
||||
OLLAMA_HTR_MODEL = os.getenv("OLLAMA_HTR_MODEL", "qwen2.5vl:32b")
|
||||
HTR_FALLBACK_MODEL = os.getenv("HTR_FALLBACK_MODEL", "trocr-large")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Pydantic Models
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class HTRSessionRequest(BaseModel):
|
||||
session_id: str
|
||||
model: str = "auto" # "auto" | "qwen2.5vl" | "trocr-large"
|
||||
use_clean: bool = True # Prefer clean_png (after handwriting removal)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Preprocessing
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _preprocess_for_htr(img_bgr: np.ndarray) -> np.ndarray:
|
||||
"""
|
||||
CLAHE contrast enhancement + upscale to improve HTR accuracy.
|
||||
Returns grayscale enhanced image.
|
||||
"""
|
||||
gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
|
||||
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
|
||||
enhanced = clahe.apply(gray)
|
||||
|
||||
# Upscale if image is too small
|
||||
h, w = enhanced.shape
|
||||
if min(h, w) < 800:
|
||||
scale = 800 / min(h, w)
|
||||
enhanced = cv2.resize(
|
||||
enhanced, None, fx=scale, fy=scale,
|
||||
interpolation=cv2.INTER_CUBIC
|
||||
)
|
||||
|
||||
return enhanced
|
||||
|
||||
|
||||
def _bgr_to_png_bytes(img_bgr: np.ndarray) -> bytes:
|
||||
"""Convert BGR ndarray to PNG bytes."""
|
||||
success, buf = cv2.imencode(".png", img_bgr)
|
||||
if not success:
|
||||
raise RuntimeError("Failed to encode image to PNG")
|
||||
return buf.tobytes()
|
||||
|
||||
|
||||
def _preprocess_image_bytes(image_bytes: bytes) -> bytes:
|
||||
"""Load image, apply HTR preprocessing, return PNG bytes."""
|
||||
arr = np.frombuffer(image_bytes, dtype=np.uint8)
|
||||
img_bgr = cv2.imdecode(arr, cv2.IMREAD_COLOR)
|
||||
if img_bgr is None:
|
||||
raise ValueError("Could not decode image")
|
||||
|
||||
enhanced = _preprocess_for_htr(img_bgr)
|
||||
# Convert grayscale back to BGR for encoding
|
||||
enhanced_bgr = cv2.cvtColor(enhanced, cv2.COLOR_GRAY2BGR)
|
||||
return _bgr_to_png_bytes(enhanced_bgr)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Backend: Ollama qwen2.5vl
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
async def _recognize_with_qwen_vl(image_bytes: bytes, language: str) -> Optional[str]:
|
||||
"""
|
||||
Send image to Ollama qwen2.5vl:32b for HTR.
|
||||
Returns extracted text or None on error.
|
||||
"""
|
||||
import httpx
|
||||
|
||||
lang_hint = {
|
||||
"de": "Deutsch",
|
||||
"en": "Englisch",
|
||||
"de+en": "Deutsch und Englisch",
|
||||
}.get(language, "Deutsch")
|
||||
|
||||
prompt = (
|
||||
f"Du bist ein OCR-Experte fuer handgeschriebenen Text auf {lang_hint}. "
|
||||
"Lies den Text im Bild exakt ab — korrigiere KEINE Rechtschreibfehler. "
|
||||
"Antworte NUR mit dem erkannten Text, ohne Erklaerungen."
|
||||
)
|
||||
|
||||
img_b64 = base64.b64encode(image_bytes).decode("utf-8")
|
||||
|
||||
payload = {
|
||||
"model": OLLAMA_HTR_MODEL,
|
||||
"prompt": prompt,
|
||||
"images": [img_b64],
|
||||
"stream": False,
|
||||
}
|
||||
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=120.0) as client:
|
||||
resp = await client.post(f"{OLLAMA_BASE_URL}/api/generate", json=payload)
|
||||
resp.raise_for_status()
|
||||
data = resp.json()
|
||||
return data.get("response", "").strip()
|
||||
except Exception as e:
|
||||
logger.warning(f"Ollama qwen2.5vl HTR failed: {e}")
|
||||
return None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Backend: TrOCR-large fallback
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
async def _recognize_with_trocr_large(image_bytes: bytes) -> Optional[str]:
|
||||
"""
|
||||
Use microsoft/trocr-large-handwritten via trocr_service.py.
|
||||
Returns extracted text or None on error.
|
||||
"""
|
||||
try:
|
||||
from services.trocr_service import run_trocr_ocr, _check_trocr_available
|
||||
if not _check_trocr_available():
|
||||
logger.warning("TrOCR not available for HTR fallback")
|
||||
return None
|
||||
|
||||
text, confidence = await run_trocr_ocr(image_bytes, handwritten=True, size="large")
|
||||
return text.strip() if text else None
|
||||
except Exception as e:
|
||||
logger.warning(f"TrOCR-large HTR failed: {e}")
|
||||
return None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Core recognition logic
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
async def _do_recognize(
|
||||
image_bytes: bytes,
|
||||
model: str = "auto",
|
||||
preprocess: bool = True,
|
||||
language: str = "de",
|
||||
) -> dict:
|
||||
"""
|
||||
Core HTR logic: preprocess → try Ollama → fallback to TrOCR-large.
|
||||
Returns dict with text, model_used, processing_time_ms.
|
||||
"""
|
||||
t0 = time.monotonic()
|
||||
|
||||
if preprocess:
|
||||
try:
|
||||
image_bytes = _preprocess_image_bytes(image_bytes)
|
||||
except Exception as e:
|
||||
logger.warning(f"HTR preprocessing failed, using raw image: {e}")
|
||||
|
||||
text: Optional[str] = None
|
||||
model_used: str = "none"
|
||||
|
||||
use_qwen = model in ("auto", "qwen2.5vl")
|
||||
use_trocr = model in ("auto", "trocr-large") or (use_qwen and text is None)
|
||||
|
||||
if use_qwen:
|
||||
text = await _recognize_with_qwen_vl(image_bytes, language)
|
||||
if text is not None:
|
||||
model_used = f"qwen2.5vl ({OLLAMA_HTR_MODEL})"
|
||||
|
||||
if text is None and (use_trocr or model == "trocr-large"):
|
||||
text = await _recognize_with_trocr_large(image_bytes)
|
||||
if text is not None:
|
||||
model_used = "trocr-large-handwritten"
|
||||
|
||||
if text is None:
|
||||
text = ""
|
||||
model_used = "none (all backends failed)"
|
||||
|
||||
elapsed_ms = int((time.monotonic() - t0) * 1000)
|
||||
|
||||
return {
|
||||
"text": text,
|
||||
"model_used": model_used,
|
||||
"processing_time_ms": elapsed_ms,
|
||||
"language": language,
|
||||
"preprocessed": preprocess,
|
||||
}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Endpoints
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@router.post("/recognize")
|
||||
async def recognize_handwriting(
|
||||
file: UploadFile = File(...),
|
||||
model: str = Query("auto", description="auto | qwen2.5vl | trocr-large"),
|
||||
preprocess: bool = Query(True, description="Apply CLAHE + upscale before recognition"),
|
||||
language: str = Query("de", description="de | en | de+en"),
|
||||
):
|
||||
"""
|
||||
Upload an image and get back the handwritten text as plain text.
|
||||
|
||||
Tries qwen2.5vl:32b via Ollama first, falls back to TrOCR-large-handwritten.
|
||||
"""
|
||||
if model not in ("auto", "qwen2.5vl", "trocr-large"):
|
||||
raise HTTPException(status_code=400, detail="model must be one of: auto, qwen2.5vl, trocr-large")
|
||||
if language not in ("de", "en", "de+en"):
|
||||
raise HTTPException(status_code=400, detail="language must be one of: de, en, de+en")
|
||||
|
||||
image_bytes = await file.read()
|
||||
if not image_bytes:
|
||||
raise HTTPException(status_code=400, detail="Empty file")
|
||||
|
||||
return await _do_recognize(image_bytes, model=model, preprocess=preprocess, language=language)
|
||||
|
||||
|
||||
@router.post("/recognize-session")
|
||||
async def recognize_from_session(req: HTRSessionRequest):
|
||||
"""
|
||||
Use an OCR-Pipeline session as image source for HTR.
|
||||
|
||||
Set use_clean=true to prefer the clean image (after handwriting removal step).
|
||||
This is useful when you want to do HTR on isolated handwriting regions.
|
||||
"""
|
||||
from ocr_pipeline_session_store import get_session_db, get_session_image
|
||||
|
||||
session = await get_session_db(req.session_id)
|
||||
if not session:
|
||||
raise HTTPException(status_code=404, detail=f"Session {req.session_id} not found")
|
||||
|
||||
# Choose source image
|
||||
image_bytes: Optional[bytes] = None
|
||||
source_used: str = ""
|
||||
|
||||
if req.use_clean:
|
||||
image_bytes = await get_session_image(req.session_id, "clean")
|
||||
if image_bytes:
|
||||
source_used = "clean"
|
||||
|
||||
if not image_bytes:
|
||||
image_bytes = await get_session_image(req.session_id, "deskewed")
|
||||
if image_bytes:
|
||||
source_used = "deskewed"
|
||||
|
||||
if not image_bytes:
|
||||
image_bytes = await get_session_image(req.session_id, "original")
|
||||
source_used = "original"
|
||||
|
||||
if not image_bytes:
|
||||
raise HTTPException(status_code=404, detail="No image available in session")
|
||||
|
||||
result = await _do_recognize(image_bytes, model=req.model)
|
||||
result["session_id"] = req.session_id
|
||||
result["source_image"] = source_used
|
||||
return result
|
||||
Reference in New Issue
Block a user