feat(klausur): Handschrift entfernen + Klausur-HTR implementiert

Feature 1: Handschrift entfernen via OCR-Pipeline Session
- services/handwriting_detection.py: _detect_pencil() + target_ink Parameter ("all" | "colored" | "pencil") für gezielte Tinten-Erkennung
- ocr_pipeline_session_store.py: clean_png + handwriting_removal_meta Spalten (idempotentes ALTER TABLE in init_ocr_pipeline_tables)
- ocr_pipeline_api.py: POST /sessions/{id}/remove-handwriting Endpoint + "clean" zu valid_types für Image-Serving hinzugefügt

Feature 2: Klausur-HTR (Hochwertige Handschriftenerkennung)
- handwriting_htr_api.py: Neuer Router /api/v1/htr/recognize + /recognize-session
  Primary: qwen2.5vl:32b via Ollama, Fallback: trocr-large-handwritten
- services/trocr_service.py: size Parameter (base | large) für get_trocr_model() + run_trocr_ocr() - unterstützt jetzt trocr-large-handwritten
- main.py: HTR Router registriert

Config:
- docker-compose.yml: OLLAMA_HTR_MODEL, HTR_FALLBACK_MODEL
- .env.example: HTR Env-Vars dokumentiert

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-03 12:04:26 +01:00
parent 606bef0591
commit 2e0f8632f8
8 changed files with 529 additions and 56 deletions
@@ -31,8 +31,10 @@ from datetime import datetime, timedelta
 logger = logging.getLogger(__name__)

 # Lazy loading for heavy dependencies
-_trocr_processor = None
-_trocr_model = None
+# Cache keyed by model_name to support base and large variants simultaneously
+_trocr_models: dict = {}  # {model_name: (processor, model)}
+_trocr_processor = None  # backwards-compat alias → base-printed
+_trocr_model = None      # backwards-compat alias → base-printed
 _trocr_available = None
 _model_loaded_at = None

@@ -124,12 +126,14 @@ def _check_trocr_available() -> bool:
    return _trocr_available


-def get_trocr_model(handwritten: bool = False):
+def get_trocr_model(handwritten: bool = False, size: str = "base"):
    """
    Lazy load TrOCR model and processor.

    Args:
        handwritten: Use handwritten model instead of printed model
+        size: Model size — "base" (~334M parameters) or "large" (~558M parameters,
+              higher accuracy for exam HTR). Only applies to handwritten variant.

    Returns tuple of (processor, model) or (None, None) if unavailable.
    """
@@ -138,31 +142,42 @@ def get_trocr_model(handwritten: bool = False):
    if not _check_trocr_available():
        return None, None

-    if _trocr_processor is None or _trocr_model is None:
-        try:
-            import torch
-            from transformers import TrOCRProcessor, VisionEncoderDecoderModel
+    # Select model name
+    if size == "large" and handwritten:
+        model_name = "microsoft/trocr-large-handwritten"
+    elif handwritten:
+        model_name = "microsoft/trocr-base-handwritten"
+    else:
+        model_name = "microsoft/trocr-base-printed"

-            # Choose model based on use case
-            if handwritten:
-                model_name = "microsoft/trocr-base-handwritten"
-            else:
-                model_name = "microsoft/trocr-base-printed"
+    if model_name in _trocr_models:
+        return _trocr_models[model_name]

-            logger.info(f"Loading TrOCR model: {model_name}")
-            _trocr_processor = TrOCRProcessor.from_pretrained(model_name)
-            _trocr_model = VisionEncoderDecoderModel.from_pretrained(model_name)
+    try:
+        import torch
+        from transformers import TrOCRProcessor, VisionEncoderDecoderModel

-            # Use GPU if available
-            device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
-            _trocr_model.to(device)
-            logger.info(f"TrOCR model loaded on device: {device}")
+        logger.info(f"Loading TrOCR model: {model_name}")
+        processor = TrOCRProcessor.from_pretrained(model_name)
+        model = VisionEncoderDecoderModel.from_pretrained(model_name)

-        except Exception as e:
-            logger.error(f"Failed to load TrOCR model: {e}")
-            return None, None
+        # Use GPU if available
+        device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
+        model.to(device)
+        logger.info(f"TrOCR model loaded on device: {device}")

-    return _trocr_processor, _trocr_model
+        _trocr_models[model_name] = (processor, model)
+
+        # Keep backwards-compat globals pointing at base-printed
+        if model_name == "microsoft/trocr-base-printed":
+            _trocr_processor = processor
+            _trocr_model = model
+
+        return processor, model
+
+    except Exception as e:
+        logger.error(f"Failed to load TrOCR model {model_name}: {e}")
+        return None, None


 def preload_trocr_model(handwritten: bool = True) -> bool:
@@ -209,7 +224,8 @@ def get_model_status() -> Dict[str, Any]:
 async def run_trocr_ocr(
    image_data: bytes,
    handwritten: bool = False,
-    split_lines: bool = True
+    split_lines: bool = True,
+    size: str = "base",
 ) -> Tuple[Optional[str], float]:
    """
    Run TrOCR on an image.
@@ -223,11 +239,12 @@ async def run_trocr_ocr(
        image_data: Raw image bytes
        handwritten: Use handwritten model (slower but better for handwriting)
        split_lines: Whether to split image into lines first
+        size: "base" or "large" (only for handwritten variant)

    Returns:
        Tuple of (extracted_text, confidence)
    """
-    processor, model = get_trocr_model(handwritten=handwritten)
+    processor, model = get_trocr_model(handwritten=handwritten, size=size)

    if processor is None or model is None:
        logger.error("TrOCR model not available")