feat(klausur): Handschrift entfernen + Klausur-HTR implementiert
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m49s
CI / test-python-agent-core (push) Successful in 14s
CI / test-nodejs-website (push) Successful in 15s
Feature 1: Handschrift entfernen via OCR-Pipeline Session
- services/handwriting_detection.py: _detect_pencil() + target_ink Parameter
("all" | "colored" | "pencil") für gezielte Tinten-Erkennung
- ocr_pipeline_session_store.py: clean_png + handwriting_removal_meta Spalten
(idempotentes ALTER TABLE in init_ocr_pipeline_tables)
- ocr_pipeline_api.py: POST /sessions/{id}/remove-handwriting Endpoint
+ "clean" zu valid_types für Image-Serving hinzugefügt
Feature 2: Klausur-HTR (Hochwertige Handschriftenerkennung)
- handwriting_htr_api.py: Neuer Router /api/v1/htr/recognize + /recognize-session
Primary: qwen2.5vl:32b via Ollama, Fallback: trocr-large-handwritten
- services/trocr_service.py: size Parameter (base | large) für get_trocr_model()
+ run_trocr_ocr() - unterstützt jetzt trocr-large-handwritten
- main.py: HTR Router registriert
Config:
- docker-compose.yml: OLLAMA_HTR_MODEL, HTR_FALLBACK_MODEL
- .env.example: HTR Env-Vars dokumentiert
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -6,6 +6,7 @@ Uses multiple detection methods:
1. Color-based detection (blue/red ink)
2. Stroke analysis (thin irregular strokes)
3. Edge density variance
4. Pencil detection (gray ink)

DATENSCHUTZ: All processing happens locally on Mac Mini.
"""
@@ -37,12 +38,16 @@ class DetectionResult:
detection_method: str # Which method was primarily used


def detect_handwriting(image_bytes: bytes) -> DetectionResult:
def detect_handwriting(image_bytes: bytes, target_ink: str = "all") -> DetectionResult:
"""
Detect handwriting in an image.

Args:
image_bytes: Image as bytes (PNG, JPG, etc.)
target_ink: Which ink types to detect:
- "all" → all methods combined (incl. pencil)
- "colored" → only color-based (blue/red/green pen)
- "pencil" → only pencil (gray ink)

Returns:
DetectionResult with binary mask where handwriting is white (255)
@@ -62,35 +67,51 @@ def detect_handwriting(image_bytes: bytes) -> DetectionResult:

# Convert to BGR if needed (OpenCV format)
if len(img_array.shape) == 2:
# Grayscale to BGR
img_bgr = cv2.cvtColor(img_array, cv2.COLOR_GRAY2BGR)
elif img_array.shape[2] == 4:
# RGBA to BGR
img_bgr = cv2.cvtColor(img_array, cv2.COLOR_RGBA2BGR)
elif img_array.shape[2] == 3:
# RGB to BGR
img_bgr = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)
else:
img_bgr = img_array

# Run multiple detection methods
color_mask, color_confidence = _detect_by_color(img_bgr)
stroke_mask, stroke_confidence = _detect_by_stroke_analysis(img_bgr)
variance_mask, variance_confidence = _detect_by_variance(img_bgr)
# Select detection methods based on target_ink
masks_and_weights = []

if target_ink in ("all", "colored"):
color_mask, color_conf = _detect_by_color(img_bgr)
masks_and_weights.append((color_mask, color_conf, "color"))

if target_ink == "all":
stroke_mask, stroke_conf = _detect_by_stroke_analysis(img_bgr)
variance_mask, variance_conf = _detect_by_variance(img_bgr)
masks_and_weights.append((stroke_mask, stroke_conf, "stroke"))
masks_and_weights.append((variance_mask, variance_conf, "variance"))

if target_ink in ("all", "pencil"):
pencil_mask, pencil_conf = _detect_pencil(img_bgr)
masks_and_weights.append((pencil_mask, pencil_conf, "pencil"))

if not masks_and_weights:
# Fallback: use all methods
color_mask, color_conf = _detect_by_color(img_bgr)
stroke_mask, stroke_conf = _detect_by_stroke_analysis(img_bgr)
variance_mask, variance_conf = _detect_by_variance(img_bgr)
pencil_mask, pencil_conf = _detect_pencil(img_bgr)
masks_and_weights = [
(color_mask, color_conf, "color"),
(stroke_mask, stroke_conf, "stroke"),
(variance_mask, variance_conf, "variance"),
(pencil_mask, pencil_conf, "pencil"),
]

# Combine masks using weighted average
weights = [color_confidence, stroke_confidence, variance_confidence]
total_weight = sum(weights)
total_weight = sum(w for _, w, _ in masks_and_weights)

if total_weight > 0:
# Weighted combination
combined_mask = (
color_mask.astype(np.float32) * color_confidence +
stroke_mask.astype(np.float32) * stroke_confidence +
variance_mask.astype(np.float32) * variance_confidence
combined_mask = sum(
m.astype(np.float32) * w for m, w, _ in masks_and_weights
) / total_weight

# Threshold to binary
combined_mask = (combined_mask > 127).astype(np.uint8) * 255
else:
combined_mask = np.zeros(img_bgr.shape[:2], dtype=np.uint8)
@@ -103,19 +124,11 @@ def detect_handwriting(image_bytes: bytes) -> DetectionResult:
handwriting_pixels = np.sum(combined_mask > 0)
handwriting_ratio = handwriting_pixels / total_pixels if total_pixels > 0 else 0

# Determine primary method
primary_method = "combined"
max_conf = max(color_confidence, stroke_confidence, variance_confidence)
if max_conf == color_confidence:
primary_method = "color"
elif max_conf == stroke_confidence:
primary_method = "stroke"
else:
primary_method = "variance"
# Determine primary method (highest confidence)
primary_method = max(masks_and_weights, key=lambda x: x[1])[2] if masks_and_weights else "combined"
overall_confidence = total_weight / len(masks_and_weights) if masks_and_weights else 0.0

overall_confidence = total_weight / 3.0 # Average confidence

logger.info(f"Handwriting detection: {handwriting_ratio:.2%} handwriting, "
logger.info(f"Handwriting detection (target_ink={target_ink}): {handwriting_ratio:.2%} handwriting, "
f"confidence={overall_confidence:.2f}, method={primary_method}")

return DetectionResult(
@@ -180,6 +193,27 @@ def _detect_by_color(img_bgr: np.ndarray) -> Tuple[np.ndarray, float]:
return color_mask, confidence


def _detect_pencil(img_bgr: np.ndarray) -> Tuple[np.ndarray, float]:
"""
Detect pencil marks (gray ink, ~140-220 on 255-scale).

Paper is usually >230, dark ink <130.
Pencil falls in the 140-220 gray range.
"""
gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
pencil_mask = cv2.inRange(gray, 140, 220)

# Remove small noise artifacts
kernel = np.ones((2, 2), np.uint8)
pencil_mask = cv2.morphologyEx(pencil_mask, cv2.MORPH_OPEN, kernel, iterations=1)

ratio = np.sum(pencil_mask > 0) / pencil_mask.size
# Good confidence if pencil pixels are in a plausible range
confidence = 0.75 if 0.002 < ratio < 0.2 else 0.2

return pencil_mask, confidence


def _detect_by_stroke_analysis(img_bgr: np.ndarray) -> Tuple[np.ndarray, float]:
"""
Detect handwriting by analyzing stroke characteristics.
@@ -31,8 +31,10 @@ from datetime import datetime, timedelta
logger = logging.getLogger(__name__)

# Lazy loading for heavy dependencies
_trocr_processor = None
_trocr_model = None
# Cache keyed by model_name to support base and large variants simultaneously
_trocr_models: dict = {} # {model_name: (processor, model)}
_trocr_processor = None # backwards-compat alias → base-printed
_trocr_model = None # backwards-compat alias → base-printed
_trocr_available = None
_model_loaded_at = None
@@ -124,12 +126,14 @@ def _check_trocr_available() -> bool:
return _trocr_available


def get_trocr_model(handwritten: bool = False):
def get_trocr_model(handwritten: bool = False, size: str = "base"):
"""
Lazy load TrOCR model and processor.

Args:
handwritten: Use handwritten model instead of printed model
size: Model size — "base" (300 MB) or "large" (340 MB, higher accuracy
for exam HTR). Only applies to handwritten variant.

Returns tuple of (processor, model) or (None, None) if unavailable.
"""
@@ -138,31 +142,42 @@ def get_trocr_model(handwritten: bool = False):
if not _check_trocr_available():
return None, None

if _trocr_processor is None or _trocr_model is None:
try:
import torch
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
# Select model name
if size == "large" and handwritten:
model_name = "microsoft/trocr-large-handwritten"
elif handwritten:
model_name = "microsoft/trocr-base-handwritten"
else:
model_name = "microsoft/trocr-base-printed"

# Choose model based on use case
if handwritten:
model_name = "microsoft/trocr-base-handwritten"
else:
model_name = "microsoft/trocr-base-printed"
if model_name in _trocr_models:
return _trocr_models[model_name]

logger.info(f"Loading TrOCR model: {model_name}")
_trocr_processor = TrOCRProcessor.from_pretrained(model_name)
_trocr_model = VisionEncoderDecoderModel.from_pretrained(model_name)
try:
import torch
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

# Use GPU if available
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
_trocr_model.to(device)
logger.info(f"TrOCR model loaded on device: {device}")
logger.info(f"Loading TrOCR model: {model_name}")
processor = TrOCRProcessor.from_pretrained(model_name)
model = VisionEncoderDecoderModel.from_pretrained(model_name)

except Exception as e:
logger.error(f"Failed to load TrOCR model: {e}")
return None, None
# Use GPU if available
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
model.to(device)
logger.info(f"TrOCR model loaded on device: {device}")

return _trocr_processor, _trocr_model
_trocr_models[model_name] = (processor, model)

# Keep backwards-compat globals pointing at base-printed
if model_name == "microsoft/trocr-base-printed":
_trocr_processor = processor
_trocr_model = model

return processor, model

except Exception as e:
logger.error(f"Failed to load TrOCR model {model_name}: {e}")
return None, None


def preload_trocr_model(handwritten: bool = True) -> bool:
@@ -209,7 +224,8 @@ def get_model_status() -> Dict[str, Any]:
async def run_trocr_ocr(
image_data: bytes,
handwritten: bool = False,
split_lines: bool = True
split_lines: bool = True,
size: str = "base",
) -> Tuple[Optional[str], float]:
"""
Run TrOCR on an image.

@@ -223,11 +239,12 @@ async def run_trocr_ocr(
image_data: Raw image bytes
handwritten: Use handwritten model (slower but better for handwriting)
split_lines: Whether to split image into lines first
size: "base" or "large" (only for handwritten variant)

Returns:
Tuple of (extracted_text, confidence)
"""
processor, model = get_trocr_model(handwritten=handwritten)
processor, model = get_trocr_model(handwritten=handwritten, size=size)

if processor is None or model is None:
logger.error("TrOCR model not available")
Reference in New Issue
Block a user