feat: PaddleOCR Remote-Engine (PP-OCRv5 Latin auf Hetzner x86_64)

PaddleOCR als neue engine=paddle Option in der OCR-Pipeline. Microservice auf Hetzner (paddleocr-service/), async HTTP-Client (paddleocr_remote.py), Frontend-Dropdown, automatisch words_first.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-12 09:31:22 +01:00
parent ced5bb3dd3
commit a6069631cc
10 changed files with 354 additions and 27 deletions
--- a/klausur-service/backend/cv_ocr_engines.py
+++ b/klausur-service/backend/cv_ocr_engines.py
@@ -385,6 +385,51 @@ def ocr_region_lighton(img_bgr: np.ndarray, region: PageRegion) -> List[Dict[str
        return []


+# --- Remote PaddleOCR (Hetzner x86_64) ---
+
+
+async def ocr_region_paddle(
+    img_bgr: np.ndarray,
+    region: Optional["PageRegion"] = None,
+) -> List[Dict[str, Any]]:
+    """OCR an image, or a single region of it, via the remote PaddleOCR service.
+
+    With *region*, only that crop is sent and each word gets ``region_type`` plus
+    ``left``/``top`` shifted by the region origin. Returns ``[]`` for an empty
+    crop or a failed PNG encode.
+    """
+    from services.paddleocr_remote import ocr_remote_paddle
+
+    # Origin of the uploaded crop inside the full image.
+    origin_x, origin_y = (0, 0) if region is None else (region.x, region.y)
+    if region is None:
+        patch = img_bgr
+    else:
+        rows = slice(origin_y, origin_y + region.height)
+        cols = slice(origin_x, origin_x + region.width)
+        patch = img_bgr[rows, cols]
+
+    if not patch.size:
+        return []
+
+    # The service receives the crop as PNG bytes.
+    encoded_ok, png = cv2.imencode(".png", patch)
+    if not encoded_ok:
+        logger.error("ocr_region_paddle: cv2.imencode failed")
+        return []
+
+    words, _img_w, _img_h = await ocr_remote_paddle(png.tobytes())
+
+    # Translate crop-relative positions back into full-image coordinates.
+    for word in words:
+        word["left"] += origin_x
+        word["top"] += origin_y
+        if region is not None:
+            word["region_type"] = region.type
+
+    return words
+
+
 # =============================================================================
 # Post-Processing: Deterministic Quality Fixes
 # =============================================================================