feat: PaddleOCR Remote-Engine (PP-OCRv5 Latin auf Hetzner x86_64)

PaddleOCR als neue engine=paddle Option in der OCR-Pipeline. Microservice auf Hetzner (paddleocr-service/), async HTTP-Client (paddleocr_remote.py), Frontend-Dropdown; bei engine=paddle wird grid_method automatisch auf words_first gesetzt.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-12 09:31:22 +01:00
parent ced5bb3dd3
commit a6069631cc
10 changed files with 354 additions and 27 deletions
--- a/klausur-service/backend/ocr_pipeline_api.py
+++ b/klausur-service/backend/ocr_pipeline_api.py
@@ -1865,7 +1865,7 @@ async def detect_words(
    """Build word grid from columns × rows, OCR each cell.

    Query params:
-        engine: 'auto' (default), 'tesseract', or 'rapid'
+        engine: 'auto' (default), 'tesseract', 'rapid', or 'paddle'
        pronunciation: 'british' (default) or 'american' — for IPA dictionary lookup
        stream: false (default) for JSON response, true for SSE streaming
        skip_heal_gaps: false (default). When true, cells keep exact row geometry
@@ -1874,6 +1874,11 @@ async def detect_words(
            'v2' uses pre-detected columns/rows (top-down).
            'words_first' clusters words bottom-up (no column/row detection needed).
    """
+    # PaddleOCR is full-page remote OCR → force words_first grid method
+    if engine == "paddle" and grid_method != "words_first":
+        logger.info("detect_words: engine=paddle requires words_first, overriding grid_method=%s", grid_method)
+        grid_method = "words_first"
+
    if session_id not in _cache:
        logger.info("detect_words: session %s not in cache, loading from DB", session_id)
        await _load_session_to_cache(session_id)
@@ -1993,33 +1998,43 @@ async def detect_words(
        t0 = time.time()
        img_h, img_w = dewarped_bgr.shape[:2]

-        # Get word_dicts from cache or run Tesseract full-page
-        wf_word_dicts = cached.get("_word_dicts")
-        if wf_word_dicts is None:
-            ocr_img_tmp = create_ocr_image(dewarped_bgr)
-            geo_result = detect_column_geometry(ocr_img_tmp, dewarped_bgr)
-            if geo_result is not None:
-                _geoms, left_x, right_x, top_y, bottom_y, wf_word_dicts, inv = geo_result
-                cached["_word_dicts"] = wf_word_dicts
-                cached["_inv"] = inv
-                cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y)
+        # For paddle engine: run remote PaddleOCR full-page instead of Tesseract
+        if engine == "paddle":
+            from cv_ocr_engines import ocr_region_paddle
+
+            wf_word_dicts = await ocr_region_paddle(dewarped_bgr, region=None)
+            # PaddleOCR returns absolute coordinates, no content_bounds offset needed
+            cached["_paddle_word_dicts"] = wf_word_dicts
+        else:
+            # Get word_dicts from cache or run Tesseract full-page
+            wf_word_dicts = cached.get("_word_dicts")
+            if wf_word_dicts is None:
+                ocr_img_tmp = create_ocr_image(dewarped_bgr)
+                geo_result = detect_column_geometry(ocr_img_tmp, dewarped_bgr)
+                if geo_result is not None:
+                    _geoms, left_x, right_x, top_y, bottom_y, wf_word_dicts, inv = geo_result
+                    cached["_word_dicts"] = wf_word_dicts
+                    cached["_inv"] = inv
+                    cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y)

        if not wf_word_dicts:
            raise HTTPException(status_code=400, detail="No words detected — cannot build words-first grid")

        # Convert word coordinates to absolute image coordinates if needed
        # (detect_column_geometry returns words relative to content ROI)
-        content_bounds = cached.get("_content_bounds")
-        if content_bounds:
-            lx, _rx, ty, _by = content_bounds
-            abs_words = []
-            for w in wf_word_dicts:
-                abs_words.append({
-                    **w,
-                    'left': w['left'] + lx,
-                    'top': w['top'] + ty,
-                })
-            wf_word_dicts = abs_words
+        # PaddleOCR already returns absolute coordinates — skip offset.
+        if engine != "paddle":
+            content_bounds = cached.get("_content_bounds")
+            if content_bounds:
+                lx, _rx, ty, _by = content_bounds
+                abs_words = []
+                for w in wf_word_dicts:
+                    abs_words.append({
+                        **w,
+                        'left': w['left'] + lx,
+                        'top': w['top'] + ty,
+                    })
+                wf_word_dicts = abs_words

        cells, columns_meta = build_grid_from_words(wf_word_dicts, img_w, img_h)
        duration = time.time() - t0
@@ -2035,7 +2050,7 @@ async def detect_words(
        is_vocab = bool(col_types & {'column_en', 'column_de'})
        n_rows = len(set(c['row_index'] for c in cells)) if cells else 0
        n_cols = len(columns_meta)
-        used_engine = "words_first"
+        used_engine = "paddle" if engine == "paddle" else "words_first"

        word_result = {
            "cells": cells,