feat(ocr-pipeline): line grouping fix + RapidOCR integration

Fix A: Use _group_words_into_lines() with adaptive Y-tolerance so that words in multi-line cells are ordered correctly (fixes the word-reordering bug). RapidOCR: Add RapidOCR as an alternative OCR engine (PaddleOCR models on ONNX Runtime, native ARM64). The engine is selectable via a dropdown in the UI or the ?engine= query parameter ('auto', 'tesseract', or 'rapid'). Auto mode prefers RapidOCR when it is available. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-28 17:13:58 +01:00
parent 4ec7c20490
commit 45435f226f
4 changed files with 180 additions and 17 deletions
--- a/klausur-service/backend/ocr_pipeline_api.py
+++ b/klausur-service/backend/ocr_pipeline_api.py
@@ -1007,8 +1007,12 @@ async def get_row_ground_truth(session_id: str):
 # ---------------------------------------------------------------------------

@router.post("/sessions/{session_id}/words")
-async def detect_words(session_id: str):
-    """Build word grid from columns × rows, OCR each cell."""
+async def detect_words(session_id: str, engine: str = "auto"):
+    """Build word grid from columns × rows, OCR each cell.
+
+    Query params:
+        engine: 'auto' (default), 'tesseract', or 'rapid'
+    """
    if session_id not in _cache:
        await _load_session_to_cache(session_id)
    cached = _get_cached(session_id)
@@ -1030,7 +1034,7 @@ async def detect_words(session_id: str):

    t0 = time.time()

-    # Create binarized OCR image
+    # Create binarized OCR image (for Tesseract)
    ocr_img = create_ocr_image(dewarped_bgr)
    img_h, img_w = dewarped_bgr.shape[:2]

@@ -1060,8 +1064,11 @@ async def detect_words(session_id: str):
        for r in row_result["rows"]
    ]

-    # Build word grid
-    entries = build_word_grid(ocr_img, col_regions, row_geoms, img_w, img_h)
+    # Build word grid — pass both binarized (for Tesseract) and BGR (for RapidOCR)
+    entries = build_word_grid(
+        ocr_img, col_regions, row_geoms, img_w, img_h,
+        ocr_engine=engine, img_bgr=dewarped_bgr,
+    )
    duration = time.time() - t0

    # Build summary
@@ -1072,6 +1079,9 @@ async def detect_words(session_id: str):
        "low_confidence": sum(1 for e in entries if e.get("confidence", 0) < 50),
    }

+    # Determine which engine was actually used
+    used_engine = entries[0].get("ocr_engine", "tesseract") if entries else engine
+
    word_result = {
        "entries": entries,
        "entry_count": len(entries),
@@ -1079,6 +1089,7 @@ async def detect_words(session_id: str):
        "image_height": img_h,
        "duration_seconds": round(duration, 2),
        "summary": summary,
+        "ocr_engine": used_engine,
    }

    # Persist to DB