feat(ocr-pipeline): line grouping fix + RapidOCR integration

Fix A: Use _group_words_into_lines() with adaptive Y-tolerance to
correctly order words in multi-line cells (fixes the word-reordering bug).

RapidOCR: Add RapidOCR as an alternative OCR engine (PaddleOCR models on
ONNX Runtime, native ARM64). The engine is selectable via a dropdown in the
UI or the ?engine= query param ('auto', 'tesseract', or 'rapid'). Auto mode
prefers RapidOCR when available.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-02-28 17:13:58 +01:00
parent 4ec7c20490
commit 45435f226f
4 changed files with 180 additions and 17 deletions

View File

@@ -1007,8 +1007,12 @@ async def get_row_ground_truth(session_id: str):
# ---------------------------------------------------------------------------
@router.post("/sessions/{session_id}/words")
async def detect_words(session_id: str):
"""Build word grid from columns × rows, OCR each cell."""
async def detect_words(session_id: str, engine: str = "auto"):
"""Build word grid from columns × rows, OCR each cell.
Query params:
engine: 'auto' (default), 'tesseract', or 'rapid'
"""
if session_id not in _cache:
await _load_session_to_cache(session_id)
cached = _get_cached(session_id)
@@ -1030,7 +1034,7 @@ async def detect_words(session_id: str):
t0 = time.time()
# Create binarized OCR image
# Create binarized OCR image (for Tesseract)
ocr_img = create_ocr_image(dewarped_bgr)
img_h, img_w = dewarped_bgr.shape[:2]
@@ -1060,8 +1064,11 @@ async def detect_words(session_id: str):
for r in row_result["rows"]
]
# Build word grid
entries = build_word_grid(ocr_img, col_regions, row_geoms, img_w, img_h)
# Build word grid — pass both binarized (for Tesseract) and BGR (for RapidOCR)
entries = build_word_grid(
ocr_img, col_regions, row_geoms, img_w, img_h,
ocr_engine=engine, img_bgr=dewarped_bgr,
)
duration = time.time() - t0
# Build summary
@@ -1072,6 +1079,9 @@ async def detect_words(session_id: str):
"low_confidence": sum(1 for e in entries if e.get("confidence", 0) < 50),
}
# Determine which engine was actually used
used_engine = entries[0].get("ocr_engine", "tesseract") if entries else engine
word_result = {
"entries": entries,
"entry_count": len(entries),
@@ -1079,6 +1089,7 @@ async def detect_words(session_id: str):
"image_height": img_h,
"duration_seconds": round(duration, 2),
"summary": summary,
"ocr_engine": used_engine,
}
# Persist to DB