feat(ocr-pipeline): line grouping fix + RapidOCR integration
Fix A: Use _group_words_into_lines() with an adaptive Y-tolerance so that words in multi-line cells are emitted in the correct reading order (fixes the word-reordering bug).

RapidOCR: Add RapidOCR as an alternative OCR engine (PaddleOCR models running on ONNX Runtime, native ARM64). The engine is selectable via a dropdown in the UI or the ?engine= query parameter ('auto', 'tesseract', or 'rapid'). Auto mode prefers RapidOCR when it is available.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1007,8 +1007,12 @@ async def get_row_ground_truth(session_id: str):
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@router.post("/sessions/{session_id}/words")
|
||||
async def detect_words(session_id: str):
|
||||
"""Build word grid from columns × rows, OCR each cell."""
|
||||
async def detect_words(session_id: str, engine: str = "auto"):
|
||||
"""Build word grid from columns × rows, OCR each cell.
|
||||
|
||||
Query params:
|
||||
engine: 'auto' (default), 'tesseract', or 'rapid'
|
||||
"""
|
||||
if session_id not in _cache:
|
||||
await _load_session_to_cache(session_id)
|
||||
cached = _get_cached(session_id)
|
||||
@@ -1030,7 +1034,7 @@ async def detect_words(session_id: str):
|
||||
|
||||
t0 = time.time()
|
||||
|
||||
# Create binarized OCR image
|
||||
# Create binarized OCR image (for Tesseract)
|
||||
ocr_img = create_ocr_image(dewarped_bgr)
|
||||
img_h, img_w = dewarped_bgr.shape[:2]
|
||||
|
||||
@@ -1060,8 +1064,11 @@ async def detect_words(session_id: str):
|
||||
for r in row_result["rows"]
|
||||
]
|
||||
|
||||
# Build word grid
|
||||
entries = build_word_grid(ocr_img, col_regions, row_geoms, img_w, img_h)
|
||||
# Build word grid — pass both binarized (for Tesseract) and BGR (for RapidOCR)
|
||||
entries = build_word_grid(
|
||||
ocr_img, col_regions, row_geoms, img_w, img_h,
|
||||
ocr_engine=engine, img_bgr=dewarped_bgr,
|
||||
)
|
||||
duration = time.time() - t0
|
||||
|
||||
# Build summary
|
||||
@@ -1072,6 +1079,9 @@ async def detect_words(session_id: str):
|
||||
"low_confidence": sum(1 for e in entries if e.get("confidence", 0) < 50),
|
||||
}
|
||||
|
||||
# Determine which engine was actually used
|
||||
used_engine = entries[0].get("ocr_engine", "tesseract") if entries else engine
|
||||
|
||||
word_result = {
|
||||
"entries": entries,
|
||||
"entry_count": len(entries),
|
||||
@@ -1079,6 +1089,7 @@ async def detect_words(session_id: str):
|
||||
"image_height": img_h,
|
||||
"duration_seconds": round(duration, 2),
|
||||
"summary": summary,
|
||||
"ocr_engine": used_engine,
|
||||
}
|
||||
|
||||
# Persist to DB
|
||||
|
||||
Reference in New Issue
Block a user