feat(ocr-pipeline): line grouping fix + RapidOCR integration

Fix A: Use _group_words_into_lines() with an adaptive Y-tolerance so that words in multi-line cells are joined in the correct order (fixes the word-reordering bug). RapidOCR: Add RapidOCR as an alternative OCR engine (PaddleOCR models on ONNX Runtime, native ARM64). The engine is selectable via a dropdown in the UI or the ?engine= query parameter; auto mode prefers RapidOCR when it is available. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-28 17:13:58 +01:00
parent 4ec7c20490
commit 45435f226f
4 changed files with 180 additions and 17 deletions
@@ -2173,6 +2173,101 @@ def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Li
 # Pipeline Step 5: Word Grid from Columns × Rows
 # =============================================================================

+def _words_to_reading_order_text(words: List[Dict], y_tolerance_px: int = 15) -> str:
+    """Join OCR words into text in correct reading order.
+
+    Groups words into visual lines by Y-tolerance, sorts each line by X,
+    then joins lines with spaces. This fixes multi-line cell reading order.
+    """
+    if not words:
+        return ''
+
+    lines = _group_words_into_lines(words, y_tolerance_px=y_tolerance_px)
+    line_texts = []
+    for line in lines:
+        line_texts.append(' '.join(w['text'] for w in line))
+    return ' '.join(line_texts)
+
+
+# --- RapidOCR integration (PaddleOCR models on ONNX Runtime) ---
+
+# Module-level cache for the engine instance; populated by _get_rapid_engine().
+_rapid_engine = None
+# Flipped to True below only when the optional `rapidocr` package imports.
+RAPIDOCR_AVAILABLE = False
+
+# Optional dependency: a failed import is logged and leaves the flag False
+# rather than breaking import of this module. Note that _RapidOCRClass stays
+# unbound in that case.
+try:
+    from rapidocr import RapidOCR as _RapidOCRClass
+    RAPIDOCR_AVAILABLE = True
+    logger.info("RapidOCR available — can be used as alternative to Tesseract")
+except ImportError:
+    logger.info("RapidOCR not installed — using Tesseract only")
+
+
+def _get_rapid_engine():
+    """Lazy-init RapidOCR engine (downloads models on first use)."""
+    global _rapid_engine
+    if _rapid_engine is None:
+        _rapid_engine = _RapidOCRClass()
+        logger.info("RapidOCR engine initialized")
+    return _rapid_engine
+
+
+def ocr_region_rapid(
+    img_bgr: np.ndarray,
+    region: PageRegion,
+) -> List[Dict[str, Any]]:
+    """Run RapidOCR on a specific region, returning word dicts compatible with Tesseract format.
+
+    Args:
+        img_bgr: Full-page BGR image (NOT binarized — RapidOCR works on color/gray).
+        region: Region to crop and OCR.
+
+    Returns:
+        List of word dicts with text, left, top, width, height, conf, region_type.
+    """
+    engine = _get_rapid_engine()
+
+    # Crop region from BGR image
+    crop = img_bgr[region.y:region.y + region.height,
+                   region.x:region.x + region.width]
+
+    if crop.size == 0:
+        return []
+
+    result = engine(crop)
+
+    if result is None or result.boxes is None or result.txts is None:
+        return []
+
+    words = []
+    boxes = result.boxes    # shape (N, 4, 2) — 4 corner points per text line
+    txts = result.txts      # tuple of strings
+    scores = result.scores  # tuple of floats
+
+    for i, (box, txt, score) in enumerate(zip(boxes, txts, scores)):
+        if not txt or not txt.strip():
+            continue
+
+        # box is [[x1,y1],[x2,y2],[x3,y3],[x4,y4]] (clockwise from top-left)
+        xs = [p[0] for p in box]
+        ys = [p[1] for p in box]
+        left = int(min(xs))
+        top = int(min(ys))
+        w = int(max(xs) - left)
+        h = int(max(ys) - top)
+
+        words.append({
+            'text': txt.strip(),
+            'left': left + region.x,   # Absolute coords
+            'top': top + region.y,
+            'width': w,
+            'height': h,
+            'conf': int(score * 100),  # 0-100 like Tesseract
+            'region_type': region.type,
+        })
+
+    return words
+
+
 def build_word_grid(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
@@ -2180,20 +2275,37 @@ def build_word_grid(
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
+    ocr_engine: str = "auto",
+    img_bgr: Optional[np.ndarray] = None,
 ) -> List[Dict[str, Any]]:
    """Build a word grid by intersecting columns and rows, then OCR each cell.

    Args:
-        ocr_img: Binarized full-page image.
+        ocr_img: Binarized full-page image (for Tesseract).
        column_regions: Classified columns from Step 3 (PageRegion list).
        row_geometries: Rows from Step 4 (RowGeometry list).
        img_w: Image width in pixels.
        img_h: Image height in pixels.
        lang: Default Tesseract language.
+        ocr_engine: 'tesseract', 'rapid', or 'auto' (rapid if available, else tesseract).
+        img_bgr: BGR color image (required for RapidOCR).

    Returns:
        List of entry dicts with english/german/example text and bbox info (percent).
    """
+    # Resolve engine choice
+    use_rapid = False
+    if ocr_engine == "auto":
+        use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None
+    elif ocr_engine == "rapid":
+        if not RAPIDOCR_AVAILABLE:
+            logger.warning("RapidOCR requested but not available, falling back to Tesseract")
+        else:
+            use_rapid = True
+
+    engine_name = "rapid" if use_rapid else "tesseract"
+    logger.info(f"build_word_grid: using OCR engine '{engine_name}'")
+
    # Filter to content rows only (skip header/footer)
    content_rows = [r for r in row_geometries if r.row_type == 'content']
    if not content_rows:
@@ -2210,7 +2322,7 @@ def build_word_grid(
    # Sort columns left-to-right
    relevant_cols.sort(key=lambda c: c.x)

-    # Choose OCR language per column type
+    # Choose OCR language per column type (Tesseract only)
    lang_map = {
        'column_en': 'eng',
        'column_de': 'deu',
@@ -2235,6 +2347,7 @@ def build_word_grid(
            'bbox_en': None,
            'bbox_de': None,
            'bbox_ex': None,
+            'ocr_engine': engine_name,
        }

        confidences: List[float] = []
@@ -2263,12 +2376,22 @@ def build_word_grid(
                width=cell_w, height=cell_h,
            )

-            cell_lang = lang_map.get(col.type, lang)
-            words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=6)
+            # OCR the cell
+            if use_rapid:
+                words = ocr_region_rapid(img_bgr, cell_region)
+            else:
+                cell_lang = lang_map.get(col.type, lang)
+                words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=6)
+
+            # Group into lines, then join in reading order (Fix A)
+            # Use half of average word height as Y-tolerance
+            if words:
+                avg_h = sum(w['height'] for w in words) / len(words)
+                y_tol = max(10, int(avg_h * 0.5))
+            else:
+                y_tol = 15
+            text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)

-            # Sort words by Y then X (reading order for multi-line cells)
-            words.sort(key=lambda w: (w['top'], w['left']))
-            text = ' '.join(w['text'] for w in words)
            if words:
                avg_conf = sum(w['conf'] for w in words) / len(words)
                confidences.append(avg_conf)
@@ -2300,7 +2423,8 @@ def build_word_grid(
            entries.append(entry)

    logger.info(f"build_word_grid: {len(entries)} entries from "
-                f"{len(content_rows)} content rows × {len(relevant_cols)} columns")
+                f"{len(content_rows)} content rows × {len(relevant_cols)} columns "
+                f"(engine={engine_name})")

    return entries

@@ -1007,8 +1007,12 @@ async def get_row_ground_truth(session_id: str):
 # ---------------------------------------------------------------------------

@router.post("/sessions/{session_id}/words")
-async def detect_words(session_id: str):
-    """Build word grid from columns × rows, OCR each cell."""
+async def detect_words(session_id: str, engine: str = "auto"):
+    """Build word grid from columns × rows, OCR each cell.
+
+    Query params:
+        engine: 'auto' (default), 'tesseract', or 'rapid'
+    """
    if session_id not in _cache:
        await _load_session_to_cache(session_id)
    cached = _get_cached(session_id)
@@ -1030,7 +1034,7 @@ async def detect_words(session_id: str):

    t0 = time.time()

-    # Create binarized OCR image
+    # Create binarized OCR image (for Tesseract)
    ocr_img = create_ocr_image(dewarped_bgr)
    img_h, img_w = dewarped_bgr.shape[:2]

@@ -1060,8 +1064,11 @@ async def detect_words(session_id: str):
        for r in row_result["rows"]
    ]

-    # Build word grid
-    entries = build_word_grid(ocr_img, col_regions, row_geoms, img_w, img_h)
+    # Build word grid — pass both binarized (for Tesseract) and BGR (for RapidOCR)
+    entries = build_word_grid(
+        ocr_img, col_regions, row_geoms, img_w, img_h,
+        ocr_engine=engine, img_bgr=dewarped_bgr,
+    )
    duration = time.time() - t0

    # Build summary
@@ -1072,6 +1079,9 @@ async def detect_words(session_id: str):
        "low_confidence": sum(1 for e in entries if e.get("confidence", 0) < 50),
    }

+    # Determine which engine was actually used
+    used_engine = entries[0].get("ocr_engine", "tesseract") if entries else engine
+
    word_result = {
        "entries": entries,
        "entry_count": len(entries),
@@ -1079,6 +1089,7 @@ async def detect_words(session_id: str):
        "image_height": img_h,
        "duration_seconds": round(duration, 2),
        "summary": summary,
+        "ocr_engine": used_engine,
    }

    # Persist to DB