feat: add Kombi-Vergleich mode for side-by-side Paddle vs RapidOCR comparison

Add /rapid-kombi backend endpoint using local RapidOCR + Tesseract merge, KombiCompareStep component for parallel execution and side-by-side overlay, and wordResultOverride prop on OverlayReconstruction for direct data injection.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-14 07:59:06 +01:00
parent c2c082d4b4
commit a994ddee83
6 changed files with 504 additions and 35 deletions
@@ -2976,6 +2976,141 @@ async def paddle_kombi(session_id: str):
    return {"session_id": session_id, **word_result}


+@router.post("/sessions/{session_id}/rapid-kombi")
+async def rapid_kombi(session_id: str):
+    """Run RapidOCR + Tesseract on the preprocessed image and merge results.
+
+    Same merge logic as paddle-kombi, but uses local RapidOCR (ONNX Runtime)
+    instead of remote PaddleOCR service.
+
+    The session image is looked up as "cropped", then "dewarped", then
+    "original"; the first one found is decoded and fed to both engines.
+    The merged words are turned into a grid, stored on the session as
+    ``word_result`` and appended to the pipeline log.
+
+    Args:
+        session_id: Identifier of the session whose image is processed.
+
+    Returns:
+        A dict holding ``session_id`` plus every key of the stored
+        ``word_result`` (cells, grid shape, raw engine words, summary, ...).
+
+    Raises:
+        HTTPException: 404 when the session has no image at all; 400 when
+            the image bytes cannot be decoded or when both OCR engines
+            return no words.
+    """
+    img_png = await get_session_image(session_id, "cropped")
+    if not img_png:
+        img_png = await get_session_image(session_id, "dewarped")
+    if not img_png:
+        img_png = await get_session_image(session_id, "original")
+    if not img_png:
+        raise HTTPException(status_code=404, detail="No image found for this session")
+
+    img_arr = np.frombuffer(img_png, dtype=np.uint8)
+    img_bgr = cv2.imdecode(img_arr, cv2.IMREAD_COLOR)
+    if img_bgr is None:
+        raise HTTPException(status_code=400, detail="Failed to decode image")
+
+    img_h, img_w = img_bgr.shape[:2]
+
+    from cv_ocr_engines import ocr_region_rapid
+    from cv_vocab_types import PageRegion
+
+    t0 = time.time()
+
+    # --- RapidOCR (local, synchronous) ---
+    # NOTE(review): this call and the pytesseract call below are not awaited,
+    # so they run inline in this async handler — confirm that is acceptable.
+    full_region = PageRegion(
+        type="full_page", x=0, y=0, width=img_w, height=img_h,
+    )
+    # Normalise a falsy result (e.g. None) to an empty list.
+    rapid_words = ocr_region_rapid(img_bgr, full_region) or []
+
+    # --- Tesseract ---
+    from PIL import Image
+    import pytesseract
+
+    pil_img = Image.fromarray(cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB))
+    data = pytesseract.image_to_data(
+        pil_img, lang="eng+deu",
+        config="--psm 6 --oem 3",
+        output_type=pytesseract.Output.DICT,
+    )
+    tess_words = []
+    for i, raw_text in enumerate(data["text"]):
+        text = str(raw_text).strip()
+        # The confidence column may arrive as an int, a float or a numeric
+        # string (e.g. "96.06") depending on the Tesseract/pytesseract
+        # version. Parse via float so fractional values are not mistaken
+        # for "no confidence"; anything unparsable counts as -1.
+        try:
+            conf = int(float(data["conf"][i]))
+        except (TypeError, ValueError, OverflowError):
+            conf = -1
+        # Skip empty tokens and everything below the confidence floor of 20.
+        if not text or conf < 20:
+            continue
+        tess_words.append({
+            "text": text,
+            "left": data["left"][i],
+            "top": data["top"][i],
+            "width": data["width"][i],
+            "height": data["height"][i],
+            "conf": conf,
+        })
+
+    # --- Split multi-word RapidOCR boxes into individual words ---
+    rapid_words_split = _split_paddle_multi_words(rapid_words)
+    logger.info(
+        "rapid_kombi: split %d rapid boxes → %d individual words",
+        len(rapid_words), len(rapid_words_split),
+    )
+
+    # --- Merge ---
+    if not rapid_words_split and not tess_words:
+        raise HTTPException(status_code=400, detail="Both OCR engines returned no words")
+
+    merged_words = _merge_paddle_tesseract(rapid_words_split, tess_words)
+
+    cells, columns_meta = build_grid_from_words(merged_words, img_w, img_h)
+    duration = time.time() - t0
+
+    # Tag every cell with the engine combination that produced it.
+    for cell in cells:
+        cell["ocr_engine"] = "rapid_kombi"
+
+    n_rows = len({c["row_index"] for c in cells})
+    n_cols = len(columns_meta)
+    col_types = {c.get("type") for c in columns_meta}
+    # The layout counts as "vocab" as soon as an EN or DE column is present.
+    is_vocab = bool(col_types & {"column_en", "column_de"})
+
+    word_result = {
+        "cells": cells,
+        "grid_shape": {"rows": n_rows, "cols": n_cols, "total_cells": len(cells)},
+        "columns_used": columns_meta,
+        "layout": "vocab" if is_vocab else "generic",
+        "image_width": img_w,
+        "image_height": img_h,
+        "duration_seconds": round(duration, 2),
+        "ocr_engine": "rapid_kombi",
+        "grid_method": "rapid_kombi",
+        "raw_rapid_words": rapid_words,
+        "raw_rapid_words_split": rapid_words_split,
+        "raw_tesseract_words": tess_words,
+        "summary": {
+            "total_cells": len(cells),
+            "non_empty_cells": sum(1 for c in cells if c.get("text")),
+            "low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50),
+            "rapid_words": len(rapid_words),
+            "rapid_words_split": len(rapid_words_split),
+            "tesseract_words": len(tess_words),
+            "merged_words": len(merged_words),
+        },
+    }
+
+    # NOTE(review): img_png may be the "dewarped" or "original" fallback, yet
+    # it is stored as cropped_png — confirm this overwrite is intended.
+    await update_session_db(
+        session_id,
+        word_result=word_result,
+        cropped_png=img_png,
+        current_step=8,
+    )
+
+    logger.info(
+        "rapid_kombi session %s: %d cells (%d rows, %d cols) in %.2fs "
+        "[rapid=%d, tess=%d, merged=%d]",
+        session_id, len(cells), n_rows, n_cols, duration,
+        len(rapid_words), len(tess_words), len(merged_words),
+    )
+
+    await _append_pipeline_log(session_id, "rapid_kombi", {
+        "total_cells": len(cells),
+        "non_empty_cells": word_result["summary"]["non_empty_cells"],
+        "rapid_words": len(rapid_words),
+        "tesseract_words": len(tess_words),
+        "merged_words": len(merged_words),
+        "ocr_engine": "rapid_kombi",
+    }, duration_ms=int(duration * 1000))
+
+    return {"session_id": session_id, **word_result}
+
+
 class WordGroundTruthRequest(BaseModel):
    is_correct: bool
    corrected_entries: Optional[List[Dict[str, Any]]] = None
@@ -449,6 +449,67 @@ class TestSpatialOverlapDedup:
        assert len(merged) == 2


+class TestRapidOcrMergeCompatibility:
+    """Check that the split/merge helpers accept RapidOCR-shaped word dicts.
+
+    A RapidOCR word carries an additional 'region_type' key that a PaddleOCR
+    word lacks; the merge logic has to cope with that extra field.
+    """
+
+    def _rapid_word(self, text, left, top, width=60, height=20, conf=80, region_type="full_page"):
+        """Build a RapidOCR-style word dict, i.e. one that includes region_type."""
+        return dict(
+            text=text,
+            left=left,
+            top=top,
+            width=width,
+            height=height,
+            conf=conf,
+            region_type=region_type,
+        )
+
+    def test_rapid_words_merge_with_tesseract(self):
+        """Two RapidOCR words overlapping two Tesseract words merge to two words."""
+        rapid = [
+            self._rapid_word(text, left, 10, width, 20, conf=conf)
+            for text, left, width, conf in (
+                ("apple", 50, 70, 90),
+                ("Apfel", 300, 60, 85),
+            )
+        ]
+        tess = [
+            _word("apple", 52, 11, 68, 19, conf=75),
+            _word("Apfel", 298, 12, 62, 18, conf=70),
+        ]
+        merged = _merge_paddle_tesseract(rapid, tess)
+        assert len(merged) == 2
+        assert sorted(w["text"] for w in merged) == ["Apfel", "apple"]
+
+    def test_rapid_words_split_then_merge(self):
+        """A multi-word RapidOCR box is split, then merged without duplicates."""
+        rapid_raw = [self._rapid_word("More than 200", 944, 287, 160, 29, conf=96)]
+        tess = [
+            _word("More", 948, 292, 60, 20, conf=90),
+            _word("than", 1017, 291, 49, 21, conf=96),
+            _word("200", 1076, 292, 43, 20, conf=93),
+        ]
+        rapid_split = _split_paddle_multi_words(rapid_raw)
+        assert len(rapid_split) == 3
+        merged = _merge_paddle_tesseract(rapid_split, tess)
+        texts = [w["text"] for w in merged]
+        # Each of the three words must appear exactly once after the merge.
+        for expected in ("More", "than", "200"):
+            assert texts.count(expected) == 1
+
+    def test_region_type_preserved_in_unmatched(self):
+        """A RapidOCR word with no Tesseract counterpart survives the merge.
+
+        Only the result count and the word's text are asserted; the
+        region_type value itself is not checked here.
+        """
+        lone_word = self._rapid_word("unique", 500, 10, 80, 20, conf=90)
+        # Tesseract contributes nothing in this scenario.
+        merged = _merge_paddle_tesseract([lone_word], [])
+        assert len(merged) == 1
+        assert merged[0]["text"] == "unique"
+
+
 class TestSplitThenMerge:
    """Test the full pipeline: split multi-word Paddle boxes, then merge."""