feat: add Kombi-Vergleich mode for side-by-side Paddle vs RapidOCR comparison

Add a /rapid-kombi backend endpoint that uses a local RapidOCR + Tesseract merge, a KombiCompareStep component for parallel execution and side-by-side overlay, and a wordResultOverride prop on OverlayReconstruction for direct data injection.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-14 07:59:06 +01:00
parent c2c082d4b4
commit a994ddee83
6 changed files with 504 additions and 35 deletions
@@ -449,6 +449,67 @@ class TestSpatialOverlapDedup:
        assert len(merged) == 2


+class TestRapidOcrMergeCompatibility:
+    """Test that _merge_paddle_tesseract works with RapidOCR word format.
+
+    RapidOCR words include an extra 'region_type' key that PaddleOCR words
+    don't have. The merge logic must tolerate this extra field.
+    """
+
+    def _rapid_word(self, text, left, top, width=60, height=20, conf=80, region_type="full_page"):
+        """Create a word dict in RapidOCR format (has region_type)."""
+        return {
+            "text": text,
+            "left": left,
+            "top": top,
+            "width": width,
+            "height": height,
+            "conf": conf,
+            "region_type": region_type,  # the only key beyond the text/box/conf fields built above
+        }
+
+    def test_rapid_words_merge_with_tesseract(self):
+        """RapidOCR words (with region_type) merge correctly with Tesseract words."""
+        rapid = [  # two boxes with the same top (10) but different left (50 vs 300)
+            self._rapid_word("apple", 50, 10, 70, 20, conf=90),
+            self._rapid_word("Apfel", 300, 10, 60, 20, conf=85),
+        ]
+        tess = [  # _word is defined outside this hunk; presumably (text, left, top, width, height) — verify
+            _word("apple", 52, 11, 68, 19, conf=75),
+            _word("Apfel", 298, 12, 62, 18, conf=70),
+        ]
+        merged = _merge_paddle_tesseract(rapid, tess)
+        assert len(merged) == 2  # 2 RapidOCR + 2 Tesseract inputs are expected to collapse to 2 words
+        texts = sorted(w["text"] for w in merged)  # sorted so the check does not depend on output order
+        assert texts == ["Apfel", "apple"]
+
+    def test_rapid_words_split_then_merge(self):
+        """Split + merge works with RapidOCR multi-word boxes."""
+        rapid_raw = [  # a single box whose text holds three space-separated words
+            self._rapid_word("More than 200", 944, 287, 160, 29, conf=96),
+        ]
+        tess = [  # one Tesseract entry per word of the RapidOCR text above
+            _word("More", 948, 292, 60, 20, conf=90),
+            _word("than", 1017, 291, 49, 21, conf=96),
+            _word("200", 1076, 292, 43, 20, conf=93),
+        ]
+        rapid_split = _split_paddle_multi_words(rapid_raw)  # split runs first, merge second
+        assert len(rapid_split) == 3  # expects one entry per word in "More than 200"
+        merged = _merge_paddle_tesseract(rapid_split, tess)
+        texts = [w["text"] for w in merged]
+        assert texts.count("More") == 1  # each word must appear exactly once after the merge
+        assert texts.count("than") == 1
+        assert texts.count("200") == 1
+
+    def test_region_type_preserved_in_unmatched(self):
+        """Unmatched RapidOCR words keep their region_type field."""
+        rapid = [self._rapid_word("unique", 500, 10, 80, 20, conf=90)]
+        tess = []  # No Tesseract words
+        merged = _merge_paddle_tesseract(rapid, tess)
+        assert len(merged) == 1  # the lone RapidOCR word must survive with nothing to match against
+        assert merged[0]["text"] == "unique"  # NOTE(review): docstring promises region_type is kept, but only count and text are asserted — confirm whether a region_type assertion is intended
+
+
 class TestSplitThenMerge:
    """Test the full pipeline: split multi-word Paddle boxes, then merge."""