fix: split PaddleOCR multi-word boxes before merge

PaddleOCR returns entire phrases as single boxes (e.g. "More than 200 singers took part in the"). The merge algorithm compares the two engines' output word by word, but Paddle supplied multi-word boxes while Tesseract supplied individual words, so nothing matched and every Tesseract word was added as an "extra", causing duplicates. Paddle boxes are now split into individual words before the merge.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-13 10:39:10 +01:00
parent 41ff7671cd
commit 703e110bab
2 changed files with 172 additions and 2 deletions
@@ -1,6 +1,7 @@
 """Tests for the Kombi-Modus row-based sequence merge algorithm.

 Functions under test (ocr_pipeline_api.py):
+- _split_paddle_multi_words: Split multi-word PaddleOCR boxes into individual words
 - _group_words_into_rows: Cluster words by Y-position into rows
 - _merge_row_sequences: Merge two word sequences within the same row
 - _merge_paddle_tesseract: Full merge with row matching + sequence dedup
@@ -13,6 +14,7 @@ import os
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

 from ocr_pipeline_api import (
+    _split_paddle_multi_words,
    _group_words_into_rows,
    _merge_row_sequences,
    _merge_paddle_tesseract,
@@ -35,6 +37,65 @@ def _word(text: str, left: int, top: int, width: int = 60, height: int = 20, con
    }


+# ---------------------------------------------------------------------------
+# _split_paddle_multi_words
+# ---------------------------------------------------------------------------
+
+class TestSplitPaddleMultiWords:
+
+    def test_single_word_unchanged(self):
+        words = [_word("hello", 100, 50, 80, 20)]
+        result = _split_paddle_multi_words(words)
+        assert len(result) == 1
+        assert result[0]["text"] == "hello"
+
+    def test_multi_word_split(self):
+        """'More than 200' as one box → 3 individual words."""
+        words = [_word("More than 200", 100, 50, 300, 20)]
+        result = _split_paddle_multi_words(words)
+        assert len(result) == 3
+        assert result[0]["text"] == "More"
+        assert result[1]["text"] == "than"
+        assert result[2]["text"] == "200"
+        # All should be within the original box
+        assert result[0]["left"] >= 100
+        assert result[2]["left"] + result[2]["width"] <= 400 + 5  # allow rounding
+
+    def test_exclamation_split(self):
+        """'!Betonung' → ['!', 'Betonung']."""
+        words = [_word("!Betonung", 100, 50, 120, 20)]
+        result = _split_paddle_multi_words(words)
+        assert len(result) == 2
+        assert result[0]["text"] == "!"
+        assert result[1]["text"] == "Betonung"
+
+    def test_ipa_bracket_split(self):
+        """'badge[bxd3]' → ['badge', '[bxd3]']."""
+        words = [_word("badge[bxd3]", 100, 50, 150, 20)]
+        result = _split_paddle_multi_words(words)
+        assert len(result) == 2
+        assert result[0]["text"] == "badge"
+        assert result[1]["text"] == "[bxd3]"
+
+    def test_long_phrase(self):
+        """'More than 200 singers took part in the' → 8 words."""
+        words = [_word("More than 200 singers took part in the", 944, 287, 454, 29)]
+        result = _split_paddle_multi_words(words)
+        assert len(result) == 8
+        texts = [w["text"] for w in result]
+        assert texts == ["More", "than", "200", "singers", "took", "part", "in", "the"]
+
+    def test_empty_input(self):
+        assert _split_paddle_multi_words([]) == []
+
+    def test_preserves_top_and_height(self):
+        words = [_word("a b", 100, 50, 200, 25)]
+        result = _split_paddle_multi_words(words)
+        for w in result:
+            assert w["top"] == 50
+            assert w["height"] == 25
+
+
 # ---------------------------------------------------------------------------
 # _group_words_into_rows
 # ---------------------------------------------------------------------------
@@ -347,3 +408,56 @@ class TestMergeRealWorldRegression:
        be_word = [w for w in merged if w["text"] == "be"][0]
        take_word = [w for w in merged if w["text"] == "take"][0]
        assert abs(be_word["top"] - take_word["top"]) > 30, "Rows should stay separate"
+
+
+class TestSplitThenMerge:
+    """Test the full pipeline: split multi-word Paddle boxes, then merge."""
+
+    def test_multi_word_paddle_boxes_no_duplicates(self):
+        """PaddleOCR returns phrases as single boxes — after splitting,
+        merge should produce no duplicates."""
+        # Paddle returns multi-word boxes (real-world behavior)
+        paddle_raw = [
+            _word("take part(in) [teik'pa:t]", 185, 287, 281, 29, conf=90),
+            _word("teilnehmen (an.mitmachen", 526, 282, 329, 35, conf=93),
+            _word("More than 200 singers took part in the", 944, 287, 454, 29, conf=96),
+        ]
+        tess = [
+            _word("take", 188, 289, 52, 21, conf=96),
+            _word("part", 249, 292, 48, 24, conf=96),
+            _word("(in)", 305, 290, 38, 24, conf=93),
+            _word("[teık", 352, 292, 47, 21, conf=90),
+            _word("'pa:t]", 407, 292, 55, 23, conf=89),
+            _word("teilnehmen", 534, 290, 127, 21, conf=95),
+            _word("(an),", 671, 291, 48, 23, conf=96),
+            _word("mitmachen", 730, 290, 123, 22, conf=96),
+            _word("More", 948, 292, 60, 20, conf=90),
+            _word("than", 1017, 291, 49, 21, conf=96),
+            _word("200", 1076, 292, 43, 20, conf=93),
+            _word("singers", 1128, 293, 75, 26, conf=93),
+            _word("took", 1212, 291, 55, 22, conf=96),
+            _word("part", 1276, 294, 47, 25, conf=96),
+            _word("in", 1332, 292, 20, 20, conf=95),
+            _word("the", 1361, 292, 36, 21, conf=95),
+        ]
+
+        # Split paddle multi-word boxes first
+        paddle_split = _split_paddle_multi_words(paddle_raw)
+        assert len(paddle_split) > len(paddle_raw), "Should have more words after split"
+
+        # Merge
+        merged = _merge_paddle_tesseract(paddle_split, tess)
+
+        # Check no near-duplicates
+        for i, w1 in enumerate(merged):
+            for j in range(i + 1, len(merged)):
+                w2 = merged[j]
+                if w1["text"].lower() == w2["text"].lower():
+                    cx1 = w1["left"] + w1.get("width", 0) / 2
+                    cx2 = w2["left"] + w2.get("width", 0) / 2
+                    cy1 = w1["top"] + w1.get("height", 0) / 2
+                    cy2 = w2["top"] + w2.get("height", 0) / 2
+                    assert abs(cx1 - cx2) >= 30 or abs(cy1 - cy2) >= 15, (
+                        f"Near-duplicate: '{w1['text']}' at ({w1['left']},{w1['top']}) "
+                        f"vs ({w2['left']},{w2['top']})"
+                    )