fix: add _deduplicate_words safety net to Kombi merge
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 32s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 2m5s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 19s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 32s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 2m5s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 19s
Even after multi-criteria matching, near-duplicate words can slip through (same text, centers within 30px horizontal / 15px vertical). The new _deduplicate_words() removes these, keeping the higher-confidence copy. Regression test with real session data (row 2 with 145 near-dupes) confirms no duplicates remain after merge + deduplication. Tests: 37 → 45 (added TestDeduplicateWords, TestMergeRealWorldRegression). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -19,6 +19,7 @@ from ocr_pipeline_api import (
|
||||
_box_center_dist,
|
||||
_text_similarity,
|
||||
_words_match,
|
||||
_deduplicate_words,
|
||||
_merge_paddle_tesseract,
|
||||
)
|
||||
|
||||
@@ -332,3 +333,134 @@ class TestMergePaddleTesseractBulletPoints:
|
||||
assert "•" in texts
|
||||
assert "word" in texts
|
||||
assert len(merged) == 3
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
# _deduplicate_words
# ---------------------------------------------------------------------------


class TestDeduplicateWords:
    """Tests for _deduplicate_words(): removes near-duplicate word entries
    (same text, centers within the horizontal/vertical tolerance), keeping
    the higher-confidence copy."""

    def test_no_duplicates(self):
        """Distinct words at distinct positions survive untouched."""
        candidates = [_word("a", 10, 10), _word("b", 200, 10), _word("c", 10, 100)]
        assert len(_deduplicate_words(candidates)) == 3

    def test_exact_duplicate_removed(self):
        """Identical text at (nearly) the same spot collapses to one entry."""
        candidates = [
            _word("take", 185, 287, 47, 29, conf=90),
            _word("take", 188, 289, 52, 21, conf=96),
        ]
        deduped = _deduplicate_words(candidates)
        assert len(deduped) == 1
        # The higher-confidence copy must be the survivor.
        assert deduped[0]["conf"] == 96

    def test_same_text_far_apart_kept(self):
        """The same word repeated elsewhere in the line is not a duplicate."""
        candidates = [
            _word("the", 100, 10),
            _word("the", 500, 10),
        ]
        assert len(_deduplicate_words(candidates)) == 2

    def test_different_text_same_position_kept(self):
        """Overlapping boxes with different text are both kept."""
        candidates = [
            _word("apple", 100, 50),
            _word("Apfel", 105, 52),
        ]
        assert len(_deduplicate_words(candidates)) == 2

    def test_empty_list(self):
        """An empty input yields an empty output."""
        assert _deduplicate_words([]) == []

    def test_single_word(self):
        """A lone word passes through unchanged."""
        assert len(_deduplicate_words([_word("hello", 10, 10)])) == 1

    def test_real_world_near_duplicates(self):
        """Paddle (height=29) and Tesseract (height=21) emit the same four
        words at slightly shifted boxes; each must survive exactly once."""
        candidates = [
            _word("take", 185, 287, 47, 29, conf=90),
            _word("part", 249, 292, 48, 24, conf=96),
            _word("More", 944, 287, 50, 29, conf=96),
            _word("than", 1003, 287, 50, 29, conf=96),
            # near-dupes from other engine
            _word("take", 188, 289, 52, 21, conf=96),
            _word("part", 249, 294, 47, 25, conf=96),
            _word("More", 948, 292, 60, 20, conf=90),
            _word("than", 1017, 291, 49, 21, conf=96),
        ]
        deduped = _deduplicate_words(candidates)
        # Each word should appear only once.
        assert len(deduped) == 4
        assert sorted(w["text"] for w in deduped) == ["More", "part", "take", "than"]
|
||||
|
||||
|
||||
class TestMergeRealWorldRegression:
    """Regression test with actual data from the doubled-words bug."""

    def test_row2_no_duplicates(self):
        """Reproduce the row-2 bug: both engines return the same words at
        slightly different positions. Merge should produce no duplicates."""
        paddle = [
            _word("teilnehmen", 526, 282, 140, 35, conf=93),
            _word("take", 185, 287, 47, 29, conf=90),
            _word("part(in)", 238, 287, 94, 29, conf=90),
            _word("More", 944, 287, 50, 29, conf=96),
            _word("than", 1003, 287, 50, 29, conf=96),
            _word("200", 1063, 287, 38, 29, conf=96),
            _word("singers", 1110, 287, 88, 29, conf=96),
            _word("took", 1207, 287, 50, 29, conf=96),
            _word("part", 1266, 287, 50, 29, conf=96),
            _word("in", 1326, 287, 25, 29, conf=96),
            _word("the", 1360, 287, 38, 29, conf=96),
        ]
        tess = [
            _word("take", 188, 289, 52, 21, conf=96),
            _word("part", 249, 292, 48, 24, conf=96),
            _word("(in)", 305, 290, 38, 24, conf=93),
            _word("teilnehmen", 534, 290, 127, 21, conf=95),
            _word("(an),", 671, 291, 48, 23, conf=96),
            _word("mitmachen", 730, 290, 123, 22, conf=96),
            _word("More", 948, 292, 60, 20, conf=90),
            _word("than", 1017, 291, 49, 21, conf=96),
            _word("200", 1076, 292, 43, 20, conf=93),
            _word("singers", 1128, 293, 75, 26, conf=93),
            _word("took", 1212, 291, 55, 22, conf=96),
            _word("part", 1276, 294, 47, 25, conf=96),
            _word("in", 1332, 292, 20, 20, conf=95),
            _word("the", 1361, 292, 36, 21, conf=95),
            # Tesseract-only: phonetic transcriptions
            _word("[teık", 352, 292, 47, 21, conf=90),
            _word("'pa:t]", 407, 292, 55, 23, conf=89),
        ]
        merged = _merge_paddle_tesseract(paddle, tess)

        # Scan every unordered pair: no two same-text words may sit within
        # the near-duplicate tolerance (30px horizontal / 15px vertical
        # between box centers).
        for i, w1 in enumerate(merged):
            for w2 in merged[i + 1:]:
                if w1["text"].lower() != w2["text"].lower():
                    continue
                cx1 = w1["left"] + w1.get("width", 0) / 2
                cx2 = w2["left"] + w2.get("width", 0) / 2
                cy1 = w1["top"] + w1.get("height", 0) / 2
                cy2 = w2["top"] + w2.get("height", 0) / 2
                assert abs(cx1 - cx2) >= 30 or abs(cy1 - cy2) >= 15, (
                    f"Near-duplicate found: '{w1['text']}' at ({w1['left']},{w1['top']}) "
                    f"vs ({w2['left']},{w2['top']})"
                )

        # Tesseract-only words should be present
        texts = [w["text"] for w in merged]
        assert "(in)" in texts  # Tesseract split "part(in)" differently
        assert "(an)," in texts
        assert "mitmachen" in texts
        assert "[teık" in texts  # phonetic from Tesseract
        assert "'pa:t]" in texts
|
||||
|
||||
Reference in New Issue
Block a user