fix: deduplicate overlapping OCR words and use per-word Y positions in overlay

Backend: Add a spatial overlap check to the Kombi merge (horizontal overlap of at least 50% of the narrower word's width) so that words at the same position are deduplicated even when their OCR text differs.

Frontend: Add yPct/hPct to WordPosition so that each word renders at its actual vertical position instead of all words collapsing to the cell's center Y.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-13 20:27:08 +01:00
parent 703e110bab
commit d6f51e4418
5 changed files with 70 additions and 4 deletions
@@ -410,6 +410,45 @@ class TestMergeRealWorldRegression:
        assert abs(be_word["top"] - take_word["top"]) > 30, "Rows should stay separate"


+class TestSpatialOverlapDedup:
+    """Verify that _merge_row_sequences drops overlapping duplicates even if the text differs."""
+
+    def test_same_position_different_text_deduplicated(self):
+        """Near-identical boxes whose text differs ("hello" vs. "helo") merge into one word.
+        The single surviving entry is expected to carry the text "hello"."""
+        paddle = [_word("hello", 100, 50, 80, 20, conf=90)]
+        tess = [_word("helo", 102, 52, 76, 18, conf=70)]
+        merged = _merge_row_sequences(paddle, tess)
+        assert len(merged) == 1, (
+            f"Expected 1 word (deduped by overlap), got {len(merged)}: "
+            f"{[w['text'] for w in merged]}"
+        )
+        # The paddle word (conf=90) is expected to win over the tess word (conf=70)
+        assert merged[0]["text"] == "hello"
+
+    def test_same_position_single_char_deduplicated(self):
+        """Very short words ("a" vs. "a!") at almost the same position merge into one entry."""
+        paddle = [_word("a", 100, 50, 20, 20, conf=90)]
+        tess = [_word("a!", 101, 51, 22, 19, conf=60)]
+        merged = _merge_row_sequences(paddle, tess)
+        assert len(merged) == 1
+
+    def test_no_overlap_different_words_kept(self):
+        """Words whose boxes are far apart ("cat", "dog") must both survive the merge."""
+        paddle = [_word("cat", 100, 50, 50, 20, conf=90)]
+        tess = [_word("dog", 300, 50, 50, 20, conf=70)]
+        merged = _merge_row_sequences(paddle, tess)
+        assert len(merged) == 2
+
+    def test_partial_overlap_below_threshold_kept(self):
+        """Words overlapping by less than 50% are treated as distinct and both are kept."""
+        paddle = [_word("take", 100, 50, 60, 20, conf=90)]
+        tess = [_word("part", 145, 50, 60, 20, conf=70)]
+        merged = _merge_row_sequences(paddle, tess)
+        # Spans 100-160 and 145-205: 15px overlap / 60px min width = 25% < 50% → kept separate
+        assert len(merged) == 2
+
+
 class TestSplitThenMerge:
    """Test the full pipeline: split multi-word Paddle boxes, then merge."""