fix: add _deduplicate_words safety net to Kombi merge
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 32s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 2m5s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 19s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 32s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 2m5s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 19s
Even after multi-criteria matching, near-duplicate words can slip through (same text, centers within 30px horizontal / 15px vertical). The new _deduplicate_words() removes these, keeping the higher-confidence copy. Regression test with real session data (row 2 with 145 near-dupes) confirms no duplicates remain after merge + deduplication. Tests: 37 → 45 (added TestDeduplicateWords, TestMergeRealWorldRegression). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -2729,7 +2729,45 @@ def _merge_paddle_tesseract(paddle_words: list, tess_words: list) -> list:
|
||||
if ti not in used_tess and tw.get("conf", 0) >= 40:
|
||||
merged.append(tw)
|
||||
|
||||
return merged
|
||||
# Safety net: deduplicate any remaining near-duplicate words
|
||||
return _deduplicate_words(merged)
|
||||
|
||||
|
||||
def _deduplicate_words(words: list) -> list:
|
||||
"""Remove near-duplicate words that slipped through matching.
|
||||
|
||||
Two words are considered duplicates if:
|
||||
- Same text (case-insensitive)
|
||||
- Centers within 30px horizontally and 15px vertically
|
||||
The word with higher confidence is kept.
|
||||
"""
|
||||
if len(words) <= 1:
|
||||
return words
|
||||
keep = [True] * len(words)
|
||||
for i in range(len(words)):
|
||||
if not keep[i]:
|
||||
continue
|
||||
w1 = words[i]
|
||||
cx1 = w1["left"] + w1.get("width", 0) / 2
|
||||
cy1 = w1["top"] + w1.get("height", 0) / 2
|
||||
t1 = w1.get("text", "").lower().strip()
|
||||
for j in range(i + 1, len(words)):
|
||||
if not keep[j]:
|
||||
continue
|
||||
w2 = words[j]
|
||||
t2 = w2.get("text", "").lower().strip()
|
||||
if t1 != t2:
|
||||
continue
|
||||
cx2 = w2["left"] + w2.get("width", 0) / 2
|
||||
cy2 = w2["top"] + w2.get("height", 0) / 2
|
||||
if abs(cx1 - cx2) < 30 and abs(cy1 - cy2) < 15:
|
||||
# Drop the one with lower confidence
|
||||
if w1.get("conf", 0) >= w2.get("conf", 0):
|
||||
keep[j] = False
|
||||
else:
|
||||
keep[i] = False
|
||||
break # w1 is dropped, stop comparing
|
||||
return [w for w, k in zip(words, keep) if k]
|
||||
|
||||
|
||||
@router.post("/sessions/{session_id}/paddle-kombi")
|
||||
|
||||
@@ -19,6 +19,7 @@ from ocr_pipeline_api import (
|
||||
_box_center_dist,
|
||||
_text_similarity,
|
||||
_words_match,
|
||||
_deduplicate_words,
|
||||
_merge_paddle_tesseract,
|
||||
)
|
||||
|
||||
@@ -332,3 +333,134 @@ class TestMergePaddleTesseractBulletPoints:
|
||||
assert "•" in texts
|
||||
assert "word" in texts
|
||||
assert len(merged) == 3
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _deduplicate_words
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestDeduplicateWords:
    """Unit tests for the _deduplicate_words safety net."""

    def test_no_duplicates(self):
        """Distinct words at distinct positions all survive."""
        sample = [_word("a", 10, 10), _word("b", 200, 10), _word("c", 10, 100)]
        assert len(_deduplicate_words(sample)) == 3

    def test_exact_duplicate_removed(self):
        """Two copies of one word at (almost) the same spot collapse to one."""
        sample = [
            _word("take", 185, 287, 47, 29, conf=90),
            _word("take", 188, 289, 52, 21, conf=96),
        ]
        survivors = _deduplicate_words(sample)
        assert len(survivors) == 1
        # The higher-confidence copy is the one that survives.
        assert survivors[0]["conf"] == 96

    def test_same_text_far_apart_kept(self):
        """A word legitimately repeated elsewhere on the page is not a dupe."""
        sample = [
            _word("the", 100, 10),
            _word("the", 500, 10),
        ]
        assert len(_deduplicate_words(sample)) == 2

    def test_different_text_same_position_kept(self):
        """Overlapping boxes with different text are both kept."""
        sample = [
            _word("apple", 100, 50),
            _word("Apfel", 105, 52),
        ]
        assert len(_deduplicate_words(sample)) == 2

    def test_empty_list(self):
        assert _deduplicate_words([]) == []

    def test_single_word(self):
        assert len(_deduplicate_words([_word("hello", 10, 10)])) == 1

    def test_real_world_near_duplicates(self):
        """Paddle (height=29) and Tesseract (height=21) copies of the same row."""
        sample = [
            _word("take", 185, 287, 47, 29, conf=90),
            _word("part", 249, 292, 48, 24, conf=96),
            _word("More", 944, 287, 50, 29, conf=96),
            _word("than", 1003, 287, 50, 29, conf=96),
            # The same four words again, as the other engine saw them.
            _word("take", 188, 289, 52, 21, conf=96),
            _word("part", 249, 294, 47, 25, conf=96),
            _word("More", 948, 292, 60, 20, conf=90),
            _word("than", 1017, 291, 49, 21, conf=96),
        ]
        survivors = _deduplicate_words(sample)
        # Each word should appear exactly once after deduplication.
        assert len(survivors) == 4
        assert sorted(w["text"] for w in survivors) == ["More", "part", "take", "than"]
|
||||
|
||||
|
||||
class TestMergeRealWorldRegression:
    """Regression test with actual data from the doubled-words bug."""

    def test_row2_no_duplicates(self):
        """Reproduce the row-2 bug: both engines return the same words at
        slightly different positions. Merge should produce no duplicates."""
        paddle = [
            _word("teilnehmen", 526, 282, 140, 35, conf=93),
            _word("take", 185, 287, 47, 29, conf=90),
            _word("part(in)", 238, 287, 94, 29, conf=90),
            _word("More", 944, 287, 50, 29, conf=96),
            _word("than", 1003, 287, 50, 29, conf=96),
            _word("200", 1063, 287, 38, 29, conf=96),
            _word("singers", 1110, 287, 88, 29, conf=96),
            _word("took", 1207, 287, 50, 29, conf=96),
            _word("part", 1266, 287, 50, 29, conf=96),
            _word("in", 1326, 287, 25, 29, conf=96),
            _word("the", 1360, 287, 38, 29, conf=96),
        ]
        tess = [
            _word("take", 188, 289, 52, 21, conf=96),
            _word("part", 249, 292, 48, 24, conf=96),
            _word("(in)", 305, 290, 38, 24, conf=93),
            _word("teilnehmen", 534, 290, 127, 21, conf=95),
            _word("(an),", 671, 291, 48, 23, conf=96),
            _word("mitmachen", 730, 290, 123, 22, conf=96),
            _word("More", 948, 292, 60, 20, conf=90),
            _word("than", 1017, 291, 49, 21, conf=96),
            _word("200", 1076, 292, 43, 20, conf=93),
            _word("singers", 1128, 293, 75, 26, conf=93),
            _word("took", 1212, 291, 55, 22, conf=96),
            _word("part", 1276, 294, 47, 25, conf=96),
            _word("in", 1332, 292, 20, 20, conf=95),
            _word("the", 1361, 292, 36, 21, conf=95),
            # Tesseract-only: phonetic transcriptions
            _word("[teık", 352, 292, 47, 21, conf=90),
            _word("'pa:t]", 407, 292, 55, 23, conf=89),
        ]

        merged = _merge_paddle_tesseract(paddle, tess)

        # No pair of merged words may be a near-duplicate (same text,
        # centers within 30px horizontally AND 15px vertically).
        for idx, w1 in enumerate(merged):
            for w2 in merged[idx + 1:]:
                if w1["text"].lower() != w2["text"].lower():
                    continue
                cx1 = w1["left"] + w1.get("width", 0) / 2
                cy1 = w1["top"] + w1.get("height", 0) / 2
                cx2 = w2["left"] + w2.get("width", 0) / 2
                cy2 = w2["top"] + w2.get("height", 0) / 2
                assert abs(cx1 - cx2) >= 30 or abs(cy1 - cy2) >= 15, (
                    f"Near-duplicate found: '{w1['text']}' at ({w1['left']},{w1['top']}) "
                    f"vs ({w2['left']},{w2['top']})"
                )

        # Tesseract-only words must survive the merge.
        texts = [w["text"] for w in merged]
        assert "(in)" in texts  # Tesseract split "part(in)" differently
        assert "(an)," in texts
        assert "mitmachen" in texts
        assert "[teık" in texts  # phonetic from Tesseract
        assert "'pa:t]" in texts
|
||||
|
||||
Reference in New Issue
Block a user