Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 32s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 2m5s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 19s
Even after multi-criteria matching, near-duplicate words can slip through (same text, centers within 30px horizontal / 15px vertical). The new _deduplicate_words() removes these, keeping the higher-confidence copy. Regression test with real session data (row 2 with 145 near-dupes) confirms no duplicates remain after merge + deduplication. Tests: 37 → 45 (added TestDeduplicateWords, TestMergeRealWorldRegression). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
467 lines
17 KiB
Python
467 lines
17 KiB
Python
"""Tests for the Kombi-Modus merge algorithm.
|
||
|
||
Functions under test (ocr_pipeline_api.py):
|
||
- _box_iou: IoU between two word boxes
|
||
- _box_center_dist: Euclidean distance between box centers
|
||
- _text_similarity: Simple text similarity (0-1)
|
||
- _words_match: Multi-criteria match (IoU + center + text)
|
||
- _merge_paddle_tesseract: Merge PaddleOCR + Tesseract word lists
|
||
"""
|
||
|
||
import pytest
|
||
import sys
|
||
import os
|
||
|
||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
||
|
||
from ocr_pipeline_api import (
|
||
_box_iou,
|
||
_box_center_dist,
|
||
_text_similarity,
|
||
_words_match,
|
||
_deduplicate_words,
|
||
_merge_paddle_tesseract,
|
||
)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Helpers
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def _word(text: str, left: int, top: int, width: int = 60, height: int = 20, conf: int = 80):
|
||
"""Create a synthetic word dict."""
|
||
return {
|
||
"text": text,
|
||
"left": left,
|
||
"top": top,
|
||
"width": width,
|
||
"height": height,
|
||
"conf": conf,
|
||
}
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# _box_iou
# ---------------------------------------------------------------------------


class TestBoxIoU:
    """Intersection-over-union between two word boxes."""

    def test_identical_boxes(self):
        box = _word("hello", 10, 10, 100, 20)
        assert _box_iou(box, box) == pytest.approx(1.0)

    def test_no_overlap(self):
        upper_left = _word("a", 0, 0, 50, 20)
        lower_right = _word("b", 200, 200, 50, 20)
        assert _box_iou(upper_left, lower_right) == 0.0

    def test_partial_overlap(self):
        # Two 100x20 boxes shifted 50px: intersection 50*20=1000, union 3000.
        first = _word("a", 0, 0, 100, 20)
        second = _word("b", 50, 0, 100, 20)
        assert _box_iou(first, second) == pytest.approx(1000 / 3000, abs=0.01)

    def test_contained_box(self):
        # Small box fully inside big one: IoU = small area / big area.
        outer = _word("big", 0, 0, 200, 40)
        inner = _word("small", 50, 10, 30, 10)
        assert _box_iou(outer, inner) == pytest.approx(300 / 8000, abs=0.01)

    def test_touching_edges(self):
        # Boxes that share an edge have zero intersection area.
        left_box = _word("a", 0, 0, 50, 20)
        right_box = _word("b", 50, 0, 50, 20)
        assert _box_iou(left_box, right_box) == 0.0

    def test_zero_area_box(self):
        degenerate = _word("a", 10, 10, 0, 0)
        normal = _word("b", 10, 10, 50, 20)
        assert _box_iou(degenerate, normal) == 0.0
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# _box_center_dist
# ---------------------------------------------------------------------------


class TestBoxCenterDist:
    """Euclidean distance between box centers."""

    def test_same_center(self):
        box = _word("a", 100, 50, 60, 20)
        assert _box_center_dist(box, box) == 0.0

    def test_horizontal_offset(self):
        # Same size, shifted 10px right -> centers exactly 10px apart.
        left_box = _word("a", 100, 50, 60, 20)
        right_box = _word("b", 110, 50, 60, 20)
        assert _box_center_dist(left_box, right_box) == pytest.approx(10.0)

    def test_diagonal(self):
        upper = _word("a", 0, 0, 20, 20)  # center (10, 10)
        lower = _word("b", 20, 20, 20, 20)  # center (30, 30)
        expected = (20**2 + 20**2) ** 0.5
        assert _box_center_dist(upper, lower) == pytest.approx(expected, abs=0.1)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# _text_similarity
# ---------------------------------------------------------------------------


class TestTextSimilarity:
    """Simple 0-1 similarity score between two word texts."""

    def test_identical(self):
        assert _text_similarity("hello", "hello") == 1.0

    def test_case_insensitive(self):
        # Comparison ignores case.
        assert _text_similarity("Hello", "hello") == 1.0

    def test_substring(self):
        """One string contained in the other (e.g. '!Betonung' vs 'Betonung')."""
        assert _text_similarity("!Betonung", "Betonung") == 0.8

    def test_completely_different(self):
        assert _text_similarity("abc", "xyz") == 0.0

    def test_empty_strings(self):
        # An empty side always scores zero, including empty vs empty.
        for lhs, rhs in [("", "hello"), ("", "")]:
            assert _text_similarity(lhs, rhs) == 0.0

    def test_partial_overlap(self):
        """Strings sharing some characters score strictly between 0 and 1."""
        score = _text_similarity("apple", "ape")
        assert 0.0 < score < 1.0
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# _words_match
# ---------------------------------------------------------------------------


class TestWordsMatch:
    """Multi-criteria word matching (IoU + center proximity + text)."""

    def test_high_iou_matches(self):
        """IoU > 0.15 is sufficient for a match."""
        first = _word("hello", 100, 50, 80, 20)
        second = _word("hello", 105, 50, 80, 20)
        assert _words_match(first, second) is True

    def test_same_text_same_row_matches(self):
        """Same text on same row matches even with low IoU."""
        first = _word("Betonung", 100, 50, 80, 20)
        # Shifted horizontally but still on the same text row.
        second = _word("Betonung", 130, 52, 70, 18)
        assert _words_match(first, second) is True

    def test_close_centers_same_row_matches(self):
        """Nearby centers on same row match."""
        first = _word("x", 100, 50, 40, 20)
        second = _word("y", 110, 52, 50, 22)  # close, same row
        assert _words_match(first, second) is True

    def test_different_rows_no_match(self):
        """Words on different rows don't match even with same text."""
        first = _word("hello", 100, 50, 80, 20)
        second = _word("hello", 100, 200, 80, 20)  # far away vertically
        assert _words_match(first, second) is False

    def test_far_apart_same_row_different_text(self):
        """Different text far apart on same row: no match."""
        first = _word("cat", 10, 50, 40, 20)
        second = _word("dog", 400, 50, 40, 20)
        assert _words_match(first, second) is False

    def test_no_overlap_no_proximity_no_text(self):
        """Completely different words far apart: no match."""
        first = _word("abc", 0, 0, 50, 20)
        second = _word("xyz", 500, 500, 50, 20)
        assert _words_match(first, second) is False
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# _merge_paddle_tesseract
# ---------------------------------------------------------------------------


class TestMergePaddleTesseract:
    """End-to-end behaviour of the PaddleOCR + Tesseract word-list merge."""

    def test_perfect_match_averages_coords(self):
        """Same word at same position: coordinates averaged by confidence."""
        paddle = [_word("hello", 100, 50, 80, 20, conf=90)]
        tess = [_word("hello", 110, 55, 70, 18, conf=60)]
        result = _merge_paddle_tesseract(paddle, tess)
        assert len(result) == 1
        merged_word = result[0]
        assert merged_word["text"] == "hello"
        # Confidence-weighted mean: (100*90 + 110*60) / 150 == 104.
        assert merged_word["left"] == 104
        assert merged_word["conf"] == 90

    def test_same_word_slightly_offset_merges(self):
        """Same word with slight offset still merges (center proximity)."""
        paddle = [_word("Betonung", 100, 50, 90, 22, conf=85)]
        tess = [_word("Betonung", 115, 52, 80, 20, conf=60)]
        result = _merge_paddle_tesseract(paddle, tess)
        assert len(result) == 1
        assert result[0]["text"] == "Betonung"

    def test_truly_different_words_kept_separate(self):
        """Non-overlapping different words: both kept."""
        paddle = [_word("hello", 10, 10)]
        tess = [_word("bullet", 500, 500, conf=50)]
        result = _merge_paddle_tesseract(paddle, tess)
        assert len(result) == 2
        assert {entry["text"] for entry in result} == {"hello", "bullet"}

    def test_low_conf_tesseract_dropped(self):
        """Unmatched Tesseract words with conf < 40 are dropped."""
        paddle = [_word("hello", 10, 10)]
        tess = [_word("noise", 500, 500, conf=20)]
        assert len(_merge_paddle_tesseract(paddle, tess)) == 1

    def test_empty_paddle(self):
        # Only the confident Tesseract word survives; conf=10 is dropped.
        tess = [_word("bullet", 10, 10, conf=80), _word("noise", 200, 200, conf=10)]
        result = _merge_paddle_tesseract([], tess)
        assert len(result) == 1
        assert result[0]["text"] == "bullet"

    def test_empty_tesseract(self):
        paddle = [_word("a", 10, 10), _word("b", 200, 10)]
        assert len(_merge_paddle_tesseract(paddle, [])) == 2

    def test_both_empty(self):
        assert _merge_paddle_tesseract([], []) == []

    def test_one_to_one_matching(self):
        """Each Tesseract word matches at most one Paddle word."""
        paddle = [
            _word("cat", 10, 10, 60, 20, conf=80),
            _word("dog", 200, 10, 60, 20, conf=80),
        ]
        tess = [_word("cat", 15, 12, 55, 18, conf=70)]
        result = _merge_paddle_tesseract(paddle, tess)
        # cat (merged) + dog (unmatched paddle)
        assert len(result) == 2

    def test_far_apart_different_text_not_merged(self):
        """Different words far apart stay separate."""
        paddle = [_word("hello", 0, 0, 100, 20, conf=80)]
        tess = [_word("world", 500, 300, 100, 20, conf=70)]
        assert len(_merge_paddle_tesseract(paddle, tess)) == 2

    def test_paddle_text_preferred(self):
        """Merged word uses Paddle's text."""
        paddle = [_word("Betonung", 100, 50, 80, 20, conf=85)]
        tess = [_word("Betonung!", 100, 50, 80, 20, conf=60)]
        result = _merge_paddle_tesseract(paddle, tess)
        assert len(result) == 1
        assert result[0]["text"] == "Betonung"

    def test_confidence_weighted_positions(self):
        """Equal confidence → simple average of coordinates."""
        paddle = [_word("x", 100, 200, 60, 20, conf=50)]
        tess = [_word("x", 110, 200, 60, 20, conf=50)]
        result = _merge_paddle_tesseract(paddle, tess)
        assert len(result) == 1
        merged_word = result[0]
        assert merged_word["left"] == 105
        assert merged_word["top"] == 200

    def test_zero_confidence_no_division_error(self):
        """Words with conf=0 don't cause division by zero."""
        paddle = [_word("a", 100, 50, 80, 20, conf=0)]
        tess = [_word("a", 100, 50, 80, 20, conf=0)]
        assert len(_merge_paddle_tesseract(paddle, tess)) == 1

    def test_duplicate_words_same_position_deduplicated(self):
        """The core bug fix: same word at same position from both engines
        should appear only once, not doubled."""
        # Simulate typical case: both engines find same words.
        paddle = [
            _word("apple", 50, 10, 70, 20, conf=90),
            _word("Apfel", 300, 10, 60, 20, conf=85),
            _word("dog", 50, 50, 50, 20, conf=88),
            _word("Hund", 300, 50, 60, 20, conf=82),
        ]
        tess = [
            _word("apple", 52, 11, 68, 19, conf=75),
            _word("Apfel", 298, 12, 62, 18, conf=70),
            _word("dog", 48, 49, 52, 21, conf=72),
            _word("Hund", 302, 51, 58, 19, conf=68),
        ]
        result = _merge_paddle_tesseract(paddle, tess)
        # Each word should appear exactly once.
        assert len(result) == 4
        assert sorted(entry["text"] for entry in result) == ["Apfel", "Hund", "apple", "dog"]
|
||
|
||
|
||
class TestMergePaddleTesseractBulletPoints:
    """Tesseract catches bullet points / symbols that PaddleOCR misses."""

    def test_bullet_added_from_tesseract(self):
        """Bullet character from Tesseract is added."""
        paddle = [_word("Betonung", 60, 10, 80, 20)]
        tess = [
            _word("•", 10, 10, 15, 15, conf=65),
            _word("Betonung", 60, 10, 80, 20, conf=50),
        ]
        result = _merge_paddle_tesseract(paddle, tess)
        found = [entry["text"] for entry in result]
        assert "•" in found
        assert "Betonung" in found
        assert len(result) == 2

    def test_exclamation_added_from_tesseract(self):
        """Exclamation mark from Tesseract is added."""
        paddle = [_word("important", 60, 10, 100, 20)]
        tess = [
            _word("!", 40, 10, 12, 20, conf=70),
            _word("important", 60, 10, 100, 20, conf=55),
        ]
        result = _merge_paddle_tesseract(paddle, tess)
        assert "!" in [entry["text"] for entry in result]
        assert len(result) == 2

    def test_multiple_unique_tesseract_symbols(self):
        """Multiple symbols only found by Tesseract are all added."""
        paddle = [_word("word", 100, 10, 60, 20)]
        tess = [
            _word("!", 20, 10, 10, 20, conf=70),
            _word("•", 40, 10, 10, 15, conf=65),
            _word("word", 100, 10, 60, 20, conf=50),
        ]
        result = _merge_paddle_tesseract(paddle, tess)
        found = [entry["text"] for entry in result]
        for symbol in ("!", "•", "word"):
            assert symbol in found
        assert len(result) == 3
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# _deduplicate_words
# ---------------------------------------------------------------------------


class TestDeduplicateWords:
    """Post-merge removal of near-duplicate words."""

    def test_no_duplicates(self):
        """Different words at different positions: all kept."""
        words = [_word("a", 10, 10), _word("b", 200, 10), _word("c", 10, 100)]
        assert len(_deduplicate_words(words)) == 3

    def test_exact_duplicate_removed(self):
        """Same text at same position: only one kept."""
        near_dupes = [
            _word("take", 185, 287, 47, 29, conf=90),
            _word("take", 188, 289, 52, 21, conf=96),
        ]
        result = _deduplicate_words(near_dupes)
        assert len(result) == 1
        # The higher-confidence copy survives.
        assert result[0]["conf"] == 96

    def test_same_text_far_apart_kept(self):
        """Same word at very different positions (e.g. repeated in text): both kept."""
        repeated = [_word("the", 100, 10), _word("the", 500, 10)]
        assert len(_deduplicate_words(repeated)) == 2

    def test_different_text_same_position_kept(self):
        """Different words at same position: both kept (not duplicates)."""
        overlapping = [_word("apple", 100, 50), _word("Apfel", 105, 52)]
        assert len(_deduplicate_words(overlapping)) == 2

    def test_empty_list(self):
        assert _deduplicate_words([]) == []

    def test_single_word(self):
        assert len(_deduplicate_words([_word("hello", 10, 10)])) == 1

    def test_real_world_near_duplicates(self):
        """Simulate real-world: Paddle (height=29) + Tesseract (height=21) near-dupes."""
        words = [
            _word("take", 185, 287, 47, 29, conf=90),
            _word("part", 249, 292, 48, 24, conf=96),
            _word("More", 944, 287, 50, 29, conf=96),
            _word("than", 1003, 287, 50, 29, conf=96),
            # Near-dupes from the other engine.
            _word("take", 188, 289, 52, 21, conf=96),
            _word("part", 249, 294, 47, 25, conf=96),
            _word("More", 948, 292, 60, 20, conf=90),
            _word("than", 1017, 291, 49, 21, conf=96),
        ]
        result = _deduplicate_words(words)
        # Each word should appear only once.
        assert len(result) == 4
        assert sorted(entry["text"] for entry in result) == ["More", "part", "take", "than"]
|
||
|
||
|
||
class TestMergeRealWorldRegression:
    """Regression test with actual data from the doubled-words bug."""

    def test_row2_no_duplicates(self):
        """Reproduce the row-2 bug: both engines return the same words at
        slightly different positions. Merge should produce no duplicates."""
        paddle = [
            _word("teilnehmen", 526, 282, 140, 35, conf=93),
            _word("take", 185, 287, 47, 29, conf=90),
            _word("part(in)", 238, 287, 94, 29, conf=90),
            _word("More", 944, 287, 50, 29, conf=96),
            _word("than", 1003, 287, 50, 29, conf=96),
            _word("200", 1063, 287, 38, 29, conf=96),
            _word("singers", 1110, 287, 88, 29, conf=96),
            _word("took", 1207, 287, 50, 29, conf=96),
            _word("part", 1266, 287, 50, 29, conf=96),
            _word("in", 1326, 287, 25, 29, conf=96),
            _word("the", 1360, 287, 38, 29, conf=96),
        ]
        tess = [
            _word("take", 188, 289, 52, 21, conf=96),
            _word("part", 249, 292, 48, 24, conf=96),
            _word("(in)", 305, 290, 38, 24, conf=93),
            _word("teilnehmen", 534, 290, 127, 21, conf=95),
            _word("(an),", 671, 291, 48, 23, conf=96),
            _word("mitmachen", 730, 290, 123, 22, conf=96),
            _word("More", 948, 292, 60, 20, conf=90),
            _word("than", 1017, 291, 49, 21, conf=96),
            _word("200", 1076, 292, 43, 20, conf=93),
            _word("singers", 1128, 293, 75, 26, conf=93),
            _word("took", 1212, 291, 55, 22, conf=96),
            _word("part", 1276, 294, 47, 25, conf=96),
            _word("in", 1332, 292, 20, 20, conf=95),
            _word("the", 1361, 292, 36, 21, conf=95),
            # Tesseract-only: phonetic transcriptions.
            _word("[teık", 352, 292, 47, 21, conf=90),
            _word("'pa:t]", 407, 292, 55, 23, conf=89),
        ]
        merged = _merge_paddle_tesseract(paddle, tess)

        def center(word):
            # Box center; width/height default to 0 if missing.
            return (
                word["left"] + word.get("width", 0) / 2,
                word["top"] + word.get("height", 0) / 2,
            )

        # Check no near-duplicates remain: same text with centers closer than
        # 30px horizontally AND 15px vertically would be a duplicate.
        for i in range(len(merged)):
            for j in range(i + 1, len(merged)):
                w1, w2 = merged[i], merged[j]
                if w1["text"].lower() != w2["text"].lower():
                    continue
                cx1, cy1 = center(w1)
                cx2, cy2 = center(w2)
                assert abs(cx1 - cx2) >= 30 or abs(cy1 - cy2) >= 15, (
                    f"Near-duplicate found: '{w1['text']}' at ({w1['left']},{w1['top']}) "
                    f"vs ({w2['left']},{w2['top']})"
                )

        # Tesseract-only words should be present.
        texts = [entry["text"] for entry in merged]
        assert "(in)" in texts  # Tesseract split "part(in)" differently
        assert "(an)," in texts
        assert "mitmachen" in texts
        assert "[teık" in texts  # phonetic from Tesseract
        assert "'pa:t]" in texts