"""Unit tests for the Kombi-Modus merge algorithm.

Helpers under test (imported from ocr_pipeline_api.py):
- _box_iou: intersection-over-union of two word boxes
- _box_center_dist: Euclidean distance between two box centers
- _text_similarity: simple text similarity score (0-1)
- _words_match: multi-criteria match (IoU + center distance + text)
- _deduplicate_words: collapse near-duplicate words within one word list
- _merge_paddle_tesseract: merge PaddleOCR and Tesseract word lists
"""

import pytest
import sys
import os

# Make the parent directory importable so ocr_pipeline_api resolves when the
# tests are run from inside this directory.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

from ocr_pipeline_api import (
    _box_iou,
    _box_center_dist,
    _text_similarity,
    _words_match,
    _deduplicate_words,
    _merge_paddle_tesseract,
)


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------


def _word(text: str, left: int, top: int, width: int = 60, height: int = 20, conf: int = 80):
    """Build a synthetic word dict with text, box geometry and confidence."""
    return dict(
        text=text,
        left=left,
        top=top,
        width=width,
        height=height,
        conf=conf,
    )


# ---------------------------------------------------------------------------
# _box_iou
# ---------------------------------------------------------------------------


class TestBoxIoU:
    """Checks for _box_iou on overlapping, disjoint and degenerate boxes."""

    def test_identical_boxes(self):
        """A box compared with itself has an IoU of 1."""
        box = _word("hello", 10, 10, 100, 20)
        assert _box_iou(box, box) == pytest.approx(1.0)

    def test_no_overlap(self):
        """Boxes that are far apart have an IoU of exactly 0."""
        first = _word("a", 0, 0, 50, 20)
        second = _word("b", 200, 200, 50, 20)
        assert _box_iou(first, second) == 0.0

    def test_partial_overlap(self):
        """Two 100x20 boxes shifted by 50px: intersection 1000, union 3000."""
        first = _word("a", 0, 0, 100, 20)
        second = _word("b", 50, 0, 100, 20)
        expected = pytest.approx(1000 / 3000, abs=0.01)
        assert _box_iou(first, second) == expected

    def test_contained_box(self):
        """A 30x10 box inside a 200x40 box: intersection 300, union 8000."""
        outer = _word("big", 0, 0, 200, 40)
        inner = _word("small", 50, 10, 30, 10)
        expected = pytest.approx(300 / 8000, abs=0.01)
        assert _box_iou(outer, inner) == expected

    def test_touching_edges(self):
        """Boxes that only share an edge do not count as overlapping."""
        left_box = _word("a", 0, 0, 50, 20)
        right_box = _word("b", 50, 0, 50, 20)
        assert _box_iou(left_box, right_box) == 0.0

    def test_zero_area_box(self):
        """A zero-width, zero-height box yields an IoU of 0."""
        empty_box = _word("a", 10, 10, 0, 0)
        real_box = _word("b", 10, 10, 50, 20)
        assert _box_iou(empty_box, real_box) == 0.0
# ---------------------------------------------------------------------------
# _box_center_dist
# ---------------------------------------------------------------------------


class TestBoxCenterDist:
    """Checks for _box_center_dist, the distance between box centers."""

    def test_same_center(self):
        """A box compared with itself is at distance 0."""
        box = _word("a", 100, 50, 60, 20)
        assert _box_center_dist(box, box) == 0.0

    def test_horizontal_offset(self):
        """Equal-sized boxes shifted 10px horizontally are 10px apart."""
        first = _word("a", 100, 50, 60, 20)
        second = _word("b", 110, 50, 60, 20)
        assert _box_center_dist(first, second) == pytest.approx(10.0)

    def test_diagonal(self):
        """Centers (10, 10) and (30, 30) are sqrt(20^2 + 20^2) apart."""
        first = _word("a", 0, 0, 20, 20)  # center (10, 10)
        second = _word("b", 20, 20, 20, 20)  # center (30, 30)
        dx = dy = 20
        expected = (dx**2 + dy**2) ** 0.5
        assert _box_center_dist(first, second) == pytest.approx(expected, abs=0.1)


# ---------------------------------------------------------------------------
# _text_similarity
# ---------------------------------------------------------------------------


class TestTextSimilarity:
    """Checks for _text_similarity, a text similarity score in 0-1."""

    def test_identical(self):
        """Identical strings score 1.0."""
        score = _text_similarity("hello", "hello")
        assert score == 1.0

    def test_case_insensitive(self):
        """Strings differing only in letter case score 1.0."""
        score = _text_similarity("Hello", "hello")
        assert score == 1.0

    def test_substring(self):
        """One text contained in the other ('!Betonung' vs 'Betonung') scores 0.8."""
        score = _text_similarity("!Betonung", "Betonung")
        assert score == 0.8

    def test_completely_different(self):
        """Strings with no characters in common score 0.0."""
        score = _text_similarity("abc", "xyz")
        assert score == 0.0

    def test_empty_strings(self):
        """An empty string on either side (or both) scores 0.0."""
        assert _text_similarity("", "hello") == 0.0
        assert _text_similarity("", "") == 0.0

    def test_partial_overlap(self):
        """Strings sharing some characters score strictly between 0 and 1."""
        score = _text_similarity("apple", "ape")
        assert 0.0 < score < 1.0


# ---------------------------------------------------------------------------
# _words_match
# ---------------------------------------------------------------------------


class TestWordsMatch:
    """Checks for _words_match, the multi-criteria word matcher."""

    def test_high_iou_matches(self):
        """IoU > 0.15 is sufficient for a match."""
        first = _word("hello", 100, 50, 80, 20)
        second = _word("hello", 105, 50, 80, 20)
        assert _words_match(first, second) is True

    def test_same_text_same_row_matches(self):
        """Same text on the same row matches even with low IoU."""
        first = _word("Betonung", 100, 50, 80, 20)
        second = _word("Betonung", 130, 52, 70, 18)  # shifted but same row
        assert _words_match(first, second) is True

    def test_close_centers_same_row_matches(self):
        """Nearby centers on the same row match even when the text differs."""
        first = _word("x", 100, 50, 40, 20)
        second = _word("y", 110, 52, 50, 22)  # close, same row
        assert _words_match(first, second) is True

    def test_different_rows_no_match(self):
        """Words on different rows do not match even with the same text."""
        upper = _word("hello", 100, 50, 80, 20)
        lower = _word("hello", 100, 200, 80, 20)  # far away vertically
        assert _words_match(upper, lower) is False

    def test_far_apart_same_row_different_text(self):
        """Different text far apart on the same row: no match."""
        first = _word("cat", 10, 50, 40, 20)
        second = _word("dog", 400, 50, 40, 20)
        assert _words_match(first, second) is False

    def test_no_overlap_no_proximity_no_text(self):
        """Completely different words far apart: no match."""
        first = _word("abc", 0, 0, 50, 20)
        second = _word("xyz", 500, 500, 50, 20)
        assert _words_match(first, second) is False


# ---------------------------------------------------------------------------
# _merge_paddle_tesseract
# ---------------------------------------------------------------------------


class TestMergePaddleTesseract:
    """Checks for _merge_paddle_tesseract on small synthetic word lists."""

    def test_perfect_match_averages_coords(self):
        """Same word at the same position: coordinates averaged by confidence."""
        paddle_words = [_word("hello", 100, 50, 80, 20, conf=90)]
        tess_words = [_word("hello", 110, 55, 70, 18, conf=60)]

        merged = _merge_paddle_tesseract(paddle_words, tess_words)

        assert len(merged) == 1
        only = merged[0]
        assert only["text"] == "hello"
        # Confidence-weighted left edge: (100*90 + 110*60) / 150
        assert only["left"] == 104
        assert only["conf"] == 90

    def test_same_word_slightly_offset_merges(self):
        """Same word with a slight offset still merges (center proximity)."""
        paddle_words = [_word("Betonung", 100, 50, 90, 22, conf=85)]
        tess_words = [_word("Betonung", 115, 52, 80, 20, conf=60)]

        merged = _merge_paddle_tesseract(paddle_words, tess_words)

        assert len(merged) == 1
        assert merged[0]["text"] == "Betonung"

    def test_truly_different_words_kept_separate(self):
        """Non-overlapping different words: both are kept."""
        paddle_words = [_word("hello", 10, 10)]
        tess_words = [_word("bullet", 500, 500, conf=50)]

        merged = _merge_paddle_tesseract(paddle_words, tess_words)

        assert len(merged) == 2
        merged_texts = {entry["text"] for entry in merged}
        assert merged_texts == {"hello", "bullet"}

    def test_low_conf_tesseract_dropped(self):
        """Unmatched Tesseract words with conf < 40 are dropped."""
        paddle_words = [_word("hello", 10, 10)]
        tess_words = [_word("noise", 500, 500, conf=20)]

        merged = _merge_paddle_tesseract(paddle_words, tess_words)

        assert len(merged) == 1

    def test_empty_paddle(self):
        """With no Paddle words, only the confident Tesseract word survives."""
        paddle_words = []
        tess_words = [
            _word("bullet", 10, 10, conf=80),
            _word("noise", 200, 200, conf=10),
        ]

        merged = _merge_paddle_tesseract(paddle_words, tess_words)

        assert len(merged) == 1
        assert merged[0]["text"] == "bullet"

    def test_empty_tesseract(self):
        """With no Tesseract words, every Paddle word is kept."""
        paddle_words = [_word("a", 10, 10), _word("b", 200, 10)]

        merged = _merge_paddle_tesseract(paddle_words, [])

        assert len(merged) == 2

    def test_both_empty(self):
        """Two empty inputs produce an empty list."""
        assert _merge_paddle_tesseract([], []) == []

    def test_one_to_one_matching(self):
        """Each Tesseract word matches at most one Paddle word."""
        paddle_words = [
            _word("cat", 10, 10, 60, 20, conf=80),
            _word("dog", 200, 10, 60, 20, conf=80),
        ]
        tess_words = [_word("cat", 15, 12, 55, 18, conf=70)]

        merged = _merge_paddle_tesseract(paddle_words, tess_words)

        # cat (merged) + dog (unmatched paddle)
        assert len(merged) == 2

    def test_far_apart_different_text_not_merged(self):
        """Different words far apart stay separate."""
        paddle_words = [_word("hello", 0, 0, 100, 20, conf=80)]
        tess_words = [_word("world", 500, 300, 100, 20, conf=70)]

        merged = _merge_paddle_tesseract(paddle_words, tess_words)

        assert len(merged) == 2

    def test_paddle_text_preferred(self):
        """A merged word carries Paddle's text."""
        paddle_words = [_word("Betonung", 100, 50, 80, 20, conf=85)]
        tess_words = [_word("Betonung!", 100, 50, 80, 20, conf=60)]

        merged = _merge_paddle_tesseract(paddle_words, tess_words)

        assert len(merged) == 1
        assert merged[0]["text"] == "Betonung"

    def test_confidence_weighted_positions(self):
        """Equal confidence gives a simple average of the coordinates."""
        paddle_words = [_word("x", 100, 200, 60, 20, conf=50)]
        tess_words = [_word("x", 110, 200, 60, 20, conf=50)]

        merged = _merge_paddle_tesseract(paddle_words, tess_words)

        assert len(merged) == 1
        only = merged[0]
        assert only["left"] == 105
        assert only["top"] == 200

    def test_zero_confidence_no_division_error(self):
        """Words with conf=0 do not cause a division by zero."""
        paddle_words = [_word("a", 100, 50, 80, 20, conf=0)]
        tess_words = [_word("a", 100, 50, 80, 20, conf=0)]

        merged = _merge_paddle_tesseract(paddle_words, tess_words)

        assert len(merged) == 1

    def test_duplicate_words_same_position_deduplicated(self):
        """The core bug fix: a word found by both engines at the same position
        must appear only once, not doubled."""
        # Typical case: both engines find the same words.
        paddle_words = [
            _word("apple", 50, 10, 70, 20, conf=90),
            _word("Apfel", 300, 10, 60, 20, conf=85),
            _word("dog", 50, 50, 50, 20, conf=88),
            _word("Hund", 300, 50, 60, 20, conf=82),
        ]
        tess_words = [
            _word("apple", 52, 11, 68, 19, conf=75),
            _word("Apfel", 298, 12, 62, 18, conf=70),
            _word("dog", 48, 49, 52, 21, conf=72),
            _word("Hund", 302, 51, 58, 19, conf=68),
        ]

        merged = _merge_paddle_tesseract(paddle_words, tess_words)

        # Every word must appear exactly once.
        assert len(merged) == 4
        merged_texts = [entry["text"] for entry in merged]
        assert sorted(merged_texts) == ["Apfel", "Hund", "apple", "dog"]


class TestMergePaddleTesseractBulletPoints:
    """Tesseract catches bullet points / symbols that PaddleOCR misses."""

    def test_bullet_added_from_tesseract(self):
        """A bullet character seen only by Tesseract is added."""
        paddle_words = [_word("Betonung", 60, 10, 80, 20)]
        tess_words = [
            _word("•", 10, 10, 15, 15, conf=65),
            _word("Betonung", 60, 10, 80, 20, conf=50),
        ]

        merged = _merge_paddle_tesseract(paddle_words, tess_words)
        merged_texts = [entry["text"] for entry in merged]

        assert "•" in merged_texts
        assert "Betonung" in merged_texts
        assert len(merged) == 2

    def test_exclamation_added_from_tesseract(self):
        """An exclamation mark seen only by Tesseract is added."""
        paddle_words = [_word("important", 60, 10, 100, 20)]
        tess_words = [
            _word("!", 40, 10, 12, 20, conf=70),
            _word("important", 60, 10, 100, 20, conf=55),
        ]

        merged = _merge_paddle_tesseract(paddle_words, tess_words)
        merged_texts = [entry["text"] for entry in merged]

        assert "!" in merged_texts
        assert len(merged) == 2

    def test_multiple_unique_tesseract_symbols(self):
        """Several symbols found only by Tesseract are all added."""
        paddle_words = [_word("word", 100, 10, 60, 20)]
        tess_words = [
            _word("!", 20, 10, 10, 20, conf=70),
            _word("•", 40, 10, 10, 15, conf=65),
            _word("word", 100, 10, 60, 20, conf=50),
        ]

        merged = _merge_paddle_tesseract(paddle_words, tess_words)
        merged_texts = [entry["text"] for entry in merged]

        assert "!" in merged_texts
        assert "•" in merged_texts
        assert "word" in merged_texts
        assert len(merged) == 3


# ---------------------------------------------------------------------------
# _deduplicate_words
# ---------------------------------------------------------------------------


class TestDeduplicateWords:
    """Checks for _deduplicate_words on a single word list."""

    def test_no_duplicates(self):
        """Different words at different positions: all kept."""
        candidates = [_word("a", 10, 10), _word("b", 200, 10), _word("c", 10, 100)]
        unique = _deduplicate_words(candidates)
        assert len(unique) == 3

    def test_exact_duplicate_removed(self):
        """Same text at the same position: only one is kept."""
        candidates = [
            _word("take", 185, 287, 47, 29, conf=90),
            _word("take", 188, 289, 52, 21, conf=96),
        ]
        unique = _deduplicate_words(candidates)
        assert len(unique) == 1
        # The higher-confidence entry is the survivor.
        assert unique[0]["conf"] == 96

    def test_same_text_far_apart_kept(self):
        """Same word at very different positions (e.g. repeated in the text):
        both kept."""
        candidates = [
            _word("the", 100, 10),
            _word("the", 500, 10),
        ]
        unique = _deduplicate_words(candidates)
        assert len(unique) == 2

    def test_different_text_same_position_kept(self):
        """Different words at the same position: both kept (not duplicates)."""
        candidates = [
            _word("apple", 100, 50),
            _word("Apfel", 105, 52),
        ]
        unique = _deduplicate_words(candidates)
        assert len(unique) == 2

    def test_empty_list(self):
        """An empty input produces an empty list."""
        assert _deduplicate_words([]) == []

    def test_single_word(self):
        """A single word is returned untouched in count."""
        candidates = [_word("hello", 10, 10)]
        assert len(_deduplicate_words(candidates)) == 1

    def test_real_world_near_duplicates(self):
        """Real-world shape: Paddle (height=29) + Tesseract (height=21) near-dupes."""
        from_first_engine = [
            _word("take", 185, 287, 47, 29, conf=90),
            _word("part", 249, 292, 48, 24, conf=96),
            _word("More", 944, 287, 50, 29, conf=96),
            _word("than", 1003, 287, 50, 29, conf=96),
        ]
        # Near-dupes from the other engine.
        from_second_engine = [
            _word("take", 188, 289, 52, 21, conf=96),
            _word("part", 249, 294, 47, 25, conf=96),
            _word("More", 948, 292, 60, 20, conf=90),
            _word("than", 1017, 291, 49, 21, conf=96),
        ]

        unique = _deduplicate_words(from_first_engine + from_second_engine)

        # Every word must appear only once.
        assert len(unique) == 4
        unique_texts = sorted(entry["text"] for entry in unique)
        assert unique_texts == ["More", "part", "take", "than"]


class TestMergeRealWorldRegression:
    """Regression test with actual data from the doubled-words bug."""

    def test_row2_no_duplicates(self):
        """Reproduce the row-2 bug: both engines return the same words at
        slightly different positions. The merge must produce no duplicates."""
        paddle = [
            _word("teilnehmen", 526, 282, 140, 35, conf=93),
            _word("take", 185, 287, 47, 29, conf=90),
            _word("part(in)", 238, 287, 94, 29, conf=90),
            _word("More", 944, 287, 50, 29, conf=96),
            _word("than", 1003, 287, 50, 29, conf=96),
            _word("200", 1063, 287, 38, 29, conf=96),
            _word("singers", 1110, 287, 88, 29, conf=96),
            _word("took", 1207, 287, 50, 29, conf=96),
            _word("part", 1266, 287, 50, 29, conf=96),
            _word("in", 1326, 287, 25, 29, conf=96),
            _word("the", 1360, 287, 38, 29, conf=96),
        ]
        tess = [
            _word("take", 188, 289, 52, 21, conf=96),
            _word("part", 249, 292, 48, 24, conf=96),
            _word("(in)", 305, 290, 38, 24, conf=93),
            _word("teilnehmen", 534, 290, 127, 21, conf=95),
            _word("(an),", 671, 291, 48, 23, conf=96),
            _word("mitmachen", 730, 290, 123, 22, conf=96),
            _word("More", 948, 292, 60, 20, conf=90),
            _word("than", 1017, 291, 49, 21, conf=96),
            _word("200", 1076, 292, 43, 20, conf=93),
            _word("singers", 1128, 293, 75, 26, conf=93),
            _word("took", 1212, 291, 55, 22, conf=96),
            _word("part", 1276, 294, 47, 25, conf=96),
            _word("in", 1332, 292, 20, 20, conf=95),
            _word("the", 1361, 292, 36, 21, conf=95),
            # Tesseract-only: phonetic transcriptions
            _word("[teık", 352, 292, 47, 21, conf=90),
            _word("'pa:t]", 407, 292, 55, 23, conf=89),
        ]

        merged = _merge_paddle_tesseract(paddle, tess)

        def _center(word):
            """Return the (x, y) center of a word box; missing size counts as 0."""
            return (
                word["left"] + word.get("width", 0) / 2,
                word["top"] + word.get("height", 0) / 2,
            )

        # Check every unordered pair once: equal text (ignoring case) must be
        # at least 30px apart horizontally or 15px apart vertically.
        for i, w1 in enumerate(merged):
            for w2 in merged[i + 1:]:
                if w1["text"].lower() != w2["text"].lower():
                    continue
                cx1, cy1 = _center(w1)
                cx2, cy2 = _center(w2)
                assert abs(cx1 - cx2) >= 30 or abs(cy1 - cy2) >= 15, (
                    f"Near-duplicate found: '{w1['text']}' at ({w1['left']},{w1['top']}) "
                    f"vs ({w2['left']},{w2['top']})"
                )

        # Tesseract-only words must be present.
        merged_texts = [entry["text"] for entry in merged]
        assert "(in)" in merged_texts  # Tesseract split "part(in)" differently
        assert "(an)," in merged_texts
        assert "mitmachen" in merged_texts
        assert "[teık" in merged_texts  # phonetic from Tesseract
        assert "'pa:t]" in merged_texts