"""Tests for the Kombi-Modus row-based sequence merge algorithm.

Functions under test (ocr_pipeline_api.py):
- _group_words_into_rows: Cluster words by Y-position into rows
- _merge_row_sequences: Merge two word sequences within the same row
- _merge_paddle_tesseract: Full merge with row matching + sequence dedup
"""

import os
import sys

import pytest

# Make the parent directory importable so ``ocr_pipeline_api`` resolves.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

from ocr_pipeline_api import (
    _group_words_into_rows,
    _merge_paddle_tesseract,
    _merge_row_sequences,
)


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _word(text: str, left: int, top: int, width: int = 60, height: int = 20, conf: int = 80):
    """Build a synthetic word dict (text, bounding box and confidence)."""
    return dict(
        text=text,
        left=left,
        top=top,
        width=width,
        height=height,
        conf=conf,
    )


def _texts(words):
    """Return the ``text`` value of every word, in order."""
    return [entry["text"] for entry in words]


def _first_with_text(words, text):
    """Return the first word whose text equals *text* (IndexError if none)."""
    return [entry for entry in words if entry["text"] == text][0]


def _center(word):
    """Return the (x, y) centre of a word box; absent width/height count as 0."""
    return (
        word["left"] + word.get("width", 0) / 2,
        word["top"] + word.get("height", 0) / 2,
    )


# ---------------------------------------------------------------------------
# _group_words_into_rows
# ---------------------------------------------------------------------------

class TestGroupWordsIntoRows:
    """Row clustering of word boxes by vertical position."""

    def test_single_row(self):
        """Three words with nearly equal ``top`` end up in one row."""
        words = [
            _word("a", 10, 50),
            _word("b", 100, 52),
            _word("c", 200, 48),
        ]

        rows = _group_words_into_rows(words)

        assert len(rows) == 1
        only_row = rows[0]
        assert len(only_row) == 3
        # The row is expected to be ordered left-to-right.
        assert only_row[0]["text"] == "a"
        assert only_row[2]["text"] == "c"

    def test_two_rows(self):
        """Words at top ~50 and top ~100 are split into two rows."""
        words = [
            _word("a", 10, 50),
            _word("b", 100, 52),
            _word("c", 10, 100),
            _word("d", 100, 102),
        ]

        rows = _group_words_into_rows(words)

        assert len(rows) == 2
        assert _texts(rows[0]) == ["a", "b"]
        assert _texts(rows[1]) == ["c", "d"]

    def test_empty(self):
        """No words in, no rows out."""
        assert _group_words_into_rows([]) == []

    def test_different_heights_same_row(self):
        """Boxes of height 29 (Paddle) and 21 (Tesseract) at similar Y share a row."""
        words = [
            _word("take", 100, 287, width=47, height=29),  # center_y = 301.5
            _word("take", 103, 289, width=52, height=21),  # center_y = 299.5
        ]

        rows = _group_words_into_rows(words)

        # A single row is expected, not one row per engine.
        assert len(rows) == 1

    def test_close_rows_separated(self):
        """Boxes whose centres are 35px apart are kept as separate rows."""
        words = [
            _word("a", 10, 50, height=20),  # center_y = 60
            _word("b", 10, 85, height=20),  # center_y = 95
        ]

        rows = _group_words_into_rows(words)

        assert len(rows) == 2


# ---------------------------------------------------------------------------
# _merge_row_sequences
# ---------------------------------------------------------------------------

class TestMergeRowSequences:
    """Merging of a Paddle and a Tesseract word sequence within one row."""

    def test_identical_sequences_deduplicated(self):
        """Identical words from both engines appear once each, in order."""
        paddle = [_word("apple", 50, 10), _word("Apfel", 200, 10)]
        tess = [_word("apple", 52, 12), _word("Apfel", 198, 11)]

        merged = _merge_row_sequences(paddle, tess)

        assert len(merged) == 2
        assert _texts(merged) == ["apple", "Apfel"]

    def test_tesseract_extra_symbol(self):
        """A '!' seen only by Tesseract is carried into the result."""
        paddle = [_word("Betonung", 60, 10)]
        tess = [
            _word("!", 20, 10, 12, 20, conf=70),
            _word("Betonung", 60, 10),
        ]

        merged = _merge_row_sequences(paddle, tess)

        texts = _texts(merged)
        assert "!" in texts
        assert "Betonung" in texts
        assert len(merged) == 2

    def test_paddle_extra_word(self):
        """A word seen only by Paddle is carried into the result."""
        paddle = [_word("!", 20, 10, 12, 20), _word("word", 60, 10)]
        tess = [_word("word", 62, 12)]

        merged = _merge_row_sequences(paddle, tess)

        assert len(merged) == 2

    def test_coordinates_averaged(self):
        """A matched word gets a confidence-weighted ``left`` and the higher conf."""
        paddle = [_word("hello", 100, 50, 80, 20, conf=90)]
        tess = [_word("hello", 110, 55, 70, 18, conf=60)]

        merged = _merge_row_sequences(paddle, tess)

        assert len(merged) == 1
        result = merged[0]
        assert result["text"] == "hello"
        # (100*90 + 110*60) / 150 = 104
        assert result["left"] == 104
        assert result["conf"] == 90

    def test_empty_paddle_row(self):
        """With no Paddle words, the conf=80 Tesseract word is returned."""
        tess = [_word("a", 10, 10, conf=80)]

        merged = _merge_row_sequences([], tess)

        assert len(merged) == 1

    def test_empty_tess_row(self):
        """With no Tesseract words, the Paddle word is returned."""
        paddle = [_word("a", 10, 10)]

        merged = _merge_row_sequences(paddle, [])

        assert len(merged) == 1

    def test_both_empty(self):
        """Two empty sequences merge to an empty list."""
        assert _merge_row_sequences([], []) == []

    def test_substring_match(self):
        """Paddle's 'part(in)' pairs with Tesseract's 'part'; '(in)' stays extra."""
        paddle = [_word("part(in)", 100, 10, 90, 20)]
        tess = [
            _word("part", 100, 12, 50, 18),
            _word("(in)", 155, 12, 40, 18),
        ]

        merged = _merge_row_sequences(paddle, tess)

        # One entry for the part(in)/part match plus the Tesseract-only (in).
        assert len(merged) == 2

    def test_low_conf_tesseract_dropped(self):
        """An unmatched Tesseract word with conf=15 is expected to be discarded."""
        paddle = [_word("hello", 100, 10)]
        tess = [
            _word("noise", 10, 10, conf=15),
            _word("hello", 100, 12),
        ]

        merged = _merge_row_sequences(paddle, tess)

        texts = _texts(merged)
        assert "noise" not in texts
        assert len(merged) == 1

    def test_real_world_row(self):
        """Row taken from real data: shared words once, phonetic token kept."""
        paddle = [
            _word("take", 185, 287, 47, 29, conf=90),
            _word("part(in)", 238, 287, 94, 29, conf=90),
            _word("teilnehmen", 526, 282, 140, 35, conf=93),
            _word("More", 944, 287, 50, 29, conf=96),
            _word("than", 1003, 287, 50, 29, conf=96),
        ]
        tess = [
            _word("take", 188, 289, 52, 21, conf=96),
            _word("part", 249, 292, 48, 24, conf=96),
            _word("(in)", 305, 290, 38, 24, conf=93),
            _word("[teık", 352, 292, 47, 21, conf=90),
            _word("teilnehmen", 534, 290, 127, 21, conf=95),
            _word("More", 948, 292, 60, 20, conf=90),
            _word("than", 1017, 291, 49, 21, conf=96),
        ]

        merged = _merge_row_sequences(paddle, tess)

        texts = _texts(merged)
        # Words reported by both engines must not be doubled.
        for shared in ("take", "More", "than", "teilnehmen"):
            assert texts.count(shared) == 1
        # The phonetic token only Tesseract produced must survive.
        assert "[teık" in texts


# ---------------------------------------------------------------------------
# _merge_paddle_tesseract (full pipeline)
# ---------------------------------------------------------------------------

class TestMergePaddleTesseract:
    """End-to-end merge of full Paddle and Tesseract word lists."""

    def test_same_words_deduplicated(self):
        """Words found by both engines show up exactly once."""
        pw = [
            _word("apple", 50, 10, 70, 20, conf=90),
            _word("Apfel", 300, 10, 60, 20, conf=85),
        ]
        tw = [
            _word("apple", 52, 11, 68, 19, conf=75),
            _word("Apfel", 298, 12, 62, 18, conf=70),
        ]

        merged = _merge_paddle_tesseract(pw, tw)

        assert len(merged) == 2
        assert sorted(_texts(merged)) == ["Apfel", "apple"]

    def test_different_rows_not_cross_merged(self):
        """Words on different rows keep their own vertical position."""
        pw = [
            _word("row1word", 50, 50, 80, 20, conf=90),
            _word("row2word", 50, 100, 80, 20, conf=90),
        ]
        tw = [
            _word("row1word", 52, 52, 78, 18, conf=80),
            _word("row2word", 52, 102, 78, 18, conf=80),
        ]

        merged = _merge_paddle_tesseract(pw, tw)

        assert len(merged) == 2
        upper = _first_with_text(merged, "row1word")
        lower = _first_with_text(merged, "row2word")
        # Neither word may drift toward the other row's Y position.
        assert upper["top"] < 60  # stays near row 1
        assert lower["top"] > 90  # stays near row 2

    def test_tesseract_extra_symbols_added(self):
        """A symbol reported only by Tesseract is part of the output."""
        pw = [_word("Betonung", 60, 10, 80, 20)]
        tw = [
            _word("!", 20, 10, 12, 20, conf=65),
            _word("Betonung", 60, 10, 80, 20, conf=50),
        ]

        merged = _merge_paddle_tesseract(pw, tw)

        texts = _texts(merged)
        assert "!" in texts
        assert "Betonung" in texts
        assert len(merged) == 2

    def test_paddle_extra_words_added(self):
        """A word reported only by Paddle is part of the output."""
        pw = [_word("extra", 10, 10), _word("word", 100, 10)]
        tw = [_word("word", 102, 12)]

        merged = _merge_paddle_tesseract(pw, tw)

        assert len(merged) == 2

    def test_empty_paddle(self):
        """Without Paddle input, only the confident Tesseract word remains."""
        tw = [
            _word("a", 10, 10, conf=80),
            _word("b", 200, 200, conf=10),
        ]

        merged = _merge_paddle_tesseract([], tw)

        # The conf=10 word is expected to be filtered (only conf >= 40 kept).
        assert len(merged) == 1

    def test_empty_tesseract(self):
        """Without Tesseract input, both Paddle words are returned."""
        pw = [_word("a", 10, 10), _word("b", 200, 10)]

        merged = _merge_paddle_tesseract(pw, [])

        assert len(merged) == 2

    def test_both_empty(self):
        """Two empty inputs merge to an empty list."""
        assert _merge_paddle_tesseract([], []) == []

    def test_multi_row_deduplication(self):
        """Two rows seen by both engines yield four unique words."""
        pw = [
            _word("cat", 50, 50, conf=90),
            _word("Katze", 200, 50, conf=85),
            _word("dog", 50, 100, conf=88),
            _word("Hund", 200, 100, conf=82),
        ]
        tw = [
            _word("cat", 52, 52, conf=75),
            _word("Katze", 198, 51, conf=70),
            _word("dog", 48, 101, conf=72),
            _word("Hund", 202, 102, conf=68),
        ]

        merged = _merge_paddle_tesseract(pw, tw)

        assert len(merged) == 4
        assert sorted(_texts(merged)) == ["Hund", "Katze", "cat", "dog"]


class TestMergeRealWorldRegression:
    """Regression test with actual data from the doubled-words bug."""

    def test_full_page_no_duplicates(self):
        """Two-row page from both engines merges without near-duplicate words."""
        paddle = [
            _word("teilnehmen", 526, 282, 140, 35, conf=93),
            _word("take", 185, 287, 47, 29, conf=90),
            _word("part(in)", 238, 287, 94, 29, conf=90),
            _word("More", 944, 287, 50, 29, conf=96),
            _word("than", 1003, 287, 50, 29, conf=96),
            _word("200", 1063, 287, 38, 29, conf=96),
            _word("singers", 1110, 287, 88, 29, conf=96),
            _word("took", 1207, 287, 50, 29, conf=96),
            _word("part", 1266, 287, 50, 29, conf=96),
            _word("in", 1326, 287, 25, 29, conf=96),
            _word("the", 1360, 287, 38, 29, conf=96),
            # Second row
            _word("be", 185, 365, 30, 29, conf=90),
            _word("good", 216, 365, 50, 29, conf=90),
            _word("at", 275, 365, 25, 29, conf=90),
            _word("sth.", 306, 365, 45, 29, conf=90),
        ]
        tess = [
            _word("take", 188, 289, 52, 21, conf=96),
            _word("part", 249, 292, 48, 24, conf=96),
            _word("(in)", 305, 290, 38, 24, conf=93),
            _word("teilnehmen", 534, 290, 127, 21, conf=95),
            _word("More", 948, 292, 60, 20, conf=90),
            _word("than", 1017, 291, 49, 21, conf=96),
            _word("200", 1076, 292, 43, 20, conf=93),
            _word("singers", 1128, 293, 75, 26, conf=93),
            _word("took", 1212, 291, 55, 22, conf=96),
            _word("part", 1276, 294, 47, 25, conf=96),
            _word("in", 1332, 292, 20, 20, conf=95),
            _word("the", 1361, 292, 36, 21, conf=95),
            _word("[teık", 352, 292, 47, 21, conf=90),
            _word("'pa:t]", 407, 292, 55, 23, conf=89),
            # Second row
            _word("be", 189, 369, 28, 21, conf=96),
            _word("good", 225, 369, 50, 21, conf=96),
            _word("at", 292, 371, 22, 21, conf=96),
            _word("sth.", 324, 369, 42, 21, conf=96),
        ]

        merged = _merge_paddle_tesseract(paddle, tess)

        # Every pair with the same (case-insensitive) text must be at least
        # 30px apart horizontally or 15px apart vertically, centre to centre.
        for idx, first in enumerate(merged):
            for second in merged[idx + 1:]:
                if first["text"].lower() != second["text"].lower():
                    continue
                first_cx, first_cy = _center(first)
                second_cx, second_cy = _center(second)
                assert abs(first_cx - second_cx) >= 30 or abs(first_cy - second_cy) >= 15, (
                    f"Near-duplicate: '{first['text']}' at ({first['left']},{first['top']}) "
                    f"vs ({second['left']},{second['top']})"
                )

        # The phonetic tokens only Tesseract produced must be present.
        texts = _texts(merged)
        assert "[teık" in texts
        assert "'pa:t]" in texts

        # Words from row 1 and row 2 must not collapse onto the same Y.
        be_word = _first_with_text(merged, "be")
        take_word = _first_with_text(merged, "take")
        assert abs(be_word["top"] - take_word["top"]) > 30, "Rows should stay separate"