Files
breakpilot-lehrer/klausur-service/backend/tests/test_paddle_kombi.py
Benjamin Admin 846292f632
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 1m59s
CI / test-python-agent-core (push) Successful in 18s
CI / test-nodejs-website (push) Successful in 19s
fix: rewrite Kombi merge with row-based sequence alignment
Replaces position-based word matching with row-based sequence alignment
to fix doubled words and cross-line averaging in Kombi-Modus.

New algorithm:
1. Group words into rows by Y-position clustering
2. Match rows between engines by vertical center proximity
3. Within each row: walk both sequences left-to-right, deduplicating
4. Unmatched rows kept as-is

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-13 08:45:03 +01:00

350 lines
14 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Tests for the Kombi-Modus row-based sequence merge algorithm.
Functions under test (ocr_pipeline_api.py):
- _group_words_into_rows: Cluster words by Y-position into rows
- _merge_row_sequences: Merge two word sequences within the same row
- _merge_paddle_tesseract: Full merge with row matching + sequence dedup
"""
import pytest
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from ocr_pipeline_api import (
_group_words_into_rows,
_merge_row_sequences,
_merge_paddle_tesseract,
)
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _word(text: str, left: int, top: int, width: int = 60, height: int = 20, conf: int = 80):
"""Create a synthetic word dict."""
return {
"text": text,
"left": left,
"top": top,
"width": width,
"height": height,
"conf": conf,
}
# ---------------------------------------------------------------------------
# _group_words_into_rows
# ---------------------------------------------------------------------------
class TestGroupWordsIntoRows:
    """Checks for ``_group_words_into_rows`` (clustering words into rows by Y)."""

    def test_single_row(self):
        """Words whose tops differ by only a few pixels form a single row."""
        row_words = [
            _word("a", 10, 50),
            _word("b", 100, 52),
            _word("c", 200, 48),
        ]
        grouped = _group_words_into_rows(row_words)
        assert len(grouped) == 1
        only_row = grouped[0]
        assert len(only_row) == 3
        # The row is expected in left-to-right order.
        assert only_row[0]["text"] == "a"
        assert only_row[2]["text"] == "c"

    def test_two_rows(self):
        """Words at top ~50 and top ~100 are split into two ordered rows."""
        upper_line = [_word("a", 10, 50), _word("b", 100, 52)]
        lower_line = [_word("c", 10, 100), _word("d", 100, 102)]
        grouped = _group_words_into_rows(upper_line + lower_line)
        assert len(grouped) == 2
        row_texts = [[w["text"] for w in row] for row in grouped]
        assert row_texts[0] == ["a", "b"]
        assert row_texts[1] == ["c", "d"]

    def test_empty(self):
        """No input words yields no rows."""
        grouped = _group_words_into_rows([])
        assert grouped == []

    def test_different_heights_same_row(self):
        """Boxes of height 29 (Paddle-like) and 21 (Tesseract-like) share a row.

        Their vertical centres (301.5 and 299.5) are only 2px apart.
        """
        tall_box = _word("take", 100, 287, 47, 29)   # centre y = 301.5
        short_box = _word("take", 103, 289, 52, 21)  # centre y = 299.5
        grouped = _group_words_into_rows([tall_box, short_box])
        # One shared row, not one row per box height.
        assert len(grouped) == 1

    def test_close_rows_separated(self):
        """Words whose vertical centres are 35px apart land in separate rows."""
        first_line = _word("a", 10, 50, height=20)   # centre y = 60
        second_line = _word("b", 10, 85, height=20)  # centre y = 95
        grouped = _group_words_into_rows([first_line, second_line])
        assert len(grouped) == 2
# ---------------------------------------------------------------------------
# _merge_row_sequences
# ---------------------------------------------------------------------------
class TestMergeRowSequences:
    """Checks for ``_merge_row_sequences`` (merging two engines' words in one row)."""

    def test_identical_sequences_deduplicated(self):
        """When both engines report the same words, each appears once."""
        paddle_row = [_word("apple", 50, 10), _word("Apfel", 200, 10)]
        tess_row = [_word("apple", 52, 12), _word("Apfel", 198, 11)]
        result = _merge_row_sequences(paddle_row, tess_row)
        assert len(result) == 2
        assert result[0]["text"] == "apple"
        assert result[1]["text"] == "Apfel"

    def test_tesseract_extra_symbol(self):
        """A '!' seen only by Tesseract is carried into the merged row."""
        paddle_row = [_word("Betonung", 60, 10)]
        tess_row = [
            _word("!", 20, 10, 12, 20, conf=70),
            _word("Betonung", 60, 10),
        ]
        result = _merge_row_sequences(paddle_row, tess_row)
        merged_texts = [w["text"] for w in result]
        assert "!" in merged_texts
        assert "Betonung" in merged_texts
        assert len(result) == 2

    def test_paddle_extra_word(self):
        """A word seen only by Paddle is carried into the merged row."""
        paddle_row = [_word("!", 20, 10, 12, 20), _word("word", 60, 10)]
        tess_row = [_word("word", 62, 12)]
        result = _merge_row_sequences(paddle_row, tess_row)
        assert len(result) == 2

    def test_coordinates_averaged(self):
        """A matched pair yields one word with a confidence-weighted ``left``."""
        paddle_row = [_word("hello", 100, 50, 80, 20, conf=90)]
        tess_row = [_word("hello", 110, 55, 70, 18, conf=60)]
        result = _merge_row_sequences(paddle_row, tess_row)
        assert len(result) == 1
        combined = result[0]
        assert combined["text"] == "hello"
        # Weighted mean of the left edges: (100*90 + 110*60) / (90 + 60) = 104
        assert combined["left"] == 104
        assert combined["conf"] == 90

    def test_empty_paddle_row(self):
        """With no Paddle words, the single Tesseract word (conf 80) survives."""
        tess_row = [_word("a", 10, 10, conf=80)]
        result = _merge_row_sequences([], tess_row)
        assert len(result) == 1

    def test_empty_tess_row(self):
        """With no Tesseract words, the single Paddle word survives."""
        paddle_row = [_word("a", 10, 10)]
        result = _merge_row_sequences(paddle_row, [])
        assert len(result) == 1

    def test_both_empty(self):
        """Two empty rows merge to an empty list."""
        result = _merge_row_sequences([], [])
        assert result == []

    def test_substring_match(self):
        """Paddle's 'part(in)' pairs with Tesseract's 'part' (substring match)."""
        paddle_row = [_word("part(in)", 100, 10, 90, 20)]
        tess_row = [
            _word("part", 100, 12, 50, 18),
            _word("(in)", 155, 12, 40, 18),
        ]
        result = _merge_row_sequences(paddle_row, tess_row)
        # Expected: 'part(in)' is matched with 'part'; '(in)' is then kept as a
        # Tesseract-only extra, giving two words in total.
        assert len(result) == 2

    def test_low_conf_tesseract_dropped(self):
        """An unmatched Tesseract word with conf 15 is discarded.

        The original note for this test gives the cutoff as conf < 30.
        """
        paddle_row = [_word("hello", 100, 10)]
        tess_row = [
            _word("noise", 10, 10, conf=15),
            _word("hello", 100, 12),
        ]
        result = _merge_row_sequences(paddle_row, tess_row)
        merged_texts = [w["text"] for w in result]
        assert "noise" not in merged_texts
        assert len(result) == 1

    def test_real_world_row(self):
        """Replay of real data where both engines read 'take part teilnehmen More than'."""
        # Each spec is (text, left, top, width, height, conf).
        paddle_specs = [
            ("take", 185, 287, 47, 29, 90),
            ("part(in)", 238, 287, 94, 29, 90),
            ("teilnehmen", 526, 282, 140, 35, 93),
            ("More", 944, 287, 50, 29, 96),
            ("than", 1003, 287, 50, 29, 96),
        ]
        tess_specs = [
            ("take", 188, 289, 52, 21, 96),
            ("part", 249, 292, 48, 24, 96),
            ("(in)", 305, 290, 38, 24, 93),
            ("[teık", 352, 292, 47, 21, 90),
            ("teilnehmen", 534, 290, 127, 21, 95),
            ("More", 948, 292, 60, 20, 90),
            ("than", 1017, 291, 49, 21, 96),
        ]
        paddle_row = [_word(*spec) for spec in paddle_specs]
        tess_row = [_word(*spec) for spec in tess_specs]

        result = _merge_row_sequences(paddle_row, tess_row)
        merged_texts = [w["text"] for w in result]

        # Words found by both engines must not be doubled.
        for shared_word in ("take", "More", "than", "teilnehmen"):
            assert merged_texts.count(shared_word) == 1
        # The phonetic fragment only Tesseract saw must be kept.
        assert "[teık" in merged_texts
# ---------------------------------------------------------------------------
# _merge_paddle_tesseract (full pipeline)
# ---------------------------------------------------------------------------
class TestMergePaddleTesseract:
    """Checks for ``_merge_paddle_tesseract`` (row matching plus per-row dedup)."""

    def test_same_words_deduplicated(self):
        """Identical words reported by both engines appear once each."""
        paddle_words = [
            _word("apple", 50, 10, 70, 20, conf=90),
            _word("Apfel", 300, 10, 60, 20, conf=85),
        ]
        tess_words = [
            _word("apple", 52, 11, 68, 19, conf=75),
            _word("Apfel", 298, 12, 62, 18, conf=70),
        ]
        result = _merge_paddle_tesseract(paddle_words, tess_words)
        assert len(result) == 2
        ordered_texts = sorted(w["text"] for w in result)
        assert ordered_texts == ["Apfel", "apple"]

    def test_different_rows_not_cross_merged(self):
        """A word's position must not be averaged with a word from another row."""
        paddle_words = [
            _word("row1word", 50, 50, 80, 20, conf=90),
            _word("row2word", 50, 100, 80, 20, conf=90),
        ]
        tess_words = [
            _word("row1word", 52, 52, 78, 18, conf=80),
            _word("row2word", 52, 102, 78, 18, conf=80),
        ]
        result = _merge_paddle_tesseract(paddle_words, tess_words)
        assert len(result) == 2
        # Each word must stay near its own row's Y, not drift towards the other.
        upper_hits = [w for w in result if w["text"] == "row1word"]
        upper = upper_hits[0]
        lower_hits = [w for w in result if w["text"] == "row2word"]
        lower = lower_hits[0]
        assert upper["top"] < 60  # still close to y=50
        assert lower["top"] > 90  # still close to y=100

    def test_tesseract_extra_symbols_added(self):
        """A symbol seen only by Tesseract shows up in the merged output."""
        paddle_words = [_word("Betonung", 60, 10, 80, 20)]
        tess_words = [
            _word("!", 20, 10, 12, 20, conf=65),
            _word("Betonung", 60, 10, 80, 20, conf=50),
        ]
        result = _merge_paddle_tesseract(paddle_words, tess_words)
        merged_texts = [w["text"] for w in result]
        assert "!" in merged_texts
        assert "Betonung" in merged_texts
        assert len(result) == 2

    def test_paddle_extra_words_added(self):
        """A word seen only by Paddle shows up in the merged output."""
        paddle_words = [_word("extra", 10, 10), _word("word", 100, 10)]
        tess_words = [_word("word", 102, 12)]
        result = _merge_paddle_tesseract(paddle_words, tess_words)
        assert len(result) == 2

    def test_empty_paddle(self):
        """With no Paddle words, only the conf-80 Tesseract word is returned."""
        tess_words = [
            _word("a", 10, 10, conf=80),
            _word("b", 200, 200, conf=10),
        ]
        result = _merge_paddle_tesseract([], tess_words)
        # The original comment gives the cutoff here as conf >= 40.
        # NOTE(review): test_low_conf_tesseract_dropped cites conf < 30 for the
        # row-level merge; confirm both thresholds against ocr_pipeline_api.
        assert len(result) == 1

    def test_empty_tesseract(self):
        """With no Tesseract words, both Paddle words are returned."""
        paddle_words = [_word("a", 10, 10), _word("b", 200, 10)]
        result = _merge_paddle_tesseract(paddle_words, [])
        assert len(result) == 2

    def test_both_empty(self):
        """Two empty inputs merge to an empty list."""
        result = _merge_paddle_tesseract([], [])
        assert result == []

    def test_multi_row_deduplication(self):
        """Two rows, each seen by both engines, merge to four unique words."""
        paddle_words = [
            _word("cat", 50, 50, conf=90),
            _word("Katze", 200, 50, conf=85),
            _word("dog", 50, 100, conf=88),
            _word("Hund", 200, 100, conf=82),
        ]
        tess_words = [
            _word("cat", 52, 52, conf=75),
            _word("Katze", 198, 51, conf=70),
            _word("dog", 48, 101, conf=72),
            _word("Hund", 202, 102, conf=68),
        ]
        result = _merge_paddle_tesseract(paddle_words, tess_words)
        assert len(result) == 4
        ordered_texts = sorted(w["text"] for w in result)
        assert ordered_texts == ["Hund", "Katze", "cat", "dog"]
class TestMergeRealWorldRegression:
    """Regression coverage built from the data behind the doubled-words bug."""

    def test_full_page_no_duplicates(self):
        """Two engines see the same words at slightly shifted positions.

        The merged result must contain no near-duplicate words, must keep the
        Tesseract-only phonetic fragments, and must keep the two rows apart.
        """
        # Each spec is (text, left, top, width, height, conf).
        paddle_specs = [
            ("teilnehmen", 526, 282, 140, 35, 93),
            ("take", 185, 287, 47, 29, 90),
            ("part(in)", 238, 287, 94, 29, 90),
            ("More", 944, 287, 50, 29, 96),
            ("than", 1003, 287, 50, 29, 96),
            ("200", 1063, 287, 38, 29, 96),
            ("singers", 1110, 287, 88, 29, 96),
            ("took", 1207, 287, 50, 29, 96),
            ("part", 1266, 287, 50, 29, 96),
            ("in", 1326, 287, 25, 29, 96),
            ("the", 1360, 287, 38, 29, 96),
            # Second row
            ("be", 185, 365, 30, 29, 90),
            ("good", 216, 365, 50, 29, 90),
            ("at", 275, 365, 25, 29, 90),
            ("sth.", 306, 365, 45, 29, 90),
        ]
        tess_specs = [
            ("take", 188, 289, 52, 21, 96),
            ("part", 249, 292, 48, 24, 96),
            ("(in)", 305, 290, 38, 24, 93),
            ("teilnehmen", 534, 290, 127, 21, 95),
            ("More", 948, 292, 60, 20, 90),
            ("than", 1017, 291, 49, 21, 96),
            ("200", 1076, 292, 43, 20, 93),
            ("singers", 1128, 293, 75, 26, 93),
            ("took", 1212, 291, 55, 22, 96),
            ("part", 1276, 294, 47, 25, 96),
            ("in", 1332, 292, 20, 20, 95),
            ("the", 1361, 292, 36, 21, 95),
            ("[teık", 352, 292, 47, 21, 90),
            ("'pa:t]", 407, 292, 55, 23, 89),
            # Second row
            ("be", 189, 369, 28, 21, 96),
            ("good", 225, 369, 50, 21, 96),
            ("at", 292, 371, 22, 21, 96),
            ("sth.", 324, 369, 42, 21, 96),
        ]
        paddle = [_word(*spec) for spec in paddle_specs]
        tess = [_word(*spec) for spec in tess_specs]

        merged = _merge_paddle_tesseract(paddle, tess)

        # Two words with the same text (case-insensitive) count as a
        # near-duplicate when their centres are closer than 30px horizontally
        # AND closer than 15px vertically.
        for idx, first in enumerate(merged):
            for second in merged[idx + 1:]:
                if first["text"].lower() != second["text"].lower():
                    continue
                first_cx = first["left"] + first.get("width", 0) / 2
                second_cx = second["left"] + second.get("width", 0) / 2
                first_cy = first["top"] + first.get("height", 0) / 2
                second_cy = second["top"] + second.get("height", 0) / 2
                far_apart_x = abs(first_cx - second_cx) >= 30
                far_apart_y = abs(first_cy - second_cy) >= 15
                assert far_apart_x or far_apart_y, (
                    f"Near-duplicate: '{first['text']}' at ({first['left']},{first['top']}) "
                    f"vs ({second['left']},{second['top']})"
                )

        # Phonetic fragments that only Tesseract produced must survive.
        merged_texts = [w["text"] for w in merged]
        assert "[teık" in merged_texts
        assert "'pa:t]" in merged_texts

        # 'take' (row 1) and 'be' (row 2) must not collapse onto the same Y.
        be_hits = [w for w in merged if w["text"] == "be"]
        be_entry = be_hits[0]
        take_hits = [w for w in merged if w["text"] == "take"]
        take_entry = take_hits[0]
        row_gap = abs(be_entry["top"] - take_entry["top"])
        assert row_gap > 30, "Rows should stay separate"