Files
breakpilot-lehrer/klausur-service/backend/tests/test_paddle_kombi.py
Benjamin Admin d6f51e4418
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 30s
CI / test-go-edu-search (push) Successful in 33s
CI / test-python-klausur (push) Failing after 2m9s
CI / test-python-agent-core (push) Successful in 19s
CI / test-nodejs-website (push) Successful in 24s
fix: deduplicate overlapping OCR words and use per-word Y positions in overlay
Backend: Add spatial overlap check (>=50% horizontal IoU) to Kombi merge
so words at the same position are deduplicated even when OCR text differs.

Frontend: Add yPct/hPct to WordPosition so each word renders at its actual
vertical position instead of all words collapsing to the cell center Y.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-13 20:27:08 +01:00

503 lines
20 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Tests for the Kombi-Modus row-based sequence merge algorithm.
Functions under test (ocr_pipeline_api.py):
- _split_paddle_multi_words: Split multi-word PaddleOCR boxes into individual words
- _group_words_into_rows: Cluster words by Y-position into rows
- _merge_row_sequences: Merge two word sequences within the same row
- _merge_paddle_tesseract: Full merge with row matching + sequence dedup
"""
import pytest
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from ocr_pipeline_api import (
_split_paddle_multi_words,
_group_words_into_rows,
_merge_row_sequences,
_merge_paddle_tesseract,
)
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _word(text: str, left: int, top: int, width: int = 60, height: int = 20, conf: int = 80):
"""Create a synthetic word dict."""
return {
"text": text,
"left": left,
"top": top,
"width": width,
"height": height,
"conf": conf,
}
# ---------------------------------------------------------------------------
# _split_paddle_multi_words
# ---------------------------------------------------------------------------
class TestSplitPaddleMultiWords:
    """Tests for _split_paddle_multi_words using synthetic Paddle boxes."""

    def test_single_word_unchanged(self):
        """A box that holds one word comes back as exactly that one word."""
        result = _split_paddle_multi_words([_word("hello", 100, 50, 80, 20)])
        assert len(result) == 1
        assert result[0]["text"] == "hello"

    def test_multi_word_split(self):
        """One box reading 'More than 200' is split into three words."""
        box = _word("More than 200", 100, 50, 300, 20)
        result = _split_paddle_multi_words([box])
        assert len(result) == 3
        first, second, third = result
        assert first["text"] == "More"
        assert second["text"] == "than"
        assert third["text"] == "200"
        # The first word must not start left of the source box (left=100) ...
        assert first["left"] >= 100
        # ... and the last must end by its right edge (100 + 300), with 5px
        # of slack for rounding.
        assert third["left"] + third["width"] <= 400 + 5

    def test_exclamation_split(self):
        """'!Betonung' is split into '!' followed by 'Betonung'."""
        box = _word("!Betonung", 100, 50, 120, 20)
        result = _split_paddle_multi_words([box])
        assert len(result) == 2
        marker, rest = result
        assert marker["text"] == "!"
        assert rest["text"] == "Betonung"

    def test_ipa_bracket_split(self):
        """'badge[bxd3]' is split into 'badge' and the bracketed '[bxd3]'."""
        box = _word("badge[bxd3]", 100, 50, 150, 20)
        result = _split_paddle_multi_words([box])
        assert len(result) == 2
        headword, phonetic = result
        assert headword["text"] == "badge"
        assert phonetic["text"] == "[bxd3]"

    def test_long_phrase(self):
        """An eight-word phrase in one box yields eight words in order."""
        phrase = "More than 200 singers took part in the"
        result = _split_paddle_multi_words([_word(phrase, 944, 287, 454, 29)])
        assert len(result) == 8
        expected = ["More", "than", "200", "singers", "took", "part", "in", "the"]
        assert [piece["text"] for piece in result] == expected

    def test_empty_input(self):
        """An empty word list is returned as an empty list."""
        assert _split_paddle_multi_words([]) == []

    def test_preserves_top_and_height(self):
        """Every piece keeps the source box's top and height."""
        source = _word("a b", 100, 50, 200, 25)
        for piece in _split_paddle_multi_words([source]):
            assert piece["top"] == 50
            assert piece["height"] == 25
# ---------------------------------------------------------------------------
# _group_words_into_rows
# ---------------------------------------------------------------------------
class TestGroupWordsIntoRows:
    """Tests for _group_words_into_rows (clustering words by Y position)."""

    def test_single_row(self):
        """Three words with tops 48..52 form one row, ordered left to right."""
        rows = _group_words_into_rows(
            [_word("a", 10, 50), _word("b", 100, 52), _word("c", 200, 48)]
        )
        assert len(rows) == 1
        only_row = rows[0]
        assert len(only_row) == 3
        # First and last entries confirm the left-to-right ordering.
        assert only_row[0]["text"] == "a"
        assert only_row[2]["text"] == "c"

    def test_two_rows(self):
        """Words around y=50 and y=100 end up in two separate rows."""
        upper = [_word("a", 10, 50), _word("b", 100, 52)]
        lower = [_word("c", 10, 100), _word("d", 100, 102)]
        rows = _group_words_into_rows(upper + lower)
        assert len(rows) == 2
        assert [w["text"] for w in rows[0]] == ["a", "b"]
        assert [w["text"] for w in rows[1]] == ["c", "d"]

    def test_empty(self):
        """No words in, no rows out."""
        assert _group_words_into_rows([]) == []

    def test_different_heights_same_row(self):
        """Boxes of height 29 and 21 at nearly the same Y share one row."""
        tall_box = _word("take", 100, 287, 47, 29)  # center_y = 301.5
        short_box = _word("take", 103, 289, 52, 21)  # center_y = 299.5
        rows = _group_words_into_rows([tall_box, short_box])
        # One row expected, not one row per box height.
        assert len(rows) == 1

    def test_close_rows_separated(self):
        """Words whose centre Ys are 35px apart (60 vs 95) stay in two rows."""
        first = _word("a", 10, 50, height=20)  # center_y = 60
        second = _word("b", 10, 85, height=20)  # center_y = 95
        rows = _group_words_into_rows([first, second])
        assert len(rows) == 2
# ---------------------------------------------------------------------------
# _merge_row_sequences
# ---------------------------------------------------------------------------
class TestMergeRowSequences:
    """Tests for _merge_row_sequences (merging two engines' words in one row)."""

    def test_identical_sequences_deduplicated(self):
        """The same two words from both engines yield one copy of each."""
        paddle_row = [_word("apple", 50, 10), _word("Apfel", 200, 10)]
        tess_row = [_word("apple", 52, 12), _word("Apfel", 198, 11)]
        merged = _merge_row_sequences(paddle_row, tess_row)
        assert len(merged) == 2
        assert merged[0]["text"] == "apple"
        assert merged[1]["text"] == "Apfel"

    def test_tesseract_extra_symbol(self):
        """A '!' seen only by Tesseract is kept next to the shared word."""
        paddle_row = [_word("Betonung", 60, 10)]
        tess_row = [
            _word("!", 20, 10, 12, 20, conf=70),
            _word("Betonung", 60, 10),
        ]
        merged = _merge_row_sequences(paddle_row, tess_row)
        merged_texts = [w["text"] for w in merged]
        assert "!" in merged_texts
        assert "Betonung" in merged_texts
        assert len(merged) == 2

    def test_paddle_extra_word(self):
        """A word seen only by Paddle is kept next to the shared word."""
        paddle_row = [_word("!", 20, 10, 12, 20), _word("word", 60, 10)]
        tess_row = [_word("word", 62, 12)]
        merged = _merge_row_sequences(paddle_row, tess_row)
        assert len(merged) == 2

    def test_coordinates_averaged(self):
        """A matched word gets a confidence-weighted left coordinate."""
        paddle_row = [_word("hello", 100, 50, 80, 20, conf=90)]
        tess_row = [_word("hello", 110, 55, 70, 18, conf=60)]
        merged = _merge_row_sequences(paddle_row, tess_row)
        assert len(merged) == 1
        result = merged[0]
        assert result["text"] == "hello"
        # Weighted mean of left: (100*90 + 110*60) / 150 = 104
        assert result["left"] == 104
        assert result["conf"] == 90

    def test_empty_paddle_row(self):
        """With no Paddle words, the single conf=80 Tesseract word survives."""
        tess_row = [_word("a", 10, 10, conf=80)]
        assert len(_merge_row_sequences([], tess_row)) == 1

    def test_empty_tess_row(self):
        """With no Tesseract words, the single Paddle word survives."""
        paddle_row = [_word("a", 10, 10)]
        assert len(_merge_row_sequences(paddle_row, [])) == 1

    def test_both_empty(self):
        """Two empty rows merge to an empty list."""
        assert _merge_row_sequences([], []) == []

    def test_substring_match(self):
        """Paddle's 'part(in)' pairs with Tesseract's 'part' as a substring."""
        paddle_row = [_word("part(in)", 100, 10, 90, 20)]
        tess_row = [
            _word("part", 100, 12, 50, 18),
            _word("(in)", 155, 12, 40, 18),
        ]
        merged = _merge_row_sequences(paddle_row, tess_row)
        # 'part(in)' absorbs 'part'; '(in)' remains as a Tesseract extra.
        assert len(merged) == 2

    def test_low_conf_tesseract_dropped(self):
        """An unmatched Tesseract word with conf=15 is left out of the merge."""
        paddle_row = [_word("hello", 100, 10)]
        tess_row = [_word("noise", 10, 10, conf=15), _word("hello", 100, 12)]
        merged = _merge_row_sequences(paddle_row, tess_row)
        assert "noise" not in [w["text"] for w in merged]
        assert len(merged) == 1

    def test_real_world_row(self):
        """Real-data row: shared words appear once, the phonetic token stays."""
        paddle_row = [
            _word("take", 185, 287, 47, 29, conf=90),
            _word("part(in)", 238, 287, 94, 29, conf=90),
            _word("teilnehmen", 526, 282, 140, 35, conf=93),
            _word("More", 944, 287, 50, 29, conf=96),
            _word("than", 1003, 287, 50, 29, conf=96),
        ]
        tess_row = [
            _word("take", 188, 289, 52, 21, conf=96),
            _word("part", 249, 292, 48, 24, conf=96),
            _word("(in)", 305, 290, 38, 24, conf=93),
            _word("[teık", 352, 292, 47, 21, conf=90),
            _word("teilnehmen", 534, 290, 127, 21, conf=95),
            _word("More", 948, 292, 60, 20, conf=90),
            _word("than", 1017, 291, 49, 21, conf=96),
        ]
        merged = _merge_row_sequences(paddle_row, tess_row)
        merged_texts = [w["text"] for w in merged]
        # Each word found by both engines must appear exactly once.
        assert merged_texts.count("take") == 1
        assert merged_texts.count("More") == 1
        assert merged_texts.count("than") == 1
        assert merged_texts.count("teilnehmen") == 1
        # The phonetic fragment exists only in the Tesseract input.
        assert "[teık" in merged_texts
# ---------------------------------------------------------------------------
# _merge_paddle_tesseract (full pipeline)
# ---------------------------------------------------------------------------
class TestMergePaddleTesseract:
    """Tests for _merge_paddle_tesseract (the full multi-row merge)."""

    def test_same_words_deduplicated(self):
        """Two words reported by both engines produce two merged words."""
        paddle_words = [
            _word("apple", 50, 10, 70, 20, conf=90),
            _word("Apfel", 300, 10, 60, 20, conf=85),
        ]
        tess_words = [
            _word("apple", 52, 11, 68, 19, conf=75),
            _word("Apfel", 298, 12, 62, 18, conf=70),
        ]
        merged = _merge_paddle_tesseract(paddle_words, tess_words)
        assert len(merged) == 2
        assert sorted(w["text"] for w in merged) == ["Apfel", "apple"]

    def test_different_rows_not_cross_merged(self):
        """Words on different rows must not have their positions averaged."""
        paddle_words = [
            _word("row1word", 50, 50, 80, 20, conf=90),
            _word("row2word", 50, 100, 80, 20, conf=90),
        ]
        tess_words = [
            _word("row1word", 52, 52, 78, 18, conf=80),
            _word("row2word", 52, 102, 78, 18, conf=80),
        ]
        merged = _merge_paddle_tesseract(paddle_words, tess_words)
        assert len(merged) == 2
        upper_matches = [w for w in merged if w["text"] == "row1word"]
        lower_matches = [w for w in merged if w["text"] == "row2word"]
        upper = upper_matches[0]
        lower = lower_matches[0]
        # Inputs sit near y=50 and y=100; each result must stay near its own row.
        assert upper["top"] < 60
        assert lower["top"] > 90

    def test_tesseract_extra_symbols_added(self):
        """A '!' seen only by Tesseract is part of the merged output."""
        paddle_words = [_word("Betonung", 60, 10, 80, 20)]
        tess_words = [
            _word("!", 20, 10, 12, 20, conf=65),
            _word("Betonung", 60, 10, 80, 20, conf=50),
        ]
        merged = _merge_paddle_tesseract(paddle_words, tess_words)
        merged_texts = [w["text"] for w in merged]
        assert "!" in merged_texts
        assert "Betonung" in merged_texts
        assert len(merged) == 2

    def test_paddle_extra_words_added(self):
        """A word seen only by Paddle is part of the merged output."""
        paddle_words = [_word("extra", 10, 10), _word("word", 100, 10)]
        tess_words = [_word("word", 102, 12)]
        merged = _merge_paddle_tesseract(paddle_words, tess_words)
        assert len(merged) == 2

    def test_empty_paddle(self):
        """Without Paddle words, only the confident Tesseract word remains."""
        tess_words = [
            _word("a", 10, 10, conf=80),
            _word("b", 200, 200, conf=10),
        ]
        merged = _merge_paddle_tesseract([], tess_words)
        # Original note: "only conf >= 40" is kept. The exact cutoff lives in
        # ocr_pipeline_api and is not visible here; this test only pins that
        # conf=80 survives and conf=10 does not.
        assert len(merged) == 1

    def test_empty_tesseract(self):
        """Without Tesseract words, both Paddle words are returned."""
        paddle_words = [_word("a", 10, 10), _word("b", 200, 10)]
        merged = _merge_paddle_tesseract(paddle_words, [])
        assert len(merged) == 2

    def test_both_empty(self):
        """Two empty inputs merge to an empty list."""
        assert _merge_paddle_tesseract([], []) == []

    def test_multi_row_deduplication(self):
        """Two rows of two words each, seen by both engines, give four words."""
        paddle_words = [
            _word("cat", 50, 50, conf=90),
            _word("Katze", 200, 50, conf=85),
            _word("dog", 50, 100, conf=88),
            _word("Hund", 200, 100, conf=82),
        ]
        tess_words = [
            _word("cat", 52, 52, conf=75),
            _word("Katze", 198, 51, conf=70),
            _word("dog", 48, 101, conf=72),
            _word("Hund", 202, 102, conf=68),
        ]
        merged = _merge_paddle_tesseract(paddle_words, tess_words)
        assert len(merged) == 4
        assert sorted(w["text"] for w in merged) == ["Hund", "Katze", "cat", "dog"]
class TestMergeRealWorldRegression:
    """Regression coverage built from the data behind the doubled-words bug."""

    def test_full_page_no_duplicates(self):
        """Merging two engines' slightly offset words leaves no near-duplicates.

        Also checks that the Tesseract-only phonetic tokens survive and that
        the two rows keep clearly different vertical positions.
        """
        paddle_first_row = [
            _word("teilnehmen", 526, 282, 140, 35, conf=93),
            _word("take", 185, 287, 47, 29, conf=90),
            _word("part(in)", 238, 287, 94, 29, conf=90),
            _word("More", 944, 287, 50, 29, conf=96),
            _word("than", 1003, 287, 50, 29, conf=96),
            _word("200", 1063, 287, 38, 29, conf=96),
            _word("singers", 1110, 287, 88, 29, conf=96),
            _word("took", 1207, 287, 50, 29, conf=96),
            _word("part", 1266, 287, 50, 29, conf=96),
            _word("in", 1326, 287, 25, 29, conf=96),
            _word("the", 1360, 287, 38, 29, conf=96),
        ]
        paddle_second_row = [
            _word("be", 185, 365, 30, 29, conf=90),
            _word("good", 216, 365, 50, 29, conf=90),
            _word("at", 275, 365, 25, 29, conf=90),
            _word("sth.", 306, 365, 45, 29, conf=90),
        ]
        tess_first_row = [
            _word("take", 188, 289, 52, 21, conf=96),
            _word("part", 249, 292, 48, 24, conf=96),
            _word("(in)", 305, 290, 38, 24, conf=93),
            _word("teilnehmen", 534, 290, 127, 21, conf=95),
            _word("More", 948, 292, 60, 20, conf=90),
            _word("than", 1017, 291, 49, 21, conf=96),
            _word("200", 1076, 292, 43, 20, conf=93),
            _word("singers", 1128, 293, 75, 26, conf=93),
            _word("took", 1212, 291, 55, 22, conf=96),
            _word("part", 1276, 294, 47, 25, conf=96),
            _word("in", 1332, 292, 20, 20, conf=95),
            _word("the", 1361, 292, 36, 21, conf=95),
            _word("[teık", 352, 292, 47, 21, conf=90),
            _word("'pa:t]", 407, 292, 55, 23, conf=89),
        ]
        tess_second_row = [
            _word("be", 189, 369, 28, 21, conf=96),
            _word("good", 225, 369, 50, 21, conf=96),
            _word("at", 292, 371, 22, 21, conf=96),
            _word("sth.", 324, 369, 42, 21, conf=96),
        ]
        # Input order is preserved: first-row words, then second-row words.
        paddle = paddle_first_row + paddle_second_row
        tess = tess_first_row + tess_second_row

        merged = _merge_paddle_tesseract(paddle, tess)

        # Two results with the same text (ignoring case) count as
        # near-duplicates when their centres are closer than 30px
        # horizontally AND closer than 15px vertically.
        for i, w1 in enumerate(merged):
            for w2 in merged[i + 1:]:
                if w1["text"].lower() != w2["text"].lower():
                    continue
                cx1 = w1["left"] + w1.get("width", 0) / 2
                cx2 = w2["left"] + w2.get("width", 0) / 2
                cy1 = w1["top"] + w1.get("height", 0) / 2
                cy2 = w2["top"] + w2.get("height", 0) / 2
                dx = abs(cx1 - cx2)
                dy = abs(cy1 - cy2)
                assert dx >= 30 or dy >= 15, (
                    f"Near-duplicate: '{w1['text']}' at ({w1['left']},{w1['top']}) "
                    f"vs ({w2['left']},{w2['top']})"
                )

        # Phonetic tokens come from the Tesseract input only and must be kept.
        merged_texts = [w["text"] for w in merged]
        assert "[teık" in merged_texts
        assert "'pa:t]" in merged_texts

        # 'take' (row 1) and 'be' (row 2) must keep distinct vertical positions.
        be_matches = [w for w in merged if w["text"] == "be"]
        take_matches = [w for w in merged if w["text"] == "take"]
        be_word = be_matches[0]
        take_word = take_matches[0]
        assert abs(be_word["top"] - take_word["top"]) > 30, "Rows should stay separate"
class TestSpatialOverlapDedup:
    """Words occupying the same position are deduplicated even if text differs."""

    def test_same_position_different_text_deduplicated(self):
        """'hello' and 'helo' in nearly the same box collapse to one word.

        The two engines read the same physical word slightly differently;
        the result is expected to carry the Paddle text.
        """
        paddle = [_word("hello", 100, 50, 80, 20, conf=90)]
        tess = [_word("helo", 102, 52, 76, 18, conf=70)]
        merged = _merge_row_sequences(paddle, tess)
        assert len(merged) == 1, (
            f"Expected 1 word (deduped by overlap), got {len(merged)}: "
            f"{[w['text'] for w in merged]}"
        )
        # The Paddle word has the higher confidence (90 vs 70) in this input.
        assert merged[0]["text"] == "hello"

    def test_same_position_single_char_deduplicated(self):
        """A one-character 'a' and an 'a!' in nearly the same box merge to one."""
        paddle = [_word("a", 100, 50, 20, 20, conf=90)]
        tess = [_word("a!", 101, 51, 22, 19, conf=60)]
        assert len(_merge_row_sequences(paddle, tess)) == 1

    def test_no_overlap_different_words_kept(self):
        """Different words 200px apart are both kept."""
        paddle = [_word("cat", 100, 50, 50, 20, conf=90)]
        tess = [_word("dog", 300, 50, 50, 20, conf=70)]
        assert len(_merge_row_sequences(paddle, tess)) == 2

    def test_partial_overlap_below_threshold_kept(self):
        """Boxes overlapping by less than 50% are treated as separate words."""
        paddle = [_word("take", 100, 50, 60, 20, conf=90)]
        tess = [_word("part", 145, 50, 60, 20, conf=70)]
        merged = _merge_row_sequences(paddle, tess)
        # Boxes span 100-160 and 145-205: 15px shared / 60px min width = 25%,
        # which is below the 50% mark, so both words stay.
        assert len(merged) == 2
class TestSplitThenMerge:
    """End-to-end check: split multi-word Paddle boxes, then merge with Tesseract."""

    def test_multi_word_paddle_boxes_no_duplicates(self):
        """Phrase-sized Paddle boxes, once split, merge without near-duplicates."""
        # Paddle input in its real-world shape: one box per phrase.
        paddle_raw = [
            _word("take part(in) [teik'pa:t]", 185, 287, 281, 29, conf=90),
            _word("teilnehmen (an.mitmachen", 526, 282, 329, 35, conf=93),
            _word("More than 200 singers took part in the", 944, 287, 454, 29, conf=96),
        ]
        tess = [
            _word("take", 188, 289, 52, 21, conf=96),
            _word("part", 249, 292, 48, 24, conf=96),
            _word("(in)", 305, 290, 38, 24, conf=93),
            _word("[teık", 352, 292, 47, 21, conf=90),
            _word("'pa:t]", 407, 292, 55, 23, conf=89),
            _word("teilnehmen", 534, 290, 127, 21, conf=95),
            _word("(an),", 671, 291, 48, 23, conf=96),
            _word("mitmachen", 730, 290, 123, 22, conf=96),
            _word("More", 948, 292, 60, 20, conf=90),
            _word("than", 1017, 291, 49, 21, conf=96),
            _word("200", 1076, 292, 43, 20, conf=93),
            _word("singers", 1128, 293, 75, 26, conf=93),
            _word("took", 1212, 291, 55, 22, conf=96),
            _word("part", 1276, 294, 47, 25, conf=96),
            _word("in", 1332, 292, 20, 20, conf=95),
            _word("the", 1361, 292, 36, 21, conf=95),
        ]

        # Step 1: break the phrase boxes into individual words.
        paddle_split = _split_paddle_multi_words(paddle_raw)
        assert len(paddle_split) > len(paddle_raw), "Should have more words after split"

        # Step 2: merge the split Paddle words with the Tesseract words.
        merged = _merge_paddle_tesseract(paddle_split, tess)

        # Same-text results (ignoring case) must be at least 30px apart
        # horizontally or 15px apart vertically, measured centre to centre.
        for i, w1 in enumerate(merged):
            for w2 in merged[i + 1:]:
                if w1["text"].lower() != w2["text"].lower():
                    continue
                cx1 = w1["left"] + w1.get("width", 0) / 2
                cx2 = w2["left"] + w2.get("width", 0) / 2
                cy1 = w1["top"] + w1.get("height", 0) / 2
                cy2 = w2["top"] + w2.get("height", 0) / 2
                dx = abs(cx1 - cx2)
                dy = abs(cy1 - cy2)
                assert dx >= 30 or dy >= 15, (
                    f"Near-duplicate: '{w1['text']}' at ({w1['left']},{w1['top']}) "
                    f"vs ({w2['left']},{w2['top']})"
                )