Files
breakpilot-lehrer/klausur-service/backend/tests/test_paddle_kombi.py
Benjamin Admin a994ddee83
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 33s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m55s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 21s
feat: add Kombi-Vergleich mode for side-by-side Paddle vs RapidOCR comparison
Add /rapid-kombi backend endpoint using local RapidOCR + Tesseract merge,
KombiCompareStep component for parallel execution and side-by-side overlay,
and wordResultOverride prop on OverlayReconstruction for direct data injection.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-14 07:59:06 +01:00

564 lines
22 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Tests for the Kombi-Modus row-based sequence merge algorithm.
Functions under test (ocr_pipeline_api.py):
- _split_paddle_multi_words: Split multi-word PaddleOCR boxes into individual words
- _group_words_into_rows: Cluster words by Y-position into rows
- _merge_row_sequences: Merge two word sequences within the same row
- _merge_paddle_tesseract: Full merge with row matching + sequence dedup
"""
import pytest
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from ocr_pipeline_api import (
_split_paddle_multi_words,
_group_words_into_rows,
_merge_row_sequences,
_merge_paddle_tesseract,
)
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _word(text: str, left: int, top: int, width: int = 60, height: int = 20, conf: int = 80):
    """Build a synthetic OCR word box for use as test input.

    Returns a plain dict whose keys ``text``, ``left``, ``top``, ``width``,
    ``height`` and ``conf`` carry the corresponding arguments unchanged.
    """
    return dict(
        text=text,
        left=left,
        top=top,
        width=width,
        height=height,
        conf=conf,
    )
# ---------------------------------------------------------------------------
# _split_paddle_multi_words
# ---------------------------------------------------------------------------
class TestSplitPaddleMultiWords:
    """Expectations for ``_split_paddle_multi_words`` on synthetic boxes."""

    def test_single_word_unchanged(self):
        """A box holding a single word yields exactly that one word."""
        boxes = [_word("hello", 100, 50, 80, 20)]
        pieces = _split_paddle_multi_words(boxes)
        assert len(pieces) == 1
        assert pieces[0]["text"] == "hello"

    def test_multi_word_split(self):
        """'More than 200' in one box is expected to become 3 words."""
        boxes = [_word("More than 200", 100, 50, 300, 20)]
        pieces = _split_paddle_multi_words(boxes)
        assert len(pieces) == 3
        first, second, third = pieces
        assert first["text"] == "More"
        assert second["text"] == "than"
        assert third["text"] == "200"
        # The source box spans x = 100 .. 400; pieces must stay inside it
        # (5 px of slack on the right edge to allow for rounding).
        assert first["left"] >= 100
        assert third["left"] + third["width"] <= 400 + 5

    def test_exclamation_split(self):
        """'!Betonung' is expected to split into '!' and 'Betonung'."""
        boxes = [_word("!Betonung", 100, 50, 120, 20)]
        pieces = _split_paddle_multi_words(boxes)
        assert len(pieces) == 2
        marker, stem = pieces
        assert marker["text"] == "!"
        assert stem["text"] == "Betonung"

    def test_ipa_bracket_split(self):
        """'badge[bxd3]' is expected to split into 'badge' and '[bxd3]'."""
        boxes = [_word("badge[bxd3]", 100, 50, 150, 20)]
        pieces = _split_paddle_multi_words(boxes)
        assert len(pieces) == 2
        headword, phonetic = pieces
        assert headword["text"] == "badge"
        assert phonetic["text"] == "[bxd3]"

    def test_long_phrase(self):
        """An eight-word phrase in one box is expected to yield 8 words in order."""
        boxes = [_word("More than 200 singers took part in the", 944, 287, 454, 29)]
        pieces = _split_paddle_multi_words(boxes)
        assert len(pieces) == 8
        expected = ["More", "than", "200", "singers", "took", "part", "in", "the"]
        assert [piece["text"] for piece in pieces] == expected

    def test_empty_input(self):
        """An empty word list maps to an empty list."""
        assert _split_paddle_multi_words([]) == []

    def test_preserves_top_and_height(self):
        """Every piece keeps the vertical geometry of the box it came from."""
        boxes = [_word("a b", 100, 50, 200, 25)]
        for piece in _split_paddle_multi_words(boxes):
            assert piece["top"] == 50
            assert piece["height"] == 25
# ---------------------------------------------------------------------------
# _group_words_into_rows
# ---------------------------------------------------------------------------
class TestGroupWordsIntoRows:
    """Expectations for ``_group_words_into_rows`` (Y-position clustering)."""

    def test_single_row(self):
        """Three words with nearly equal tops end up in one left-to-right row."""
        boxes = [_word("a", 10, 50), _word("b", 100, 52), _word("c", 200, 48)]
        grouped = _group_words_into_rows(boxes)
        assert len(grouped) == 1
        only_row = grouped[0]
        assert len(only_row) == 3
        # Leftmost and rightmost entries confirm the horizontal ordering.
        assert only_row[0]["text"] == "a"
        assert only_row[2]["text"] == "c"

    def test_two_rows(self):
        """Words around y=50 and y=100 are expected to form two rows."""
        boxes = [
            _word("a", 10, 50), _word("b", 100, 52),
            _word("c", 10, 100), _word("d", 100, 102),
        ]
        grouped = _group_words_into_rows(boxes)
        assert len(grouped) == 2
        upper, lower = grouped
        assert [box["text"] for box in upper] == ["a", "b"]
        assert [box["text"] for box in lower] == ["c", "d"]

    def test_empty(self):
        """No words in, no rows out."""
        assert _group_words_into_rows([]) == []

    def test_different_heights_same_row(self):
        """Paddle (h=29) and Tesseract (h=21) boxes at similar Y share a row."""
        tall_box = _word("take", 100, 287, 47, 29)   # center_y = 301.5
        short_box = _word("take", 103, 289, 52, 21)  # center_y = 299.5
        grouped = _group_words_into_rows([tall_box, short_box])
        # Both boxes must land in one row rather than two.
        assert len(grouped) == 1

    def test_close_rows_separated(self):
        """Two rows roughly 30px apart are expected to stay separate."""
        upper_box = _word("a", 10, 50, height=20)  # center_y = 60
        lower_box = _word("b", 10, 85, height=20)  # center_y = 95
        grouped = _group_words_into_rows([upper_box, lower_box])
        assert len(grouped) == 2
# ---------------------------------------------------------------------------
# _merge_row_sequences
# ---------------------------------------------------------------------------
class TestMergeRowSequences:
    """Expectations for ``_merge_row_sequences`` within a single row."""

    def test_identical_sequences_deduplicated(self):
        """The same words from both engines appear only once each."""
        paddle_row = [_word("apple", 50, 10), _word("Apfel", 200, 10)]
        tess_row = [_word("apple", 52, 12), _word("Apfel", 198, 11)]
        combined = _merge_row_sequences(paddle_row, tess_row)
        assert len(combined) == 2
        left_word, right_word = combined
        assert left_word["text"] == "apple"
        assert right_word["text"] == "Apfel"

    def test_tesseract_extra_symbol(self):
        """A '!' seen only by Tesseract is expected to be included."""
        paddle_row = [_word("Betonung", 60, 10)]
        tess_row = [_word("!", 20, 10, 12, 20, conf=70), _word("Betonung", 60, 10)]
        combined = _merge_row_sequences(paddle_row, tess_row)
        seen = [entry["text"] for entry in combined]
        assert "!" in seen
        assert "Betonung" in seen
        assert len(combined) == 2

    def test_paddle_extra_word(self):
        """A word seen only by Paddle is expected to be included."""
        paddle_row = [_word("!", 20, 10, 12, 20), _word("word", 60, 10)]
        tess_row = [_word("word", 62, 12)]
        combined = _merge_row_sequences(paddle_row, tess_row)
        assert len(combined) == 2

    def test_coordinates_averaged(self):
        """Matched words get coordinates averaged, weighted by confidence."""
        paddle_row = [_word("hello", 100, 50, 80, 20, conf=90)]
        tess_row = [_word("hello", 110, 55, 70, 18, conf=60)]
        combined = _merge_row_sequences(paddle_row, tess_row)
        assert len(combined) == 1
        (fused,) = combined
        assert fused["text"] == "hello"
        # (100*90 + 110*60) / 150 = 104
        assert fused["left"] == 104
        assert fused["conf"] == 90

    def test_empty_paddle_row(self):
        """With no Paddle words, the single Tesseract word is returned."""
        tess_row = [_word("a", 10, 10, conf=80)]
        combined = _merge_row_sequences([], tess_row)
        assert len(combined) == 1

    def test_empty_tess_row(self):
        """With no Tesseract words, the single Paddle word is returned."""
        paddle_row = [_word("a", 10, 10)]
        combined = _merge_row_sequences(paddle_row, [])
        assert len(combined) == 1

    def test_both_empty(self):
        """Two empty rows merge to an empty list."""
        assert _merge_row_sequences([], []) == []

    def test_substring_match(self):
        """Paddle's 'part(in)' is expected to match Tesseract's 'part' (substring)."""
        paddle_row = [_word("part(in)", 100, 10, 90, 20)]
        tess_row = [_word("part", 100, 12, 50, 18), _word("(in)", 155, 12, 40, 18)]
        combined = _merge_row_sequences(paddle_row, tess_row)
        # 'part(in)' pairs with 'part'; '(in)' then remains as a Tesseract extra.
        assert len(combined) == 2

    def test_low_conf_tesseract_dropped(self):
        """Unmatched Tesseract words with conf < 30 are expected to be dropped."""
        paddle_row = [_word("hello", 100, 10)]
        tess_row = [_word("noise", 10, 10, conf=15), _word("hello", 100, 12)]
        combined = _merge_row_sequences(paddle_row, tess_row)
        seen = [entry["text"] for entry in combined]
        assert "noise" not in seen
        assert len(combined) == 1

    def test_real_world_row(self):
        """Real data: both engines find 'take part teilnehmen More than'."""
        paddle_row = [
            _word("take", 185, 287, 47, 29, conf=90),
            _word("part(in)", 238, 287, 94, 29, conf=90),
            _word("teilnehmen", 526, 282, 140, 35, conf=93),
            _word("More", 944, 287, 50, 29, conf=96),
            _word("than", 1003, 287, 50, 29, conf=96),
        ]
        tess_row = [
            _word("take", 188, 289, 52, 21, conf=96),
            _word("part", 249, 292, 48, 24, conf=96),
            _word("(in)", 305, 290, 38, 24, conf=93),
            _word("[teık", 352, 292, 47, 21, conf=90),
            _word("teilnehmen", 534, 290, 127, 21, conf=95),
            _word("More", 948, 292, 60, 20, conf=90),
            _word("than", 1017, 291, 49, 21, conf=96),
        ]
        combined = _merge_row_sequences(paddle_row, tess_row)
        seen = [entry["text"] for entry in combined]
        # Words found by both engines must appear exactly once.
        for shared in ("take", "More", "than", "teilnehmen"):
            assert seen.count(shared) == 1
        # The phonetic fragment exists only in the Tesseract input and must survive.
        assert "[teık" in seen
# ---------------------------------------------------------------------------
# _merge_paddle_tesseract (full pipeline)
# ---------------------------------------------------------------------------
class TestMergePaddleTesseract:
    """Expectations for ``_merge_paddle_tesseract`` (row matching + dedup)."""

    def test_same_words_deduplicated(self):
        """Words found by both engines must not be duplicated."""
        paddle_words = [
            _word("apple", 50, 10, 70, 20, conf=90),
            _word("Apfel", 300, 10, 60, 20, conf=85),
        ]
        tess_words = [
            _word("apple", 52, 11, 68, 19, conf=75),
            _word("Apfel", 298, 12, 62, 18, conf=70),
        ]
        combined = _merge_paddle_tesseract(paddle_words, tess_words)
        assert len(combined) == 2
        ordered_texts = sorted(entry["text"] for entry in combined)
        assert ordered_texts == ["Apfel", "apple"]

    def test_different_rows_not_cross_merged(self):
        """Words from different rows must NOT be averaged together."""
        paddle_words = [
            _word("row1word", 50, 50, 80, 20, conf=90),
            _word("row2word", 50, 100, 80, 20, conf=90),
        ]
        tess_words = [
            _word("row1word", 52, 52, 78, 18, conf=80),
            _word("row2word", 52, 102, 78, 18, conf=80),
        ]
        combined = _merge_paddle_tesseract(paddle_words, tess_words)
        assert len(combined) == 2
        upper = [entry for entry in combined if entry["text"] == "row1word"][0]
        lower = [entry for entry in combined if entry["text"] == "row2word"][0]
        # Each word has to remain near its own row instead of drifting
        # towards an average of y=50 and y=100.
        assert upper["top"] < 60
        assert lower["top"] > 90

    def test_tesseract_extra_symbols_added(self):
        """Symbols found only by Tesseract are expected in the result."""
        paddle_words = [_word("Betonung", 60, 10, 80, 20)]
        tess_words = [
            _word("!", 20, 10, 12, 20, conf=65),
            _word("Betonung", 60, 10, 80, 20, conf=50),
        ]
        combined = _merge_paddle_tesseract(paddle_words, tess_words)
        seen = [entry["text"] for entry in combined]
        assert "!" in seen
        assert "Betonung" in seen
        assert len(combined) == 2

    def test_paddle_extra_words_added(self):
        """Words found only by Paddle are expected in the result."""
        paddle_words = [_word("extra", 10, 10), _word("word", 100, 10)]
        tess_words = [_word("word", 102, 12)]
        combined = _merge_paddle_tesseract(paddle_words, tess_words)
        assert len(combined) == 2

    def test_empty_paddle(self):
        """With no Paddle words, only the confident Tesseract word remains."""
        tess_words = [_word("a", 10, 10, conf=80), _word("b", 200, 200, conf=10)]
        combined = _merge_paddle_tesseract([], tess_words)
        # The conf=10 word is expected to be filtered out. The original note
        # says "only conf >= 40"; the exact cutoff lives in the implementation
        # and is not pinned by this test.
        assert len(combined) == 1

    def test_empty_tesseract(self):
        """With no Tesseract words, both Paddle words are returned."""
        paddle_words = [_word("a", 10, 10), _word("b", 200, 10)]
        combined = _merge_paddle_tesseract(paddle_words, [])
        assert len(combined) == 2

    def test_both_empty(self):
        """Two empty inputs merge to an empty list."""
        assert _merge_paddle_tesseract([], []) == []

    def test_multi_row_deduplication(self):
        """Two rows seen by both engines collapse to one copy per word."""
        paddle_words = [
            _word("cat", 50, 50, conf=90),
            _word("Katze", 200, 50, conf=85),
            _word("dog", 50, 100, conf=88),
            _word("Hund", 200, 100, conf=82),
        ]
        tess_words = [
            _word("cat", 52, 52, conf=75),
            _word("Katze", 198, 51, conf=70),
            _word("dog", 48, 101, conf=72),
            _word("Hund", 202, 102, conf=68),
        ]
        combined = _merge_paddle_tesseract(paddle_words, tess_words)
        assert len(combined) == 4
        ordered_texts = sorted(entry["text"] for entry in combined)
        assert ordered_texts == ["Hund", "Katze", "cat", "dog"]
class TestMergeRealWorldRegression:
    """Regression coverage built from the data behind the doubled-words bug."""

    def test_full_page_no_duplicates(self):
        """Both engines report the same words at slightly shifted positions.

        The merged output must not contain near-duplicate words, must keep
        the Tesseract-only phonetic fragments, and must keep the two rows
        vertically apart.
        """
        paddle_words = [
            _word("teilnehmen", 526, 282, 140, 35, conf=93),
            _word("take", 185, 287, 47, 29, conf=90),
            _word("part(in)", 238, 287, 94, 29, conf=90),
            _word("More", 944, 287, 50, 29, conf=96),
            _word("than", 1003, 287, 50, 29, conf=96),
            _word("200", 1063, 287, 38, 29, conf=96),
            _word("singers", 1110, 287, 88, 29, conf=96),
            _word("took", 1207, 287, 50, 29, conf=96),
            _word("part", 1266, 287, 50, 29, conf=96),
            _word("in", 1326, 287, 25, 29, conf=96),
            _word("the", 1360, 287, 38, 29, conf=96),
            # Second row
            _word("be", 185, 365, 30, 29, conf=90),
            _word("good", 216, 365, 50, 29, conf=90),
            _word("at", 275, 365, 25, 29, conf=90),
            _word("sth.", 306, 365, 45, 29, conf=90),
        ]
        tess_words = [
            _word("take", 188, 289, 52, 21, conf=96),
            _word("part", 249, 292, 48, 24, conf=96),
            _word("(in)", 305, 290, 38, 24, conf=93),
            _word("teilnehmen", 534, 290, 127, 21, conf=95),
            _word("More", 948, 292, 60, 20, conf=90),
            _word("than", 1017, 291, 49, 21, conf=96),
            _word("200", 1076, 292, 43, 20, conf=93),
            _word("singers", 1128, 293, 75, 26, conf=93),
            _word("took", 1212, 291, 55, 22, conf=96),
            _word("part", 1276, 294, 47, 25, conf=96),
            _word("in", 1332, 292, 20, 20, conf=95),
            _word("the", 1361, 292, 36, 21, conf=95),
            _word("[teık", 352, 292, 47, 21, conf=90),
            _word("'pa:t]", 407, 292, 55, 23, conf=89),
            # Second row
            _word("be", 189, 369, 28, 21, conf=96),
            _word("good", 225, 369, 50, 21, conf=96),
            _word("at", 292, 371, 22, 21, conf=96),
            _word("sth.", 324, 369, 42, 21, conf=96),
        ]
        combined = _merge_paddle_tesseract(paddle_words, tess_words)

        # Near-duplicate scan: two entries with the same text (ignoring case)
        # must have centers at least 30px apart horizontally or 15px apart
        # vertically.
        for idx, first in enumerate(combined):
            for second in combined[idx + 1:]:
                if first["text"].lower() != second["text"].lower():
                    continue
                first_cx = first["left"] + first.get("width", 0) / 2
                second_cx = second["left"] + second.get("width", 0) / 2
                first_cy = first["top"] + first.get("height", 0) / 2
                second_cy = second["top"] + second.get("height", 0) / 2
                far_apart_x = abs(first_cx - second_cx) >= 30
                far_apart_y = abs(first_cy - second_cy) >= 15
                assert far_apart_x or far_apart_y, (
                    f"Near-duplicate: '{first['text']}' at ({first['left']},{first['top']}) "
                    f"vs ({second['left']},{second['top']})"
                )

        # Phonetic fragments exist only in the Tesseract input and must survive.
        seen = [entry["text"] for entry in combined]
        assert "[teık" in seen
        assert "'pa:t]" in seen

        # A first-row word and a second-row word must keep distinct Y positions.
        second_row_word = [entry for entry in combined if entry["text"] == "be"][0]
        first_row_word = [entry for entry in combined if entry["text"] == "take"][0]
        row_gap = abs(second_row_word["top"] - first_row_word["top"])
        assert row_gap > 30, "Rows should stay separate"
class TestSpatialOverlapDedup:
    """Words occupying the same position are deduplicated even if text differs."""

    def test_same_position_different_text_deduplicated(self):
        """Both engines see one physical word but read it slightly differently.

        Spatial overlap is expected to identify the pair as a duplicate.
        """
        paddle_row = [_word("hello", 100, 50, 80, 20, conf=90)]
        tess_row = [_word("helo", 102, 52, 76, 18, conf=70)]
        combined = _merge_row_sequences(paddle_row, tess_row)
        assert len(combined) == 1, (
            f"Expected 1 word (deduped by overlap), got {len(combined)}: "
            f"{[w['text'] for w in combined]}"
        )
        # The Paddle reading carries the higher confidence and should win.
        assert combined[0]["text"] == "hello"

    def test_same_position_single_char_deduplicated(self):
        """Very short words at the same position are deduplicated via overlap.

        Paddle reports the single character 'a'; Tesseract reports 'a!' in
        almost the same box.
        """
        paddle_row = [_word("a", 100, 50, 20, 20, conf=90)]
        tess_row = [_word("a!", 101, 51, 22, 19, conf=60)]
        combined = _merge_row_sequences(paddle_row, tess_row)
        assert len(combined) == 1

    def test_no_overlap_different_words_kept(self):
        """Different words at clearly different positions are both kept."""
        paddle_row = [_word("cat", 100, 50, 50, 20, conf=90)]
        tess_row = [_word("dog", 300, 50, 50, 20, conf=70)]
        combined = _merge_row_sequences(paddle_row, tess_row)
        assert len(combined) == 2

    def test_partial_overlap_below_threshold_kept(self):
        """Words overlapping by less than 50% count as different and are kept."""
        paddle_row = [_word("take", 100, 50, 60, 20, conf=90)]
        tess_row = [_word("part", 145, 50, 60, 20, conf=70)]
        combined = _merge_row_sequences(paddle_row, tess_row)
        # 15px overlap / 60px min width = 25% < 50% → kept as separate
        assert len(combined) == 2
class TestRapidOcrMergeCompatibility:
    """Test that _merge_paddle_tesseract works with RapidOCR word format.

    RapidOCR words include an extra 'region_type' key that PaddleOCR words
    don't have. The merge logic must tolerate this extra field.
    """

    def _rapid_word(self, text, left, top, width=60, height=20, conf=80, region_type="full_page"):
        """Create a word dict in RapidOCR format (has region_type).

        Same keys as the module-level ``_word`` helper plus ``region_type``.
        Does not read any instance state.
        """
        return {
            "text": text,
            "left": left,
            "top": top,
            "width": width,
            "height": height,
            "conf": conf,
            "region_type": region_type,
        }

    def test_rapid_words_merge_with_tesseract(self):
        """RapidOCR words (with region_type) merge correctly with Tesseract words."""
        rapid = [
            self._rapid_word("apple", 50, 10, 70, 20, conf=90),
            self._rapid_word("Apfel", 300, 10, 60, 20, conf=85),
        ]
        tess = [
            _word("apple", 52, 11, 68, 19, conf=75),
            _word("Apfel", 298, 12, 62, 18, conf=70),
        ]
        merged = _merge_paddle_tesseract(rapid, tess)
        assert len(merged) == 2
        texts = sorted(w["text"] for w in merged)
        assert texts == ["Apfel", "apple"]

    def test_rapid_words_split_then_merge(self):
        """Split + merge works with RapidOCR multi-word boxes."""
        rapid_raw = [
            self._rapid_word("More than 200", 944, 287, 160, 29, conf=96),
        ]
        tess = [
            _word("More", 948, 292, 60, 20, conf=90),
            _word("than", 1017, 291, 49, 21, conf=96),
            _word("200", 1076, 292, 43, 20, conf=93),
        ]
        rapid_split = _split_paddle_multi_words(rapid_raw)
        assert len(rapid_split) == 3
        merged = _merge_paddle_tesseract(rapid_split, tess)
        texts = [w["text"] for w in merged]
        # Each word was seen by both engines and must appear exactly once.
        assert texts.count("More") == 1
        assert texts.count("than") == 1
        assert texts.count("200") == 1

    def test_region_type_preserved_in_unmatched(self):
        """Unmatched RapidOCR words keep their region_type field."""
        rapid = [self._rapid_word("unique", 500, 10, 80, 20, conf=90)]
        tess = []  # No Tesseract words
        merged = _merge_paddle_tesseract(rapid, tess)
        assert len(merged) == 1
        assert merged[0]["text"] == "unique"
        # The property named by this test: the extra key must survive the
        # merge. ``.get`` turns a dropped key into an assertion failure
        # instead of a KeyError.
        assert merged[0].get("region_type") == "full_page"
class TestSplitThenMerge:
    """End-to-end check: split multi-word Paddle boxes, then merge."""

    def test_multi_word_paddle_boxes_no_duplicates(self):
        """PaddleOCR returns whole phrases as single boxes.

        After splitting those boxes, merging with the Tesseract words must
        not leave near-duplicates behind.
        """
        # Phrase-level boxes as Paddle reports them in practice.
        phrase_boxes = [
            _word("take part(in) [teik'pa:t]", 185, 287, 281, 29, conf=90),
            _word("teilnehmen (an.mitmachen", 526, 282, 329, 35, conf=93),
            _word("More than 200 singers took part in the", 944, 287, 454, 29, conf=96),
        ]
        tess_words = [
            _word("take", 188, 289, 52, 21, conf=96),
            _word("part", 249, 292, 48, 24, conf=96),
            _word("(in)", 305, 290, 38, 24, conf=93),
            _word("[teık", 352, 292, 47, 21, conf=90),
            _word("'pa:t]", 407, 292, 55, 23, conf=89),
            _word("teilnehmen", 534, 290, 127, 21, conf=95),
            _word("(an),", 671, 291, 48, 23, conf=96),
            _word("mitmachen", 730, 290, 123, 22, conf=96),
            _word("More", 948, 292, 60, 20, conf=90),
            _word("than", 1017, 291, 49, 21, conf=96),
            _word("200", 1076, 292, 43, 20, conf=93),
            _word("singers", 1128, 293, 75, 26, conf=93),
            _word("took", 1212, 291, 55, 22, conf=96),
            _word("part", 1276, 294, 47, 25, conf=96),
            _word("in", 1332, 292, 20, 20, conf=95),
            _word("the", 1361, 292, 36, 21, conf=95),
        ]

        # Step 1: break the phrase boxes into word boxes.
        word_boxes = _split_paddle_multi_words(phrase_boxes)
        assert len(word_boxes) > len(phrase_boxes), "Should have more words after split"

        # Step 2: merge the split Paddle words with the Tesseract words.
        combined = _merge_paddle_tesseract(word_boxes, tess_words)

        # Step 3: two entries with the same text (ignoring case) must have
        # centers at least 30px apart horizontally or 15px apart vertically.
        for idx, first in enumerate(combined):
            for second in combined[idx + 1:]:
                if first["text"].lower() != second["text"].lower():
                    continue
                first_cx = first["left"] + first.get("width", 0) / 2
                second_cx = second["left"] + second.get("width", 0) / 2
                first_cy = first["top"] + first.get("height", 0) / 2
                second_cy = second["top"] + second.get("height", 0) / 2
                far_apart_x = abs(first_cx - second_cx) >= 30
                far_apart_y = abs(first_cy - second_cy) >= 15
                assert far_apart_x or far_apart_y, (
                    f"Near-duplicate: '{first['text']}' at ({first['left']},{first['top']}) "
                    f"vs ({second['left']},{second['top']})"
                )