Files
breakpilot-lehrer/klausur-service/backend/tests/test_paddle_kombi.py
Benjamin Admin 4f2fb0e94c
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 2m13s
CI / test-python-agent-core (push) Successful in 19s
CI / test-nodejs-website (push) Successful in 22s
fix: Kombi-Modus merge now deduplicates same words from both engines
The merge algorithm now uses 3 criteria instead of just IoU > 0.3:
1. IoU > 0.15 (relaxed threshold)
2. Center proximity < word height AND same row
3. Text similarity > 0.7 AND same row

This prevents doubled overlapping words when both PaddleOCR and
Tesseract find the same word at similar positions. Unique words
from either engine (e.g. bullets from Tesseract) are still added.

Tests expanded: 19 → 37 (added _box_center_dist, _text_similarity,
_words_match tests + deduplication regression test).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-13 08:11:31 +01:00

335 lines
12 KiB
Python

"""Tests for the Kombi-Modus merge algorithm.
Functions under test (ocr_pipeline_api.py):
- _box_iou: IoU between two word boxes
- _box_center_dist: Euclidean distance between box centers
- _text_similarity: Simple text similarity (0-1)
- _words_match: Multi-criteria match (IoU + center + text)
- _merge_paddle_tesseract: Merge PaddleOCR + Tesseract word lists
"""
import pytest
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from ocr_pipeline_api import (
_box_iou,
_box_center_dist,
_text_similarity,
_words_match,
_merge_paddle_tesseract,
)
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _word(text: str, left: int, top: int, width: int = 60, height: int = 20, conf: int = 80):
"""Create a synthetic word dict."""
return {
"text": text,
"left": left,
"top": top,
"width": width,
"height": height,
"conf": conf,
}
# ---------------------------------------------------------------------------
# _box_iou
# ---------------------------------------------------------------------------
class TestBoxIoU:
    """Intersection-over-union checks for ``_box_iou`` on synthetic boxes."""

    def test_identical_boxes(self):
        """A box compared against itself yields an IoU of 1."""
        box = _word("hello", left=10, top=10, width=100, height=20)
        assert _box_iou(box, box) == pytest.approx(1.0)

    def test_no_overlap(self):
        """Boxes that are far apart on both axes share no area."""
        near_origin = _word("a", left=0, top=0, width=50, height=20)
        far_away = _word("b", left=200, top=200, width=50, height=20)
        assert _box_iou(near_origin, far_away) == 0.0

    def test_partial_overlap(self):
        """Two 100x20 boxes shifted horizontally by half their width."""
        first = _word("a", left=0, top=0, width=100, height=20)
        second = _word("b", left=50, top=0, width=100, height=20)
        shared_area = 1000  # 50 px wide overlap, 20 px tall
        union_area = 3000  # 2000 + 2000 - 1000
        assert _box_iou(first, second) == pytest.approx(shared_area / union_area, abs=0.01)

    def test_contained_box(self):
        """A small box lying entirely inside a larger one."""
        outer = _word("big", left=0, top=0, width=200, height=40)
        inner = _word("small", left=50, top=10, width=30, height=10)
        inner_area = 300  # the intersection is the whole inner box
        outer_area = 8000  # the union is the whole outer box
        assert _box_iou(outer, inner) == pytest.approx(inner_area / outer_area, abs=0.01)

    def test_touching_edges(self):
        """Boxes that only share an edge are expected to have zero IoU."""
        left_box = _word("a", left=0, top=0, width=50, height=20)
        right_box = _word("b", left=50, top=0, width=50, height=20)
        assert _box_iou(left_box, right_box) == 0.0

    def test_zero_area_box(self):
        """A degenerate 0x0 box is expected to give an IoU of zero."""
        degenerate = _word("a", left=10, top=10, width=0, height=0)
        regular = _word("b", left=10, top=10, width=50, height=20)
        assert _box_iou(degenerate, regular) == 0.0
# ---------------------------------------------------------------------------
# _box_center_dist
# ---------------------------------------------------------------------------
class TestBoxCenterDist:
    """Checks for ``_box_center_dist``, the distance between box centers."""

    def test_same_center(self):
        """A box is at distance zero from itself."""
        box = _word("a", left=100, top=50, width=60, height=20)
        assert _box_center_dist(box, box) == 0.0

    def test_horizontal_offset(self):
        """Equal-sized boxes shifted 10 px sideways are 10 px apart."""
        reference = _word("a", left=100, top=50, width=60, height=20)
        shifted = _word("b", left=110, top=50, width=60, height=20)
        assert _box_center_dist(reference, shifted) == pytest.approx(10.0)

    def test_diagonal(self):
        """Centers (10, 10) and (30, 30) are 20 px apart on each axis."""
        upper_left = _word("a", left=0, top=0, width=20, height=20)
        lower_right = _word("b", left=20, top=20, width=20, height=20)
        delta_x = 20
        delta_y = 20
        expected = (delta_x**2 + delta_y**2) ** 0.5
        assert _box_center_dist(upper_left, lower_right) == pytest.approx(expected, abs=0.1)
# ---------------------------------------------------------------------------
# _text_similarity
# ---------------------------------------------------------------------------
class TestTextSimilarity:
    """Checks for ``_text_similarity``, which scores two strings in [0, 1].

    Non-zero expectations are compared with ``pytest.approx`` so the tests
    do not depend on how the implementation arrives at the float (0.8 is not
    exactly representable in binary floating point). This matches the style
    already used by the IoU tests in this module.
    """

    def test_identical(self):
        """Identical strings score 1."""
        assert _text_similarity("hello", "hello") == pytest.approx(1.0)

    def test_case_insensitive(self):
        """Strings differing only in letter case score 1."""
        assert _text_similarity("Hello", "hello") == pytest.approx(1.0)

    def test_substring(self):
        """One is substring of other (e.g. '!Betonung' vs 'Betonung')."""
        assert _text_similarity("!Betonung", "Betonung") == pytest.approx(0.8)

    def test_completely_different(self):
        """Strings with no characters in common score 0."""
        assert _text_similarity("abc", "xyz") == 0.0

    def test_empty_strings(self):
        """An empty string on either side (or both) scores 0."""
        assert _text_similarity("", "hello") == 0.0
        assert _text_similarity("", "") == 0.0

    def test_partial_overlap(self):
        """Some shared characters give a score strictly between 0 and 1."""
        sim = _text_similarity("apple", "ape")
        assert 0.0 < sim < 1.0
# ---------------------------------------------------------------------------
# _words_match
# ---------------------------------------------------------------------------
class TestWordsMatch:
    """Checks for ``_words_match``, the multi-criteria word matcher."""

    def test_high_iou_matches(self):
        """IoU > 0.15 is sufficient for a match."""
        first = _word("hello", left=100, top=50, width=80, height=20)
        second = _word("hello", left=105, top=50, width=80, height=20)
        matched = _words_match(first, second)
        assert matched is True

    def test_same_text_same_row_matches(self):
        """Same text on the same row is expected to match."""
        # NOTE(review): by the usual IoU formula these two boxes overlap at
        # roughly 0.46, above the 0.15 threshold named in
        # test_high_iou_matches, so this case may not isolate the text
        # criterion on its own -- confirm against the implementation.
        first = _word("Betonung", left=100, top=50, width=80, height=20)
        second = _word("Betonung", left=130, top=52, width=70, height=18)
        matched = _words_match(first, second)
        assert matched is True

    def test_close_centers_same_row_matches(self):
        """Nearby centers on the same row are expected to match."""
        # NOTE(review): these boxes also overlap (standard IoU is roughly
        # 0.40), so the IoU criterion alone may already accept them --
        # confirm this really exercises the center-proximity rule.
        first = _word("x", left=100, top=50, width=40, height=20)
        second = _word("y", left=110, top=52, width=50, height=22)
        matched = _words_match(first, second)
        assert matched is True

    def test_different_rows_no_match(self):
        """Words on different rows don't match even with same text."""
        upper = _word("hello", left=100, top=50, width=80, height=20)
        lower = _word("hello", left=100, top=200, width=80, height=20)
        matched = _words_match(upper, lower)
        assert matched is False

    def test_far_apart_same_row_different_text(self):
        """Different text far apart on same row: no match."""
        left_word = _word("cat", left=10, top=50, width=40, height=20)
        right_word = _word("dog", left=400, top=50, width=40, height=20)
        matched = _words_match(left_word, right_word)
        assert matched is False

    def test_no_overlap_no_proximity_no_text(self):
        """Completely different words far apart: no match."""
        near_origin = _word("abc", left=0, top=0, width=50, height=20)
        far_away = _word("xyz", left=500, top=500, width=50, height=20)
        matched = _words_match(near_origin, far_away)
        assert matched is False
# ---------------------------------------------------------------------------
# _merge_paddle_tesseract
# ---------------------------------------------------------------------------
class TestMergePaddleTesseract:
    """Checks for ``_merge_paddle_tesseract`` on synthetic word lists.

    In every test the first argument is the PaddleOCR word list and the
    second is the Tesseract word list.
    """

    def test_perfect_match_averages_coords(self):
        """Same word at nearly the same position: one merged entry."""
        paddle_words = [_word("hello", left=100, top=50, width=80, height=20, conf=90)]
        tess_words = [_word("hello", left=110, top=55, width=70, height=18, conf=60)]
        merged = _merge_paddle_tesseract(paddle_words, tess_words)
        assert len(merged) == 1
        only = merged[0]
        assert only["text"] == "hello"
        # Confidence-weighted mean: (100*90 + 110*60) / 150
        assert only["left"] == 104
        assert only["conf"] == 90

    def test_same_word_slightly_offset_merges(self):
        """Same word with a slight offset still collapses to one entry."""
        # NOTE(review): the standard IoU of these boxes is roughly 0.72, so
        # the merge may be decided by overlap rather than by center
        # proximity -- confirm which criterion this is meant to cover.
        paddle_words = [_word("Betonung", left=100, top=50, width=90, height=22, conf=85)]
        tess_words = [_word("Betonung", left=115, top=52, width=80, height=20, conf=60)]
        merged = _merge_paddle_tesseract(paddle_words, tess_words)
        assert len(merged) == 1
        assert merged[0]["text"] == "Betonung"

    def test_truly_different_words_kept_separate(self):
        """Non-overlapping different words: both kept."""
        paddle_words = [_word("hello", left=10, top=10)]
        tess_words = [_word("bullet", left=500, top=500, conf=50)]
        merged = _merge_paddle_tesseract(paddle_words, tess_words)
        assert len(merged) == 2
        found_texts = set(entry["text"] for entry in merged)
        assert found_texts == {"hello", "bullet"}

    def test_low_conf_tesseract_dropped(self):
        """Unmatched Tesseract words with conf < 40 are dropped."""
        paddle_words = [_word("hello", left=10, top=10)]
        tess_words = [_word("noise", left=500, top=500, conf=20)]
        merged = _merge_paddle_tesseract(paddle_words, tess_words)
        assert len(merged) == 1

    def test_empty_paddle(self):
        """With no Paddle words, only the confident Tesseract word remains."""
        confident = _word("bullet", left=10, top=10, conf=80)
        noisy = _word("noise", left=200, top=200, conf=10)
        merged = _merge_paddle_tesseract([], [confident, noisy])
        assert len(merged) == 1
        assert merged[0]["text"] == "bullet"

    def test_empty_tesseract(self):
        """With no Tesseract words, both Paddle words are returned."""
        paddle_words = [
            _word("a", left=10, top=10),
            _word("b", left=200, top=10),
        ]
        merged = _merge_paddle_tesseract(paddle_words, [])
        assert len(merged) == 2

    def test_both_empty(self):
        """Two empty inputs produce an empty list."""
        merged = _merge_paddle_tesseract([], [])
        assert merged == []

    def test_one_to_one_matching(self):
        """Each Tesseract word matches at most one Paddle word."""
        paddle_words = [
            _word("cat", left=10, top=10, width=60, height=20, conf=80),
            _word("dog", left=200, top=10, width=60, height=20, conf=80),
        ]
        tess_words = [_word("cat", left=15, top=12, width=55, height=18, conf=70)]
        merged = _merge_paddle_tesseract(paddle_words, tess_words)
        # "cat" (merged) plus "dog" (Paddle only)
        assert len(merged) == 2

    def test_far_apart_different_text_not_merged(self):
        """Different words far apart stay separate."""
        paddle_words = [_word("hello", left=0, top=0, width=100, height=20, conf=80)]
        tess_words = [_word("world", left=500, top=300, width=100, height=20, conf=70)]
        merged = _merge_paddle_tesseract(paddle_words, tess_words)
        assert len(merged) == 2

    def test_paddle_text_preferred(self):
        """Merged word uses Paddle's text."""
        paddle_words = [_word("Betonung", left=100, top=50, width=80, height=20, conf=85)]
        tess_words = [_word("Betonung!", left=100, top=50, width=80, height=20, conf=60)]
        merged = _merge_paddle_tesseract(paddle_words, tess_words)
        assert len(merged) == 1
        assert merged[0]["text"] == "Betonung"

    def test_confidence_weighted_positions(self):
        """Equal confidence → simple average of coordinates."""
        paddle_words = [_word("x", left=100, top=200, width=60, height=20, conf=50)]
        tess_words = [_word("x", left=110, top=200, width=60, height=20, conf=50)]
        merged = _merge_paddle_tesseract(paddle_words, tess_words)
        assert len(merged) == 1
        only = merged[0]
        assert only["left"] == 105
        assert only["top"] == 200

    def test_zero_confidence_no_division_error(self):
        """Words with conf=0 don't cause division by zero."""
        paddle_words = [_word("a", left=100, top=50, width=80, height=20, conf=0)]
        tess_words = [_word("a", left=100, top=50, width=80, height=20, conf=0)]
        merged = _merge_paddle_tesseract(paddle_words, tess_words)
        assert len(merged) == 1

    def test_duplicate_words_same_position_deduplicated(self):
        """Regression test for the doubled-word bug.

        When both engines report the same words at nearly the same
        positions, each word must appear exactly once in the result.
        """
        # Typical case: a two-column, two-row layout seen by both engines.
        paddle_words = [
            _word("apple", left=50, top=10, width=70, height=20, conf=90),
            _word("Apfel", left=300, top=10, width=60, height=20, conf=85),
            _word("dog", left=50, top=50, width=50, height=20, conf=88),
            _word("Hund", left=300, top=50, width=60, height=20, conf=82),
        ]
        tess_words = [
            _word("apple", left=52, top=11, width=68, height=19, conf=75),
            _word("Apfel", left=298, top=12, width=62, height=18, conf=70),
            _word("dog", left=48, top=49, width=52, height=21, conf=72),
            _word("Hund", left=302, top=51, width=58, height=19, conf=68),
        ]
        merged = _merge_paddle_tesseract(paddle_words, tess_words)
        # One entry per distinct word, no duplicates.
        assert len(merged) == 4
        assert sorted(entry["text"] for entry in merged) == ["Apfel", "Hund", "apple", "dog"]
class TestMergePaddleTesseractBulletPoints:
    """Tesseract catches bullet points / symbols that PaddleOCR misses.

    The bullet is written as the escape ``"\\u2022"`` (U+2022 BULLET) so the
    literal cannot be silently lost to an encoding or copy/paste problem.
    Previously the "bullet" word had empty text, so the tests asserted on
    ``""`` rather than on a real symbol.
    """

    def test_bullet_added_from_tesseract(self):
        """Bullet character from Tesseract is added."""
        pw = [_word("Betonung", 60, 10, 80, 20)]
        tw = [
            _word("\u2022", 10, 10, 15, 15, conf=65),
            _word("Betonung", 60, 10, 80, 20, conf=50),
        ]
        merged = _merge_paddle_tesseract(pw, tw)
        texts = [m["text"] for m in merged]
        assert "\u2022" in texts
        assert "Betonung" in texts
        # Bullet (Tesseract only) + "Betonung" (found by both, merged once).
        assert len(merged) == 2

    def test_exclamation_added_from_tesseract(self):
        """Exclamation mark from Tesseract is added."""
        pw = [_word("important", 60, 10, 100, 20)]
        tw = [
            _word("!", 40, 10, 12, 20, conf=70),
            _word("important", 60, 10, 100, 20, conf=55),
        ]
        merged = _merge_paddle_tesseract(pw, tw)
        texts = [m["text"] for m in merged]
        assert "!" in texts
        assert len(merged) == 2

    def test_multiple_unique_tesseract_symbols(self):
        """Multiple symbols only found by Tesseract are all added."""
        pw = [_word("word", 100, 10, 60, 20)]
        tw = [
            _word("!", 20, 10, 10, 20, conf=70),
            _word("\u2022", 40, 10, 10, 15, conf=65),
            _word("word", 100, 10, 60, 20, conf=50),
        ]
        merged = _merge_paddle_tesseract(pw, tw)
        texts = [m["text"] for m in merged]
        assert "!" in texts
        assert "\u2022" in texts
        assert "word" in texts
        # Two Tesseract-only symbols + "word" (found by both, merged once).
        assert len(merged) == 3