Files
breakpilot-lehrer/klausur-service/backend/tests/test_paddle_kombi.py
Benjamin Admin 61c8169f9e
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 35s
CI / test-go-edu-search (push) Successful in 32s
CI / test-python-klausur (push) Failing after 2m33s
CI / test-python-agent-core (push) Successful in 20s
CI / test-nodejs-website (push) Successful in 24s
docs+test: add Kombi-Modus tests (19 passing) and MkDocs documentation
- test_paddle_kombi.py: 6 IoU tests, 11 merge tests, 2 bullet-point tests
- OCR-Pipeline.md: new "OCR Overlay" section with Paddle Direct/Kombi docs,
  merge algorithm flowchart, dateistruktur update, changelog v4.5.0

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-12 20:18:46 +01:00

214 lines
8.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Tests for the Kombi-Modus merge algorithm (_box_iou, _merge_paddle_tesseract).
These functions live in ocr_pipeline_api.py and merge PaddleOCR + Tesseract
word boxes by IoU matching and confidence-weighted coordinate averaging.
"""
import pytest
import sys
import os
# Add backend to path so we can import from ocr_pipeline_api
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from ocr_pipeline_api import _box_iou, _merge_paddle_tesseract
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _word(text: str, left: int, top: int, width: int = 60, height: int = 20, conf: int = 80):
"""Create a synthetic word dict."""
return {
"text": text,
"left": left,
"top": top,
"width": width,
"height": height,
"conf": conf,
}
# ---------------------------------------------------------------------------
# _box_iou
# ---------------------------------------------------------------------------
class TestBoxIoU:
    """Checks for ``_box_iou`` on pairs of synthetic word boxes."""

    def test_identical_boxes(self):
        """A box compared with itself gives IoU = 1.0."""
        box = _word("hello", 10, 10, 100, 20)
        iou = _box_iou(box, box)
        assert iou == pytest.approx(1.0)

    def test_no_overlap(self):
        """Boxes far apart from each other give IoU = 0.0."""
        near = _word("a", 0, 0, 50, 20)
        far = _word("b", 200, 200, 50, 20)
        iou = _box_iou(near, far)
        assert iou == 0.0

    def test_partial_overlap(self):
        """Boxes that overlap in part give the expected fractional IoU."""
        first = _word("a", 0, 0, 100, 20)
        second = _word("b", 50, 0, 100, 20)
        # Shared region spans x in [50, 100] and y in [0, 20]: 50 * 20.
        intersection = 1000
        # Two 100 * 20 boxes, with the shared region counted once.
        union = 3000
        iou = _box_iou(first, second)
        assert iou == pytest.approx(intersection / union, abs=0.01)

    def test_contained_box(self):
        """A small box lying fully inside a large one."""
        outer = _word("big", 0, 0, 200, 40)
        inner = _word("small", 50, 10, 30, 10)
        # The shared region is the whole inner box: 30 * 10.
        intersection = 300
        # 200 * 40 + 30 * 10 - 300, i.e. just the outer box's area.
        union = 8000
        iou = _box_iou(outer, inner)
        assert iou == pytest.approx(intersection / union, abs=0.01)

    def test_touching_edges(self):
        """Boxes sharing only an edge do not overlap, so IoU = 0."""
        left_box = _word("a", 0, 0, 50, 20)
        right_box = _word("b", 50, 0, 50, 20)
        iou = _box_iou(left_box, right_box)
        assert iou == 0.0

    def test_zero_area_box(self):
        """A box with zero width and height gives IoU = 0."""
        degenerate = _word("a", 10, 10, 0, 0)
        regular = _word("b", 10, 10, 50, 20)
        iou = _box_iou(degenerate, regular)
        assert iou == 0.0
# ---------------------------------------------------------------------------
# _merge_paddle_tesseract
# ---------------------------------------------------------------------------
class TestMergePaddleTesseract:
    """Checks for ``_merge_paddle_tesseract`` on synthetic word lists."""

    def test_perfect_match_averages_coords(self):
        """The same word at nearly the same position in both engines is
        merged into one entry with confidence-weighted coordinates."""
        paddle_words = [_word("hello", 100, 50, 80, 20, conf=90)]
        tess_words = [_word("hello", 110, 55, 70, 18, conf=60)]
        result = _merge_paddle_tesseract(paddle_words, tess_words)
        assert len(result) == 1
        word = result[0]
        # The merged entry carries Paddle's text.
        assert word["text"] == "hello"
        # Weighted average: (100 * 90 + 110 * 60) / (90 + 60) = 104.
        assert word["left"] == 104
        # The higher of the two confidences (90 vs. 60) is kept.
        assert word["conf"] == 90

    def test_no_match_keeps_both(self):
        """Words that do not overlap are both present in the output."""
        paddle_words = [_word("hello", 10, 10)]
        tess_words = [_word("bullet", 500, 500, conf=50)]
        result = _merge_paddle_tesseract(paddle_words, tess_words)
        assert len(result) == 2
        found = {word["text"] for word in result}
        assert found == {"hello", "bullet"}

    def test_low_conf_tesseract_dropped(self):
        """An unmatched Tesseract word with conf < 40 is discarded."""
        paddle_words = [_word("hello", 10, 10)]
        tess_words = [_word("noise", 500, 500, conf=20)]
        result = _merge_paddle_tesseract(paddle_words, tess_words)
        assert len(result) == 1
        assert result[0]["text"] == "hello"

    def test_empty_paddle(self):
        """With no Paddle words, only sufficiently confident Tesseract
        words survive."""
        confident = _word("bullet", 10, 10, conf=80)
        unreliable = _word("noise", 200, 200, conf=10)
        result = _merge_paddle_tesseract([], [confident, unreliable])
        assert len(result) == 1
        assert result[0]["text"] == "bullet"

    def test_empty_tesseract(self):
        """With no Tesseract words, every Paddle word is kept."""
        paddle_words = [_word("a", 10, 10), _word("b", 200, 10)]
        result = _merge_paddle_tesseract(paddle_words, [])
        assert len(result) == 2

    def test_both_empty(self):
        """Two empty inputs produce an empty list."""
        result = _merge_paddle_tesseract([], [])
        assert result == []

    def test_one_to_one_matching(self):
        """A Tesseract word pairs with at most one Paddle word."""
        # Two Paddle words at different X positions; the single Tesseract
        # word only overlaps the first of them.
        cat = _word("cat", 10, 10, 60, 20, conf=80)
        dog = _word("dog", 200, 10, 60, 20, conf=80)
        tess_cat = _word("cat", 15, 12, 55, 18, conf=70)
        result = _merge_paddle_tesseract([cat, dog], [tess_cat])
        # One merged "cat" plus the unmatched Paddle "dog".
        assert len(result) == 2

    def test_iou_threshold(self):
        """Matching needs IoU > 0.3; a sliver of overlap is not enough."""
        paddle_words = [_word("hello", 0, 0, 100, 20, conf=80)]
        # Shared region: x in [95, 100] (5 px) by y in [0, 20] (20 px) = 100.
        # Union: 2000 + 2000 - 100 = 3900, so IoU is roughly 0.026.
        tess_words = [_word("world", 95, 0, 100, 20, conf=70)]
        result = _merge_paddle_tesseract(paddle_words, tess_words)
        # No pairing happens, so both words appear separately.
        assert len(result) == 2

    def test_paddle_text_preferred(self):
        """A merged word takes Paddle's text rather than Tesseract's."""
        paddle_words = [_word("Betonung", 100, 50, 80, 20, conf=85)]
        tess_words = [_word("Betonung!", 100, 50, 80, 20, conf=60)]
        result = _merge_paddle_tesseract(paddle_words, tess_words)
        assert len(result) == 1
        assert result[0]["text"] == "Betonung"

    def test_confidence_weighted_positions(self):
        """With equal confidences the coordinates are a plain average."""
        # The boxes are shifted by only 10 px so that IoU stays above 0.3.
        paddle_words = [_word("x", 100, 200, 60, 20, conf=50)]
        tess_words = [_word("x", 110, 200, 60, 20, conf=50)]
        result = _merge_paddle_tesseract(paddle_words, tess_words)
        assert len(result) == 1
        word = result[0]
        assert word["left"] == 105  # midpoint of 100 and 110
        assert word["top"] == 200  # both inputs are at 200
        assert word["width"] == 60  # both inputs are 60 wide
        assert word["height"] == 20  # both inputs are 20 high

    def test_zero_confidence_no_division_error(self):
        """Two conf=0 words must merge without a division by zero."""
        paddle_words = [_word("a", 100, 50, 80, 20, conf=0)]
        tess_words = [_word("a", 100, 50, 80, 20, conf=0)]
        # The call itself is the check: it must not raise.
        result = _merge_paddle_tesseract(paddle_words, tess_words)
        assert len(result) == 1
class TestMergePaddleTesseractBulletPoints:
    """Test the key use case: Tesseract catches bullet points / symbols
    that PaddleOCR misses or merges with adjacent text."""

    def test_bullet_added_from_tesseract(self):
        """A bullet character recognized by Tesseract but not Paddle is added."""
        # Written as an escape so the glyph cannot be silently lost or
        # confused with a look-alike; an empty string here would test an
        # empty word rather than a bullet.
        bullet = "\u2022"
        pw = [_word("Betonung", 60, 10, 80, 20)]
        tw = [
            # Sits to the left of the Paddle word without overlapping it.
            _word(bullet, 10, 10, 15, 15, conf=65),
            # Same box as the Paddle word, so it overlaps it completely.
            _word("Betonung", 60, 10, 80, 20, conf=50),
        ]
        merged = _merge_paddle_tesseract(pw, tw)
        texts = [m["text"] for m in merged]
        assert bullet in texts
        assert "Betonung" in texts
        assert len(merged) == 2

    def test_exclamation_added_from_tesseract(self):
        """An exclamation mark recognized separately by Tesseract is added."""
        pw = [_word("important", 60, 10, 100, 20)]
        tw = [
            # Spans x in [40, 52], ending before the Paddle word starts at 60.
            _word("!", 40, 10, 12, 20, conf=70),
            # Same box as the Paddle word, so it overlaps it completely.
            _word("important", 60, 10, 100, 20, conf=55),
        ]
        merged = _merge_paddle_tesseract(pw, tw)
        texts = [m["text"] for m in merged]
        assert "!" in texts
        # Mirrors the bullet test: the word itself must still be present.
        assert "important" in texts
        assert len(merged) == 2