Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 35s
CI / test-go-edu-search (push) Successful in 32s
CI / test-python-klausur (push) Failing after 2m33s
CI / test-python-agent-core (push) Successful in 20s
CI / test-nodejs-website (push) Successful in 24s
- test_paddle_kombi.py: 6 IoU tests, 10 merge tests, 2 bullet-point tests - OCR-Pipeline.md: new "OCR Overlay" section with Paddle Direct/Kombi docs, merge algorithm flowchart, dateistruktur update, changelog v4.5.0 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
214 lines
8.0 KiB
Python
214 lines
8.0 KiB
Python
"""Tests for the Kombi-Modus merge algorithm (_box_iou, _merge_paddle_tesseract).
|
||
|
||
These functions live in ocr_pipeline_api.py and merge PaddleOCR + Tesseract
|
||
word boxes by IoU matching and confidence-weighted coordinate averaging.
|
||
"""
|
||
|
||
import os
import sys

import pytest

# Add backend to path so we can import from ocr_pipeline_api.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

# This import must come after the sys.path tweak above, hence the E402 waiver.
from ocr_pipeline_api import _box_iou, _merge_paddle_tesseract  # noqa: E402
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
|
||
|
||
def _word(text: str, left: int, top: int, width: int = 60, height: int = 20, conf: int = 80):
|
||
"""Create a synthetic word dict."""
|
||
return {
|
||
"text": text,
|
||
"left": left,
|
||
"top": top,
|
||
"width": width,
|
||
"height": height,
|
||
"conf": conf,
|
||
}
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# _box_iou
# ---------------------------------------------------------------------------
|
||
|
||
class TestBoxIoU:
    """Tests for ``_box_iou`` using synthetic word boxes built by ``_word``."""

    def test_identical_boxes(self):
        """Identical boxes have IoU = 1.0."""
        a = _word("hello", 10, 10, 100, 20)
        assert _box_iou(a, a) == pytest.approx(1.0)

    def test_no_overlap(self):
        """Non-overlapping boxes have IoU = 0.0."""
        a = _word("a", 0, 0, 50, 20)
        b = _word("b", 200, 200, 50, 20)
        assert _box_iou(a, b) == 0.0

    def test_partial_overlap(self):
        """Partially overlapping boxes have 0 < IoU < 1."""
        a = _word("a", 0, 0, 100, 20)
        b = _word("b", 50, 0, 100, 20)
        # Intersection: x=[50,100], y=[0,20] → 50×20 = 1000
        # Union: 100×20 + 100×20 - 1000 = 3000
        assert _box_iou(a, b) == pytest.approx(1000 / 3000, abs=0.01)

    def test_contained_box(self):
        """Small box inside large box."""
        big = _word("big", 0, 0, 200, 40)
        small = _word("small", 50, 10, 30, 10)
        # Intersection = 30×10 = 300, Union = 200×40 + 30×10 - 300 = 8000
        assert _box_iou(big, small) == pytest.approx(300 / 8000, abs=0.01)

    def test_touching_edges(self):
        """Boxes that share an edge but don't overlap have IoU = 0."""
        a = _word("a", 0, 0, 50, 20)
        b = _word("b", 50, 0, 50, 20)
        assert _box_iou(a, b) == 0.0

    def test_zero_area_box(self):
        """Zero-area box returns IoU = 0."""
        a = _word("a", 10, 10, 0, 0)
        b = _word("b", 10, 10, 50, 20)
        assert _box_iou(a, b) == 0.0
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# _merge_paddle_tesseract
# ---------------------------------------------------------------------------
|
||
|
||
class TestMergePaddleTesseract:
    """Tests for ``_merge_paddle_tesseract`` on synthetic word lists.

    ``pw`` is the Paddle word list and ``tw`` the Tesseract word list; both
    are built with ``_word``.
    """

    def test_perfect_match_averages_coords(self):
        """Same word at (nearly) the same position is merged into one.

        Coordinates are averaged, weighted by confidence.
        """
        pw = [_word("hello", 100, 50, 80, 20, conf=90)]
        tw = [_word("hello", 110, 55, 70, 18, conf=60)]
        merged = _merge_paddle_tesseract(pw, tw)
        assert len(merged) == 1
        m = merged[0]
        assert m["text"] == "hello"  # Paddle text preferred
        # Weighted avg: (100*90 + 110*60) / 150 = 15600/150 = 104
        assert m["left"] == 104
        assert m["conf"] == 90  # max(90, 60)

    def test_no_match_keeps_both(self):
        """Non-overlapping words: both kept."""
        pw = [_word("hello", 10, 10)]
        tw = [_word("bullet", 500, 500, conf=50)]
        merged = _merge_paddle_tesseract(pw, tw)
        assert len(merged) == 2
        texts = {m["text"] for m in merged}
        assert texts == {"hello", "bullet"}

    def test_low_conf_tesseract_dropped(self):
        """Unmatched Tesseract words with conf < 40 are dropped."""
        pw = [_word("hello", 10, 10)]
        tw = [_word("noise", 500, 500, conf=20)]
        merged = _merge_paddle_tesseract(pw, tw)
        assert len(merged) == 1
        assert merged[0]["text"] == "hello"

    def test_empty_paddle(self):
        """Only Tesseract words with sufficient confidence are kept."""
        pw = []
        tw = [
            _word("bullet", 10, 10, conf=80),
            _word("noise", 200, 200, conf=10),
        ]
        merged = _merge_paddle_tesseract(pw, tw)
        assert len(merged) == 1
        assert merged[0]["text"] == "bullet"

    def test_empty_tesseract(self):
        """All Paddle words kept when Tesseract is empty."""
        pw = [_word("a", 10, 10), _word("b", 200, 10)]
        tw = []
        merged = _merge_paddle_tesseract(pw, tw)
        assert len(merged) == 2

    def test_both_empty(self):
        """Empty inputs return empty list."""
        assert _merge_paddle_tesseract([], []) == []

    def test_one_to_one_matching(self):
        """Each Tesseract word matches at most one Paddle word."""
        # Two paddle words at different X positions, one tesseract word
        # overlaps the first.
        pw = [
            _word("cat", 10, 10, 60, 20, conf=80),
            _word("dog", 200, 10, 60, 20, conf=80),
        ]
        tw = [_word("cat", 15, 12, 55, 18, conf=70)]
        merged = _merge_paddle_tesseract(pw, tw)
        assert len(merged) == 2  # cat (merged) + dog (unmatched paddle)

    def test_iou_threshold(self):
        """Match requires IoU > 0.3, not just any overlap."""
        pw = [_word("hello", 0, 0, 100, 20, conf=80)]
        # Tiny overlap — IoU well below 0.3
        tw = [_word("world", 95, 0, 100, 20, conf=70)]
        # Intersection: x=[95,100]=5px width, y=[0,20]=20px → 100
        # Union: 2000 + 2000 - 100 = 3900 → IoU ≈ 0.026
        merged = _merge_paddle_tesseract(pw, tw)
        assert len(merged) == 2  # No match, both kept separately

    def test_paddle_text_preferred(self):
        """Merged word uses Paddle's text, not Tesseract's."""
        pw = [_word("Betonung", 100, 50, 80, 20, conf=85)]
        tw = [_word("Betonung!", 100, 50, 80, 20, conf=60)]
        merged = _merge_paddle_tesseract(pw, tw)
        assert len(merged) == 1
        assert merged[0]["text"] == "Betonung"

    def test_confidence_weighted_positions(self):
        """Equal confidence → simple average of coordinates."""
        # Boxes must overlap enough for IoU > 0.3
        pw = [_word("x", 100, 200, 60, 20, conf=50)]
        tw = [_word("x", 110, 200, 60, 20, conf=50)]
        merged = _merge_paddle_tesseract(pw, tw)
        assert len(merged) == 1
        m = merged[0]
        assert m["left"] == 105  # (100+110)/2
        assert m["top"] == 200  # (200+200)/2
        assert m["width"] == 60  # (60+60)/2
        assert m["height"] == 20  # (20+20)/2

    def test_zero_confidence_no_division_error(self):
        """Words with conf=0 don't cause division by zero."""
        pw = [_word("a", 100, 50, 80, 20, conf=0)]
        tw = [_word("a", 100, 50, 80, 20, conf=0)]
        merged = _merge_paddle_tesseract(pw, tw)
        assert len(merged) == 1  # Should not raise
|
||
|
||
|
||
class TestMergePaddleTesseractBulletPoints:
    """Test the key use case: bullet points and symbols.

    Tesseract catches bullet points / symbols that PaddleOCR misses or
    merges with adjacent text.
    """

    def test_bullet_added_from_tesseract(self):
        """A bullet character recognized by Tesseract but not Paddle is added."""
        pw = [_word("Betonung", 60, 10, 80, 20)]
        tw = [
            _word("•", 10, 10, 15, 15, conf=65),  # bullet
            _word("Betonung", 60, 10, 80, 20, conf=50),  # overlaps paddle
        ]
        merged = _merge_paddle_tesseract(pw, tw)
        texts = [m["text"] for m in merged]
        assert "•" in texts
        assert "Betonung" in texts
        assert len(merged) == 2

    def test_exclamation_added_from_tesseract(self):
        """An exclamation mark recognized separately by Tesseract is added."""
        pw = [_word("important", 60, 10, 100, 20)]
        tw = [
            _word("!", 40, 10, 12, 20, conf=70),
            _word("important", 60, 10, 100, 20, conf=55),
        ]
        merged = _merge_paddle_tesseract(pw, tw)
        texts = [m["text"] for m in merged]
        assert "!" in texts
        assert len(merged) == 2
|