"""Tests for the Kombi-Modus merge algorithm (_box_iou, _merge_paddle_tesseract). These functions live in ocr_pipeline_api.py and merge PaddleOCR + Tesseract word boxes by IoU matching and confidence-weighted coordinate averaging. """ import pytest import sys import os # Add backend to path so we can import from ocr_pipeline_api sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) from ocr_pipeline_api import _box_iou, _merge_paddle_tesseract # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- def _word(text: str, left: int, top: int, width: int = 60, height: int = 20, conf: int = 80): """Create a synthetic word dict.""" return { "text": text, "left": left, "top": top, "width": width, "height": height, "conf": conf, } # --------------------------------------------------------------------------- # _box_iou # --------------------------------------------------------------------------- class TestBoxIoU: def test_identical_boxes(self): """Identical boxes have IoU = 1.0.""" a = _word("hello", 10, 10, 100, 20) assert _box_iou(a, a) == pytest.approx(1.0) def test_no_overlap(self): """Non-overlapping boxes have IoU = 0.0.""" a = _word("a", 0, 0, 50, 20) b = _word("b", 200, 200, 50, 20) assert _box_iou(a, b) == 0.0 def test_partial_overlap(self): """Partially overlapping boxes have 0 < IoU < 1.""" a = _word("a", 0, 0, 100, 20) b = _word("b", 50, 0, 100, 20) # Intersection: x=[50,100], y=[0,20] → 50×20 = 1000 # Union: 100×20 + 100×20 - 1000 = 3000 assert _box_iou(a, b) == pytest.approx(1000 / 3000, abs=0.01) def test_contained_box(self): """Small box inside large box.""" big = _word("big", 0, 0, 200, 40) small = _word("small", 50, 10, 30, 10) # Intersection = 30×10 = 300, Union = 200×40 + 30×10 - 300 = 8000 assert _box_iou(big, small) == pytest.approx(300 / 8000, abs=0.01) def test_touching_edges(self): """Boxes that share an edge but don't overlap 
have IoU = 0.""" a = _word("a", 0, 0, 50, 20) b = _word("b", 50, 0, 50, 20) assert _box_iou(a, b) == 0.0 def test_zero_area_box(self): """Zero-area box returns IoU = 0.""" a = _word("a", 10, 10, 0, 0) b = _word("b", 10, 10, 50, 20) assert _box_iou(a, b) == 0.0 # --------------------------------------------------------------------------- # _merge_paddle_tesseract # --------------------------------------------------------------------------- class TestMergePaddleTesseract: def test_perfect_match_averages_coords(self): """When paddle and tesseract have the same word at same position, coordinates are averaged by confidence.""" pw = [_word("hello", 100, 50, 80, 20, conf=90)] tw = [_word("hello", 110, 55, 70, 18, conf=60)] merged = _merge_paddle_tesseract(pw, tw) assert len(merged) == 1 m = merged[0] assert m["text"] == "hello" # Paddle text preferred # Weighted avg: (100*90 + 110*60) / 150 = 15600/150 = 104 assert m["left"] == 104 assert m["conf"] == 90 # max(90, 60) def test_no_match_keeps_both(self): """Non-overlapping words: both kept.""" pw = [_word("hello", 10, 10)] tw = [_word("bullet", 500, 500, conf=50)] merged = _merge_paddle_tesseract(pw, tw) assert len(merged) == 2 texts = {m["text"] for m in merged} assert texts == {"hello", "bullet"} def test_low_conf_tesseract_dropped(self): """Unmatched Tesseract words with conf < 40 are dropped.""" pw = [_word("hello", 10, 10)] tw = [_word("noise", 500, 500, conf=20)] merged = _merge_paddle_tesseract(pw, tw) assert len(merged) == 1 assert merged[0]["text"] == "hello" def test_empty_paddle(self): """Only Tesseract words with sufficient confidence are kept.""" pw = [] tw = [ _word("bullet", 10, 10, conf=80), _word("noise", 200, 200, conf=10), ] merged = _merge_paddle_tesseract(pw, tw) assert len(merged) == 1 assert merged[0]["text"] == "bullet" def test_empty_tesseract(self): """All Paddle words kept when Tesseract is empty.""" pw = [_word("a", 10, 10), _word("b", 200, 10)] tw = [] merged = _merge_paddle_tesseract(pw, tw) 
assert len(merged) == 2 def test_both_empty(self): """Empty inputs return empty list.""" assert _merge_paddle_tesseract([], []) == [] def test_one_to_one_matching(self): """Each Tesseract word matches at most one Paddle word.""" # Two paddle words at different X positions, one tesseract word overlaps first pw = [ _word("cat", 10, 10, 60, 20, conf=80), _word("dog", 200, 10, 60, 20, conf=80), ] tw = [_word("cat", 15, 12, 55, 18, conf=70)] merged = _merge_paddle_tesseract(pw, tw) assert len(merged) == 2 # cat (merged) + dog (unmatched paddle) def test_iou_threshold(self): """Match requires IoU > 0.3, not just any overlap.""" pw = [_word("hello", 0, 0, 100, 20, conf=80)] # Tiny overlap — IoU well below 0.3 tw = [_word("world", 95, 0, 100, 20, conf=70)] # Intersection: x=[95,100]=5px width, y=[0,20]=20px → 100 # Union: 2000 + 2000 - 100 = 3900 → IoU ≈ 0.026 merged = _merge_paddle_tesseract(pw, tw) assert len(merged) == 2 # No match, both kept separately def test_paddle_text_preferred(self): """Merged word uses Paddle's text, not Tesseract's.""" pw = [_word("Betonung", 100, 50, 80, 20, conf=85)] tw = [_word("Betonung!", 100, 50, 80, 20, conf=60)] merged = _merge_paddle_tesseract(pw, tw) assert len(merged) == 1 assert merged[0]["text"] == "Betonung" def test_confidence_weighted_positions(self): """Equal confidence → simple average of coordinates.""" # Boxes must overlap enough for IoU > 0.3 pw = [_word("x", 100, 200, 60, 20, conf=50)] tw = [_word("x", 110, 200, 60, 20, conf=50)] merged = _merge_paddle_tesseract(pw, tw) assert len(merged) == 1 m = merged[0] assert m["left"] == 105 # (100+110)/2 assert m["top"] == 200 # (200+200)/2 assert m["width"] == 60 # (60+60)/2 assert m["height"] == 20 # (20+20)/2 def test_zero_confidence_no_division_error(self): """Words with conf=0 don't cause division by zero.""" pw = [_word("a", 100, 50, 80, 20, conf=0)] tw = [_word("a", 100, 50, 80, 20, conf=0)] merged = _merge_paddle_tesseract(pw, tw) assert len(merged) == 1 # Should not raise 
class TestMergePaddleTesseractBulletPoints:
    """Covers the main motivation for the merge: Tesseract picks up bullet
    points and symbols that PaddleOCR misses or fuses with adjacent text."""

    def test_bullet_added_from_tesseract(self):
        """A bullet seen only by Tesseract ends up in the merged output."""
        paddle_words = [_word("Betonung", 60, 10, 80, 20)]
        bullet = _word("•", 10, 10, 15, 15, conf=65)
        # Same geometry as the Paddle word, so it overlaps it.
        duplicate = _word("Betonung", 60, 10, 80, 20, conf=50)
        tess_words = [bullet, duplicate]

        result = _merge_paddle_tesseract(paddle_words, tess_words)

        found_texts = [entry["text"] for entry in result]
        assert "•" in found_texts
        assert "Betonung" in found_texts
        assert len(result) == 2

    def test_exclamation_added_from_tesseract(self):
        """An exclamation mark that Tesseract reports separately is added."""
        paddle_words = [_word("important", 60, 10, 100, 20)]
        exclamation = _word("!", 40, 10, 12, 20, conf=70)
        duplicate = _word("important", 60, 10, 100, 20, conf=55)
        tess_words = [exclamation, duplicate]

        result = _merge_paddle_tesseract(paddle_words, tess_words)

        found_texts = [entry["text"] for entry in result]
        assert "!" in found_texts
        assert len(result) == 2