def _box_center_dist(a: dict, b: dict) -> float:
    """Return the Euclidean distance between the centers of two word boxes.

    Args:
        a: Box dict with numeric ``left``, ``top``, ``width`` and ``height``.
        b: Second box dict with the same keys.

    Returns:
        Straight-line distance between the two centers, in the same units
        as the box coordinates.
    """
    dx = (a["left"] + a["width"] / 2) - (b["left"] + b["width"] / 2)
    dy = (a["top"] + a["height"] / 2) - (b["top"] + b["height"] / 2)
    return (dx ** 2 + dy ** 2) ** 0.5


def _text_similarity(a: str, b: str) -> float:
    """Return a rough similarity score in [0, 1] for two OCR strings.

    The comparison is case-insensitive and ignores surrounding whitespace.

    Scoring:
        * ``1.0`` - the normalized strings are equal.
        * ``0.8`` - one string contains the other
          (e.g. ``"!Betonung"`` vs ``"Betonung"``).
        * otherwise - number of characters the two strings share, divided
          by the length of the longer string. Each character is counted at
          most as often as it occurs in *both* strings. Character order is
          ignored, so anagrams still score ``1.0``.

    Args:
        a: First text (may be empty or ``None``).
        b: Second text (may be empty or ``None``).

    Returns:
        ``0.0`` if either text is empty or whitespace-only, otherwise the
        score described above.
    """
    if not a or not b:
        return 0.0

    a_norm = a.lower().strip()
    b_norm = b.lower().strip()

    # Must be checked *after* stripping and *before* the containment test:
    # "" is a substring of every string, so a whitespace-only input would
    # otherwise be reported as a 0.8 (or even 1.0) match.
    if not a_norm or not b_norm:
        return 0.0

    if a_norm == b_norm:
        return 1.0
    if a_norm in b_norm or b_norm in a_norm:
        return 0.8

    if len(a_norm) <= len(b_norm):
        shorter, longer = a_norm, b_norm
    else:
        shorter, longer = b_norm, a_norm

    # Consume matched characters so repeats are not over-counted
    # ("aab" vs "abc" shares two characters, not three).
    unused = list(longer)
    shared = 0
    for ch in shorter:
        if ch in unused:
            unused.remove(ch)
            shared += 1
    return shared / len(longer)


def _words_match(pw: dict, tw: dict) -> bool:
    """Decide whether a Paddle word and a Tesseract word are the same word.

    Any one of these criteria is sufficient:
        1. IoU of the two boxes is greater than 0.15.
        2. Both boxes are on the same row and their centers are closer than
           ``max(pw height, tw height, 20)``.
        3. Both boxes are on the same row and their text similarity is
           greater than 0.7.

    "Same row" means the vertical overlap exceeds half of the smaller box
    height (the height is clamped to at least 1 to avoid a zero threshold).

    NOTE: criterion 3 places no limit on horizontal distance, so equal
    words anywhere on the same row are considered a match.

    Args:
        pw: Paddle word dict with ``left``, ``top``, ``width``, ``height``
            and ``text``.
        tw: Tesseract word dict with the same keys.

    Returns:
        ``True`` if the two boxes are judged to be the same word.
    """
    if _box_iou(pw, tw) > 0.15:
        return True

    # Row test: how much do the two vertical extents overlap?
    p_top, p_bottom = pw["top"], pw["top"] + pw["height"]
    t_top, t_bottom = tw["top"], tw["top"] + tw["height"]
    overlap = max(0, min(p_bottom, t_bottom) - max(p_top, t_top))
    smaller_height = max(min(pw["height"], tw["height"]), 1)
    if overlap <= 0.5 * smaller_height:
        return False

    # Criterion 2: centers close together on the same row.
    if _box_center_dist(pw, tw) < max(pw["height"], tw["height"], 20):
        return True

    # Criterion 3: (nearly) identical text on the same row.
    return _text_similarity(pw["text"], tw["text"]) > 0.7
""" merged = [] used_tess: set = set() for pw in paddle_words: - best_iou, best_ti = 0.0, -1 + best_score, best_ti = 0.0, -1 for ti, tw in enumerate(tess_words): if ti in used_tess: continue - iou = _box_iou(pw, tw) - if iou > best_iou: - best_iou, best_ti = iou, ti + if not _words_match(pw, tw): + continue + # Score: IoU + text_similarity to pick best match + score = _box_iou(pw, tw) + _text_similarity(pw["text"], tw["text"]) + if score > best_score: + best_score, best_ti = score, ti - if best_iou > 0.3 and best_ti >= 0: + if best_ti >= 0: tw = tess_words[best_ti] used_tess.add(best_ti) pc = pw.get("conf", 80) @@ -2651,6 +2721,7 @@ def _merge_paddle_tesseract(paddle_words: list, tess_words: list) -> list: "conf": max(pc, tc), }) else: + # No Tesseract match — keep Paddle word as-is merged.append(pw) # Add unmatched Tesseract words (bullet points, symbols, etc.) diff --git a/klausur-service/backend/tests/test_paddle_kombi.py b/klausur-service/backend/tests/test_paddle_kombi.py index d030f75..e2e08e3 100644 --- a/klausur-service/backend/tests/test_paddle_kombi.py +++ b/klausur-service/backend/tests/test_paddle_kombi.py @@ -1,17 +1,26 @@ -"""Tests for the Kombi-Modus merge algorithm (_box_iou, _merge_paddle_tesseract). +"""Tests for the Kombi-Modus merge algorithm. -These functions live in ocr_pipeline_api.py and merge PaddleOCR + Tesseract -word boxes by IoU matching and confidence-weighted coordinate averaging. 
class TestBoxIoU:
    """Intersection-over-union of two word boxes."""

    def test_identical_boxes(self):
        box = _word("hello", 10, 10, 100, 20)
        assert _box_iou(box, box) == pytest.approx(1.0)

    def test_no_overlap(self):
        near = _word("a", 0, 0, 50, 20)
        far = _word("b", 200, 200, 50, 20)
        assert _box_iou(near, far) == 0.0

    def test_partial_overlap(self):
        # Intersection 50x20 = 1000; union 2000 + 2000 - 1000 = 3000.
        left_box = _word("a", 0, 0, 100, 20)
        right_box = _word("b", 50, 0, 100, 20)
        assert _box_iou(left_box, right_box) == pytest.approx(1000 / 3000, abs=0.01)

    def test_contained_box(self):
        # Intersection 30x10 = 300; union 8000 + 300 - 300 = 8000.
        outer = _word("big", 0, 0, 200, 40)
        inner = _word("small", 50, 10, 30, 10)
        assert _box_iou(outer, inner) == pytest.approx(300 / 8000, abs=0.01)

    def test_touching_edges(self):
        # Boxes share the edge x=50 but have no common area.
        left_box = _word("a", 0, 0, 50, 20)
        right_box = _word("b", 50, 0, 50, 20)
        assert _box_iou(left_box, right_box) == 0.0

    def test_zero_area_box(self):
        empty = _word("a", 10, 10, 0, 0)
        normal = _word("b", 10, 10, 50, 20)
        assert _box_iou(empty, normal) == 0.0


# ---------------------------------------------------------------------------
# _box_center_dist
# ---------------------------------------------------------------------------

class TestBoxCenterDist:
    """Euclidean distance between box centers."""

    def test_same_center(self):
        box = _word("a", 100, 50, 60, 20)
        assert _box_center_dist(box, box) == 0.0

    def test_horizontal_offset(self):
        first = _word("a", 100, 50, 60, 20)
        second = _word("b", 110, 50, 60, 20)
        assert _box_center_dist(first, second) == pytest.approx(10.0)

    def test_diagonal(self):
        first = _word("a", 0, 0, 20, 20)     # center (10, 10)
        second = _word("b", 20, 20, 20, 20)  # center (30, 30)
        expected = (20 ** 2 + 20 ** 2) ** 0.5
        assert _box_center_dist(first, second) == pytest.approx(expected, abs=0.1)


# ---------------------------------------------------------------------------
# _text_similarity
# ---------------------------------------------------------------------------

class TestTextSimilarity:
    """Heuristic text similarity in the range 0-1."""

    def test_identical(self):
        assert _text_similarity("hello", "hello") == 1.0

    def test_case_insensitive(self):
        assert _text_similarity("Hello", "hello") == 1.0

    def test_substring(self):
        """One text contained in the other (e.g. '!Betonung' vs 'Betonung')."""
        assert _text_similarity("!Betonung", "Betonung") == 0.8

    def test_completely_different(self):
        assert _text_similarity("abc", "xyz") == 0.0

    def test_empty_strings(self):
        assert _text_similarity("", "hello") == 0.0
        assert _text_similarity("", "") == 0.0

    def test_partial_overlap(self):
        """Some, but not all, characters are shared."""
        score = _text_similarity("apple", "ape")
        assert 0.0 < score < 1.0


# ---------------------------------------------------------------------------
# _words_match
# ---------------------------------------------------------------------------

class TestWordsMatch:
    """Multi-criteria matching: IoU, center proximity, text similarity."""

    def test_high_iou_matches(self):
        """IoU > 0.15 is sufficient for a match."""
        paddle = _word("hello", 100, 50, 80, 20)
        tess = _word("hello", 105, 50, 80, 20)
        assert _words_match(paddle, tess) is True

    def test_same_text_same_row_matches(self):
        """Same text on the same row matches even with zero IoU.

        The boxes do not overlap horizontally and their centers are about
        95px apart, so only the text-similarity criterion can produce the
        match.
        """
        paddle = _word("Betonung", 100, 50, 80, 20)
        tess = _word("Betonung", 200, 52, 70, 18)
        assert _box_iou(paddle, tess) == 0.0  # guard: IoU shortcut not taken
        assert _words_match(paddle, tess) is True

    def test_close_centers_same_row_matches(self):
        """Nearby centers on the same row match even with zero IoU.

        The boxes do not overlap and the texts share no characters, so only
        the center-proximity criterion can produce the match.
        """
        paddle = _word("x", 100, 50, 10, 20)
        tess = _word("y", 112, 52, 10, 22)
        assert _box_iou(paddle, tess) == 0.0  # guard: IoU shortcut not taken
        assert _words_match(paddle, tess) is True

    def test_different_rows_no_match(self):
        """Words on different rows don't match even with the same text."""
        upper = _word("hello", 100, 50, 80, 20)
        lower = _word("hello", 100, 200, 80, 20)
        assert _words_match(upper, lower) is False

    def test_far_apart_same_row_different_text(self):
        """Different text far apart on the same row: no match."""
        left_word = _word("cat", 10, 50, 40, 20)
        right_word = _word("dog", 400, 50, 40, 20)
        assert _words_match(left_word, right_word) is False

    def test_no_overlap_no_proximity_no_text(self):
        """Completely different words far apart: no match."""
        first = _word("abc", 0, 0, 50, 20)
        second = _word("xyz", 500, 500, 50, 20)
        assert _words_match(first, second) is False
--------------------------------------------------------------------------- # _merge_paddle_tesseract # --------------------------------------------------------------------------- @@ -82,20 +176,26 @@ class TestBoxIoU: class TestMergePaddleTesseract: def test_perfect_match_averages_coords(self): - """When paddle and tesseract have the same word at same position, - coordinates are averaged by confidence.""" + """Same word at same position: coordinates averaged by confidence.""" pw = [_word("hello", 100, 50, 80, 20, conf=90)] tw = [_word("hello", 110, 55, 70, 18, conf=60)] merged = _merge_paddle_tesseract(pw, tw) assert len(merged) == 1 m = merged[0] - assert m["text"] == "hello" # Paddle text preferred - # Weighted avg: (100*90 + 110*60) / 150 = 15600/150 = 104 - assert m["left"] == 104 - assert m["conf"] == 90 # max(90, 60) + assert m["text"] == "hello" + assert m["left"] == 104 # (100*90 + 110*60) / 150 + assert m["conf"] == 90 - def test_no_match_keeps_both(self): - """Non-overlapping words: both kept.""" + def test_same_word_slightly_offset_merges(self): + """Same word with slight offset still merges (center proximity).""" + pw = [_word("Betonung", 100, 50, 90, 22, conf=85)] + tw = [_word("Betonung", 115, 52, 80, 20, conf=60)] + merged = _merge_paddle_tesseract(pw, tw) + assert len(merged) == 1 + assert merged[0]["text"] == "Betonung" + + def test_truly_different_words_kept_separate(self): + """Non-overlapping different words: both kept.""" pw = [_word("hello", 10, 10)] tw = [_word("bullet", 500, 500, conf=50)] merged = _merge_paddle_tesseract(pw, tw) @@ -109,33 +209,24 @@ class TestMergePaddleTesseract: tw = [_word("noise", 500, 500, conf=20)] merged = _merge_paddle_tesseract(pw, tw) assert len(merged) == 1 - assert merged[0]["text"] == "hello" def test_empty_paddle(self): - """Only Tesseract words with sufficient confidence are kept.""" pw = [] - tw = [ - _word("bullet", 10, 10, conf=80), - _word("noise", 200, 200, conf=10), - ] + tw = [_word("bullet", 10, 10, 
conf=80), _word("noise", 200, 200, conf=10)] merged = _merge_paddle_tesseract(pw, tw) assert len(merged) == 1 assert merged[0]["text"] == "bullet" def test_empty_tesseract(self): - """All Paddle words kept when Tesseract is empty.""" pw = [_word("a", 10, 10), _word("b", 200, 10)] - tw = [] - merged = _merge_paddle_tesseract(pw, tw) + merged = _merge_paddle_tesseract(pw, []) assert len(merged) == 2 def test_both_empty(self): - """Empty inputs return empty list.""" assert _merge_paddle_tesseract([], []) == [] def test_one_to_one_matching(self): """Each Tesseract word matches at most one Paddle word.""" - # Two paddle words at different X positions, one tesseract word overlaps first pw = [ _word("cat", 10, 10, 60, 20, conf=80), _word("dog", 200, 10, 60, 20, conf=80), @@ -144,18 +235,15 @@ class TestMergePaddleTesseract: merged = _merge_paddle_tesseract(pw, tw) assert len(merged) == 2 # cat (merged) + dog (unmatched paddle) - def test_iou_threshold(self): - """Match requires IoU > 0.3, not just any overlap.""" + def test_far_apart_different_text_not_merged(self): + """Different words far apart stay separate.""" pw = [_word("hello", 0, 0, 100, 20, conf=80)] - # Tiny overlap — IoU well below 0.3 - tw = [_word("world", 95, 0, 100, 20, conf=70)] - # Intersection: x=[95,100]=5px width, y=[0,20]=20px → 100 - # Union: 2000 + 2000 - 100 = 3900 → IoU ≈ 0.026 + tw = [_word("world", 500, 300, 100, 20, conf=70)] merged = _merge_paddle_tesseract(pw, tw) - assert len(merged) == 2 # No match, both kept separately + assert len(merged) == 2 def test_paddle_text_preferred(self): - """Merged word uses Paddle's text, not Tesseract's.""" + """Merged word uses Paddle's text.""" pw = [_word("Betonung", 100, 50, 80, 20, conf=85)] tw = [_word("Betonung!", 100, 50, 80, 20, conf=60)] merged = _merge_paddle_tesseract(pw, tw) @@ -164,35 +252,53 @@ class TestMergePaddleTesseract: def test_confidence_weighted_positions(self): """Equal confidence → simple average of coordinates.""" - # Boxes must 
overlap enough for IoU > 0.3 pw = [_word("x", 100, 200, 60, 20, conf=50)] tw = [_word("x", 110, 200, 60, 20, conf=50)] merged = _merge_paddle_tesseract(pw, tw) assert len(merged) == 1 m = merged[0] - assert m["left"] == 105 # (100+110)/2 - assert m["top"] == 200 # (200+200)/2 - assert m["width"] == 60 # (60+60)/2 - assert m["height"] == 20 # (20+20)/2 + assert m["left"] == 105 + assert m["top"] == 200 def test_zero_confidence_no_division_error(self): """Words with conf=0 don't cause division by zero.""" pw = [_word("a", 100, 50, 80, 20, conf=0)] tw = [_word("a", 100, 50, 80, 20, conf=0)] merged = _merge_paddle_tesseract(pw, tw) - assert len(merged) == 1 # Should not raise + assert len(merged) == 1 + + def test_duplicate_words_same_position_deduplicated(self): + """The core bug fix: same word at same position from both engines + should appear only once, not doubled.""" + # Simulate typical case: both engines find same words + pw = [ + _word("apple", 50, 10, 70, 20, conf=90), + _word("Apfel", 300, 10, 60, 20, conf=85), + _word("dog", 50, 50, 50, 20, conf=88), + _word("Hund", 300, 50, 60, 20, conf=82), + ] + tw = [ + _word("apple", 52, 11, 68, 19, conf=75), + _word("Apfel", 298, 12, 62, 18, conf=70), + _word("dog", 48, 49, 52, 21, conf=72), + _word("Hund", 302, 51, 58, 19, conf=68), + ] + merged = _merge_paddle_tesseract(pw, tw) + # Each word should appear exactly once + assert len(merged) == 4 + texts = [m["text"] for m in merged] + assert sorted(texts) == ["Apfel", "Hund", "apple", "dog"] class TestMergePaddleTesseractBulletPoints: - """Test the key use case: Tesseract catches bullet points / symbols - that PaddleOCR misses or merges with adjacent text.""" + """Tesseract catches bullet points / symbols that PaddleOCR misses.""" def test_bullet_added_from_tesseract(self): - """A bullet character recognized by Tesseract but not Paddle is added.""" + """Bullet character from Tesseract is added.""" pw = [_word("Betonung", 60, 10, 80, 20)] tw = [ - _word("•", 10, 10, 15, 
15, conf=65), # bullet - _word("Betonung", 60, 10, 80, 20, conf=50), # overlaps paddle + _word("•", 10, 10, 15, 15, conf=65), + _word("Betonung", 60, 10, 80, 20, conf=50), ] merged = _merge_paddle_tesseract(pw, tw) texts = [m["text"] for m in merged] @@ -201,7 +307,7 @@ class TestMergePaddleTesseractBulletPoints: assert len(merged) == 2 def test_exclamation_added_from_tesseract(self): - """An exclamation mark recognized separately by Tesseract is added.""" + """Exclamation mark from Tesseract is added.""" pw = [_word("important", 60, 10, 100, 20)] tw = [ _word("!", 40, 10, 12, 20, conf=70), @@ -211,3 +317,18 @@ class TestMergePaddleTesseractBulletPoints: texts = [m["text"] for m in merged] assert "!" in texts assert len(merged) == 2 + + def test_multiple_unique_tesseract_symbols(self): + """Multiple symbols only found by Tesseract are all added.""" + pw = [_word("word", 100, 10, 60, 20)] + tw = [ + _word("!", 20, 10, 10, 20, conf=70), + _word("•", 40, 10, 10, 15, conf=65), + _word("word", 100, 10, 60, 20, conf=50), + ] + merged = _merge_paddle_tesseract(pw, tw) + texts = [m["text"] for m in merged] + assert "!" in texts + assert "•" in texts + assert "word" in texts + assert len(merged) == 3