diff --git a/klausur-service/backend/ocr_pipeline_api.py b/klausur-service/backend/ocr_pipeline_api.py index 79eee72..9c97b36 100644 --- a/klausur-service/backend/ocr_pipeline_api.py +++ b/klausur-service/backend/ocr_pipeline_api.py @@ -2729,7 +2729,45 @@ def _merge_paddle_tesseract(paddle_words: list, tess_words: list) -> list: if ti not in used_tess and tw.get("conf", 0) >= 40: merged.append(tw) - return merged + # Safety net: deduplicate any remaining near-duplicate words + return _deduplicate_words(merged) + + +def _deduplicate_words(words: list) -> list: + """Remove near-duplicate words that slipped through matching. + + Two words are considered duplicates if: + - Same text (case-insensitive) + - Centers within 30px horizontally and 15px vertically + The word with higher confidence is kept. + """ + if len(words) <= 1: + return words + keep = [True] * len(words) + for i in range(len(words)): + if not keep[i]: + continue + w1 = words[i] + cx1 = w1["left"] + w1.get("width", 0) / 2 + cy1 = w1["top"] + w1.get("height", 0) / 2 + t1 = w1.get("text", "").lower().strip() + for j in range(i + 1, len(words)): + if not keep[j]: + continue + w2 = words[j] + t2 = w2.get("text", "").lower().strip() + if t1 != t2: + continue + cx2 = w2["left"] + w2.get("width", 0) / 2 + cy2 = w2["top"] + w2.get("height", 0) / 2 + if abs(cx1 - cx2) < 30 and abs(cy1 - cy2) < 15: + # Drop the one with lower confidence + if w1.get("conf", 0) >= w2.get("conf", 0): + keep[j] = False + else: + keep[i] = False + break # w1 is dropped, stop comparing + return [w for w, k in zip(words, keep) if k] @router.post("/sessions/{session_id}/paddle-kombi") diff --git a/klausur-service/backend/tests/test_paddle_kombi.py b/klausur-service/backend/tests/test_paddle_kombi.py index e2e08e3..4b7d36b 100644 --- a/klausur-service/backend/tests/test_paddle_kombi.py +++ b/klausur-service/backend/tests/test_paddle_kombi.py @@ -19,6 +19,7 @@ from ocr_pipeline_api import ( _box_center_dist, _text_similarity, _words_match, 
# ---------------------------------------------------------------------------
# _deduplicate_words
# ---------------------------------------------------------------------------

class TestDeduplicateWords:

    def test_no_duplicates(self):
        """Distinct words at distinct positions all survive."""
        sample = [_word("a", 10, 10), _word("b", 200, 10), _word("c", 10, 100)]
        assert len(_deduplicate_words(sample)) == 3

    def test_exact_duplicate_removed(self):
        """Same text at (almost) the same spot collapses to one word."""
        sample = [
            _word("take", 185, 287, 47, 29, conf=90),
            _word("take", 188, 289, 52, 21, conf=96),
        ]
        survivors = _deduplicate_words(sample)
        assert len(survivors) == 1
        # The higher-confidence candidate must be the one kept.
        assert survivors[0]["conf"] == 96

    def test_same_text_far_apart_kept(self):
        """Same word at very different positions (e.g. repeated in text): both kept."""
        sample = [_word("the", 100, 10), _word("the", 500, 10)]
        assert len(_deduplicate_words(sample)) == 2

    def test_different_text_same_position_kept(self):
        """Different words at same position: both kept (not duplicates)."""
        sample = [_word("apple", 100, 50), _word("Apfel", 105, 52)]
        assert len(_deduplicate_words(sample)) == 2

    def test_empty_list(self):
        assert _deduplicate_words([]) == []

    def test_single_word(self):
        assert len(_deduplicate_words([_word("hello", 10, 10)])) == 1

    def test_real_world_near_duplicates(self):
        """Simulate real-world: Paddle (height=29) + Tesseract (height=21) near-dupes."""
        sample = [
            _word("take", 185, 287, 47, 29, conf=90),
            _word("part", 249, 292, 48, 24, conf=96),
            _word("More", 944, 287, 50, 29, conf=96),
            _word("than", 1003, 287, 50, 29, conf=96),
            # near-dupes of the same four words from the other engine
            _word("take", 188, 289, 52, 21, conf=96),
            _word("part", 249, 294, 47, 25, conf=96),
            _word("More", 948, 292, 60, 20, conf=90),
            _word("than", 1017, 291, 49, 21, conf=96),
        ]
        survivors = _deduplicate_words(sample)
        # Each word may appear only once after deduplication.
        assert len(survivors) == 4
        assert sorted(w["text"] for w in survivors) == ["More", "part", "take", "than"]


class TestMergeRealWorldRegression:
    """Regression test with actual data from the doubled-words bug."""

    def test_row2_no_duplicates(self):
        """Reproduce the row-2 bug: both engines return the same words at
        slightly different positions. Merge should produce no duplicates."""
        paddle = [
            _word("teilnehmen", 526, 282, 140, 35, conf=93),
            _word("take", 185, 287, 47, 29, conf=90),
            _word("part(in)", 238, 287, 94, 29, conf=90),
            _word("More", 944, 287, 50, 29, conf=96),
            _word("than", 1003, 287, 50, 29, conf=96),
            _word("200", 1063, 287, 38, 29, conf=96),
            _word("singers", 1110, 287, 88, 29, conf=96),
            _word("took", 1207, 287, 50, 29, conf=96),
            _word("part", 1266, 287, 50, 29, conf=96),
            _word("in", 1326, 287, 25, 29, conf=96),
            _word("the", 1360, 287, 38, 29, conf=96),
        ]
        tess = [
            _word("take", 188, 289, 52, 21, conf=96),
            _word("part", 249, 292, 48, 24, conf=96),
            _word("(in)", 305, 290, 38, 24, conf=93),
            _word("teilnehmen", 534, 290, 127, 21, conf=95),
            _word("(an),", 671, 291, 48, 23, conf=96),
            _word("mitmachen", 730, 290, 123, 22, conf=96),
            _word("More", 948, 292, 60, 20, conf=90),
            _word("than", 1017, 291, 49, 21, conf=96),
            _word("200", 1076, 292, 43, 20, conf=93),
            _word("singers", 1128, 293, 75, 26, conf=93),
            _word("took", 1212, 291, 55, 22, conf=96),
            _word("part", 1276, 294, 47, 25, conf=96),
            _word("in", 1332, 292, 20, 20, conf=95),
            _word("the", 1361, 292, 36, 21, conf=95),
            # Tesseract-only: phonetic transcriptions
            _word("[teık", 352, 292, 47, 21, conf=90),
            _word("'pa:t]", 407, 292, 55, 23, conf=89),
        ]
        merged = _merge_paddle_tesseract(paddle, tess)

        # No near-duplicate pair (same text, centers within 30px/15px)
        # may remain anywhere in the merged output.
        for i, first in enumerate(merged):
            for second in merged[i + 1:]:
                if first["text"].lower() != second["text"].lower():
                    continue
                dx = abs(
                    (first["left"] + first.get("width", 0) / 2)
                    - (second["left"] + second.get("width", 0) / 2)
                )
                dy = abs(
                    (first["top"] + first.get("height", 0) / 2)
                    - (second["top"] + second.get("height", 0) / 2)
                )
                assert dx >= 30 or dy >= 15, (
                    f"Near-duplicate found: '{first['text']}' at ({first['left']},{first['top']}) "
                    f"vs ({second['left']},{second['top']})"
                )

        # Words only one engine produced must still be present.
        merged_texts = [w["text"] for w in merged]
        assert "(in)" in merged_texts  # Tesseract split "part(in)" differently
        assert "(an)," in merged_texts
        assert "mitmachen" in merged_texts
        assert "[teık" in merged_texts  # phonetic from Tesseract
        assert "'pa:t]" in merged_texts