def _split_paddle_multi_words(words: list) -> list:
    """Split PaddleOCR multi-word boxes into individual word boxes.

    PaddleOCR often returns entire phrases as a single box, e.g.
    "More than 200 singers took part in the" with one bounding box.
    Each such box is cut into one box per token, the horizontal space
    being shared out in proportion to the tokens' character counts.

    Token boundaries are:
      * any run of whitespace,
      * the position directly before a "[" (IPA brackets, e.g.
        "badge[bxd3]" -> ["badge", "[bxd3]"]),
      * the position after a "!" that is directly followed by a letter
        (e.g. "!Betonung" -> ["!", "Betonung"]).

    Args:
        words: Word dicts with the keys "text", "left", "top", "width" and
            "height", plus an optional "conf".

    Returns:
        A new list. Boxes whose text is empty, whitespace-only or None are
        dropped. Boxes holding a single token are passed through as the
        very same dict object. Every other box is replaced by one new dict
        per token with the keys "text", "left", "top", "width", "height"
        and "conf" ("conf" falls back to 0 when the source box has none).

    Raises:
        KeyError: If a multi-token box lacks "left", "top", "width" or
            "height".
    """
    import re

    # Compiled once per call instead of being re-specified for every box.
    boundary = re.compile(r'\s+|(?=\[)|(?<=!)(?=[A-Za-z\u00c0-\u024f])')

    result = []
    for w in words:
        # `or ""` also covers an explicit None, which .get()'s default
        # does not.
        raw_text = (w.get("text") or "").strip()
        if not raw_text:
            continue

        # Zero-width boundaries can yield empty strings at the edges.
        tokens = [t for t in boundary.split(raw_text) if t]
        if len(tokens) <= 1:
            result.append(w)
            continue

        box_left = w["left"]
        box_w = w["width"]
        box_right = round(box_left + box_w)

        n_gaps = len(tokens) - 1
        # Each gap is 2% of the box width, but all gaps together may take
        # at most half of the box. Without the cap the usable width turns
        # negative for boxes with more than 50 gaps and the pieces end up
        # far to the right of the source box.
        gap_px = min(box_w * 0.02, box_w * 0.5 / n_gaps)
        usable_w = box_w - gap_px * n_gaps
        total_chars = sum(len(t) for t in tokens)  # > 0: tokens are non-empty

        cursor = box_left
        for t in tokens:
            token_w = usable_w * len(t) / total_chars
            # Round both edges and derive the width from them, so rounding
            # errors cannot accumulate and the last piece ends exactly on
            # the right edge of the source box.
            left = round(cursor)
            right = min(round(cursor + token_w), box_right)
            result.append({
                "text": t,
                "left": left,
                "top": w["top"],
                "width": max(1, right - left),  # never emit an empty box
                "height": w["height"],
                "conf": w.get("conf", 0),
            })
            cursor += token_w + gap_px
    return result
@@ -2842,11 +2889,18 @@ async def paddle_kombi(session_id: str): "conf": conf, }) + # --- Split multi-word Paddle boxes into individual words --- + paddle_words_split = _split_paddle_multi_words(paddle_words) + logger.info( + "paddle_kombi: split %d paddle boxes → %d individual words", + len(paddle_words), len(paddle_words_split), + ) + # --- Merge --- - if not paddle_words and not tess_words: + if not paddle_words_split and not tess_words: raise HTTPException(status_code=400, detail="Both OCR engines returned no words") - merged_words = _merge_paddle_tesseract(paddle_words, tess_words) + merged_words = _merge_paddle_tesseract(paddle_words_split, tess_words) cells, columns_meta = build_grid_from_words(merged_words, img_w, img_h) duration = time.time() - t0 @@ -2870,12 +2924,14 @@ async def paddle_kombi(session_id: str): "ocr_engine": "kombi", "grid_method": "kombi", "raw_paddle_words": paddle_words, + "raw_paddle_words_split": paddle_words_split, "raw_tesseract_words": tess_words, "summary": { "total_cells": len(cells), "non_empty_cells": sum(1 for c in cells if c.get("text")), "low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50), "paddle_words": len(paddle_words), + "paddle_words_split": len(paddle_words_split), "tesseract_words": len(tess_words), "merged_words": len(merged_words), }, diff --git a/klausur-service/backend/tests/test_paddle_kombi.py b/klausur-service/backend/tests/test_paddle_kombi.py index 829485f..612032a 100644 --- a/klausur-service/backend/tests/test_paddle_kombi.py +++ b/klausur-service/backend/tests/test_paddle_kombi.py @@ -1,6 +1,7 @@ """Tests for the Kombi-Modus row-based sequence merge algorithm. 
# ---------------------------------------------------------------------------
# _split_paddle_multi_words
# ---------------------------------------------------------------------------

class TestSplitPaddleMultiWords:
    """Unit tests for ``_split_paddle_multi_words``."""

    def test_single_word_unchanged(self):
        """A box with a single token yields exactly that one word."""
        boxes = [_word("hello", 100, 50, 80, 20)]
        split = _split_paddle_multi_words(boxes)
        assert len(split) == 1
        assert split[0]["text"] == "hello"

    def test_multi_word_split(self):
        """'More than 200' in one box becomes three separate words."""
        split = _split_paddle_multi_words(
            [_word("More than 200", 100, 50, 300, 20)]
        )
        assert len(split) == 3
        assert split[0]["text"] == "More"
        assert split[1]["text"] == "than"
        assert split[2]["text"] == "200"
        # The pieces have to stay inside the source box (x from 100 to
        # 400); a few pixels of slack are tolerated for rounding.
        first, last = split[0], split[2]
        assert first["left"] >= 100
        assert last["left"] + last["width"] <= 400 + 5

    def test_exclamation_split(self):
        """A leading '!' is detached: '!Betonung' gives '!' and 'Betonung'."""
        split = _split_paddle_multi_words(
            [_word("!Betonung", 100, 50, 120, 20)]
        )
        assert len(split) == 2
        marker, stressed = split
        assert marker["text"] == "!"
        assert stressed["text"] == "Betonung"

    def test_ipa_bracket_split(self):
        """IPA in brackets is detached: 'badge[bxd3]' gives two words."""
        split = _split_paddle_multi_words(
            [_word("badge[bxd3]", 100, 50, 150, 20)]
        )
        assert len(split) == 2
        headword, ipa = split
        assert headword["text"] == "badge"
        assert ipa["text"] == "[bxd3]"

    def test_long_phrase(self):
        """An eight-word sentence fragment is split into eight words."""
        phrase = _word(
            "More than 200 singers took part in the", 944, 287, 454, 29
        )
        split = _split_paddle_multi_words([phrase])
        assert len(split) == 8
        expected = ["More", "than", "200", "singers", "took", "part", "in", "the"]
        assert [piece["text"] for piece in split] == expected

    def test_empty_input(self):
        """No boxes in, no boxes out."""
        assert _split_paddle_multi_words([]) == []

    def test_preserves_top_and_height(self):
        """Splitting is horizontal only: top and height are carried over."""
        split = _split_paddle_multi_words([_word("a b", 100, 50, 200, 25)])
        for piece in split:
            assert piece["top"] == 50
            assert piece["height"] == 25


class TestSplitThenMerge:
    """End-to-end check: split multi-word Paddle boxes, then merge."""

    def test_multi_word_paddle_boxes_no_duplicates(self):
        """Phrase-level Paddle boxes must not cause duplicates after merge."""
        # Word-level Tesseract output for the same text line.
        tess = [
            _word("take", 188, 289, 52, 21, conf=96),
            _word("part", 249, 292, 48, 24, conf=96),
            _word("(in)", 305, 290, 38, 24, conf=93),
            _word("[teık", 352, 292, 47, 21, conf=90),
            _word("'pa:t]", 407, 292, 55, 23, conf=89),
            _word("teilnehmen", 534, 290, 127, 21, conf=95),
            _word("(an),", 671, 291, 48, 23, conf=96),
            _word("mitmachen", 730, 290, 123, 22, conf=96),
            _word("More", 948, 292, 60, 20, conf=90),
            _word("than", 1017, 291, 49, 21, conf=96),
            _word("200", 1076, 292, 43, 20, conf=93),
            _word("singers", 1128, 293, 75, 26, conf=93),
            _word("took", 1212, 291, 55, 22, conf=96),
            _word("part", 1276, 294, 47, 25, conf=96),
            _word("in", 1332, 292, 20, 20, conf=95),
            _word("the", 1361, 292, 36, 21, conf=95),
        ]
        # Phrase-level boxes as PaddleOCR returns them in practice.
        paddle_raw = [
            _word("take part(in) [teik'pa:t]", 185, 287, 281, 29, conf=90),
            _word("teilnehmen (an.mitmachen", 526, 282, 329, 35, conf=93),
            _word("More than 200 singers took part in the", 944, 287, 454, 29, conf=96),
        ]

        # Step 1: break the phrase boxes up into word boxes.
        paddle_split = _split_paddle_multi_words(paddle_raw)
        assert len(paddle_split) > len(paddle_raw), "Should have more words after split"

        # Step 2: merge both engines' word lists.
        merged = _merge_paddle_tesseract(paddle_split, tess)

        # Step 3: two words with the same text (ignoring case) must not sit
        # at nearly the same centre point.
        for idx, earlier in enumerate(merged):
            for later in merged[idx + 1:]:
                if earlier["text"].lower() != later["text"].lower():
                    continue
                centre_x_a = earlier["left"] + earlier.get("width", 0) / 2
                centre_x_b = later["left"] + later.get("width", 0) / 2
                centre_y_a = earlier["top"] + earlier.get("height", 0) / 2
                centre_y_b = later["top"] + later.get("height", 0) / 2
                far_apart = (
                    abs(centre_x_a - centre_x_b) >= 30
                    or abs(centre_y_a - centre_y_b) >= 15
                )
                assert far_apart, (
                    f"Near-duplicate: '{earlier['text']}' at ({earlier['left']},{earlier['top']}) "
                    f"vs ({later['left']},{later['top']})"
                )