fix: split PaddleOCR multi-word boxes before merge

PaddleOCR returns entire phrases as single boxes (e.g. "More than 200 singers took part in the"). The merge algorithm compares word by word, but Paddle delivered multi-word boxes while Tesseract delivers individual words, so nothing matched and every Tesseract word was added as an "extra", causing duplicates. Paddle boxes are now split into individual words before the merge.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-13 10:39:10 +01:00
parent 41ff7671cd
commit 703e110bab
2 changed files with 172 additions and 2 deletions
--- a/klausur-service/backend/ocr_pipeline_api.py
+++ b/klausur-service/backend/ocr_pipeline_api.py
@@ -2599,6 +2599,53 @@ async def paddle_direct(session_id: str):
    return {"session_id": session_id, **word_result}


+def _split_paddle_multi_words(words: list) -> list:
+    """Split PaddleOCR multi-word boxes into individual word boxes.
+
+    PaddleOCR often returns a whole phrase as one box, e.g. "More than 200
+    singers took part in the". Text is cut at whitespace, before "[" (IPA:
+    "badge[bxd3]" → ["badge", "[bxd3]"]) and after "!" when a letter follows
+    ("!Betonung" → ["!", "Betonung"]); widths are proportional to token
+    length. Empty boxes are dropped, single-token boxes pass through as-is.
+    """
+    import re
+
+    result = []
+    for w in words:
+        raw_text = w.get("text", "").strip()
+        if not raw_text:
+            continue  # drop boxes whose text is empty or whitespace-only
+        # Split on whitespace, before "[" (IPA), and after "!" before letter
+        tokens = re.split(
+            r'\s+|(?=\[)|(?<=!)(?=[A-Za-z\u00c0-\u024f])', raw_text
+        )
+        tokens = [t for t in tokens if t]
+
+        if len(tokens) <= 1:
+            result.append(w)  # nothing to split: keep the original box object
+        else:
+            # Distribute the box width over the tokens by character count
+            total_chars = sum(len(t) for t in tokens)
+            if total_chars == 0:
+                continue  # defensive: tokens are non-empty here, cannot trigger
+            n_gaps = len(tokens) - 1
+            gap_px = w["width"] * 0.02  # 2% of the box width per inter-word gap
+            usable_w = w["width"] - gap_px * n_gaps
+            cursor = w["left"]
+            for t in tokens:
+                token_w = max(1, usable_w * len(t) / total_chars)  # >= 1 px wide
+                result.append({
+                    "text": t,
+                    "left": round(cursor),
+                    "top": w["top"],
+                    "width": round(token_w),
+                    "height": w["height"],
+                    "conf": w.get("conf", 0),  # each token inherits the box conf
+                })
+                cursor += token_w + gap_px  # advance by the unrounded width
+    return result
+
+
 def _group_words_into_rows(words: list, row_gap: int = 12) -> list:
    """Group words into rows by Y-position clustering.

@@ -2842,11 +2889,18 @@ async def paddle_kombi(session_id: str):
            "conf": conf,
        })

+    # --- Split multi-word Paddle boxes into individual words ---
+    paddle_words_split = _split_paddle_multi_words(paddle_words)
+    logger.info(
+        "paddle_kombi: split %d paddle boxes → %d individual words",
+        len(paddle_words), len(paddle_words_split),
+    )
+
    # --- Merge ---
-    if not paddle_words and not tess_words:
+    if not paddle_words_split and not tess_words:
        raise HTTPException(status_code=400, detail="Both OCR engines returned no words")

-    merged_words = _merge_paddle_tesseract(paddle_words, tess_words)
+    merged_words = _merge_paddle_tesseract(paddle_words_split, tess_words)

    cells, columns_meta = build_grid_from_words(merged_words, img_w, img_h)
    duration = time.time() - t0
@@ -2870,12 +2924,14 @@ async def paddle_kombi(session_id: str):
        "ocr_engine": "kombi",
        "grid_method": "kombi",
        "raw_paddle_words": paddle_words,
+        "raw_paddle_words_split": paddle_words_split,
        "raw_tesseract_words": tess_words,
        "summary": {
            "total_cells": len(cells),
            "non_empty_cells": sum(1 for c in cells if c.get("text")),
            "low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50),
            "paddle_words": len(paddle_words),
+            "paddle_words_split": len(paddle_words_split),
            "tesseract_words": len(tess_words),
            "merged_words": len(merged_words),
        },