fix: split PaddleOCR multi-word boxes before merge
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 32s
CI / test-python-klausur (push) Failing after 2m1s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 19s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 32s
CI / test-python-klausur (push) Failing after 2m1s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 19s
PaddleOCR returns entire phrases as single boxes (e.g. "More than 200 singers took part in the"). The merge algorithm compared word-by-word but Paddle had multi-word boxes vs Tesseract's individual words, so nothing matched and all Tesseract words were added as "extras" causing duplicates. Now splits Paddle boxes into individual words before merge. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -2599,6 +2599,53 @@ async def paddle_direct(session_id: str):
|
||||
return {"session_id": session_id, **word_result}
|
||||
|
||||
|
||||
def _split_paddle_multi_words(words: list) -> list:
|
||||
"""Split PaddleOCR multi-word boxes into individual word boxes.
|
||||
|
||||
PaddleOCR often returns entire phrases as a single box, e.g.
|
||||
"More than 200 singers took part in the" with one bounding box.
|
||||
This splits them into individual words with proportional widths.
|
||||
Also handles leading "!" (e.g. "!Betonung" → ["!", "Betonung"])
|
||||
and IPA brackets (e.g. "badge[bxd3]" → ["badge", "[bxd3]"]).
|
||||
"""
|
||||
import re
|
||||
|
||||
result = []
|
||||
for w in words:
|
||||
raw_text = w.get("text", "").strip()
|
||||
if not raw_text:
|
||||
continue
|
||||
# Split on whitespace, before "[" (IPA), and after "!" before letter
|
||||
tokens = re.split(
|
||||
r'\s+|(?=\[)|(?<=!)(?=[A-Za-z\u00c0-\u024f])', raw_text
|
||||
)
|
||||
tokens = [t for t in tokens if t]
|
||||
|
||||
if len(tokens) <= 1:
|
||||
result.append(w)
|
||||
else:
|
||||
# Split proportionally by character count
|
||||
total_chars = sum(len(t) for t in tokens)
|
||||
if total_chars == 0:
|
||||
continue
|
||||
n_gaps = len(tokens) - 1
|
||||
gap_px = w["width"] * 0.02
|
||||
usable_w = w["width"] - gap_px * n_gaps
|
||||
cursor = w["left"]
|
||||
for t in tokens:
|
||||
token_w = max(1, usable_w * len(t) / total_chars)
|
||||
result.append({
|
||||
"text": t,
|
||||
"left": round(cursor),
|
||||
"top": w["top"],
|
||||
"width": round(token_w),
|
||||
"height": w["height"],
|
||||
"conf": w.get("conf", 0),
|
||||
})
|
||||
cursor += token_w + gap_px
|
||||
return result
|
||||
|
||||
|
||||
def _group_words_into_rows(words: list, row_gap: int = 12) -> list:
|
||||
"""Group words into rows by Y-position clustering.
|
||||
|
||||
@@ -2842,11 +2889,18 @@ async def paddle_kombi(session_id: str):
|
||||
"conf": conf,
|
||||
})
|
||||
|
||||
# --- Split multi-word Paddle boxes into individual words ---
|
||||
paddle_words_split = _split_paddle_multi_words(paddle_words)
|
||||
logger.info(
|
||||
"paddle_kombi: split %d paddle boxes → %d individual words",
|
||||
len(paddle_words), len(paddle_words_split),
|
||||
)
|
||||
|
||||
# --- Merge ---
|
||||
if not paddle_words and not tess_words:
|
||||
if not paddle_words_split and not tess_words:
|
||||
raise HTTPException(status_code=400, detail="Both OCR engines returned no words")
|
||||
|
||||
merged_words = _merge_paddle_tesseract(paddle_words, tess_words)
|
||||
merged_words = _merge_paddle_tesseract(paddle_words_split, tess_words)
|
||||
|
||||
cells, columns_meta = build_grid_from_words(merged_words, img_w, img_h)
|
||||
duration = time.time() - t0
|
||||
@@ -2870,12 +2924,14 @@ async def paddle_kombi(session_id: str):
|
||||
"ocr_engine": "kombi",
|
||||
"grid_method": "kombi",
|
||||
"raw_paddle_words": paddle_words,
|
||||
"raw_paddle_words_split": paddle_words_split,
|
||||
"raw_tesseract_words": tess_words,
|
||||
"summary": {
|
||||
"total_cells": len(cells),
|
||||
"non_empty_cells": sum(1 for c in cells if c.get("text")),
|
||||
"low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50),
|
||||
"paddle_words": len(paddle_words),
|
||||
"paddle_words_split": len(paddle_words_split),
|
||||
"tesseract_words": len(tess_words),
|
||||
"merged_words": len(merged_words),
|
||||
},
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
"""Tests for the Kombi-Modus row-based sequence merge algorithm.
|
||||
|
||||
Functions under test (ocr_pipeline_api.py):
|
||||
- _split_paddle_multi_words: Split multi-word PaddleOCR boxes into individual words
|
||||
- _group_words_into_rows: Cluster words by Y-position into rows
|
||||
- _merge_row_sequences: Merge two word sequences within the same row
|
||||
- _merge_paddle_tesseract: Full merge with row matching + sequence dedup
|
||||
@@ -13,6 +14,7 @@ import os
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
||||
|
||||
from ocr_pipeline_api import (
|
||||
_split_paddle_multi_words,
|
||||
_group_words_into_rows,
|
||||
_merge_row_sequences,
|
||||
_merge_paddle_tesseract,
|
||||
@@ -35,6 +37,65 @@ def _word(text: str, left: int, top: int, width: int = 60, height: int = 20, con
|
||||
}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _split_paddle_multi_words
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestSplitPaddleMultiWords:
    """Unit tests for splitting multi-word PaddleOCR boxes into words."""

    def test_single_word_unchanged(self):
        """A box holding one word yields exactly that one word."""
        boxes = _split_paddle_multi_words([_word("hello", 100, 50, 80, 20)])
        assert [b["text"] for b in boxes] == ["hello"]

    def test_multi_word_split(self):
        """'More than 200' as one box → 3 individual words."""
        box_left, box_width = 100, 300
        pieces = _split_paddle_multi_words(
            [_word("More than 200", box_left, 50, box_width, 20)]
        )
        assert [p["text"] for p in pieces] == ["More", "than", "200"]
        # The outer edges must stay inside the original box
        # (a few pixels of rounding slack are tolerated on the right).
        first, last = pieces[0], pieces[-1]
        assert first["left"] >= box_left
        assert last["left"] + last["width"] <= box_left + box_width + 5

    def test_exclamation_split(self):
        """'!Betonung' → ['!', 'Betonung']."""
        pieces = _split_paddle_multi_words([_word("!Betonung", 100, 50, 120, 20)])
        assert [p["text"] for p in pieces] == ["!", "Betonung"]

    def test_ipa_bracket_split(self):
        """'badge[bxd3]' → ['badge', '[bxd3]']."""
        pieces = _split_paddle_multi_words([_word("badge[bxd3]", 100, 50, 150, 20)])
        assert [p["text"] for p in pieces] == ["badge", "[bxd3]"]

    def test_long_phrase(self):
        """'More than 200 singers took part in the' → 8 words."""
        phrase = "More than 200 singers took part in the"
        pieces = _split_paddle_multi_words([_word(phrase, 944, 287, 454, 29)])
        expected = ["More", "than", "200", "singers", "took", "part", "in", "the"]
        assert len(pieces) == 8
        assert [p["text"] for p in pieces] == expected

    def test_empty_input(self):
        """No boxes in, no boxes out."""
        assert _split_paddle_multi_words([]) == []

    def test_preserves_top_and_height(self):
        """Vertical geometry is copied unchanged onto every split word."""
        pieces = _split_paddle_multi_words([_word("a b", 100, 50, 200, 25)])
        assert all(p["top"] == 50 for p in pieces)
        assert all(p["height"] == 25 for p in pieces)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _group_words_into_rows
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -347,3 +408,56 @@ class TestMergeRealWorldRegression:
|
||||
be_word = [w for w in merged if w["text"] == "be"][0]
|
||||
take_word = [w for w in merged if w["text"] == "take"][0]
|
||||
assert abs(be_word["top"] - take_word["top"]) > 30, "Rows should stay separate"
|
||||
|
||||
|
||||
class TestSplitThenMerge:
    """End-to-end check: split multi-word Paddle boxes, then merge with Tesseract."""

    def test_multi_word_paddle_boxes_no_duplicates(self):
        """Phrase-level Paddle boxes must not produce duplicate words
        once they are split and merged with per-word Tesseract output."""
        # Paddle side: whole phrases in single boxes (real-world behavior).
        paddle_raw = [
            _word("take part(in) [teik'pa:t]", 185, 287, 281, 29, conf=90),
            _word("teilnehmen (an.mitmachen", 526, 282, 329, 35, conf=93),
            _word("More than 200 singers took part in the", 944, 287, 454, 29, conf=96),
        ]
        # Tesseract side: one box per word, given as
        # (text, left, top, width, height, conf).
        tess_rows = [
            ("take", 188, 289, 52, 21, 96),
            ("part", 249, 292, 48, 24, 96),
            ("(in)", 305, 290, 38, 24, 93),
            ("[teık", 352, 292, 47, 21, 90),
            ("'pa:t]", 407, 292, 55, 23, 89),
            ("teilnehmen", 534, 290, 127, 21, 95),
            ("(an),", 671, 291, 48, 23, 96),
            ("mitmachen", 730, 290, 123, 22, 96),
            ("More", 948, 292, 60, 20, 90),
            ("than", 1017, 291, 49, 21, 96),
            ("200", 1076, 292, 43, 20, 93),
            ("singers", 1128, 293, 75, 26, 93),
            ("took", 1212, 291, 55, 22, 96),
            ("part", 1276, 294, 47, 25, 96),
            ("in", 1332, 292, 20, 20, 95),
            ("the", 1361, 292, 36, 21, 95),
        ]
        tess = [
            _word(text, left, top, width, height, conf=conf)
            for text, left, top, width, height, conf in tess_rows
        ]

        # Step 1: break the phrase boxes apart.
        paddle_split = _split_paddle_multi_words(paddle_raw)
        assert len(paddle_split) > len(paddle_raw), "Should have more words after split"

        # Step 2: merge both engines' words.
        merged = _merge_paddle_tesseract(paddle_split, tess)

        # Step 3: any two words with the same text (case-insensitive) must
        # have centres at least 30 px apart horizontally or 15 px vertically.
        for idx, first in enumerate(merged):
            for second in merged[idx + 1:]:
                if first["text"].lower() != second["text"].lower():
                    continue
                first_cx = first["left"] + first.get("width", 0) / 2
                second_cx = second["left"] + second.get("width", 0) / 2
                first_cy = first["top"] + first.get("height", 0) / 2
                second_cy = second["top"] + second.get("height", 0) / 2
                far_apart = (
                    abs(first_cx - second_cx) >= 30
                    or abs(first_cy - second_cy) >= 15
                )
                assert far_apart, (
                    f"Near-duplicate: '{first['text']}' at ({first['left']},{first['top']}) "
                    f"vs ({second['left']},{second['top']})"
                )
|
||||
|
||||
Reference in New Issue
Block a user