fix: deduplicate overlapping words after Paddle+Tesseract merge
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 1m56s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 18s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 1m56s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 18s
PaddleOCR can return overlapping phrases (e.g. "von jm." and "jm. =") that produce duplicate words after splitting. This commit adds a post-merge pass, _deduplicate_words(), that removes words with the same text at overlapping positions.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -3359,6 +3359,45 @@ def _merge_paddle_tesseract(paddle_words: list, tess_words: list) -> list:
|
|||||||
return merged_all
|
return merged_all
|
||||||
|
|
||||||
|
|
||||||
|
def _dedup_span_overlaps(
    start_a: float, size_a: float, start_b: float, size_b: float, ratio: float
) -> bool:
    """Return True if two 1-D spans overlap by at least *ratio* of the shorter one."""
    shorter = min(size_a, size_b)
    if shorter <= 0:
        # A zero-sized (or missing) extent cannot meaningfully overlap anything.
        return False
    shared = min(start_a + size_a, start_b + size_b) - max(start_a, start_b)
    return max(0, shared) / shorter >= ratio


def _dedup_boxes_overlap(a: dict, b: dict, ratio: float) -> bool:
    """Return True if word boxes *a* and *b* overlap horizontally and, if known, vertically."""
    if not _dedup_span_overlaps(
        a["left"], a.get("width", 0), b["left"], b.get("width", 0), ratio
    ):
        return False

    # Vertical geometry is optional: only use it when both boxes provide a
    # usable top/height, otherwise fall back to the horizontal-only check.
    try:
        a_top, a_height = a["top"], a["height"]
        b_top, b_height = b["top"], b["height"]
    except KeyError:
        return True
    if min(a_height, b_height) <= 0:
        return True
    return _dedup_span_overlaps(a_top, a_height, b_top, b_height, ratio)


def _deduplicate_words(words: list, overlap_ratio: float = 0.4) -> list:
    """Remove duplicate words with same text at overlapping positions.

    PaddleOCR can return overlapping phrases (e.g. "von jm." and "jm. =")
    that produce duplicate words after splitting. This pass removes them.

    Two words are duplicates when their text matches (case-insensitively,
    ignoring surrounding whitespace) and their boxes overlap by at least
    ``overlap_ratio`` of the smaller box. The horizontal axis is always
    checked; the vertical axis is checked as well when both words carry
    ``top`` and ``height``. Without the vertical check, the same word
    repeated on different rows at a similar x-position would be dropped.

    Args:
        words: Word dicts with at least ``left``; ``text``, ``width``,
            ``top`` and ``height`` are read when present.
        overlap_ratio: Minimum overlap, as a fraction of the smaller
            extent, required on each checked axis.

    Returns:
        A new list holding the first occurrence of each word, in input
        order. Words whose text is empty or whitespace-only are dropped.
        An empty input is returned unchanged.
    """
    if not words:
        return words

    result: list = []
    for w in words:
        wt = w.get("text", "").lower().strip()
        if not wt:
            continue
        is_dup = any(
            wt == existing.get("text", "").lower().strip()
            and _dedup_boxes_overlap(w, existing, overlap_ratio)
            for existing in result
        )
        if not is_dup:
            result.append(w)

    # NOTE: this count also includes words dropped for having blank text.
    removed = len(words) - len(result)
    if removed:
        logger.info("dedup: removed %d duplicate words", removed)
    return result
|
||||||
|
|
||||||
|
|
||||||
@router.post("/sessions/{session_id}/paddle-kombi")
|
@router.post("/sessions/{session_id}/paddle-kombi")
|
||||||
async def paddle_kombi(session_id: str):
|
async def paddle_kombi(session_id: str):
|
||||||
"""Run PaddleOCR + Tesseract on the preprocessed image and merge results.
|
"""Run PaddleOCR + Tesseract on the preprocessed image and merge results.
|
||||||
@@ -3429,6 +3468,7 @@ async def paddle_kombi(session_id: str):
|
|||||||
raise HTTPException(status_code=400, detail="Both OCR engines returned no words")
|
raise HTTPException(status_code=400, detail="Both OCR engines returned no words")
|
||||||
|
|
||||||
merged_words = _merge_paddle_tesseract(paddle_words_split, tess_words)
|
merged_words = _merge_paddle_tesseract(paddle_words_split, tess_words)
|
||||||
|
merged_words = _deduplicate_words(merged_words)
|
||||||
|
|
||||||
cells, columns_meta = build_grid_from_words(merged_words, img_w, img_h)
|
cells, columns_meta = build_grid_from_words(merged_words, img_w, img_h)
|
||||||
duration = time.time() - t0
|
duration = time.time() - t0
|
||||||
@@ -3564,6 +3604,7 @@ async def rapid_kombi(session_id: str):
|
|||||||
raise HTTPException(status_code=400, detail="Both OCR engines returned no words")
|
raise HTTPException(status_code=400, detail="Both OCR engines returned no words")
|
||||||
|
|
||||||
merged_words = _merge_paddle_tesseract(rapid_words_split, tess_words)
|
merged_words = _merge_paddle_tesseract(rapid_words_split, tess_words)
|
||||||
|
merged_words = _deduplicate_words(merged_words)
|
||||||
|
|
||||||
cells, columns_meta = build_grid_from_words(merged_words, img_w, img_h)
|
cells, columns_meta = build_grid_from_words(merged_words, img_w, img_h)
|
||||||
duration = time.time() - t0
|
duration = time.time() - t0
|
||||||
|
|||||||
Reference in New Issue
Block a user