From 29d3c1caf53075575440d206bd1f0e0baad004a0 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Tue, 17 Mar 2026 10:47:42 +0100 Subject: [PATCH] fix: deduplicate overlapping words after Paddle+Tesseract merge PaddleOCR can return overlapping phrases (e.g. "von jm." and "jm. =") that produce duplicate words after splitting. Added _deduplicate_words() post-merge pass that removes words with same text at overlapping positions. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/ocr_pipeline_api.py | 41 +++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/klausur-service/backend/ocr_pipeline_api.py b/klausur-service/backend/ocr_pipeline_api.py index 04f4cdb..a7ed95e 100644 --- a/klausur-service/backend/ocr_pipeline_api.py +++ b/klausur-service/backend/ocr_pipeline_api.py @@ -3359,6 +3359,45 @@ def _merge_paddle_tesseract(paddle_words: list, tess_words: list) -> list: return merged_all +def _deduplicate_words(words: list, overlap_ratio: float = 0.4) -> list: + """Remove duplicate words with same text at overlapping positions. + + PaddleOCR can return overlapping phrases (e.g. "von jm." and "jm. =") + that produce duplicate words after splitting. This pass removes them. + """ + if not words: + return words + + result: list = [] + for w in words: + wt = w.get("text", "").lower().strip() + if not wt: + continue + is_dup = False + for existing in result: + et = existing.get("text", "").lower().strip() + if wt != et: + continue + # Check horizontal overlap + ol = max(w["left"], existing["left"]) + or_ = min( + w["left"] + w.get("width", 0), + existing["left"] + existing.get("width", 0), + ) + ow = max(0, or_ - ol) + min_w = min(w.get("width", 1), existing.get("width", 1)) + if min_w > 0 and ow / min_w >= overlap_ratio: + is_dup = True + break + if not is_dup: + result.append(w) + + removed = len(words) - len(result) + if removed: + logger.info("dedup: removed %d duplicate words", removed) + return result + + @router.post("/sessions/{session_id}/paddle-kombi") async def paddle_kombi(session_id: str): """Run PaddleOCR + Tesseract on the preprocessed image and merge results. @@ -3429,6 +3468,7 @@ async def paddle_kombi(session_id: str): raise HTTPException(status_code=400, detail="Both OCR engines returned no words") merged_words = _merge_paddle_tesseract(paddle_words_split, tess_words) + merged_words = _deduplicate_words(merged_words) cells, columns_meta = build_grid_from_words(merged_words, img_w, img_h) duration = time.time() - t0 @@ -3564,6 +3604,7 @@ async def rapid_kombi(session_id: str): raise HTTPException(status_code=400, detail="Both OCR engines returned no words") merged_words = _merge_paddle_tesseract(rapid_words_split, tess_words) + merged_words = _deduplicate_words(merged_words) cells, columns_meta = build_grid_from_words(merged_words, img_w, img_h) duration = time.time() - t0