From bbf0a5720e9a91a75558246d72bd4bf5d6c8a470 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Tue, 17 Mar 2026 10:57:44 +0100 Subject: [PATCH] fix: require both horizontal AND vertical overlap for word dedup Previous version only checked X overlap, causing false positives for short words like "=" and "I" that appear at similar X positions in different rows. Now requires >=50% overlap in both dimensions. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/ocr_pipeline_api.py | 27 ++++++++++++++------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/klausur-service/backend/ocr_pipeline_api.py b/klausur-service/backend/ocr_pipeline_api.py index a7ed95e..89f874b 100644 --- a/klausur-service/backend/ocr_pipeline_api.py +++ b/klausur-service/backend/ocr_pipeline_api.py @@ -3359,11 +3359,14 @@ def _merge_paddle_tesseract(paddle_words: list, tess_words: list) -> list: return merged_all -def _deduplicate_words(words: list, overlap_ratio: float = 0.4) -> list: +def _deduplicate_words(words: list) -> list: """Remove duplicate words with same text at overlapping positions. PaddleOCR can return overlapping phrases (e.g. "von jm." and "jm. =") that produce duplicate words after splitting. This pass removes them. + + A word is a duplicate only when BOTH horizontal AND vertical overlap + exceed 50% — same text on the same visual line at the same position. """ if not words: return words @@ -3374,19 +3377,25 @@ def _deduplicate_words(words: list, overlap_ratio: float = 0.4) -> list: if not wt: continue is_dup = False + w_right = w["left"] + w.get("width", 0) + w_bottom = w["top"] + w.get("height", 0) for existing in result: et = existing.get("text", "").lower().strip() if wt != et: continue - # Check horizontal overlap - ol = max(w["left"], existing["left"]) - or_ = min( - w["left"] + w.get("width", 0), - existing["left"] + existing.get("width", 0), - ) - ow = max(0, or_ - ol) + # Horizontal overlap + ox_l = max(w["left"], existing["left"]) + ox_r = min(w_right, existing["left"] + existing.get("width", 0)) + ox = max(0, ox_r - ox_l) min_w = min(w.get("width", 1), existing.get("width", 1)) - if min_w > 0 and ow / min_w >= overlap_ratio: + if min_w <= 0 or ox / min_w < 0.5: + continue + # Vertical overlap — must also be on the same line + oy_t = max(w["top"], existing["top"]) + oy_b = min(w_bottom, existing["top"] + existing.get("height", 0)) + oy = max(0, oy_b - oy_t) + min_h = min(w.get("height", 1), existing.get("height", 1)) + if min_h > 0 and oy / min_h >= 0.5: is_dup = True break if not is_dup: