From a25214126d3d19be292bed9e360ae0fe8b272198 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Wed, 18 Mar 2026 07:00:57 +0100 Subject: [PATCH] =?UTF-8?q?fix:=20merge=20overlapping=20OCR=20words=20with?= =?UTF-8?q?=20different=20text=20(Stick/St=C3=BCck)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two issues in paddle-kombi word merge: 1. Overlap threshold too strict: PaddleOCR "Stick" and Tesseract "Stück" overlap at 48.6%, just below the 50% threshold. Both words ended up in the result, overlapping on the same position. Fix: lower threshold from 50% to 40%. 2. Text selection blind to confidence: always took PaddleOCR text even when Tesseract had higher confidence and correct text. Fix: when texts differ due to spatial-only match, prefer the engine with higher confidence. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/ocr_pipeline_api.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/klausur-service/backend/ocr_pipeline_api.py b/klausur-service/backend/ocr_pipeline_api.py index 89f874b..11a88f2 100644 --- a/klausur-service/backend/ocr_pipeline_api.py +++ b/klausur-service/backend/ocr_pipeline_api.py @@ -3219,8 +3219,10 @@ def _merge_row_sequences(paddle_row: list, tess_row: list) -> list: # Same text or one contains the other is_same = (pt == tt) or (len(pt) > 1 and len(tt) > 1 and (pt in tt or tt in pt)) - # Spatial overlap check: if words overlap >= 50% horizontally, - # they're the same physical word regardless of OCR text differences + # Spatial overlap check: if words overlap >= 40% horizontally, + # they're the same physical word regardless of OCR text differences. + # (40% catches borderline cases like "Stick"/"Stück" at 48% overlap) + spatial_match = False if not is_same: overlap_left = max(pw["left"], tw["left"]) overlap_right = min( @@ -3229,8 +3231,9 @@ def _merge_row_sequences(paddle_row: list, tess_row: list) -> list: ) overlap_w = max(0, overlap_right - overlap_left) min_w = min(pw.get("width", 1), tw.get("width", 1)) - if min_w > 0 and overlap_w / min_w >= 0.5: + if min_w > 0 and overlap_w / min_w >= 0.4: is_same = True + spatial_match = True if is_same: # Matched — average coordinates weighted by confidence @@ -3239,8 +3242,14 @@ def _merge_row_sequences(paddle_row: list, tess_row: list) -> list: total = pc + tc if total == 0: total = 1 + # Text: prefer higher-confidence engine when texts differ + # (e.g. Tesseract "Stück" conf=98 vs PaddleOCR "Stick" conf=80) + if spatial_match and pc < tc: + best_text = tw["text"] + else: + best_text = pw["text"] merged.append({ - "text": pw["text"], # Paddle text preferred + "text": best_text, "left": round((pw["left"] * pc + tw["left"] * tc) / total), "top": round((pw["top"] * pc + tw["top"] * tc) / total), "width": round((pw["width"] * pc + tw["width"] * tc) / total),