diff --git a/klausur-service/backend/ocr_pipeline_api.py b/klausur-service/backend/ocr_pipeline_api.py index 89f874b..11a88f2 100644 --- a/klausur-service/backend/ocr_pipeline_api.py +++ b/klausur-service/backend/ocr_pipeline_api.py @@ -3219,8 +3219,10 @@ def _merge_row_sequences(paddle_row: list, tess_row: list) -> list: # Same text or one contains the other is_same = (pt == tt) or (len(pt) > 1 and len(tt) > 1 and (pt in tt or tt in pt)) - # Spatial overlap check: if words overlap >= 50% horizontally, - # they're the same physical word regardless of OCR text differences + # Spatial overlap check: if words overlap >= 40% horizontally, + # they're the same physical word regardless of OCR text differences. + # (40% catches borderline cases like "Stick"/"Stück" at 48% overlap) + spatial_match = False if not is_same: overlap_left = max(pw["left"], tw["left"]) overlap_right = min( @@ -3229,8 +3231,9 @@ def _merge_row_sequences(paddle_row: list, tess_row: list) -> list: ) overlap_w = max(0, overlap_right - overlap_left) min_w = min(pw.get("width", 1), tw.get("width", 1)) - if min_w > 0 and overlap_w / min_w >= 0.5: + if min_w > 0 and overlap_w / min_w >= 0.4: is_same = True + spatial_match = True if is_same: # Matched — average coordinates weighted by confidence @@ -3239,8 +3242,14 @@ def _merge_row_sequences(paddle_row: list, tess_row: list) -> list: total = pc + tc if total == 0: total = 1 + # Text: prefer higher-confidence engine when texts differ + # (e.g. Tesseract "Stück" conf=98 vs PaddleOCR "Stick" conf=80) + if spatial_match and pc < tc: + best_text = tw["text"] + else: + best_text = pw["text"] merged.append({ - "text": pw["text"], # Paddle text preferred + "text": best_text, "left": round((pw["left"] * pc + tw["left"] * tc) / total), "top": round((pw["top"] * pc + tw["top"] * tc) / total), "width": round((pw["width"] * pc + tw["width"] * tc) / total),