fix: deduplicate overlapping OCR words and use per-word Y positions in overlay

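Backend: Add a spatial overlap check to the Kombi merge so that words at the same position are deduplicated even when their OCR text differs. Two words are treated as the same physical word when their horizontal overlap is at least 50% of the narrower word's width (overlap / min width, not a true IoU); vertical position is not part of this check.

Frontend: Add yPct/hPct to WordPosition so each word renders at its actual vertical position instead of all words collapsing to the cell center Y.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>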
2026-03-13 20:27:08 +01:00
parent 703e110bab
commit d6f51e4418
5 changed files with 70 additions and 4 deletions
--- a/klausur-service/backend/ocr_pipeline_api.py
+++ b/klausur-service/backend/ocr_pipeline_api.py
@@ -2704,6 +2704,19 @@ def _merge_row_sequences(paddle_row: list, tess_row: list) -> list:
        # Same text or one contains the other
        is_same = (pt == tt) or (len(pt) > 1 and len(tt) > 1 and (pt in tt or tt in pt))

+        # Spatial overlap check: if words overlap >= 50% horizontally,
+        # they're the same physical word regardless of OCR text differences
+        if not is_same:
+            overlap_left = max(pw["left"], tw["left"])
+            overlap_right = min(
+                pw["left"] + pw.get("width", 0),
+                tw["left"] + tw.get("width", 0),
+            )
+            overlap_w = max(0, overlap_right - overlap_left)
+            min_w = min(pw.get("width", 1), tw.get("width", 1))
+            if min_w > 0 and overlap_w / min_w >= 0.5:
+                is_same = True
+
        if is_same:
            # Matched — average coordinates weighted by confidence
            pc = pw.get("conf", 80)