fix: add _deduplicate_words safety net to Kombi merge

Even after multi-criteria matching, near-duplicate words can slip through (same text, centers within 30px horizontally and 15px vertically). The new _deduplicate_words() removes these, keeping the higher-confidence copy. A regression test with real session data (row 2 with 145 near-duplicates) confirms that no duplicates remain after merge + deduplication. Tests: 37 → 45 (added TestDeduplicateWords, TestMergeRealWorldRegression). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-13 08:27:45 +01:00
parent 4f2fb0e94c
commit 4280298e02
2 changed files with 171 additions and 1 deletions
--- a/klausur-service/backend/ocr_pipeline_api.py
+++ b/klausur-service/backend/ocr_pipeline_api.py
@@ -2729,7 +2729,45 @@ def _merge_paddle_tesseract(paddle_words: list, tess_words: list) -> list:
        if ti not in used_tess and tw.get("conf", 0) >= 40:
            merged.append(tw)

-    return merged
+    # Safety net: deduplicate any remaining near-duplicate words
+    return _deduplicate_words(merged)
+
+
+def _deduplicate_words(words: list) -> list:
+    """Remove near-duplicate words that slipped through matching.
+
+    Two words are duplicates when their stripped, lower-cased "text" is equal
+    and their centers differ by < 30px horizontally and < 15px vertically.
+    Of such a pair the lower-"conf" word is dropped (a tie keeps the earlier
+    one); the surviving word dicts are returned in their original order.
+    """
+    if len(words) <= 1:  # nothing to compare against
+        return words
+    keep = [True] * len(words)  # keep[k] turns False once words[k] is dropped
+    for i in range(len(words)):
+        if not keep[i]:  # already dropped as a duplicate of an earlier word
+            continue
+        w1 = words[i]
+        cx1 = w1["left"] + w1.get("width", 0) / 2  # center x; missing width -> 0
+        cy1 = w1["top"] + w1.get("height", 0) / 2  # center y; missing height -> 0
+        t1 = w1.get("text", "").lower().strip()  # missing text compares as ""
+        for j in range(i + 1, len(words)):  # later words only
+            if not keep[j]:
+                continue
+            w2 = words[j]
+            t2 = w2.get("text", "").lower().strip()
+            if t1 != t2:  # different text is never a duplicate
+                continue
+            cx2 = w2["left"] + w2.get("width", 0) / 2
+            cy2 = w2["top"] + w2.get("height", 0) / 2
+            if abs(cx1 - cx2) < 30 and abs(cy1 - cy2) < 15:
+                # Drop the one with lower confidence; on a tie w1 (earlier) stays
+                if w1.get("conf", 0) >= w2.get("conf", 0):  # missing conf -> 0
+                    keep[j] = False
+                else:
+                    keep[i] = False
+                    break  # w1 is dropped, stop comparing
+    return [w for w, k in zip(words, keep) if k]


@router.post("/sessions/{session_id}/paddle-kombi")