From 29d3c1caf53075575440d206bd1f0e0baad004a0 Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBookPro.fritz.box>
Date: Tue, 17 Mar 2026 10:47:42 +0100
Subject: [PATCH] fix: deduplicate overlapping words after Paddle+Tesseract
 merge

PaddleOCR can return overlapping phrases (e.g. "von jm." and "jm. =")
that produce duplicate words after splitting. Added a _deduplicate_words()
post-merge pass that removes words with the same text at overlapping positions.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 klausur-service/backend/ocr_pipeline_api.py | 41 +++++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/klausur-service/backend/ocr_pipeline_api.py b/klausur-service/backend/ocr_pipeline_api.py
index 04f4cdb..a7ed95e 100644
--- a/klausur-service/backend/ocr_pipeline_api.py
+++ b/klausur-service/backend/ocr_pipeline_api.py
@@ -3359,6 +3359,45 @@ def _merge_paddle_tesseract(paddle_words: list, tess_words: list) -> list:
     return merged_all
 
 
+def _deduplicate_words(words: list, overlap_ratio: float = 0.4) -> list:
+    """Remove duplicate words with the same text at overlapping positions.
+
+    PaddleOCR can return overlapping phrases (e.g. "von jm." and "jm. =")
+    that produce duplicate words after splitting.  A word is dropped if a kept
+    word has the same text (case-insensitive, stripped) and shares at least
+    ``overlap_ratio`` of the smaller box horizontally and -- when both carry
+    "top"/"height" -- vertically.  Returns kept words in order; no empty text.
+    """
+    if not words:
+        return words
+    def _overlaps(a: dict, b: dict, pos: str, size: str) -> bool:
+        # Shared extent on one axis as a fraction of the smaller box.
+        lo = max(a[pos], b[pos])
+        hi = min(a[pos] + a.get(size, 0), b[pos] + b.get(size, 0))
+        smaller = min(a.get(size, 0), b.get(size, 0))
+        return smaller > 0 and max(0, hi - lo) / smaller >= overlap_ratio
+    result, kept = [], {}  # kept: normalized text -> words kept so far
+    for w in words:
+        wt = w.get("text", "").lower().strip()
+        if not wt:
+            continue
+        same_text = kept.setdefault(wt, [])
+        for e in same_text:
+            # Same text in another row is no duplicate (needs rows on both).
+            has_rows = all(k in d for d in (w, e) for k in ("top", "height"))
+            if _overlaps(w, e, "left", "width") and (
+                not has_rows or _overlaps(w, e, "top", "height")
+            ):
+                break
+        else:  # no kept word occupies the same spot
+            same_text.append(w)
+            result.append(w)
+    removed = len(words) - len(result)
+    if removed:
+        logger.info("dedup: removed %d duplicate words", removed)
+    return result
+
+
 @router.post("/sessions/{session_id}/paddle-kombi")
 async def paddle_kombi(session_id: str):
     """Run PaddleOCR + Tesseract on the preprocessed image and merge results.
@@ -3429,6 +3468,7 @@ async def paddle_kombi(session_id: str):
         raise HTTPException(status_code=400, detail="Both OCR engines returned no words")
 
     merged_words = _merge_paddle_tesseract(paddle_words_split, tess_words)
+    merged_words = _deduplicate_words(merged_words)
 
     cells, columns_meta = build_grid_from_words(merged_words, img_w, img_h)
     duration = time.time() - t0
@@ -3564,6 +3604,7 @@ async def rapid_kombi(session_id: str):
         raise HTTPException(status_code=400, detail="Both OCR engines returned no words")
 
     merged_words = _merge_paddle_tesseract(rapid_words_split, tess_words)
+    merged_words = _deduplicate_words(merged_words)
 
     cells, columns_meta = build_grid_from_words(merged_words, img_w, img_h)
     duration = time.time() - t0