fix: Kombi-Modus merge now deduplicates words detected by both engines

The merge algorithm now uses three criteria instead of only IoU > 0.3 (any one of them is sufficient):

1. IoU > 0.15 (relaxed threshold)
2. Center proximity < word height AND same row
3. Text similarity > 0.7 AND same row

This prevents doubled, overlapping words when both PaddleOCR and Tesseract find the same word at similar positions. Unique words from either engine (e.g. bullets from Tesseract) are still added.

Tests expanded: 19 → 37 (added _box_center_dist, _text_similarity, _words_match tests + deduplication regression test).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-13 08:11:31 +01:00
parent 61c8169f9e
commit 4f2fb0e94c
2 changed files with 252 additions and 60 deletions
--- a/klausur-service/backend/ocr_pipeline_api.py
+++ b/klausur-service/backend/ocr_pipeline_api.py
@@ -2616,25 +2616,95 @@ def _box_iou(a: dict, b: dict) -> float:
    return inter / (area_a + area_b - inter) if (area_a + area_b - inter) > 0 else 0.0


+def _box_center_dist(a: dict, b: dict) -> float:
+    """Return the straight-line distance between the midpoints of two boxes.
+
+    Both boxes are dicts providing "left", "top", "width" and "height".
+    """
+    # Per-axis offset between the two box midpoints.
+    dx = (a["left"] + a["width"] / 2) - (b["left"] + b["width"] / 2)
+    dy = (a["top"] + a["height"] / 2) - (b["top"] + b["height"] / 2)
+    return (dx ** 2 + dy ** 2) ** 0.5
+
+
+def _text_similarity(a: str, b: str) -> float:
+    """Return a rough, case-insensitive similarity score between 0 and 1.
+
+    Both inputs are lower-cased and stripped of surrounding whitespace, then:
+    - 0.0 if either text is empty or consists only of whitespace
+    - 1.0 if both texts are equal
+    - 0.8 if one contains the other (e.g. "!Betonung" vs "Betonung")
+    - otherwise the number of characters of the shorter text that also occur
+      in the longer one, divided by the length of the longer text
+    """
+    if not a or not b:
+        return 0.0
+    a_norm = a.lower().strip()
+    b_norm = b.lower().strip()
+    # Re-check after stripping: "" is a substring of every string, so a
+    # whitespace-only text would otherwise score 0.8 against any word.
+    if not a_norm or not b_norm:
+        return 0.0
+    if a_norm == b_norm:
+        return 1.0
+    # One might be a substring of the other (punctuation stripped by one engine).
+    if a_norm in b_norm or b_norm in a_norm:
+        return 0.8
+    # Stable sort keeps (a, b) order on equal lengths.
+    shorter, longer = sorted((a_norm, b_norm), key=len)
+    matches = sum(1 for c in shorter if c in longer)
+    return matches / len(longer)
+
+
+def _words_match(pw: dict, tw: dict) -> bool:
+    """Decide whether a Paddle box and a Tesseract box describe the same word.
+
+    The pair matches as soon as one of these holds:
+    1. The boxes overlap with IoU > 0.15.
+    2. They share a row and their centers are closer than the taller box's
+       height (with a floor of 20).
+    3. They share a row and their texts have a similarity above 0.7.
+
+    "Share a row" means the vertical overlap exceeds half of the smaller
+    box height.
+    """
+    if _box_iou(pw, tw) > 0.15:
+        return True
+
+    # Vertical extents of both boxes.
+    p_top, p_bottom = pw["top"], pw["top"] + pw["height"]
+    t_top, t_bottom = tw["top"], tw["top"] + tw["height"]
+    overlap = max(0, min(p_bottom, t_bottom) - max(p_top, t_top))
+    # Floor the reference height at 1 so the threshold never drops below 0.5.
+    smaller_h = max(min(pw["height"], tw["height"]), 1)
+    if not (overlap > 0.5 * smaller_h):
+        return False
+
+    # From here on both boxes are known to be on the same row.
+    center_gap = _box_center_dist(pw, tw)
+    max_gap = max(pw["height"], tw["height"], 20)
+    if center_gap < max_gap:
+        return True
+
+    # Last resort: the recognised texts are close enough.
+    return _text_similarity(pw["text"], tw["text"]) > 0.7
+
+
 def _merge_paddle_tesseract(paddle_words: list, tess_words: list) -> list:
    """Merge word boxes from PaddleOCR and Tesseract.

-    Matching: IoU > 0.3 between bounding boxes.
-    Merging: Weighted average of coordinates by confidence.
+    Strategy:
+    - For each Paddle word, find the best matching Tesseract word
+    - Match criteria: IoU, center proximity, or text similarity (see _words_match)
+    - Matched pairs: keep Paddle text, average coordinates weighted by confidence
+    - Unmatched Paddle words: keep as-is
+    - Unmatched Tesseract words (conf >= 40): add (bullet points, symbols, etc.)
    """
    merged = []
    used_tess: set = set()

    for pw in paddle_words:
-        best_iou, best_ti = 0.0, -1
+        best_score, best_ti = 0.0, -1
        for ti, tw in enumerate(tess_words):
            if ti in used_tess:
                continue
-            iou = _box_iou(pw, tw)
-            if iou > best_iou:
-                best_iou, best_ti = iou, ti
+            if not _words_match(pw, tw):
+                continue
+            # Score: IoU + text_similarity to pick best match
+            score = _box_iou(pw, tw) + _text_similarity(pw["text"], tw["text"])
+            if score > best_score:
+                best_score, best_ti = score, ti

-        if best_iou > 0.3 and best_ti >= 0:
+        if best_ti >= 0:
            tw = tess_words[best_ti]
            used_tess.add(best_ti)
            pc = pw.get("conf", 80)
@@ -2651,6 +2721,7 @@ def _merge_paddle_tesseract(paddle_words: list, tess_words: list) -> list:
                "conf": max(pc, tc),
            })
        else:
+            # No Tesseract match — keep Paddle word as-is
            merged.append(pw)

    # Add unmatched Tesseract words (bullet points, symbols, etc.)