fix: Kombi-Modus merge now deduplicates same words from both engines

The merge algorithm now uses three criteria instead of just IoU > 0.3 (any one is sufficient):

1. IoU > 0.15 (relaxed threshold)
2. Center distance < max(word height, 20 px) AND same row
3. Text similarity > 0.7 AND same row

This prevents doubled, overlapping words when both PaddleOCR and Tesseract find the same word at similar positions. Unique words from either engine (e.g. bullets from Tesseract) are still added.

Tests expanded: 19 → 37 (added _box_center_dist, _text_similarity, _words_match tests + deduplication regression test).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-13 08:11:31 +01:00
parent 61c8169f9e
commit 4f2fb0e94c
2 changed files with 252 additions and 60 deletions
@@ -2616,25 +2616,95 @@ def _box_iou(a: dict, b: dict) -> float:
    return inter / (area_a + area_b - inter) if (area_a + area_b - inter) > 0 else 0.0


+def _box_center_dist(a: dict, b: dict) -> float:
+    """Return the straight-line distance between the centers of two boxes.
+
+    Each box is a dict providing ``left``, ``top``, ``width`` and ``height``.
+    """
+    # Per-axis offset between the two center points.
+    dx = (a["left"] + a["width"] / 2) - (b["left"] + b["width"] / 2)
+    dy = (a["top"] + a["height"] / 2) - (b["top"] + b["height"] / 2)
+    return (dx ** 2 + dy ** 2) ** 0.5
+
+
+def _text_similarity(a: str, b: str) -> float:
+    """Return a rough similarity score between two OCR texts, in [0, 1].
+
+    Comparison is case-insensitive and ignores surrounding whitespace.
+
+    Returns:
+        1.0 if the normalized texts are equal,
+        0.8 if one is contained in the other (e.g. "!Betonung" vs "Betonung"),
+        0.0 if either text is empty or whitespace-only,
+        otherwise the share of the shorter text's characters that also occur
+        in the longer text, divided by the longer text's length.
+    """
+    if not a or not b:
+        return 0.0
+    a_norm = a.lower().strip()
+    b_norm = b.lower().strip()
+    # Re-check after stripping: "" is a substring of every string, so a
+    # whitespace-only text would otherwise score 0.8 against any word.
+    if not a_norm or not b_norm:
+        return 0.0
+    if a_norm == b_norm:
+        return 1.0
+    # One text contained in the other, typically stripped punctuation.
+    if a_norm in b_norm or b_norm in a_norm:
+        return 0.8
+    # Fallback: count characters of the shorter text present in the longer.
+    shorter, longer = sorted((a_norm, b_norm), key=len)
+    shared = sum(1 for ch in shorter if ch in longer)
+    return shared / len(longer)
+
+
+def _words_match(pw: dict, tw: dict) -> bool:
+    """Decide whether a Paddle box and a Tesseract box are the same word.
+
+    The boxes match when any one of these holds:
+    1. IoU > 0.15 (relaxed from 0.3 because the engines size boxes differently)
+    2. They share a row AND their centers are closer than
+       max(both heights, 20)
+    3. They share a row AND text similarity > 0.7
+
+    "Share a row" means the vertical overlap exceeds half of the smaller
+    box height (that height is floored at 1).
+
+    NOTE(review): criterion 3 has no horizontal distance limit, so equal
+    words anywhere on one row match — confirm this is intended.
+    """
+    if _box_iou(pw, tw) > 0.15:
+        return True
+
+    p_top, p_h = pw["top"], pw["height"]
+    t_top, t_h = tw["top"], tw["height"]
+
+    # Length of the shared vertical extent, clamped at zero.
+    shared = min(p_top + p_h, t_top + t_h) - max(p_top, t_top)
+    overlap = shared if shared > 0 else 0
+    if not overlap > 0.5 * max(min(p_h, t_h), 1):
+        return False
+
+    # On the same row: nearby centers or similar text both count as a match.
+    centers_close = _box_center_dist(pw, tw) < max(p_h, t_h, 20)
+    return centers_close or _text_similarity(pw["text"], tw["text"]) > 0.7
+
+
 def _merge_paddle_tesseract(paddle_words: list, tess_words: list) -> list:
    """Merge word boxes from PaddleOCR and Tesseract.

-    Matching: IoU > 0.3 between bounding boxes.
-    Merging: Weighted average of coordinates by confidence.
+    Strategy:
+    - For each Paddle word, find the best matching Tesseract word
+    - Match criteria: IoU, center proximity, or text similarity (see _words_match)
+    - Matched pairs: keep Paddle text, average coordinates weighted by confidence
+    - Unmatched Paddle words: keep as-is
+    - Unmatched Tesseract words (conf >= 40): add (bullet points, symbols, etc.)
    """
    merged = []
    used_tess: set = set()

    for pw in paddle_words:
-        best_iou, best_ti = 0.0, -1
+        best_score, best_ti = 0.0, -1
        for ti, tw in enumerate(tess_words):
            if ti in used_tess:
                continue
-            iou = _box_iou(pw, tw)
-            if iou > best_iou:
-                best_iou, best_ti = iou, ti
+            if not _words_match(pw, tw):
+                continue
+            # Score: IoU + text_similarity to pick best match
+            score = _box_iou(pw, tw) + _text_similarity(pw["text"], tw["text"])
+            if score > best_score:
+                best_score, best_ti = score, ti

-        if best_iou > 0.3 and best_ti >= 0:
+        if best_ti >= 0:
            tw = tess_words[best_ti]
            used_tess.add(best_ti)
            pc = pw.get("conf", 80)
@@ -2651,6 +2721,7 @@ def _merge_paddle_tesseract(paddle_words: list, tess_words: list) -> list:
                "conf": max(pc, tc),
            })
        else:
+            # No Tesseract match — keep Paddle word as-is
            merged.append(pw)

    # Add unmatched Tesseract words (bullet points, symbols, etc.)
@@ -1,17 +1,26 @@
-"""Tests for the Kombi-Modus merge algorithm (_box_iou, _merge_paddle_tesseract).
+"""Tests for the Kombi-Modus merge algorithm.

-These functions live in ocr_pipeline_api.py and merge PaddleOCR + Tesseract
-word boxes by IoU matching and confidence-weighted coordinate averaging.
+Functions under test (ocr_pipeline_api.py):
+- _box_iou: IoU between two word boxes
+- _box_center_dist: Euclidean distance between box centers
+- _text_similarity: Simple text similarity (0-1)
+- _words_match: Multi-criteria match (IoU + center + text)
+- _merge_paddle_tesseract: Merge PaddleOCR + Tesseract word lists
 """

 import pytest
 import sys
 import os

-# Add backend to path so we can import from ocr_pipeline_api
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

-from ocr_pipeline_api import _box_iou, _merge_paddle_tesseract
+from ocr_pipeline_api import (
+    _box_iou,
+    _box_center_dist,
+    _text_similarity,
+    _words_match,
+    _merge_paddle_tesseract,
+)


 # ---------------------------------------------------------------------------
@@ -37,44 +46,129 @@ def _word(text: str, left: int, top: int, width: int = 60, height: int = 20, con
 class TestBoxIoU:

    def test_identical_boxes(self):
-        """Identical boxes have IoU = 1.0."""
        a = _word("hello", 10, 10, 100, 20)
        assert _box_iou(a, a) == pytest.approx(1.0)

    def test_no_overlap(self):
-        """Non-overlapping boxes have IoU = 0.0."""
        a = _word("a", 0, 0, 50, 20)
        b = _word("b", 200, 200, 50, 20)
        assert _box_iou(a, b) == 0.0

    def test_partial_overlap(self):
-        """Partially overlapping boxes have 0 < IoU < 1."""
        a = _word("a", 0, 0, 100, 20)
        b = _word("b", 50, 0, 100, 20)
-        # Intersection: x=[50,100], y=[0,20] → 50×20 = 1000
-        # Union: 100×20 + 100×20 - 1000 = 3000
        assert _box_iou(a, b) == pytest.approx(1000 / 3000, abs=0.01)

    def test_contained_box(self):
-        """Small box inside large box."""
        big = _word("big", 0, 0, 200, 40)
        small = _word("small", 50, 10, 30, 10)
-        # Intersection = 30×10 = 300, Union = 200×40 + 30×10 - 300 = 8000
        assert _box_iou(big, small) == pytest.approx(300 / 8000, abs=0.01)

    def test_touching_edges(self):
-        """Boxes that share an edge but don't overlap have IoU = 0."""
        a = _word("a", 0, 0, 50, 20)
        b = _word("b", 50, 0, 50, 20)
        assert _box_iou(a, b) == 0.0

    def test_zero_area_box(self):
-        """Zero-area box returns IoU = 0."""
        a = _word("a", 10, 10, 0, 0)
        b = _word("b", 10, 10, 50, 20)
        assert _box_iou(a, b) == 0.0


+# ---------------------------------------------------------------------------
+# _box_center_dist
+# ---------------------------------------------------------------------------
+
+class TestBoxCenterDist:
+    """Checks for the center-to-center distance helper."""
+
+    def test_same_center(self):
+        """A box is at distance zero from itself."""
+        box = _word("a", 100, 50, 60, 20)
+        assert _box_center_dist(box, box) == 0.0
+
+    def test_horizontal_offset(self):
+        """Equal-sized boxes shifted 10 px in x are 10 px apart."""
+        first = _word("a", 100, 50, 60, 20)
+        second = _word("b", 110, 50, 60, 20)
+        assert _box_center_dist(first, second) == pytest.approx(10.0)
+
+    def test_diagonal(self):
+        """A 20 px shift in both x and y yields the diagonal length."""
+        first = _word("a", 0, 0, 20, 20)  # center (10, 10)
+        second = _word("b", 20, 20, 20, 20)  # center (30, 30)
+        diagonal = (20**2 + 20**2) ** 0.5
+        assert _box_center_dist(first, second) == pytest.approx(diagonal, abs=0.1)
+
+
+# ---------------------------------------------------------------------------
+# _text_similarity
+# ---------------------------------------------------------------------------
+
+class TestTextSimilarity:
+    """Checks for the 0-1 score returned by _text_similarity."""
+
+    def test_identical(self):
+        """Equal strings score 1.0."""
+        assert _text_similarity("hello", "hello") == 1.0
+
+    def test_case_insensitive(self):
+        """Letter case does not affect the score."""
+        assert _text_similarity("Hello", "hello") == 1.0
+
+    def test_substring(self):
+        """Containment scores 0.8 (e.g. '!Betonung' vs 'Betonung')."""
+        assert _text_similarity("!Betonung", "Betonung") == 0.8
+
+    def test_completely_different(self):
+        """Strings with no shared characters score 0.0."""
+        assert _text_similarity("abc", "xyz") == 0.0
+
+    def test_empty_strings(self):
+        """An empty string on either side scores 0.0."""
+        assert _text_similarity("", "hello") == 0.0
+        assert _text_similarity("", "") == 0.0
+
+    def test_partial_overlap(self):
+        """Some shared characters give a score strictly between 0 and 1."""
+        score = _text_similarity("apple", "ape")
+        assert 0.0 < score < 1.0
+
+
+# ---------------------------------------------------------------------------
+# _words_match
+# ---------------------------------------------------------------------------
+
+class TestWordsMatch:
+    """Checks each _words_match criterion in isolation.
+
+    The fixtures for criteria 2 and 3 keep IoU below 0.15 so the IoU
+    shortcut cannot make those tests pass on its own.
+    """
+
+    def test_high_iou_matches(self):
+        """IoU > 0.15 is sufficient for a match."""
+        a = _word("hello", 100, 50, 80, 20)
+        b = _word("hello", 105, 50, 80, 20)
+        assert _words_match(a, b) is True
+
+    def test_same_text_same_row_matches(self):
+        """Same text on same row matches even with low IoU and distant centers."""
+        a = _word("Betonung", 100, 50, 80, 20)
+        b = _word("Betonung", 170, 52, 70, 18)  # 10 px x-overlap, same row
+        # Guards: only the text criterion can produce the match.
+        assert _box_iou(a, b) < 0.15
+        assert _box_center_dist(a, b) > 20
+        assert _words_match(a, b) is True
+
+    def test_close_centers_same_row_matches(self):
+        """Nearby centers on same row match despite low IoU and different text."""
+        a = _word("x", 100, 50, 20, 20)
+        b = _word("y", 118, 51, 20, 20)  # centers ~18 px apart, same row
+        # Guards: only the center-proximity criterion can produce the match.
+        assert _box_iou(a, b) < 0.15
+        assert _text_similarity("x", "y") <= 0.7
+        assert _words_match(a, b) is True
+
+    def test_different_rows_no_match(self):
+        """Words on different rows don't match even with same text."""
+        a = _word("hello", 100, 50, 80, 20)
+        b = _word("hello", 100, 200, 80, 20)  # far away vertically
+        assert _words_match(a, b) is False
+
+    def test_far_apart_same_row_different_text(self):
+        """Different text far apart on same row: no match."""
+        a = _word("cat", 10, 50, 40, 20)
+        b = _word("dog", 400, 50, 40, 20)
+        assert _words_match(a, b) is False
+
+    def test_no_overlap_no_proximity_no_text(self):
+        """Completely different words far apart: no match."""
+        a = _word("abc", 0, 0, 50, 20)
+        b = _word("xyz", 500, 500, 50, 20)
+        assert _words_match(a, b) is False
+
+
 # ---------------------------------------------------------------------------
 # _merge_paddle_tesseract
 # ---------------------------------------------------------------------------
@@ -82,20 +176,26 @@ class TestBoxIoU:
 class TestMergePaddleTesseract:

    def test_perfect_match_averages_coords(self):
-        """When paddle and tesseract have the same word at same position,
-        coordinates are averaged by confidence."""
+        """Same word at same position: coordinates averaged by confidence."""
        pw = [_word("hello", 100, 50, 80, 20, conf=90)]
        tw = [_word("hello", 110, 55, 70, 18, conf=60)]
        merged = _merge_paddle_tesseract(pw, tw)
        assert len(merged) == 1
        m = merged[0]
-        assert m["text"] == "hello"  # Paddle text preferred
-        # Weighted avg: (100*90 + 110*60) / 150 = 15600/150 = 104
-        assert m["left"] == 104
-        assert m["conf"] == 90  # max(90, 60)
+        assert m["text"] == "hello"
+        assert m["left"] == 104  # (100*90 + 110*60) / 150
+        assert m["conf"] == 90

-    def test_no_match_keeps_both(self):
-        """Non-overlapping words: both kept."""
+    def test_same_word_slightly_offset_merges(self):
+        """Same word with slight offset still merges (center proximity)."""
+        pw = [_word("Betonung", 100, 50, 90, 22, conf=85)]
+        tw = [_word("Betonung", 115, 52, 80, 20, conf=60)]
+        merged = _merge_paddle_tesseract(pw, tw)
+        assert len(merged) == 1
+        assert merged[0]["text"] == "Betonung"
+
+    def test_truly_different_words_kept_separate(self):
+        """Non-overlapping different words: both kept."""
        pw = [_word("hello", 10, 10)]
        tw = [_word("bullet", 500, 500, conf=50)]
        merged = _merge_paddle_tesseract(pw, tw)
@@ -109,33 +209,24 @@ class TestMergePaddleTesseract:
        tw = [_word("noise", 500, 500, conf=20)]
        merged = _merge_paddle_tesseract(pw, tw)
        assert len(merged) == 1
-        assert merged[0]["text"] == "hello"

    def test_empty_paddle(self):
-        """Only Tesseract words with sufficient confidence are kept."""
        pw = []
-        tw = [
-            _word("bullet", 10, 10, conf=80),
-            _word("noise", 200, 200, conf=10),
-        ]
+        tw = [_word("bullet", 10, 10, conf=80), _word("noise", 200, 200, conf=10)]
        merged = _merge_paddle_tesseract(pw, tw)
        assert len(merged) == 1
        assert merged[0]["text"] == "bullet"

    def test_empty_tesseract(self):
-        """All Paddle words kept when Tesseract is empty."""
        pw = [_word("a", 10, 10), _word("b", 200, 10)]
-        tw = []
-        merged = _merge_paddle_tesseract(pw, tw)
+        merged = _merge_paddle_tesseract(pw, [])
        assert len(merged) == 2

    def test_both_empty(self):
-        """Empty inputs return empty list."""
        assert _merge_paddle_tesseract([], []) == []

    def test_one_to_one_matching(self):
        """Each Tesseract word matches at most one Paddle word."""
-        # Two paddle words at different X positions, one tesseract word overlaps first
        pw = [
            _word("cat", 10, 10, 60, 20, conf=80),
            _word("dog", 200, 10, 60, 20, conf=80),
@@ -144,18 +235,15 @@ class TestMergePaddleTesseract:
        merged = _merge_paddle_tesseract(pw, tw)
        assert len(merged) == 2  # cat (merged) + dog (unmatched paddle)

-    def test_iou_threshold(self):
-        """Match requires IoU > 0.3, not just any overlap."""
+    def test_far_apart_different_text_not_merged(self):
+        """Different words far apart stay separate."""
        pw = [_word("hello", 0, 0, 100, 20, conf=80)]
-        # Tiny overlap — IoU well below 0.3
-        tw = [_word("world", 95, 0, 100, 20, conf=70)]
-        # Intersection: x=[95,100]=5px width, y=[0,20]=20px → 100
-        # Union: 2000 + 2000 - 100 = 3900 → IoU ≈ 0.026
+        tw = [_word("world", 500, 300, 100, 20, conf=70)]
        merged = _merge_paddle_tesseract(pw, tw)
-        assert len(merged) == 2  # No match, both kept separately
+        assert len(merged) == 2

    def test_paddle_text_preferred(self):
-        """Merged word uses Paddle's text, not Tesseract's."""
+        """Merged word uses Paddle's text."""
        pw = [_word("Betonung", 100, 50, 80, 20, conf=85)]
        tw = [_word("Betonung!", 100, 50, 80, 20, conf=60)]
        merged = _merge_paddle_tesseract(pw, tw)
@@ -164,35 +252,53 @@ class TestMergePaddleTesseract:

    def test_confidence_weighted_positions(self):
        """Equal confidence → simple average of coordinates."""
-        # Boxes must overlap enough for IoU > 0.3
        pw = [_word("x", 100, 200, 60, 20, conf=50)]
        tw = [_word("x", 110, 200, 60, 20, conf=50)]
        merged = _merge_paddle_tesseract(pw, tw)
        assert len(merged) == 1
        m = merged[0]
-        assert m["left"] == 105   # (100+110)/2
-        assert m["top"] == 200    # (200+200)/2
-        assert m["width"] == 60   # (60+60)/2
-        assert m["height"] == 20  # (20+20)/2
+        assert m["left"] == 105
+        assert m["top"] == 200

    def test_zero_confidence_no_division_error(self):
        """Words with conf=0 don't cause division by zero."""
        pw = [_word("a", 100, 50, 80, 20, conf=0)]
        tw = [_word("a", 100, 50, 80, 20, conf=0)]
        merged = _merge_paddle_tesseract(pw, tw)
-        assert len(merged) == 1  # Should not raise
+        assert len(merged) == 1
+
+    def test_duplicate_words_same_position_deduplicated(self):
+        """The core bug fix: same word at same position from both engines
+        should appear only once, not doubled."""
+        # Simulate typical case: both engines find same words
+        pw = [
+            _word("apple", 50, 10, 70, 20, conf=90),
+            _word("Apfel", 300, 10, 60, 20, conf=85),
+            _word("dog", 50, 50, 50, 20, conf=88),
+            _word("Hund", 300, 50, 60, 20, conf=82),
+        ]
+        tw = [
+            _word("apple", 52, 11, 68, 19, conf=75),
+            _word("Apfel", 298, 12, 62, 18, conf=70),
+            _word("dog", 48, 49, 52, 21, conf=72),
+            _word("Hund", 302, 51, 58, 19, conf=68),
+        ]
+        merged = _merge_paddle_tesseract(pw, tw)
+        # Each word should appear exactly once
+        assert len(merged) == 4
+        texts = [m["text"] for m in merged]
+        assert sorted(texts) == ["Apfel", "Hund", "apple", "dog"]


 class TestMergePaddleTesseractBulletPoints:
-    """Test the key use case: Tesseract catches bullet points / symbols
-    that PaddleOCR misses or merges with adjacent text."""
+    """Tesseract catches bullet points / symbols that PaddleOCR misses."""

    def test_bullet_added_from_tesseract(self):
-        """A bullet character recognized by Tesseract but not Paddle is added."""
+        """Bullet character from Tesseract is added."""
        pw = [_word("Betonung", 60, 10, 80, 20)]
        tw = [
-            _word("•", 10, 10, 15, 15, conf=65),   # bullet
-            _word("Betonung", 60, 10, 80, 20, conf=50),  # overlaps paddle
+            _word("•", 10, 10, 15, 15, conf=65),
+            _word("Betonung", 60, 10, 80, 20, conf=50),
        ]
        merged = _merge_paddle_tesseract(pw, tw)
        texts = [m["text"] for m in merged]
@@ -201,7 +307,7 @@ class TestMergePaddleTesseractBulletPoints:
        assert len(merged) == 2

    def test_exclamation_added_from_tesseract(self):
-        """An exclamation mark recognized separately by Tesseract is added."""
+        """Exclamation mark from Tesseract is added."""
        pw = [_word("important", 60, 10, 100, 20)]
        tw = [
            _word("!", 40, 10, 12, 20, conf=70),
@@ -211,3 +317,18 @@ class TestMergePaddleTesseractBulletPoints:
        texts = [m["text"] for m in merged]
        assert "!" in texts
        assert len(merged) == 2
+
+    def test_multiple_unique_tesseract_symbols(self):
+        """Multiple symbols only found by Tesseract are all added."""
+        pw = [_word("word", 100, 10, 60, 20)]
+        tw = [
+            _word("!", 20, 10, 10, 20, conf=70),
+            _word("•", 40, 10, 10, 15, conf=65),
+            _word("word", 100, 10, 60, 20, conf=50),
+        ]
+        merged = _merge_paddle_tesseract(pw, tw)
+        texts = [m["text"] for m in merged]
+        assert "!" in texts
+        assert "•" in texts
+        assert "word" in texts
+        assert len(merged) == 3