fix: Kombi-Modus merge now deduplicates same words from both engines
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 2m13s
CI / test-python-agent-core (push) Successful in 19s
CI / test-nodejs-website (push) Successful in 22s

The merge algorithm now uses 3 criteria instead of just IoU > 0.3:
1. IoU > 0.15 (relaxed threshold)
2. Center distance < max(word height, 20px) AND same row
3. Text similarity > 0.7 AND same row

This prevents doubled overlapping words when both PaddleOCR and
Tesseract find the same word at similar positions. Unique words
from either engine (e.g. bullets from Tesseract) are still added.

Tests expanded: 19 → 37 (added _box_center_dist, _text_similarity,
_words_match tests + deduplication regression test).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-13 08:11:31 +01:00
parent 61c8169f9e
commit 4f2fb0e94c
2 changed files with 252 additions and 60 deletions

View File

@@ -2616,25 +2616,95 @@ def _box_iou(a: dict, b: dict) -> float:
return inter / (area_a + area_b - inter) if (area_a + area_b - inter) > 0 else 0.0
def _box_center_dist(a: dict, b: dict) -> float:
"""Euclidean distance between box centers."""
acx = a["left"] + a["width"] / 2
acy = a["top"] + a["height"] / 2
bcx = b["left"] + b["width"] / 2
bcy = b["top"] + b["height"] / 2
return ((acx - bcx) ** 2 + (acy - bcy) ** 2) ** 0.5
def _text_similarity(a: str, b: str) -> float:
"""Simple text similarity (0-1). Handles stripped punctuation."""
if not a or not b:
return 0.0
a_lower = a.lower().strip()
b_lower = b.lower().strip()
if a_lower == b_lower:
return 1.0
# One might be substring of the other (e.g. "!Betonung" vs "Betonung")
if a_lower in b_lower or b_lower in a_lower:
return 0.8
# Check if they share most characters
shorter, longer = (a_lower, b_lower) if len(a_lower) <= len(b_lower) else (b_lower, a_lower)
if len(shorter) == 0:
return 0.0
matches = sum(1 for c in shorter if c in longer)
return matches / max(len(shorter), len(longer))
def _words_match(pw: dict, tw: dict) -> bool:
    """Decide whether a Paddle word box and a Tesseract word box are the same word.

    Any one of the following is sufficient:

    1. IoU of the two boxes is above 0.15.
    2. The boxes are on the same row and their centers are closer than
       max(both heights, 20).
    3. The boxes are on the same row and their text similarity is above 0.7.

    "Same row" means the vertical overlap exceeds half of the smaller box
    height (the smaller height is floored at 1).

    NOTE(review): criterion 3 has no horizontal distance limit, so equal
    words anywhere on one row are considered a match — confirm this is intended.
    """
    if _box_iou(pw, tw) > 0.15:
        return True

    # Vertical extents of both boxes.
    p_top, p_bottom = pw["top"], pw["top"] + pw["height"]
    t_top, t_bottom = tw["top"], tw["top"] + tw["height"]

    overlap = min(p_bottom, t_bottom) - max(p_top, t_top)
    if overlap < 0:
        overlap = 0

    smaller_height = min(pw["height"], tw["height"])
    if smaller_height < 1:
        smaller_height = 1

    # The remaining criteria only apply to boxes sharing a row.
    if overlap <= 0.5 * smaller_height:
        return False

    proximity_limit = max(pw["height"], tw["height"], 20)
    if _box_center_dist(pw, tw) < proximity_limit:
        return True

    return _text_similarity(pw["text"], tw["text"]) > 0.7
def _merge_paddle_tesseract(paddle_words: list, tess_words: list) -> list:
"""Merge word boxes from PaddleOCR and Tesseract.
Matching: IoU > 0.3 between bounding boxes.
Merging: Weighted average of coordinates by confidence.
Strategy:
- For each Paddle word, find the best matching Tesseract word
- Match criteria: IoU, center proximity, or text similarity (see _words_match)
- Matched pairs: keep Paddle text, average coordinates weighted by confidence
- Unmatched Paddle words: keep as-is
- Unmatched Tesseract words (conf >= 40): add (bullet points, symbols, etc.)
"""
merged = []
used_tess: set = set()
for pw in paddle_words:
best_iou, best_ti = 0.0, -1
best_score, best_ti = 0.0, -1
for ti, tw in enumerate(tess_words):
if ti in used_tess:
continue
iou = _box_iou(pw, tw)
if iou > best_iou:
best_iou, best_ti = iou, ti
if not _words_match(pw, tw):
continue
# Score: IoU + text_similarity to pick best match
score = _box_iou(pw, tw) + _text_similarity(pw["text"], tw["text"])
if score > best_score:
best_score, best_ti = score, ti
if best_iou > 0.3 and best_ti >= 0:
if best_ti >= 0:
tw = tess_words[best_ti]
used_tess.add(best_ti)
pc = pw.get("conf", 80)
@@ -2651,6 +2721,7 @@ def _merge_paddle_tesseract(paddle_words: list, tess_words: list) -> list:
"conf": max(pc, tc),
})
else:
# No Tesseract match — keep Paddle word as-is
merged.append(pw)
# Add unmatched Tesseract words (bullet points, symbols, etc.)

View File

@@ -1,17 +1,26 @@
"""Tests for the Kombi-Modus merge algorithm (_box_iou, _merge_paddle_tesseract).
"""Tests for the Kombi-Modus merge algorithm.
These functions live in ocr_pipeline_api.py and merge PaddleOCR + Tesseract
word boxes by IoU matching and confidence-weighted coordinate averaging.
Functions under test (ocr_pipeline_api.py):
- _box_iou: IoU between two word boxes
- _box_center_dist: Euclidean distance between box centers
- _text_similarity: Simple text similarity (0-1)
- _words_match: Multi-criteria match (IoU + center + text)
- _merge_paddle_tesseract: Merge PaddleOCR + Tesseract word lists
"""
import pytest
import sys
import os
# Add backend to path so we can import from ocr_pipeline_api
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from ocr_pipeline_api import _box_iou, _merge_paddle_tesseract
from ocr_pipeline_api import (
_box_iou,
_box_center_dist,
_text_similarity,
_words_match,
_merge_paddle_tesseract,
)
# ---------------------------------------------------------------------------
@@ -37,44 +46,129 @@ def _word(text: str, left: int, top: int, width: int = 60, height: int = 20, con
class TestBoxIoU:
    """Checks for _box_iou on identical, disjoint, overlapping and degenerate boxes."""

    def test_identical_boxes(self):
        """A box compared with itself gives IoU = 1.0."""
        box = _word("hello", 10, 10, 100, 20)
        assert _box_iou(box, box) == pytest.approx(1.0)

    def test_no_overlap(self):
        """Disjoint boxes give IoU = 0.0."""
        first = _word("a", 0, 0, 50, 20)
        second = _word("b", 200, 200, 50, 20)
        assert _box_iou(first, second) == 0.0

    def test_partial_overlap(self):
        """Half-overlapping boxes give an IoU strictly between 0 and 1."""
        first = _word("a", 0, 0, 100, 20)
        second = _word("b", 50, 0, 100, 20)
        # Intersection: x=[50,100], y=[0,20] → 50×20 = 1000
        # Union: 100×20 + 100×20 - 1000 = 3000
        expected = 1000 / 3000
        assert _box_iou(first, second) == pytest.approx(expected, abs=0.01)

    def test_contained_box(self):
        """A small box fully inside a large one."""
        outer = _word("big", 0, 0, 200, 40)
        inner = _word("small", 50, 10, 30, 10)
        # Intersection = 30×10 = 300, Union = 200×40 + 30×10 - 300 = 8000
        expected = 300 / 8000
        assert _box_iou(outer, inner) == pytest.approx(expected, abs=0.01)

    def test_touching_edges(self):
        """Boxes sharing only an edge do not overlap: IoU = 0."""
        left_box = _word("a", 0, 0, 50, 20)
        right_box = _word("b", 50, 0, 50, 20)
        assert _box_iou(left_box, right_box) == 0.0

    def test_zero_area_box(self):
        """A zero-area box gives IoU = 0."""
        empty = _word("a", 10, 10, 0, 0)
        regular = _word("b", 10, 10, 50, 20)
        assert _box_iou(empty, regular) == 0.0
# ---------------------------------------------------------------------------
# _box_center_dist
# ---------------------------------------------------------------------------
class TestBoxCenterDist:
    """Checks for _box_center_dist (distance between box centers)."""

    def test_same_center(self):
        """A box is at distance 0 from itself."""
        box = _word("a", 100, 50, 60, 20)
        assert _box_center_dist(box, box) == 0.0

    def test_horizontal_offset(self):
        """Equal-sized boxes shifted 10px horizontally are 10 apart."""
        first = _word("a", 100, 50, 60, 20)
        second = _word("b", 110, 50, 60, 20)
        assert _box_center_dist(first, second) == pytest.approx(10.0)

    def test_diagonal(self):
        """Centers offset by (20, 20) are sqrt(800) apart."""
        first = _word("a", 0, 0, 20, 20)  # center (10, 10)
        second = _word("b", 20, 20, 20, 20)  # center (30, 30)
        expected = (20**2 + 20**2) ** 0.5
        assert _box_center_dist(first, second) == pytest.approx(expected, abs=0.1)
# ---------------------------------------------------------------------------
# _text_similarity
# ---------------------------------------------------------------------------
class TestTextSimilarity:
    """Checks for _text_similarity (0-1 score between two word texts)."""

    def test_identical(self):
        """Equal strings score 1.0."""
        assert _text_similarity("hello", "hello") == 1.0

    def test_case_insensitive(self):
        """Case differences are ignored."""
        assert _text_similarity("Hello", "hello") == 1.0

    def test_substring(self):
        """Containment scores 0.8 (e.g. '!Betonung' vs 'Betonung')."""
        assert _text_similarity("!Betonung", "Betonung") == 0.8

    def test_completely_different(self):
        """No shared characters scores 0.0."""
        assert _text_similarity("abc", "xyz") == 0.0

    def test_empty_strings(self):
        """An empty operand always scores 0.0."""
        assert _text_similarity("", "hello") == 0.0
        assert _text_similarity("", "") == 0.0

    def test_partial_overlap(self):
        """Some shared characters give a score strictly between 0 and 1."""
        score = _text_similarity("apple", "ape")
        assert 0.0 < score < 1.0
# ---------------------------------------------------------------------------
# _words_match
# ---------------------------------------------------------------------------
class TestWordsMatch:
    """Checks for _words_match (IoU, center proximity, text similarity)."""

    def test_high_iou_matches(self):
        """IoU > 0.15 alone is enough for a match."""
        paddle = _word("hello", 100, 50, 80, 20)
        tess = _word("hello", 105, 50, 80, 20)
        assert _words_match(paddle, tess) is True

    def test_same_text_same_row_matches(self):
        """Identical text on the same row matches."""
        paddle = _word("Betonung", 100, 50, 80, 20)
        tess = _word("Betonung", 130, 52, 70, 18)  # shifted but same row
        assert _words_match(paddle, tess) is True

    def test_close_centers_same_row_matches(self):
        """Nearby centers on the same row match."""
        paddle = _word("x", 100, 50, 40, 20)
        tess = _word("y", 110, 52, 50, 22)  # close, same row
        assert _words_match(paddle, tess) is True

    def test_different_rows_no_match(self):
        """Same text on different rows does not match."""
        paddle = _word("hello", 100, 50, 80, 20)
        tess = _word("hello", 100, 200, 80, 20)  # far away vertically
        assert _words_match(paddle, tess) is False

    def test_far_apart_same_row_different_text(self):
        """Different text far apart on the same row does not match."""
        paddle = _word("cat", 10, 50, 40, 20)
        tess = _word("dog", 400, 50, 40, 20)
        assert _words_match(paddle, tess) is False

    def test_no_overlap_no_proximity_no_text(self):
        """Unrelated words far apart in both axes do not match."""
        paddle = _word("abc", 0, 0, 50, 20)
        tess = _word("xyz", 500, 500, 50, 20)
        assert _words_match(paddle, tess) is False
# ---------------------------------------------------------------------------
# _merge_paddle_tesseract
# ---------------------------------------------------------------------------
@@ -82,20 +176,26 @@ class TestBoxIoU:
class TestMergePaddleTesseract:
def test_perfect_match_averages_coords(self):
"""When paddle and tesseract have the same word at same position,
coordinates are averaged by confidence."""
"""Same word at same position: coordinates averaged by confidence."""
pw = [_word("hello", 100, 50, 80, 20, conf=90)]
tw = [_word("hello", 110, 55, 70, 18, conf=60)]
merged = _merge_paddle_tesseract(pw, tw)
assert len(merged) == 1
m = merged[0]
assert m["text"] == "hello" # Paddle text preferred
# Weighted avg: (100*90 + 110*60) / 150 = 15600/150 = 104
assert m["left"] == 104
assert m["conf"] == 90 # max(90, 60)
assert m["text"] == "hello"
assert m["left"] == 104 # (100*90 + 110*60) / 150
assert m["conf"] == 90
def test_no_match_keeps_both(self):
"""Non-overlapping words: both kept."""
def test_same_word_slightly_offset_merges(self):
"""Same word with slight offset still merges (center proximity)."""
pw = [_word("Betonung", 100, 50, 90, 22, conf=85)]
tw = [_word("Betonung", 115, 52, 80, 20, conf=60)]
merged = _merge_paddle_tesseract(pw, tw)
assert len(merged) == 1
assert merged[0]["text"] == "Betonung"
def test_truly_different_words_kept_separate(self):
"""Non-overlapping different words: both kept."""
pw = [_word("hello", 10, 10)]
tw = [_word("bullet", 500, 500, conf=50)]
merged = _merge_paddle_tesseract(pw, tw)
@@ -109,33 +209,24 @@ class TestMergePaddleTesseract:
tw = [_word("noise", 500, 500, conf=20)]
merged = _merge_paddle_tesseract(pw, tw)
assert len(merged) == 1
assert merged[0]["text"] == "hello"
def test_empty_paddle(self):
"""Only Tesseract words with sufficient confidence are kept."""
pw = []
tw = [
_word("bullet", 10, 10, conf=80),
_word("noise", 200, 200, conf=10),
]
tw = [_word("bullet", 10, 10, conf=80), _word("noise", 200, 200, conf=10)]
merged = _merge_paddle_tesseract(pw, tw)
assert len(merged) == 1
assert merged[0]["text"] == "bullet"
def test_empty_tesseract(self):
"""All Paddle words kept when Tesseract is empty."""
pw = [_word("a", 10, 10), _word("b", 200, 10)]
tw = []
merged = _merge_paddle_tesseract(pw, tw)
merged = _merge_paddle_tesseract(pw, [])
assert len(merged) == 2
def test_both_empty(self):
"""Empty inputs return empty list."""
assert _merge_paddle_tesseract([], []) == []
def test_one_to_one_matching(self):
"""Each Tesseract word matches at most one Paddle word."""
# Two paddle words at different X positions, one tesseract word overlaps first
pw = [
_word("cat", 10, 10, 60, 20, conf=80),
_word("dog", 200, 10, 60, 20, conf=80),
@@ -144,18 +235,15 @@ class TestMergePaddleTesseract:
merged = _merge_paddle_tesseract(pw, tw)
assert len(merged) == 2 # cat (merged) + dog (unmatched paddle)
def test_iou_threshold(self):
"""Match requires IoU > 0.3, not just any overlap."""
def test_far_apart_different_text_not_merged(self):
"""Different words far apart stay separate."""
pw = [_word("hello", 0, 0, 100, 20, conf=80)]
# Tiny overlap — IoU well below 0.3
tw = [_word("world", 95, 0, 100, 20, conf=70)]
# Intersection: x=[95,100]=5px width, y=[0,20]=20px → 100
# Union: 2000 + 2000 - 100 = 3900 → IoU ≈ 0.026
tw = [_word("world", 500, 300, 100, 20, conf=70)]
merged = _merge_paddle_tesseract(pw, tw)
assert len(merged) == 2 # No match, both kept separately
assert len(merged) == 2
def test_paddle_text_preferred(self):
"""Merged word uses Paddle's text, not Tesseract's."""
"""Merged word uses Paddle's text."""
pw = [_word("Betonung", 100, 50, 80, 20, conf=85)]
tw = [_word("Betonung!", 100, 50, 80, 20, conf=60)]
merged = _merge_paddle_tesseract(pw, tw)
@@ -164,35 +252,53 @@ class TestMergePaddleTesseract:
def test_confidence_weighted_positions(self):
"""Equal confidence → simple average of coordinates."""
# Boxes must overlap enough for IoU > 0.3
pw = [_word("x", 100, 200, 60, 20, conf=50)]
tw = [_word("x", 110, 200, 60, 20, conf=50)]
merged = _merge_paddle_tesseract(pw, tw)
assert len(merged) == 1
m = merged[0]
assert m["left"] == 105 # (100+110)/2
assert m["top"] == 200 # (200+200)/2
assert m["width"] == 60 # (60+60)/2
assert m["height"] == 20 # (20+20)/2
assert m["left"] == 105
assert m["top"] == 200
def test_zero_confidence_no_division_error(self):
"""Words with conf=0 don't cause division by zero."""
pw = [_word("a", 100, 50, 80, 20, conf=0)]
tw = [_word("a", 100, 50, 80, 20, conf=0)]
merged = _merge_paddle_tesseract(pw, tw)
assert len(merged) == 1 # Should not raise
assert len(merged) == 1
def test_duplicate_words_same_position_deduplicated(self):
"""The core bug fix: same word at same position from both engines
should appear only once, not doubled."""
# Simulate typical case: both engines find same words
pw = [
_word("apple", 50, 10, 70, 20, conf=90),
_word("Apfel", 300, 10, 60, 20, conf=85),
_word("dog", 50, 50, 50, 20, conf=88),
_word("Hund", 300, 50, 60, 20, conf=82),
]
tw = [
_word("apple", 52, 11, 68, 19, conf=75),
_word("Apfel", 298, 12, 62, 18, conf=70),
_word("dog", 48, 49, 52, 21, conf=72),
_word("Hund", 302, 51, 58, 19, conf=68),
]
merged = _merge_paddle_tesseract(pw, tw)
# Each word should appear exactly once
assert len(merged) == 4
texts = [m["text"] for m in merged]
assert sorted(texts) == ["Apfel", "Hund", "apple", "dog"]
class TestMergePaddleTesseractBulletPoints:
"""Test the key use case: Tesseract catches bullet points / symbols
that PaddleOCR misses or merges with adjacent text."""
"""Tesseract catches bullet points / symbols that PaddleOCR misses."""
def test_bullet_added_from_tesseract(self):
"""A bullet character recognized by Tesseract but not Paddle is added."""
"""Bullet character from Tesseract is added."""
pw = [_word("Betonung", 60, 10, 80, 20)]
tw = [
_word("", 10, 10, 15, 15, conf=65), # bullet
_word("Betonung", 60, 10, 80, 20, conf=50), # overlaps paddle
_word("", 10, 10, 15, 15, conf=65),
_word("Betonung", 60, 10, 80, 20, conf=50),
]
merged = _merge_paddle_tesseract(pw, tw)
texts = [m["text"] for m in merged]
@@ -201,7 +307,7 @@ class TestMergePaddleTesseractBulletPoints:
assert len(merged) == 2
def test_exclamation_added_from_tesseract(self):
"""An exclamation mark recognized separately by Tesseract is added."""
"""Exclamation mark from Tesseract is added."""
pw = [_word("important", 60, 10, 100, 20)]
tw = [
_word("!", 40, 10, 12, 20, conf=70),
@@ -211,3 +317,18 @@ class TestMergePaddleTesseractBulletPoints:
texts = [m["text"] for m in merged]
assert "!" in texts
assert len(merged) == 2
def test_multiple_unique_tesseract_symbols(self):
"""Multiple symbols only found by Tesseract are all added."""
pw = [_word("word", 100, 10, 60, 20)]
tw = [
_word("!", 20, 10, 10, 20, conf=70),
_word("", 40, 10, 10, 15, conf=65),
_word("word", 100, 10, 60, 20, conf=50),
]
merged = _merge_paddle_tesseract(pw, tw)
texts = [m["text"] for m in merged]
assert "!" in texts
assert "" in texts
assert "word" in texts
assert len(merged) == 3