feat: add Kombi-Vergleich mode for side-by-side Paddle vs RapidOCR comparison
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 33s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m55s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 21s

Add /rapid-kombi backend endpoint using local RapidOCR + Tesseract merge,
KombiCompareStep component for parallel execution and side-by-side overlay,
and wordResultOverride prop on OverlayReconstruction for direct data injection.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-14 07:59:06 +01:00
parent c2c082d4b4
commit a994ddee83
6 changed files with 504 additions and 35 deletions

View File

@@ -449,6 +449,67 @@ class TestSpatialOverlapDedup:
assert len(merged) == 2
class TestRapidOcrMergeCompatibility:
    """Test that _merge_paddle_tesseract works with RapidOCR word format.

    RapidOCR words include an extra 'region_type' key that PaddleOCR words
    don't have. The merge logic must tolerate this extra field.
    """

    def _rapid_word(self, text, left, top, width=60, height=20, conf=80, region_type="full_page"):
        """Create a word dict in RapidOCR format (has region_type).

        The keys mirror the word dicts used elsewhere in these tests, plus
        the RapidOCR-only 'region_type' entry.
        """
        return {
            "text": text,
            "left": left,
            "top": top,
            "width": width,
            "height": height,
            "conf": conf,
            "region_type": region_type,
        }

    def test_rapid_words_merge_with_tesseract(self):
        """RapidOCR words (with region_type) merge correctly with Tesseract words."""
        rapid = [
            self._rapid_word("apple", 50, 10, 70, 20, conf=90),
            self._rapid_word("Apfel", 300, 10, 60, 20, conf=85),
        ]
        # Tesseract boxes are offset by only a few pixels from the RapidOCR
        # ones, so each pair is expected to collapse into one merged word.
        tess = [
            _word("apple", 52, 11, 68, 19, conf=75),
            _word("Apfel", 298, 12, 62, 18, conf=70),
        ]
        merged = _merge_paddle_tesseract(rapid, tess)
        assert len(merged) == 2
        # Sort so the assertion does not depend on the merge output order.
        texts = sorted(w["text"] for w in merged)
        assert texts == ["Apfel", "apple"]

    def test_rapid_words_split_then_merge(self):
        """Split + merge works with RapidOCR multi-word boxes."""
        # A single RapidOCR box that spans three words.
        rapid_raw = [
            self._rapid_word("More than 200", 944, 287, 160, 29, conf=96),
        ]
        tess = [
            _word("More", 948, 292, 60, 20, conf=90),
            _word("than", 1017, 291, 49, 21, conf=96),
            _word("200", 1076, 292, 43, 20, conf=93),
        ]
        rapid_split = _split_paddle_multi_words(rapid_raw)
        assert len(rapid_split) == 3
        merged = _merge_paddle_tesseract(rapid_split, tess)
        texts = [w["text"] for w in merged]
        # Each word must appear exactly once, i.e. no duplicates from the
        # two OCR sources survive the merge.
        assert texts.count("More") == 1
        assert texts.count("than") == 1
        assert texts.count("200") == 1

    def test_region_type_preserved_in_unmatched(self):
        """Unmatched RapidOCR words keep their region_type field."""
        rapid = [self._rapid_word("unique", 500, 10, 80, 20, conf=90)]
        tess = []  # No Tesseract words
        merged = _merge_paddle_tesseract(rapid, tess)
        assert len(merged) == 1
        assert merged[0]["text"] == "unique"
        # Check the property this test is named for: the RapidOCR-only key
        # (default "full_page" from _rapid_word) must survive the merge.
        assert merged[0].get("region_type") == "full_page"
class TestSplitThenMerge:
"""Test the full pipeline: split multi-word Paddle boxes, then merge."""