feat: add Kombi-Vergleich mode for side-by-side Paddle vs RapidOCR comparison
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 33s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m55s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 21s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 33s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m55s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 21s
Add a /rapid-kombi backend endpoint that merges local RapidOCR and Tesseract results, a KombiCompareStep component that runs both Kombi modes in parallel and shows their overlays side by side, and a wordResultOverride prop on OverlayReconstruction for direct data injection.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -2976,6 +2976,141 @@ async def paddle_kombi(session_id: str):
|
||||
return {"session_id": session_id, **word_result}
|
||||
|
||||
|
||||
def _rapid_kombi_tesseract_words(data, min_conf=20):
    """Convert a ``pytesseract.image_to_data`` DICT result into word dicts.

    Args:
        data: Mapping with parallel lists under the keys ``text``, ``conf``,
            ``left``, ``top``, ``width`` and ``height``.
        min_conf: Entries whose confidence is below this value are dropped.

    Returns:
        A list of dicts with the keys ``text``, ``left``, ``top``, ``width``,
        ``height`` and ``conf`` (an int), one per kept entry.
    """
    words = []
    columns = zip(
        data["text"], data["conf"],
        data["left"], data["top"], data["width"], data["height"],
    )
    for raw_text, raw_conf, left, top, width, height in columns:
        text = str(raw_text).strip()
        # Confidence may arrive as an int, an integer string or a fractional
        # string such as "95.31". Go through float() so fractional values are
        # truncated instead of being treated as unparseable and dropped.
        try:
            conf = int(float(str(raw_conf)))
        except (ValueError, OverflowError):
            conf = -1
        if not text or conf < min_conf:
            continue
        words.append({
            "text": text,
            "left": left,
            "top": top,
            "width": width,
            "height": height,
            "conf": conf,
        })
    return words


@router.post("/sessions/{session_id}/rapid-kombi")
async def rapid_kombi(session_id: str):
    """Run RapidOCR + Tesseract on the preprocessed image and merge results.

    Same merge logic as paddle-kombi, but uses local RapidOCR (ONNX Runtime)
    instead of remote PaddleOCR service.

    The session image is looked up as "cropped", then "dewarped", then
    "original"; the first one found is used. Both engines run on the full
    page, RapidOCR boxes are split into single words, the two word lists are
    merged, and a cell grid is built from the merged words. The result is
    stored on the session and appended to the pipeline log.

    Args:
        session_id: Identifier of the session whose image is processed.

    Returns:
        A dict containing ``session_id`` plus every key of the stored word
        result (cells, grid shape, columns, raw engine words, summary, ...).

    Raises:
        HTTPException: 404 if the session has no image, 400 if the image
            cannot be decoded or if both engines return no words.
    """
    # Prefer the most processed image that exists for this session.
    img_png = None
    for image_kind in ("cropped", "dewarped", "original"):
        img_png = await get_session_image(session_id, image_kind)
        if img_png:
            break
    if not img_png:
        raise HTTPException(status_code=404, detail="No image found for this session")

    img_arr = np.frombuffer(img_png, dtype=np.uint8)
    img_bgr = cv2.imdecode(img_arr, cv2.IMREAD_COLOR)
    if img_bgr is None:
        raise HTTPException(status_code=400, detail="Failed to decode image")

    img_h, img_w = img_bgr.shape[:2]

    from cv_ocr_engines import ocr_region_rapid
    from cv_vocab_types import PageRegion

    t0 = time.time()

    # NOTE(review): both OCR calls below are plain synchronous calls made
    # directly inside this coroutine. If the event loop has to stay responsive
    # while they run, consider offloading them to a worker thread - confirm.

    # --- RapidOCR (local, synchronous) ---
    full_region = PageRegion(
        type="full_page", x=0, y=0, width=img_w, height=img_h,
    )
    # Normalise a falsy result (e.g. None) to an empty list.
    rapid_words = ocr_region_rapid(img_bgr, full_region) or []

    # --- Tesseract ---
    from PIL import Image
    import pytesseract

    pil_img = Image.fromarray(cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB))
    data = pytesseract.image_to_data(
        pil_img, lang="eng+deu",
        config="--psm 6 --oem 3",
        output_type=pytesseract.Output.DICT,
    )
    tess_words = _rapid_kombi_tesseract_words(data, min_conf=20)

    # --- Split multi-word RapidOCR boxes into individual words ---
    rapid_words_split = _split_paddle_multi_words(rapid_words)
    logger.info(
        "rapid_kombi: split %d rapid boxes → %d individual words",
        len(rapid_words), len(rapid_words_split),
    )

    # --- Merge ---
    if not rapid_words_split and not tess_words:
        raise HTTPException(status_code=400, detail="Both OCR engines returned no words")

    merged_words = _merge_paddle_tesseract(rapid_words_split, tess_words)

    cells, columns_meta = build_grid_from_words(merged_words, img_w, img_h)
    duration = time.time() - t0

    for cell in cells:
        cell["ocr_engine"] = "rapid_kombi"

    # An empty cell list yields an empty set, i.e. zero rows.
    n_rows = len({c["row_index"] for c in cells})
    n_cols = len(columns_meta)
    col_types = {c.get("type") for c in columns_meta}
    # The layout counts as "vocab" when an EN or DE column type is present.
    is_vocab = bool(col_types & {"column_en", "column_de"})

    word_result = {
        "cells": cells,
        "grid_shape": {"rows": n_rows, "cols": n_cols, "total_cells": len(cells)},
        "columns_used": columns_meta,
        "layout": "vocab" if is_vocab else "generic",
        "image_width": img_w,
        "image_height": img_h,
        "duration_seconds": round(duration, 2),
        "ocr_engine": "rapid_kombi",
        "grid_method": "rapid_kombi",
        "raw_rapid_words": rapid_words,
        "raw_rapid_words_split": rapid_words_split,
        "raw_tesseract_words": tess_words,
        "summary": {
            "total_cells": len(cells),
            "non_empty_cells": sum(1 for c in cells if c.get("text")),
            "low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50),
            "rapid_words": len(rapid_words),
            "rapid_words_split": len(rapid_words_split),
            "tesseract_words": len(tess_words),
            "merged_words": len(merged_words),
        },
    }

    # NOTE(review): img_png may come from the "dewarped"/"original" fallback,
    # yet it is stored as cropped_png here - confirm this is intended.
    await update_session_db(
        session_id,
        word_result=word_result,
        cropped_png=img_png,
        current_step=8,
    )

    logger.info(
        "rapid_kombi session %s: %d cells (%d rows, %d cols) in %.2fs "
        "[rapid=%d, tess=%d, merged=%d]",
        session_id, len(cells), n_rows, n_cols, duration,
        len(rapid_words), len(tess_words), len(merged_words),
    )

    await _append_pipeline_log(session_id, "rapid_kombi", {
        "total_cells": len(cells),
        "non_empty_cells": word_result["summary"]["non_empty_cells"],
        "rapid_words": len(rapid_words),
        "tesseract_words": len(tess_words),
        "merged_words": len(merged_words),
        "ocr_engine": "rapid_kombi",
    }, duration_ms=int(duration * 1000))

    return {"session_id": session_id, **word_result}
|
||||
|
||||
|
||||
class WordGroundTruthRequest(BaseModel):
|
||||
is_correct: bool
|
||||
corrected_entries: Optional[List[Dict[str, Any]]] = None
|
||||
|
||||
@@ -449,6 +449,67 @@ class TestSpatialOverlapDedup:
|
||||
assert len(merged) == 2
|
||||
|
||||
|
||||
class TestRapidOcrMergeCompatibility:
    """Check that the split/merge helpers accept RapidOCR-shaped word dicts.

    The words built in these tests carry a 'region_type' key on top of the
    text, geometry and confidence keys, and are passed through
    _split_paddle_multi_words and _merge_paddle_tesseract.
    """

    def _rapid_word(self, text, left, top, width=60, height=20, conf=80, region_type="full_page"):
        """Build a word dict shaped like RapidOCR output (includes region_type)."""
        word = dict(text=text, left=left, top=top, width=width, height=height, conf=conf)
        word["region_type"] = region_type
        return word

    def test_rapid_words_merge_with_tesseract(self):
        """Two overlapping RapidOCR/Tesseract word pairs merge into two words."""
        rapid_words = [
            self._rapid_word("apple", 50, 10, 70, 20, conf=90),
            self._rapid_word("Apfel", 300, 10, 60, 20, conf=85),
        ]
        tess_words = [
            _word("apple", 52, 11, 68, 19, conf=75),
            _word("Apfel", 298, 12, 62, 18, conf=70),
        ]

        result = _merge_paddle_tesseract(rapid_words, tess_words)

        assert len(result) == 2
        merged_texts = [entry["text"] for entry in result]
        merged_texts.sort()
        assert merged_texts == ["Apfel", "apple"]

    def test_rapid_words_split_then_merge(self):
        """A three-word RapidOCR box is split, then merged without duplicates."""
        multi_word_box = self._rapid_word("More than 200", 944, 287, 160, 29, conf=96)
        tess_words = [
            _word("More", 948, 292, 60, 20, conf=90),
            _word("than", 1017, 291, 49, 21, conf=96),
            _word("200", 1076, 292, 43, 20, conf=93),
        ]

        split_words = _split_paddle_multi_words([multi_word_box])
        assert len(split_words) == 3

        result = _merge_paddle_tesseract(split_words, tess_words)
        merged_texts = [entry["text"] for entry in result]
        # Each word must appear exactly once after the merge.
        for expected in ("More", "than", "200"):
            assert merged_texts.count(expected) == 1

    def test_region_type_preserved_in_unmatched(self):
        """An unmatched RapidOCR word survives a merge with no Tesseract words.

        NOTE(review): despite the test name, only the word count and text are
        asserted here; the region_type value itself is not checked.
        """
        rapid_words = [self._rapid_word("unique", 500, 10, 80, 20, conf=90)]

        # Merge against an empty Tesseract result.
        result = _merge_paddle_tesseract(rapid_words, [])

        assert len(result) == 1
        only_word = result[0]
        assert only_word["text"] == "unique"
|
||||
|
||||
|
||||
class TestSplitThenMerge:
|
||||
"""Test the full pipeline: split multi-word Paddle boxes, then merge."""
|
||||
|
||||
|
||||
Reference in New Issue
Block a user