fix: use Tesseract as default engine for cell-first OCR instead of RapidOCR

RapidOCR (PaddleOCR) is optimized for full-page scene text and produces
artifacts on small isolated cell crops: extra characters ("Tanz z",
"er r wollte"), missing punctuation, garbled phonetic transcriptions.

Tesseract works much better on isolated binarized crops with upscaling,
which is exactly what cell-first OCR provides. RapidOCR remains available
as an explicit engine choice via the dropdown.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-04 22:30:34 +01:00
parent 34c649c8be
commit f94a3836f8

View File

@@ -4886,13 +4886,15 @@ def build_cell_grid_v2(
Drop-in replacement for build_cell_grid() — same signature & return type.
No full-page word assignment; each cell is OCR'd from its own crop.
"""
# Resolve engine
# Resolve engine — default to Tesseract for cell-first OCR.
# Tesseract excels at isolated text crops (binarized, upscaled).
# RapidOCR is optimized for full-page scene-text and produces artifacts
# on small cell crops (extra chars, missing punctuation, garbled IPA).
use_rapid = False
if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
engine_name = ocr_engine
elif ocr_engine == "auto":
use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None
engine_name = "rapid" if use_rapid else "tesseract"
engine_name = "tesseract"
elif ocr_engine == "rapid":
if not RAPIDOCR_AVAILABLE:
logger.warning("RapidOCR requested but not available, falling back to Tesseract")
@@ -5034,13 +5036,15 @@ def build_cell_grid_v2_streaming(
Yields:
(cell_dict, columns_meta, total_cells)
"""
# Resolve engine
# Resolve engine — default to Tesseract for cell-first OCR.
# Tesseract excels at isolated text crops (binarized, upscaled).
# RapidOCR is optimized for full-page scene-text and produces artifacts
# on small cell crops (extra chars, missing punctuation, garbled IPA).
use_rapid = False
if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
engine_name = ocr_engine
elif ocr_engine == "auto":
use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None
engine_name = "rapid" if use_rapid else "tesseract"
engine_name = "tesseract"
elif ocr_engine == "rapid":
if not RAPIDOCR_AVAILABLE:
logger.warning("RapidOCR requested but not available, falling back to Tesseract")