fix: use Tesseract as default engine for cell-first OCR instead of RapidOCR
RapidOCR (PaddleOCR) is optimized for full-page scene text and produces
artifacts on small isolated cell crops: extra characters ("Tanz z",
"er r wollte"), missing punctuation, garbled phonetic transcriptions.
Tesseract works much better on isolated binarized crops with upscaling,
which is exactly what cell-first OCR provides. RapidOCR remains available
as an explicit engine choice via the dropdown.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -4886,13 +4886,15 @@ def build_cell_grid_v2(
|
||||
Drop-in replacement for build_cell_grid() — same signature & return type.
|
||||
No full-page word assignment; each cell is OCR'd from its own crop.
|
||||
"""
|
||||
# Resolve engine
|
||||
# Resolve engine — default to Tesseract for cell-first OCR.
|
||||
# Tesseract excels at isolated text crops (binarized, upscaled).
|
||||
# RapidOCR is optimized for full-page scene-text and produces artifacts
|
||||
# on small cell crops (extra chars, missing punctuation, garbled IPA).
|
||||
use_rapid = False
|
||||
if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
|
||||
engine_name = ocr_engine
|
||||
elif ocr_engine == "auto":
|
||||
use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None
|
||||
engine_name = "rapid" if use_rapid else "tesseract"
|
||||
engine_name = "tesseract"
|
||||
elif ocr_engine == "rapid":
|
||||
if not RAPIDOCR_AVAILABLE:
|
||||
logger.warning("RapidOCR requested but not available, falling back to Tesseract")
|
||||
@@ -5034,13 +5036,15 @@ def build_cell_grid_v2_streaming(
|
||||
Yields:
|
||||
(cell_dict, columns_meta, total_cells)
|
||||
"""
|
||||
# Resolve engine
|
||||
# Resolve engine — default to Tesseract for cell-first OCR.
|
||||
# Tesseract excels at isolated text crops (binarized, upscaled).
|
||||
# RapidOCR is optimized for full-page scene-text and produces artifacts
|
||||
# on small cell crops (extra chars, missing punctuation, garbled IPA).
|
||||
use_rapid = False
|
||||
if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
|
||||
engine_name = ocr_engine
|
||||
elif ocr_engine == "auto":
|
||||
use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None
|
||||
engine_name = "rapid" if use_rapid else "tesseract"
|
||||
engine_name = "tesseract"
|
||||
elif ocr_engine == "rapid":
|
||||
if not RAPIDOCR_AVAILABLE:
|
||||
logger.warning("RapidOCR requested but not available, falling back to Tesseract")
|
||||
|
||||
Reference in New Issue
Block a user