fix: use Tesseract as default engine for cell-first OCR instead of RapidOCR
RapidOCR (PaddleOCR) is optimized for full-page scene text and produces
artifacts on small isolated cell crops: extra characters ("Tanz z",
"er r wollte"), missing punctuation, garbled phonetic transcriptions.
Tesseract works much better on isolated binarized crops with upscaling,
which is exactly what cell-first OCR provides. RapidOCR remains available
as an explicit engine choice via the dropdown.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -4886,13 +4886,15 @@ def build_cell_grid_v2(
|
|||||||
Drop-in replacement for build_cell_grid() — same signature & return type.
|
Drop-in replacement for build_cell_grid() — same signature & return type.
|
||||||
No full-page word assignment; each cell is OCR'd from its own crop.
|
No full-page word assignment; each cell is OCR'd from its own crop.
|
||||||
"""
|
"""
|
||||||
# Resolve engine
|
# Resolve engine — default to Tesseract for cell-first OCR.
|
||||||
|
# Tesseract excels at isolated text crops (binarized, upscaled).
|
||||||
|
# RapidOCR is optimized for full-page scene-text and produces artifacts
|
||||||
|
# on small cell crops (extra chars, missing punctuation, garbled IPA).
|
||||||
use_rapid = False
|
use_rapid = False
|
||||||
if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
|
if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
|
||||||
engine_name = ocr_engine
|
engine_name = ocr_engine
|
||||||
elif ocr_engine == "auto":
|
elif ocr_engine == "auto":
|
||||||
use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None
|
engine_name = "tesseract"
|
||||||
engine_name = "rapid" if use_rapid else "tesseract"
|
|
||||||
elif ocr_engine == "rapid":
|
elif ocr_engine == "rapid":
|
||||||
if not RAPIDOCR_AVAILABLE:
|
if not RAPIDOCR_AVAILABLE:
|
||||||
logger.warning("RapidOCR requested but not available, falling back to Tesseract")
|
logger.warning("RapidOCR requested but not available, falling back to Tesseract")
|
||||||
@@ -5034,13 +5036,15 @@ def build_cell_grid_v2_streaming(
|
|||||||
Yields:
|
Yields:
|
||||||
(cell_dict, columns_meta, total_cells)
|
(cell_dict, columns_meta, total_cells)
|
||||||
"""
|
"""
|
||||||
# Resolve engine
|
# Resolve engine — default to Tesseract for cell-first OCR.
|
||||||
|
# Tesseract excels at isolated text crops (binarized, upscaled).
|
||||||
|
# RapidOCR is optimized for full-page scene-text and produces artifacts
|
||||||
|
# on small cell crops (extra chars, missing punctuation, garbled IPA).
|
||||||
use_rapid = False
|
use_rapid = False
|
||||||
if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
|
if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
|
||||||
engine_name = ocr_engine
|
engine_name = ocr_engine
|
||||||
elif ocr_engine == "auto":
|
elif ocr_engine == "auto":
|
||||||
use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None
|
engine_name = "tesseract"
|
||||||
engine_name = "rapid" if use_rapid else "tesseract"
|
|
||||||
elif ocr_engine == "rapid":
|
elif ocr_engine == "rapid":
|
||||||
if not RAPIDOCR_AVAILABLE:
|
if not RAPIDOCR_AVAILABLE:
|
||||||
logger.warning("RapidOCR requested but not available, falling back to Tesseract")
|
logger.warning("RapidOCR requested but not available, falling back to Tesseract")
|
||||||
|
|||||||
Reference in New Issue
Block a user