fix: use Tesseract as default engine for cell-first OCR instead of RapidOCR

RapidOCR (PaddleOCR) is optimized for full-page scene text and produces artifacts on small isolated cell crops: extra characters ("Tanz z", "er r wollte"), missing punctuation, garbled phonetic transcriptions.

Tesseract works much better on isolated binarized crops with upscaling, which is exactly what cell-first OCR provides. RapidOCR remains available as an explicit engine choice via the dropdown.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-04 22:30:34 +01:00
parent 34c649c8be
commit f94a3836f8
1 changed file with 10 additions and 6 deletions
@@ -4886,13 +4886,15 @@ def build_cell_grid_v2(
    Drop-in replacement for build_cell_grid() — same signature & return type.
    No full-page word assignment; each cell is OCR'd from its own crop.
    """
-    # Resolve engine
+    # Resolve engine — default to Tesseract for cell-first OCR.
+    # Tesseract excels at isolated text crops (binarized, upscaled).
+    # RapidOCR is optimized for full-page scene-text and produces artifacts
+    # on small cell crops (extra chars, missing punctuation, garbled IPA).
    use_rapid = False
    if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
        engine_name = ocr_engine
    elif ocr_engine == "auto":
-        use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None
+        engine_name = "tesseract"
-        engine_name = "rapid" if use_rapid else "tesseract"
    elif ocr_engine == "rapid":
        if not RAPIDOCR_AVAILABLE:
            logger.warning("RapidOCR requested but not available, falling back to Tesseract")
@@ -5034,13 +5036,15 @@ def build_cell_grid_v2_streaming(
    Yields:
        (cell_dict, columns_meta, total_cells)
    """
-    # Resolve engine
+    # Resolve engine — default to Tesseract for cell-first OCR.
+    # Tesseract excels at isolated text crops (binarized, upscaled).
+    # RapidOCR is optimized for full-page scene-text and produces artifacts
+    # on small cell crops (extra chars, missing punctuation, garbled IPA).
    use_rapid = False
    if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
        engine_name = ocr_engine
    elif ocr_engine == "auto":
-        use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None
+        engine_name = "tesseract"
-        engine_name = "rapid" if use_rapid else "tesseract"
    elif ocr_engine == "rapid":
        if not RAPIDOCR_AVAILABLE:
            logger.warning("RapidOCR requested but not available, falling back to Tesseract")