From f94a3836f827aefefeb856960a8a3b9fe8ce2c90 Mon Sep 17 00:00:00 2001
From: Benjamin Admin
Date: Wed, 4 Mar 2026 22:30:34 +0100
Subject: [PATCH] fix: use Tesseract as default engine for cell-first OCR instead of RapidOCR

RapidOCR (PaddleOCR) is optimized for full-page scene text and produces
artifacts on small isolated cell crops: extra characters ("Tanz z",
"er r wollte"), missing punctuation, garbled phonetic transcriptions.

Tesseract works much better on isolated binarized crops with upscaling,
which is exactly what cell-first OCR provides. RapidOCR remains available
as explicit engine choice via the dropdown.

Co-Authored-By: Claude Opus 4.6
---
 klausur-service/backend/cv_vocab_pipeline.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py
index e1e8644..5ef69ec 100644
--- a/klausur-service/backend/cv_vocab_pipeline.py
+++ b/klausur-service/backend/cv_vocab_pipeline.py
@@ -4886,13 +4886,15 @@ def build_cell_grid_v2(
     Drop-in replacement for build_cell_grid() — same signature & return type.
     No full-page word assignment; each cell is OCR'd from its own crop.
     """
-    # Resolve engine
+    # Resolve engine — default to Tesseract for cell-first OCR.
+    # Tesseract excels at isolated text crops (binarized, upscaled).
+    # RapidOCR is optimized for full-page scene-text and produces artifacts
+    # on small cell crops (extra chars, missing punctuation, garbled IPA).
     use_rapid = False
     if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
         engine_name = ocr_engine
     elif ocr_engine == "auto":
-        use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None
-        engine_name = "rapid" if use_rapid else "tesseract"
+        engine_name = "tesseract"
     elif ocr_engine == "rapid":
         if not RAPIDOCR_AVAILABLE:
             logger.warning("RapidOCR requested but not available, falling back to Tesseract")
@@ -5034,13 +5036,15 @@ def build_cell_grid_v2_streaming(
     Yields:
         (cell_dict, columns_meta, total_cells)
     """
-    # Resolve engine
+    # Resolve engine — default to Tesseract for cell-first OCR.
+    # Tesseract excels at isolated text crops (binarized, upscaled).
+    # RapidOCR is optimized for full-page scene-text and produces artifacts
+    # on small cell crops (extra chars, missing punctuation, garbled IPA).
     use_rapid = False
     if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
         engine_name = ocr_engine
     elif ocr_engine == "auto":
-        use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None
-        engine_name = "rapid" if use_rapid else "tesseract"
+        engine_name = "tesseract"
     elif ocr_engine == "rapid":
         if not RAPIDOCR_AVAILABLE:
             logger.warning("RapidOCR requested but not available, falling back to Tesseract")