diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index e1e8644..5ef69ec 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -4886,13 +4886,15 @@ def build_cell_grid_v2( Drop-in replacement for build_cell_grid() — same signature & return type. No full-page word assignment; each cell is OCR'd from its own crop. """ - # Resolve engine + # Resolve engine — default to Tesseract for cell-first OCR. + # Tesseract excels at isolated text crops (binarized, upscaled). + # RapidOCR is optimized for full-page scene-text and produces artifacts + # on small cell crops (extra chars, missing punctuation, garbled IPA). use_rapid = False if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"): engine_name = ocr_engine elif ocr_engine == "auto": - use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None - engine_name = "rapid" if use_rapid else "tesseract" + engine_name = "tesseract" elif ocr_engine == "rapid": if not RAPIDOCR_AVAILABLE: logger.warning("RapidOCR requested but not available, falling back to Tesseract") @@ -5034,13 +5036,15 @@ def build_cell_grid_v2_streaming( Yields: (cell_dict, columns_meta, total_cells) """ - # Resolve engine + # Resolve engine — default to Tesseract for cell-first OCR. + # Tesseract excels at isolated text crops (binarized, upscaled). + # RapidOCR is optimized for full-page scene-text and produces artifacts + # on small cell crops (extra chars, missing punctuation, garbled IPA). use_rapid = False if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"): engine_name = ocr_engine elif ocr_engine == "auto": - use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None - engine_name = "rapid" if use_rapid else "tesseract" + engine_name = "tesseract" elif ocr_engine == "rapid": if not RAPIDOCR_AVAILABLE: logger.warning("RapidOCR requested but not available, falling back to Tesseract")