From f94a3836f827aefefeb856960a8a3b9fe8ce2c90 Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBookPro.fritz.box>
Date: Wed, 4 Mar 2026 22:30:34 +0100
Subject: [PATCH] fix: use Tesseract as default engine for cell-first OCR
 instead of RapidOCR

RapidOCR (PaddleOCR) is optimized for full-page scene text and produces
artifacts on small isolated cell crops: extra characters ("Tanz z",
"er r wollte"), missing punctuation, garbled phonetic transcriptions.

Tesseract works much better on isolated, binarized, upscaled crops,
which is exactly what cell-first OCR provides. RapidOCR remains available
as an explicit engine choice via the dropdown.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 klausur-service/backend/cv_vocab_pipeline.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py
index e1e8644..5ef69ec 100644
--- a/klausur-service/backend/cv_vocab_pipeline.py
+++ b/klausur-service/backend/cv_vocab_pipeline.py
@@ -4886,13 +4886,15 @@ def build_cell_grid_v2(
     Drop-in replacement for build_cell_grid() — same signature & return type.
     No full-page word assignment; each cell is OCR'd from its own crop.
     """
-    # Resolve engine
+    # Resolve engine — "auto" resolves to Tesseract for cell-first OCR.
+    # Tesseract excels at isolated text crops (binarized, upscaled).
+    # RapidOCR is optimized for full-page scene text and produces artifacts
+    # on small cell crops (extra chars, missing punctuation, garbled IPA).
     use_rapid = False
     if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
         engine_name = ocr_engine
     elif ocr_engine == "auto":
-        use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None
-        engine_name = "rapid" if use_rapid else "tesseract"
+        engine_name = "tesseract"
     elif ocr_engine == "rapid":
         if not RAPIDOCR_AVAILABLE:
             logger.warning("RapidOCR requested but not available, falling back to Tesseract")
@@ -5034,13 +5036,15 @@ def build_cell_grid_v2_streaming(
     Yields:
         (cell_dict, columns_meta, total_cells)
     """
-    # Resolve engine
+    # Resolve engine — "auto" resolves to Tesseract for cell-first OCR.
+    # Tesseract excels at isolated text crops (binarized, upscaled).
+    # RapidOCR is optimized for full-page scene text and produces artifacts
+    # on small cell crops (extra chars, missing punctuation, garbled IPA).
     use_rapid = False
     if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
         engine_name = ocr_engine
     elif ocr_engine == "auto":
-        use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None
-        engine_name = "rapid" if use_rapid else "tesseract"
+        engine_name = "tesseract"
     elif ocr_engine == "rapid":
         if not RAPIDOCR_AVAILABLE:
             logger.warning("RapidOCR requested but not available, falling back to Tesseract")