diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index 3c4e304..50e0a4a 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -3121,7 +3121,12 @@ def _ocr_single_cell( # --- FALLBACK: Cell-OCR for empty cells --- # Full-page Tesseract can miss small or isolated words (e.g. "Ei"). # Re-run OCR on the cell crop to catch what word-lookup missed. - if not text.strip() and cell_w > 0 and cell_h > 0: + # Only run fallback for EN/DE columns (where vocab words are expected). + # Example columns are often legitimately empty and running Tesseract on + # all of them wastes ~10s. column_example cells stay empty if word-lookup + # found nothing. + _fallback_col_types = {'column_en', 'column_de'} + if not text.strip() and cell_w > 0 and cell_h > 0 and col.type in _fallback_col_types: cell_region = PageRegion( type=col.type, x=cell_x, y=cell_y,