From befc44d2ddbf0fc1f1fc378fbbfc8ef9a854db00 Mon Sep 17 00:00:00 2001
From: Benjamin Admin
Date: Mon, 2 Mar 2026 09:01:08 +0100
Subject: [PATCH] perf(ocr-pipeline): limit cell-OCR fallback to EN/DE columns only

Skip Tesseract fallback for column_example cells which are often
legitimately empty. This reduces ~48 Tesseract calls to ~10, cutting
Step 5 fallback time from ~13s to ~3s.

Co-Authored-By: Claude Opus 4.6
---
 klausur-service/backend/cv_vocab_pipeline.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py
index 3c4e304..50e0a4a 100644
--- a/klausur-service/backend/cv_vocab_pipeline.py
+++ b/klausur-service/backend/cv_vocab_pipeline.py
@@ -3121,7 +3121,12 @@ def _ocr_single_cell(
     # --- FALLBACK: Cell-OCR for empty cells ---
     # Full-page Tesseract can miss small or isolated words (e.g. "Ei").
     # Re-run OCR on the cell crop to catch what word-lookup missed.
-    if not text.strip() and cell_w > 0 and cell_h > 0:
+    # Only run fallback for EN/DE columns (where vocab words are expected).
+    # Example columns are often legitimately empty and running Tesseract on
+    # all of them wastes ~10s. column_example cells stay empty if word-lookup
+    # found nothing.
+    _fallback_col_types = {'column_en', 'column_de'}
+    if not text.strip() and cell_w > 0 and cell_h > 0 and col.type in _fallback_col_types:
         cell_region = PageRegion(
             type=col.type,
             x=cell_x, y=cell_y,