From befc44d2ddbf0fc1f1fc378fbbfc8ef9a854db00 Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBookPro.fritz.box>
Date: Mon, 2 Mar 2026 09:01:08 +0100
Subject: [PATCH] perf(ocr-pipeline): limit cell-OCR fallback to EN/DE columns
 only

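Run the Tesseract cell-OCR fallback only for column_en and column_de
cells; every other column type is now skipped.  This mainly affects
column_example cells, which are often legitimately empty.  This reduces
~48 Tesseract calls to ~10, cutting Step 5 fallback time from ~13s to ~3s.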

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 klausur-service/backend/cv_vocab_pipeline.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py
index 3c4e304..50e0a4a 100644
--- a/klausur-service/backend/cv_vocab_pipeline.py
+++ b/klausur-service/backend/cv_vocab_pipeline.py
@@ -3121,7 +3121,12 @@ def _ocr_single_cell(
     # --- FALLBACK: Cell-OCR for empty cells ---
     # Full-page Tesseract can miss small or isolated words (e.g. "Ei").
     # Re-run OCR on the cell crop to catch what word-lookup missed.
-    if not text.strip() and cell_w > 0 and cell_h > 0:
+    # Only run fallback for EN/DE columns (where vocab words are expected).
+    # Example columns are often legitimately empty and running Tesseract on
+    # all of them wastes ~10s.  column_example cells stay empty if word-lookup
+    # found nothing.
+    _fallback_col_types = {'column_en', 'column_de'}
+    if not text.strip() and cell_w > 0 and cell_h > 0 and col.type in _fallback_col_types:
         cell_region = PageRegion(
             type=col.type,
             x=cell_x, y=cell_y,