perf(ocr-pipeline): limit cell-OCR fallback to EN/DE columns only

Skip the Tesseract fallback for every column type other than column_en
and column_de. This mainly affects column_example cells, which are often
legitimately empty. This reduces ~48 Tesseract calls to ~10,
cutting Step 5 fallback time from ~13s to ~3s.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-02 09:01:08 +01:00
parent 6db3c02db4
commit befc44d2dd

View File

@@ -3121,7 +3121,12 @@ def _ocr_single_cell(
# --- FALLBACK: Cell-OCR for empty cells --- # --- FALLBACK: Cell-OCR for empty cells ---
# Full-page Tesseract can miss small or isolated words (e.g. "Ei"). # Full-page Tesseract can miss small or isolated words (e.g. "Ei").
# Re-run OCR on the cell crop to catch what word-lookup missed. # Re-run OCR on the cell crop to catch what word-lookup missed.
if not text.strip() and cell_w > 0 and cell_h > 0: # Only run fallback for EN/DE columns (where vocab words are expected).
# Example columns are often legitimately empty and running Tesseract on
# all of them wastes ~10s. column_example cells stay empty if word-lookup
# found nothing.
_fallback_col_types = {'column_en', 'column_de'}
if not text.strip() and cell_w > 0 and cell_h > 0 and col.type in _fallback_col_types:
cell_region = PageRegion( cell_region = PageRegion(
type=col.type, type=col.type,
x=cell_x, y=cell_y, x=cell_x, y=cell_y,