perf(ocr-pipeline): limit cell-OCR fallback to EN/DE columns only
Skip the Tesseract fallback for column_example cells, which are often legitimately empty. This reduces ~48 Tesseract calls to ~10, cutting Step 5 fallback time from ~13s to ~3s.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -3121,7 +3121,12 @@ def _ocr_single_cell(
|
|||||||
# --- FALLBACK: Cell-OCR for empty cells ---
|
# --- FALLBACK: Cell-OCR for empty cells ---
|
||||||
# Full-page Tesseract can miss small or isolated words (e.g. "Ei").
|
# Full-page Tesseract can miss small or isolated words (e.g. "Ei").
|
||||||
# Re-run OCR on the cell crop to catch what word-lookup missed.
|
# Re-run OCR on the cell crop to catch what word-lookup missed.
|
||||||
if not text.strip() and cell_w > 0 and cell_h > 0:
|
# Only run fallback for EN/DE columns (where vocab words are expected).
|
||||||
|
# Example columns are often legitimately empty and running Tesseract on
|
||||||
|
# all of them wastes ~10s. column_example cells stay empty if word-lookup
|
||||||
|
# found nothing.
|
||||||
|
_fallback_col_types = {'column_en', 'column_de'}
|
||||||
|
if not text.strip() and cell_w > 0 and cell_h > 0 and col.type in _fallback_col_types:
|
||||||
cell_region = PageRegion(
|
cell_region = PageRegion(
|
||||||
type=col.type,
|
type=col.type,
|
||||||
x=cell_x, y=cell_y,
|
x=cell_x, y=cell_y,
|
||||||
|
|||||||
Reference in New Issue
Block a user