feat(ocr-pipeline): hybrid word-lookup with cell-OCR fallback

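Word-lookup from full-page Tesseract is fast but can miss small or
isolated words (e.g. "Ei"). The pipeline now falls back to per-cell OCR
(via ocr_region_rapid when use_rapid is set, otherwise Tesseract) for
cells that remain empty after word-lookup. The ocr_engine field reports
'cell_ocr_fallback' for cells whose text was recovered by the fallback;
all other cells, including those that stay empty, keep 'word_lookup'.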

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-02 08:21:12 +01:00
parent 50ad06f43a
commit 8f2c2e8f68

View File

@@ -3103,8 +3103,9 @@ def _ocr_single_cell(
'ocr_engine': 'word_lookup',
}
# Use pre-assigned words (exclusive per column) if provided
# --- PRIMARY: Word-lookup from full-page Tesseract ---
words = preassigned_words if preassigned_words is not None else []
used_engine = 'word_lookup'
if words:
# Use row height as Y-tolerance so all words within a single row
@@ -3112,12 +3113,36 @@ def _ocr_single_cell(
# across two lines due to slight vertical offset).
y_tol = max(15, row.height)
text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
else:
text = ''
avg_conf = 0.0
avg_conf = 0.0
if words:
avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
# --- FALLBACK: Cell-OCR for empty cells ---
# Full-page Tesseract can miss small or isolated words (e.g. "Ei").
# Re-run OCR on the cell crop to catch what word-lookup missed.
if not text.strip() and cell_w > 0 and cell_h > 0:
cell_region = PageRegion(
type=col.type,
x=cell_x, y=cell_y,
width=cell_w, height=cell_h,
)
if use_rapid and img_bgr is not None:
fallback_words = ocr_region_rapid(img_bgr, cell_region)
else:
cell_lang = lang_map.get(col.type, lang)
fallback_words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=6)
if fallback_words:
fb_avg_h = sum(w['height'] for w in fallback_words) / len(fallback_words)
fb_y_tol = max(10, int(fb_avg_h * 0.5))
fb_text = _words_to_reading_order_text(fallback_words, y_tolerance_px=fb_y_tol)
if fb_text.strip():
text = fb_text
avg_conf = round(
sum(w['conf'] for w in fallback_words) / len(fallback_words), 1
)
used_engine = 'cell_ocr_fallback'
return {
'cell_id': f"R{row_idx:02d}_C{col_idx}",
@@ -3133,7 +3158,7 @@ def _ocr_single_cell(
'w': round(cell_w / img_w * 100, 2),
'h': round(cell_h / img_h * 100, 2),
},
'ocr_engine': 'word_lookup',
'ocr_engine': used_engine,
}