feat(ocr-pipeline): hybrid word-lookup with cell-OCR fallback
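Word-lookup from full-page Tesseract is fast but can miss small or isolated words (e.g. "Ei"). The pipeline now falls back to per-cell OCR for cells that remain empty after word-lookup: the rapid OCR path when `use_rapid` is set and a BGR image is available, otherwise Tesseract on the cell crop with psm=6. The ocr_engine field reports 'cell_ocr_fallback' for cells where the fallback actually produced text; all other cells keep 'word_lookup'.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>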
This commit is contained in:
@@ -3103,8 +3103,9 @@ def _ocr_single_cell(
|
||||
'ocr_engine': 'word_lookup',
|
||||
}
|
||||
|
||||
# Use pre-assigned words (exclusive per column) if provided
|
||||
# --- PRIMARY: Word-lookup from full-page Tesseract ---
|
||||
words = preassigned_words if preassigned_words is not None else []
|
||||
used_engine = 'word_lookup'
|
||||
|
||||
if words:
|
||||
# Use row height as Y-tolerance so all words within a single row
|
||||
@@ -3112,12 +3113,36 @@ def _ocr_single_cell(
|
||||
# across two lines due to slight vertical offset).
|
||||
y_tol = max(15, row.height)
|
||||
text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
|
||||
avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
|
||||
else:
|
||||
text = ''
|
||||
avg_conf = 0.0
|
||||
|
||||
avg_conf = 0.0
|
||||
if words:
|
||||
avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
|
||||
# --- FALLBACK: Cell-OCR for empty cells ---
|
||||
# Full-page Tesseract can miss small or isolated words (e.g. "Ei").
|
||||
# Re-run OCR on the cell crop to catch what word-lookup missed.
|
||||
if not text.strip() and cell_w > 0 and cell_h > 0:
|
||||
cell_region = PageRegion(
|
||||
type=col.type,
|
||||
x=cell_x, y=cell_y,
|
||||
width=cell_w, height=cell_h,
|
||||
)
|
||||
if use_rapid and img_bgr is not None:
|
||||
fallback_words = ocr_region_rapid(img_bgr, cell_region)
|
||||
else:
|
||||
cell_lang = lang_map.get(col.type, lang)
|
||||
fallback_words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=6)
|
||||
|
||||
if fallback_words:
|
||||
fb_avg_h = sum(w['height'] for w in fallback_words) / len(fallback_words)
|
||||
fb_y_tol = max(10, int(fb_avg_h * 0.5))
|
||||
fb_text = _words_to_reading_order_text(fallback_words, y_tolerance_px=fb_y_tol)
|
||||
if fb_text.strip():
|
||||
text = fb_text
|
||||
avg_conf = round(
|
||||
sum(w['conf'] for w in fallback_words) / len(fallback_words), 1
|
||||
)
|
||||
used_engine = 'cell_ocr_fallback'
|
||||
|
||||
return {
|
||||
'cell_id': f"R{row_idx:02d}_C{col_idx}",
|
||||
@@ -3133,7 +3158,7 @@ def _ocr_single_cell(
|
||||
'w': round(cell_w / img_w * 100, 2),
|
||||
'h': round(cell_h / img_h * 100, 2),
|
||||
},
|
||||
'ocr_engine': 'word_lookup',
|
||||
'ocr_engine': used_engine,
|
||||
}
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user