feat(ocr-pipeline): use word-lookup instead of cell-OCR for cell grid
Replace per-cell Tesseract re-runs with lookup of pre-existing full-page words from row.words. Words are filtered by X-overlap with column bounds. This fixes phantom rows with garbage text, missing last words, and incomplete example text by using the more reliable full-page OCR results. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -3009,6 +3009,48 @@ def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str
|
|||||||
return _PHONETIC_BRACKET_RE.sub(replacer, text)
|
return _PHONETIC_BRACKET_RE.sub(replacer, text)
|
||||||
|
|
||||||
|
|
||||||
|
def _lookup_cell_words(
    row: RowGeometry,
    col: PageRegion,
    pad: int = 8,
) -> Tuple[List[Dict], float]:
    """Collect already-recognized Tesseract words belonging to one cell.

    Rather than cropping the cell and running OCR again, filter the
    full-page word list kept on ``row.words`` by horizontal overlap with
    the column. Word boxes are relative to the content ROI while column
    boxes are absolute; since ``row.x`` is the content-ROI left edge, the
    column is shifted into relative space via ``col.x - row.x``.

    Args:
        row: Row geometry carrying the full-page word list in ``row.words``.
        col: Column region in absolute page coordinates.
        pad: Horizontal slack, in pixels, added on both sides of the column.

    Returns:
        Tuple of (words_in_cell, avg_confidence). The selected word dicts
        keep their original relative coordinates, so they remain
        compatible with _words_to_reading_order_text.
    """
    if not row.words:
        return [], 0.0

    # Column bounds translated into content-ROI-relative X, widened by pad.
    lo = col.x - row.x - pad
    hi = col.x - row.x + col.width + pad

    # A word belongs to the cell when its horizontal midpoint falls
    # inside the (padded) column span.
    selected = [
        w for w in row.words
        if lo <= w['left'] + w['width'] / 2 <= hi
    ]

    if not selected:
        return selected, 0.0

    mean_conf = round(sum(w['conf'] for w in selected) / len(selected), 1)
    return selected, mean_conf
|
||||||
|
|
||||||
|
|
||||||
def _ocr_single_cell(
|
def _ocr_single_cell(
|
||||||
row_idx: int,
|
row_idx: int,
|
||||||
col_idx: int,
|
col_idx: int,
|
||||||
@@ -3023,7 +3065,7 @@ def _ocr_single_cell(
|
|||||||
lang: str,
|
lang: str,
|
||||||
lang_map: Dict[str, str],
|
lang_map: Dict[str, str],
|
||||||
) -> Dict[str, Any]:
|
) -> Dict[str, Any]:
|
||||||
"""OCR a single cell (column × row intersection) and return its dict."""
|
"""Populate a single cell (column x row intersection) via word lookup."""
|
||||||
pad = 8 # pixels
|
pad = 8 # pixels
|
||||||
cell_x = max(0, col.x - pad)
|
cell_x = max(0, col.x - pad)
|
||||||
cell_y = max(0, row.y - pad)
|
cell_y = max(0, row.y - pad)
|
||||||
@@ -3051,33 +3093,22 @@ def _ocr_single_cell(
|
|||||||
'w': round(col.width / img_w * 100, 2),
|
'w': round(col.width / img_w * 100, 2),
|
||||||
'h': round(row.height / img_h * 100, 2),
|
'h': round(row.height / img_h * 100, 2),
|
||||||
},
|
},
|
||||||
'ocr_engine': engine_name,
|
'ocr_engine': 'word_lookup',
|
||||||
}
|
}
|
||||||
|
|
||||||
cell_region = PageRegion(
|
# --- PRIMARY: Word-lookup from full-page Tesseract ---
|
||||||
type=col.type,
|
# Use pre-existing words from row.words (Step 4) instead of
|
||||||
x=cell_x, y=cell_y,
|
# re-running OCR on a small crop. This is more reliable because
|
||||||
width=cell_w, height=cell_h,
|
# full-page Tesseract has better context for recognition.
|
||||||
)
|
words, avg_conf = _lookup_cell_words(row, col, pad=pad)
|
||||||
|
|
||||||
# OCR the cell
|
|
||||||
if use_rapid:
|
|
||||||
words = ocr_region_rapid(img_bgr, cell_region)
|
|
||||||
else:
|
|
||||||
cell_lang = lang_map.get(col.type, lang)
|
|
||||||
words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=6)
|
|
||||||
|
|
||||||
# Group into lines, then join in reading order
|
|
||||||
if words:
|
if words:
|
||||||
avg_h = sum(w['height'] for w in words) / len(words)
|
avg_h = sum(w['height'] for w in words) / len(words)
|
||||||
y_tol = max(10, int(avg_h * 0.5))
|
y_tol = max(10, int(avg_h * 0.5))
|
||||||
|
text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
|
||||||
else:
|
else:
|
||||||
y_tol = 15
|
text = ''
|
||||||
text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
|
avg_conf = 0.0
|
||||||
|
|
||||||
avg_conf = 0.0
|
|
||||||
if words:
|
|
||||||
avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
'cell_id': f"R{row_idx:02d}_C{col_idx}",
|
'cell_id': f"R{row_idx:02d}_C{col_idx}",
|
||||||
@@ -3093,7 +3124,7 @@ def _ocr_single_cell(
|
|||||||
'w': round(cell_w / img_w * 100, 2),
|
'w': round(cell_w / img_w * 100, 2),
|
||||||
'h': round(cell_h / img_h * 100, 2),
|
'h': round(cell_h / img_h * 100, 2),
|
||||||
},
|
},
|
||||||
'ocr_engine': engine_name,
|
'ocr_engine': 'word_lookup',
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user