feat: hybrid OCR — full-page for broad columns, cell-crop for narrow
Fundamentally rearchitect build_cell_grid_v2 to combine the best of both approaches:
- Broad columns (≥15% of image width): Use full-page Tesseract word assignment. Handles IPA brackets, punctuation, sentence flow, and ellipsis correctly. No garbled phonetics.
- Narrow columns (<15% of image width): Use isolated cell-crop OCR to prevent neighbour bleeding from adjacent broad columns.
This eliminates the need for complex phonetic bracket replacement on broad columns, since full-page Tesseract reads them correctly.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -4965,6 +4965,14 @@ def _ocr_cell_crop(
|
||||
return result
|
||||
|
||||
|
||||
# Threshold: columns narrower than this (% of image width) use single-cell
|
||||
# crop OCR instead of full-page word assignment. Broad columns (EN, DE,
|
||||
# Example) get full-page Tesseract which handles IPA brackets, punctuation,
|
||||
# and sentence flow much better. Narrow columns (page_ref, marker) use
|
||||
# isolated cell crops to prevent neighbour bleeding.
|
||||
_NARROW_COL_THRESHOLD_PCT = 15.0
|
||||
|
||||
|
||||
def build_cell_grid_v2(
|
||||
ocr_img: np.ndarray,
|
||||
column_regions: List[PageRegion],
|
||||
@@ -4975,30 +4983,24 @@ def build_cell_grid_v2(
|
||||
ocr_engine: str = "auto",
|
||||
img_bgr: Optional[np.ndarray] = None,
|
||||
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
|
||||
"""Cell-First Grid: crop each cell in isolation, then OCR.
|
||||
"""Hybrid Grid: full-page OCR for broad columns, cell-crop for narrow ones.
|
||||
|
||||
Drop-in replacement for build_cell_grid() — same signature & return type.
|
||||
No full-page word assignment; each cell is OCR'd from its own crop.
|
||||
|
||||
Strategy:
|
||||
- Broad columns (>=15% image width): Use pre-assigned full-page Tesseract
|
||||
words (from row.words). Handles IPA brackets, punctuation, sentence
|
||||
continuity correctly.
|
||||
- Narrow columns (<15% image width): Use isolated cell-crop OCR to prevent
|
||||
neighbour bleeding from adjacent broad columns.
|
||||
"""
|
||||
# Resolve engine — default to Tesseract for cell-first OCR.
|
||||
# Tesseract excels at isolated text crops (binarized, upscaled).
|
||||
# RapidOCR is optimized for full-page scene-text and produces artifacts
|
||||
# on small cell crops (extra chars, missing punctuation, garbled IPA).
|
||||
use_rapid = False
|
||||
engine_name = "tesseract"
|
||||
if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
|
||||
engine_name = ocr_engine
|
||||
elif ocr_engine == "auto":
|
||||
engine_name = "tesseract"
|
||||
elif ocr_engine == "rapid":
|
||||
if not RAPIDOCR_AVAILABLE:
|
||||
logger.warning("RapidOCR requested but not available, falling back to Tesseract")
|
||||
else:
|
||||
use_rapid = True
|
||||
engine_name = "rapid" if use_rapid else "tesseract"
|
||||
else:
|
||||
engine_name = "tesseract"
|
||||
elif ocr_engine == "rapid" and RAPIDOCR_AVAILABLE:
|
||||
engine_name = "rapid"
|
||||
|
||||
logger.info(f"build_cell_grid_v2: using OCR engine '{engine_name}'")
|
||||
logger.info(f"build_cell_grid_v2: using OCR engine '{engine_name}' (hybrid mode)")
|
||||
|
||||
# Filter to content rows only
|
||||
content_rows = [r for r in row_geometries if r.row_type == 'content']
|
||||
@@ -5033,9 +5035,7 @@ def build_cell_grid_v2(
|
||||
logger.warning("build_cell_grid_v2: no usable columns found")
|
||||
return [], []
|
||||
|
||||
# Heal row gaps — use header/footer boundaries (NOT column bounds!)
|
||||
# In Cell-First OCR, the crop IS the OCR input, so extending into
|
||||
# header/footer means OCR'ing header/footer text ("VOCABULARY", page nums).
|
||||
# Heal row gaps — use header/footer boundaries
|
||||
content_rows.sort(key=lambda r: r.y)
|
||||
header_rows = [r for r in row_geometries if r.row_type == 'header']
|
||||
footer_rows = [r for r in row_geometries if r.row_type == 'footer']
|
||||
@@ -5063,38 +5063,91 @@ def build_cell_grid_v2(
|
||||
'column_example': 'eng+deu',
|
||||
}
|
||||
|
||||
# --- Parallel OCR with ThreadPoolExecutor ---
|
||||
# Tesseract is single-threaded per call, so we benefit from parallelism.
|
||||
# ~40 rows × 4 cols = 160 cells, ~50% empty (density skip) → ~80 OCR calls.
|
||||
# --- Classify columns as broad vs narrow ---
|
||||
narrow_col_indices = set()
|
||||
for ci, col in enumerate(relevant_cols):
|
||||
col_pct = (col.width / img_w * 100) if img_w > 0 else 0
|
||||
if col_pct < _NARROW_COL_THRESHOLD_PCT:
|
||||
narrow_col_indices.add(ci)
|
||||
|
||||
broad_col_count = len(relevant_cols) - len(narrow_col_indices)
|
||||
logger.info(f"build_cell_grid_v2: {broad_col_count} broad columns (full-page), "
|
||||
f"{len(narrow_col_indices)} narrow columns (cell-crop)")
|
||||
|
||||
# --- Phase 1: Broad columns via full-page word assignment ---
|
||||
cells: List[Dict[str, Any]] = []
|
||||
cell_tasks = []
|
||||
|
||||
for row_idx, row in enumerate(content_rows):
|
||||
# Assign full-page words to columns for this row
|
||||
col_words = _assign_row_words_to_columns(row, relevant_cols)
|
||||
|
||||
for col_idx, col in enumerate(relevant_cols):
|
||||
cell_tasks.append((row_idx, col_idx, row, col))
|
||||
if col_idx not in narrow_col_indices:
|
||||
# BROAD column: use pre-assigned full-page words
|
||||
words = col_words.get(col_idx, [])
|
||||
# Filter low-confidence words
|
||||
words = [w for w in words if w.get('conf', 0) >= 30]
|
||||
|
||||
max_workers = 4 if engine_name == "tesseract" else 2
|
||||
if words:
|
||||
y_tol = max(15, row.height)
|
||||
text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
|
||||
avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
|
||||
else:
|
||||
text = ''
|
||||
avg_conf = 0.0
|
||||
|
||||
with ThreadPoolExecutor(max_workers=max_workers) as pool:
|
||||
futures = {
|
||||
pool.submit(
|
||||
_ocr_cell_crop,
|
||||
ri, ci, row, col,
|
||||
ocr_img, img_bgr, img_w, img_h,
|
||||
engine_name, lang, lang_map,
|
||||
): (ri, ci)
|
||||
for ri, ci, row, col in cell_tasks
|
||||
}
|
||||
# Apply noise filter
|
||||
text = _clean_cell_text(text)
|
||||
|
||||
for future in as_completed(futures):
|
||||
try:
|
||||
cell = future.result()
|
||||
cell = {
|
||||
'cell_id': f"R{row_idx:02d}_C{col_idx}",
|
||||
'row_index': row_idx,
|
||||
'col_index': col_idx,
|
||||
'col_type': col.type,
|
||||
'text': text,
|
||||
'confidence': avg_conf,
|
||||
'bbox_px': {
|
||||
'x': col.x, 'y': row.y,
|
||||
'w': col.width, 'h': row.height,
|
||||
},
|
||||
'bbox_pct': {
|
||||
'x': round(col.x / img_w * 100, 2) if img_w else 0,
|
||||
'y': round(row.y / img_h * 100, 2) if img_h else 0,
|
||||
'w': round(col.width / img_w * 100, 2) if img_w else 0,
|
||||
'h': round(row.height / img_h * 100, 2) if img_h else 0,
|
||||
},
|
||||
'ocr_engine': 'word_lookup',
|
||||
}
|
||||
cells.append(cell)
|
||||
except Exception as e:
|
||||
ri, ci = futures[future]
|
||||
logger.error(f"build_cell_grid_v2: cell R{ri:02d}_C{ci} failed: {e}")
|
||||
|
||||
# Sort cells by (row_index, col_index) since futures complete out of order
|
||||
# --- Phase 2: Narrow columns via cell-crop OCR (parallel) ---
|
||||
narrow_tasks = []
|
||||
for row_idx, row in enumerate(content_rows):
|
||||
for col_idx, col in enumerate(relevant_cols):
|
||||
if col_idx in narrow_col_indices:
|
||||
narrow_tasks.append((row_idx, col_idx, row, col))
|
||||
|
||||
if narrow_tasks:
|
||||
max_workers = 4 if engine_name == "tesseract" else 2
|
||||
with ThreadPoolExecutor(max_workers=max_workers) as pool:
|
||||
futures = {
|
||||
pool.submit(
|
||||
_ocr_cell_crop,
|
||||
ri, ci, row, col,
|
||||
ocr_img, img_bgr, img_w, img_h,
|
||||
engine_name, lang, lang_map,
|
||||
): (ri, ci)
|
||||
for ri, ci, row, col in narrow_tasks
|
||||
}
|
||||
for future in as_completed(futures):
|
||||
try:
|
||||
cell = future.result()
|
||||
cells.append(cell)
|
||||
except Exception as e:
|
||||
ri, ci = futures[future]
|
||||
logger.error(f"build_cell_grid_v2: narrow cell R{ri:02d}_C{ci} failed: {e}")
|
||||
|
||||
# Sort cells by (row_index, col_index)
|
||||
cells.sort(key=lambda c: (c['row_index'], c['col_index']))
|
||||
|
||||
# Remove all-empty rows
|
||||
@@ -5110,7 +5163,7 @@ def build_cell_grid_v2(
|
||||
|
||||
logger.info(f"build_cell_grid_v2: {len(cells)} cells from "
|
||||
f"{len(content_rows)} rows × {len(relevant_cols)} columns, "
|
||||
f"engine={engine_name}")
|
||||
f"engine={engine_name} (hybrid)")
|
||||
|
||||
return cells, columns_meta
|
||||
|
||||
|
||||
Reference in New Issue
Block a user