feat: hybrid OCR — full-page for broad columns, cell-crop for narrow

Fundamentally rearchitect build_cell_grid_v2 to combine the best of
both approaches:

- Broad columns (≥15% image width): Use full-page Tesseract word
  assignment. Handles IPA brackets, punctuation, sentence flow,
  and ellipsis correctly. No garbled phonetics.
- Narrow columns (<15% image width): Use isolated cell-crop OCR
  to prevent neighbour bleeding from adjacent broad columns.

This eliminates the need for complex phonetic bracket replacement
on broad columns since full-page Tesseract reads them correctly.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-04 23:38:44 +01:00
parent e6dc3fcdd7
commit 1e0c6bb4b5

View File

@@ -4965,6 +4965,14 @@ def _ocr_cell_crop(
return result
# Threshold: columns narrower than this (% of image width) use single-cell
# crop OCR instead of full-page word assignment. Broad columns (EN, DE,
# Example) get full-page Tesseract which handles IPA brackets, punctuation,
# and sentence flow much better. Narrow columns (page_ref, marker) use
# isolated cell crops to prevent neighbour bleeding.
_NARROW_COL_THRESHOLD_PCT = 15.0
def build_cell_grid_v2(
ocr_img: np.ndarray,
column_regions: List[PageRegion],
@@ -4975,30 +4983,24 @@ def build_cell_grid_v2(
ocr_engine: str = "auto",
img_bgr: Optional[np.ndarray] = None,
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
"""Cell-First Grid: crop each cell in isolation, then OCR.
"""Hybrid Grid: full-page OCR for broad columns, cell-crop for narrow ones.
Drop-in replacement for build_cell_grid() — same signature & return type.
No full-page word assignment; each cell is OCR'd from its own crop.
Strategy:
- Broad columns (≥15% image width): Use pre-assigned full-page Tesseract
words (from row.words). Handles IPA brackets, punctuation, sentence
continuity correctly.
- Narrow columns (<15% image width): Use isolated cell-crop OCR to prevent
neighbour bleeding from adjacent broad columns.
"""
# Resolve engine — default to Tesseract for cell-first OCR.
# Tesseract excels at isolated text crops (binarized, upscaled).
# RapidOCR is optimized for full-page scene-text and produces artifacts
# on small cell crops (extra chars, missing punctuation, garbled IPA).
use_rapid = False
engine_name = "tesseract"
if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
engine_name = ocr_engine
elif ocr_engine == "auto":
engine_name = "tesseract"
elif ocr_engine == "rapid":
if not RAPIDOCR_AVAILABLE:
logger.warning("RapidOCR requested but not available, falling back to Tesseract")
else:
use_rapid = True
engine_name = "rapid" if use_rapid else "tesseract"
else:
engine_name = "tesseract"
elif ocr_engine == "rapid" and RAPIDOCR_AVAILABLE:
engine_name = "rapid"
logger.info(f"build_cell_grid_v2: using OCR engine '{engine_name}'")
logger.info(f"build_cell_grid_v2: using OCR engine '{engine_name}' (hybrid mode)")
# Filter to content rows only
content_rows = [r for r in row_geometries if r.row_type == 'content']
@@ -5033,9 +5035,7 @@ def build_cell_grid_v2(
logger.warning("build_cell_grid_v2: no usable columns found")
return [], []
# Heal row gaps — use header/footer boundaries (NOT column bounds!)
# In Cell-First OCR, the crop IS the OCR input, so extending into
# header/footer means OCR'ing header/footer text ("VOCABULARY", page nums).
# Heal row gaps — use header/footer boundaries
content_rows.sort(key=lambda r: r.y)
header_rows = [r for r in row_geometries if r.row_type == 'header']
footer_rows = [r for r in row_geometries if r.row_type == 'footer']
@@ -5063,38 +5063,91 @@ def build_cell_grid_v2(
'column_example': 'eng+deu',
}
# --- Parallel OCR with ThreadPoolExecutor ---
# Tesseract is single-threaded per call, so we benefit from parallelism.
# ~40 rows × 4 cols = 160 cells, ~50% empty (density skip) → ~80 OCR calls.
# --- Classify columns as broad vs narrow ---
narrow_col_indices = set()
for ci, col in enumerate(relevant_cols):
col_pct = (col.width / img_w * 100) if img_w > 0 else 0
if col_pct < _NARROW_COL_THRESHOLD_PCT:
narrow_col_indices.add(ci)
broad_col_count = len(relevant_cols) - len(narrow_col_indices)
logger.info(f"build_cell_grid_v2: {broad_col_count} broad columns (full-page), "
f"{len(narrow_col_indices)} narrow columns (cell-crop)")
# --- Phase 1: Broad columns via full-page word assignment ---
cells: List[Dict[str, Any]] = []
cell_tasks = []
for row_idx, row in enumerate(content_rows):
# Assign full-page words to columns for this row
col_words = _assign_row_words_to_columns(row, relevant_cols)
for col_idx, col in enumerate(relevant_cols):
cell_tasks.append((row_idx, col_idx, row, col))
if col_idx not in narrow_col_indices:
# BROAD column: use pre-assigned full-page words
words = col_words.get(col_idx, [])
# Filter low-confidence words
words = [w for w in words if w.get('conf', 0) >= 30]
max_workers = 4 if engine_name == "tesseract" else 2
if words:
y_tol = max(15, row.height)
text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
else:
text = ''
avg_conf = 0.0
with ThreadPoolExecutor(max_workers=max_workers) as pool:
futures = {
pool.submit(
_ocr_cell_crop,
ri, ci, row, col,
ocr_img, img_bgr, img_w, img_h,
engine_name, lang, lang_map,
): (ri, ci)
for ri, ci, row, col in cell_tasks
}
# Apply noise filter
text = _clean_cell_text(text)
for future in as_completed(futures):
try:
cell = future.result()
cell = {
'cell_id': f"R{row_idx:02d}_C{col_idx}",
'row_index': row_idx,
'col_index': col_idx,
'col_type': col.type,
'text': text,
'confidence': avg_conf,
'bbox_px': {
'x': col.x, 'y': row.y,
'w': col.width, 'h': row.height,
},
'bbox_pct': {
'x': round(col.x / img_w * 100, 2) if img_w else 0,
'y': round(row.y / img_h * 100, 2) if img_h else 0,
'w': round(col.width / img_w * 100, 2) if img_w else 0,
'h': round(row.height / img_h * 100, 2) if img_h else 0,
},
'ocr_engine': 'word_lookup',
}
cells.append(cell)
except Exception as e:
ri, ci = futures[future]
logger.error(f"build_cell_grid_v2: cell R{ri:02d}_C{ci} failed: {e}")
# Sort cells by (row_index, col_index) since futures complete out of order
# --- Phase 2: Narrow columns via cell-crop OCR (parallel) ---
narrow_tasks = []
for row_idx, row in enumerate(content_rows):
for col_idx, col in enumerate(relevant_cols):
if col_idx in narrow_col_indices:
narrow_tasks.append((row_idx, col_idx, row, col))
if narrow_tasks:
max_workers = 4 if engine_name == "tesseract" else 2
with ThreadPoolExecutor(max_workers=max_workers) as pool:
futures = {
pool.submit(
_ocr_cell_crop,
ri, ci, row, col,
ocr_img, img_bgr, img_w, img_h,
engine_name, lang, lang_map,
): (ri, ci)
for ri, ci, row, col in narrow_tasks
}
for future in as_completed(futures):
try:
cell = future.result()
cells.append(cell)
except Exception as e:
ri, ci = futures[future]
logger.error(f"build_cell_grid_v2: narrow cell R{ri:02d}_C{ci} failed: {e}")
# Sort cells by (row_index, col_index)
cells.sort(key=lambda c: (c['row_index'], c['col_index']))
# Remove all-empty rows
@@ -5110,7 +5163,7 @@ def build_cell_grid_v2(
logger.info(f"build_cell_grid_v2: {len(cells)} cells from "
f"{len(content_rows)} rows × {len(relevant_cols)} columns, "
f"engine={engine_name}")
f"engine={engine_name} (hybrid)")
return cells, columns_meta