feat: hybrid OCR — full-page for broad columns, cell-crop for narrow

Fundamentally rearchitect build_cell_grid_v2 to combine the best of both approaches:

- Broad columns (≥15% of image width): use full-page Tesseract word assignment. This handles IPA brackets, punctuation, sentence flow, and ellipses correctly, with no garbled phonetics.
- Narrow columns (<15% of image width): use isolated cell-crop OCR to prevent neighbour bleeding from adjacent broad columns.

This eliminates the need for complex phonetic bracket replacement on broad columns, since full-page Tesseract reads them correctly.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-04 23:38:44 +01:00
parent e6dc3fcdd7
commit 1e0c6bb4b5
1 changed file with 98 additions and 45 deletions
@@ -4965,6 +4965,14 @@ def _ocr_cell_crop(
    return result


+# Threshold: columns narrower than this (% of image width) use single-cell
+# crop OCR instead of full-page word assignment.  Broad columns (EN, DE,
+# Example) get full-page Tesseract which handles IPA brackets, punctuation,
+# and sentence flow much better.  Narrow columns (page_ref, marker) use
+# isolated cell crops to prevent neighbour bleeding.
+_NARROW_COL_THRESHOLD_PCT = 15.0
+
+
 def build_cell_grid_v2(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
@@ -4975,30 +4983,24 @@ def build_cell_grid_v2(
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
 ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
-    """Cell-First Grid: crop each cell in isolation, then OCR.
+    """Hybrid Grid: full-page OCR for broad columns, cell-crop for narrow ones.

    Drop-in replacement for build_cell_grid() — same signature & return type.
-    No full-page word assignment; each cell is OCR'd from its own crop.
+
+    Strategy:
+    - Broad columns (>15% image width): Use pre-assigned full-page Tesseract
+      words (from row.words). Handles IPA brackets, punctuation, sentence
+      continuity correctly.
+    - Narrow columns (<15% image width): Use isolated cell-crop OCR to prevent
+      neighbour bleeding from adjacent broad columns.
    """
-    # Resolve engine — default to Tesseract for cell-first OCR.
-    # Tesseract excels at isolated text crops (binarized, upscaled).
-    # RapidOCR is optimized for full-page scene-text and produces artifacts
-    # on small cell crops (extra chars, missing punctuation, garbled IPA).
-    use_rapid = False
+    engine_name = "tesseract"
    if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
        engine_name = ocr_engine
-    elif ocr_engine == "auto":
-        engine_name = "tesseract"
-    elif ocr_engine == "rapid":
-        if not RAPIDOCR_AVAILABLE:
-            logger.warning("RapidOCR requested but not available, falling back to Tesseract")
-        else:
-            use_rapid = True
-        engine_name = "rapid" if use_rapid else "tesseract"
-    else:
-        engine_name = "tesseract"
+    elif ocr_engine == "rapid" and RAPIDOCR_AVAILABLE:
+        engine_name = "rapid"

-    logger.info(f"build_cell_grid_v2: using OCR engine '{engine_name}'")
+    logger.info(f"build_cell_grid_v2: using OCR engine '{engine_name}' (hybrid mode)")

    # Filter to content rows only
    content_rows = [r for r in row_geometries if r.row_type == 'content']
@@ -5033,9 +5035,7 @@ def build_cell_grid_v2(
        logger.warning("build_cell_grid_v2: no usable columns found")
        return [], []

-    # Heal row gaps — use header/footer boundaries (NOT column bounds!)
-    # In Cell-First OCR, the crop IS the OCR input, so extending into
-    # header/footer means OCR'ing header/footer text ("VOCABULARY", page nums).
+    # Heal row gaps — use header/footer boundaries
    content_rows.sort(key=lambda r: r.y)
    header_rows = [r for r in row_geometries if r.row_type == 'header']
    footer_rows = [r for r in row_geometries if r.row_type == 'footer']
@@ -5063,38 +5063,91 @@ def build_cell_grid_v2(
        'column_example': 'eng+deu',
    }

-    # --- Parallel OCR with ThreadPoolExecutor ---
-    # Tesseract is single-threaded per call, so we benefit from parallelism.
-    # ~40 rows × 4 cols = 160 cells, ~50% empty (density skip) → ~80 OCR calls.
+    # --- Classify columns as broad vs narrow ---
+    narrow_col_indices = set()
+    for ci, col in enumerate(relevant_cols):
+        col_pct = (col.width / img_w * 100) if img_w > 0 else 0
+        if col_pct < _NARROW_COL_THRESHOLD_PCT:
+            narrow_col_indices.add(ci)
+
+    broad_col_count = len(relevant_cols) - len(narrow_col_indices)
+    logger.info(f"build_cell_grid_v2: {broad_col_count} broad columns (full-page), "
+                f"{len(narrow_col_indices)} narrow columns (cell-crop)")
+
+    # --- Phase 1: Broad columns via full-page word assignment ---
    cells: List[Dict[str, Any]] = []
-    cell_tasks = []

    for row_idx, row in enumerate(content_rows):
+        # Assign full-page words to columns for this row
+        col_words = _assign_row_words_to_columns(row, relevant_cols)
+
        for col_idx, col in enumerate(relevant_cols):
-            cell_tasks.append((row_idx, col_idx, row, col))
+            if col_idx not in narrow_col_indices:
+                # BROAD column: use pre-assigned full-page words
+                words = col_words.get(col_idx, [])
+                # Filter low-confidence words
+                words = [w for w in words if w.get('conf', 0) >= 30]

-    max_workers = 4 if engine_name == "tesseract" else 2
+                if words:
+                    y_tol = max(15, row.height)
+                    text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
+                    avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
+                else:
+                    text = ''
+                    avg_conf = 0.0

-    with ThreadPoolExecutor(max_workers=max_workers) as pool:
-        futures = {
-            pool.submit(
-                _ocr_cell_crop,
-                ri, ci, row, col,
-                ocr_img, img_bgr, img_w, img_h,
-                engine_name, lang, lang_map,
-            ): (ri, ci)
-            for ri, ci, row, col in cell_tasks
-        }
+                # Apply noise filter
+                text = _clean_cell_text(text)

-        for future in as_completed(futures):
-            try:
-                cell = future.result()
+                cell = {
+                    'cell_id': f"R{row_idx:02d}_C{col_idx}",
+                    'row_index': row_idx,
+                    'col_index': col_idx,
+                    'col_type': col.type,
+                    'text': text,
+                    'confidence': avg_conf,
+                    'bbox_px': {
+                        'x': col.x, 'y': row.y,
+                        'w': col.width, 'h': row.height,
+                    },
+                    'bbox_pct': {
+                        'x': round(col.x / img_w * 100, 2) if img_w else 0,
+                        'y': round(row.y / img_h * 100, 2) if img_h else 0,
+                        'w': round(col.width / img_w * 100, 2) if img_w else 0,
+                        'h': round(row.height / img_h * 100, 2) if img_h else 0,
+                    },
+                    'ocr_engine': 'word_lookup',
+                }
                cells.append(cell)
-            except Exception as e:
-                ri, ci = futures[future]
-                logger.error(f"build_cell_grid_v2: cell R{ri:02d}_C{ci} failed: {e}")

-    # Sort cells by (row_index, col_index) since futures complete out of order
+    # --- Phase 2: Narrow columns via cell-crop OCR (parallel) ---
+    narrow_tasks = []
+    for row_idx, row in enumerate(content_rows):
+        for col_idx, col in enumerate(relevant_cols):
+            if col_idx in narrow_col_indices:
+                narrow_tasks.append((row_idx, col_idx, row, col))
+
+    if narrow_tasks:
+        max_workers = 4 if engine_name == "tesseract" else 2
+        with ThreadPoolExecutor(max_workers=max_workers) as pool:
+            futures = {
+                pool.submit(
+                    _ocr_cell_crop,
+                    ri, ci, row, col,
+                    ocr_img, img_bgr, img_w, img_h,
+                    engine_name, lang, lang_map,
+                ): (ri, ci)
+                for ri, ci, row, col in narrow_tasks
+            }
+            for future in as_completed(futures):
+                try:
+                    cell = future.result()
+                    cells.append(cell)
+                except Exception as e:
+                    ri, ci = futures[future]
+                    logger.error(f"build_cell_grid_v2: narrow cell R{ri:02d}_C{ci} failed: {e}")
+
+    # Sort cells by (row_index, col_index)
    cells.sort(key=lambda c: (c['row_index'], c['col_index']))

    # Remove all-empty rows
@@ -5110,7 +5163,7 @@ def build_cell_grid_v2(

    logger.info(f"build_cell_grid_v2: {len(cells)} cells from "
                f"{len(content_rows)} rows × {len(relevant_cols)} columns, "
-                f"engine={engine_name}")
+                f"engine={engine_name} (hybrid)")

    return cells, columns_meta