feat(ocr-pipeline): add SSE streaming for word recognition (Step 5)
Cells now appear one by one in the UI as they are OCR'd, accompanied by a live progress bar, instead of the UI waiting for the full result.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -19,7 +19,7 @@ import io
|
||||
import logging
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List, Dict, Any, Optional, Tuple
|
||||
from typing import Any, Dict, Generator, List, Optional, Tuple
|
||||
|
||||
import numpy as np
|
||||
|
||||
@@ -3009,6 +3009,94 @@ def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str
|
||||
return _PHONETIC_BRACKET_RE.sub(replacer, text)
|
||||
|
||||
|
||||
def _ocr_single_cell(
    row_idx: int,
    col_idx: int,
    row: RowGeometry,
    col: PageRegion,
    ocr_img: np.ndarray,
    img_bgr: Optional[np.ndarray],
    img_w: int,
    img_h: int,
    use_rapid: bool,
    engine_name: str,
    lang: str,
    lang_map: Dict[str, str],
) -> Dict[str, Any]:
    """OCR a single cell (column × row intersection) and return its dict.

    The cell rectangle is the column's x/width crossed with the row's
    y/height, grown by a fixed padding on every side and clipped to the
    image bounds.

    Args:
        row_idx: Index of the row; used in ``cell_id`` and ``row_index``.
        col_idx: Index of the column; used in ``cell_id`` and ``col_index``.
        row: Row geometry; ``row.y`` and ``row.height`` are read.
        col: Column region; ``col.x``, ``col.width`` and ``col.type`` are read.
        ocr_img: Image handed to ``ocr_region`` when ``use_rapid`` is false.
        img_bgr: Image handed to ``ocr_region_rapid`` when ``use_rapid`` is true.
        img_w: Image width in pixels, used for clipping and percentages.
        img_h: Image height in pixels, used for clipping and percentages.
        use_rapid: Selects ``ocr_region_rapid`` instead of ``ocr_region``.
        engine_name: Stored verbatim under ``ocr_engine`` in the result.
        lang: Fallback language string when ``col.type`` is not in ``lang_map``.
        lang_map: Maps a column type to its OCR language string.

    Returns:
        A dict with the keys ``cell_id``, ``row_index``, ``col_index``,
        ``col_type``, ``text``, ``confidence``, ``bbox_px``, ``bbox_pct`` and
        ``ocr_engine``. If the clipped rectangle is empty, no OCR is run:
        ``text`` is ``''``, ``confidence`` is ``0.0`` and the bounding boxes
        describe the unpadded column/row geometry.
    """
    pad = 8  # pixels of margin added around the nominal cell

    def _pct(value: float, total: int) -> float:
        # A zero-sized image has no meaningful percentages; report 0.0
        # instead of raising ZeroDivisionError.
        return round(value / total * 100, 2) if total > 0 else 0.0

    def _cell_dict(text: str, confidence: float,
                   x: int, y: int, w: int, h: int) -> Dict[str, Any]:
        # Single place that defines the shape of a cell result.
        return {
            'cell_id': f"R{row_idx:02d}_C{col_idx}",
            'row_index': row_idx,
            'col_index': col_idx,
            'col_type': col.type,
            'text': text,
            'confidence': confidence,
            'bbox_px': {'x': x, 'y': y, 'w': w, 'h': h},
            'bbox_pct': {
                'x': _pct(x, img_w),
                'y': _pct(y, img_h),
                'w': _pct(w, img_w),
                'h': _pct(h, img_h),
            },
            'ocr_engine': engine_name,
        }

    # Work with edges rather than (origin, size): when the left/top edge is
    # clamped to 0 the size must shrink accordingly, otherwise the crop would
    # extend further right/down than the intended padding.
    cell_x = max(0, col.x - pad)
    cell_y = max(0, row.y - pad)
    cell_right = min(img_w, col.x + col.width + pad)
    cell_bottom = min(img_h, row.y + row.height + pad)
    cell_w = cell_right - cell_x
    cell_h = cell_bottom - cell_y

    if cell_w <= 0 or cell_h <= 0:
        # Nothing left to OCR after clipping; report the nominal geometry.
        return _cell_dict('', 0.0, col.x, row.y, col.width, row.height)

    cell_region = PageRegion(
        type=col.type,
        x=cell_x, y=cell_y,
        width=cell_w, height=cell_h,
    )

    # OCR the cell
    if use_rapid:
        words = ocr_region_rapid(img_bgr, cell_region)
    else:
        cell_lang = lang_map.get(col.type, lang)
        words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=6)

    # Group into lines, then join in reading order. The line tolerance is
    # half the mean word height, but never below 10 px.
    if words:
        avg_h = sum(w['height'] for w in words) / len(words)
        y_tol = max(10, int(avg_h * 0.5))
    else:
        y_tol = 15
    text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)

    avg_conf = 0.0
    if words:
        avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)

    return _cell_dict(text, avg_conf, cell_x, cell_y, cell_w, cell_h)
|
||||
|
||||
|
||||
def build_cell_grid(
|
||||
ocr_img: np.ndarray,
|
||||
column_regions: List[PageRegion],
|
||||
@@ -3089,79 +3177,12 @@ def build_cell_grid(
|
||||
|
||||
for row_idx, row in enumerate(content_rows):
|
||||
for col_idx, col in enumerate(relevant_cols):
|
||||
# Compute cell region: column x/width, row y/height
|
||||
pad = 8 # pixels
|
||||
cell_x = max(0, col.x - pad)
|
||||
cell_y = max(0, row.y - pad)
|
||||
cell_w = col.width + 2 * pad
|
||||
cell_h = row.height + 2 * pad
|
||||
|
||||
# Clamp to image bounds
|
||||
if cell_x + cell_w > img_w:
|
||||
cell_w = img_w - cell_x
|
||||
if cell_y + cell_h > img_h:
|
||||
cell_h = img_h - cell_y
|
||||
|
||||
if cell_w <= 0 or cell_h <= 0:
|
||||
cells.append({
|
||||
'cell_id': f"R{row_idx:02d}_C{col_idx}",
|
||||
'row_index': row_idx,
|
||||
'col_index': col_idx,
|
||||
'col_type': col.type,
|
||||
'text': '',
|
||||
'confidence': 0.0,
|
||||
'bbox_px': {'x': col.x, 'y': row.y, 'w': col.width, 'h': row.height},
|
||||
'bbox_pct': {
|
||||
'x': round(col.x / img_w * 100, 2),
|
||||
'y': round(row.y / img_h * 100, 2),
|
||||
'w': round(col.width / img_w * 100, 2),
|
||||
'h': round(row.height / img_h * 100, 2),
|
||||
},
|
||||
'ocr_engine': engine_name,
|
||||
})
|
||||
continue
|
||||
|
||||
cell_region = PageRegion(
|
||||
type=col.type,
|
||||
x=cell_x, y=cell_y,
|
||||
width=cell_w, height=cell_h,
|
||||
cell = _ocr_single_cell(
|
||||
row_idx, col_idx, row, col,
|
||||
ocr_img, img_bgr, img_w, img_h,
|
||||
use_rapid, engine_name, lang, lang_map,
|
||||
)
|
||||
|
||||
# OCR the cell
|
||||
if use_rapid:
|
||||
words = ocr_region_rapid(img_bgr, cell_region)
|
||||
else:
|
||||
cell_lang = lang_map.get(col.type, lang)
|
||||
words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=6)
|
||||
|
||||
# Group into lines, then join in reading order
|
||||
if words:
|
||||
avg_h = sum(w['height'] for w in words) / len(words)
|
||||
y_tol = max(10, int(avg_h * 0.5))
|
||||
else:
|
||||
y_tol = 15
|
||||
text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
|
||||
|
||||
avg_conf = 0.0
|
||||
if words:
|
||||
avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
|
||||
|
||||
cells.append({
|
||||
'cell_id': f"R{row_idx:02d}_C{col_idx}",
|
||||
'row_index': row_idx,
|
||||
'col_index': col_idx,
|
||||
'col_type': col.type,
|
||||
'text': text,
|
||||
'confidence': avg_conf,
|
||||
'bbox_px': {'x': cell_x, 'y': cell_y, 'w': cell_w, 'h': cell_h},
|
||||
'bbox_pct': {
|
||||
'x': round(cell_x / img_w * 100, 2),
|
||||
'y': round(cell_y / img_h * 100, 2),
|
||||
'w': round(cell_w / img_w * 100, 2),
|
||||
'h': round(cell_h / img_h * 100, 2),
|
||||
},
|
||||
'ocr_engine': engine_name,
|
||||
})
|
||||
cells.append(cell)
|
||||
|
||||
logger.info(f"build_cell_grid: {len(cells)} cells from "
|
||||
f"{len(content_rows)} rows × {len(relevant_cols)} columns, "
|
||||
@@ -3170,6 +3191,72 @@ def build_cell_grid(
|
||||
return cells, columns_meta
|
||||
|
||||
|
||||
def build_cell_grid_streaming(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
) -> Generator[Tuple[Dict[str, Any], List[Dict[str, Any]], int], None, None]:
    """Like build_cell_grid(), but yields each cell as it is OCR'd.

    Cells are produced row by row and, within a row, in left-to-right column
    order (columns are sorted by ``x``). Only rows whose ``row_type`` is
    ``'content'`` and columns whose ``type`` is not a skipped type take part.
    If no rows or no columns remain, the generator finishes without yielding.

    Args:
        ocr_img: Image passed through to ``_ocr_single_cell`` for Tesseract.
        column_regions: Candidate columns; filtered and sorted here.
        row_geometries: Candidate rows; filtered to content rows here.
        img_w: Image width in pixels.
        img_h: Image height in pixels.
        lang: Fallback OCR language for column types without a specific one.
        ocr_engine: ``"auto"`` or ``"rapid"`` may select RapidOCR; any other
            value uses Tesseract.
        img_bgr: Image passed through to ``_ocr_single_cell`` for RapidOCR.

    Yields:
        (cell_dict, columns_meta, total_cells) for each cell. The same
        ``columns_meta`` list object and ``total_cells`` value accompany
        every cell.
    """
    # Resolve engine choice.
    use_rapid = False
    if ocr_engine == "auto":
        use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None
    elif ocr_engine == "rapid":
        if not RAPIDOCR_AVAILABLE:
            logger.warning("RapidOCR requested but not available, falling back to Tesseract")
        elif img_bgr is None:
            # The "auto" branch only picks RapidOCR when a BGR image exists;
            # apply the same requirement here instead of handing None to the
            # RapidOCR path.
            logger.warning("RapidOCR requested but no BGR image provided, falling back to Tesseract")
        else:
            use_rapid = True

    engine_name = "rapid" if use_rapid else "tesseract"

    content_rows = [r for r in row_geometries if r.row_type == 'content']
    if not content_rows:
        return

    _skip_types = {'column_ignore', 'header', 'footer', 'page_ref'}
    # sorted() builds a new list, so the caller's column_regions is untouched.
    relevant_cols = sorted(
        (c for c in column_regions if c.type not in _skip_types),
        key=lambda c: c.x,
    )
    if not relevant_cols:
        return

    columns_meta = [
        {
            'index': col_idx,
            'type': col.type,
            'x': col.x,
            'width': col.width,
        }
        for col_idx, col in enumerate(relevant_cols)
    ]

    # Per-column-type OCR language; other types fall back to ``lang``.
    lang_map = {
        'column_en': 'eng',
        'column_de': 'deu',
        'column_example': 'eng+deu',
    }

    # Known up front so a consumer can compute progress from the first cell.
    total_cells = len(content_rows) * len(relevant_cols)

    for row_idx, row in enumerate(content_rows):
        for col_idx, col in enumerate(relevant_cols):
            cell = _ocr_single_cell(
                row_idx, col_idx, row, col,
                ocr_img, img_bgr, img_w, img_h,
                use_rapid, engine_name, lang, lang_map,
            )
            yield cell, columns_meta, total_cells
|
||||
|
||||
|
||||
def _cells_to_vocab_entries(
|
||||
cells: List[Dict[str, Any]],
|
||||
columns_meta: List[Dict[str, Any]],
|
||||
|
||||
Reference in New Issue
Block a user