feat(ocr-pipeline): add SSE streaming for word recognition (Step 5)

Cells now appear one-by-one in the UI as they are OCR'd, with a live progress bar, instead of waiting for the full result.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-01 17:54:20 +01:00
parent a666e883da
commit 7f27783008
3 changed files with 506 additions and 93 deletions
--- a/klausur-service/backend/cv_vocab_pipeline.py
+++ b/klausur-service/backend/cv_vocab_pipeline.py
@@ -19,7 +19,7 @@ import io
 import logging
 import time
 from dataclasses import dataclass, field
-from typing import List, Dict, Any, Optional, Tuple
+from typing import Any, Dict, Generator, List, Optional, Tuple

 import numpy as np

@@ -3009,6 +3009,94 @@ def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str
    return _PHONETIC_BRACKET_RE.sub(replacer, text)


+def _ocr_single_cell(
+    row_idx: int,
+    col_idx: int,
+    row: RowGeometry,
+    col: PageRegion,
+    ocr_img: np.ndarray,
+    img_bgr: Optional[np.ndarray],
+    img_w: int,
+    img_h: int,
+    use_rapid: bool,
+    engine_name: str,
+    lang: str,
+    lang_map: Dict[str, str],
+) -> Dict[str, Any]:
+    """Run OCR on one grid cell (one column crossed with one row).
+
+    The crop is the column/row box grown by 8 px of padding and clamped to
+    the image; a crop with no area left is reported without running OCR.
+
+    Args:
+        row_idx, col_idx: Grid position, used for ``cell_id`` and the indices.
+        row, col: Supply y/height and x/width/type of the cell respectively.
+        ocr_img: Image handed to ``ocr_region`` when ``use_rapid`` is false.
+        img_bgr: Image handed to ``ocr_region_rapid`` when ``use_rapid`` is true.
+        img_w, img_h: Image size, used for clamping and for percentages.
+        use_rapid: Selects ``ocr_region_rapid`` over ``ocr_region``.
+        engine_name: Copied verbatim into the result as ``ocr_engine``.
+        lang: Fallback ``ocr_region`` language for unmapped column types.
+        lang_map: Column type -> ``ocr_region`` language.
+
+    Returns:
+        Dict with cell_id, row_index, col_index, col_type, text, confidence,
+        bbox_px, bbox_pct (percent of image size) and ocr_engine. Empty-area
+        cells get text '' and confidence 0.0 with the unpadded column/row box.
+    """
+    def _cell(text, conf, x, y, w, h):
+        # Single definition of the cell schema, shared by both return paths.
+        return {
+            'cell_id': f"R{row_idx:02d}_C{col_idx}",
+            'row_index': row_idx,
+            'col_index': col_idx,
+            'col_type': col.type,
+            'text': text,
+            'confidence': conf,
+            'bbox_px': {'x': x, 'y': y, 'w': w, 'h': h},
+            'bbox_pct': {
+                'x': round(x / img_w * 100, 2),
+                'y': round(y / img_h * 100, 2),
+                'w': round(w / img_w * 100, 2),
+                'h': round(h / img_h * 100, 2),
+            },
+            'ocr_engine': engine_name,
+        }
+
+    margin = 8  # pixels
+    left = max(0, col.x - margin)
+    top = max(0, row.y - margin)
+    # Size includes both margins, then is cut off at the right/bottom edge.
+    box_w = min(col.width + 2 * margin, img_w - left)
+    box_h = min(row.height + 2 * margin, img_h - top)
+    if box_w <= 0 or box_h <= 0:
+        # Nothing left to read: report the unpadded geometry and skip OCR.
+        return _cell('', 0.0, col.x, row.y, col.width, row.height)
+
+    region = PageRegion(
+        type=col.type,
+        x=left, y=top,
+        width=box_w, height=box_h,
+    )
+    if use_rapid:
+        words = ocr_region_rapid(img_bgr, region)
+    else:
+        cell_lang = lang_map.get(col.type, lang)
+        words = ocr_region(ocr_img, region, lang=cell_lang, psm=6)
+
+    # Line-grouping tolerance: half the mean word height, at least 10 px
+    # (15 px when OCR found nothing).
+    if words:
+        y_tol = max(10, int(sum(w['height'] for w in words) / len(words) * 0.5))
+    else:
+        y_tol = 15
+    text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
+
+    # Cell confidence is the mean word confidence, rounded to one decimal.
+    mean_conf = round(sum(w['conf'] for w in words) / len(words), 1) if words else 0.0
+    return _cell(text, mean_conf, left, top, box_w, box_h)
+
+
 def build_cell_grid(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
@@ -3089,79 +3177,12 @@ def build_cell_grid(

    for row_idx, row in enumerate(content_rows):
        for col_idx, col in enumerate(relevant_cols):
-            # Compute cell region: column x/width, row y/height
-            pad = 8  # pixels
-            cell_x = max(0, col.x - pad)
-            cell_y = max(0, row.y - pad)
-            cell_w = col.width + 2 * pad
-            cell_h = row.height + 2 * pad
-
-            # Clamp to image bounds
-            if cell_x + cell_w > img_w:
-                cell_w = img_w - cell_x
-            if cell_y + cell_h > img_h:
-                cell_h = img_h - cell_y
-
-            if cell_w <= 0 or cell_h <= 0:
-                cells.append({
-                    'cell_id': f"R{row_idx:02d}_C{col_idx}",
-                    'row_index': row_idx,
-                    'col_index': col_idx,
-                    'col_type': col.type,
-                    'text': '',
-                    'confidence': 0.0,
-                    'bbox_px': {'x': col.x, 'y': row.y, 'w': col.width, 'h': row.height},
-                    'bbox_pct': {
-                        'x': round(col.x / img_w * 100, 2),
-                        'y': round(row.y / img_h * 100, 2),
-                        'w': round(col.width / img_w * 100, 2),
-                        'h': round(row.height / img_h * 100, 2),
-                    },
-                    'ocr_engine': engine_name,
-                })
-                continue
-
-            cell_region = PageRegion(
-                type=col.type,
-                x=cell_x, y=cell_y,
-                width=cell_w, height=cell_h,
+            cell = _ocr_single_cell(
+                row_idx, col_idx, row, col,
+                ocr_img, img_bgr, img_w, img_h,
+                use_rapid, engine_name, lang, lang_map,
            )
-
-            # OCR the cell
-            if use_rapid:
-                words = ocr_region_rapid(img_bgr, cell_region)
-            else:
-                cell_lang = lang_map.get(col.type, lang)
-                words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=6)
-
-            # Group into lines, then join in reading order
-            if words:
-                avg_h = sum(w['height'] for w in words) / len(words)
-                y_tol = max(10, int(avg_h * 0.5))
-            else:
-                y_tol = 15
-            text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
-
-            avg_conf = 0.0
-            if words:
-                avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
-
-            cells.append({
-                'cell_id': f"R{row_idx:02d}_C{col_idx}",
-                'row_index': row_idx,
-                'col_index': col_idx,
-                'col_type': col.type,
-                'text': text,
-                'confidence': avg_conf,
-                'bbox_px': {'x': cell_x, 'y': cell_y, 'w': cell_w, 'h': cell_h},
-                'bbox_pct': {
-                    'x': round(cell_x / img_w * 100, 2),
-                    'y': round(cell_y / img_h * 100, 2),
-                    'w': round(cell_w / img_w * 100, 2),
-                    'h': round(cell_h / img_h * 100, 2),
-                },
-                'ocr_engine': engine_name,
-            })
+            cells.append(cell)

    logger.info(f"build_cell_grid: {len(cells)} cells from "
                f"{len(content_rows)} rows × {len(relevant_cols)} columns, "
@@ -3170,6 +3191,72 @@ def build_cell_grid(
    return cells, columns_meta


+def build_cell_grid_streaming(
+    ocr_img: np.ndarray,
+    column_regions: List[PageRegion],
+    row_geometries: List[RowGeometry],
+    img_w: int,
+    img_h: int,
+    lang: str = "eng+deu",
+    ocr_engine: str = "auto",
+    img_bgr: Optional[np.ndarray] = None,
+) -> Generator[Tuple[Dict[str, Any], List[Dict[str, Any]], int], None, None]:
+    """Like build_cell_grid(), but yields each cell as it is OCR'd.
+
+    Cells come row by row, columns in ascending x. Only rows whose row_type is
+    'content' and columns not typed column_ignore/header/footer/page_ref are
+    used; if either set is empty the generator ends without yielding.
+
+    Args:
+        ocr_img, img_w, img_h: Forwarded unchanged to ``_ocr_single_cell``.
+        column_regions, row_geometries: Page layout; neither list is modified.
+        lang: Fallback OCR language for column types without a mapping.
+        ocr_engine: "auto" picks RapidOCR if available and ``img_bgr`` is set,
+            "rapid" requests it explicitly, any other value means Tesseract.
+        img_bgr: Image for RapidOCR; without it RapidOCR is never selected.
+
+    Yields:
+        (cell_dict, columns_meta, total_cells) for each cell; columns_meta and
+        total_cells are the same on every iteration.
+    """
+    # Resolve engine choice (as in build_cell_grid, plus an img_bgr guard)
+    use_rapid = False
+    if ocr_engine == "auto":
+        use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None
+    elif ocr_engine == "rapid":
+        if not RAPIDOCR_AVAILABLE:
+            logger.warning("RapidOCR requested but not available, falling back to Tesseract")
+        elif img_bgr is None:
+            # Otherwise _ocr_single_cell would pass None to ocr_region_rapid.
+            logger.warning("RapidOCR requested but img_bgr is missing, falling back to Tesseract")
+        else:
+            use_rapid = True
+    engine_name = "rapid" if use_rapid else "tesseract"
+
+    content_rows = [r for r in row_geometries if r.row_type == 'content']
+    if not content_rows:
+        return
+    _skip_types = {'column_ignore', 'header', 'footer', 'page_ref'}
+    # The comprehension makes a new list, so sorting leaves column_regions as is.
+    relevant_cols = [c for c in column_regions if c.type not in _skip_types]
+    relevant_cols.sort(key=lambda c: c.x)
+    if not relevant_cols:
+        return
+
+    columns_meta = [
+        {'index': col_idx, 'type': col.type, 'x': col.x, 'width': col.width}
+        for col_idx, col in enumerate(relevant_cols)
+    ]
+    lang_map = {'column_en': 'eng', 'column_de': 'deu', 'column_example': 'eng+deu'}
+    total_cells = len(content_rows) * len(relevant_cols)
+
+    for row_idx, row in enumerate(content_rows):
+        for col_idx, col in enumerate(relevant_cols):
+            cell = _ocr_single_cell(row_idx, col_idx, row, col, ocr_img, img_bgr, img_w,
+                                    img_h, use_rapid, engine_name, lang, lang_map)
+            yield cell, columns_meta, total_cells
+
+
 def _cells_to_vocab_entries(
    cells: List[Dict[str, Any]],
    columns_meta: List[Dict[str, Any]],