feat(ocr-pipeline): add SSE streaming for word recognition (Step 5)

Cells now appear one-by-one in the UI as they are OCR'd, with a live progress bar, instead of waiting for the full result.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-01 17:54:20 +01:00
parent a666e883da
commit 7f27783008
3 changed files with 506 additions and 93 deletions
@@ -62,7 +62,11 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec
  const [usedEngine, setUsedEngine] = useState<string>('')
  const [pronunciation, setPronunciation] = useState<'british' | 'american'>('british')
  // Streaming progress state
  const [streamProgress, setStreamProgress] = useState<{ current: number; total: number } | null>(null)
  const enRef = useRef<HTMLInputElement>(null)
  const tableEndRef = useRef<HTMLDivElement>(null)
  const isVocab = gridResult?.layout === 'vocab'
@@ -110,16 +114,107 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec
    const eng = engine || ocrEngine
    setDetecting(true)
    setError(null)
    setStreamProgress(null)
    setEditedCells([])
    setEditedEntries([])
    setGridResult(null)
    try {
-      const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/words?engine=${eng}&pronunciation=${pronunciation}`, {
+      const res = await fetch(
-        method: 'POST',
+        `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/words?stream=true&engine=${eng}&pronunciation=${pronunciation}`,
-      })
+        { method: 'POST' },
      )
      if (!res.ok) {
        const err = await res.json().catch(() => ({ detail: res.statusText }))
        throw new Error(err.detail || 'Worterkennung fehlgeschlagen')
      }
-      const data = await res.json()
+
-      applyGridResult(data)
+      const reader = res.body!.getReader()
      const decoder = new TextDecoder()
      let buffer = ''
      let streamLayout: string | null = null
      let streamColumnsUsed: GridResult['columns_used'] = []
      let streamGridShape: GridResult['grid_shape'] | null = null
      let streamCells: GridCell[] = []
      while (true) {
        const { done, value } = await reader.read()
        if (done) break
        buffer += decoder.decode(value, { stream: true })
        // Parse SSE events (separated by \n\n)
        while (buffer.includes('\n\n')) {
          const idx = buffer.indexOf('\n\n')
          const chunk = buffer.slice(0, idx).trim()
          buffer = buffer.slice(idx + 2)
          if (!chunk.startsWith('data: ')) continue
          const dataStr = chunk.slice(6) // strip "data: "
          let event: any
          try {
            event = JSON.parse(dataStr)
          } catch {
            continue
          }
          if (event.type === 'meta') {
            streamLayout = event.layout || 'generic'
            streamGridShape = event.grid_shape || null
            // Show partial grid result so UI renders structure
            setGridResult(prev => ({
              ...prev,
              layout: event.layout || 'generic',
              grid_shape: event.grid_shape,
              columns_used: [],
              cells: [],
              summary: { total_cells: event.grid_shape?.total_cells || 0, non_empty_cells: 0, low_confidence: 0 },
              duration_seconds: 0,
              ocr_engine: '',
            } as GridResult))
          }
          if (event.type === 'columns') {
            streamColumnsUsed = event.columns_used || []
            setGridResult(prev => prev ? { ...prev, columns_used: streamColumnsUsed } : prev)
          }
          if (event.type === 'cell') {
            const cell: GridCell = { ...event.cell, status: 'pending' }
            streamCells = [...streamCells, cell]
            setEditedCells(streamCells)
            setStreamProgress(event.progress)
            // Auto-scroll table to bottom
            setTimeout(() => tableEndRef.current?.scrollIntoView({ behavior: 'smooth', block: 'nearest' }), 16)
          }
          if (event.type === 'complete') {
            // Build final GridResult
            const finalResult: GridResult = {
              cells: streamCells,
              grid_shape: streamGridShape || { rows: 0, cols: 0, total_cells: streamCells.length },
              columns_used: streamColumnsUsed,
              layout: streamLayout || 'generic',
              image_width: 0,
              image_height: 0,
              duration_seconds: event.duration_seconds || 0,
              ocr_engine: event.ocr_engine || '',
              summary: event.summary || {},
            }
            // If vocab: apply post-processed entries from complete event
            if (event.vocab_entries) {
              finalResult.entries = event.vocab_entries
              finalResult.vocab_entries = event.vocab_entries
              finalResult.entry_count = event.vocab_entries.length
            }
            applyGridResult(finalResult)
            setUsedEngine(event.ocr_engine || '')
            setStreamProgress(null)
          }
        }
      }
    } catch (e) {
      setError(e instanceof Error ? e.message : 'Unbekannter Fehler')
    } finally {
@@ -288,11 +383,23 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec
  return (
    <div className="space-y-4">
-      {/* Loading */}
+      {/* Loading with streaming progress */}
      {detecting && (
        <div className="space-y-1">
          <div className="flex items-center gap-2 text-teal-600 dark:text-teal-400 text-sm">
            <div className="animate-spin w-4 h-4 border-2 border-teal-500 border-t-transparent rounded-full" />
-          Worterkennung laeuft...
+            {streamProgress
              ? `Zelle ${streamProgress.current}/${streamProgress.total} erkannt...`
              : 'Worterkennung startet...'}
          </div>
          {streamProgress && streamProgress.total > 0 && (
            <div className="w-full bg-gray-200 dark:bg-gray-700 rounded-full h-1.5">
              <div
                className="bg-teal-500 h-1.5 rounded-full transition-all duration-150"
                style={{ width: `${(streamProgress.current / streamProgress.total) * 100}%` }}
              />
            </div>
          )}
        </div>
      )}
@@ -378,8 +485,8 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec
            </div>
          </div>
-          {/* Result summary */}
+          {/* Result summary (only after streaming completes) */}
-          {gridResult && summary && (
+          {gridResult && summary && !detecting && (
            <div className="bg-white dark:bg-gray-800 rounded-xl border border-gray-200 dark:border-gray-700 p-4 space-y-3">
              <div className="flex items-center justify-between">
                <h4 className="text-sm font-medium text-gray-700 dark:text-gray-300">
@@ -511,6 +618,67 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec
                    </tbody>
                  </table>
                )}
                <div ref={tableEndRef} />
              </div>
            </div>
          )}
          {/* Streaming cell table (shown while detecting, before complete) */}
          {detecting && editedCells.length > 0 && !gridResult?.summary?.non_empty_cells && (
            <div className="bg-white dark:bg-gray-800 rounded-xl border border-gray-200 dark:border-gray-700 p-4 space-y-3">
              <h4 className="text-sm font-medium text-gray-700 dark:text-gray-300">
                Live: {editedCells.length} Zellen erkannt...
              </h4>
              <div className="max-h-80 overflow-y-auto">
                <table className="w-full text-xs">
                  <thead className="sticky top-0 bg-white dark:bg-gray-800">
                    <tr className="text-left text-gray-500 dark:text-gray-400 border-b dark:border-gray-700">
                      <th className="py-1 pr-2 w-12">Zelle</th>
                      {columnsUsed.map((col, i) => (
                        <th key={i} className={`py-1 pr-2 ${colTypeColor(col.type)}`}>
                          {colTypeLabel(col.type)}
                        </th>
                      ))}
                      <th className="py-1 w-12 text-right">Conf</th>
                    </tr>
                  </thead>
                  <tbody>
                    {(() => {
                      const liveByRow: Map<number, GridCell[]> = new Map()
                      for (const cell of editedCells) {
                        const existing = liveByRow.get(cell.row_index) || []
                        existing.push(cell)
                        liveByRow.set(cell.row_index, existing)
                      }
                      const liveSorted = [...liveByRow.keys()].sort((a, b) => a - b)
                      return liveSorted.map(rowIdx => {
                        const rowCells = liveByRow.get(rowIdx) || []
                        const avgConf = rowCells.length
                          ? Math.round(rowCells.reduce((s, c) => s + c.confidence, 0) / rowCells.length)
                          : 0
                        return (
                          <tr key={rowIdx} className="border-b dark:border-gray-700/50 animate-fade-in">
                            <td className="py-1 pr-2 text-gray-400 font-mono text-[10px]">
                              R{String(rowIdx).padStart(2, '0')}
                            </td>
                            {columnsUsed.map((col) => {
                              const cell = rowCells.find(c => c.col_index === col.index)
                              return (
                                <td key={col.index} className="py-1 pr-2 font-mono text-gray-700 dark:text-gray-300">
                                  <MultilineText text={cell?.text || ''} />
                                </td>
                              )
                            })}
                            <td className={`py-1 text-right font-mono ${confColor(avgConf)}`}>
                              {avgConf}%
                            </td>
                          </tr>
                        )
                      })
                    })()}
                  </tbody>
                </table>
                <div ref={tableEndRef} />
              </div>
            </div>
          )}
@@ -19,7 +19,7 @@ import io
 import logging
 import time
 from dataclasses import dataclass, field
-from typing import List, Dict, Any, Optional, Tuple
+from typing import Any, Dict, Generator, List, Optional, Tuple
 import numpy as np
@@ -3009,6 +3009,94 @@ def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str
    return _PHONETIC_BRACKET_RE.sub(replacer, text)
 def _ocr_single_cell(
    row_idx: int,
    col_idx: int,
    row: RowGeometry,
    col: PageRegion,
    ocr_img: np.ndarray,
    img_bgr: Optional[np.ndarray],
    img_w: int,
    img_h: int,
    use_rapid: bool,
    engine_name: str,
    lang: str,
    lang_map: Dict[str, str],
 ) -> Dict[str, Any]:
    """OCR a single cell (column × row intersection) and return its dict.

    The column/row rectangle is padded on every side and each edge is clamped
    to the image independently, so a cell that touches the left or top border
    is not widened past its intended right/bottom edge.

    Args:
        row_idx: Row position, used for ``cell_id`` and ``row_index``.
        col_idx: Column position, used for ``cell_id`` and ``col_index``.
        row: Row geometry; ``y`` and ``height`` are read.
        col: Column region; ``x``, ``width`` and ``type`` are read.
        ocr_img: Image passed to ``ocr_region`` when ``use_rapid`` is false.
        img_bgr: Image passed to ``ocr_region_rapid`` when ``use_rapid`` is true.
        img_w: Image width in pixels (clamping and percentage boxes).
        img_h: Image height in pixels (clamping and percentage boxes).
        use_rapid: Selects ``ocr_region_rapid`` instead of ``ocr_region``.
        engine_name: Stored verbatim under ``ocr_engine`` in the result.
        lang: Fallback language for ``ocr_region`` when the column type has
            no entry in ``lang_map``.
        lang_map: Column type -> language override for ``ocr_region``.

    Returns:
        Dict with ``cell_id``, ``row_index``, ``col_index``, ``col_type``,
        ``text``, ``confidence``, ``bbox_px``, ``bbox_pct`` and ``ocr_engine``.
        When the padded region has no area inside the image, ``text`` is empty,
        ``confidence`` is 0.0 and the boxes describe the unpadded column/row.
    """
    def _pct(value, total):
        # Guard against a zero-sized image instead of raising ZeroDivisionError.
        return round(value / total * 100, 2) if total > 0 else 0.0

    def _cell_dict(text, confidence, x, y, w, h):
        # Single place that defines the shape of a cell record.
        return {
            'cell_id': f"R{row_idx:02d}_C{col_idx}",
            'row_index': row_idx,
            'col_index': col_idx,
            'col_type': col.type,
            'text': text,
            'confidence': confidence,
            'bbox_px': {'x': x, 'y': y, 'w': w, 'h': h},
            'bbox_pct': {
                'x': _pct(x, img_w),
                'y': _pct(y, img_h),
                'w': _pct(w, img_w),
                'h': _pct(h, img_h),
            },
            'ocr_engine': engine_name,
        }

    pad = 8  # pixels

    # Clamp all four edges separately. Deriving the size from a clamped origin
    # plus "size + 2 * pad" would over-extend the far edge whenever the near
    # edge was cut off by the image border.
    cell_x = max(0, col.x - pad)
    cell_y = max(0, row.y - pad)
    cell_right = min(img_w, col.x + col.width + pad)
    cell_bottom = min(img_h, row.y + row.height + pad)
    cell_w = cell_right - cell_x
    cell_h = cell_bottom - cell_y

    if cell_w <= 0 or cell_h <= 0:
        # Nothing to OCR: report an empty cell with the unpadded geometry.
        return _cell_dict('', 0.0, col.x, row.y, col.width, row.height)

    cell_region = PageRegion(
        type=col.type,
        x=cell_x, y=cell_y,
        width=cell_w, height=cell_h,
    )

    # OCR the cell
    if use_rapid:
        words = ocr_region_rapid(img_bgr, cell_region)
    else:
        cell_lang = lang_map.get(col.type, lang)
        words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=6)

    # Group into lines, then join in reading order. The line tolerance is half
    # the mean word height (at least 10 px); 15 px when no words were found.
    if words:
        avg_h = sum(w['height'] for w in words) / len(words)
        y_tol = max(10, int(avg_h * 0.5))
        avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
    else:
        y_tol = 15
        avg_conf = 0.0

    text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)

    return _cell_dict(text, avg_conf, cell_x, cell_y, cell_w, cell_h)
 def build_cell_grid(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
@@ -3089,79 +3177,12 @@ def build_cell_grid(
    for row_idx, row in enumerate(content_rows):
        for col_idx, col in enumerate(relevant_cols):
-            # Compute cell region: column x/width, row y/height
+            cell = _ocr_single_cell(
-            pad = 8  # pixels
+                row_idx, col_idx, row, col,
-            cell_x = max(0, col.x - pad)
+                ocr_img, img_bgr, img_w, img_h,
-            cell_y = max(0, row.y - pad)
+                use_rapid, engine_name, lang, lang_map,
            cell_w = col.width + 2 * pad
            cell_h = row.height + 2 * pad
            # Clamp to image bounds
            if cell_x + cell_w > img_w:
                cell_w = img_w - cell_x
            if cell_y + cell_h > img_h:
                cell_h = img_h - cell_y
            if cell_w <= 0 or cell_h <= 0:
                cells.append({
                    'cell_id': f"R{row_idx:02d}_C{col_idx}",
                    'row_index': row_idx,
                    'col_index': col_idx,
                    'col_type': col.type,
                    'text': '',
                    'confidence': 0.0,
                    'bbox_px': {'x': col.x, 'y': row.y, 'w': col.width, 'h': row.height},
                    'bbox_pct': {
                        'x': round(col.x / img_w * 100, 2),
                        'y': round(row.y / img_h * 100, 2),
                        'w': round(col.width / img_w * 100, 2),
                        'h': round(row.height / img_h * 100, 2),
                    },
                    'ocr_engine': engine_name,
                })
                continue
            cell_region = PageRegion(
                type=col.type,
                x=cell_x, y=cell_y,
                width=cell_w, height=cell_h,
            )
-
+            cells.append(cell)
            # OCR the cell
            if use_rapid:
                words = ocr_region_rapid(img_bgr, cell_region)
            else:
                cell_lang = lang_map.get(col.type, lang)
                words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=6)
            # Group into lines, then join in reading order
            if words:
                avg_h = sum(w['height'] for w in words) / len(words)
                y_tol = max(10, int(avg_h * 0.5))
            else:
                y_tol = 15
            text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
            avg_conf = 0.0
            if words:
                avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
            cells.append({
                'cell_id': f"R{row_idx:02d}_C{col_idx}",
                'row_index': row_idx,
                'col_index': col_idx,
                'col_type': col.type,
                'text': text,
                'confidence': avg_conf,
                'bbox_px': {'x': cell_x, 'y': cell_y, 'w': cell_w, 'h': cell_h},
                'bbox_pct': {
                    'x': round(cell_x / img_w * 100, 2),
                    'y': round(cell_y / img_h * 100, 2),
                    'w': round(cell_w / img_w * 100, 2),
                    'h': round(cell_h / img_h * 100, 2),
                },
                'ocr_engine': engine_name,
            })
    logger.info(f"build_cell_grid: {len(cells)} cells from "
                f"{len(content_rows)} rows × {len(relevant_cols)} columns, "
@@ -3170,6 +3191,72 @@ def build_cell_grid(
    return cells, columns_meta
 def build_cell_grid_streaming(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
 ) -> Generator[Tuple[Dict[str, Any], List[Dict[str, Any]], int], None, None]:
    """Generator variant of build_cell_grid(): one cell per iteration.

    Rows whose ``row_type`` is not ``'content'`` and columns of a skipped type
    are left out. Remaining columns are visited left to right (sorted by
    ``x``) inside each row. Nothing is yielded when no rows or no columns
    remain.

    Yields:
        (cell_dict, columns_meta, total_cells) for each cell. ``columns_meta``
        is the same list object on every iteration and ``total_cells`` is
        rows × columns.
    """
    # Pick the OCR backend (mirrors build_cell_grid).
    rapid_selected = False
    if ocr_engine == "rapid":
        if RAPIDOCR_AVAILABLE:
            rapid_selected = True
        else:
            logger.warning("RapidOCR requested but not available, falling back to Tesseract")
    elif ocr_engine == "auto":
        rapid_selected = RAPIDOCR_AVAILABLE and img_bgr is not None
    backend_label = "rapid" if rapid_selected else "tesseract"

    body_rows = [geom for geom in row_geometries if geom.row_type == 'content']
    if not body_rows:
        return

    ignored_kinds = {'column_ignore', 'header', 'footer', 'page_ref'}
    ocr_columns = sorted(
        (region for region in column_regions if region.type not in ignored_kinds),
        key=lambda region: region.x,
    )
    if not ocr_columns:
        return

    # Column description shared with every yielded cell.
    columns_meta = []
    for position, region in enumerate(ocr_columns):
        columns_meta.append({
            'index': position,
            'type': region.type,
            'x': region.x,
            'width': region.width,
        })

    # Per-column-type language overrides; other types use ``lang``.
    lang_map = {
        'column_en': 'eng',
        'column_de': 'deu',
        'column_example': 'eng+deu',
    }

    total_cells = len(body_rows) * len(ocr_columns)

    for r_pos, row_geom in enumerate(body_rows):
        for c_pos, col_region in enumerate(ocr_columns):
            yield (
                _ocr_single_cell(
                    r_pos, c_pos, row_geom, col_region,
                    ocr_img, img_bgr, img_w, img_h,
                    rapid_selected, backend_label, lang, lang_map,
                ),
                columns_meta,
                total_cells,
            )
 def _cells_to_vocab_entries(
    cells: List[Dict[str, Any]],
    columns_meta: List[Dict[str, Any]],
@@ -15,6 +15,7 @@ Lizenz: Apache 2.0
 DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
 """
 import json
 import logging
 import time
 import uuid
@@ -24,8 +25,8 @@ from typing import Any, Dict, List, Optional
 import cv2
 import numpy as np
-from fastapi import APIRouter, File, Form, HTTPException, UploadFile
+from fastapi import APIRouter, File, Form, HTTPException, Request, UploadFile
-from fastapi.responses import Response
+from fastapi.responses import Response, StreamingResponse
 from pydantic import BaseModel
 from cv_vocab_pipeline import (
@@ -39,6 +40,7 @@ from cv_vocab_pipeline import (
    analyze_layout,
    analyze_layout_by_words,
    build_cell_grid,
    build_cell_grid_streaming,
    build_word_grid,
    classify_column_types,
    create_layout_image,
@@ -1023,12 +1025,19 @@ async def get_row_ground_truth(session_id: str):
 # ---------------------------------------------------------------------------
@router.post("/sessions/{session_id}/words")
-async def detect_words(session_id: str, engine: str = "auto", pronunciation: str = "british"):
+async def detect_words(
    session_id: str,
    request: Request,
    engine: str = "auto",
    pronunciation: str = "british",
    stream: bool = False,
 ):
    """Build word grid from columns × rows, OCR each cell.
    Query params:
        engine: 'auto' (default), 'tesseract', or 'rapid'
        pronunciation: 'british' (default) or 'american' — for IPA dictionary lookup
        stream: false (default) for JSON response, true for SSE streaming
    """
    if session_id not in _cache:
        await _load_session_to_cache(session_id)
@@ -1049,12 +1058,6 @@ async def detect_words(session_id: str, engine: str = "auto", pronunciation: str
    if not row_result or not row_result.get("rows"):
        raise HTTPException(status_code=400, detail="Row detection must be completed first")
    t0 = time.time()
    # Create binarized OCR image (for Tesseract)
    ocr_img = create_ocr_image(dewarped_bgr)
    img_h, img_w = dewarped_bgr.shape[:2]
    # Convert column dicts back to PageRegion objects
    col_regions = [
        PageRegion(
@@ -1081,6 +1084,27 @@ async def detect_words(session_id: str, engine: str = "auto", pronunciation: str
        for r in row_result["rows"]
    ]
    if stream:
        return StreamingResponse(
            _word_stream_generator(
                session_id, cached, col_regions, row_geoms,
                dewarped_bgr, engine, pronunciation, request,
            ),
            media_type="text/event-stream",
            headers={
                "Cache-Control": "no-cache",
                "Connection": "keep-alive",
                "X-Accel-Buffering": "no",
            },
        )
    # --- Non-streaming path (unchanged) ---
    t0 = time.time()
    # Create binarized OCR image (for Tesseract)
    ocr_img = create_ocr_image(dewarped_bgr)
    img_h, img_w = dewarped_bgr.shape[:2]
    # Build generic cell grid
    cells, columns_meta = build_cell_grid(
        ocr_img, col_regions, row_geoms, img_w, img_h,
@@ -1154,6 +1178,140 @@ async def detect_words(session_id: str, engine: str = "auto", pronunciation: str
    }
 async def _word_stream_generator(
    session_id: str,
    cached: Dict[str, Any],
    col_regions: List[PageRegion],
    row_geoms: List[RowGeometry],
    dewarped_bgr: np.ndarray,
    engine: str,
    pronunciation: str,
    request: Request,
 ):
    """SSE generator that yields cell-by-cell OCR progress.

    Every yielded string is one ``data: <json>`` frame terminated by a blank
    line. Events are emitted in this order:

    1. ``meta``     – ``grid_shape`` and ``layout``.
    2. ``columns``  – ``columns_used``; sent once, right before the first cell.
    3. ``cell``     – one per OCR'd cell, with ``progress`` (current/total).
    4. ``complete`` – ``summary``, ``duration_seconds``, ``ocr_engine`` and,
       for vocab layouts, ``vocab_entries``.

    The synchronous OCR work (image binarization and each cell) is run in the
    event loop's default executor so other requests are not stalled while a
    cell is being recognized. Client disconnection is checked *before* each
    cell is OCR'd; on disconnect the generator stops without persisting.

    After the last cell the full result is written via ``update_session_db``
    and stored in ``cached["word_result"]``.
    """
    # Function-scope import keeps this block independent of file-level imports.
    import asyncio

    loop = asyncio.get_running_loop()
    t0 = time.time()

    ocr_img = await loop.run_in_executor(None, create_ocr_image, dewarped_bgr)
    img_h, img_w = dewarped_bgr.shape[:2]

    # Compute grid shape upfront for the meta event
    n_content_rows = len([r for r in row_geoms if r.row_type == 'content'])
    _skip_types = {'column_ignore', 'header', 'footer', 'page_ref'}
    n_cols = len([c for c in col_regions if c.type not in _skip_types])

    # Determine layout
    col_types = {c.type for c in col_regions if c.type not in _skip_types}
    is_vocab = bool(col_types & {'column_en', 'column_de'})
    layout = "vocab" if is_vocab else "generic"

    # Start streaming — first event: meta
    columns_meta: Optional[List[Dict[str, Any]]] = None  # set from first cell
    total_cells = n_content_rows * n_cols
    meta_event = {
        "type": "meta",
        "grid_shape": {"rows": n_content_rows, "cols": n_cols, "total_cells": total_cells},
        "layout": layout,
    }
    yield f"data: {json.dumps(meta_event)}\n\n"

    # Stream cells one by one. The generator is created lazily here; each
    # next() performs the OCR for exactly one cell.
    cell_iter = build_cell_grid_streaming(
        ocr_img, col_regions, row_geoms, img_w, img_h,
        ocr_engine=engine, img_bgr=dewarped_bgr,
    )
    exhausted = object()  # sentinel: StopIteration cannot cross a Future
    all_cells: List[Dict[str, Any]] = []
    cell_idx = 0

    while True:
        # Check before doing the next OCR step so no work is wasted on a
        # client that has already gone away.
        if await request.is_disconnected():
            logger.info(f"SSE: client disconnected during streaming for {session_id}")
            return

        item = await loop.run_in_executor(None, next, cell_iter, exhausted)
        if item is exhausted:
            break
        cell, cols_meta, total = item

        if columns_meta is None:
            columns_meta = cols_meta
            # Send columns_used once, ahead of the first cell
            meta_update = {
                "type": "columns",
                "columns_used": cols_meta,
            }
            yield f"data: {json.dumps(meta_update)}\n\n"

        all_cells.append(cell)
        cell_idx += 1

        cell_event = {
            "type": "cell",
            "cell": cell,
            "progress": {"current": cell_idx, "total": total},
        }
        yield f"data: {json.dumps(cell_event)}\n\n"

    # All cells done — build final result
    duration = time.time() - t0
    if columns_meta is None:
        columns_meta = []

    used_engine = all_cells[0].get("ocr_engine", "tesseract") if all_cells else engine

    word_result = {
        "cells": all_cells,
        "grid_shape": {
            "rows": n_content_rows,
            "cols": n_cols,
            "total_cells": len(all_cells),
        },
        "columns_used": columns_meta,
        "layout": layout,
        "image_width": img_w,
        "image_height": img_h,
        "duration_seconds": round(duration, 2),
        "ocr_engine": used_engine,
        "summary": {
            "total_cells": len(all_cells),
            "non_empty_cells": sum(1 for c in all_cells if c.get("text")),
            "low_confidence": sum(1 for c in all_cells if 0 < c.get("confidence", 0) < 50),
        },
    }

    # Vocab post-processing
    vocab_entries = None
    if is_vocab:
        entries = _cells_to_vocab_entries(all_cells, columns_meta)
        entries = _fix_character_confusion(entries)
        entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
        entries = _split_comma_entries(entries)
        entries = _attach_example_sentences(entries)
        word_result["vocab_entries"] = entries
        word_result["entries"] = entries
        word_result["entry_count"] = len(entries)
        word_result["summary"]["total_entries"] = len(entries)
        word_result["summary"]["with_english"] = sum(1 for e in entries if e.get("english"))
        word_result["summary"]["with_german"] = sum(1 for e in entries if e.get("german"))
        vocab_entries = entries

    # Persist to DB
    await update_session_db(
        session_id,
        word_result=word_result,
        current_step=5,
    )
    cached["word_result"] = word_result

    logger.info(f"OCR Pipeline SSE: words session {session_id}: "
                f"layout={word_result['layout']}, "
                f"{len(all_cells)} cells ({duration:.2f}s)")

    # Final complete event
    complete_event = {
        "type": "complete",
        "summary": word_result["summary"],
        "duration_seconds": round(duration, 2),
        "ocr_engine": used_engine,
    }
    if vocab_entries is not None:
        complete_event["vocab_entries"] = vocab_entries
    yield f"data: {json.dumps(complete_event)}\n\n"
 class WordGroundTruthRequest(BaseModel):
    is_correct: bool
    corrected_entries: Optional[List[Dict[str, Any]]] = None