""" OCR Pipeline Words Stream — SSE streaming generators for word detection. Extracted from ocr_pipeline_words.py. Lizenz: Apache 2.0 DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. """ import json import logging import time from typing import Any, Dict, List import numpy as np from fastapi import Request from cv_vocab_pipeline import ( PageRegion, RowGeometry, _cells_to_vocab_entries, _fix_character_confusion, _fix_phonetic_brackets, fix_cell_phonetics, build_cell_grid_v2, build_cell_grid_v2_streaming, create_ocr_image, ) from ocr_pipeline_session_store import update_session_db from ocr_pipeline_common import _cache logger = logging.getLogger(__name__) async def _word_batch_stream_generator( session_id: str, cached: Dict[str, Any], col_regions: List[PageRegion], row_geoms: List[RowGeometry], dewarped_bgr: np.ndarray, engine: str, pronunciation: str, request: Request, skip_heal_gaps: bool = False, ): """SSE generator that runs batch OCR (parallel) then streams results. Uses build_cell_grid_v2 with ThreadPoolExecutor for parallel OCR, then emits all cells as SSE events. """ import asyncio t0 = time.time() ocr_img = create_ocr_image(dewarped_bgr) img_h, img_w = dewarped_bgr.shape[:2] _skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'} n_content_rows = len([r for r in row_geoms if r.row_type == 'content']) n_cols = len([c for c in col_regions if c.type not in _skip_types]) col_types = {c.type for c in col_regions if c.type not in _skip_types} is_vocab = bool(col_types & {'column_en', 'column_de'}) total_cells = n_content_rows * n_cols # 1. Send meta event immediately meta_event = { "type": "meta", "grid_shape": {"rows": n_content_rows, "cols": n_cols, "total_cells": total_cells}, "layout": "vocab" if is_vocab else "generic", } yield f"data: {json.dumps(meta_event)}\n\n" # 2. Send preparing event (keepalive for proxy) yield f"data: {json.dumps({'type': 'preparing', 'message': 'Cell-First OCR laeuft parallel...'})}\n\n" # 3. Run batch OCR in thread pool with periodic keepalive events. loop = asyncio.get_event_loop() ocr_future = loop.run_in_executor( None, lambda: build_cell_grid_v2( ocr_img, col_regions, row_geoms, img_w, img_h, ocr_engine=engine, img_bgr=dewarped_bgr, skip_heal_gaps=skip_heal_gaps, ), ) # Send keepalive events every 5 seconds while OCR runs keepalive_count = 0 while not ocr_future.done(): try: cells, columns_meta = await asyncio.wait_for( asyncio.shield(ocr_future), timeout=5.0, ) break # OCR finished except asyncio.TimeoutError: keepalive_count += 1 elapsed = int(time.time() - t0) yield f"data: {json.dumps({'type': 'keepalive', 'elapsed': elapsed, 'message': f'OCR laeuft... ({elapsed}s)'})}\n\n" if await request.is_disconnected(): logger.info(f"SSE batch: client disconnected during OCR for {session_id}") ocr_future.cancel() return else: cells, columns_meta = ocr_future.result() if await request.is_disconnected(): logger.info(f"SSE batch: client disconnected after OCR for {session_id}") return # 4. Apply IPA phonetic fixes fix_cell_phonetics(cells, pronunciation=pronunciation) # 5. Send columns meta if columns_meta: yield f"data: {json.dumps({'type': 'columns', 'columns_used': columns_meta})}\n\n" # 6. Stream all cells for idx, cell in enumerate(cells): cell_event = { "type": "cell", "cell": cell, "progress": {"current": idx + 1, "total": len(cells)}, } yield f"data: {json.dumps(cell_event)}\n\n" # 7. Build final result and persist duration = time.time() - t0 used_engine = cells[0].get("ocr_engine", "tesseract") if cells else engine word_result = { "cells": cells, "grid_shape": {"rows": n_content_rows, "cols": n_cols, "total_cells": len(cells)}, "columns_used": columns_meta, "layout": "vocab" if is_vocab else "generic", "image_width": img_w, "image_height": img_h, "duration_seconds": round(duration, 2), "ocr_engine": used_engine, "summary": { "total_cells": len(cells), "non_empty_cells": sum(1 for c in cells if c.get("text")), "low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50), }, } vocab_entries = None has_text_col = 'column_text' in col_types if is_vocab or has_text_col: entries = _cells_to_vocab_entries(cells, columns_meta) entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation) word_result["vocab_entries"] = entries word_result["entries"] = entries word_result["entry_count"] = len(entries) word_result["summary"]["total_entries"] = len(entries) word_result["summary"]["with_english"] = sum(1 for e in entries if e.get("english")) word_result["summary"]["with_german"] = sum(1 for e in entries if e.get("german")) vocab_entries = entries await update_session_db(session_id, word_result=word_result, current_step=8) cached["word_result"] = word_result logger.info(f"OCR Pipeline SSE batch: words session {session_id}: " f"layout={word_result['layout']}, {len(cells)} cells ({duration:.2f}s)") # 8. Send complete event complete_event = { "type": "complete", "summary": word_result["summary"], "duration_seconds": round(duration, 2), "ocr_engine": used_engine, } if vocab_entries is not None: complete_event["vocab_entries"] = vocab_entries yield f"data: {json.dumps(complete_event)}\n\n" async def _word_stream_generator( session_id: str, cached: Dict[str, Any], col_regions: List[PageRegion], row_geoms: List[RowGeometry], dewarped_bgr: np.ndarray, engine: str, pronunciation: str, request: Request, ): """SSE generator that yields cell-by-cell OCR progress.""" t0 = time.time() ocr_img = create_ocr_image(dewarped_bgr) img_h, img_w = dewarped_bgr.shape[:2] n_content_rows = len([r for r in row_geoms if r.row_type == 'content']) _skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'} n_cols = len([c for c in col_regions if c.type not in _skip_types]) col_types = {c.type for c in col_regions if c.type not in _skip_types} is_vocab = bool(col_types & {'column_en', 'column_de'}) columns_meta = None total_cells = n_content_rows * n_cols meta_event = { "type": "meta", "grid_shape": {"rows": n_content_rows, "cols": n_cols, "total_cells": total_cells}, "layout": "vocab" if is_vocab else "generic", } yield f"data: {json.dumps(meta_event)}\n\n" yield f"data: {json.dumps({'type': 'preparing', 'message': 'Cell-First OCR wird initialisiert...'})}\n\n" all_cells: List[Dict[str, Any]] = [] cell_idx = 0 last_keepalive = time.time() for cell, cols_meta, total in build_cell_grid_v2_streaming( ocr_img, col_regions, row_geoms, img_w, img_h, ocr_engine=engine, img_bgr=dewarped_bgr, ): if await request.is_disconnected(): logger.info(f"SSE: client disconnected during streaming for {session_id}") return if columns_meta is None: columns_meta = cols_meta meta_update = {"type": "columns", "columns_used": cols_meta} yield f"data: {json.dumps(meta_update)}\n\n" all_cells.append(cell) cell_idx += 1 cell_event = { "type": "cell", "cell": cell, "progress": {"current": cell_idx, "total": total}, } yield f"data: {json.dumps(cell_event)}\n\n" # All cells done duration = time.time() - t0 if columns_meta is None: columns_meta = [] # Remove all-empty rows rows_with_text: set = set() for c in all_cells: if c.get("text", "").strip(): rows_with_text.add(c["row_index"]) before_filter = len(all_cells) all_cells = [c for c in all_cells if c["row_index"] in rows_with_text] empty_rows_removed = (before_filter - len(all_cells)) // max(n_cols, 1) if empty_rows_removed > 0: logger.info(f"SSE: removed {empty_rows_removed} all-empty rows after OCR") used_engine = all_cells[0].get("ocr_engine", "tesseract") if all_cells else engine fix_cell_phonetics(all_cells, pronunciation=pronunciation) word_result = { "cells": all_cells, "grid_shape": {"rows": n_content_rows, "cols": n_cols, "total_cells": len(all_cells)}, "columns_used": columns_meta, "layout": "vocab" if is_vocab else "generic", "image_width": img_w, "image_height": img_h, "duration_seconds": round(duration, 2), "ocr_engine": used_engine, "summary": { "total_cells": len(all_cells), "non_empty_cells": sum(1 for c in all_cells if c.get("text")), "low_confidence": sum(1 for c in all_cells if 0 < c.get("confidence", 0) < 50), }, } vocab_entries = None has_text_col = 'column_text' in col_types if is_vocab or has_text_col: entries = _cells_to_vocab_entries(all_cells, columns_meta) entries = _fix_character_confusion(entries) entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation) word_result["vocab_entries"] = entries word_result["entries"] = entries word_result["entry_count"] = len(entries) word_result["summary"]["total_entries"] = len(entries) word_result["summary"]["with_english"] = sum(1 for e in entries if e.get("english")) word_result["summary"]["with_german"] = sum(1 for e in entries if e.get("german")) vocab_entries = entries await update_session_db(session_id, word_result=word_result, current_step=8) cached["word_result"] = word_result logger.info(f"OCR Pipeline SSE: words session {session_id}: " f"layout={word_result['layout']}, " f"{len(all_cells)} cells ({duration:.2f}s)") complete_event = { "type": "complete", "summary": word_result["summary"], "duration_seconds": round(duration, 2), "ocr_engine": used_engine, } if vocab_entries is not None: complete_event["vocab_entries"] = vocab_entries yield f"data: {json.dumps(complete_event)}\n\n"