backend-lehrer (5 files): - alerts_agent/db/repository.py (992 → 5), abitur_docs_api.py (956 → 3) - teacher_dashboard_api.py (951 → 3), services/pdf_service.py (916 → 3) - mail/mail_db.py (987 → 6) klausur-service (5 files): - legal_templates_ingestion.py (942 → 3), ocr_pipeline_postprocess.py (929 → 4) - ocr_pipeline_words.py (876 → 3), ocr_pipeline_ocr_merge.py (616 → 2) - KorrekturPage.tsx (956 → 6) website (5 pages): - mail (985 → 9), edu-search (958 → 8), mac-mini (950 → 7) - ocr-labeling (946 → 7), audit-workspace (871 → 4) studio-v2 (5 files + 1 deleted): - page.tsx (946 → 5), MessagesContext.tsx (925 → 4) - korrektur (914 → 6), worksheet-cleanup (899 → 6) - useVocabWorksheet.ts (888 → 3) - Deleted dead page-original.tsx (934 LOC) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
304 lines
11 KiB
Python
304 lines
11 KiB
Python
"""
|
|
OCR Pipeline Words Stream — SSE streaming generators for word detection.
|
|
|
|
Extracted from ocr_pipeline_words.py.
|
|
|
|
Lizenz: Apache 2.0
|
|
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
|
"""
|
|
|
|
import json
|
|
import logging
|
|
import time
|
|
from typing import Any, Dict, List
|
|
|
|
import numpy as np
|
|
from fastapi import Request
|
|
|
|
from cv_vocab_pipeline import (
|
|
PageRegion,
|
|
RowGeometry,
|
|
_cells_to_vocab_entries,
|
|
_fix_character_confusion,
|
|
_fix_phonetic_brackets,
|
|
fix_cell_phonetics,
|
|
build_cell_grid_v2,
|
|
build_cell_grid_v2_streaming,
|
|
create_ocr_image,
|
|
)
|
|
from ocr_pipeline_session_store import update_session_db
|
|
from ocr_pipeline_common import _cache
|
|
|
|
# Module-level logger, named after this module per stdlib convention.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
async def _word_batch_stream_generator(
    session_id: str,
    cached: Dict[str, Any],
    col_regions: List[PageRegion],
    row_geoms: List[RowGeometry],
    dewarped_bgr: np.ndarray,
    engine: str,
    pronunciation: str,
    request: Request,
    skip_heal_gaps: bool = False,
):
    """SSE generator that runs batch OCR (parallel) then streams results.

    Uses build_cell_grid_v2 with ThreadPoolExecutor for parallel OCR,
    then emits all cells as SSE events.

    Args:
        session_id: Pipeline session identifier, used for persistence and logging.
        cached: In-memory session cache; the final ``word_result`` is written into it.
        col_regions: Detected column regions; skip-typed regions are excluded.
        row_geoms: Detected row geometries; only ``content`` rows are counted.
        dewarped_bgr: Dewarped page image (BGR ndarray, as produced upstream).
        engine: Requested OCR engine name, forwarded to build_cell_grid_v2.
        pronunciation: Pronunciation variant for the phonetic fix helpers.
        request: FastAPI request, polled for client disconnects.
        skip_heal_gaps: Forwarded to build_cell_grid_v2 to skip gap healing.

    Yields:
        SSE ``data:`` frames in order: meta, preparing, zero or more
        keepalive frames while OCR runs, optional columns, one frame per
        cell, and finally complete.
    """
    import asyncio

    t0 = time.time()
    ocr_img = create_ocr_image(dewarped_bgr)
    img_h, img_w = dewarped_bgr.shape[:2]

    # Region/row types that never contribute OCR cells.
    _skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'}
    n_content_rows = len([r for r in row_geoms if r.row_type == 'content'])
    n_cols = len([c for c in col_regions if c.type not in _skip_types])
    col_types = {c.type for c in col_regions if c.type not in _skip_types}
    # A vocab layout is assumed as soon as an EN or DE column is present.
    is_vocab = bool(col_types & {'column_en', 'column_de'})
    total_cells = n_content_rows * n_cols

    # 1. Send meta event immediately
    meta_event = {
        "type": "meta",
        "grid_shape": {"rows": n_content_rows, "cols": n_cols, "total_cells": total_cells},
        "layout": "vocab" if is_vocab else "generic",
    }
    yield f"data: {json.dumps(meta_event)}\n\n"

    # 2. Send preparing event (keepalive for proxy)
    yield f"data: {json.dumps({'type': 'preparing', 'message': 'Cell-First OCR laeuft parallel...'})}\n\n"

    # 3. Run batch OCR in thread pool with periodic keepalive events.
    # FIX: use get_running_loop() — get_event_loop() inside a coroutine is
    # deprecated since Python 3.10; get_running_loop() is the supported API
    # and is guaranteed correct here because this coroutine is running.
    loop = asyncio.get_running_loop()
    ocr_future = loop.run_in_executor(
        None,
        lambda: build_cell_grid_v2(
            ocr_img, col_regions, row_geoms, img_w, img_h,
            ocr_engine=engine, img_bgr=dewarped_bgr,
            skip_heal_gaps=skip_heal_gaps,
        ),
    )

    # Send keepalive events every 5 seconds while OCR runs
    keepalive_count = 0
    while not ocr_future.done():
        try:
            # shield() prevents the 5s timeout from cancelling the OCR job.
            cells, columns_meta = await asyncio.wait_for(
                asyncio.shield(ocr_future), timeout=5.0,
            )
            break  # OCR finished
        except asyncio.TimeoutError:
            keepalive_count += 1
            elapsed = int(time.time() - t0)
            yield f"data: {json.dumps({'type': 'keepalive', 'elapsed': elapsed, 'message': f'OCR laeuft... ({elapsed}s)'})}\n\n"
            if await request.is_disconnected():
                logger.info(f"SSE batch: client disconnected during OCR for {session_id}")
                ocr_future.cancel()
                return
    else:
        # Loop condition turned false between iterations: the future
        # completed during a keepalive cycle, so the result is ready.
        cells, columns_meta = ocr_future.result()

    if await request.is_disconnected():
        logger.info(f"SSE batch: client disconnected after OCR for {session_id}")
        return

    # 4. Apply IPA phonetic fixes (in place, before cells are streamed)
    fix_cell_phonetics(cells, pronunciation=pronunciation)

    # 5. Send columns meta
    if columns_meta:
        yield f"data: {json.dumps({'type': 'columns', 'columns_used': columns_meta})}\n\n"

    # 6. Stream all cells
    for idx, cell in enumerate(cells):
        cell_event = {
            "type": "cell",
            "cell": cell,
            "progress": {"current": idx + 1, "total": len(cells)},
        }
        yield f"data: {json.dumps(cell_event)}\n\n"

    # 7. Build final result and persist
    duration = time.time() - t0
    used_engine = cells[0].get("ocr_engine", "tesseract") if cells else engine

    word_result = {
        "cells": cells,
        "grid_shape": {"rows": n_content_rows, "cols": n_cols, "total_cells": len(cells)},
        "columns_used": columns_meta,
        "layout": "vocab" if is_vocab else "generic",
        "image_width": img_w,
        "image_height": img_h,
        "duration_seconds": round(duration, 2),
        "ocr_engine": used_engine,
        "summary": {
            "total_cells": len(cells),
            "non_empty_cells": sum(1 for c in cells if c.get("text")),
            # confidence == 0 is treated as "no reading", not low confidence
            "low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50),
        },
    }

    vocab_entries = None
    has_text_col = 'column_text' in col_types
    if is_vocab or has_text_col:
        entries = _cells_to_vocab_entries(cells, columns_meta)
        entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
        word_result["vocab_entries"] = entries
        # "entries" duplicates "vocab_entries" — both keys are persisted.
        word_result["entries"] = entries
        word_result["entry_count"] = len(entries)
        word_result["summary"]["total_entries"] = len(entries)
        word_result["summary"]["with_english"] = sum(1 for e in entries if e.get("english"))
        word_result["summary"]["with_german"] = sum(1 for e in entries if e.get("german"))
        vocab_entries = entries

    await update_session_db(session_id, word_result=word_result, current_step=8)
    cached["word_result"] = word_result

    logger.info(f"OCR Pipeline SSE batch: words session {session_id}: "
                f"layout={word_result['layout']}, {len(cells)} cells ({duration:.2f}s)")

    # 8. Send complete event
    complete_event = {
        "type": "complete",
        "summary": word_result["summary"],
        "duration_seconds": round(duration, 2),
        "ocr_engine": used_engine,
    }
    if vocab_entries is not None:
        complete_event["vocab_entries"] = vocab_entries
    yield f"data: {json.dumps(complete_event)}\n\n"
|
|
|
|
|
|
async def _word_stream_generator(
    session_id: str,
    cached: Dict[str, Any],
    col_regions: List[PageRegion],
    row_geoms: List[RowGeometry],
    dewarped_bgr: np.ndarray,
    engine: str,
    pronunciation: str,
    request: Request,
):
    """SSE generator that yields cell-by-cell OCR progress.

    Unlike the batch variant, this drives build_cell_grid_v2_streaming and
    emits each cell the moment it is produced, then filters all-empty rows,
    applies phonetic fixes, persists the result, and sends a complete event.

    Args:
        session_id: Pipeline session identifier, used for persistence and logging.
        cached: In-memory session cache; the final ``word_result`` is written into it.
        col_regions: Detected column regions; skip-typed regions are excluded.
        row_geoms: Detected row geometries; only ``content`` rows are counted.
        dewarped_bgr: Dewarped page image (BGR ndarray, as produced upstream).
        engine: Requested OCR engine name, forwarded to the streaming grid builder.
        pronunciation: Pronunciation variant for the phonetic fix helpers.
        request: FastAPI request, polled for client disconnects between cells.

    Yields:
        SSE ``data:`` frames in order: meta, preparing, columns (once, on the
        first cell), one frame per cell, and finally complete.
    """
    t0 = time.time()

    ocr_img = create_ocr_image(dewarped_bgr)
    img_h, img_w = dewarped_bgr.shape[:2]

    n_content_rows = len([r for r in row_geoms if r.row_type == 'content'])
    # Region/row types that never contribute OCR cells.
    _skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'}
    n_cols = len([c for c in col_regions if c.type not in _skip_types])

    col_types = {c.type for c in col_regions if c.type not in _skip_types}
    # A vocab layout is assumed as soon as an EN or DE column is present.
    is_vocab = bool(col_types & {'column_en', 'column_de'})

    columns_meta = None
    total_cells = n_content_rows * n_cols

    # Announce grid shape up front so the client can render a skeleton.
    meta_event = {
        "type": "meta",
        "grid_shape": {"rows": n_content_rows, "cols": n_cols, "total_cells": total_cells},
        "layout": "vocab" if is_vocab else "generic",
    }
    yield f"data: {json.dumps(meta_event)}\n\n"

    yield f"data: {json.dumps({'type': 'preparing', 'message': 'Cell-First OCR wird initialisiert...'})}\n\n"

    all_cells: List[Dict[str, Any]] = []
    cell_idx = 0
    # NOTE(review): last_keepalive is assigned but never read below —
    # presumably a leftover from a keepalive mechanism; confirm and remove.
    last_keepalive = time.time()

    # build_cell_grid_v2_streaming is consumed with a plain ``for``, so each
    # step runs synchronously on the event loop until the next cell is ready.
    for cell, cols_meta, total in build_cell_grid_v2_streaming(
        ocr_img, col_regions, row_geoms, img_w, img_h,
        ocr_engine=engine, img_bgr=dewarped_bgr,
    ):
        if await request.is_disconnected():
            logger.info(f"SSE: client disconnected during streaming for {session_id}")
            return

        # Emit column metadata exactly once, alongside the first cell.
        if columns_meta is None:
            columns_meta = cols_meta
            meta_update = {"type": "columns", "columns_used": cols_meta}
            yield f"data: {json.dumps(meta_update)}\n\n"

        all_cells.append(cell)
        cell_idx += 1

        cell_event = {
            "type": "cell",
            "cell": cell,
            "progress": {"current": cell_idx, "total": total},
        }
        yield f"data: {json.dumps(cell_event)}\n\n"

    # All cells done
    duration = time.time() - t0
    if columns_meta is None:
        # No cells were produced at all, so the columns event never fired.
        columns_meta = []

    # Remove all-empty rows: keep only cells whose row has at least one
    # cell with non-whitespace text.
    rows_with_text: set = set()
    for c in all_cells:
        if c.get("text", "").strip():
            rows_with_text.add(c["row_index"])
    before_filter = len(all_cells)
    all_cells = [c for c in all_cells if c["row_index"] in rows_with_text]
    # Cells removed divided by column count gives whole rows dropped;
    # max(..., 1) guards against division by zero when n_cols == 0.
    empty_rows_removed = (before_filter - len(all_cells)) // max(n_cols, 1)
    if empty_rows_removed > 0:
        logger.info(f"SSE: removed {empty_rows_removed} all-empty rows after OCR")

    # Engine actually used is taken from the first cell; fall back to the
    # requested engine when nothing was recognized.
    used_engine = all_cells[0].get("ocr_engine", "tesseract") if all_cells else engine

    # Phonetic fixes are applied only now — the cells already streamed to the
    # client above were raw; only the persisted word_result carries the fixes.
    fix_cell_phonetics(all_cells, pronunciation=pronunciation)

    word_result = {
        "cells": all_cells,
        "grid_shape": {"rows": n_content_rows, "cols": n_cols, "total_cells": len(all_cells)},
        "columns_used": columns_meta,
        "layout": "vocab" if is_vocab else "generic",
        "image_width": img_w,
        "image_height": img_h,
        "duration_seconds": round(duration, 2),
        "ocr_engine": used_engine,
        "summary": {
            "total_cells": len(all_cells),
            "non_empty_cells": sum(1 for c in all_cells if c.get("text")),
            # confidence == 0 is treated as "no reading", not low confidence
            "low_confidence": sum(1 for c in all_cells if 0 < c.get("confidence", 0) < 50),
        },
    }

    vocab_entries = None
    has_text_col = 'column_text' in col_types
    if is_vocab or has_text_col:
        entries = _cells_to_vocab_entries(all_cells, columns_meta)
        entries = _fix_character_confusion(entries)
        entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
        word_result["vocab_entries"] = entries
        # "entries" duplicates "vocab_entries" — both keys are persisted.
        word_result["entries"] = entries
        word_result["entry_count"] = len(entries)
        word_result["summary"]["total_entries"] = len(entries)
        word_result["summary"]["with_english"] = sum(1 for e in entries if e.get("english"))
        word_result["summary"]["with_german"] = sum(1 for e in entries if e.get("german"))
        vocab_entries = entries

    await update_session_db(session_id, word_result=word_result, current_step=8)
    cached["word_result"] = word_result

    logger.info(f"OCR Pipeline SSE: words session {session_id}: "
                f"layout={word_result['layout']}, "
                f"{len(all_cells)} cells ({duration:.2f}s)")

    complete_event = {
        "type": "complete",
        "summary": word_result["summary"],
        "duration_seconds": round(duration, 2),
        "ocr_engine": used_engine,
    }
    if vocab_entries is not None:
        complete_event["vocab_entries"] = vocab_entries
    yield f"data: {json.dumps(complete_event)}\n\n"
|