# breakpilot-lehrer/klausur-service/backend/ocr_pipeline_words.py

"""
OCR Pipeline Words - Word detection and ground truth endpoints.

Extracted from ocr_pipeline_api.py.
Handles:
- POST /sessions/{session_id}/words — main SSE streaming word detection
- POST /sessions/{session_id}/paddle-direct — PaddleOCR direct endpoint
- POST /sessions/{session_id}/ground-truth/words — save ground truth
- GET  /sessions/{session_id}/ground-truth/words — get ground truth

Lizenz: Apache 2.0
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""

import json
import logging
import time
from datetime import datetime
from typing import Any, Dict, List, Optional

import cv2
import numpy as np
from fastapi import APIRouter, HTTPException, Request
from fastapi.responses import StreamingResponse
from pydantic import BaseModel

from cv_vocab_pipeline import (
    PageRegion,
    RowGeometry,
    _cells_to_vocab_entries,
    _fix_character_confusion,
    _fix_phonetic_brackets,
    fix_cell_phonetics,
    build_cell_grid_v2,
    build_cell_grid_v2_streaming,
    create_ocr_image,
    detect_column_geometry,
)
from cv_words_first import build_grid_from_words
from ocr_pipeline_session_store import (
    get_session_db,
    get_session_image,
    update_session_db,
)
from ocr_pipeline_common import (
    _cache,
    _load_session_to_cache,
    _get_cached,
    _get_base_image_png,
    _append_pipeline_log,
)

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Every route declared in this module is served under /api/v1/ocr-pipeline.
router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["ocr-pipeline"])


# ---------------------------------------------------------------------------
# Pydantic models
# ---------------------------------------------------------------------------

class WordGroundTruthRequest(BaseModel):
    """Request body for saving ground-truth feedback on the word step."""

    # Reviewer verdict on the automatically detected words.
    is_correct: bool
    # Optional corrected entries supplied by the reviewer (free-form dicts).
    corrected_entries: Optional[List[Dict[str, Any]]] = None
    # Optional free-text remarks.
    notes: Optional[str] = None


# ---------------------------------------------------------------------------
# Word Detection Endpoint (Step 7)
# ---------------------------------------------------------------------------

@router.post("/sessions/{session_id}/words")
async def detect_words(
    session_id: str,
    request: Request,
    engine: str = "auto",
    pronunciation: str = "british",
    stream: bool = False,
    skip_heal_gaps: bool = False,
    grid_method: str = "v2",
):
    """Build word grid from columns × rows, OCR each cell.

    Query params:
        engine: 'auto' (default), 'tesseract', 'rapid', or 'paddle'
        pronunciation: 'british' (default) or 'american' — for IPA dictionary lookup
        stream: false (default) for JSON response, true for SSE streaming.
            Only consulted on the 'v2' path; 'words_first' always returns JSON.
        skip_heal_gaps: false (default). When true, cells keep exact row geometry
            positions without gap-healing expansion. Better for overlay rendering.
        grid_method: 'v2' (default) or 'words_first' — grid construction strategy.
            'v2' uses pre-detected columns/rows (top-down).
            'words_first' clusters words bottom-up (no column/row detection needed).
            engine='paddle' always forces 'words_first'.

    Returns:
        A dict with ``session_id`` plus the word result (cells, grid shape,
        summary, optional vocab entries), or a ``StreamingResponse`` of SSE
        events when ``stream`` is true on the 'v2' path.

    Raises:
        HTTPException 400: no cropped/dewarped image is cached, row detection
            is missing on the 'v2' path, or no words were found on the
            'words_first' path.
        HTTPException 404: the session does not exist in the DB.
    """
    # PaddleOCR is full-page remote OCR → force words_first grid method
    if engine == "paddle" and grid_method != "words_first":
        logger.info("detect_words: engine=paddle requires words_first, overriding grid_method=%s", grid_method)
        grid_method = "words_first"

    if session_id not in _cache:
        logger.info("detect_words: session %s not in cache, loading from DB", session_id)
        await _load_session_to_cache(session_id)
    cached = _get_cached(session_id)

    # Prefer the cropped image; fall back to the dewarped one.
    dewarped_bgr = cached.get("cropped_bgr") if cached.get("cropped_bgr") is not None else cached.get("dewarped_bgr")
    if dewarped_bgr is None:
        logger.warning("detect_words: no cropped/dewarped image for session %s (cache keys: %s)",
                       session_id, [k for k in cached.keys() if k.endswith('_bgr')])
        raise HTTPException(status_code=400, detail="Crop or dewarp must be completed before word detection")

    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    column_result = session.get("column_result")
    row_result = session.get("row_result")
    if not column_result or not column_result.get("columns"):
        # No column detection — synthesize a single full-page pseudo-column.
        # This enables the overlay pipeline which skips column detection.
        img_h_tmp, img_w_tmp = dewarped_bgr.shape[:2]
        column_result = {
            "columns": [{
                "type": "column_text",
                "x": 0, "y": 0,
                "width": img_w_tmp, "height": img_h_tmp,
                "classification_confidence": 1.0,
                "classification_method": "full_page_fallback",
            }],
            "zones": [],
            "duration_seconds": 0,
        }
        logger.info("detect_words: no column_result — using full-page pseudo-column %dx%d", img_w_tmp, img_h_tmp)
    if grid_method != "words_first" and (not row_result or not row_result.get("rows")):
        raise HTTPException(status_code=400, detail="Row detection must be completed first")

    # Convert column dicts back to PageRegion objects
    col_regions = [
        PageRegion(
            type=c["type"],
            x=c["x"], y=c["y"],
            width=c["width"], height=c["height"],
            classification_confidence=c.get("classification_confidence", 1.0),
            classification_method=c.get("classification_method", ""),
        )
        for c in column_result["columns"]
    ]

    # Convert row dicts back to RowGeometry objects.
    # On the words_first path row detection is optional (the check above only
    # enforces it for v2), so row_result may be None here; fall back to an
    # empty row list instead of subscripting None.
    row_dicts = (row_result or {}).get("rows") or []
    row_geoms = [
        RowGeometry(
            index=r["index"],
            x=r["x"], y=r["y"],
            width=r["width"], height=r["height"],
            word_count=r.get("word_count", 0),
            words=[],
            row_type=r.get("row_type", "content"),
            gap_before=r.get("gap_before", 0),
        )
        for r in row_dicts
    ]

    # Cell-First OCR (v2): no full-page word re-population needed.
    # Each cell is cropped and OCR'd in isolation → no neighbour bleeding.
    # We still need word_count > 0 for row filtering in build_cell_grid_v2,
    # so populate from cached words if available (just for counting).
    word_dicts = cached.get("_word_dicts")
    if word_dicts is None:
        ocr_img_tmp = create_ocr_image(dewarped_bgr)
        geo_result = detect_column_geometry(ocr_img_tmp, dewarped_bgr)
        if geo_result is not None:
            _geoms, left_x, right_x, top_y, bottom_y, word_dicts, inv = geo_result
            cached["_word_dicts"] = word_dicts
            cached["_inv"] = inv
            cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y)

    if word_dicts:
        content_bounds = cached.get("_content_bounds")
        if content_bounds:
            _lx, _rx, top_y, _by = content_bounds
        else:
            top_y = min(r.y for r in row_geoms) if row_geoms else 0

        # Word boxes are relative to the content area, so shift each row by
        # top_y before testing whether a word's vertical centre lies in it.
        for row in row_geoms:
            row_y_rel = row.y - top_y
            row_bottom_rel = row_y_rel + row.height
            row.words = [
                w for w in word_dicts
                if row_y_rel <= w['top'] + w['height'] / 2 < row_bottom_rel
            ]
            row.word_count = len(row.words)

    # Exclude rows that fall within box zones.
    # Use inner box range (shrunk by border_thickness) so that rows at
    # the boundary (overlapping with the box border) are NOT excluded.
    zones = column_result.get("zones") or []
    box_ranges_inner = []
    for zone in zones:
        if zone.get("zone_type") == "box" and zone.get("box"):
            box = zone["box"]
            bt = max(box.get("border_thickness", 0), 5)  # minimum 5px margin
            box_ranges_inner.append((box["y"] + bt, box["y"] + box["height"] - bt))

    if box_ranges_inner:
        def _row_in_box(r):
            center_y = r.y + r.height / 2
            return any(by_s <= center_y < by_e for by_s, by_e in box_ranges_inner)

        before_count = len(row_geoms)
        row_geoms = [r for r in row_geoms if not _row_in_box(r)]
        excluded = before_count - len(row_geoms)
        if excluded:
            logger.info(f"detect_words: excluded {excluded} rows inside box zones")

    # --- Words-First path: bottom-up grid from word boxes ---
    if grid_method == "words_first":
        t0 = time.time()
        img_h, img_w = dewarped_bgr.shape[:2]

        # For paddle engine: run remote PaddleOCR full-page instead of Tesseract
        if engine == "paddle":
            from cv_ocr_engines import ocr_region_paddle

            wf_word_dicts = await ocr_region_paddle(dewarped_bgr, region=None)
            # PaddleOCR returns absolute coordinates, no content_bounds offset needed
            cached["_paddle_word_dicts"] = wf_word_dicts
        else:
            # Get word_dicts from cache or run Tesseract full-page
            wf_word_dicts = cached.get("_word_dicts")
            if wf_word_dicts is None:
                ocr_img_tmp = create_ocr_image(dewarped_bgr)
                geo_result = detect_column_geometry(ocr_img_tmp, dewarped_bgr)
                if geo_result is not None:
                    _geoms, left_x, right_x, top_y, bottom_y, wf_word_dicts, inv = geo_result
                    cached["_word_dicts"] = wf_word_dicts
                    cached["_inv"] = inv
                    cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y)

        if not wf_word_dicts:
            raise HTTPException(status_code=400, detail="No words detected — cannot build words-first grid")

        # Convert word coordinates to absolute image coordinates if needed
        # (detect_column_geometry returns words relative to content ROI)
        # PaddleOCR already returns absolute coordinates — skip offset.
        if engine != "paddle":
            content_bounds = cached.get("_content_bounds")
            if content_bounds:
                lx, _rx, ty, _by = content_bounds
                # Build shifted copies so the cached word dicts stay untouched.
                abs_words = []
                for w in wf_word_dicts:
                    abs_words.append({
                        **w,
                        'left': w['left'] + lx,
                        'top': w['top'] + ty,
                    })
                wf_word_dicts = abs_words

        # Extract box rects for box-aware column clustering
        box_rects = []
        for zone in zones:
            if zone.get("zone_type") == "box" and zone.get("box"):
                box_rects.append(zone["box"])

        cells, columns_meta = build_grid_from_words(
            wf_word_dicts, img_w, img_h, box_rects=box_rects or None,
        )
        duration = time.time() - t0

        # Apply IPA phonetic fixes
        fix_cell_phonetics(cells, pronunciation=pronunciation)

        # Add zone_index for backward compat
        for cell in cells:
            cell.setdefault("zone_index", 0)

        col_types = {c['type'] for c in columns_meta}
        is_vocab = bool(col_types & {'column_en', 'column_de'})
        n_rows = len(set(c['row_index'] for c in cells)) if cells else 0
        n_cols = len(columns_meta)
        used_engine = "paddle" if engine == "paddle" else "words_first"

        word_result = {
            "cells": cells,
            "grid_shape": {
                "rows": n_rows,
                "cols": n_cols,
                "total_cells": len(cells),
            },
            "columns_used": columns_meta,
            "layout": "vocab" if is_vocab else "generic",
            "image_width": img_w,
            "image_height": img_h,
            "duration_seconds": round(duration, 2),
            "ocr_engine": used_engine,
            "grid_method": "words_first",
            "summary": {
                "total_cells": len(cells),
                "non_empty_cells": sum(1 for c in cells if c.get("text")),
                "low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50),
            },
        }

        if is_vocab or 'column_text' in col_types:
            entries = _cells_to_vocab_entries(cells, columns_meta)
            entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
            word_result["vocab_entries"] = entries
            word_result["entries"] = entries
            word_result["entry_count"] = len(entries)
            word_result["summary"]["total_entries"] = len(entries)
            word_result["summary"]["with_english"] = sum(1 for e in entries if e.get("english"))
            word_result["summary"]["with_german"] = sum(1 for e in entries if e.get("german"))

        await update_session_db(session_id, word_result=word_result, current_step=8)
        cached["word_result"] = word_result

        logger.info(f"OCR Pipeline: words-first session {session_id}: "
                    f"{len(cells)} cells ({duration:.2f}s), {n_rows} rows, {n_cols} cols")

        await _append_pipeline_log(session_id, "words", {
            "grid_method": "words_first",
            "total_cells": len(cells),
            "non_empty_cells": word_result["summary"]["non_empty_cells"],
            "ocr_engine": used_engine,
            "layout": word_result["layout"],
        }, duration_ms=int(duration * 1000))

        return {"session_id": session_id, **word_result}

    if stream:
        # Cell-First OCR v2: use batch-then-stream approach instead of
        # per-cell streaming. The parallel ThreadPoolExecutor in
        # build_cell_grid_v2 is much faster than sequential streaming.
        return StreamingResponse(
            _word_batch_stream_generator(
                session_id, cached, col_regions, row_geoms,
                dewarped_bgr, engine, pronunciation, request,
                skip_heal_gaps=skip_heal_gaps,
            ),
            media_type="text/event-stream",
            headers={
                "Cache-Control": "no-cache",
                "Connection": "keep-alive",
                "X-Accel-Buffering": "no",
            },
        )

    # --- Non-streaming path (grid_method=v2) ---
    t0 = time.time()

    # Create binarized OCR image (for Tesseract)
    ocr_img = create_ocr_image(dewarped_bgr)
    img_h, img_w = dewarped_bgr.shape[:2]

    # Build cell grid using Cell-First OCR (v2) — each cell cropped in isolation
    cells, columns_meta = build_cell_grid_v2(
        ocr_img, col_regions, row_geoms, img_w, img_h,
        ocr_engine=engine, img_bgr=dewarped_bgr,
        skip_heal_gaps=skip_heal_gaps,
    )
    duration = time.time() - t0

    # Add zone_index to each cell (default 0 for backward compatibility)
    for cell in cells:
        cell.setdefault("zone_index", 0)

    # Layout detection
    col_types = {c['type'] for c in columns_meta}
    is_vocab = bool(col_types & {'column_en', 'column_de'})

    # Count content rows and columns for grid_shape
    n_content_rows = len([r for r in row_geoms if r.row_type == 'content'])
    n_cols = len(columns_meta)

    # Determine which engine was actually used
    used_engine = cells[0].get("ocr_engine", "tesseract") if cells else engine

    # Apply IPA phonetic fixes directly to cell texts (for overlay mode)
    fix_cell_phonetics(cells, pronunciation=pronunciation)

    # Grid result (always generic)
    word_result = {
        "cells": cells,
        "grid_shape": {
            "rows": n_content_rows,
            "cols": n_cols,
            "total_cells": len(cells),
        },
        "columns_used": columns_meta,
        "layout": "vocab" if is_vocab else "generic",
        "image_width": img_w,
        "image_height": img_h,
        "duration_seconds": round(duration, 2),
        "ocr_engine": used_engine,
        "summary": {
            "total_cells": len(cells),
            "non_empty_cells": sum(1 for c in cells if c.get("text")),
            "low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50),
        },
    }

    # For vocab layout or single-column (box sub-sessions): map cells 1:1
    # to vocab entries (row→entry).
    has_text_col = 'column_text' in col_types
    if is_vocab or has_text_col:
        entries = _cells_to_vocab_entries(cells, columns_meta)
        entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
        word_result["vocab_entries"] = entries
        word_result["entries"] = entries
        word_result["entry_count"] = len(entries)
        word_result["summary"]["total_entries"] = len(entries)
        word_result["summary"]["with_english"] = sum(1 for e in entries if e.get("english"))
        word_result["summary"]["with_german"] = sum(1 for e in entries if e.get("german"))

    # Persist to DB
    await update_session_db(
        session_id,
        word_result=word_result,
        current_step=8,
    )

    cached["word_result"] = word_result

    logger.info(f"OCR Pipeline: words session {session_id}: "
                f"layout={word_result['layout']}, "
                f"{len(cells)} cells ({duration:.2f}s), summary: {word_result['summary']}")

    await _append_pipeline_log(session_id, "words", {
        "total_cells": len(cells),
        "non_empty_cells": word_result["summary"]["non_empty_cells"],
        "low_confidence_count": word_result["summary"]["low_confidence"],
        "ocr_engine": used_engine,
        "layout": word_result["layout"],
        "entry_count": word_result.get("entry_count", 0),
    }, duration_ms=int(duration * 1000))

    return {
        "session_id": session_id,
        **word_result,
    }


async def _word_batch_stream_generator(
    session_id: str,
    cached: Dict[str, Any],
    col_regions: List[PageRegion],
    row_geoms: List[RowGeometry],
    dewarped_bgr: np.ndarray,
    engine: str,
    pronunciation: str,
    request: Request,
    skip_heal_gaps: bool = False,
):
    """SSE generator that runs batch OCR (parallel) then streams results.

    Unlike the old per-cell streaming, this uses build_cell_grid_v2 with
    ThreadPoolExecutor for parallel OCR, then emits all cells as SSE events.
    The 'preparing' event keeps the connection alive during OCR processing.

    Events are emitted in this order: ``meta``, ``preparing``, zero or more
    ``keepalive``, ``columns`` (if any column metadata), one ``cell`` per
    cell, and finally ``complete``. The result is persisted to the DB and
    written to ``cached["word_result"]`` before ``complete`` is sent.

    Args:
        session_id: Session whose word result is being computed.
        cached: In-memory cache entry for the session; updated in place.
        col_regions: Column regions handed to build_cell_grid_v2.
        row_geoms: Row geometries handed to build_cell_grid_v2.
        dewarped_bgr: Page image the OCR runs on.
        engine: Requested OCR engine name.
        pronunciation: Pronunciation variant passed to the phonetic fixers.
        request: Incoming request, polled for client disconnects.
        skip_heal_gaps: Forwarded to build_cell_grid_v2.

    Yields:
        SSE-formatted strings (``data: <json>\\n\\n``).
    """
    import asyncio

    t0 = time.time()
    ocr_img = create_ocr_image(dewarped_bgr)
    img_h, img_w = dewarped_bgr.shape[:2]

    _skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'}
    n_content_rows = len([r for r in row_geoms if r.row_type == 'content'])
    n_cols = len([c for c in col_regions if c.type not in _skip_types])
    col_types = {c.type for c in col_regions if c.type not in _skip_types}
    is_vocab = bool(col_types & {'column_en', 'column_de'})
    # Upfront estimate for the meta event; the final result reports len(cells).
    total_cells = n_content_rows * n_cols

    # 1. Send meta event immediately
    meta_event = {
        "type": "meta",
        "grid_shape": {"rows": n_content_rows, "cols": n_cols, "total_cells": total_cells},
        "layout": "vocab" if is_vocab else "generic",
    }
    yield f"data: {json.dumps(meta_event)}\n\n"

    # 2. Send preparing event (keepalive for proxy)
    yield f"data: {json.dumps({'type': 'preparing', 'message': 'Cell-First OCR laeuft parallel...'})}\n\n"

    # 3. Run batch OCR in thread pool with periodic keepalive events.
    # The OCR takes 30-60s and proxy servers (Nginx) may drop idle SSE
    # connections after 30-60s. Send keepalive every 5s to prevent this.
    # get_running_loop() is the documented way to obtain the loop from
    # inside a coroutine.
    loop = asyncio.get_running_loop()
    ocr_future = loop.run_in_executor(
        None,
        lambda: build_cell_grid_v2(
            ocr_img, col_regions, row_geoms, img_w, img_h,
            ocr_engine=engine, img_bgr=dewarped_bgr,
            skip_heal_gaps=skip_heal_gaps,
        ),
    )

    # Send keepalive events every 5 seconds while OCR runs
    while not ocr_future.done():
        try:
            # shield() keeps the timeout from cancelling the OCR future itself.
            cells, columns_meta = await asyncio.wait_for(
                asyncio.shield(ocr_future), timeout=5.0,
            )
            break  # OCR finished
        except asyncio.TimeoutError:
            elapsed = int(time.time() - t0)
            keepalive_event = {
                'type': 'keepalive',
                'elapsed': elapsed,
                'message': f'OCR laeuft... ({elapsed}s)',
            }
            yield f"data: {json.dumps(keepalive_event)}\n\n"
            if await request.is_disconnected():
                logger.info(f"SSE batch: client disconnected during OCR for {session_id}")
                # Best effort only: work already running in the executor
                # thread is not interrupted by cancelling the future.
                ocr_future.cancel()
                return
    else:
        # Future completed between the timeout and the loop condition check.
        cells, columns_meta = ocr_future.result()

    if await request.is_disconnected():
        logger.info(f"SSE batch: client disconnected after OCR for {session_id}")
        return

    # 4. Apply IPA phonetic fixes directly to cell texts (for overlay mode)
    fix_cell_phonetics(cells, pronunciation=pronunciation)

    # 5. Send columns meta
    if columns_meta:
        yield f"data: {json.dumps({'type': 'columns', 'columns_used': columns_meta})}\n\n"

    # 6. Stream all cells
    for idx, cell in enumerate(cells):
        cell_event = {
            "type": "cell",
            "cell": cell,
            "progress": {"current": idx + 1, "total": len(cells)},
        }
        yield f"data: {json.dumps(cell_event)}\n\n"

    # 7. Build final result and persist
    duration = time.time() - t0
    used_engine = cells[0].get("ocr_engine", "tesseract") if cells else engine

    word_result = {
        "cells": cells,
        "grid_shape": {"rows": n_content_rows, "cols": n_cols, "total_cells": len(cells)},
        "columns_used": columns_meta,
        "layout": "vocab" if is_vocab else "generic",
        "image_width": img_w,
        "image_height": img_h,
        "duration_seconds": round(duration, 2),
        "ocr_engine": used_engine,
        "summary": {
            "total_cells": len(cells),
            "non_empty_cells": sum(1 for c in cells if c.get("text")),
            "low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50),
        },
    }

    vocab_entries = None
    has_text_col = 'column_text' in col_types
    if is_vocab or has_text_col:
        entries = _cells_to_vocab_entries(cells, columns_meta)
        entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
        word_result["vocab_entries"] = entries
        word_result["entries"] = entries
        word_result["entry_count"] = len(entries)
        word_result["summary"]["total_entries"] = len(entries)
        word_result["summary"]["with_english"] = sum(1 for e in entries if e.get("english"))
        word_result["summary"]["with_german"] = sum(1 for e in entries if e.get("german"))
        vocab_entries = entries

    await update_session_db(session_id, word_result=word_result, current_step=8)
    cached["word_result"] = word_result

    logger.info(f"OCR Pipeline SSE batch: words session {session_id}: "
                f"layout={word_result['layout']}, {len(cells)} cells ({duration:.2f}s)")

    # 8. Send complete event
    complete_event = {
        "type": "complete",
        "summary": word_result["summary"],
        "duration_seconds": round(duration, 2),
        "ocr_engine": used_engine,
    }
    if vocab_entries is not None:
        complete_event["vocab_entries"] = vocab_entries
    yield f"data: {json.dumps(complete_event)}\n\n"


async def _word_stream_generator(
    session_id: str,
    cached: Dict[str, Any],
    col_regions: List[PageRegion],
    row_geoms: List[RowGeometry],
    dewarped_bgr: np.ndarray,
    engine: str,
    pronunciation: str,
    request: Request,
):
    """SSE generator that yields cell-by-cell OCR progress.

    Emits ``meta`` and ``preparing`` first, then a ``columns`` event with the
    first cell, one ``cell`` event per OCR'd cell, and a final ``complete``
    event. After streaming, rows whose cells are all empty are dropped from
    the persisted result, which is written to the DB and to
    ``cached["word_result"]``.

    Note: build_cell_grid_v2_streaming is consumed with a plain ``for`` loop,
    so the work it does per cell runs between awaits of this coroutine.

    Args:
        session_id: Session whose word result is being computed.
        cached: In-memory cache entry for the session; updated in place.
        col_regions: Column regions handed to the streaming grid builder.
        row_geoms: Row geometries handed to the streaming grid builder.
        dewarped_bgr: Page image the OCR runs on.
        engine: Requested OCR engine name.
        pronunciation: Pronunciation variant passed to the phonetic fixers.
        request: Incoming request, polled for client disconnects.

    Yields:
        SSE-formatted strings (``data: <json>\\n\\n``).
    """
    t0 = time.time()

    ocr_img = create_ocr_image(dewarped_bgr)
    img_h, img_w = dewarped_bgr.shape[:2]

    # Compute grid shape upfront for the meta event
    n_content_rows = len([r for r in row_geoms if r.row_type == 'content'])
    _skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'}
    n_cols = len([c for c in col_regions if c.type not in _skip_types])

    # Determine layout
    col_types = {c.type for c in col_regions if c.type not in _skip_types}
    is_vocab = bool(col_types & {'column_en', 'column_de'})

    # Start streaming — first event: meta
    columns_meta = None  # will be set from first yield
    total_cells = n_content_rows * n_cols

    meta_event = {
        "type": "meta",
        "grid_shape": {"rows": n_content_rows, "cols": n_cols, "total_cells": total_cells},
        "layout": "vocab" if is_vocab else "generic",
    }
    yield f"data: {json.dumps(meta_event)}\n\n"

    # Keepalive: send preparing event so proxy doesn't timeout during OCR init
    yield f"data: {json.dumps({'type': 'preparing', 'message': 'Cell-First OCR wird initialisiert...'})}\n\n"

    # Stream cells one by one
    all_cells: List[Dict[str, Any]] = []
    cell_idx = 0

    for cell, cols_meta, total in build_cell_grid_v2_streaming(
        ocr_img, col_regions, row_geoms, img_w, img_h,
        ocr_engine=engine, img_bgr=dewarped_bgr,
    ):
        if await request.is_disconnected():
            logger.info(f"SSE: client disconnected during streaming for {session_id}")
            return

        if columns_meta is None:
            columns_meta = cols_meta
            # Send columns_used as part of first cell or update meta
            meta_update = {
                "type": "columns",
                "columns_used": cols_meta,
            }
            yield f"data: {json.dumps(meta_update)}\n\n"

        all_cells.append(cell)
        cell_idx += 1

        cell_event = {
            "type": "cell",
            "cell": cell,
            "progress": {"current": cell_idx, "total": total},
        }
        yield f"data: {json.dumps(cell_event)}\n\n"

    # All cells done — build final result
    duration = time.time() - t0
    if columns_meta is None:
        columns_meta = []

    # Post-OCR: remove rows where ALL cells are empty (inter-row gaps
    # that had stray Tesseract artifacts giving word_count > 0).
    # `or ""` also treats an explicit None text as empty instead of crashing.
    rows_with_text: set = {
        c["row_index"] for c in all_cells if (c.get("text") or "").strip()
    }
    before_filter = len(all_cells)
    all_cells = [c for c in all_cells if c["row_index"] in rows_with_text]
    # Approximate row count: removed cells divided by columns per row.
    empty_rows_removed = (before_filter - len(all_cells)) // max(n_cols, 1)
    if empty_rows_removed > 0:
        logger.info(f"SSE: removed {empty_rows_removed} all-empty rows after OCR")

    used_engine = all_cells[0].get("ocr_engine", "tesseract") if all_cells else engine

    # Apply IPA phonetic fixes directly to cell texts (for overlay mode)
    fix_cell_phonetics(all_cells, pronunciation=pronunciation)

    word_result = {
        "cells": all_cells,
        "grid_shape": {
            "rows": n_content_rows,
            "cols": n_cols,
            "total_cells": len(all_cells),
        },
        "columns_used": columns_meta,
        "layout": "vocab" if is_vocab else "generic",
        "image_width": img_w,
        "image_height": img_h,
        "duration_seconds": round(duration, 2),
        "ocr_engine": used_engine,
        "summary": {
            "total_cells": len(all_cells),
            "non_empty_cells": sum(1 for c in all_cells if c.get("text")),
            "low_confidence": sum(1 for c in all_cells if 0 < c.get("confidence", 0) < 50),
        },
    }

    # For vocab layout or single-column (box sub-sessions): map cells 1:1
    # to vocab entries (row→entry).
    vocab_entries = None
    has_text_col = 'column_text' in col_types
    if is_vocab or has_text_col:
        entries = _cells_to_vocab_entries(all_cells, columns_meta)
        entries = _fix_character_confusion(entries)
        entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
        word_result["vocab_entries"] = entries
        word_result["entries"] = entries
        word_result["entry_count"] = len(entries)
        word_result["summary"]["total_entries"] = len(entries)
        word_result["summary"]["with_english"] = sum(1 for e in entries if e.get("english"))
        word_result["summary"]["with_german"] = sum(1 for e in entries if e.get("german"))
        vocab_entries = entries

    # Persist to DB
    await update_session_db(
        session_id,
        word_result=word_result,
        current_step=8,
    )
    cached["word_result"] = word_result

    logger.info(f"OCR Pipeline SSE: words session {session_id}: "
                f"layout={word_result['layout']}, "
                f"{len(all_cells)} cells ({duration:.2f}s)")

    # Final complete event
    complete_event = {
        "type": "complete",
        "summary": word_result["summary"],
        "duration_seconds": round(duration, 2),
        "ocr_engine": used_engine,
    }
    if vocab_entries is not None:
        complete_event["vocab_entries"] = vocab_entries
    yield f"data: {json.dumps(complete_event)}\n\n"


# ---------------------------------------------------------------------------
# PaddleOCR Direct Endpoint
# ---------------------------------------------------------------------------

@router.post("/sessions/{session_id}/paddle-direct")
async def paddle_direct(session_id: str):
    """Run PaddleOCR on the preprocessed image and build a word grid directly.

    Expects orientation/deskew/dewarp/crop to be done already.
    Uses the cropped image (falls back to dewarped, then original).
    The used image is stored as cropped_png so OverlayReconstruction
    can display it as the background.

    Returns:
        A dict with ``session_id`` plus the word result (cells, grid shape,
        columns, layout, image size, duration and summary).

    Raises:
        HTTPException 404: no image is stored for the session.
        HTTPException 400: the image cannot be decoded, or PaddleOCR
            returned no words.
    """
    # Try preprocessed images first (crop > dewarp > original)
    img_png = await get_session_image(session_id, "cropped")
    if not img_png:
        img_png = await get_session_image(session_id, "dewarped")
    if not img_png:
        img_png = await get_session_image(session_id, "original")
    if not img_png:
        raise HTTPException(status_code=404, detail="No image found for this session")

    img_arr = np.frombuffer(img_png, dtype=np.uint8)
    img_bgr = cv2.imdecode(img_arr, cv2.IMREAD_COLOR)
    if img_bgr is None:
        raise HTTPException(status_code=400, detail="Failed to decode original image")

    img_h, img_w = img_bgr.shape[:2]

    from cv_ocr_engines import ocr_region_paddle

    t0 = time.time()
    word_dicts = await ocr_region_paddle(img_bgr, region=None)
    if not word_dicts:
        raise HTTPException(status_code=400, detail="PaddleOCR returned no words")

    # Reuse build_grid_from_words — same function that works in the regular
    # pipeline with PaddleOCR (engine=paddle, grid_method=words_first).
    # Handles phrase splitting, column clustering, and reading order.
    cells, columns_meta = build_grid_from_words(word_dicts, img_w, img_h)
    duration = time.time() - t0

    # Tag cells as paddle_direct
    for cell in cells:
        cell["ocr_engine"] = "paddle_direct"

    n_rows = len(set(c["row_index"] for c in cells)) if cells else 0
    n_cols = len(columns_meta)
    col_types = {c.get("type") for c in columns_meta}
    is_vocab = bool(col_types & {"column_en", "column_de"})

    word_result = {
        "cells": cells,
        "grid_shape": {
            "rows": n_rows,
            "cols": n_cols,
            "total_cells": len(cells),
        },
        "columns_used": columns_meta,
        "layout": "vocab" if is_vocab else "generic",
        "image_width": img_w,
        "image_height": img_h,
        "duration_seconds": round(duration, 2),
        "ocr_engine": "paddle_direct",
        "grid_method": "paddle_direct",
        "summary": {
            "total_cells": len(cells),
            "non_empty_cells": sum(1 for c in cells if c.get("text")),
            "low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50),
        },
    }

    # Store preprocessed image as cropped_png so OverlayReconstruction shows it
    await update_session_db(
        session_id,
        word_result=word_result,
        cropped_png=img_png,
        current_step=8,
    )

    # Keep the in-memory session cache in step with the DB, as the other
    # word endpoints do, so cache readers do not see a stale word_result.
    # Only an already-cached session is touched; nothing is loaded here.
    if session_id in _cache:
        _cache[session_id]["word_result"] = word_result

    logger.info(
        "paddle_direct session %s: %d cells (%d rows, %d cols) in %.2fs",
        session_id, len(cells), n_rows, n_cols, duration,
    )

    await _append_pipeline_log(session_id, "paddle_direct", {
        "total_cells": len(cells),
        "non_empty_cells": word_result["summary"]["non_empty_cells"],
        "ocr_engine": "paddle_direct",
    }, duration_ms=int(duration * 1000))

    return {"session_id": session_id, **word_result}


# ---------------------------------------------------------------------------
# Ground Truth Words Endpoints
# ---------------------------------------------------------------------------

@router.post("/sessions/{session_id}/ground-truth/words")
async def save_word_ground_truth(session_id: str, req: WordGroundTruthRequest):
    """Store reviewer feedback for the word recognition step of a session.

    The feedback is saved under the ``"words"`` key of the session's
    ground-truth dict, together with a UTC timestamp and a snapshot of the
    session's current ``word_result``. The cache entry is refreshed when the
    session is cached.

    Raises:
        HTTPException 404: the session does not exist.
    """
    record = await get_session_db(session_id)
    if not record:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    # Start from the existing ground truth so other steps' entries survive.
    all_feedback = record.get("ground_truth") or {}
    words_feedback = {
        "is_correct": req.is_correct,
        "corrected_entries": req.corrected_entries,
        "notes": req.notes,
        "saved_at": datetime.utcnow().isoformat(),
        "word_result": record.get("word_result"),
    }
    all_feedback["words"] = words_feedback

    await update_session_db(session_id, ground_truth=all_feedback)

    # Mirror the change into the in-memory cache, if the session is loaded.
    if session_id in _cache:
        _cache[session_id]["ground_truth"] = all_feedback

    response = {"session_id": session_id, "ground_truth": words_feedback}
    return response


@router.get("/sessions/{session_id}/ground-truth/words")
async def get_word_ground_truth(session_id: str):
    """Return the saved word ground truth alongside the automatic result.

    Raises:
        HTTPException 404: the session does not exist, or no word ground
            truth has been saved for it.
    """
    record = await get_session_db(session_id)
    if not record:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    # A missing or empty ground-truth dict is treated as "nothing saved".
    saved_words = (record.get("ground_truth") or {}).get("words")
    if not saved_words:
        raise HTTPException(status_code=404, detail="No word ground truth saved")

    response = {
        "session_id": session_id,
        "words_gt": saved_words,
        "words_auto": record.get("word_result"),
    }
    return response