# breakpilot-lehrer/klausur-service/backend/ocr_pipeline_words_detect.py

"""
OCR Pipeline Words Detect — main word detection endpoint (Step 7).

Extracted from ocr_pipeline_words.py. Contains the ``detect_words``
endpoint which handles both v2 and words_first grid methods.

Lizenz: Apache 2.0
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""

import json
import logging
import time
from typing import Any, Dict, List

import numpy as np
from fastapi import APIRouter, HTTPException, Request
from fastapi.responses import StreamingResponse

from cv_vocab_pipeline import (
    PageRegion,
    RowGeometry,
    _cells_to_vocab_entries,
    _fix_phonetic_brackets,
    fix_cell_phonetics,
    build_cell_grid_v2,
    create_ocr_image,
    detect_column_geometry,
)
from cv_words_first import build_grid_from_words
from ocr_pipeline_session_store import (
    get_session_db,
    update_session_db,
)
from ocr_pipeline_common import (
    _cache,
    _load_session_to_cache,
    _get_cached,
    _append_pipeline_log,
)
from ocr_pipeline_words_stream import (
    _word_batch_stream_generator,
)

logger = logging.getLogger(__name__)

router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["ocr-pipeline"])


# ---------------------------------------------------------------------------
# Word Detection Endpoint (Step 7)
# ---------------------------------------------------------------------------

@router.post("/sessions/{session_id}/words")
async def detect_words(
    session_id: str,
    request: Request,
    engine: str = "auto",
    pronunciation: str = "british",
    stream: bool = False,
    skip_heal_gaps: bool = False,
    grid_method: str = "v2",
):
    """Build word grid from columns x rows, OCR each cell.

    Takes the session image from the cache (cropped, else dewarped), rebuilds
    column and row geometry from the stored session results, assigns cached
    words to rows, drops rows whose centre lies inside a box zone, and then
    dispatches to the words-first, streaming, or v2 path.

    Query params:
        engine: 'auto' (default), 'tesseract', 'rapid', or 'paddle'
        pronunciation: 'british' (default) or 'american'
        stream: false (default) for JSON response, true for SSE streaming
        skip_heal_gaps: false (default). When true, cells keep exact row geometry.
        grid_method: 'v2' (default) or 'words_first'

    Returns:
        The result dict of ``_words_first_path`` / ``_v2_path``, or a
        ``StreamingResponse`` (``text/event-stream``) when ``stream`` is true
        and the v2 grid method is used.

    Raises:
        HTTPException: 400 if neither a cropped nor a dewarped image is
            cached, or if row detection is missing for a non-words_first
            grid method; 404 if the session is not found in the DB.
    """
    # PaddleOCR is full-page remote OCR -> force words_first grid method
    if engine == "paddle" and grid_method != "words_first":
        logger.info("detect_words: engine=paddle requires words_first, overriding grid_method=%s", grid_method)
        grid_method = "words_first"

    if session_id not in _cache:
        logger.info("detect_words: session %s not in cache, loading from DB", session_id)
        await _load_session_to_cache(session_id)
    cached = _get_cached(session_id)

    # Prefer the cropped image; fall back to the dewarped one.
    dewarped_bgr = cached.get("cropped_bgr")
    if dewarped_bgr is None:
        dewarped_bgr = cached.get("dewarped_bgr")
    if dewarped_bgr is None:
        logger.warning("detect_words: no cropped/dewarped image for session %s (cache keys: %s)",
                       session_id, [k for k in cached if k.endswith('_bgr')])
        raise HTTPException(status_code=400, detail="Crop or dewarp must be completed before word detection")

    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    column_result = session.get("column_result")
    row_result = session.get("row_result")
    if not column_result or not column_result.get("columns"):
        # No column detection available: treat the whole page as one text column.
        img_h_tmp, img_w_tmp = dewarped_bgr.shape[:2]
        column_result = {
            "columns": [{
                "type": "column_text",
                "x": 0, "y": 0,
                "width": img_w_tmp, "height": img_h_tmp,
                "classification_confidence": 1.0,
                "classification_method": "full_page_fallback",
            }],
            "zones": [],
            "duration_seconds": 0,
        }
        logger.info("detect_words: no column_result -- using full-page pseudo-column %dx%d", img_w_tmp, img_h_tmp)

    # Row detection is only mandatory for the v2 grid. On the words_first path
    # row_result may legitimately be missing, so fall back to an empty list
    # instead of subscripting None further down.
    rows_data = (row_result or {}).get("rows") or []
    if grid_method != "words_first" and not rows_data:
        raise HTTPException(status_code=400, detail="Row detection must be completed first")

    # Convert column dicts back to PageRegion objects
    col_regions = [
        PageRegion(
            type=c["type"],
            x=c["x"], y=c["y"],
            width=c["width"], height=c["height"],
            classification_confidence=c.get("classification_confidence", 1.0),
            classification_method=c.get("classification_method", ""),
        )
        for c in column_result["columns"]
    ]

    # Convert row dicts back to RowGeometry objects
    row_geoms = [
        RowGeometry(
            index=r["index"],
            x=r["x"], y=r["y"],
            width=r["width"], height=r["height"],
            word_count=r.get("word_count", 0),
            words=[],
            row_type=r.get("row_type", "content"),
            gap_before=r.get("gap_before", 0),
        )
        for r in rows_data
    ]

    # Populate word counts from cached words
    word_dicts = cached.get("_word_dicts")
    if word_dicts is None:
        ocr_img_tmp = create_ocr_image(dewarped_bgr)
        geo_result = detect_column_geometry(ocr_img_tmp, dewarped_bgr)
        if geo_result is not None:
            _geoms, left_x, right_x, top_y, bottom_y, word_dicts, inv = geo_result
            cached["_word_dicts"] = word_dicts
            cached["_inv"] = inv
            cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y)

    if word_dicts:
        content_bounds = cached.get("_content_bounds")
        if content_bounds:
            _lx, _rx, top_y, _by = content_bounds
        else:
            top_y = min(r.y for r in row_geoms) if row_geoms else 0

        for row in row_geoms:
            # Row y is shifted by top_y before it is compared with word 'top'
            # values; a word belongs to the row containing its vertical centre.
            row_y_rel = row.y - top_y
            row_bottom_rel = row_y_rel + row.height
            row.words = [
                w for w in word_dicts
                if row_y_rel <= w['top'] + w['height'] / 2 < row_bottom_rel
            ]
            row.word_count = len(row.words)

    # Exclude rows that fall within box zones
    zones = column_result.get("zones") or []
    box_ranges_inner = []
    for zone in zones:
        if zone.get("zone_type") == "box" and zone.get("box"):
            box = zone["box"]
            # Shrink the box vertically by its border thickness (at least 5).
            bt = max(box.get("border_thickness", 0), 5)
            box_ranges_inner.append((box["y"] + bt, box["y"] + box["height"] - bt))

    if box_ranges_inner:
        def _row_in_box(r):
            center_y = r.y + r.height / 2
            return any(by_s <= center_y < by_e for by_s, by_e in box_ranges_inner)

        before_count = len(row_geoms)
        row_geoms = [r for r in row_geoms if not _row_in_box(r)]
        excluded = before_count - len(row_geoms)
        if excluded:
            logger.info("detect_words: excluded %s rows inside box zones", excluded)

    # --- Words-First path ---
    if grid_method == "words_first":
        return await _words_first_path(
            session_id, cached, dewarped_bgr, engine, pronunciation, zones,
        )

    if stream:
        return StreamingResponse(
            _word_batch_stream_generator(
                session_id, cached, col_regions, row_geoms,
                dewarped_bgr, engine, pronunciation, request,
                skip_heal_gaps=skip_heal_gaps,
            ),
            media_type="text/event-stream",
            headers={
                "Cache-Control": "no-cache",
                "Connection": "keep-alive",
                "X-Accel-Buffering": "no",
            },
        )

    # --- Non-streaming path (grid_method=v2) ---
    return await _v2_path(
        session_id, cached, col_regions, row_geoms,
        dewarped_bgr, engine, pronunciation, skip_heal_gaps,
    )


async def _words_first_path(
    session_id: str,
    cached: Dict[str, Any],
    dewarped_bgr: np.ndarray,
    engine: str,
    pronunciation: str,
    zones: list,
) -> dict:
    """Build the word grid with the words-first method and persist it.

    Words come from PaddleOCR when ``engine == "paddle"``, otherwise from the
    cached ``_word_dicts`` (detected on demand and cached if missing). The
    resulting cells are stored as ``word_result`` in the session DB and in
    ``cached``, and a pipeline log entry is appended.

    Returns:
        ``word_result`` with an additional leading ``session_id`` key.

    Raises:
        HTTPException: 400 when no words are available.
    """
    t0 = time.time()
    img_h, img_w = dewarped_bgr.shape[:2]
    via_paddle = engine == "paddle"

    if via_paddle:
        from cv_ocr_engines import ocr_region_paddle
        words = await ocr_region_paddle(dewarped_bgr, region=None)
        cached["_paddle_word_dicts"] = words
    else:
        words = cached.get("_word_dicts")
        if words is None:
            geometry = detect_column_geometry(create_ocr_image(dewarped_bgr), dewarped_bgr)
            if geometry is not None:
                _geoms, left_x, right_x, top_y, bottom_y, words, inv = geometry
                cached.update({
                    "_word_dicts": words,
                    "_inv": inv,
                    "_content_bounds": (left_x, right_x, top_y, bottom_y),
                })

    if not words:
        raise HTTPException(status_code=400, detail="No words detected -- cannot build words-first grid")

    # Non-paddle words are shifted by the content-bounds origin when bounds are cached.
    if not via_paddle:
        bounds = cached.get("_content_bounds")
        if bounds:
            off_x, _right, off_y, _bottom = bounds
            words = [
                {**w, 'left': w['left'] + off_x, 'top': w['top'] + off_y}
                for w in words
            ]

    box_rects = [
        z["box"] for z in zones
        if z.get("zone_type") == "box" and z.get("box")
    ]

    cells, columns_meta = build_grid_from_words(
        words, img_w, img_h, box_rects=box_rects or None,
    )
    duration = time.time() - t0

    fix_cell_phonetics(cells, pronunciation=pronunciation)
    for cell in cells:
        cell.setdefault("zone_index", 0)

    col_types = {meta['type'] for meta in columns_meta}
    is_vocab = not col_types.isdisjoint({'column_en', 'column_de'})
    n_rows = len({cell['row_index'] for cell in cells})
    n_cols = len(columns_meta)
    used_engine = "paddle" if via_paddle else "words_first"
    total_cells = len(cells)

    summary = {
        "total_cells": total_cells,
        "non_empty_cells": len([c for c in cells if c.get("text")]),
        "low_confidence": len([c for c in cells if 0 < c.get("confidence", 0) < 50]),
    }
    word_result = {
        "cells": cells,
        "grid_shape": {"rows": n_rows, "cols": n_cols, "total_cells": total_cells},
        "columns_used": columns_meta,
        "layout": "vocab" if is_vocab else "generic",
        "image_width": img_w,
        "image_height": img_h,
        "duration_seconds": round(duration, 2),
        "ocr_engine": used_engine,
        "grid_method": "words_first",
        "summary": summary,
    }

    # Vocabulary entries are derived for vocab layouts and plain text columns.
    if is_vocab or 'column_text' in col_types:
        entries = _fix_phonetic_brackets(
            _cells_to_vocab_entries(cells, columns_meta),
            pronunciation=pronunciation,
        )
        n_entries = len(entries)
        word_result["vocab_entries"] = entries
        word_result["entries"] = entries
        word_result["entry_count"] = n_entries
        summary["total_entries"] = n_entries
        summary["with_english"] = len([e for e in entries if e.get("english")])
        summary["with_german"] = len([e for e in entries if e.get("german")])

    await update_session_db(session_id, word_result=word_result, current_step=8)
    cached["word_result"] = word_result

    logger.info(f"OCR Pipeline: words-first session {session_id}: "
                f"{len(cells)} cells ({duration:.2f}s), {n_rows} rows, {n_cols} cols")

    log_payload = {
        "grid_method": "words_first",
        "total_cells": total_cells,
        "non_empty_cells": summary["non_empty_cells"],
        "ocr_engine": used_engine,
        "layout": word_result["layout"],
    }
    await _append_pipeline_log(
        session_id, "words", log_payload, duration_ms=int(duration * 1000),
    )

    response = {"session_id": session_id}
    response.update(word_result)
    return response


async def _v2_path(
    session_id: str,
    cached: Dict[str, Any],
    col_regions: List[PageRegion],
    row_geoms: List[RowGeometry],
    dewarped_bgr: np.ndarray,
    engine: str,
    pronunciation: str,
    skip_heal_gaps: bool,
) -> dict:
    """Run the non-streaming cell-first (v2) grid OCR and persist the result.

    Builds the cell grid via ``build_cell_grid_v2``, applies phonetic fixes,
    stores ``word_result`` in the session DB and in ``cached``, and appends a
    pipeline log entry.

    Returns:
        ``word_result`` with an additional leading ``session_id`` key.
    """
    t0 = time.time()
    ocr_img = create_ocr_image(dewarped_bgr)
    img_h, img_w = dewarped_bgr.shape[:2]

    cells, columns_meta = build_cell_grid_v2(
        ocr_img, col_regions, row_geoms, img_w, img_h,
        ocr_engine=engine, img_bgr=dewarped_bgr,
        skip_heal_gaps=skip_heal_gaps,
    )
    duration = time.time() - t0

    for cell in cells:
        cell.setdefault("zone_index", 0)

    col_types = {meta['type'] for meta in columns_meta}
    is_vocab = not col_types.isdisjoint({'column_en', 'column_de'})
    n_content_rows = sum(1 for row in row_geoms if row.row_type == 'content')
    n_cols = len(columns_meta)
    # Report the engine recorded on the first cell; with no cells, echo the request.
    if cells:
        used_engine = cells[0].get("ocr_engine", "tesseract")
    else:
        used_engine = engine

    fix_cell_phonetics(cells, pronunciation=pronunciation)

    total_cells = len(cells)
    summary = {
        "total_cells": total_cells,
        "non_empty_cells": len([c for c in cells if c.get("text")]),
        "low_confidence": len([c for c in cells if 0 < c.get("confidence", 0) < 50]),
    }
    word_result = {
        "cells": cells,
        "grid_shape": {"rows": n_content_rows, "cols": n_cols, "total_cells": total_cells},
        "columns_used": columns_meta,
        "layout": "vocab" if is_vocab else "generic",
        "image_width": img_w,
        "image_height": img_h,
        "duration_seconds": round(duration, 2),
        "ocr_engine": used_engine,
        "summary": summary,
    }

    # Vocabulary entries are derived for vocab layouts and plain text columns.
    if is_vocab or 'column_text' in col_types:
        entries = _fix_phonetic_brackets(
            _cells_to_vocab_entries(cells, columns_meta),
            pronunciation=pronunciation,
        )
        n_entries = len(entries)
        word_result["vocab_entries"] = entries
        word_result["entries"] = entries
        word_result["entry_count"] = n_entries
        summary["total_entries"] = n_entries
        summary["with_english"] = len([e for e in entries if e.get("english")])
        summary["with_german"] = len([e for e in entries if e.get("german")])

    await update_session_db(session_id, word_result=word_result, current_step=8)
    cached["word_result"] = word_result

    logger.info(f"OCR Pipeline: words session {session_id}: "
                f"layout={word_result['layout']}, "
                f"{len(cells)} cells ({duration:.2f}s), summary: {word_result['summary']}")

    log_payload = {
        "total_cells": total_cells,
        "non_empty_cells": summary["non_empty_cells"],
        "low_confidence_count": summary["low_confidence"],
        "ocr_engine": used_engine,
        "layout": word_result["layout"],
        "entry_count": word_result.get("entry_count", 0),
    }
    await _append_pipeline_log(
        session_id, "words", log_payload, duration_ms=int(duration * 1000),
    )

    response = {"session_id": session_id}
    response.update(word_result)
    return response