klausur-service (7 monoliths): - grid_editor_helpers.py (1,737 → 5 files: columns, filters, headers, zones) - cv_cell_grid.py (1,675 → 7 files: build, legacy, streaming, merge, vocab) - worksheet_editor_api.py (1,305 → 4 files: models, AI, reconstruct, routes) - legal_corpus_ingestion.py (1,280 → 3 files: registry, chunking, ingestion) - cv_review.py (1,248 → 4 files: pipeline, spell, LLM, barrel) - cv_preprocessing.py (1,166 → 3 files: deskew, dewarp, barrel) - rbac.py, admin_api.py, routes/eh.py remain (next batch) backend-lehrer (1 monolith): - classroom_engine/repository.py (1,705 → 7 files by domain) All re-export barrels preserve backward compatibility. Zero import errors verified. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
218 lines
7.0 KiB
Python
218 lines
7.0 KiB
Python
"""
|
|
Streaming variants of cell-grid builders (v2 + legacy).
|
|
|
|
Extracted from cv_cell_grid.py. These yield cells one-by-one as OCR'd,
|
|
useful for progress reporting.
|
|
|
|
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
|
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
|
"""
|
|
|
|
import logging
|
|
from typing import Any, Dict, Generator, List, Optional, Tuple
|
|
|
|
import numpy as np
|
|
|
|
from cv_vocab_types import PageRegion, RowGeometry
|
|
from cv_ocr_engines import (
|
|
RAPIDOCR_AVAILABLE,
|
|
_assign_row_words_to_columns,
|
|
)
|
|
from cv_cell_grid_helpers import (
|
|
_heal_row_gaps,
|
|
_is_artifact_row,
|
|
)
|
|
from cv_cell_grid_build import _ocr_cell_crop
|
|
from cv_cell_grid_legacy import _ocr_single_cell
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# build_cell_grid_v2_streaming
# ---------------------------------------------------------------------------
|
|
|
|
def build_cell_grid_v2_streaming(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
) -> Generator[Tuple[Dict[str, Any], List[Dict[str, Any]], int], None, None]:
    """Streaming variant of build_cell_grid_v2 -- yields each cell as OCR'd.

    Args:
        ocr_img: Preprocessed image the OCR crops are taken from.
        column_regions: Detected page regions; non-content types (margins,
            header/footer, ignored columns) are filtered out.
        row_geometries: Detected rows; only rows with row_type 'content'
            produce cells. Header/footer rows only supply heal boundaries.
        img_w: Page width in pixels.
        img_h: Page height in pixels.
        lang: OCR language string used when no per-column override applies.
        ocr_engine: "auto", "rapid", "trocr-printed", "trocr-handwritten",
            or "lighton". "auto" and unknown values use Tesseract; "rapid"
            falls back to Tesseract when RapidOCR is unavailable.
        img_bgr: Optional original BGR image (passed through to the cell
            OCR helper; some engines prefer the color image).

    Yields:
        (cell_dict, columns_meta, total_cells) for each row x column cell,
        in row-major order. ``total_cells`` lets callers report progress.
    """
    # --- Resolve the OCR engine -------------------------------------------
    if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
        engine_name = ocr_engine
    elif ocr_engine == "rapid":
        if RAPIDOCR_AVAILABLE:
            engine_name = "rapid"
        else:
            logger.warning("RapidOCR requested but not available, falling back to Tesseract")
            engine_name = "tesseract"
    else:
        # "auto" and any unknown value: the v2 pipeline defaults to Tesseract.
        engine_name = "tesseract"

    # --- Filter rows down to real content ---------------------------------
    content_rows = [r for r in row_geometries if r.row_type == 'content']
    if not content_rows:
        return

    # Drop phantom rows carrying no words. Log how many were skipped, for
    # consistency with build_cell_grid_streaming (the legacy variant).
    before = len(content_rows)
    content_rows = [r for r in content_rows if r.word_count > 0]
    skipped = before - len(content_rows)
    if skipped > 0:
        logger.info(f"build_cell_grid_v2_streaming: skipped {skipped} phantom rows (word_count=0)")
    if not content_rows:
        return

    _skip_types = {'column_ignore', 'header', 'footer', 'margin_top',
                   'margin_bottom', 'margin_left', 'margin_right'}
    relevant_cols = [c for c in column_regions if c.type not in _skip_types]
    if not relevant_cols:
        return

    # Drop artifact rows (stray marks, rules misdetected as rows) and log.
    before_art = len(content_rows)
    content_rows = [r for r in content_rows if not _is_artifact_row(r)]
    artifact_skipped = before_art - len(content_rows)
    if artifact_skipped > 0:
        logger.info(f"build_cell_grid_v2_streaming: skipped {artifact_skipped} artifact rows")
    if not content_rows:
        return

    # --- Heal vertical gaps between rows ----------------------------------
    # v2 uses header/footer boundaries (not column extents, as legacy does).
    content_rows.sort(key=lambda r: r.y)
    header_rows = [r for r in row_geometries if r.row_type == 'header']
    footer_rows = [r for r in row_geometries if r.row_type == 'footer']
    # Top bound: bottom edge of the lowest header, else top of first row.
    if header_rows:
        top_bound = max(r.y + r.height for r in header_rows)
    else:
        top_bound = content_rows[0].y
    # Bottom bound: top edge of the highest footer, else bottom of last row.
    if footer_rows:
        bottom_bound = min(r.y for r in footer_rows)
    else:
        bottom_bound = content_rows[-1].y + content_rows[-1].height

    _heal_row_gaps(content_rows, top_bound=top_bound, bottom_bound=bottom_bound)

    relevant_cols.sort(key=lambda c: c.x)

    columns_meta = [
        {'index': ci, 'type': c.type, 'x': c.x, 'width': c.width}
        for ci, c in enumerate(relevant_cols)
    ]

    # Per-column OCR language overrides; other column types fall back to `lang`.
    lang_map = {
        'column_en': 'eng',
        'column_de': 'deu',
        'column_example': 'eng+deu',
    }

    total_cells = len(content_rows) * len(relevant_cols)

    for row_idx, row in enumerate(content_rows):
        for col_idx, col in enumerate(relevant_cols):
            cell = _ocr_cell_crop(
                row_idx, col_idx, row, col,
                ocr_img, img_bgr, img_w, img_h,
                engine_name, lang, lang_map,
            )
            yield cell, columns_meta, total_cells
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# build_cell_grid_streaming — legacy streaming variant
# ---------------------------------------------------------------------------
|
|
|
|
def build_cell_grid_streaming(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
) -> Generator[Tuple[Dict[str, Any], List[Dict[str, Any]], int], None, None]:
    """Like build_cell_grid(), but yields each cell as it is OCR'd.

    DEPRECATED: Use build_cell_grid_v2_streaming instead.

    Yields:
        (cell_dict, columns_meta, total_cells) for each cell.
    """
    # Decide which OCR backend handles this page.
    use_rapid = False
    if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
        engine_name = ocr_engine
    else:
        if ocr_engine == "auto":
            # Legacy auto-mode prefers RapidOCR when a color image is given.
            use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None
        elif ocr_engine == "rapid":
            if RAPIDOCR_AVAILABLE:
                use_rapid = True
            else:
                logger.warning("RapidOCR requested but not available, falling back to Tesseract")
        engine_name = "rapid" if use_rapid else "tesseract"

    rows = [r for r in row_geometries if r.row_type == 'content']
    if not rows:
        return

    # Phantom rows (no detected words) yield nothing useful — drop them.
    n_before = len(rows)
    rows = [r for r in rows if r.word_count > 0]
    n_phantom = n_before - len(rows)
    if n_phantom > 0:
        logger.info(f"build_cell_grid_streaming: skipped {n_phantom} phantom rows (word_count=0)")
    if not rows:
        return

    skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'}
    cols = [c for c in column_regions if c.type not in skip_types]
    if not cols:
        return

    # Drop rows classified as layout artifacts.
    n_before = len(rows)
    rows = [r for r in rows if not _is_artifact_row(r)]
    n_artifacts = n_before - len(rows)
    if n_artifacts > 0:
        logger.info(f"build_cell_grid_streaming: skipped {n_artifacts} artifact rows")
    if not rows:
        return

    # Legacy gap healing uses the columns' vertical extent as boundaries.
    _heal_row_gaps(
        rows,
        top_bound=min(c.y for c in cols),
        bottom_bound=max(c.y + c.height for c in cols),
    )

    cols.sort(key=lambda c: c.x)

    columns_meta = [
        {'index': i, 'type': c.type, 'x': c.x, 'width': c.width}
        for i, c in enumerate(cols)
    ]

    # Per-column OCR language overrides.
    lang_map = {
        'column_en': 'eng',
        'column_de': 'deu',
        'column_example': 'eng+deu',
    }

    total_cells = len(rows) * len(cols)

    for row_idx, row in enumerate(rows):
        # Pre-assign this row's words to columns so each cell OCR call can
        # reuse them instead of re-detecting.
        words_per_col = _assign_row_words_to_columns(row, cols)
        for col_idx, col in enumerate(cols):
            yield (
                _ocr_single_cell(
                    row_idx, col_idx, row, col,
                    ocr_img, img_bgr, img_w, img_h,
                    use_rapid, engine_name, lang, lang_map,
                    preassigned_words=words_per_col[col_idx],
                ),
                columns_meta,
                total_cells,
            )
|