"""
Streaming variants of cell-grid builders (v2 + legacy).

Extracted from cv_cell_grid.py. These yield cells one-by-one as OCR'd,
useful for progress reporting.

License: Apache 2.0 (commercial use permitted)
PRIVACY: All processing happens locally.
"""

import logging
from typing import Any, Dict, Generator, List, Optional, Tuple

import numpy as np

from cv_vocab_types import PageRegion, RowGeometry
from cv_ocr_engines import (
    RAPIDOCR_AVAILABLE,
    _assign_row_words_to_columns,
)
from cv_cell_grid_helpers import (
    _heal_row_gaps,
    _is_artifact_row,
)
from cv_cell_grid_build import _ocr_cell_crop
from cv_cell_grid_legacy import _ocr_single_cell

logger = logging.getLogger(__name__)

# Engine names that are passed through unchanged as the engine to use.
_DIRECT_ENGINES = ("trocr-printed", "trocr-handwritten", "lighton")

# Column region types for which no cells are produced.
_SKIP_COLUMN_TYPES = frozenset({
    'column_ignore',
    'header',
    'footer',
    'margin_top',
    'margin_bottom',
    'margin_left',
    'margin_right',
})

# Column type -> OCR language string handed to the per-cell OCR helpers.
_COLUMN_LANG_MAP = {
    'column_en': 'eng',
    'column_de': 'deu',
    'column_example': 'eng+deu',
}


def _resolve_engine(ocr_engine: str, auto_use_rapid: bool = False) -> Tuple[str, bool]:
    """Map the requested ``ocr_engine`` to ``(engine_name, use_rapid)``.

    Args:
        ocr_engine: Requested engine identifier.
        auto_use_rapid: Value used for ``use_rapid`` when ``ocr_engine`` is
            ``"auto"``. The v2 builder leaves this falsy (auto means
            Tesseract); the legacy builder passes its RapidOCR availability
            check.

    Returns:
        ``engine_name`` is the requested name for the pass-through engines,
        ``"rapid"`` when RapidOCR is selected, and ``"tesseract"`` otherwise
        (including unknown engine names). ``use_rapid`` is truthy only when
        RapidOCR is selected.
    """
    if ocr_engine in _DIRECT_ENGINES:
        return ocr_engine, False

    use_rapid = False
    if ocr_engine == "auto":
        use_rapid = auto_use_rapid
    elif ocr_engine == "rapid":
        if RAPIDOCR_AVAILABLE:
            use_rapid = True
        else:
            logger.warning("RapidOCR requested but not available, falling back to Tesseract")

    return ("rapid" if use_rapid else "tesseract"), use_rapid


def _select_relevant_columns(column_regions: List[PageRegion]) -> List[PageRegion]:
    """Return a new list of the columns whose type is not in the skip set."""
    return [c for c in column_regions if c.type not in _SKIP_COLUMN_TYPES]


def _build_columns_meta(columns: List[PageRegion]) -> List[Dict[str, Any]]:
    """Build the per-column metadata dicts that accompany every yielded cell."""
    return [
        {'index': idx, 'type': col.type, 'x': col.x, 'width': col.width}
        for idx, col in enumerate(columns)
    ]


# ---------------------------------------------------------------------------
# build_cell_grid_v2_streaming
# ---------------------------------------------------------------------------

def build_cell_grid_v2_streaming(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
) -> Generator[Tuple[Dict[str, Any], List[Dict[str, Any]], int], None, None]:
    """Streaming variant of build_cell_grid_v2 -- yields each cell as OCR'd.

    Rows are restricted to ``row_type == 'content'`` with ``word_count > 0``
    that ``_is_artifact_row`` does not flag; columns whose type is in the
    skip set are dropped. If any of these filters leaves nothing, the
    generator finishes without yielding.

    Args:
        ocr_img: Image forwarded to ``_ocr_cell_crop``.
        column_regions: Candidate column regions.
        row_geometries: Candidate rows (content, header, footer, ...).
        img_w: Image width forwarded to ``_ocr_cell_crop``.
        img_h: Image height forwarded to ``_ocr_cell_crop``.
        lang: Language string forwarded to ``_ocr_cell_crop``.
        ocr_engine: Requested engine; ``"auto"`` resolves to Tesseract here.
        img_bgr: Optional image forwarded to ``_ocr_cell_crop``.

    Yields:
        (cell_dict, columns_meta, total_cells)
    """
    # Only the engine name is forwarded in v2; the use_rapid flag is unused.
    engine_name, _ = _resolve_engine(ocr_engine)

    content_rows = [r for r in row_geometries if r.row_type == 'content']
    if not content_rows:
        return

    content_rows = [r for r in content_rows if r.word_count > 0]
    if not content_rows:
        return

    relevant_cols = _select_relevant_columns(column_regions)
    if not relevant_cols:
        return

    content_rows = [r for r in content_rows if not _is_artifact_row(r)]
    if not content_rows:
        return

    # Use header/footer boundaries for heal_row_gaps
    content_rows.sort(key=lambda r: r.y)
    header_rows = [r for r in row_geometries if r.row_type == 'header']
    footer_rows = [r for r in row_geometries if r.row_type == 'footer']
    if header_rows:
        # Lowest edge of any header row.
        top_bound = max(r.y + r.height for r in header_rows)
    else:
        top_bound = content_rows[0].y
    if footer_rows:
        # Highest edge of any footer row.
        bottom_bound = min(r.y for r in footer_rows)
    else:
        bottom_bound = content_rows[-1].y + content_rows[-1].height

    _heal_row_gaps(content_rows, top_bound=top_bound, bottom_bound=bottom_bound)

    relevant_cols.sort(key=lambda c: c.x)
    columns_meta = _build_columns_meta(relevant_cols)

    # Fresh dict per call so the module-level mapping is never shared with helpers.
    lang_map = dict(_COLUMN_LANG_MAP)

    total_cells = len(content_rows) * len(relevant_cols)

    for row_idx, row in enumerate(content_rows):
        for col_idx, col in enumerate(relevant_cols):
            cell = _ocr_cell_crop(
                row_idx, col_idx, row, col,
                ocr_img, img_bgr, img_w, img_h,
                engine_name, lang, lang_map,
            )
            yield cell, columns_meta, total_cells


# ---------------------------------------------------------------------------
# build_cell_grid_streaming — legacy streaming variant
# ---------------------------------------------------------------------------

def build_cell_grid_streaming(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
) -> Generator[Tuple[Dict[str, Any], List[Dict[str, Any]], int], None, None]:
    """Like build_cell_grid(), but yields each cell as it is OCR'd.

    DEPRECATED: Use build_cell_grid_v2_streaming instead.

    Differences from the v2 variant visible in this module: ``"auto"``
    selects RapidOCR when it is available and ``img_bgr`` is given, the row
    gap bounds come from the column regions rather than header/footer rows,
    rows are not re-sorted, skipped rows are logged, and each row's words
    are pre-assigned to columns before the per-cell OCR call.

    Args:
        ocr_img: Image forwarded to ``_ocr_single_cell``.
        column_regions: Candidate column regions.
        row_geometries: Candidate rows; only ``row_type == 'content'`` is used.
        img_w: Image width forwarded to ``_ocr_single_cell``.
        img_h: Image height forwarded to ``_ocr_single_cell``.
        lang: Language string forwarded to ``_ocr_single_cell``.
        ocr_engine: Requested engine identifier.
        img_bgr: Optional image forwarded to ``_ocr_single_cell``; also
            gates RapidOCR selection in ``"auto"`` mode.

    Yields:
        (cell_dict, columns_meta, total_cells) for each cell.
    """
    engine_name, use_rapid = _resolve_engine(
        ocr_engine,
        auto_use_rapid=RAPIDOCR_AVAILABLE and img_bgr is not None,
    )

    content_rows = [r for r in row_geometries if r.row_type == 'content']
    if not content_rows:
        return

    before = len(content_rows)
    content_rows = [r for r in content_rows if r.word_count > 0]
    skipped = before - len(content_rows)
    if skipped > 0:
        logger.info(
            "build_cell_grid_streaming: skipped %d phantom rows (word_count=0)",
            skipped,
        )
    if not content_rows:
        return

    relevant_cols = _select_relevant_columns(column_regions)
    if not relevant_cols:
        return

    before_art = len(content_rows)
    content_rows = [r for r in content_rows if not _is_artifact_row(r)]
    artifact_skipped = before_art - len(content_rows)
    if artifact_skipped > 0:
        logger.info(
            "build_cell_grid_streaming: skipped %d artifact rows",
            artifact_skipped,
        )
    if not content_rows:
        return

    # Bounds span the vertical extent covered by the relevant columns.
    _heal_row_gaps(
        content_rows,
        top_bound=min(c.y for c in relevant_cols),
        bottom_bound=max(c.y + c.height for c in relevant_cols),
    )

    relevant_cols.sort(key=lambda c: c.x)
    columns_meta = _build_columns_meta(relevant_cols)

    # Fresh dict per call so the module-level mapping is never shared with helpers.
    lang_map = dict(_COLUMN_LANG_MAP)

    total_cells = len(content_rows) * len(relevant_cols)

    for row_idx, row in enumerate(content_rows):
        # Indexed below by column position, so it must follow the sorted column order.
        col_words = _assign_row_words_to_columns(row, relevant_cols)
        for col_idx, col in enumerate(relevant_cols):
            cell = _ocr_single_cell(
                row_idx, col_idx, row, col,
                ocr_img, img_bgr, img_w, img_h,
                use_rapid, engine_name, lang, lang_map,
                preassigned_words=col_words[col_idx],
            )
            yield cell, columns_meta, total_cells