klausur-service (7 monoliths): - grid_editor_helpers.py (1,737 → 5 files: columns, filters, headers, zones) - cv_cell_grid.py (1,675 → 7 files: build, legacy, streaming, merge, vocab) - worksheet_editor_api.py (1,305 → 4 files: models, AI, reconstruct, routes) - legal_corpus_ingestion.py (1,280 → 3 files: registry, chunking, ingestion) - cv_review.py (1,248 → 4 files: pipeline, spell, LLM, barrel) - cv_preprocessing.py (1,166 → 3 files: deskew, dewarp, barrel) - rbac.py, admin_api.py, routes/eh.py remain (next batch) backend-lehrer (1 monolith): - classroom_engine/repository.py (1,705 → 7 files by domain) All re-export barrels preserve backward compatibility. Zero import errors verified. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
218 lines
7.0 KiB
Python
218 lines
7.0 KiB
Python
"""
|
|
Streaming variants of cell-grid builders (v2 + legacy).
|
|
|
|
Extracted from cv_cell_grid.py. These yield cells one-by-one as OCR'd,
|
|
useful for progress reporting.
|
|
|
|
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
|
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
|
"""
|
|
|
|
import logging
|
|
from typing import Any, Dict, Generator, List, Optional, Tuple
|
|
|
|
import numpy as np
|
|
|
|
from cv_vocab_types import PageRegion, RowGeometry
|
|
from cv_ocr_engines import (
|
|
RAPIDOCR_AVAILABLE,
|
|
_assign_row_words_to_columns,
|
|
)
|
|
from cv_cell_grid_helpers import (
|
|
_heal_row_gaps,
|
|
_is_artifact_row,
|
|
)
|
|
from cv_cell_grid_build import _ocr_cell_crop
|
|
from cv_cell_grid_legacy import _ocr_single_cell
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# build_cell_grid_v2_streaming
# ---------------------------------------------------------------------------
|
|
|
|
def build_cell_grid_v2_streaming(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
) -> Generator[Tuple[Dict[str, Any], List[Dict[str, Any]], int], None, None]:
    """Streaming variant of build_cell_grid_v2 -- yields each cell as OCR'd.

    Args:
        ocr_img: Preprocessed image the OCR crops are taken from.
        column_regions: Detected page regions; non-content types (margins,
            header/footer, ignored columns) are filtered out.
        row_geometries: Detected rows; only rows with row_type 'content'
            produce cells. Header/footer rows only supply heal boundaries.
        img_w: Page width in pixels.
        img_h: Page height in pixels.
        lang: OCR language string used when no per-column override applies.
        ocr_engine: "auto", "rapid", "trocr-printed", "trocr-handwritten",
            or "lighton". "auto" and unknown values use Tesseract; "rapid"
            falls back to Tesseract when RapidOCR is unavailable.
        img_bgr: Optional original BGR image (passed through to the cell
            OCR helper; some engines prefer the color image).

    Yields:
        (cell_dict, columns_meta, total_cells) for each row x column cell,
        in row-major order. ``total_cells`` lets callers report progress.
    """
    # --- Resolve the OCR engine -------------------------------------------
    if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
        engine_name = ocr_engine
    elif ocr_engine == "rapid":
        if RAPIDOCR_AVAILABLE:
            engine_name = "rapid"
        else:
            logger.warning("RapidOCR requested but not available, falling back to Tesseract")
            engine_name = "tesseract"
    else:
        # "auto" and any unknown value: the v2 pipeline defaults to Tesseract.
        engine_name = "tesseract"

    # --- Filter rows down to real content ---------------------------------
    content_rows = [r for r in row_geometries if r.row_type == 'content']
    if not content_rows:
        return

    # Drop phantom rows carrying no words. Log how many were skipped, for
    # consistency with build_cell_grid_streaming (the legacy variant).
    before = len(content_rows)
    content_rows = [r for r in content_rows if r.word_count > 0]
    skipped = before - len(content_rows)
    if skipped > 0:
        logger.info(f"build_cell_grid_v2_streaming: skipped {skipped} phantom rows (word_count=0)")
    if not content_rows:
        return

    _skip_types = {'column_ignore', 'header', 'footer', 'margin_top',
                   'margin_bottom', 'margin_left', 'margin_right'}
    relevant_cols = [c for c in column_regions if c.type not in _skip_types]
    if not relevant_cols:
        return

    # Drop artifact rows (stray marks, rules misdetected as rows) and log.
    before_art = len(content_rows)
    content_rows = [r for r in content_rows if not _is_artifact_row(r)]
    artifact_skipped = before_art - len(content_rows)
    if artifact_skipped > 0:
        logger.info(f"build_cell_grid_v2_streaming: skipped {artifact_skipped} artifact rows")
    if not content_rows:
        return

    # --- Heal vertical gaps between rows ----------------------------------
    # v2 uses header/footer boundaries (not column extents, as legacy does).
    content_rows.sort(key=lambda r: r.y)
    header_rows = [r for r in row_geometries if r.row_type == 'header']
    footer_rows = [r for r in row_geometries if r.row_type == 'footer']
    # Top bound: bottom edge of the lowest header, else top of first row.
    if header_rows:
        top_bound = max(r.y + r.height for r in header_rows)
    else:
        top_bound = content_rows[0].y
    # Bottom bound: top edge of the highest footer, else bottom of last row.
    if footer_rows:
        bottom_bound = min(r.y for r in footer_rows)
    else:
        bottom_bound = content_rows[-1].y + content_rows[-1].height

    _heal_row_gaps(content_rows, top_bound=top_bound, bottom_bound=bottom_bound)

    relevant_cols.sort(key=lambda c: c.x)

    columns_meta = [
        {'index': ci, 'type': c.type, 'x': c.x, 'width': c.width}
        for ci, c in enumerate(relevant_cols)
    ]

    # Per-column OCR language overrides; other column types fall back to `lang`.
    lang_map = {
        'column_en': 'eng',
        'column_de': 'deu',
        'column_example': 'eng+deu',
    }

    total_cells = len(content_rows) * len(relevant_cols)

    for row_idx, row in enumerate(content_rows):
        for col_idx, col in enumerate(relevant_cols):
            cell = _ocr_cell_crop(
                row_idx, col_idx, row, col,
                ocr_img, img_bgr, img_w, img_h,
                engine_name, lang, lang_map,
            )
            yield cell, columns_meta, total_cells
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# build_cell_grid_streaming — legacy streaming variant
# ---------------------------------------------------------------------------
|
|
|
|
def build_cell_grid_streaming(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
) -> Generator[Tuple[Dict[str, Any], List[Dict[str, Any]], int], None, None]:
    """Like build_cell_grid(), but yields each cell as it is OCR'd.

    DEPRECATED: Use build_cell_grid_v2_streaming instead.

    Yields:
        (cell_dict, columns_meta, total_cells) for each cell.
    """
    # Decide which OCR backend handles this page.
    use_rapid = False
    if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
        engine_name = ocr_engine
    else:
        if ocr_engine == "auto":
            # Legacy auto-mode prefers RapidOCR when a color image is given.
            use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None
        elif ocr_engine == "rapid":
            if RAPIDOCR_AVAILABLE:
                use_rapid = True
            else:
                logger.warning("RapidOCR requested but not available, falling back to Tesseract")
        engine_name = "rapid" if use_rapid else "tesseract"

    rows = [r for r in row_geometries if r.row_type == 'content']
    if not rows:
        return

    # Phantom rows (no detected words) yield nothing useful — drop them.
    n_before = len(rows)
    rows = [r for r in rows if r.word_count > 0]
    n_phantom = n_before - len(rows)
    if n_phantom > 0:
        logger.info(f"build_cell_grid_streaming: skipped {n_phantom} phantom rows (word_count=0)")
    if not rows:
        return

    skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'}
    cols = [c for c in column_regions if c.type not in skip_types]
    if not cols:
        return

    # Drop rows classified as layout artifacts.
    n_before = len(rows)
    rows = [r for r in rows if not _is_artifact_row(r)]
    n_artifacts = n_before - len(rows)
    if n_artifacts > 0:
        logger.info(f"build_cell_grid_streaming: skipped {n_artifacts} artifact rows")
    if not rows:
        return

    # Legacy gap healing uses the columns' vertical extent as boundaries.
    _heal_row_gaps(
        rows,
        top_bound=min(c.y for c in cols),
        bottom_bound=max(c.y + c.height for c in cols),
    )

    cols.sort(key=lambda c: c.x)

    columns_meta = [
        {'index': i, 'type': c.type, 'x': c.x, 'width': c.width}
        for i, c in enumerate(cols)
    ]

    # Per-column OCR language overrides.
    lang_map = {
        'column_en': 'eng',
        'column_de': 'deu',
        'column_example': 'eng+deu',
    }

    total_cells = len(rows) * len(cols)

    for row_idx, row in enumerate(rows):
        # Pre-assign this row's words to columns so each cell OCR call can
        # reuse them instead of re-detecting.
        words_per_col = _assign_row_words_to_columns(row, cols)
        for col_idx, col in enumerate(cols):
            yield (
                _ocr_single_cell(
                    row_idx, col_idx, row, col,
                    ocr_img, img_bgr, img_w, img_h,
                    use_rapid, engine_name, lang, lang_map,
                    preassigned_words=words_per_col[col_idx],
                ),
                columns_meta,
                total_cells,
            )
|