Files
breakpilot-lehrer/klausur-service/backend/cv_cell_grid_legacy.py
Benjamin Admin b2a0126f14 [split-required] Split remaining Python monoliths (Phase 1 continued)
klausur-service (7 monoliths):
- grid_editor_helpers.py (1,737 → 5 files: columns, filters, headers, zones)
- cv_cell_grid.py (1,675 → 7 files: build, legacy, streaming, merge, vocab)
- worksheet_editor_api.py (1,305 → 4 files: models, AI, reconstruct, routes)
- legal_corpus_ingestion.py (1,280 → 3 files: registry, chunking, ingestion)
- cv_review.py (1,248 → 4 files: pipeline, spell, LLM, barrel)
- cv_preprocessing.py (1,166 → 3 files: deskew, dewarp, barrel)
- rbac.py, admin_api.py, routes/eh.py remain (next batch)

backend-lehrer (1 monolith):
- classroom_engine/repository.py (1,705 → 7 files by domain)

All re-export barrels preserve backward compatibility.
Zero import errors verified.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-24 22:47:59 +02:00

437 lines
17 KiB
Python

"""
Legacy cell-grid construction (v1) -- DEPRECATED, kept for backward compat.
Extracted from cv_cell_grid.py. Prefer build_cell_grid_v2 for new code.
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import logging
from typing import Any, Dict, List, Optional, Tuple

import numpy as np

from cv_cell_grid_helpers import (
    _MIN_WORD_CONF,
    _compute_cell_padding,
    _ensure_minimum_crop_size,
    _heal_row_gaps,
    _is_artifact_row,
    _select_psm_for_column,
)
from cv_ocr_engines import (
    RAPIDOCR_AVAILABLE,
    _assign_row_words_to_columns,
    _clean_cell_text,
    _words_to_reading_order_text,
    ocr_region,
    ocr_region_lighton,
    ocr_region_rapid,
    ocr_region_trocr,
)
from cv_vocab_types import PageRegion, RowGeometry
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# _ocr_single_cell — legacy per-cell OCR with multi-level fallback
# ---------------------------------------------------------------------------
def _ocr_single_cell(
    row_idx: int,
    col_idx: int,
    row: RowGeometry,
    col: PageRegion,
    ocr_img: np.ndarray,
    img_bgr: Optional[np.ndarray],
    img_w: int,
    img_h: int,
    use_rapid: bool,
    engine_name: str,
    lang: str,
    lang_map: Dict[str, str],
    preassigned_words: Optional[List[Dict]] = None,
) -> Dict[str, Any]:
    """Populate a single cell (column x row intersection) via word lookup.

    OCR strategy, in order; the first level yielding non-empty text wins:
      1. PRIMARY   -- reuse ``preassigned_words`` from the full-page pass.
      2. FALLBACK  -- per-cell OCR, attempted only when the padded crop
         contains enough dark pixels to plausibly hold ink.
      3. SECONDARY -- Tesseract PSM=7 (single text line) retry.
      4. TERTIARY  -- RapidOCR over the full row strip, keeping the words
         that horizontally overlap this column (narrow columns only).

    Args:
        row_idx, col_idx: grid coordinates, used to build the cell id.
        row, col: geometry of the intersecting row and column.
        ocr_img: preprocessed page image used for Tesseract calls
            (assumed grayscale-like: dark-pixel test uses ``< 180``).
        img_bgr: original BGR page for Rapid/TrOCR/LightOn (may be None).
        img_w, img_h: page dimensions in pixels.
        use_rapid: whether RapidOCR is the preferred fallback engine.
        engine_name: resolved engine label ("tesseract", "rapid", "lighton",
            "trocr-printed", "trocr-handwritten").
        lang, lang_map: default Tesseract language and per-column-type
            overrides.
        preassigned_words: word dicts already attributed to this cell by
            the caller; None is treated as an empty list.

    Returns:
        Cell dict: id, indices, column type, text, confidence, pixel and
        percent bounding boxes, and the OCR stage that produced the text.
    """
    # Display bbox: exact column x row intersection (no padding).
    disp_x = col.x
    disp_y = row.y
    disp_w = col.width
    disp_h = row.height
    # FIX: the is_narrow expression below already anticipates img_w == 0,
    # but the bbox_pct divisions did not -- clamp both divisors so a
    # degenerate page size cannot raise ZeroDivisionError.
    safe_w = img_w if img_w > 0 else 1
    safe_h = img_h if img_h > 0 else 1
    # OCR crop: adaptive padding -- narrow columns get more context.
    pad = _compute_cell_padding(col.width, img_w)
    cell_x = max(0, col.x - pad)
    cell_y = max(0, row.y - pad)
    cell_w = min(col.width + 2 * pad, img_w - cell_x)
    cell_h = min(row.height + 2 * pad, img_h - cell_y)
    is_narrow = (col.width / img_w * 100) < 15 if img_w > 0 else False
    if disp_w <= 0 or disp_h <= 0:
        # Degenerate geometry: emit an empty cell so the grid stays rectangular.
        return {
            'cell_id': f"R{row_idx:02d}_C{col_idx}",
            'row_index': row_idx,
            'col_index': col_idx,
            'col_type': col.type,
            'text': '',
            'confidence': 0.0,
            'bbox_px': {'x': col.x, 'y': row.y, 'w': col.width, 'h': row.height},
            'bbox_pct': {
                'x': round(col.x / safe_w * 100, 2),
                'y': round(row.y / safe_h * 100, 2),
                'w': round(col.width / safe_w * 100, 2),
                'h': round(row.height / safe_h * 100, 2),
            },
            'ocr_engine': 'word_lookup',
        }
    # --- PRIMARY: Word-lookup from full-page Tesseract ---
    words = preassigned_words if preassigned_words is not None else []
    used_engine = 'word_lookup'
    # Filter low-confidence words before assembling text.
    if words:
        words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF]
    if words:
        # Full row height as y-tolerance: the whole cell reads as one line.
        y_tol = max(15, row.height)
        text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
        avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
    else:
        text = ''
        avg_conf = 0.0
    # --- FALLBACK: Cell-OCR for empty cells ---
    # Only re-OCR when the crop actually contains ink (>0.5% dark pixels).
    _run_fallback = False
    if not text.strip() and cell_w > 0 and cell_h > 0:
        if ocr_img is not None:
            crop = ocr_img[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w]
            if crop.size > 0:
                dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
                _run_fallback = dark_ratio > 0.005
    if _run_fallback:
        if is_narrow and ocr_img is not None:
            # For narrow columns, upscale the crop before OCR.
            _crop_slice = ocr_img[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w]
            _upscaled = _ensure_minimum_crop_size(_crop_slice)
            if _upscaled is not _crop_slice:
                _up_h, _up_w = _upscaled.shape[:2]
                # OCR the upscaled crop in its own coordinate system.
                _tmp_region = PageRegion(
                    type=col.type, x=0, y=0, width=_up_w, height=_up_h,
                )
                _cell_psm = _select_psm_for_column(col.type, col.width, row.height)
                cell_lang = lang_map.get(col.type, lang)
                fallback_words = ocr_region(_upscaled, _tmp_region,
                                            lang=cell_lang, psm=_cell_psm)
                # Remap word positions back to original image coordinates.
                _sx = cell_w / max(_up_w, 1)
                _sy = cell_h / max(_up_h, 1)
                for _fw in (fallback_words or []):
                    _fw['left'] = int(_fw['left'] * _sx) + cell_x
                    _fw['top'] = int(_fw['top'] * _sy) + cell_y
                    _fw['width'] = int(_fw['width'] * _sx)
                    _fw['height'] = int(_fw['height'] * _sy)
            else:
                # Crop was already large enough: OCR it in place.
                cell_region = PageRegion(
                    type=col.type, x=cell_x, y=cell_y,
                    width=cell_w, height=cell_h,
                )
                _cell_psm = _select_psm_for_column(col.type, col.width, row.height)
                cell_lang = lang_map.get(col.type, lang)
                fallback_words = ocr_region(ocr_img, cell_region,
                                            lang=cell_lang, psm=_cell_psm)
        else:
            # Regular-width column: pick the engine resolved by the caller.
            cell_region = PageRegion(
                type=col.type,
                x=cell_x, y=cell_y,
                width=cell_w, height=cell_h,
            )
            if engine_name in ("trocr-printed", "trocr-handwritten") and img_bgr is not None:
                fallback_words = ocr_region_trocr(img_bgr, cell_region, handwritten=(engine_name == "trocr-handwritten"))
            elif engine_name == "lighton" and img_bgr is not None:
                fallback_words = ocr_region_lighton(img_bgr, cell_region)
            elif use_rapid and img_bgr is not None:
                fallback_words = ocr_region_rapid(img_bgr, cell_region)
            else:
                _cell_psm = _select_psm_for_column(col.type, col.width, row.height)
                cell_lang = lang_map.get(col.type, lang)
                fallback_words = ocr_region(ocr_img, cell_region,
                                            lang=cell_lang, psm=_cell_psm)
        if fallback_words:
            fallback_words = [w for w in fallback_words if w.get('conf', 0) >= _MIN_WORD_CONF]
        if fallback_words:
            # y-tolerance derived from the words' own average height here,
            # not from the row height.
            fb_avg_h = sum(w['height'] for w in fallback_words) / len(fallback_words)
            fb_y_tol = max(10, int(fb_avg_h * 0.5))
            fb_text = _words_to_reading_order_text(fallback_words, y_tolerance_px=fb_y_tol)
            if fb_text.strip():
                text = fb_text
                avg_conf = round(
                    sum(w['conf'] for w in fallback_words) / len(fallback_words), 1
                )
                used_engine = 'cell_ocr_fallback'
    # --- SECONDARY FALLBACK: PSM=7 (single line) for still-empty cells ---
    if not text.strip() and _run_fallback and not use_rapid:
        _fb_region = PageRegion(
            type=col.type, x=cell_x, y=cell_y,
            width=cell_w, height=cell_h,
        )
        cell_lang = lang_map.get(col.type, lang)
        psm7_words = ocr_region(ocr_img, _fb_region, lang=cell_lang, psm=7)
        if psm7_words:
            psm7_words = [w for w in psm7_words if w.get('conf', 0) >= _MIN_WORD_CONF]
        if psm7_words:
            p7_text = _words_to_reading_order_text(psm7_words, y_tolerance_px=10)
            if p7_text.strip():
                text = p7_text
                avg_conf = round(
                    sum(w['conf'] for w in psm7_words) / len(psm7_words), 1
                )
                used_engine = 'cell_ocr_psm7'
    # --- TERTIARY FALLBACK: Row-strip re-OCR for narrow columns ---
    if not text.strip() and is_narrow and img_bgr is not None:
        # OCR the whole row once, then keep words overlapping this column.
        row_region = PageRegion(
            type='_row_strip', x=0, y=row.y,
            width=img_w, height=row.height,
        )
        strip_words = ocr_region_rapid(img_bgr, row_region)
        if strip_words:
            col_left = col.x
            col_right = col.x + col.width
            col_words = []
            for sw in strip_words:
                sw_left = sw.get('left', 0)
                sw_right = sw_left + sw.get('width', 0)
                overlap = max(0, min(sw_right, col_right) - max(sw_left, col_left))
                # Keep words whose horizontal overlap exceeds 30% of the word.
                if overlap > sw.get('width', 1) * 0.3:
                    col_words.append(sw)
            if col_words:
                col_words = [w for w in col_words if w.get('conf', 0) >= _MIN_WORD_CONF]
            if col_words:
                rs_text = _words_to_reading_order_text(col_words, y_tolerance_px=row.height)
                if rs_text.strip():
                    text = rs_text
                    avg_conf = round(
                        sum(w['conf'] for w in col_words) / len(col_words), 1
                    )
                    used_engine = 'row_strip_rapid'
    # --- NOISE FILTER: clear cells that contain only OCR artifacts ---
    if text.strip():
        text = _clean_cell_text(text)
        if not text:
            avg_conf = 0.0
    return {
        'cell_id': f"R{row_idx:02d}_C{col_idx}",
        'row_index': row_idx,
        'col_index': col_idx,
        'col_type': col.type,
        'text': text,
        'confidence': avg_conf,
        'bbox_px': {'x': disp_x, 'y': disp_y, 'w': disp_w, 'h': disp_h},
        'bbox_pct': {
            'x': round(disp_x / safe_w * 100, 2),
            'y': round(disp_y / safe_h * 100, 2),
            'w': round(disp_w / safe_w * 100, 2),
            'h': round(disp_h / safe_h * 100, 2),
        },
        'ocr_engine': used_engine,
    }
# ---------------------------------------------------------------------------
# build_cell_grid — legacy grid builder (DEPRECATED)
# ---------------------------------------------------------------------------
def build_cell_grid(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Generic Cell-Grid: Columns x Rows -> cells with OCR text.

    DEPRECATED: Use build_cell_grid_v2 instead.

    Pipeline:
      1. Resolve the OCR engine from ``ocr_engine`` and availability.
      2. Filter rows: content-type only, non-phantom (word_count > 0),
         non-artifact.
      3. Heal vertical gaps between the surviving rows.
      4. OCR every column x row intersection via ``_ocr_single_cell``.
      5. Batch fallback: any column with >= 3 still-empty, inky cells gets
         one strip-OCR pass whose words are matched back to cells by their
         vertical centers.
      6. Drop rows whose cells are all empty.

    Args:
        ocr_img: preprocessed page image used for Tesseract calls.
        column_regions: detected column regions (non-content types skipped).
        row_geometries: detected row geometries (only 'content' rows used).
        img_w, img_h: page dimensions in pixels.
        lang: default Tesseract language string.
        ocr_engine: "auto", "rapid", "tesseract", "lighton", or a TrOCR
            variant ("trocr-printed" / "trocr-handwritten").
        img_bgr: original BGR image, required by Rapid/TrOCR/LightOn engines.

    Returns:
        (cells, columns_meta): flat list of cell dicts and per-column
        metadata ({'index', 'type', 'x', 'width'}); both empty when no
        usable rows/columns remain.
    """
    # Resolve engine choice
    use_rapid = False
    if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
        engine_name = ocr_engine
    elif ocr_engine == "auto":
        # Prefer RapidOCR when installed AND a BGR image is available.
        use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None
        engine_name = "rapid" if use_rapid else "tesseract"
    elif ocr_engine == "rapid":
        if not RAPIDOCR_AVAILABLE:
            logger.warning("RapidOCR requested but not available, falling back to Tesseract")
        else:
            use_rapid = True
        # NOTE(review): if img_bgr is None here, engine_name still reads
        # "rapid" although every rapid call site also requires img_bgr and
        # will silently use Tesseract -- confirm the label is intended.
        engine_name = "rapid" if use_rapid else "tesseract"
    else:
        engine_name = "tesseract"
    logger.info(f"build_cell_grid: using OCR engine '{engine_name}'")
    # Filter to content rows only (skip header/footer)
    content_rows = [r for r in row_geometries if r.row_type == 'content']
    if not content_rows:
        logger.warning("build_cell_grid: no content rows found")
        return [], []
    # Drop phantom rows that the full-page pass found no words in.
    before = len(content_rows)
    content_rows = [r for r in content_rows if r.word_count > 0]
    skipped = before - len(content_rows)
    if skipped > 0:
        logger.info(f"build_cell_grid: skipped {skipped} phantom rows (word_count=0)")
    if not content_rows:
        logger.warning("build_cell_grid: no content rows with words found")
        return [], []
    # Column types that never carry cell content.
    _skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'}
    relevant_cols = [c for c in column_regions if c.type not in _skip_types]
    if not relevant_cols:
        logger.warning("build_cell_grid: no usable columns found")
        return [], []
    # Drop artifact rows (rows made only of single-char OCR noise).
    before_art = len(content_rows)
    content_rows = [r for r in content_rows if not _is_artifact_row(r)]
    artifact_skipped = before_art - len(content_rows)
    if artifact_skipped > 0:
        logger.info(f"build_cell_grid: skipped {artifact_skipped} artifact rows (all single-char words)")
    if not content_rows:
        logger.warning("build_cell_grid: no content rows after artifact filtering")
        return [], []
    # Close vertical gaps between rows, bounded by the columns' extent.
    _heal_row_gaps(
        content_rows,
        top_bound=min(c.y for c in relevant_cols),
        bottom_bound=max(c.y + c.height for c in relevant_cols),
    )
    # Left-to-right column order defines col_index.
    relevant_cols.sort(key=lambda c: c.x)
    columns_meta = [
        {
            'index': col_idx,
            'type': col.type,
            'x': col.x,
            'width': col.width,
        }
        for col_idx, col in enumerate(relevant_cols)
    ]
    # Per-column-type Tesseract language overrides.
    lang_map = {
        'column_en': 'eng',
        'column_de': 'deu',
        'column_example': 'eng+deu',
    }
    cells: List[Dict[str, Any]] = []
    for row_idx, row in enumerate(content_rows):
        # Distribute the row's full-page words across columns once per row.
        col_words = _assign_row_words_to_columns(row, relevant_cols)
        for col_idx, col in enumerate(relevant_cols):
            cell = _ocr_single_cell(
                row_idx, col_idx, row, col,
                ocr_img, img_bgr, img_w, img_h,
                use_rapid, engine_name, lang, lang_map,
                preassigned_words=col_words[col_idx],
            )
            cells.append(cell)
    # --- BATCH FALLBACK: re-OCR empty cells by column strip ---
    # Collect still-empty cells that visibly contain ink; cells that already
    # failed the PSM=7 retry are excluded from another attempt.
    empty_by_col: Dict[int, List[int]] = {}
    for ci, cell in enumerate(cells):
        if not cell['text'].strip() and cell.get('ocr_engine') != 'cell_ocr_psm7':
            bpx = cell['bbox_px']
            x, y, w, h = bpx['x'], bpx['y'], bpx['w'], bpx['h']
            if w > 0 and h > 0 and ocr_img is not None:
                crop = ocr_img[y:y + h, x:x + w]
                if crop.size > 0:
                    # >0.5% dark pixels => the cell plausibly holds text.
                    dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
                    if dark_ratio > 0.005:
                        empty_by_col.setdefault(cell['col_index'], []).append(ci)
    for col_idx, cell_indices in empty_by_col.items():
        # One strip-OCR pass only pays off for >= 3 empty cells in a column.
        if len(cell_indices) < 3:
            continue
        # Vertical span covering all empty cells of this column.
        min_y = min(cells[ci]['bbox_px']['y'] for ci in cell_indices)
        max_y_h = max(cells[ci]['bbox_px']['y'] + cells[ci]['bbox_px']['h'] for ci in cell_indices)
        col_x = cells[cell_indices[0]]['bbox_px']['x']
        col_w = cells[cell_indices[0]]['bbox_px']['w']
        strip_region = PageRegion(
            type=relevant_cols[col_idx].type,
            x=col_x, y=min_y,
            width=col_w, height=max_y_h - min_y,
        )
        strip_lang = lang_map.get(relevant_cols[col_idx].type, lang)
        if engine_name in ("trocr-printed", "trocr-handwritten") and img_bgr is not None:
            strip_words = ocr_region_trocr(img_bgr, strip_region, handwritten=(engine_name == "trocr-handwritten"))
        elif engine_name == "lighton" and img_bgr is not None:
            strip_words = ocr_region_lighton(img_bgr, strip_region)
        elif use_rapid and img_bgr is not None:
            strip_words = ocr_region_rapid(img_bgr, strip_region)
        else:
            strip_words = ocr_region(ocr_img, strip_region, lang=strip_lang, psm=6)
        if not strip_words:
            continue
        # Fixed confidence floor of 30 for strip words.
        strip_words = [w for w in strip_words if w.get('conf', 0) >= 30]
        if not strip_words:
            continue
        for ci in cell_indices:
            cell_y = cells[ci]['bbox_px']['y']
            cell_h = cells[ci]['bbox_px']['h']
            cell_mid_y = cell_y + cell_h / 2
            # Match strip words back to cells by vertical-center distance.
            matched_words = [
                w for w in strip_words
                if abs((w['top'] + w['height'] / 2) - cell_mid_y) < cell_h * 0.8
            ]
            if matched_words:
                matched_words.sort(key=lambda w: w['left'])
                batch_text = ' '.join(w['text'] for w in matched_words)
                batch_text = _clean_cell_text(batch_text)
                if batch_text.strip():
                    cells[ci]['text'] = batch_text
                    cells[ci]['confidence'] = round(
                        sum(w['conf'] for w in matched_words) / len(matched_words), 1
                    )
                    cells[ci]['ocr_engine'] = 'batch_column_ocr'
        batch_filled = sum(1 for ci in cell_indices if cells[ci]['text'].strip())
        if batch_filled > 0:
            logger.info(
                f"build_cell_grid: batch OCR filled {batch_filled}/{len(cell_indices)} "
                f"empty cells in column {col_idx}"
            )
    # Remove all-empty rows
    rows_with_text: set = set()
    for cell in cells:
        if cell['text'].strip():
            rows_with_text.add(cell['row_index'])
    before_filter = len(cells)
    cells = [c for c in cells if c['row_index'] in rows_with_text]
    # Each removed row contributed exactly len(relevant_cols) cells.
    empty_rows_removed = (before_filter - len(cells)) // max(len(relevant_cols), 1)
    if empty_rows_removed > 0:
        logger.info(f"build_cell_grid: removed {empty_rows_removed} all-empty rows after OCR")
    logger.info(f"build_cell_grid: {len(cells)} cells from "
                f"{len(content_rows)} rows x {len(relevant_cols)} columns, "
                f"engine={engine_name}")
    return cells, columns_meta