klausur-service (7 monoliths): - grid_editor_helpers.py (1,737 → 5 files: columns, filters, headers, zones) - cv_cell_grid.py (1,675 → 7 files: build, legacy, streaming, merge, vocab) - worksheet_editor_api.py (1,305 → 4 files: models, AI, reconstruct, routes) - legal_corpus_ingestion.py (1,280 → 3 files: registry, chunking, ingestion) - cv_review.py (1,248 → 4 files: pipeline, spell, LLM, barrel) - cv_preprocessing.py (1,166 → 3 files: deskew, dewarp, barrel) - rbac.py, admin_api.py, routes/eh.py remain (next batch) backend-lehrer (1 monolith): - classroom_engine/repository.py (1,705 → 7 files by domain) All re-export barrels preserve backward compatibility. Zero import errors verified. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
137 lines
4.4 KiB
Python
137 lines
4.4 KiB
Python
"""
|
|
Shared helpers for cell-grid construction (v2 + legacy).
|
|
|
|
Extracted from cv_cell_grid.py — used by both cv_cell_grid_build and
|
|
cv_cell_grid_legacy.
|
|
|
|
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
|
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
|
"""
|
|
|
|
import logging
from typing import List

import numpy as np

from cv_vocab_types import RowGeometry

# Module-level logger; handlers and level are configured by the application.
logger = logging.getLogger(__name__)

# OpenCV is optional: when it is not installed, cv2 stays None and code
# paths that need it must check for the sentinel before use.
try:
    import cv2
except ImportError:
    cv2 = None  # type: ignore[assignment]

# Minimum OCR word confidence to keep (used across multiple functions)
_MIN_WORD_CONF = 30
|
|
|
|
|
|
def _compute_cell_padding(col_width: int, img_w: int) -> int:
|
|
"""Adaptive padding for OCR crops based on column width.
|
|
|
|
Narrow columns (page_ref, marker) need more surrounding context so
|
|
Tesseract can segment characters correctly. Wide columns keep the
|
|
minimal 4 px padding to avoid pulling in neighbours.
|
|
"""
|
|
col_pct = col_width / img_w * 100 if img_w > 0 else 100
|
|
if col_pct < 5:
|
|
return max(20, col_width // 2)
|
|
if col_pct < 10:
|
|
return max(12, col_width // 4)
|
|
if col_pct < 15:
|
|
return 8
|
|
return 4
|
|
|
|
|
|
def _ensure_minimum_crop_size(crop: np.ndarray, min_dim: int = 150,
|
|
max_scale: int = 3) -> np.ndarray:
|
|
"""Upscale tiny crops so Tesseract gets enough pixel data.
|
|
|
|
If either dimension is below *min_dim*, the crop is bicubic-upscaled
|
|
so the smallest dimension reaches *min_dim* (capped at *max_scale* x).
|
|
"""
|
|
h, w = crop.shape[:2]
|
|
if h >= min_dim and w >= min_dim:
|
|
return crop
|
|
scale = min(max_scale, max(min_dim / max(h, 1), min_dim / max(w, 1)))
|
|
if scale <= 1.0:
|
|
return crop
|
|
new_w = int(w * scale)
|
|
new_h = int(h * scale)
|
|
return cv2.resize(crop, (new_w, new_h), interpolation=cv2.INTER_CUBIC)
|
|
|
|
|
|
def _select_psm_for_column(col_type: str, col_width: int,
|
|
row_height: int) -> int:
|
|
"""Choose the best Tesseract PSM for a given column geometry.
|
|
|
|
- page_ref columns are almost always single short tokens -> PSM 8
|
|
- Very narrow or short cells -> PSM 7 (single text line)
|
|
- Everything else -> PSM 6 (uniform block)
|
|
"""
|
|
if col_type in ('page_ref', 'marker'):
|
|
return 8 # single word
|
|
if col_width < 100 or row_height < 30:
|
|
return 7 # single line
|
|
return 6 # uniform block
|
|
|
|
|
|
def _is_artifact_row(row: RowGeometry) -> bool:
|
|
"""Return True if this row contains only scan artifacts, not real text.
|
|
|
|
Artifact rows (scanner shadows, noise) typically produce only single-character
|
|
detections. A real content row always has at least one token with 2+ characters.
|
|
"""
|
|
if row.word_count == 0:
|
|
return True
|
|
texts = [w.get('text', '').strip() for w in row.words]
|
|
return all(len(t) <= 1 for t in texts)
|
|
|
|
|
|
def _heal_row_gaps(
    rows: List[RowGeometry],
    top_bound: int,
    bottom_bound: int,
) -> None:
    """Expand row y/height in place to fill vertical gaps between rows.

    After filtering out empty or artifact rows, the surviving content
    rows may have gaps between them where the removed rows used to be.
    Each row is mutated to extend upward/downward to the midpoint of
    such gaps so that OCR crops cover the full available content area.

    The first row always extends to *top_bound*; the last row to
    *bottom_bound*.  ``rows`` is also sorted by ``y`` as a side effect.

    Args:
        rows: Row geometries to mutate; may be empty (no-op).
        top_bound: Upper y limit the first row is stretched to.
        bottom_bound: Lower y limit the last row is stretched to.
    """
    if not rows:
        return
    rows.sort(key=lambda r: r.y)
    n = len(rows)
    # Snapshot (top, bottom) before mutation: midpoints must be computed
    # from the original geometry, not from rows already stretched in
    # this pass.
    orig = [(r.y, r.y + r.height) for r in rows]

    for i, row in enumerate(rows):
        # New top: the bound for the first row, otherwise the midpoint
        # of the gap between the previous row's original bottom and this
        # row's original top (gaps of <=1 px are left untouched).
        if i == 0:
            new_top = top_bound
        else:
            prev_bot = orig[i - 1][1]
            my_top = orig[i][0]
            gap = my_top - prev_bot
            new_top = prev_bot + gap // 2 if gap > 1 else my_top

        # New bottom: the bound for the last row, otherwise the midpoint
        # of the gap down to the next row's original top.
        if i == n - 1:
            new_bottom = bottom_bound
        else:
            my_bot = orig[i][1]
            next_top = orig[i + 1][0]
            gap = next_top - my_bot
            new_bottom = my_bot + gap // 2 if gap > 1 else my_bot

        row.y = new_top
        # Guard against degenerate/inverted spans (e.g. bounds tighter
        # than the row itself): keep at least 5 px of crop height.
        row.height = max(5, new_bottom - new_top)

    # Lazy %-style arguments: formatting work is skipped entirely when
    # DEBUG logging is disabled (the f-string version always formatted).
    logger.debug(
        "_heal_row_gaps: %d rows -> y range [%d..%d] (bounds: top=%d, bottom=%d)",
        n, rows[0].y, rows[-1].y + rows[-1].height, top_bound, bottom_bound,
    )
|