[split-required] Split remaining Python monoliths (Phase 1 continued)
klausur-service (7 monoliths): - grid_editor_helpers.py (1,737 → 5 files: columns, filters, headers, zones) - cv_cell_grid.py (1,675 → 7 files: build, legacy, streaming, merge, vocab) - worksheet_editor_api.py (1,305 → 4 files: models, AI, reconstruct, routes) - legal_corpus_ingestion.py (1,280 → 3 files: registry, chunking, ingestion) - cv_review.py (1,248 → 4 files: pipeline, spell, LLM, barrel) - cv_preprocessing.py (1,166 → 3 files: deskew, dewarp, barrel) - rbac.py, admin_api.py, routes/eh.py remain (next batch) backend-lehrer (1 monolith): - classroom_engine/repository.py (1,705 → 7 files by domain) All re-export barrels preserve backward compatibility. Zero import errors verified. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
136
klausur-service/backend/cv_cell_grid_helpers.py
Normal file
136
klausur-service/backend/cv_cell_grid_helpers.py
Normal file
@@ -0,0 +1,136 @@
|
||||
"""
|
||||
Shared helpers for cell-grid construction (v2 + legacy).
|
||||
|
||||
Extracted from cv_cell_grid.py — used by both cv_cell_grid_build and
|
||||
cv_cell_grid_legacy.
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import List
|
||||
|
||||
import numpy as np
|
||||
|
||||
from cv_vocab_types import RowGeometry
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
try:
|
||||
import cv2
|
||||
except ImportError:
|
||||
cv2 = None # type: ignore[assignment]
|
||||
|
||||
# Minimum OCR word confidence to keep (used across multiple functions)
|
||||
_MIN_WORD_CONF = 30
|
||||
|
||||
|
||||
def _compute_cell_padding(col_width: int, img_w: int) -> int:
|
||||
"""Adaptive padding for OCR crops based on column width.
|
||||
|
||||
Narrow columns (page_ref, marker) need more surrounding context so
|
||||
Tesseract can segment characters correctly. Wide columns keep the
|
||||
minimal 4 px padding to avoid pulling in neighbours.
|
||||
"""
|
||||
col_pct = col_width / img_w * 100 if img_w > 0 else 100
|
||||
if col_pct < 5:
|
||||
return max(20, col_width // 2)
|
||||
if col_pct < 10:
|
||||
return max(12, col_width // 4)
|
||||
if col_pct < 15:
|
||||
return 8
|
||||
return 4
|
||||
|
||||
|
||||
def _ensure_minimum_crop_size(crop: np.ndarray, min_dim: int = 150,
|
||||
max_scale: int = 3) -> np.ndarray:
|
||||
"""Upscale tiny crops so Tesseract gets enough pixel data.
|
||||
|
||||
If either dimension is below *min_dim*, the crop is bicubic-upscaled
|
||||
so the smallest dimension reaches *min_dim* (capped at *max_scale* x).
|
||||
"""
|
||||
h, w = crop.shape[:2]
|
||||
if h >= min_dim and w >= min_dim:
|
||||
return crop
|
||||
scale = min(max_scale, max(min_dim / max(h, 1), min_dim / max(w, 1)))
|
||||
if scale <= 1.0:
|
||||
return crop
|
||||
new_w = int(w * scale)
|
||||
new_h = int(h * scale)
|
||||
return cv2.resize(crop, (new_w, new_h), interpolation=cv2.INTER_CUBIC)
|
||||
|
||||
|
||||
def _select_psm_for_column(col_type: str, col_width: int,
|
||||
row_height: int) -> int:
|
||||
"""Choose the best Tesseract PSM for a given column geometry.
|
||||
|
||||
- page_ref columns are almost always single short tokens -> PSM 8
|
||||
- Very narrow or short cells -> PSM 7 (single text line)
|
||||
- Everything else -> PSM 6 (uniform block)
|
||||
"""
|
||||
if col_type in ('page_ref', 'marker'):
|
||||
return 8 # single word
|
||||
if col_width < 100 or row_height < 30:
|
||||
return 7 # single line
|
||||
return 6 # uniform block
|
||||
|
||||
|
||||
def _is_artifact_row(row: RowGeometry) -> bool:
    """Return True if this row contains only scan artifacts, not real text.

    Scanner shadows and speckle noise typically OCR into isolated
    single-character detections; a genuine content row always yields at
    least one token of two or more characters.
    """
    if row.word_count == 0:
        return True
    # Equivalent to all(len(t) <= 1): no token longer than one character.
    return not any(len(word.get('text', '').strip()) > 1 for word in row.words)
|
||||
|
||||
|
||||
def _heal_row_gaps(
    rows: List[RowGeometry],
    top_bound: int,
    bottom_bound: int,
) -> None:
    """Expand row y/height to fill vertical gaps caused by removed adjacent rows.

    After filtering out empty or artifact rows, remaining content rows may have
    gaps between them where the removed rows used to be. This function mutates
    each row to extend upward/downward to the midpoint of such gaps so that
    OCR crops cover the full available content area.

    The first row always extends to top_bound; the last row to bottom_bound.
    Note: rows is also sorted in place by y as a side effect.
    """
    if not rows:
        return
    rows.sort(key=lambda r: r.y)
    n = len(rows)
    # Snapshot (top, bottom) BEFORE mutating anything, so every midpoint is
    # computed against the original geometry rather than partially-updated rows.
    orig = [(r.y, r.y + r.height) for r in rows]

    for i, row in enumerate(rows):
        # New top: midpoint between previous row's bottom and this row's top.
        if i == 0:
            new_top = top_bound
        else:
            prev_bot = orig[i - 1][1]
            my_top = orig[i][0]
            gap = my_top - prev_bot
            # gap <= 1 (adjacent or overlapping rows): keep the original top.
            new_top = prev_bot + gap // 2 if gap > 1 else my_top

        # New bottom: midpoint between this row's bottom and next row's top.
        if i == n - 1:
            new_bottom = bottom_bound
        else:
            my_bot = orig[i][1]
            next_top = orig[i + 1][0]
            gap = next_top - my_bot
            new_bottom = my_bot + gap // 2 if gap > 1 else my_bot

        row.y = new_top
        # Clamp to a small positive height in case the bounds cross.
        row.height = max(5, new_bottom - new_top)

    # Lazy %-style args: the message is only formatted when DEBUG is enabled,
    # unlike the former f-string which always paid the formatting cost.
    logger.debug(
        "_heal_row_gaps: %d rows -> y range [%d..%d] (bounds: top=%d, bottom=%d)",
        n, rows[0].y, rows[-1].y + rows[-1].height, top_bound, bottom_bound,
    )
|
||||
Reference in New Issue
Block a user