[split-required] Split remaining Python monoliths (Phase 1 continued)
klausur-service (7 monoliths): - grid_editor_helpers.py (1,737 → 5 files: columns, filters, headers, zones) - cv_cell_grid.py (1,675 → 7 files: build, legacy, streaming, merge, vocab) - worksheet_editor_api.py (1,305 → 4 files: models, AI, reconstruct, routes) - legal_corpus_ingestion.py (1,280 → 3 files: registry, chunking, ingestion) - cv_review.py (1,248 → 4 files: pipeline, spell, LLM, barrel) - cv_preprocessing.py (1,166 → 3 files: deskew, dewarp, barrel) - rbac.py, admin_api.py, routes/eh.py remain (next batch) backend-lehrer (1 monolith): - classroom_engine/repository.py (1,705 → 7 files by domain) All re-export barrels preserve backward compatibility. Zero import errors verified. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
136
klausur-service/backend/cv_cell_grid_helpers.py
Normal file
136
klausur-service/backend/cv_cell_grid_helpers.py
Normal file
@@ -0,0 +1,136 @@
|
||||
"""
|
||||
Shared helpers for cell-grid construction (v2 + legacy).
|
||||
|
||||
Extracted from cv_cell_grid.py — used by both cv_cell_grid_build and
|
||||
cv_cell_grid_legacy.
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import List
|
||||
|
||||
import numpy as np
|
||||
|
||||
from cv_vocab_types import RowGeometry
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
try:
|
||||
import cv2
|
||||
except ImportError:
|
||||
cv2 = None # type: ignore[assignment]
|
||||
|
||||
# Minimum OCR word confidence to keep (used across multiple functions)
|
||||
_MIN_WORD_CONF = 30
|
||||
|
||||
|
||||
def _compute_cell_padding(col_width: int, img_w: int) -> int:
|
||||
"""Adaptive padding for OCR crops based on column width.
|
||||
|
||||
Narrow columns (page_ref, marker) need more surrounding context so
|
||||
Tesseract can segment characters correctly. Wide columns keep the
|
||||
minimal 4 px padding to avoid pulling in neighbours.
|
||||
"""
|
||||
col_pct = col_width / img_w * 100 if img_w > 0 else 100
|
||||
if col_pct < 5:
|
||||
return max(20, col_width // 2)
|
||||
if col_pct < 10:
|
||||
return max(12, col_width // 4)
|
||||
if col_pct < 15:
|
||||
return 8
|
||||
return 4
|
||||
|
||||
|
||||
def _ensure_minimum_crop_size(crop: np.ndarray, min_dim: int = 150,
|
||||
max_scale: int = 3) -> np.ndarray:
|
||||
"""Upscale tiny crops so Tesseract gets enough pixel data.
|
||||
|
||||
If either dimension is below *min_dim*, the crop is bicubic-upscaled
|
||||
so the smallest dimension reaches *min_dim* (capped at *max_scale* x).
|
||||
"""
|
||||
h, w = crop.shape[:2]
|
||||
if h >= min_dim and w >= min_dim:
|
||||
return crop
|
||||
scale = min(max_scale, max(min_dim / max(h, 1), min_dim / max(w, 1)))
|
||||
if scale <= 1.0:
|
||||
return crop
|
||||
new_w = int(w * scale)
|
||||
new_h = int(h * scale)
|
||||
return cv2.resize(crop, (new_w, new_h), interpolation=cv2.INTER_CUBIC)
|
||||
|
||||
|
||||
def _select_psm_for_column(col_type: str, col_width: int,
|
||||
row_height: int) -> int:
|
||||
"""Choose the best Tesseract PSM for a given column geometry.
|
||||
|
||||
- page_ref columns are almost always single short tokens -> PSM 8
|
||||
- Very narrow or short cells -> PSM 7 (single text line)
|
||||
- Everything else -> PSM 6 (uniform block)
|
||||
"""
|
||||
if col_type in ('page_ref', 'marker'):
|
||||
return 8 # single word
|
||||
if col_width < 100 or row_height < 30:
|
||||
return 7 # single line
|
||||
return 6 # uniform block
|
||||
|
||||
|
||||
def _is_artifact_row(row: RowGeometry) -> bool:
    """Return True if this row contains only scan artifacts, not real text.

    Scanner shadows and speckle noise typically OCR into isolated
    single-character detections; a genuine content row always yields at
    least one token of two or more characters.
    """
    if row.word_count == 0:
        return True
    # Equivalent to all(len(t) <= 1): no token longer than one character.
    return not any(len(word.get('text', '').strip()) > 1 for word in row.words)
|
||||
|
||||
|
||||
def _heal_row_gaps(
    rows: List[RowGeometry],
    top_bound: int,
    bottom_bound: int,
) -> None:
    """Expand row y/height to fill vertical gaps caused by removed adjacent rows.

    After filtering out empty or artifact rows, remaining content rows may have
    gaps between them where the removed rows used to be. This function mutates
    each row to extend upward/downward to the midpoint of such gaps so that
    OCR crops cover the full available content area.

    The first row always extends to top_bound; the last row to bottom_bound.
    Note: rows is also sorted in place by y as a side effect.
    """
    if not rows:
        return
    rows.sort(key=lambda r: r.y)
    n = len(rows)
    # Snapshot (top, bottom) BEFORE mutating anything, so every midpoint is
    # computed against the original geometry rather than partially-updated rows.
    orig = [(r.y, r.y + r.height) for r in rows]

    for i, row in enumerate(rows):
        # New top: midpoint between previous row's bottom and this row's top.
        if i == 0:
            new_top = top_bound
        else:
            prev_bot = orig[i - 1][1]
            my_top = orig[i][0]
            gap = my_top - prev_bot
            # gap <= 1 (adjacent or overlapping rows): keep the original top.
            new_top = prev_bot + gap // 2 if gap > 1 else my_top

        # New bottom: midpoint between this row's bottom and next row's top.
        if i == n - 1:
            new_bottom = bottom_bound
        else:
            my_bot = orig[i][1]
            next_top = orig[i + 1][0]
            gap = next_top - my_bot
            new_bottom = my_bot + gap // 2 if gap > 1 else my_bot

        row.y = new_top
        # Clamp to a small positive height in case the bounds cross.
        row.height = max(5, new_bottom - new_top)

    # Lazy %-style args: the message is only formatted when DEBUG is enabled,
    # unlike the former f-string which always paid the formatting cost.
    logger.debug(
        "_heal_row_gaps: %d rows -> y range [%d..%d] (bounds: top=%d, bottom=%d)",
        n, rows[0].y, rows[-1].y + rows[-1].height, top_bound, bottom_bound,
    )
|
||||
Reference in New Issue
Block a user