klausur-service (7 monoliths): - grid_editor_helpers.py (1,737 → 5 files: columns, filters, headers, zones) - cv_cell_grid.py (1,675 → 7 files: build, legacy, streaming, merge, vocab) - worksheet_editor_api.py (1,305 → 4 files: models, AI, reconstruct, routes) - legal_corpus_ingestion.py (1,280 → 3 files: registry, chunking, ingestion) - cv_review.py (1,248 → 4 files: pipeline, spell, LLM, barrel) - cv_preprocessing.py (1,166 → 3 files: deskew, dewarp, barrel) - rbac.py, admin_api.py, routes/eh.py remain (next batch) backend-lehrer (1 monolith): - classroom_engine/repository.py (1,705 → 7 files by domain) All re-export barrels preserve backward compatibility. Zero import errors verified. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
137 lines
4.4 KiB
Python
137 lines
4.4 KiB
Python
"""
|
|
Shared helpers for cell-grid construction (v2 + legacy).
|
|
|
|
Extracted from cv_cell_grid.py — used by both cv_cell_grid_build and
|
|
cv_cell_grid_legacy.
|
|
|
|
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
|
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
|
"""
|
|
|
|
import logging
from typing import List

import numpy as np

from cv_vocab_types import RowGeometry

# Module-level logger; handlers and level are configured by the application.
logger = logging.getLogger(__name__)

# OpenCV is optional: when it is not installed, cv2 stays None and code
# paths that need it must check for the sentinel before use.
try:
    import cv2
except ImportError:
    cv2 = None  # type: ignore[assignment]

# Minimum OCR word confidence to keep (used across multiple functions)
_MIN_WORD_CONF = 30
|
|
|
|
|
|
def _compute_cell_padding(col_width: int, img_w: int) -> int:
|
|
"""Adaptive padding for OCR crops based on column width.
|
|
|
|
Narrow columns (page_ref, marker) need more surrounding context so
|
|
Tesseract can segment characters correctly. Wide columns keep the
|
|
minimal 4 px padding to avoid pulling in neighbours.
|
|
"""
|
|
col_pct = col_width / img_w * 100 if img_w > 0 else 100
|
|
if col_pct < 5:
|
|
return max(20, col_width // 2)
|
|
if col_pct < 10:
|
|
return max(12, col_width // 4)
|
|
if col_pct < 15:
|
|
return 8
|
|
return 4
|
|
|
|
|
|
def _ensure_minimum_crop_size(crop: np.ndarray, min_dim: int = 150,
|
|
max_scale: int = 3) -> np.ndarray:
|
|
"""Upscale tiny crops so Tesseract gets enough pixel data.
|
|
|
|
If either dimension is below *min_dim*, the crop is bicubic-upscaled
|
|
so the smallest dimension reaches *min_dim* (capped at *max_scale* x).
|
|
"""
|
|
h, w = crop.shape[:2]
|
|
if h >= min_dim and w >= min_dim:
|
|
return crop
|
|
scale = min(max_scale, max(min_dim / max(h, 1), min_dim / max(w, 1)))
|
|
if scale <= 1.0:
|
|
return crop
|
|
new_w = int(w * scale)
|
|
new_h = int(h * scale)
|
|
return cv2.resize(crop, (new_w, new_h), interpolation=cv2.INTER_CUBIC)
|
|
|
|
|
|
def _select_psm_for_column(col_type: str, col_width: int,
|
|
row_height: int) -> int:
|
|
"""Choose the best Tesseract PSM for a given column geometry.
|
|
|
|
- page_ref columns are almost always single short tokens -> PSM 8
|
|
- Very narrow or short cells -> PSM 7 (single text line)
|
|
- Everything else -> PSM 6 (uniform block)
|
|
"""
|
|
if col_type in ('page_ref', 'marker'):
|
|
return 8 # single word
|
|
if col_width < 100 or row_height < 30:
|
|
return 7 # single line
|
|
return 6 # uniform block
|
|
|
|
|
|
def _is_artifact_row(row: RowGeometry) -> bool:
|
|
"""Return True if this row contains only scan artifacts, not real text.
|
|
|
|
Artifact rows (scanner shadows, noise) typically produce only single-character
|
|
detections. A real content row always has at least one token with 2+ characters.
|
|
"""
|
|
if row.word_count == 0:
|
|
return True
|
|
texts = [w.get('text', '').strip() for w in row.words]
|
|
return all(len(t) <= 1 for t in texts)
|
|
|
|
|
|
def _heal_row_gaps(
    rows: List[RowGeometry],
    top_bound: int,
    bottom_bound: int,
) -> None:
    """Expand row y/height in place to fill vertical gaps between rows.

    After filtering out empty or artifact rows, the surviving content
    rows may have gaps between them where the removed rows used to be.
    Each row is mutated to extend upward/downward to the midpoint of
    such gaps so that OCR crops cover the full available content area.

    The first row always extends to *top_bound*; the last row to
    *bottom_bound*.  ``rows`` is also sorted by ``y`` as a side effect.

    Args:
        rows: Row geometries to mutate; may be empty (no-op).
        top_bound: Upper y limit the first row is stretched to.
        bottom_bound: Lower y limit the last row is stretched to.
    """
    if not rows:
        return
    rows.sort(key=lambda r: r.y)
    n = len(rows)
    # Snapshot (top, bottom) before mutation: midpoints must be computed
    # from the original geometry, not from rows already stretched in
    # this pass.
    orig = [(r.y, r.y + r.height) for r in rows]

    for i, row in enumerate(rows):
        # New top: the bound for the first row, otherwise the midpoint
        # of the gap between the previous row's original bottom and this
        # row's original top (gaps of <=1 px are left untouched).
        if i == 0:
            new_top = top_bound
        else:
            prev_bot = orig[i - 1][1]
            my_top = orig[i][0]
            gap = my_top - prev_bot
            new_top = prev_bot + gap // 2 if gap > 1 else my_top

        # New bottom: the bound for the last row, otherwise the midpoint
        # of the gap down to the next row's original top.
        if i == n - 1:
            new_bottom = bottom_bound
        else:
            my_bot = orig[i][1]
            next_top = orig[i + 1][0]
            gap = next_top - my_bot
            new_bottom = my_bot + gap // 2 if gap > 1 else my_bot

        row.y = new_top
        # Guard against degenerate/inverted spans (e.g. bounds tighter
        # than the row itself): keep at least 5 px of crop height.
        row.height = max(5, new_bottom - new_top)

    # Lazy %-style arguments: formatting work is skipped entirely when
    # DEBUG logging is disabled (the f-string version always formatted).
    logger.debug(
        "_heal_row_gaps: %d rows -> y range [%d..%d] (bounds: top=%d, bottom=%d)",
        n, rows[0].y, rows[-1].y + rows[-1].height, top_bound, bottom_bound,
    )
|