# breakpilot-lehrer/klausur-service/backend/page_crop.py

"""
Page Crop - Content-based crop for scanned pages and book scans.

Detects the content boundary by analysing ink density projections and
(for book scans) the spine shadow gradient.  Works with both loose A4
sheets on dark scanners AND book scans with white backgrounds.

License: Apache 2.0
"""

import logging
from typing import Dict, Any, Tuple, Optional

import cv2
import numpy as np

logger = logging.getLogger(__name__)

# Aspect ratios (long side / short side, i.e. portrait height / width) of the
# paper formats that _detect_format can report.  Insertion order matters:
# on an exact tie the first entry wins.
PAPER_FORMATS = {
    name: long_side / short_side
    for name, long_side, short_side in (
        ("A4", 297.0, 210.0),      # mm   -> 1.4143
        ("A5", 210.0, 148.0),      # mm   -> 1.4189
        ("Letter", 11.0, 8.5),     # inch -> 1.2941
        ("Legal", 14.0, 8.5),      # inch -> 1.6471
        ("A3", 420.0, 297.0),      # mm   -> 1.4141
    )
}

# A row/column counts as "content" once at least this fraction of its
# pixels is ink (0.3%).
_INK_THRESHOLD = 3e-3

# Content runs shorter than this fraction of the image dimension (0.5%)
# are treated as noise and discarded.
_MIN_RUN_FRAC = 5e-3


def detect_page_splits(
    img_bgr: np.ndarray,
) -> list:
    """Detect whether the image is a two-page spread and return the page rectangles.

    Uses **brightness** (not ink density) to find the spine area: the code
    looks for a column band near the horizontal center whose smoothed mean
    brightness is clearly below that of the brightest (paper) columns, and
    requires bright columns on both sides of it.

    Args:
        img_bgr: Input image.  It is converted with ``cv2.COLOR_BGR2GRAY``,
            so a 3-channel BGR array is expected.

    Returns:
        A list of page dicts ``{x, y, width, height, page_index}`` ordered
        left to right, or an empty list if no spine is found.  An empty
        list is also returned for portrait-ish images, very dark images,
        and images too small to analyse.
    """
    h, w = img_bgr.shape[:2]

    # Degenerate input: nothing to analyse (cvtColor would fail on it).
    if h == 0 or w == 0:
        return []

    # Only check landscape-ish images (width > height * 1.15)
    if w < h * 1.15:
        return []

    # Heavy smoothing window (odd, >= 11 px, ~2% of the width) so that
    # individual text lines do not show up in the brightness profile.
    kern = max(11, w // 50)
    if kern % 2 == 0:
        kern += 1
    if w < kern:
        # np.convolve(mode="same") returns max(len(signal), len(kernel))
        # samples; for images narrower than the kernel the profile would be
        # longer than the image and every column index below would be off.
        return []

    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)

    # Column-mean brightness (0-255) — the spine is darker than the paper
    col_brightness = np.mean(gray, axis=0).astype(np.float64)
    brightness_smooth = np.convolve(col_brightness, np.ones(kern) / kern, mode="same")

    # The brightest smoothed column is taken as the paper brightness
    page_brightness = float(np.max(brightness_smooth))
    if page_brightness < 100:
        return []  # Very dark image, skip

    # Columns below 88% of the paper brightness count as "dark"
    spine_thresh = page_brightness * 0.88

    # Search in center region (30-70% of width)
    center_lo = int(w * 0.30)
    center_hi = int(w * 0.70)

    # Find the darkest valley in the center region
    center_brightness = brightness_smooth[center_lo:center_hi]
    darkest_val = float(np.min(center_brightness))

    if darkest_val >= spine_thresh:
        logger.debug("No spine detected: min brightness %.0f >= threshold %.0f",
                      darkest_val, spine_thresh)
        return []

    # Find ALL contiguous dark runs in the center region as half-open
    # (start, end) pairs: pad with False on both sides so that every run
    # produces exactly one rising (+1) and one falling (-1) step.
    is_dark = center_brightness < spine_thresh
    padded = np.concatenate(([False], is_dark, [False])).astype(np.int8)
    steps = np.diff(padded)
    run_starts = np.flatnonzero(steps == 1)
    run_ends = np.flatnonzero(steps == -1)

    # Keep only runs at least 1% of the image width wide (plain Python ints
    # so the returned rectangles stay JSON-serialisable).
    min_spine_px = int(w * 0.01)
    dark_runs = [
        (int(lo), int(hi))
        for lo, hi in zip(run_starts, run_ends)
        if hi - lo >= min_spine_px
    ]

    if not dark_runs:
        logger.debug("No dark runs wider than %dpx in center region", min_spine_px)
        return []

    # Score each dark run: prefer centered, dark, narrow valleys
    center_region_len = center_hi - center_lo
    image_center_in_region = (w * 0.5 - center_lo)  # x=50% mapped into region coords
    sigma = center_region_len * 0.15  # gaussian width for the centering factor
    best_score = -1.0
    best_start, best_end = dark_runs[0]

    for run_lo, run_hi in dark_runs:
        run_width = run_hi - run_lo
        run_center = (run_lo + run_hi) / 2.0

        # --- Factor 1: Proximity to image center (gaussian, sigma = 15% of region) ---
        dist = abs(run_center - image_center_in_region)
        center_factor = float(np.exp(-0.5 * (dist / sigma) ** 2))

        # --- Factor 2: Darkness (how dark is the valley relative to threshold) ---
        run_brightness = float(np.mean(center_brightness[run_lo:run_hi]))
        # Normalize: 1.0 when run_brightness == 0, 0.0 when run_brightness == spine_thresh
        darkness_factor = max(0.0, (spine_thresh - run_brightness) / spine_thresh)

        # --- Factor 3: Narrowness bonus (spine shadows are narrow, not wide plateaus) ---
        # Full bonus up to 5% of the image width, linear decay to zero at 15%.
        width_frac = run_width / w
        if width_frac <= 0.05:
            narrowness_bonus = 1.0
        elif width_frac <= 0.15:
            narrowness_bonus = 1.0 - (width_frac - 0.05) / 0.10  # linear decay 1.0 → 0.0
        else:
            narrowness_bonus = 0.0

        # Narrowness only modulates the score between 30% and 100%
        score = center_factor * darkness_factor * (0.3 + 0.7 * narrowness_bonus)

        logger.debug(
            "Dark run x=%d..%d (w=%d): center_f=%.3f dark_f=%.3f narrow_b=%.3f → score=%.4f",
            center_lo + run_lo, center_lo + run_hi, run_width,
            center_factor, darkness_factor, narrowness_bonus, score,
        )

        if score > best_score:
            best_score = score
            best_start, best_end = run_lo, run_hi

    spine_w = best_end - best_start
    spine_x = center_lo + best_start
    spine_center = spine_x + spine_w // 2

    logger.debug(
        "Best spine candidate: x=%d..%d (w=%d), score=%.4f",
        spine_x, spine_x + spine_w, spine_w, best_score,
    )

    # Verify: must have bright (paper) content on BOTH sides, measured over
    # a strip of 10% of the image width directly next to the run.
    left_brightness = float(np.mean(brightness_smooth[max(0, spine_x - w // 10):spine_x]))
    right_end = center_lo + best_end
    right_brightness = float(np.mean(brightness_smooth[right_end:min(w, right_end + w // 10)]))

    if left_brightness < spine_thresh or right_brightness < spine_thresh:
        logger.debug("No bright paper flanking spine: left=%.0f right=%.0f thresh=%.0f",
                      left_brightness, right_brightness, spine_thresh)
        return []

    logger.info(
        "Spine detected: x=%d..%d (w=%d), brightness=%.0f vs paper=%.0f, "
        "left_paper=%.0f, right_paper=%.0f",
        spine_x, right_end, spine_w, darkest_val, page_brightness,
        left_brightness, right_brightness,
    )

    # Split at the spine center
    split_points = [spine_center]

    # Build page rectangles (full image height, consecutive x ranges)
    pages: list = []
    prev_x = 0
    for i, sx in enumerate(split_points):
        pages.append({"x": prev_x, "y": 0, "width": sx - prev_x,
                       "height": h, "page_index": i})
        prev_x = sx
    pages.append({"x": prev_x, "y": 0, "width": w - prev_x,
                   "height": h, "page_index": len(split_points)})

    # Filter out tiny pages (< 15% of total width)
    pages = [p for p in pages if p["width"] >= w * 0.15]
    if len(pages) < 2:
        return []

    # Re-index
    for i, p in enumerate(pages):
        p["page_index"] = i

    logger.info(
        "Page split detected: %d pages, spine_w=%d, split_points=%s",
        len(pages), spine_w, split_points,
    )
    return pages


def detect_and_crop_page(
    img_bgr: np.ndarray,
    margin_frac: float = 0.01,
) -> Tuple[np.ndarray, Dict[str, Any]]:
    """Find the content rectangle of a scanned page and crop to it.

    Steps:
    1. Adaptive threshold → inverted binary image (ink=255, background=0)
    2. Left and right edges: spine-shadow detection on the grayscale image,
       falling back to the binary vertical projection
    3. Top and bottom edges: binary horizontal projection
    4. Skip the crop if every border is below 2% or if the cropped area
       would be less than 40% of the original
    5. Otherwise crop, widening the rectangle by ``margin_frac`` per side

    Args:
        img_bgr: Input BGR image (should already be deskewed/dewarped)
        margin_frac: Extra margin around content (fraction of dimension, default 1%)

    Returns:
        Tuple of (image, result_dict).  When a crop is applied the image is
        a copy of the cropped region and ``result_dict["crop_applied"]`` is
        True; otherwise the input array itself is returned unchanged.
    """
    h, w = img_bgr.shape[:2]
    full_area = h * w

    result: Dict[str, Any] = {
        "crop_applied": False,
        "crop_rect": None,
        "crop_rect_pct": None,
        "original_size": {"width": w, "height": h},
        "cropped_size": {"width": w, "height": h},
        "detected_format": None,
        "format_confidence": 0.0,
        "aspect_ratio": round(max(h, w) / max(min(h, w), 1), 4),
        "border_fractions": {"top": 0.0, "bottom": 0.0, "left": 0.0, "right": 0.0},
    }

    def _keep_original() -> Tuple[np.ndarray, Dict[str, Any]]:
        # Shared exit for both "do not crop" cases: report the format of
        # the uncropped image and hand the input back untouched.
        result["detected_format"], result["format_confidence"] = _detect_format(w, h)
        return img_bgr, result

    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)

    # Adaptive (local) threshold so that ink is found even without a global
    # contrast between page and background.
    binary = cv2.adaptiveThreshold(
        gray,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY_INV,
        blockSize=51,
        C=15,
    )

    # Content edges on all four sides
    left_edge = _detect_left_edge_shadow(gray, binary, w, h)
    right_edge = _detect_right_edge_shadow(gray, binary, w, h)
    top_edge, bottom_edge = _detect_top_bottom_edges(binary, w, h)

    # Border widths as fractions of the respective image dimension
    borders = {
        "top": top_edge / h,
        "bottom": (h - bottom_edge) / h,
        "left": left_edge / w,
        "right": (w - right_edge) / w,
    }
    result["border_fractions"] = {side: round(frac, 4) for side, frac in borders.items()}

    # Sanity: a crop is only worthwhile if some border reaches 2%
    min_border = 0.02
    if all(frac < min_border for frac in borders.values()):
        logger.info("All borders < %.0f%% — no crop needed", min_border * 100)
        return _keep_original()

    # Widen the content rectangle by the margin, clamped to the image
    pad_x = int(w * margin_frac)
    pad_y = int(h * margin_frac)
    crop_x = max(0, left_edge - pad_x)
    crop_y = max(0, top_edge - pad_y)
    crop_x2 = min(w, right_edge + pad_x)
    crop_y2 = min(h, bottom_edge + pad_y)
    crop_w = crop_x2 - crop_x
    crop_h = crop_y2 - crop_y

    # Sanity: refuse crops that would keep less than 40% of the image
    if crop_w * crop_h < 0.40 * full_area:
        logger.warning("Cropped area too small (%.0f%%) — skipping crop",
                       100.0 * crop_w * crop_h / full_area)
        return _keep_original()

    cropped = img_bgr[crop_y:crop_y2, crop_x:crop_x2].copy()

    detected_format, format_confidence = _detect_format(crop_w, crop_h)

    result.update({
        "crop_applied": True,
        "crop_rect": {"x": crop_x, "y": crop_y, "width": crop_w, "height": crop_h},
        "crop_rect_pct": {
            "x": round(100.0 * crop_x / w, 2),
            "y": round(100.0 * crop_y / h, 2),
            "width": round(100.0 * crop_w / w, 2),
            "height": round(100.0 * crop_h / h, 2),
        },
        "cropped_size": {"width": crop_w, "height": crop_h},
        "detected_format": detected_format,
        "format_confidence": format_confidence,
        "aspect_ratio": round(max(crop_w, crop_h) / max(min(crop_w, crop_h), 1), 4),
    })

    logger.info(
        "Page cropped: %dx%d -> %dx%d, format=%s (%.0f%%), "
        "borders: T=%.1f%% B=%.1f%% L=%.1f%% R=%.1f%%",
        w, h, crop_w, crop_h, detected_format, format_confidence * 100,
        borders["top"] * 100, borders["bottom"] * 100,
        borders["left"] * 100, borders["right"] * 100,
    )

    return cropped, result


# ---------------------------------------------------------------------------
# Edge detection helpers
# ---------------------------------------------------------------------------

def _detect_spine_shadow(
    gray: np.ndarray,
    search_region: np.ndarray,
    offset_x: int,
    w: int,
    side: str,
) -> Optional[int]:
    """Find the book spine center (darkest point) in a scanner shadow.

    The scanner produces a gray strip where the book spine presses against
    the glass.  The darkest column in that strip is the spine center —
    that's where we crop.

    Distinguishes real spine shadows from text content by checking:
    1. Strong brightness range (> 40 levels)
    2. Darkest point is genuinely dark (< 180 mean brightness)
    3. The dark area is a NARROW valley, not a text-content plateau
    4. Brightness rises significantly toward the page content side

    Args:
        gray: Full grayscale image (for context).
        search_region: Column slice of the grayscale image to search in.
        offset_x: X offset of search_region relative to full image.
        w: Full image width.
        side: 'left' or 'right' (for logging).

    Returns:
        X coordinate (in full image) of the spine center, or None.
    """
    region_w = search_region.shape[1]
    if region_w < 10:
        return None

    # Column-mean brightness in the search region
    col_means = np.mean(search_region, axis=0).astype(np.float64)

    # Smooth with boxcar kernel (width = 1% of image width, min 5)
    kernel_size = max(5, w // 100)
    if kernel_size % 2 == 0:
        kernel_size += 1
    kernel = np.ones(kernel_size) / kernel_size
    smoothed_raw = np.convolve(col_means, kernel, mode="same")

    # Trim convolution edge artifacts (edges are zero-padded → artificially low)
    margin = kernel_size // 2
    if region_w <= 2 * margin + 10:
        return None
    smoothed = smoothed_raw[margin:region_w - margin]
    trim_offset = margin  # offset of smoothed[0] relative to search_region

    val_min = float(np.min(smoothed))
    val_max = float(np.max(smoothed))
    shadow_range = val_max - val_min

    # --- Check 1: Strong brightness gradient ---
    if shadow_range <= 40:
        logger.debug(
            "%s edge: no spine (range=%.0f <= 40)", side.capitalize(), shadow_range,
        )
        return None

    # --- Check 2: Darkest point must be genuinely dark ---
    # Spine shadows have mean column brightness 60-160.
    # Text on white paper stays above 180.
    if val_min > 180:
        logger.debug(
            "%s edge: no spine (darkest=%.0f > 180, likely text)", side.capitalize(), val_min,
        )
        return None

    spine_idx = int(np.argmin(smoothed))  # index in trimmed array
    spine_local = spine_idx + trim_offset  # index in search_region
    trimmed_len = len(smoothed)

    # --- Check 3: Valley width (spine is narrow, text plateau is wide) ---
    # Count how many columns are within 20% of the shadow range above the min.
    valley_thresh = val_min + shadow_range * 0.20
    valley_mask = smoothed < valley_thresh
    valley_width = int(np.sum(valley_mask))
    # Spine valleys are typically 3-15% of image width (20-120px on a 800px image).
    # Text content plateaus span 20%+ of the search region.
    max_valley_frac = 0.50  # valley must not cover more than half the trimmed region
    if valley_width > trimmed_len * max_valley_frac:
        logger.debug(
            "%s edge: no spine (valley too wide: %d/%d = %.0f%%)",
            side.capitalize(), valley_width, trimmed_len,
            100.0 * valley_width / trimmed_len,
        )
        return None

    # --- Check 4: Brightness must rise toward page content ---
    # For left edge: after spine, brightness should rise (= page paper)
    # For right edge: before spine, brightness should rise
    rise_check_w = max(5, trimmed_len // 5)  # check 20% of trimmed region
    if side == "left":
        # Check columns to the right of the spine (in trimmed array)
        right_start = min(spine_idx + 5, trimmed_len - 1)
        right_end = min(right_start + rise_check_w, trimmed_len)
        if right_end > right_start:
            rise_brightness = float(np.mean(smoothed[right_start:right_end]))
            rise = rise_brightness - val_min
            if rise < shadow_range * 0.3:
                logger.debug(
                    "%s edge: no spine (insufficient rise: %.0f, need %.0f)",
                    side.capitalize(), rise, shadow_range * 0.3,
                )
                return None
    else:  # right
        # Check columns to the left of the spine (in trimmed array)
        left_end = max(spine_idx - 5, 0)
        left_start = max(left_end - rise_check_w, 0)
        if left_end > left_start:
            rise_brightness = float(np.mean(smoothed[left_start:left_end]))
            rise = rise_brightness - val_min
            if rise < shadow_range * 0.3:
                logger.debug(
                    "%s edge: no spine (insufficient rise: %.0f, need %.0f)",
                    side.capitalize(), rise, shadow_range * 0.3,
                )
                return None

    spine_x = offset_x + spine_local

    logger.info(
        "%s edge: spine center at x=%d (brightness=%.0f, range=%.0f, valley=%dpx)",
        side.capitalize(), spine_x, val_min, shadow_range, valley_width,
    )
    return spine_x


def _detect_left_edge_shadow(
    gray: np.ndarray,
    binary: np.ndarray,
    w: int,
    h: int,
) -> int:
    """Return the x position of the left content edge.

    First searches the leftmost quarter of the grayscale image for a spine
    shadow and, if one is found, returns its darkest column.  Otherwise the
    result of the binary vertical projection (searching from the left) is
    returned.  ``h`` is accepted for signature symmetry but not used.
    """
    strip_w = max(1, w // 4)
    left_strip = gray[:, :strip_w]

    shadow_x = _detect_spine_shadow(gray, left_strip, 0, w, "left")
    if shadow_x is None:
        # No shadow found: fall back to the ink projection
        return _detect_edge_projection(binary, axis=0, from_start=True, dim=w)
    return shadow_x


def _detect_right_edge_shadow(
    gray: np.ndarray,
    binary: np.ndarray,
    w: int,
    h: int,
) -> int:
    """Return the x position of the right content edge.

    First searches the rightmost quarter of the grayscale image for a spine
    shadow and, if one is found, returns its darkest column.  Otherwise the
    result of the binary vertical projection (searching from the right) is
    returned.  ``h`` is accepted for signature symmetry but not used.
    """
    strip_w = max(1, w // 4)
    strip_x0 = w - strip_w
    right_strip = gray[:, strip_x0:]

    shadow_x = _detect_spine_shadow(gray, right_strip, strip_x0, w, "right")
    if shadow_x is None:
        # No shadow found: fall back to the ink projection
        return _detect_edge_projection(binary, axis=0, from_start=False, dim=w)
    return shadow_x


def _detect_top_bottom_edges(binary: np.ndarray, w: int, h: int) -> Tuple[int, int]:
    """Return ``(top, bottom)`` content edges from the row-wise ink projection.

    Both values come from ``_detect_edge_projection`` with ``axis=1``: the
    top edge searching from the start, the bottom edge from the end.
    ``w`` is accepted for signature symmetry but not used.
    """
    top_y, bottom_y = (
        _detect_edge_projection(binary, axis=1, from_start=scan_from_top, dim=h)
        for scan_from_top in (True, False)
    )
    return top_y, bottom_y


def _detect_edge_projection(
    binary: np.ndarray,
    axis: int,
    from_start: bool,
    dim: int,
) -> int:
    """Find the leading or trailing content edge along one image axis.

    axis=0 → project vertically (column densities) → returns x position
    axis=1 → project horizontally (row densities) → returns y position

    A row/column counts as content when its ink density (mean of the binary
    pixels / 255) is at least ``_INK_THRESHOLD``.  Content runs shorter than
    ``_MIN_RUN_FRAC`` of ``dim`` are discarded as noise first.

    Args:
        binary: Binarised image with ink pixels set to 255.
        axis: Axis handed to ``np.mean`` (0 = per column, 1 = per row).
        from_start: True to find the leading edge, False for the trailing one.
        dim: Image size along the returned coordinate (width for axis=0,
            height for axis=1).

    Returns:
        ``from_start=True``: index of the first content row/column, or 0 if
        there is no content.
        ``from_start=False``: index one past the last content row/column
        (an exclusive bound, directly usable as a slice end), or ``dim`` if
        there is no content.
    """
    # Compute density per row/column (mean of binary pixels / 255)
    projection = np.mean(binary, axis=axis) / 255.0

    # Create mask of "ink" positions
    ink_mask = projection >= _INK_THRESHOLD

    # Filter narrow runs (noise)
    min_run = max(1, int(dim * _MIN_RUN_FRAC))
    ink_mask = _filter_narrow_runs(ink_mask, min_run)

    ink_positions = np.where(ink_mask)[0]
    if len(ink_positions) == 0:
        return 0 if from_start else dim

    if from_start:
        return int(ink_positions[0])

    # +1 turns the inclusive index of the last content line into an
    # exclusive bound, matching the ``dim`` fallback above.  Without it a
    # slice ending at the returned value drops the last content row/column.
    return int(ink_positions[-1]) + 1


def _filter_narrow_runs(mask: np.ndarray, min_run: int) -> np.ndarray:
    """Remove True-runs shorter than min_run pixels."""
    if min_run <= 1:
        return mask

    result = mask.copy()
    n = len(result)
    i = 0
    while i < n:
        if result[i]:
            start = i
            while i < n and result[i]:
                i += 1
            if i - start < min_run:
                result[start:i] = False
        else:
            i += 1
    return result


# ---------------------------------------------------------------------------
# Format detection (kept as optional metadata)
# ---------------------------------------------------------------------------

def _detect_format(width: int, height: int) -> Tuple[str, float]:
    """Detect paper format from dimensions by comparing aspect ratios."""
    if width <= 0 or height <= 0:
        return "unknown", 0.0

    aspect = max(width, height) / min(width, height)

    best_format = "unknown"
    best_diff = float("inf")

    for fmt, expected_ratio in PAPER_FORMATS.items():
        diff = abs(aspect - expected_ratio)
        if diff < best_diff:
            best_diff = diff
            best_format = fmt

    confidence = max(0.0, 1.0 - best_diff * 5.0)

    if confidence < 0.3:
        return "unknown", 0.0

    return best_format, round(confidence, 3)