# breakpilot-lehrer/klausur-service/backend/page_crop.py

"""
Page Crop - Content-based crop for scanned pages and book scans.

Detects the content boundary by analysing ink density projections and
(for book scans) the spine shadow gradient.  Works with both loose A4
sheets on dark scanners AND book scans with white backgrounds.

License: Apache 2.0
"""

import logging
from typing import Dict, Any, Tuple, Optional

import cv2
import numpy as np

logger = logging.getLogger(__name__)

# Known paper format aspect ratios (long side / short side, i.e. height / width
# in portrait orientation).  The ISO A-series entries all sit at ~1.414, so the
# aspect ratio alone can barely tell A3, A4 and A5 apart.
PAPER_FORMATS = {
    "A4": 297.0 / 210.0,       # 1.4143
    "A5": 210.0 / 148.0,       # 1.4189
    "Letter": 11.0 / 8.5,      # 1.2941
    "Legal": 14.0 / 8.5,       # 1.6471
    "A3": 420.0 / 297.0,       # 1.4141
}

# Minimum ink density (fraction of pixels in a row/column) for that row/column
# to count as "content"
_INK_THRESHOLD = 0.003  # 0.3%

# Minimum run length (fraction of the scanned dimension) for a run of content
# rows/columns to be kept — shorter runs are treated as noise
_MIN_RUN_FRAC = 0.005  # 0.5%


def detect_page_splits(
    img_bgr: np.ndarray,
    min_gap_frac: float = 0.008,
) -> list:
    """Detect if the image is a multi-page spread and return split rectangles.

    Looks for wide, nearly ink-free vertical bands (the spine area) in the
    central half of a landscape image; such bands indicate that the image
    contains multiple pages side by side (e.g. book on scanner).

    Args:
        img_bgr: Input BGR image.
        min_gap_frac: Minimum width of a candidate gap as a fraction of the
            image width (never less than 5 px).

    Returns:
        A list of page dicts ``{x, y, width, height, page_index}`` ordered
        left to right, or an empty list if only one page is detected.
    """
    h, w = img_bgr.shape[:2]

    # An empty image has nothing to split; bail out before handing it to OpenCV.
    if h == 0 or w == 0:
        return []

    # Only check clearly landscape images (width >= 1.15 x height)
    if w < h * 1.15:
        return []

    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    binary = cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY_INV, blockSize=51, C=15,
    )

    # Vertical projection: mean ink density per column
    v_proj = np.mean(binary, axis=0) / 255.0

    # Smooth with boxcar (width = 0.5% of image width, min 5, forced odd)
    kern = max(5, w // 200)
    if kern % 2 == 0:
        kern += 1
    v_smooth = np.convolve(v_proj, np.ones(kern) / kern, mode="same")

    peak = float(np.max(v_smooth))
    if peak < 0.005:
        # Practically no ink anywhere — nothing to base a split on
        return []

    # Look for valleys in center region (25-75% of width)
    gap_thresh = peak * 0.15  # valley must be < 15% of peak density
    center_lo = int(w * 0.25)
    center_hi = int(w * 0.75)
    min_gap_px = max(5, int(w * min_gap_frac))

    # Find contiguous gap runs in the center region: a +1 step in the 0/1
    # mask marks the first column of a run, a -1 step the column just past it.
    is_gap = v_smooth[center_lo:center_hi] < gap_thresh
    steps = np.diff(is_gap.astype(np.int8), prepend=0, append=0)
    run_starts = (np.where(steps == 1)[0] + center_lo).tolist()
    run_ends = (np.where(steps == -1)[0] + center_lo).tolist()

    gaps: list = []
    for gap_start, gap_end in zip(run_starts, run_ends):
        gap_w = gap_end - gap_start
        if gap_w >= min_gap_px:
            gaps.append({"x": gap_start, "width": gap_w,
                         "center": gap_start + gap_w // 2})

    if not gaps:
        return []

    # Merge nearby gaps (< 5% of width apart) — the spine area may have
    # thin ink strips between multiple gap segments
    merge_dist = max(20, int(w * 0.05))
    merged: list = [gaps[0]]
    for g in gaps[1:]:
        prev = merged[-1]
        prev_end = prev["x"] + prev["width"]
        if g["x"] - prev_end < merge_dist:
            # Merge: extend previous gap to cover both
            prev["width"] = g["x"] + g["width"] - prev["x"]
            prev["center"] = prev["x"] + prev["width"] // 2
        else:
            merged.append(g)
    gaps = merged

    # Sort gaps by width (largest = most likely spine)
    gaps.sort(key=lambda g: g["width"], reverse=True)

    # Use only gaps that are significant (>= 2% of image width)
    significant_gaps = [g for g in gaps if g["width"] >= w * 0.02]
    if not significant_gaps:
        # Fall back to widest gap
        significant_gaps = [gaps[0]]

    # Use the (at most three) widest significant gaps as split points
    split_points = sorted(g["center"] for g in significant_gaps[:3])

    # Build page rectangles between consecutive split points
    pages: list = []
    prev_x = 0
    for i, sx in enumerate(split_points):
        pages.append({"x": prev_x, "y": 0, "width": sx - prev_x,
                       "height": h, "page_index": i})
        prev_x = sx
    pages.append({"x": prev_x, "y": 0, "width": w - prev_x,
                   "height": h, "page_index": len(split_points)})

    # Filter out tiny pages (< 15% of total width)
    pages = [p for p in pages if p["width"] >= w * 0.15]
    if len(pages) < 2:
        return []

    # Re-index after filtering so indices are contiguous
    for i, p in enumerate(pages):
        p["page_index"] = i

    logger.info(
        "Page split detected: %d pages, gap widths=%s, split_points=%s",
        len(pages),
        [g["width"] for g in gaps[:len(split_points)]],
        split_points,
    )
    return pages


def detect_and_crop_page(
    img_bgr: np.ndarray,
    margin_frac: float = 0.01,
) -> Tuple[np.ndarray, Dict[str, Any]]:
    """Detect content boundary and crop scanner/book borders.

    Algorithm (4-edge detection):
    1. Adaptive threshold → binary (text=255, bg=0)
    2. Left edge: spine-shadow detection via grayscale column means,
       fallback to binary vertical projection
    3. Right edge: binary vertical projection (last ink column)
    4. Top/bottom edges: binary horizontal projection
    5. Sanity checks, then crop with configurable margin

    Args:
        img_bgr: Input BGR image (should already be deskewed/dewarped)
        margin_frac: Extra margin around content (fraction of dimension, default 1%)

    Returns:
        Tuple of (cropped_image, result_dict).  When no crop is applied the
        input image object itself is returned and ``crop_applied`` is False.
        ``result_dict`` carries ``crop_applied``, ``crop_rect``,
        ``crop_rect_pct``, ``original_size``, ``cropped_size``,
        ``detected_format``, ``format_confidence``, ``aspect_ratio`` and
        ``border_fractions``.
    """
    h, w = img_bgr.shape[:2]
    total_area = h * w

    result: Dict[str, Any] = {
        "crop_applied": False,
        "crop_rect": None,
        "crop_rect_pct": None,
        "original_size": {"width": w, "height": h},
        "cropped_size": {"width": w, "height": h},
        "detected_format": None,
        "format_confidence": 0.0,
        "aspect_ratio": round(max(h, w) / max(min(h, w), 1), 4),
        "border_fractions": {"top": 0.0, "bottom": 0.0, "left": 0.0, "right": 0.0},
    }

    # An empty image cannot be analysed (OpenCV rejects it and the border
    # fractions below would divide by zero) — hand it back untouched.
    # "unknown" / 0.0 is what _detect_format reports for non-positive sizes.
    if total_area == 0:
        result["detected_format"] = "unknown"
        return img_bgr, result

    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)

    # --- Binarise with adaptive threshold (works for white-on-white) ---
    binary = cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY_INV, blockSize=51, C=15,
    )

    # --- Left edge: spine-shadow detection ---
    left_edge = _detect_left_edge_shadow(gray, binary, w, h)

    # --- Right edge: binary vertical projection ---
    right_edge = _detect_right_edge(binary, w, h)

    # --- Top / bottom edges: binary horizontal projection ---
    top_edge, bottom_edge = _detect_top_bottom_edges(binary, w, h)

    # Compute border fractions
    border_top = top_edge / h
    border_bottom = (h - bottom_edge) / h
    border_left = left_edge / w
    border_right = (w - right_edge) / w

    result["border_fractions"] = {
        "top": round(border_top, 4),
        "bottom": round(border_bottom, 4),
        "left": round(border_left, 4),
        "right": round(border_right, 4),
    }

    # Sanity: only crop if at least one edge has >= 2% border
    min_border = 0.02
    if all(f < min_border for f in [border_top, border_bottom, border_left, border_right]):
        logger.info("All borders < %.0f%% — no crop needed", min_border * 100)
        result["detected_format"], result["format_confidence"] = _detect_format(w, h)
        return img_bgr, result

    # Add margin
    margin_x = int(w * margin_frac)
    margin_y = int(h * margin_frac)

    crop_x = max(0, left_edge - margin_x)
    crop_y = max(0, top_edge - margin_y)
    crop_x2 = min(w, right_edge + margin_x)
    crop_y2 = min(h, bottom_edge + margin_y)

    crop_w = crop_x2 - crop_x
    crop_h = crop_y2 - crop_y

    # Sanity: the crop must be a real rectangle covering >= 40% of the
    # original.  The explicit sign check matters because two negative
    # extents would otherwise multiply to a positive "area".
    if crop_w <= 0 or crop_h <= 0 or crop_w * crop_h < 0.40 * total_area:
        logger.warning("Cropped area too small (%.0f%%) — skipping crop",
                       100.0 * crop_w * crop_h / total_area)
        result["detected_format"], result["format_confidence"] = _detect_format(w, h)
        return img_bgr, result

    cropped = img_bgr[crop_y:crop_y2, crop_x:crop_x2].copy()

    detected_format, format_confidence = _detect_format(crop_w, crop_h)

    result["crop_applied"] = True
    result["crop_rect"] = {"x": crop_x, "y": crop_y, "width": crop_w, "height": crop_h}
    result["crop_rect_pct"] = {
        "x": round(100.0 * crop_x / w, 2),
        "y": round(100.0 * crop_y / h, 2),
        "width": round(100.0 * crop_w / w, 2),
        "height": round(100.0 * crop_h / h, 2),
    }
    result["cropped_size"] = {"width": crop_w, "height": crop_h}
    result["detected_format"] = detected_format
    result["format_confidence"] = format_confidence
    result["aspect_ratio"] = round(max(crop_w, crop_h) / max(min(crop_w, crop_h), 1), 4)

    logger.info(
        "Page cropped: %dx%d -> %dx%d, format=%s (%.0f%%), "
        "borders: T=%.1f%% B=%.1f%% L=%.1f%% R=%.1f%%",
        w, h, crop_w, crop_h, detected_format, format_confidence * 100,
        border_top * 100, border_bottom * 100,
        border_left * 100, border_right * 100,
    )

    return cropped, result


# ---------------------------------------------------------------------------
# Edge detection helpers
# ---------------------------------------------------------------------------

def _detect_left_edge_shadow(
    gray: np.ndarray,
    binary: np.ndarray,
    w: int,
    h: int,
) -> int:
    """Detect left content edge, accounting for book-spine shadow.

    Strategy: look at the left 25% of the image.
    1. Compute column-mean brightness in grayscale.
    2. Smooth with a boxcar kernel.
    3. Find the transition from shadow (dark) to page (bright).
    4. Fallback: use binary vertical projection if no shadow detected.
    """
    search_w = max(1, w // 4)

    # Column-mean brightness in the left quarter
    col_means = np.mean(gray[:, :search_w], axis=0).astype(np.float64)

    # Smooth with boxcar kernel (width = 1% of image width, min 5)
    kernel_size = max(5, w // 100)
    if kernel_size % 2 == 0:
        kernel_size += 1
    kernel = np.ones(kernel_size) / kernel_size
    smoothed = np.convolve(col_means, kernel, mode="same")

    # Determine brightness threshold: midpoint between darkest and brightest
    val_min = float(np.min(smoothed))
    val_max = float(np.max(smoothed))
    shadow_range = val_max - val_min

    # Only use shadow detection if there is a meaningful brightness gradient (> 20 levels)
    if shadow_range > 20:
        threshold = val_min + shadow_range * 0.6
        # Find first column where brightness exceeds threshold
        above = np.where(smoothed >= threshold)[0]
        if len(above) > 0:
            shadow_edge = int(above[0])
            logger.debug("Left edge: shadow detected at x=%d (range=%.0f)", shadow_edge, shadow_range)
            return shadow_edge

    # Fallback: binary vertical projection
    return _detect_edge_projection(binary, axis=0, from_start=True, dim=w)


def _detect_right_edge(binary: np.ndarray, w: int, h: int) -> int:
    """Locate the right content boundary of the binarised page.

    Delegates to ``_detect_edge_projection`` on the column densities
    (``axis=0``), scanning from the far end of the width ``w``.
    ``h`` is not used here.
    """
    right_x = _detect_edge_projection(binary, axis=0, from_start=False, dim=w)
    return right_x


def _detect_top_bottom_edges(binary: np.ndarray, w: int, h: int) -> Tuple[int, int]:
    """Locate the upper and lower content boundaries of the binarised page.

    Both values come from ``_detect_edge_projection`` on the row densities
    (``axis=1``): first scanning from the start, then from the end of the
    height ``h``.  ``w`` is not used here.

    Returns:
        ``(top, bottom)`` y positions.
    """
    upper, lower = (
        _detect_edge_projection(binary, axis=1, from_start=scan_from_start, dim=h)
        for scan_from_start in (True, False)
    )
    return upper, lower


def _detect_edge_projection(
    binary: np.ndarray,
    axis: int,
    from_start: bool,
    dim: int,
) -> int:
    """Find the content boundary along one axis from the ink-density projection.

    axis=0 → project vertically (column densities) → returns x position
    axis=1 → project horizontally (row densities) → returns y position

    Filters out narrow noise runs shorter than _MIN_RUN_FRAC of the dimension.

    Args:
        binary: Binarised image with ink pixels set to 255.
        axis: Axis to average over (see above).
        from_start: True for the near (left/top) boundary, False for the far
            (right/bottom) boundary.
        dim: Length of the scanned dimension in pixels.

    Returns:
        For ``from_start=True`` the index of the first content row/column.
        For ``from_start=False`` the index just past the last content
        row/column, i.e. an exclusive end suitable for slicing.  Without any
        content the result is ``0`` respectively ``dim``.
    """
    # Compute density per row/column (mean of binary pixels / 255)
    projection = np.mean(binary, axis=axis) / 255.0

    # Create mask of "ink" positions
    ink_mask = projection >= _INK_THRESHOLD

    # Filter narrow runs (noise)
    min_run = max(1, int(dim * _MIN_RUN_FRAC))
    ink_mask = _filter_narrow_runs(ink_mask, min_run)

    ink_positions = np.where(ink_mask)[0]
    if len(ink_positions) == 0:
        return 0 if from_start else dim

    if from_start:
        return int(ink_positions[0])

    # Exclusive end: callers slice with this value and measure the border
    # as ``dim - edge``, matching the ``dim`` returned when nothing is
    # found.  Returning the inclusive index would cut off the last content
    # row/column.
    return int(ink_positions[-1]) + 1


def _filter_narrow_runs(mask: np.ndarray, min_run: int) -> np.ndarray:
    """Remove True-runs shorter than min_run pixels."""
    if min_run <= 1:
        return mask

    result = mask.copy()
    n = len(result)
    i = 0
    while i < n:
        if result[i]:
            start = i
            while i < n and result[i]:
                i += 1
            if i - start < min_run:
                result[start:i] = False
        else:
            i += 1
    return result


# ---------------------------------------------------------------------------
# Format detection (kept as optional metadata)
# ---------------------------------------------------------------------------

def _detect_format(width: int, height: int) -> Tuple[str, float]:
    """Detect paper format from dimensions by comparing aspect ratios."""
    if width <= 0 or height <= 0:
        return "unknown", 0.0

    aspect = max(width, height) / min(width, height)

    best_format = "unknown"
    best_diff = float("inf")

    for fmt, expected_ratio in PAPER_FORMATS.items():
        diff = abs(aspect - expected_ratio)
        if diff < best_diff:
            best_diff = diff
            best_format = fmt

    confidence = max(0.0, 1.0 - best_diff * 5.0)

    if confidence < 0.3:
        return "unknown", 0.0

    return best_format, round(confidence, 3)