Files
breakpilot-lehrer/klausur-service/backend/page_crop.py
Benjamin Admin d36972b464
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 1m51s
CI / test-python-agent-core (push) Successful in 18s
CI / test-nodejs-website (push) Successful in 19s
fix: detect spine by brightness, not ink density
The previous algorithm used binary ink projection and found false
splits at normal text column gaps. The spine of a book on a scanner
has a characteristic DARK gray strip (scanner bed) flanked by bright
white paper on both sides.

New approach: column-mean brightness with heavy smoothing, looking for
a dark valley (< 88% of paper brightness) in the center region that
has bright paper on both sides.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-17 16:52:29 +01:00

419 lines
14 KiB
Python

"""
Page Crop - Content-based crop for scanned pages and book scans.
Detects the content boundary by analysing ink density projections and
(for book scans) the spine shadow gradient. Works with both loose A4
sheets on dark scanners AND book scans with white backgrounds.
License: Apache 2.0
"""
import logging
from typing import Dict, Any, Tuple, Optional
import cv2
import numpy as np
logger = logging.getLogger(__name__)

# Known paper format aspect ratios (height / width, portrait orientation).
# NOTE: the ISO A-series entries (A3, A4, A5) all evaluate to roughly 1.414,
# so the aspect ratio by itself cannot reliably tell those formats apart.
PAPER_FORMATS = {
    "A4": 297.0 / 210.0, # 1.4143
    "A5": 210.0 / 148.0, # 1.4189
    "Letter": 11.0 / 8.5, # 1.2941
    "Legal": 14.0 / 8.5, # 1.6471
    "A3": 420.0 / 297.0, # 1.4141
}

# Minimum ink density (fraction of "ink" pixels in a row/column of the binary
# image) for that row/column to count as "content".
_INK_THRESHOLD = 0.003 # 0.3%

# Minimum run length (fraction of the image dimension) to keep — shorter runs
# of consecutive "content" rows/columns are treated as noise and discarded.
_MIN_RUN_FRAC = 0.005 # 0.5%
def detect_page_splits(
    img_bgr: np.ndarray,
) -> list:
    """Detect if the image is a multi-page spread and return split rectangles.

    Uses **brightness** (not ink density) to find the spine area:
    the scanner bed produces a characteristic gray strip where pages meet,
    which is darker than the white paper on either side.

    Steps:
      1. Skip images that are not landscape-ish (width < 1.15 * height).
      2. Build a heavily smoothed column-mean brightness profile.
      3. Look for the widest run of columns in the central 30-70% of the
         width that is darker than 88% of the brightest smoothed column.
      4. Require that run to be at least 1% of the width and to be flanked
         by bright columns on both sides.
      5. Split the image at the centre of that run.

    Args:
        img_bgr: Input image; it is converted with ``cv2.COLOR_BGR2GRAY``.

    Returns:
        A list of page dicts ``{x, y, width, height, page_index}``
        or an empty list if only one page is detected.
    """
    h, w = img_bgr.shape[:2]

    # Only check landscape-ish images (width >= height * 1.15)
    if w < h * 1.15:
        return []

    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)

    # Column-mean brightness (0-255) — the spine is darker (gray scanner bed)
    col_brightness = np.mean(gray, axis=0).astype(np.float64)

    # Heavy smoothing (odd boxcar, ~2% of the width) to ignore individual
    # text lines/columns
    kern = max(11, w // 50)
    if kern % 2 == 0:
        kern += 1
    brightness_smooth = np.convolve(col_brightness, np.ones(kern) / kern, mode="same")

    # Page paper is bright (typically > 200), spine/scanner bed is darker
    page_brightness = float(np.max(brightness_smooth))
    if page_brightness < 100:
        return []  # Very dark image, skip

    # Spine threshold: anything below 88% of the paper brightness counts as dark
    spine_thresh = page_brightness * 0.88

    # Search in center region (30-70% of width)
    center_lo = int(w * 0.30)
    center_hi = int(w * 0.70)
    center_brightness = brightness_smooth[center_lo:center_hi]
    if center_brightness.size == 0:
        # Degenerate (extremely narrow) image: nothing to search
        return []

    # Find the darkest valley in the center region
    darkest_val = float(np.min(center_brightness))
    if darkest_val >= spine_thresh:
        logger.debug("No spine detected: min brightness %.0f >= threshold %.0f",
                     darkest_val, spine_thresh)
        return []

    # Find the widest contiguous dark run (spine area). Padding the mask with
    # False on both sides makes every run produce one +1 and one -1 step.
    is_dark = center_brightness < spine_thresh
    steps = np.diff(np.concatenate(([False], is_dark, [False])).astype(np.int8))
    run_starts = np.flatnonzero(steps == 1)
    run_ends = np.flatnonzero(steps == -1)  # exclusive end of each run
    # argmax returns the first maximum, i.e. the left-most of equally wide runs
    widest = int(np.argmax(run_ends - run_starts))
    best_start = int(run_starts[widest])
    best_end = int(run_ends[widest])

    spine_w = best_end - best_start
    if spine_w < w * 0.01:
        logger.debug("Spine too narrow: %dpx (< %dpx)", spine_w, int(w * 0.01))
        return []

    spine_x = center_lo + best_start
    spine_center = spine_x + spine_w // 2

    # Verify: must have bright (paper) content on BOTH sides.
    # Use at least one column so the mean is never taken over an empty slice
    # (which would yield NaN and silently pass the comparison below).
    flank_w = max(1, w // 10)
    left_brightness = float(np.mean(brightness_smooth[max(0, spine_x - flank_w):spine_x]))
    right_end = center_lo + best_end
    right_brightness = float(np.mean(brightness_smooth[right_end:min(w, right_end + flank_w)]))

    if left_brightness < spine_thresh or right_brightness < spine_thresh:
        logger.debug("No bright paper flanking spine: left=%.0f right=%.0f thresh=%.0f",
                     left_brightness, right_brightness, spine_thresh)
        return []

    logger.info(
        "Spine detected: x=%d..%d (w=%d), brightness=%.0f vs paper=%.0f, "
        "left_paper=%.0f, right_paper=%.0f",
        spine_x, right_end, spine_w, darkest_val, page_brightness,
        left_brightness, right_brightness,
    )

    # Split at the spine center
    split_points = [spine_center]

    # Build page rectangles
    pages: list = []
    prev_x = 0
    for i, sx in enumerate(split_points):
        pages.append({"x": prev_x, "y": 0, "width": sx - prev_x,
                      "height": h, "page_index": i})
        prev_x = sx
    pages.append({"x": prev_x, "y": 0, "width": w - prev_x,
                  "height": h, "page_index": len(split_points)})

    # Filter out tiny pages (< 15% of total width)
    pages = [p for p in pages if p["width"] >= w * 0.15]
    if len(pages) < 2:
        return []

    # Re-index
    for i, p in enumerate(pages):
        p["page_index"] = i

    # The only "gap" found by this algorithm is the spine itself; report its
    # width (the previous code referenced an undefined name here and raised
    # NameError on every successful detection).
    logger.info(
        "Page split detected: %d pages, gap widths=%s, split_points=%s",
        len(pages),
        [spine_w],
        split_points,
    )
    return pages
def detect_and_crop_page(
    img_bgr: np.ndarray,
    margin_frac: float = 0.01,
) -> Tuple[np.ndarray, Dict[str, Any]]:
    """Locate the content area of a scan and cut away the surrounding border.

    Processing order:
      1. Convert to grayscale and binarise with an inverted Gaussian adaptive
         threshold.
      2. Ask ``_detect_left_edge_shadow`` for the left edge,
         ``_detect_right_edge`` for the right edge and
         ``_detect_top_bottom_edges`` for the top/bottom edges.
      3. Leave the image untouched when every border is below 2% of its
         dimension, or when the crop would keep less than 40% of the area.
      4. Otherwise crop, widening the box by ``margin_frac`` on each side
         (clamped to the image).

    Args:
        img_bgr: Input BGR image (should already be deskewed/dewarped).
        margin_frac: Extra margin around content, as a fraction of the
            respective image dimension (default 1%).

    Returns:
        ``(image, info)`` where ``image`` is either a cropped copy or the
        input itself, and ``info`` is a dict with the keys ``crop_applied``,
        ``crop_rect``, ``crop_rect_pct``, ``original_size``, ``cropped_size``,
        ``detected_format``, ``format_confidence``, ``aspect_ratio`` and
        ``border_fractions``.
    """
    img_h, img_w = img_bgr.shape[:2]
    full_area = img_h * img_w

    result: Dict[str, Any] = {
        "crop_applied": False,
        "crop_rect": None,
        "crop_rect_pct": None,
        "original_size": {"width": img_w, "height": img_h},
        "cropped_size": {"width": img_w, "height": img_h},
        "detected_format": None,
        "format_confidence": 0.0,
        "aspect_ratio": round(max(img_h, img_w) / max(min(img_h, img_w), 1), 4),
        "border_fractions": {"top": 0.0, "bottom": 0.0, "left": 0.0, "right": 0.0},
    }

    # Grayscale + inverted adaptive threshold (ink becomes 255, background 0)
    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    binary = cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY_INV, blockSize=51, C=15,
    )

    # Four content edges
    x_left = _detect_left_edge_shadow(gray, binary, img_w, img_h)
    x_right = _detect_right_edge(binary, img_w, img_h)
    y_top, y_bottom = _detect_top_bottom_edges(binary, img_w, img_h)

    # Border size on each side as a fraction of the matching dimension
    fractions = {
        "top": y_top / img_h,
        "bottom": (img_h - y_bottom) / img_h,
        "left": x_left / img_w,
        "right": (img_w - x_right) / img_w,
    }
    result["border_fractions"] = {
        side: round(frac, 4) for side, frac in fractions.items()
    }

    # Nothing worth cropping unless some side has a border of at least 2%
    min_border = 0.02
    if not any(frac >= min_border for frac in fractions.values()):
        logger.info("All borders < %.0f%% — no crop needed", min_border * 100)
        result["detected_format"], result["format_confidence"] = _detect_format(img_w, img_h)
        return img_bgr, result

    # Crop box = detected edges widened by the margin, clamped to the image
    pad_x = int(img_w * margin_frac)
    pad_y = int(img_h * margin_frac)
    x0 = max(0, x_left - pad_x)
    y0 = max(0, y_top - pad_y)
    x1 = min(img_w, x_right + pad_x)
    y1 = min(img_h, y_bottom + pad_y)
    out_w = x1 - x0
    out_h = y1 - y0

    # Refuse crops that would throw away more than 60% of the image
    if out_w * out_h < 0.40 * full_area:
        logger.warning("Cropped area too small (%.0f%%) — skipping crop",
                       100.0 * out_w * out_h / full_area)
        result["detected_format"], result["format_confidence"] = _detect_format(img_w, img_h)
        return img_bgr, result

    cropped = img_bgr[y0:y1, x0:x1].copy()
    detected_format, format_confidence = _detect_format(out_w, out_h)

    result.update({
        "crop_applied": True,
        "crop_rect": {"x": x0, "y": y0, "width": out_w, "height": out_h},
        "crop_rect_pct": {
            "x": round(100.0 * x0 / img_w, 2),
            "y": round(100.0 * y0 / img_h, 2),
            "width": round(100.0 * out_w / img_w, 2),
            "height": round(100.0 * out_h / img_h, 2),
        },
        "cropped_size": {"width": out_w, "height": out_h},
        "detected_format": detected_format,
        "format_confidence": format_confidence,
        "aspect_ratio": round(max(out_w, out_h) / max(min(out_w, out_h), 1), 4),
    })

    logger.info(
        "Page cropped: %dx%d -> %dx%d, format=%s (%.0f%%), "
        "borders: T=%.1f%% B=%.1f%% L=%.1f%% R=%.1f%%",
        img_w, img_h, out_w, out_h, detected_format, format_confidence * 100,
        fractions["top"] * 100, fractions["bottom"] * 100,
        fractions["left"] * 100, fractions["right"] * 100,
    )
    return cropped, result
# ---------------------------------------------------------------------------
# Edge detection helpers
# ---------------------------------------------------------------------------
def _detect_left_edge_shadow(
    gray: np.ndarray,
    binary: np.ndarray,
    w: int,
    h: int,
) -> int:
    """Detect left content edge, accounting for book-spine shadow.

    Strategy: look at the left 25% of the image.
      1. Compute column-mean brightness in grayscale.
      2. Smooth with a boxcar kernel (borders are edge-replicated).
      3. Find the transition from shadow (dark) to page (bright).
      4. Fallback: use binary vertical projection if no shadow detected.

    Args:
        gray: Grayscale image; only its left quarter is inspected.
        binary: Binarised image, used only by the projection fallback.
        w: Image width in pixels.
        h: Image height in pixels (unused; kept for a uniform helper signature).

    Returns:
        The x position of the left content edge.
    """
    search_w = max(1, w // 4)

    # Column-mean brightness in the left quarter
    col_means = np.mean(gray[:, :search_w], axis=0).astype(np.float64)

    if col_means.size > 0:
        # Smooth with boxcar kernel (width = 1% of image width, min 5, odd)
        kernel_size = max(5, w // 100)
        if kernel_size % 2 == 0:
            kernel_size += 1
        kernel = np.ones(kernel_size) / kernel_size

        # Replicate the border values before smoothing. A plain
        # ``mode="same"`` convolution pads with zeros, which drags the first
        # and last few smoothed columns towards black; on a uniformly bright
        # page that fake dip alone exceeded the gradient limit below, so the
        # shadow branch always fired and the fallback was never reached.
        padded = np.pad(col_means, kernel_size // 2, mode="edge")
        smoothed = np.convolve(padded, kernel, mode="valid")

        val_min = float(np.min(smoothed))
        val_max = float(np.max(smoothed))
        shadow_range = val_max - val_min

        # Only use shadow detection if there is a meaningful brightness
        # gradient (> 20 levels)
        if shadow_range > 20:
            # Threshold sits 60% of the way from the darkest to the brightest level
            threshold = val_min + shadow_range * 0.6
            # Find first column where brightness exceeds threshold
            above = np.where(smoothed >= threshold)[0]
            if len(above) > 0:
                shadow_edge = int(above[0])
                logger.debug("Left edge: shadow detected at x=%d (range=%.0f)", shadow_edge, shadow_range)
                return shadow_edge

    # Fallback: binary vertical projection
    return _detect_edge_projection(binary, axis=0, from_start=True, dim=w)
def _detect_right_edge(binary: np.ndarray, w: int, h: int) -> int:
    """Return the x position of the right content edge.

    Delegates to ``_detect_edge_projection`` on the column projection of
    ``binary``, scanning from the end. ``h`` is accepted but not used.
    """
    right_x = _detect_edge_projection(binary, axis=0, from_start=False, dim=w)
    return right_x
def _detect_top_bottom_edges(binary: np.ndarray, w: int, h: int) -> Tuple[int, int]:
    """Return the ``(top, bottom)`` y positions of the content.

    Both values come from ``_detect_edge_projection`` on the row projection of
    ``binary``: first scanning from the start (top), then from the end
    (bottom). ``w`` is accepted but not used.
    """
    first_row, last_row = (
        _detect_edge_projection(binary, axis=1, from_start=scan_forward, dim=h)
        for scan_forward in (True, False)
    )
    return first_row, last_row
def _detect_edge_projection(
    binary: np.ndarray,
    axis: int,
    from_start: bool,
    dim: int,
) -> int:
    """Find the leading/trailing content edge from an ink-density projection.

    axis=0 → project vertically (column densities) → returns x position
    axis=1 → project horizontally (row densities) → returns y position

    Filters out narrow noise runs shorter than _MIN_RUN_FRAC of the dimension.

    Args:
        binary: Binarised image whose ink pixels are 255 (the projection is
            divided by 255 to obtain a density).
        axis: Axis passed to ``np.mean`` (see above).
        from_start: True for the leading edge, False for the trailing edge.
        dim: Size of the dimension being searched; returned as the trailing
            edge when no ink is found.

    Returns:
        Leading edge: index of the first ink row/column (0 when there is no
        ink). Trailing edge: one past the last ink row/column (``dim`` when
        there is no ink), i.e. an exclusive bound, so that
        ``image[start:end]`` keeps the last ink row/column.
    """
    # Compute density per row/column (mean of binary pixels / 255)
    projection = np.mean(binary, axis=axis) / 255.0

    # Create mask of "ink" positions
    ink_mask = projection >= _INK_THRESHOLD

    # Filter narrow runs (noise)
    min_run = max(1, int(dim * _MIN_RUN_FRAC))
    ink_mask = _filter_narrow_runs(ink_mask, min_run)

    ink_positions = np.flatnonzero(ink_mask)
    if ink_positions.size == 0:
        return 0 if from_start else dim

    if from_start:
        return int(ink_positions[0])
    # +1 turns the inclusive index of the last ink position into an exclusive
    # bound, consistent with the ``dim`` fallback above. Returning the bare
    # index made a zero-margin crop drop the last ink row/column.
    return int(ink_positions[-1]) + 1
def _filter_narrow_runs(mask: np.ndarray, min_run: int) -> np.ndarray:
    """Clear every run of consecutive True values shorter than ``min_run``.

    Args:
        mask: 1-D boolean array.
        min_run: Minimum number of consecutive True entries a run needs in
            order to survive.

    Returns:
        ``mask`` itself (not a copy) when ``min_run <= 1``; otherwise a copy
        in which all too-short runs have been set to False.
    """
    if min_run <= 1:
        return mask

    cleaned = mask.copy()

    # Pad with a zero on both sides so each run yields exactly one rising
    # (+1) and one falling (-1) step in the difference signal.
    flags = np.concatenate(([0], np.asarray(mask, dtype=bool).astype(np.int8), [0]))
    steps = np.diff(flags)
    run_starts = np.flatnonzero(steps == 1)
    run_stops = np.flatnonzero(steps == -1)  # exclusive end of each run

    for lo, hi in zip(run_starts, run_stops):
        if hi - lo < min_run:
            cleaned[lo:hi] = False
    return cleaned
# ---------------------------------------------------------------------------
# Format detection (kept as optional metadata)
# ---------------------------------------------------------------------------
def _detect_format(width: int, height: int) -> Tuple[str, float]:
    """Guess the paper format whose aspect ratio is closest to the given size.

    The long-side / short-side ratio is compared against every entry of
    ``PAPER_FORMATS``; the closest entry wins (the earliest entry on a tie).
    Confidence is ``1 - 5 * |ratio difference|``, floored at 0.

    Args:
        width: Width in pixels.
        height: Height in pixels.

    Returns:
        ``(format_name, confidence)`` with the confidence rounded to three
        decimals, or ``("unknown", 0.0)`` when a dimension is not positive
        or the confidence is below 0.3.
    """
    if min(width, height) <= 0:
        return "unknown", 0.0

    aspect = max(width, height) / min(width, height)

    # (deviation, name) per known format; min() keeps the first of equal minima
    deviations = (
        (abs(aspect - ratio), name) for name, ratio in PAPER_FORMATS.items()
    )
    smallest_gap, closest_name = min(
        deviations,
        key=lambda pair: pair[0],
        default=(float("inf"), "unknown"),
    )

    confidence = max(0.0, 1.0 - smallest_gap * 5.0)
    if confidence < 0.3:
        return "unknown", 0.0
    return closest_name, round(confidence, 3)