"""
Page Crop - Core Crop and Format Detection

Content-based crop for scanned pages and book scans. Detects the content
boundary by analysing ink density projections and (for book scans) the
spine shadow gradient.

Extracted from page_crop.py to keep files under 500 LOC.

License: Apache 2.0
"""

import logging
from typing import Any, Dict, List, Tuple

import cv2
import numpy as np

from page_crop_edges import (
    _detect_left_edge_shadow,
    _detect_right_edge_shadow,
    _detect_top_bottom_edges,
)

logger = logging.getLogger(__name__)

# Known paper format aspect ratios (height / width, portrait orientation).
# NOTE: the ISO A-series sizes all share a ~sqrt(2) ratio, so A3/A4/A5 cannot
# really be told apart by aspect ratio alone; the detected format is only
# indicative metadata.
PAPER_FORMATS = {
    "A4": 297.0 / 210.0,      # 1.4143
    "A5": 210.0 / 148.0,      # 1.4189
    "Letter": 11.0 / 8.5,     # 1.2941
    "Legal": 14.0 / 8.5,      # 1.6471
    "A3": 420.0 / 297.0,      # 1.4141
}


def _find_dark_runs(is_dark: np.ndarray, min_width: int) -> List[Tuple[int, int]]:
    """Return contiguous runs of True values in a 1-D boolean mask.

    Args:
        is_dark: 1-D boolean array.
        min_width: Runs shorter than this many samples are discarded.

    Returns:
        List of ``(start, end)`` index pairs (``end`` exclusive), in
        left-to-right order, as plain Python ints.
    """
    if is_dark.size == 0:
        return []
    # Pad with False on both sides so runs touching either border still
    # produce a rising (+1) and a falling (-1) step.
    padded = np.concatenate(([False], is_dark, [False])).astype(np.int8)
    steps = np.diff(padded)
    starts = np.flatnonzero(steps == 1)
    ends = np.flatnonzero(steps == -1)
    # Cast to int so downstream page dicts hold plain Python ints.
    return [
        (int(start), int(end))
        for start, end in zip(starts, ends)
        if end - start >= min_width
    ]


def detect_page_splits(
    img_bgr: np.ndarray,
) -> List[Dict[str, int]]:
    """Detect if the image is a multi-page spread and return split rectangles.

    Uses **brightness** (not ink density) to find the spine area: the scanner
    bed produces a characteristic gray strip where pages meet, which is darker
    than the white paper on either side.

    Args:
        img_bgr: Input BGR image.

    Returns:
        A list of page dicts ``{x, y, width, height, page_index}`` or an
        empty list if only one page is detected.
    """
    h, w = img_bgr.shape[:2]

    # Only check landscape-ish images (width > height * 1.15)
    if w < h * 1.15:
        return []

    # Heavy smoothing to ignore individual text lines (odd kernel width)
    kern = max(11, w // 50)
    if kern % 2 == 0:
        kern += 1

    # np.convolve(mode="same") returns max(len(signal), len(kernel)) samples,
    # so a kernel longer than the image width would yield a profile that no
    # longer lines up with image columns. Such images are far too small to
    # contain a detectable spine anyway.
    if w < kern:
        logger.debug("Image too narrow (%dpx) for spine detection", w)
        return []

    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)

    # Column-mean brightness (0-255) — the spine is darker (gray scanner bed)
    col_brightness = np.mean(gray, axis=0).astype(np.float64)
    brightness_smooth = np.convolve(col_brightness, np.ones(kern) / kern, mode="same")

    # Page paper is bright (typically > 200), spine/scanner bed is darker
    page_brightness = float(np.max(brightness_smooth))
    if page_brightness < 100:
        return []  # Very dark image, skip

    # Spine threshold: significantly darker than the page
    spine_thresh = page_brightness * 0.88

    # Search in center region (30-70% of width)
    center_lo = int(w * 0.30)
    center_hi = int(w * 0.70)

    # Find the darkest valley in the center region
    center_brightness = brightness_smooth[center_lo:center_hi]
    darkest_val = float(np.min(center_brightness))

    if darkest_val >= spine_thresh:
        logger.debug("No spine detected: min brightness %.0f >= threshold %.0f",
                     darkest_val, spine_thresh)
        return []

    # Find ALL contiguous dark runs in the center region, dropping runs that
    # are too narrow (< 1% of image width). Indices are relative to center_lo.
    min_spine_px = int(w * 0.01)
    dark_runs = _find_dark_runs(center_brightness < spine_thresh, min_spine_px)

    if not dark_runs:
        logger.debug("No dark runs wider than %dpx in center region", min_spine_px)
        return []

    # Score each dark run: prefer centered, dark, narrow valleys
    center_region_len = center_hi - center_lo
    image_center_in_region = (w * 0.5 - center_lo)
    sigma = center_region_len * 0.15  # width of the Gaussian centering weight
    best_score = -1.0
    best_start, best_end = dark_runs[0]

    for run_start, run_end in dark_runs:
        run_width = run_end - run_start
        run_center = (run_start + run_end) / 2.0

        # Gaussian falloff with distance from the image center
        dist = abs(run_center - image_center_in_region)
        center_factor = float(np.exp(-0.5 * (dist / sigma) ** 2))

        # Relative depth of the valley below the spine threshold
        run_brightness = float(np.mean(center_brightness[run_start:run_end]))
        darkness_factor = max(0.0, (spine_thresh - run_brightness) / spine_thresh)

        # Full bonus up to 5% of image width, linearly fading to zero at 15%
        width_frac = run_width / w
        if width_frac <= 0.05:
            narrowness_bonus = 1.0
        elif width_frac <= 0.15:
            narrowness_bonus = 1.0 - (width_frac - 0.05) / 0.10
        else:
            narrowness_bonus = 0.0

        score = center_factor * darkness_factor * (0.3 + 0.7 * narrowness_bonus)

        logger.debug(
            "Dark run x=%d..%d (w=%d): center_f=%.3f dark_f=%.3f narrow_b=%.3f -> score=%.4f",
            center_lo + run_start, center_lo + run_end, run_width,
            center_factor, darkness_factor, narrowness_bonus, score,
        )

        if score > best_score:
            best_score = score
            best_start, best_end = run_start, run_end

    # Convert the winning run back to absolute image columns
    spine_w = best_end - best_start
    spine_x = center_lo + best_start
    spine_center = spine_x + spine_w // 2

    logger.debug(
        "Best spine candidate: x=%d..%d (w=%d), score=%.4f",
        spine_x, spine_x + spine_w, spine_w, best_score,
    )

    # Verify: must have bright (paper) content on BOTH sides
    flank = w // 10
    left_brightness = float(np.mean(brightness_smooth[max(0, spine_x - flank):spine_x]))
    right_end = center_lo + best_end
    right_brightness = float(np.mean(brightness_smooth[right_end:min(w, right_end + flank)]))

    if left_brightness < spine_thresh or right_brightness < spine_thresh:
        logger.debug("No bright paper flanking spine: left=%.0f right=%.0f thresh=%.0f",
                     left_brightness, right_brightness, spine_thresh)
        return []

    logger.info(
        "Spine detected: x=%d..%d (w=%d), brightness=%.0f vs paper=%.0f, "
        "left_paper=%.0f, right_paper=%.0f",
        spine_x, right_end, spine_w, darkest_val, page_brightness,
        left_brightness, right_brightness,
    )

    # Split at the spine center
    split_points = [spine_center]

    # Build page rectangles
    pages: List[Dict[str, int]] = []
    prev_x = 0
    for i, sx in enumerate(split_points):
        pages.append({"x": prev_x, "y": 0, "width": sx - prev_x,
                      "height": h, "page_index": i})
        prev_x = sx
    pages.append({"x": prev_x, "y": 0, "width": w - prev_x,
                  "height": h, "page_index": len(split_points)})

    # Filter out tiny pages (< 15% of total width)
    pages = [p for p in pages if p["width"] >= w * 0.15]
    if len(pages) < 2:
        return []

    # Re-index
    for i, p in enumerate(pages):
        p["page_index"] = i

    logger.info(
        "Page split detected: %d pages, spine_w=%d, split_points=%s",
        len(pages), spine_w, split_points,
    )

    return pages


def detect_and_crop_page(
    img_bgr: np.ndarray,
    margin_frac: float = 0.01,
) -> Tuple[np.ndarray, Dict[str, Any]]:
    """Detect content boundary and crop scanner/book borders.

    Algorithm (4-edge detection):
    1. Adaptive threshold -> binary (text=255, bg=0)
    2. Left/right edges: delegated to ``_detect_left_edge_shadow`` and
       ``_detect_right_edge_shadow`` (given the grayscale and binary images)
    3. Top/bottom edges: delegated to ``_detect_top_bottom_edges`` (given
       the binary image)
    4. Sanity checks, then crop with configurable margin

    The crop is skipped (original image returned, ``crop_applied`` False)
    when every border is below 2%, when the crop rectangle is degenerate,
    or when the cropped area would be under 40% of the original.

    Args:
        img_bgr: Input BGR image (should already be deskewed/dewarped)
        margin_frac: Extra margin around content (fraction of dimension, default 1%)

    Returns:
        Tuple of (cropped_image, result_dict). ``result_dict`` carries
        ``crop_applied``, ``crop_rect``, ``crop_rect_pct``, ``original_size``,
        ``cropped_size``, ``detected_format``, ``format_confidence``,
        ``aspect_ratio`` and ``border_fractions``.
    """
    h, w = img_bgr.shape[:2]
    total_area = h * w

    result: Dict[str, Any] = {
        "crop_applied": False,
        "crop_rect": None,
        "crop_rect_pct": None,
        "original_size": {"width": w, "height": h},
        "cropped_size": {"width": w, "height": h},
        "detected_format": None,
        "format_confidence": 0.0,
        "aspect_ratio": round(max(h, w) / max(min(h, w), 1), 4),
        "border_fractions": {"top": 0.0, "bottom": 0.0, "left": 0.0, "right": 0.0},
    }

    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)

    # --- Binarise with adaptive threshold ---
    binary = cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY_INV, blockSize=51, C=15,
    )

    # --- Edge detection ---
    left_edge = _detect_left_edge_shadow(gray, binary, w, h)
    right_edge = _detect_right_edge_shadow(gray, binary, w, h)
    top_edge, bottom_edge = _detect_top_bottom_edges(binary, w, h)

    # Compute border fractions
    border_top = top_edge / h
    border_bottom = (h - bottom_edge) / h
    border_left = left_edge / w
    border_right = (w - right_edge) / w

    result["border_fractions"] = {
        "top": round(border_top, 4),
        "bottom": round(border_bottom, 4),
        "left": round(border_left, 4),
        "right": round(border_right, 4),
    }

    # Sanity: only crop if at least one edge has > 2% border
    min_border = 0.02
    if all(f < min_border for f in [border_top, border_bottom, border_left, border_right]):
        logger.info("All borders < %.0f%% — no crop needed", min_border * 100)
        result["detected_format"], result["format_confidence"] = _detect_format(w, h)
        return img_bgr, result

    # Add margin
    margin_x = int(w * margin_frac)
    margin_y = int(h * margin_frac)

    crop_x = max(0, left_edge - margin_x)
    crop_y = max(0, top_edge - margin_y)
    crop_x2 = min(w, right_edge + margin_x)
    crop_y2 = min(h, bottom_edge + margin_y)

    crop_w = crop_x2 - crop_x
    crop_h = crop_y2 - crop_y

    # Sanity: the rectangle must be non-degenerate. Checked separately from
    # the area test because two negative dimensions multiply to a positive
    # "area" and would otherwise slip through, producing an empty crop.
    if crop_w <= 0 or crop_h <= 0:
        logger.warning("Degenerate crop rectangle (%dx%d) — skipping crop", crop_w, crop_h)
        result["detected_format"], result["format_confidence"] = _detect_format(w, h)
        return img_bgr, result

    # Sanity: cropped area must be >= 40% of original
    if crop_w * crop_h < 0.40 * total_area:
        logger.warning("Cropped area too small (%.0f%%) — skipping crop",
                       100.0 * crop_w * crop_h / total_area)
        result["detected_format"], result["format_confidence"] = _detect_format(w, h)
        return img_bgr, result

    # Copy so the result does not alias the caller's image buffer
    cropped = img_bgr[crop_y:crop_y2, crop_x:crop_x2].copy()

    detected_format, format_confidence = _detect_format(crop_w, crop_h)

    result["crop_applied"] = True
    result["crop_rect"] = {"x": crop_x, "y": crop_y, "width": crop_w, "height": crop_h}
    result["crop_rect_pct"] = {
        "x": round(100.0 * crop_x / w, 2),
        "y": round(100.0 * crop_y / h, 2),
        "width": round(100.0 * crop_w / w, 2),
        "height": round(100.0 * crop_h / h, 2),
    }
    result["cropped_size"] = {"width": crop_w, "height": crop_h}
    result["detected_format"] = detected_format
    result["format_confidence"] = format_confidence
    result["aspect_ratio"] = round(max(crop_w, crop_h) / max(min(crop_w, crop_h), 1), 4)

    logger.info(
        "Page cropped: %dx%d -> %dx%d, format=%s (%.0f%%), "
        "borders: T=%.1f%% B=%.1f%% L=%.1f%% R=%.1f%%",
        w, h, crop_w, crop_h, detected_format, format_confidence * 100,
        border_top * 100, border_bottom * 100,
        border_left * 100, border_right * 100,
    )

    return cropped, result


# ---------------------------------------------------------------------------
# Format detection (kept as optional metadata)
# ---------------------------------------------------------------------------

def _detect_format(width: int, height: int) -> Tuple[str, float]:
    """Detect paper format from dimensions by comparing aspect ratios.

    The long-side / short-side ratio is compared against every entry of
    ``PAPER_FORMATS``; the closest entry wins (first one on ties).

    Args:
        width: Width in pixels.
        height: Height in pixels.

    Returns:
        ``(format_name, confidence)``. Confidence is ``1 - 5 * |ratio diff|``
        rounded to 3 decimals; ``("unknown", 0.0)`` is returned for
        non-positive dimensions or when confidence falls below 0.3.
    """
    if width <= 0 or height <= 0:
        return "unknown", 0.0

    # Orientation-independent ratio (always >= 1)
    aspect = max(width, height) / min(width, height)

    best_format = "unknown"
    best_diff = float("inf")

    for fmt, expected_ratio in PAPER_FORMATS.items():
        diff = abs(aspect - expected_ratio)
        if diff < best_diff:
            best_diff = diff
            best_format = fmt

    confidence = max(0.0, 1.0 - best_diff * 5.0)

    if confidence < 0.3:
        return "unknown", 0.0

    return best_format, round(confidence, 3)