Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 40s
CI / test-go-edu-search (push) Successful in 42s
CI / test-python-klausur (push) Failing after 2m39s
CI / test-python-agent-core (push) Successful in 30s
CI / test-nodejs-website (push) Successful in 32s
Left-side book fold shadows have a V-shape: brightness dips from the edge toward a peak at ~5-10% of width, then rises again. The previous algorithm scanned from the edge inward and immediately found a low dark fraction (0.13 at x=0), missing the gutter entirely. Now finds the PEAK of the dark fraction profile first, then scans from that peak toward the page center to find the transition point. Works for both V-shaped left gutters and edge-darkening right gutters. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
759 lines
26 KiB
Python
759 lines
26 KiB
Python
"""
|
||
Page Crop - Content-based crop for scanned pages and book scans.
|
||
|
||
Detects the content boundary by analysing ink density projections and
|
||
(for book scans) the spine shadow gradient. Works with both loose A4
|
||
sheets on dark scanners AND book scans with white backgrounds.
|
||
|
||
License: Apache 2.0
|
||
"""
|
||
|
||
import logging
|
||
from typing import Dict, Any, Tuple, Optional
|
||
|
||
import cv2
|
||
import numpy as np
|
||
|
||
logger = logging.getLogger(__name__)

# Aspect ratio (long side / short side, i.e. height / width in portrait
# orientation) of the paper formats that format detection can report.
# Each entry is (name, long side, short side); only the ratio is kept, so the
# unit of the two lengths does not matter.
PAPER_FORMATS = {
    name: long_side / short_side
    for name, long_side, short_side in (
        ("A4", 297.0, 210.0),     # ~1.4143
        ("A5", 210.0, 148.0),     # ~1.4189
        ("Letter", 11.0, 8.5),    # ~1.2941
        ("Legal", 14.0, 8.5),     # ~1.6471
        ("A3", 420.0, 297.0),     # ~1.4141
    )
}

# A row/column counts as "content" once at least this fraction of its pixels
# is ink (0.3%).
_INK_THRESHOLD = 3e-3

# Ink runs shorter than this fraction of the dimension (0.5%) are treated as
# noise and discarded.
_MIN_RUN_FRAC = 5e-3
|
||
|
||
|
||
def detect_page_splits(
    img_bgr: np.ndarray,
) -> list:
    """Split a double-page spread at its spine, if one can be found.

    The decision is based on column **brightness**, not ink density: where
    the two pages meet, the scan shows a strip that is darker than the paper
    on either side of it.

    Only images with ``width >= 1.15 * height`` are examined, and the spine
    is searched for between 30% and 70% of the image width.

    Args:
        img_bgr: Input image in BGR channel order.

    Returns:
        A list of page dicts ``{x, y, width, height, page_index}`` ordered
        left to right, or an empty list when no spine is detected (the image
        is then treated as a single page).
    """
    h, w = img_bgr.shape[:2]

    # Portrait-ish input cannot be a spread.
    if w < h * 1.15:
        return []

    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)

    # Mean brightness per column, box-smoothed with a wide odd window so that
    # individual text lines do not register as valleys.
    col_brightness = np.mean(gray, axis=0).astype(np.float64)
    kern = max(11, w // 50)
    kern |= 1  # force an odd window size
    brightness_smooth = np.convolve(col_brightness, np.ones(kern) / kern, mode="same")

    # The brightest smoothed column stands in for "paper white".
    page_brightness = float(np.max(brightness_smooth))
    if page_brightness < 100:
        return []  # image is too dark overall to reason about paper vs. spine

    # A column counts as spine when it is below 88% of the paper brightness.
    spine_thresh = page_brightness * 0.88

    center_lo = int(w * 0.30)
    center_hi = int(w * 0.70)
    center_brightness = brightness_smooth[center_lo:center_hi]
    darkest_val = float(np.min(center_brightness))

    if darkest_val >= spine_thresh:
        logger.debug("No spine detected: min brightness %.0f >= threshold %.0f",
                     darkest_val, spine_thresh)
        return []

    # Locate every contiguous dark run in the centre window. Padding with
    # False on both sides makes each run produce exactly one rising and one
    # falling flip, so flips alternate start, end, start, end, ...
    is_dark = center_brightness < spine_thresh
    padded = np.concatenate(([False], is_dark, [False]))
    flips = np.flatnonzero(padded[1:] != padded[:-1])
    run_bounds = zip(flips[0::2].tolist(), flips[1::2].tolist())  # plain ints

    # Runs narrower than 1% of the image width are discarded.
    min_spine_px = int(w * 0.01)
    dark_runs = [(lo, hi) for lo, hi in run_bounds if hi - lo >= min_spine_px]

    if not dark_runs:
        logger.debug("No dark runs wider than %dpx in center region", min_spine_px)
        return []

    # Score each run: centred, dark and narrow valleys win.
    target = w * 0.5 - center_lo                 # image centre in window coords
    sigma = (center_hi - center_lo) * 0.15       # gaussian width for centring
    winner = dark_runs[0]
    winner_score = -1.0

    for run in dark_runs:
        lo, hi = run
        run_width = hi - lo

        # Factor 1: gaussian fall-off with distance from the image centre.
        offset = abs((lo + hi) / 2.0 - target)
        center_factor = float(np.exp(-0.5 * (offset / sigma) ** 2))

        # Factor 2: 0.0 at the threshold, 1.0 for a completely black run.
        run_brightness = float(np.mean(center_brightness[lo:hi]))
        darkness_factor = max(0.0, (spine_thresh - run_brightness) / spine_thresh)

        # Factor 3: full bonus up to 5% of the width, linear decay to zero
        # at 15%, nothing beyond that.
        width_frac = run_width / w
        narrowness_bonus = max(0.0, min(1.0, 1.0 - (width_frac - 0.05) / 0.10))

        score = center_factor * darkness_factor * (0.3 + 0.7 * narrowness_bonus)

        logger.debug(
            "Dark run x=%d..%d (w=%d): center_f=%.3f dark_f=%.3f narrow_b=%.3f → score=%.4f",
            center_lo + lo, center_lo + hi, run_width,
            center_factor, darkness_factor, narrowness_bonus, score,
        )

        if score > winner_score:
            winner_score, winner = score, run

    best_start, best_end = winner
    spine_w = best_end - best_start
    spine_x = center_lo + best_start
    right_end = center_lo + best_end
    spine_center = spine_x + spine_w // 2

    logger.debug(
        "Best spine candidate: x=%d..%d (w=%d), score=%.4f",
        spine_x, spine_x + spine_w, spine_w, winner_score,
    )

    # A real spine has paper on BOTH sides: average a band of 10% of the
    # image width on either side of the run.
    band = w // 10
    left_brightness = float(np.mean(brightness_smooth[max(0, spine_x - band):spine_x]))
    right_brightness = float(np.mean(brightness_smooth[right_end:min(w, right_end + band)]))

    if left_brightness < spine_thresh or right_brightness < spine_thresh:
        logger.debug("No bright paper flanking spine: left=%.0f right=%.0f thresh=%.0f",
                     left_brightness, right_brightness, spine_thresh)
        return []

    logger.info(
        "Spine detected: x=%d..%d (w=%d), brightness=%.0f vs paper=%.0f, "
        "left_paper=%.0f, right_paper=%.0f",
        spine_x, right_end, spine_w, darkest_val, page_brightness,
        left_brightness, right_brightness,
    )

    # Cut at the spine centre: consecutive boundaries delimit the pages.
    split_points = [spine_center]
    boundaries = [0, *split_points, w]
    pages: list = [
        {"x": x0, "y": 0, "width": x1 - x0, "height": h, "page_index": idx}
        for idx, (x0, x1) in enumerate(zip(boundaries[:-1], boundaries[1:]))
    ]

    # Drop slivers narrower than 15% of the image; fewer than two remaining
    # pages means there is nothing to split.
    pages = [page for page in pages if page["width"] >= w * 0.15]
    if len(pages) < 2:
        return []

    # Re-number after filtering.
    for idx, page in enumerate(pages):
        page["page_index"] = idx

    logger.info(
        "Page split detected: %d pages, spine_w=%d, split_points=%s",
        len(pages), spine_w, split_points,
    )
    return pages
|
||
|
||
|
||
def detect_and_crop_page(
    img_bgr: np.ndarray,
    margin_frac: float = 0.01,
) -> Tuple[np.ndarray, Dict[str, Any]]:
    """Find the content rectangle of a scanned page and crop to it.

    Steps:
      1. Adaptive threshold -> binary ink mask (ink=255, background=0).
      2. Left and right edges via ``_detect_left_edge_shadow`` /
         ``_detect_right_edge_shadow``.
      3. Top and bottom edges via ``_detect_top_bottom_edges`` on the
         binary mask.
      4. Sanity checks; if they pass, crop with ``margin_frac`` of extra
         space on every side (clamped to the image).

    The image is returned uncropped when every border is below 2% of its
    dimension, or when the crop would keep less than 40% of the image area.

    Args:
        img_bgr: Input image in BGR channel order.
        margin_frac: Extra margin kept around the detected content, as a
            fraction of the image width/height (default 1%).

    Returns:
        Tuple ``(image, result)``. ``image`` is a cropped copy, or the
        input array itself when no crop was applied. ``result`` carries
        ``crop_applied``, ``crop_rect``, ``crop_rect_pct``,
        ``original_size``, ``cropped_size``, ``detected_format``,
        ``format_confidence``, ``aspect_ratio`` and ``border_fractions``.
    """
    h, w = img_bgr.shape[:2]

    result: Dict[str, Any] = {
        "crop_applied": False,
        "crop_rect": None,
        "crop_rect_pct": None,
        "original_size": {"width": w, "height": h},
        "cropped_size": {"width": w, "height": h},
        "detected_format": None,
        "format_confidence": 0.0,
        "aspect_ratio": round(max(h, w) / max(min(h, w), 1), 4),
        "border_fractions": {"top": 0.0, "bottom": 0.0, "left": 0.0, "right": 0.0},
    }

    def _keep_original() -> Tuple[np.ndarray, Dict[str, Any]]:
        # No crop: report the paper format of the untouched image.
        result["detected_format"], result["format_confidence"] = _detect_format(w, h)
        return img_bgr, result

    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)

    # Inverted adaptive threshold: ink becomes 255, background 0.
    binary = cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY_INV, blockSize=51, C=15,
    )

    # Content edges on all four sides.
    left_edge = _detect_left_edge_shadow(gray, binary, w, h)
    right_edge = _detect_right_edge_shadow(gray, binary, w, h)
    top_edge, bottom_edge = _detect_top_bottom_edges(binary, w, h)

    # Width of each border as a fraction of the matching image dimension.
    border_top = top_edge / h
    border_bottom = (h - bottom_edge) / h
    border_left = left_edge / w
    border_right = (w - right_edge) / w
    borders = {
        "top": border_top,
        "bottom": border_bottom,
        "left": border_left,
        "right": border_right,
    }
    result["border_fractions"] = {side: round(frac, 4) for side, frac in borders.items()}

    # Sanity: cropping is pointless unless some border reaches 2%.
    min_border = 0.02
    if max(borders.values()) < min_border:
        logger.info("All borders < %.0f%% — no crop needed", min_border * 100)
        return _keep_original()

    # Grow the content box by the margin, clamped to the image.
    margin_x = int(w * margin_frac)
    margin_y = int(h * margin_frac)
    crop_x = max(0, left_edge - margin_x)
    crop_x2 = min(w, right_edge + margin_x)
    crop_y = max(0, top_edge - margin_y)
    crop_y2 = min(h, bottom_edge + margin_y)
    crop_w = crop_x2 - crop_x
    crop_h = crop_y2 - crop_y

    # Sanity: refuse crops that keep less than 40% of the original area.
    crop_area = crop_w * crop_h
    total_area = h * w
    if crop_area < 0.40 * total_area:
        logger.warning("Cropped area too small (%.0f%%) — skipping crop",
                       100.0 * crop_area / total_area)
        return _keep_original()

    cropped = img_bgr[crop_y:crop_y2, crop_x:crop_x2].copy()
    detected_format, format_confidence = _detect_format(crop_w, crop_h)

    result.update({
        "crop_applied": True,
        "crop_rect": {"x": crop_x, "y": crop_y, "width": crop_w, "height": crop_h},
        "crop_rect_pct": {
            "x": round(100.0 * crop_x / w, 2),
            "y": round(100.0 * crop_y / h, 2),
            "width": round(100.0 * crop_w / w, 2),
            "height": round(100.0 * crop_h / h, 2),
        },
        "cropped_size": {"width": crop_w, "height": crop_h},
        "detected_format": detected_format,
        "format_confidence": format_confidence,
        "aspect_ratio": round(max(crop_w, crop_h) / max(min(crop_w, crop_h), 1), 4),
    })

    logger.info(
        "Page cropped: %dx%d -> %dx%d, format=%s (%.0f%%), "
        "borders: T=%.1f%% B=%.1f%% L=%.1f%% R=%.1f%%",
        w, h, crop_w, crop_h, detected_format, format_confidence * 100,
        border_top * 100, border_bottom * 100,
        border_left * 100, border_right * 100,
    )

    return cropped, result
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Edge detection helpers
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def _detect_spine_shadow(
    gray: np.ndarray,
    search_region: np.ndarray,
    offset_x: int,
    w: int,
    side: str,
) -> Optional[int]:
    """Locate the darkest column of a spine shadow inside an edge strip.

    The column-mean brightness of ``search_region`` is box-smoothed and its
    minimum is taken as the spine centre. The candidate is accepted only if
    all of the following hold:

      1. The smoothed profile spans more than 40 brightness levels.
      2. The minimum is at most 180 (brighter minima are assumed to be text).
      3. Fewer than half of the columns lie in the bottom 20% of the range,
         i.e. the dark area is a narrow valley, not a wide plateau.
      4. Brightness rises by at least 30% of the range on the page side of
         the minimum (to its right for ``'left'``, to its left otherwise).

    Args:
        gray: Full grayscale image; not read by this function.
        search_region: Column slice of the grayscale image to search in.
        offset_x: X offset of ``search_region`` within the full image.
        w: Full image width; determines the smoothing window.
        side: ``'left'`` selects the left-edge rise check, any other value
            the right-edge one. Also used as a log prefix.

    Returns:
        X coordinate (in the full image) of the spine centre, or ``None``
        when no convincing spine shadow is present.
    """
    region_w = search_region.shape[1]
    if region_w < 10:
        return None

    # Box-smooth the per-column mean brightness with an odd window of about
    # 1% of the image width (at least 5 columns).
    kernel_size = max(5, w // 100)
    kernel_size |= 1  # force an odd window size
    col_means = np.mean(search_region, axis=0).astype(np.float64)
    smoothed_raw = np.convolve(col_means, np.ones(kernel_size) / kernel_size, mode="same")

    # "same" convolution zero-pads, which darkens the outermost half-window
    # on both ends; drop those columns before looking for a minimum.
    margin = kernel_size // 2
    if region_w <= 2 * margin + 10:
        return None
    smoothed = smoothed_raw[margin:region_w - margin]
    trimmed_len = len(smoothed)

    val_min = float(np.min(smoothed))
    shadow_range = float(np.max(smoothed)) - val_min
    label = side.capitalize()

    # Check 1: the profile must show a strong gradient.
    if shadow_range <= 40:
        logger.debug(
            "%s edge: no spine (range=%.0f <= 40)", label, shadow_range,
        )
        return None

    # Check 2: the darkest column must be genuinely dark.
    if val_min > 180:
        logger.debug(
            "%s edge: no spine (darkest=%.0f > 180, likely text)", label, val_min,
        )
        return None

    spine_idx = int(np.argmin(smoothed))  # index within the trimmed profile

    # Check 3: count the columns in the bottom 20% of the range; if they
    # cover more than half the profile this is a plateau, not a valley.
    valley_thresh = val_min + shadow_range * 0.20
    valley_width = int(np.count_nonzero(smoothed < valley_thresh))
    max_valley_frac = 0.50
    if valley_width > trimmed_len * max_valley_frac:
        logger.debug(
            "%s edge: no spine (valley too wide: %d/%d = %.0f%%)",
            label, valley_width, trimmed_len,
            100.0 * valley_width / trimmed_len,
        )
        return None

    # Check 4: average a window (20% of the profile, at least 5 columns)
    # starting 5 columns away from the minimum on the page side.
    rise_check_w = max(5, trimmed_len // 5)
    if side == "left":
        win_lo = min(spine_idx + 5, trimmed_len - 1)
        win_hi = min(win_lo + rise_check_w, trimmed_len)
    else:
        win_hi = max(spine_idx - 5, 0)
        win_lo = max(win_hi - rise_check_w, 0)

    if win_hi > win_lo:
        rise = float(np.mean(smoothed[win_lo:win_hi])) - val_min
        if rise < shadow_range * 0.3:
            logger.debug(
                "%s edge: no spine (insufficient rise: %.0f, need %.0f)",
                label, rise, shadow_range * 0.3,
            )
            return None

    # Undo the trim and the region offset to get full-image coordinates.
    spine_x = offset_x + margin + spine_idx

    logger.info(
        "%s edge: spine center at x=%d (brightness=%.0f, range=%.0f, valley=%dpx)",
        label, spine_x, val_min, shadow_range, valley_width,
    )
    return spine_x
|
||
|
||
|
||
def _detect_gutter_continuity(
    gray: np.ndarray,
    search_region: np.ndarray,
    offset_x: int,
    w: int,
    side: str,
) -> Optional[int]:
    """Detect a faint gutter shadow by how consistently dark a column is.

    Intended for shadows that are too weak for ``_detect_spine_shadow``:
    instead of looking at absolute darkness, this measures in how many
    horizontal strips of the image a column is darker than the page.

    Algorithm:
      1. Page brightness = median of the centre 50% (by width) of ``gray``.
         Pages with a median below 180 are rejected.
      2. Split ``search_region`` into horizontal strips (about 60px each, at
         least 10 strips) and take the per-column mean of every strip.
      3. For each column, compute the fraction of strips that are more than
         5 levels darker than the page median, then box-smooth it.
      4. Find the peak of the smoothed fraction (must be >= 0.70) and walk
         from the peak toward the page centre until the fraction drops
         below 0.50; that is the inner edge of the gutter.
      5. Accept only if the gutter band (image edge to inner edge) is
         between 0.5% and 10% of the image width and on average at least
         3 levels darker than the page median.

    Args:
        gray: Full grayscale image (used for the page median).
        search_region: Edge slice of the grayscale image.
        offset_x: X offset of ``search_region`` within the full image.
        w: Full image width.
        side: ``'right'`` for the right edge; any other value is handled as
            the left edge.

    Returns:
        X coordinate (in the full image) of the gutter's inner edge, or
        ``None`` when no gutter is detected.
    """
    region_h, region_w = search_region.shape[:2]
    if region_w < 20 or region_h < 100:
        return None

    # Reference brightness of the page, taken from the middle of the image.
    page_median = float(np.median(gray[:, w // 4:3 * w // 4]))
    if page_median < 180:
        return None  # overall too dark to be a paper page
    dark_thresh = page_median - 5.0

    # Per-strip column means. Rows beyond n_strips * strip_h are ignored.
    n_strips = max(10, region_h // 60)
    strip_h = region_h // n_strips
    strip_means = np.array(
        [
            np.mean(search_region[k * strip_h:(k + 1) * strip_h, :], axis=0)
            for k in range(n_strips)
        ],
        dtype=np.float64,
    )

    # Fraction of strips in which each column is darker than the threshold.
    dark_frac = np.sum(strip_means < dark_thresh, axis=0).astype(np.float64) / n_strips

    # Box-smooth with an odd window of about 1% of the image width.
    smooth_w = max(5, w // 100)
    smooth_w |= 1  # force an odd window size
    frac_smooth = np.convolve(dark_frac, np.ones(smooth_w) / smooth_w, mode="same")

    # Ignore the half-window at each end, where the convolution is padded.
    margin = smooth_w // 2
    if region_w <= 2 * margin + 10:
        return None

    label = side.capitalize()
    is_right = side == "right"

    # The peak marks the gutter centre. It need not sit at the image edge:
    # the dark fraction can dip again between the peak and the edge.
    usable = frac_smooth[margin:region_w - margin]
    peak_frac = float(np.max(usable))
    if peak_frac < 0.70:
        logger.debug(
            "%s gutter: peak dark fraction %.2f < 0.70", label, peak_frac,
        )
        return None
    peak_x = int(np.argmax(usable)) + margin

    # Walk from the peak toward the page centre; the inner edge is the last
    # column before the fraction falls below the transition threshold.
    transition_thresh = 0.50
    if is_right:
        hits = (
            x + 1
            for x in range(peak_x, margin, -1)
            if frac_smooth[x] < transition_thresh
        )
    else:
        hits = (
            x - 1
            for x in range(peak_x, region_w - margin)
            if frac_smooth[x] < transition_thresh
        )
    gutter_inner = next(hits, None)  # local x within search_region
    if gutter_inner is None:
        return None

    # Gutter width is measured from the image edge to the inner edge.
    gutter_width = region_w - gutter_inner if is_right else gutter_inner
    min_gutter = max(3, int(w * 0.005))
    max_gutter = int(w * 0.10)

    if gutter_width < min_gutter:
        logger.debug(
            "%s gutter: too narrow (%dpx < %dpx)", label,
            gutter_width, min_gutter,
        )
        return None

    if gutter_width > max_gutter:
        logger.debug(
            "%s gutter: too wide (%dpx > %dpx)", label,
            gutter_width, max_gutter,
        )
        return None

    # The band must actually be darker than the page on average.
    band = strip_means[:, gutter_inner:] if is_right else strip_means[:, :gutter_inner]
    gutter_brightness = float(np.mean(band))
    brightness_drop = page_median - gutter_brightness
    if brightness_drop < 3:
        logger.debug(
            "%s gutter: insufficient brightness drop (%.1f levels)",
            label, brightness_drop,
        )
        return None

    gutter_x = offset_x + gutter_inner

    logger.info(
        "%s gutter (continuity): x=%d, width=%dpx (%.1f%%), "
        "brightness=%.0f vs page=%.0f (drop=%.0f), frac@edge=%.2f",
        label, gutter_x, gutter_width,
        100.0 * gutter_width / w, gutter_brightness, page_median,
        brightness_drop, float(frac_smooth[gutter_inner]),
    )
    return gutter_x
|
||
|
||
|
||
def _detect_left_edge_shadow(
    gray: np.ndarray,
    binary: np.ndarray,
    w: int,
    h: int,
) -> int:
    """Return the x position of the left content edge.

    Searches the leftmost quarter of the image with a cascade of detectors
    and returns the first hit:

      1. ``_detect_spine_shadow`` (strong shadow gradient).
      2. ``_detect_gutter_continuity`` (subtle top-to-bottom shadow).
      3. ``_detect_edge_projection`` on the binary mask (first ink column),
         which always yields a value.

    Args:
        gray: Full grayscale image.
        binary: Binary ink mask of the same image.
        w: Image width.
        h: Image height; not used here.

    Returns:
        X coordinate of the detected left edge.
    """
    search_w = max(1, w // 4)
    left_strip = gray[:, :search_w]

    # Shadow-based detectors, strongest evidence first.
    for detector in (_detect_spine_shadow, _detect_gutter_continuity):
        found = detector(gray, left_strip, 0, w, "left")
        if found is not None:
            return found

    # Last resort: first column that contains ink.
    return _detect_edge_projection(binary, axis=0, from_start=True, dim=w)
|
||
|
||
|
||
def _detect_right_edge_shadow(
    gray: np.ndarray,
    binary: np.ndarray,
    w: int,
    h: int,
) -> int:
    """Return the x position of the right content edge.

    Searches the rightmost quarter of the image with a cascade of detectors
    and returns the first hit:

      1. ``_detect_spine_shadow`` (strong shadow gradient).
      2. ``_detect_gutter_continuity`` (subtle top-to-bottom shadow).
      3. ``_detect_edge_projection`` on the binary mask (last ink column),
         which always yields a value.

    Args:
        gray: Full grayscale image.
        binary: Binary ink mask of the same image.
        w: Image width.
        h: Image height; not used here.

    Returns:
        X coordinate of the detected right edge.
    """
    search_w = max(1, w // 4)
    right_start = w - search_w
    right_strip = gray[:, right_start:]

    # Shadow-based detectors, strongest evidence first.
    for detector in (_detect_spine_shadow, _detect_gutter_continuity):
        found = detector(gray, right_strip, right_start, w, "right")
        if found is not None:
            return found

    # Last resort: last column that contains ink.
    return _detect_edge_projection(binary, axis=0, from_start=False, dim=w)
|
||
|
||
|
||
def _detect_top_bottom_edges(binary: np.ndarray, w: int, h: int) -> Tuple[int, int]:
    """Return ``(top, bottom)`` content edges from the row-wise ink projection.

    Both values come from ``_detect_edge_projection`` with ``axis=1``: the
    top edge scanning from the start, the bottom edge from the end. ``w`` is
    not used.
    """
    top, bottom = (
        _detect_edge_projection(binary, axis=1, from_start=scan_from_top, dim=h)
        for scan_from_top in (True, False)
    )
    return top, bottom
|
||
|
||
|
||
def _detect_edge_projection(
    binary: np.ndarray,
    axis: int,
    from_start: bool,
    dim: int,
) -> int:
    """Find the content edge along one axis from the ink-density projection.

    ``axis=0`` averages over rows (one density per column) and yields an x
    position; ``axis=1`` averages over columns (one density per row) and
    yields a y position. A position counts as ink when its density reaches
    ``_INK_THRESHOLD``; ink runs shorter than ``_MIN_RUN_FRAC`` of ``dim``
    are discarded as noise.

    Args:
        binary: Binary ink mask (ink=255, background=0).
        axis: Axis to average over (see above).
        from_start: ``True`` for the leading edge (left/top), ``False`` for
            the trailing edge (right/bottom).
        dim: Length of the projected dimension.

    Returns:
        For ``from_start=True``: index of the first ink position, or ``0``
        when there is no ink. For ``from_start=False``: the *exclusive* end
        of the content, i.e. one past the last ink position, or ``dim`` when
        there is no ink.
    """
    # Ink density per row/column in the range 0..1.
    projection = np.mean(binary, axis=axis) / 255.0
    ink_mask = projection >= _INK_THRESHOLD

    # Drop isolated specks before looking for the edge.
    min_run = max(1, int(dim * _MIN_RUN_FRAC))
    ink_positions = np.flatnonzero(_filter_narrow_runs(ink_mask, min_run))

    if ink_positions.size == 0:
        return 0 if from_start else dim

    if from_start:
        return int(ink_positions[0])

    # Callers use the trailing edge as an exclusive bound (slice stop,
    # ``dim - edge`` border width), and the no-ink case above already
    # returns ``dim``. Returning the last ink index itself would cut off the
    # final ink row/column, so return one past it.
    return int(ink_positions[-1]) + 1
|
||
|
||
|
||
def _filter_narrow_runs(mask: np.ndarray, min_run: int) -> np.ndarray:
    """Clear every run of consecutive True values shorter than ``min_run``.

    Args:
        mask: 1-D boolean mask.
        min_run: Minimum run length (in elements) that is kept.

    Returns:
        ``mask`` itself when ``min_run <= 1`` (nothing can be filtered),
        otherwise a filtered copy; the input is never modified.
    """
    if min_run <= 1:
        return mask

    # Pad with False on both sides so every run has exactly one rising and
    # one falling flip; flips then alternate start, end, start, end, ...
    padded = np.concatenate(([False], np.asarray(mask, dtype=bool), [False]))
    flips = np.flatnonzero(padded[1:] != padded[:-1])
    starts, stops = flips[0::2], flips[1::2]  # stops are exclusive

    cleaned = mask.copy()
    for start, stop in zip(starts.tolist(), stops.tolist()):
        if stop - start < min_run:
            cleaned[start:stop] = False
    return cleaned
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Format detection (kept as optional metadata)
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def _detect_format(width: int, height: int) -> Tuple[str, float]:
    """Guess the paper format whose aspect ratio is closest to the given size.

    The aspect ratio is computed as long side / short side, so orientation
    does not matter. Confidence is ``1 - 5 * |aspect - expected|`` (floored
    at 0); a match with confidence below 0.3 is reported as unknown.

    Args:
        width: Width in pixels.
        height: Height in pixels.

    Returns:
        ``(format_name, confidence)`` with the confidence rounded to three
        decimals, or ``("unknown", 0.0)`` for non-positive dimensions or a
        poor match.
    """
    short_side, long_side = min(width, height), max(width, height)
    if short_side <= 0:
        return "unknown", 0.0

    aspect = long_side / short_side

    # Closest known format; on a tie the earliest entry of PAPER_FORMATS wins.
    best_format, best_diff = min(
        ((fmt, abs(aspect - ratio)) for fmt, ratio in PAPER_FORMATS.items()),
        key=lambda candidate: candidate[1],
        default=("unknown", float("inf")),
    )

    confidence = max(0.0, 1.0 - best_diff * 5.0)
    if confidence < 0.3:
        return "unknown", 0.0
    return best_format, round(confidence, 3)
|