fix: detect spine by brightness, not ink density
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 1m51s
CI / test-python-agent-core (push) Successful in 18s
CI / test-nodejs-website (push) Successful in 19s
The previous algorithm used binary ink projection and found false splits at normal text column gaps. The spine of a book on a scanner has a characteristic DARK gray strip (scanner bed) flanked by bright white paper on both sides. New approach: column-mean brightness with heavy smoothing, looking for a dark valley (< 88% of paper brightness) in the center region that has bright paper on both sides. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -34,99 +34,99 @@ _MIN_RUN_FRAC = 0.005 # 0.5%
|
||||
|
||||
def detect_page_splits(
|
||||
img_bgr: np.ndarray,
|
||||
min_gap_frac: float = 0.008,
|
||||
) -> list:
|
||||
"""Detect if the image is a multi-page spread and return split rectangles.
|
||||
|
||||
Checks for wide vertical gaps (spine area) that indicate the image
|
||||
contains multiple pages side by side (e.g. book on scanner).
|
||||
Uses **brightness** (not ink density) to find the spine area:
|
||||
the scanner bed produces a characteristic gray strip where pages meet,
|
||||
which is darker than the white paper on either side.
|
||||
|
||||
Returns a list of page dicts ``{x, y, width, height, page_index}``
|
||||
or an empty list if only one page is detected.
|
||||
"""
|
||||
h, w = img_bgr.shape[:2]
|
||||
|
||||
# Only check landscape-ish images (width > height * 0.85)
|
||||
# Only check landscape-ish images (width > height * 1.15)
|
||||
if w < h * 1.15:
|
||||
return []
|
||||
|
||||
gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
|
||||
binary = cv2.adaptiveThreshold(
|
||||
gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
|
||||
cv2.THRESH_BINARY_INV, blockSize=51, C=15,
|
||||
)
|
||||
|
||||
# Vertical projection: mean ink density per column
|
||||
v_proj = np.mean(binary, axis=0) / 255.0
|
||||
# Column-mean brightness (0-255) — the spine is darker (gray scanner bed)
|
||||
col_brightness = np.mean(gray, axis=0).astype(np.float64)
|
||||
|
||||
# Smooth with boxcar (width = 0.5% of image width, min 5)
|
||||
kern = max(5, w // 200)
|
||||
# Heavy smoothing to ignore individual text lines
|
||||
kern = max(11, w // 50)
|
||||
if kern % 2 == 0:
|
||||
kern += 1
|
||||
v_smooth = np.convolve(v_proj, np.ones(kern) / kern, mode="same")
|
||||
brightness_smooth = np.convolve(col_brightness, np.ones(kern) / kern, mode="same")
|
||||
|
||||
peak = float(np.max(v_smooth))
|
||||
if peak < 0.005:
|
||||
# Page paper is bright (typically > 200), spine/scanner bed is darker
|
||||
page_brightness = float(np.max(brightness_smooth))
|
||||
if page_brightness < 100:
|
||||
return [] # Very dark image, skip
|
||||
|
||||
# Spine threshold: significantly darker than the page
|
||||
# Spine is typically 60-80% of paper brightness
|
||||
spine_thresh = page_brightness * 0.88
|
||||
|
||||
# Search in center region (30-70% of width)
|
||||
center_lo = int(w * 0.30)
|
||||
center_hi = int(w * 0.70)
|
||||
|
||||
# Find the darkest valley in the center region
|
||||
center_brightness = brightness_smooth[center_lo:center_hi]
|
||||
darkest_val = float(np.min(center_brightness))
|
||||
|
||||
if darkest_val >= spine_thresh:
|
||||
logger.debug("No spine detected: min brightness %.0f >= threshold %.0f",
|
||||
darkest_val, spine_thresh)
|
||||
return []
|
||||
|
||||
# Look for valleys in center region (25-75% of width)
|
||||
gap_thresh = peak * 0.15 # valley must be < 15% of peak density
|
||||
center_lo = int(w * 0.25)
|
||||
center_hi = int(w * 0.75)
|
||||
min_gap_px = max(5, int(w * min_gap_frac))
|
||||
|
||||
# Find contiguous gap runs in the center region
|
||||
gaps: list = []
|
||||
in_gap = False
|
||||
gap_start = 0
|
||||
for x in range(center_lo, center_hi):
|
||||
if v_smooth[x] < gap_thresh:
|
||||
if not in_gap:
|
||||
gap_start = x
|
||||
in_gap = True
|
||||
# Find the contiguous dark region (spine area)
|
||||
is_dark = center_brightness < spine_thresh
|
||||
# Find the widest dark run
|
||||
best_start, best_end = 0, 0
|
||||
run_start = -1
|
||||
for i in range(len(is_dark)):
|
||||
if is_dark[i]:
|
||||
if run_start < 0:
|
||||
run_start = i
|
||||
else:
|
||||
if in_gap:
|
||||
gap_w = x - gap_start
|
||||
if gap_w >= min_gap_px:
|
||||
gaps.append({"x": gap_start, "width": gap_w,
|
||||
"center": gap_start + gap_w // 2})
|
||||
in_gap = False
|
||||
if in_gap:
|
||||
gap_w = center_hi - gap_start
|
||||
if gap_w >= min_gap_px:
|
||||
gaps.append({"x": gap_start, "width": gap_w,
|
||||
"center": gap_start + gap_w // 2})
|
||||
if run_start >= 0:
|
||||
if i - run_start > best_end - best_start:
|
||||
best_start, best_end = run_start, i
|
||||
run_start = -1
|
||||
if run_start >= 0 and len(is_dark) - run_start > best_end - best_start:
|
||||
best_start, best_end = run_start, len(is_dark)
|
||||
|
||||
if not gaps:
|
||||
spine_w = best_end - best_start
|
||||
if spine_w < w * 0.01:
|
||||
logger.debug("Spine too narrow: %dpx (< %dpx)", spine_w, int(w * 0.01))
|
||||
return []
|
||||
|
||||
# Merge nearby gaps (< 5% of width apart) — the spine area may have
|
||||
# thin ink strips between multiple gap segments
|
||||
merge_dist = max(20, int(w * 0.05))
|
||||
merged: list = [gaps[0]]
|
||||
for g in gaps[1:]:
|
||||
prev = merged[-1]
|
||||
prev_end = prev["x"] + prev["width"]
|
||||
if g["x"] - prev_end < merge_dist:
|
||||
# Merge: extend previous gap to cover both
|
||||
new_end = g["x"] + g["width"]
|
||||
prev["width"] = new_end - prev["x"]
|
||||
prev["center"] = prev["x"] + prev["width"] // 2
|
||||
else:
|
||||
merged.append(g)
|
||||
gaps = merged
|
||||
spine_x = center_lo + best_start
|
||||
spine_center = spine_x + spine_w // 2
|
||||
|
||||
# Sort gaps by width (largest = most likely spine)
|
||||
gaps.sort(key=lambda g: g["width"], reverse=True)
|
||||
# Verify: must have bright (paper) content on BOTH sides
|
||||
left_brightness = float(np.mean(brightness_smooth[max(0, spine_x - w // 10):spine_x]))
|
||||
right_end = center_lo + best_end
|
||||
right_brightness = float(np.mean(brightness_smooth[right_end:min(w, right_end + w // 10)]))
|
||||
|
||||
# Use only gaps that are significant (>= 2% of image width)
|
||||
significant_gaps = [g for g in gaps if g["width"] >= w * 0.02]
|
||||
if not significant_gaps:
|
||||
# Fall back to widest gap
|
||||
significant_gaps = [gaps[0]]
|
||||
if left_brightness < spine_thresh or right_brightness < spine_thresh:
|
||||
logger.debug("No bright paper flanking spine: left=%.0f right=%.0f thresh=%.0f",
|
||||
left_brightness, right_brightness, spine_thresh)
|
||||
return []
|
||||
|
||||
# Use the significant gap(s) as split points
|
||||
split_points = sorted(g["center"] for g in significant_gaps[:3])
|
||||
logger.info(
|
||||
"Spine detected: x=%d..%d (w=%d), brightness=%.0f vs paper=%.0f, "
|
||||
"left_paper=%.0f, right_paper=%.0f",
|
||||
spine_x, right_end, spine_w, darkest_val, page_brightness,
|
||||
left_brightness, right_brightness,
|
||||
)
|
||||
|
||||
# Split at the spine center
|
||||
split_points = [spine_center]
|
||||
|
||||
# Build page rectangles
|
||||
pages: list = []
|
||||
|
||||
Reference in New Issue
Block a user