fix: detect spine by brightness, not ink density
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 1m51s
CI / test-python-agent-core (push) Successful in 18s
CI / test-nodejs-website (push) Successful in 19s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 1m51s
CI / test-python-agent-core (push) Successful in 18s
CI / test-nodejs-website (push) Successful in 19s
The previous algorithm used a binary ink projection and produced false splits at ordinary gaps between text columns. On a scanner, the spine of an open book shows up as a characteristic dark gray strip (scanner bed) flanked by bright white paper on both sides. New approach: compute the column-mean brightness with heavy smoothing and look for a dark valley (< 88% of paper brightness) in the center region (30-70% of the image width) that has bright paper on both sides. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -34,99 +34,99 @@ _MIN_RUN_FRAC = 0.005 # 0.5%
|
|||||||
|
|
||||||
def detect_page_splits(
|
def detect_page_splits(
|
||||||
img_bgr: np.ndarray,
|
img_bgr: np.ndarray,
|
||||||
min_gap_frac: float = 0.008,
|
|
||||||
) -> list:
|
) -> list:
|
||||||
"""Detect if the image is a multi-page spread and return split rectangles.
|
"""Detect if the image is a multi-page spread and return split rectangles.
|
||||||
|
|
||||||
Checks for wide vertical gaps (spine area) that indicate the image
|
Uses **brightness** (not ink density) to find the spine area:
|
||||||
contains multiple pages side by side (e.g. book on scanner).
|
the scanner bed produces a characteristic gray strip where pages meet,
|
||||||
|
which is darker than the white paper on either side.
|
||||||
|
|
||||||
Returns a list of page dicts ``{x, y, width, height, page_index}``
|
Returns a list of page dicts ``{x, y, width, height, page_index}``
|
||||||
or an empty list if only one page is detected.
|
or an empty list if only one page is detected.
|
||||||
"""
|
"""
|
||||||
h, w = img_bgr.shape[:2]
|
h, w = img_bgr.shape[:2]
|
||||||
|
|
||||||
# Only check landscape-ish images (width > height * 0.85)
|
# Only check landscape-ish images (width > height * 1.15)
|
||||||
if w < h * 1.15:
|
if w < h * 1.15:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
|
gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
|
||||||
binary = cv2.adaptiveThreshold(
|
|
||||||
gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
|
|
||||||
cv2.THRESH_BINARY_INV, blockSize=51, C=15,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Vertical projection: mean ink density per column
|
# Column-mean brightness (0-255) — the spine is darker (gray scanner bed)
|
||||||
v_proj = np.mean(binary, axis=0) / 255.0
|
col_brightness = np.mean(gray, axis=0).astype(np.float64)
|
||||||
|
|
||||||
# Smooth with boxcar (width = 0.5% of image width, min 5)
|
# Heavy smoothing to ignore individual text lines
|
||||||
kern = max(5, w // 200)
|
kern = max(11, w // 50)
|
||||||
if kern % 2 == 0:
|
if kern % 2 == 0:
|
||||||
kern += 1
|
kern += 1
|
||||||
v_smooth = np.convolve(v_proj, np.ones(kern) / kern, mode="same")
|
brightness_smooth = np.convolve(col_brightness, np.ones(kern) / kern, mode="same")
|
||||||
|
|
||||||
peak = float(np.max(v_smooth))
|
# Page paper is bright (typically > 200), spine/scanner bed is darker
|
||||||
if peak < 0.005:
|
page_brightness = float(np.max(brightness_smooth))
|
||||||
|
if page_brightness < 100:
|
||||||
|
return [] # Very dark image, skip
|
||||||
|
|
||||||
|
# Spine threshold: significantly darker than the page
|
||||||
|
# Spine is typically 60-80% of paper brightness
|
||||||
|
spine_thresh = page_brightness * 0.88
|
||||||
|
|
||||||
|
# Search in center region (30-70% of width)
|
||||||
|
center_lo = int(w * 0.30)
|
||||||
|
center_hi = int(w * 0.70)
|
||||||
|
|
||||||
|
# Find the darkest valley in the center region
|
||||||
|
center_brightness = brightness_smooth[center_lo:center_hi]
|
||||||
|
darkest_val = float(np.min(center_brightness))
|
||||||
|
|
||||||
|
if darkest_val >= spine_thresh:
|
||||||
|
logger.debug("No spine detected: min brightness %.0f >= threshold %.0f",
|
||||||
|
darkest_val, spine_thresh)
|
||||||
return []
|
return []
|
||||||
|
|
||||||
# Look for valleys in center region (25-75% of width)
|
# Find the contiguous dark region (spine area)
|
||||||
gap_thresh = peak * 0.15 # valley must be < 15% of peak density
|
is_dark = center_brightness < spine_thresh
|
||||||
center_lo = int(w * 0.25)
|
# Find the widest dark run
|
||||||
center_hi = int(w * 0.75)
|
best_start, best_end = 0, 0
|
||||||
min_gap_px = max(5, int(w * min_gap_frac))
|
run_start = -1
|
||||||
|
for i in range(len(is_dark)):
|
||||||
# Find contiguous gap runs in the center region
|
if is_dark[i]:
|
||||||
gaps: list = []
|
if run_start < 0:
|
||||||
in_gap = False
|
run_start = i
|
||||||
gap_start = 0
|
|
||||||
for x in range(center_lo, center_hi):
|
|
||||||
if v_smooth[x] < gap_thresh:
|
|
||||||
if not in_gap:
|
|
||||||
gap_start = x
|
|
||||||
in_gap = True
|
|
||||||
else:
|
else:
|
||||||
if in_gap:
|
if run_start >= 0:
|
||||||
gap_w = x - gap_start
|
if i - run_start > best_end - best_start:
|
||||||
if gap_w >= min_gap_px:
|
best_start, best_end = run_start, i
|
||||||
gaps.append({"x": gap_start, "width": gap_w,
|
run_start = -1
|
||||||
"center": gap_start + gap_w // 2})
|
if run_start >= 0 and len(is_dark) - run_start > best_end - best_start:
|
||||||
in_gap = False
|
best_start, best_end = run_start, len(is_dark)
|
||||||
if in_gap:
|
|
||||||
gap_w = center_hi - gap_start
|
|
||||||
if gap_w >= min_gap_px:
|
|
||||||
gaps.append({"x": gap_start, "width": gap_w,
|
|
||||||
"center": gap_start + gap_w // 2})
|
|
||||||
|
|
||||||
if not gaps:
|
spine_w = best_end - best_start
|
||||||
|
if spine_w < w * 0.01:
|
||||||
|
logger.debug("Spine too narrow: %dpx (< %dpx)", spine_w, int(w * 0.01))
|
||||||
return []
|
return []
|
||||||
|
|
||||||
# Merge nearby gaps (< 5% of width apart) — the spine area may have
|
spine_x = center_lo + best_start
|
||||||
# thin ink strips between multiple gap segments
|
spine_center = spine_x + spine_w // 2
|
||||||
merge_dist = max(20, int(w * 0.05))
|
|
||||||
merged: list = [gaps[0]]
|
|
||||||
for g in gaps[1:]:
|
|
||||||
prev = merged[-1]
|
|
||||||
prev_end = prev["x"] + prev["width"]
|
|
||||||
if g["x"] - prev_end < merge_dist:
|
|
||||||
# Merge: extend previous gap to cover both
|
|
||||||
new_end = g["x"] + g["width"]
|
|
||||||
prev["width"] = new_end - prev["x"]
|
|
||||||
prev["center"] = prev["x"] + prev["width"] // 2
|
|
||||||
else:
|
|
||||||
merged.append(g)
|
|
||||||
gaps = merged
|
|
||||||
|
|
||||||
# Sort gaps by width (largest = most likely spine)
|
# Verify: must have bright (paper) content on BOTH sides
|
||||||
gaps.sort(key=lambda g: g["width"], reverse=True)
|
left_brightness = float(np.mean(brightness_smooth[max(0, spine_x - w // 10):spine_x]))
|
||||||
|
right_end = center_lo + best_end
|
||||||
|
right_brightness = float(np.mean(brightness_smooth[right_end:min(w, right_end + w // 10)]))
|
||||||
|
|
||||||
# Use only gaps that are significant (>= 2% of image width)
|
if left_brightness < spine_thresh or right_brightness < spine_thresh:
|
||||||
significant_gaps = [g for g in gaps if g["width"] >= w * 0.02]
|
logger.debug("No bright paper flanking spine: left=%.0f right=%.0f thresh=%.0f",
|
||||||
if not significant_gaps:
|
left_brightness, right_brightness, spine_thresh)
|
||||||
# Fall back to widest gap
|
return []
|
||||||
significant_gaps = [gaps[0]]
|
|
||||||
|
|
||||||
# Use the significant gap(s) as split points
|
logger.info(
|
||||||
split_points = sorted(g["center"] for g in significant_gaps[:3])
|
"Spine detected: x=%d..%d (w=%d), brightness=%.0f vs paper=%.0f, "
|
||||||
|
"left_paper=%.0f, right_paper=%.0f",
|
||||||
|
spine_x, right_end, spine_w, darkest_val, page_brightness,
|
||||||
|
left_brightness, right_brightness,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Split at the spine center
|
||||||
|
split_points = [spine_center]
|
||||||
|
|
||||||
# Build page rectangles
|
# Build page rectangles
|
||||||
pages: list = []
|
pages: list = []
|
||||||
|
|||||||
Reference in New Issue
Block a user