fix(ocr-pipeline): improve page crop spine detection and cell assignment
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 25s
CI / test-python-klausur (push) Failing after 1m54s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 17s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 25s
CI / test-python-klausur (push) Failing after 1m54s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 17s
1. page_crop: Score all dark runs by center-proximity × darkness × narrowness instead of picking the widest. Fixes ad810209 where a wide dark area at 35% was chosen over the actual spine at 50%.
2. cv_words_first: Replace x-center-only word→column assignment with overlap-based three-pass strategy (overlap → midpoint-range → nearest). Fixes truncated German translations like "Schal" instead of "Schal - die Schals" in session 079cd0d9.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -15,6 +15,7 @@ import pytest
|
||||
|
||||
from page_crop import (
|
||||
detect_and_crop_page,
|
||||
detect_page_splits,
|
||||
_detect_format,
|
||||
_detect_edge_projection,
|
||||
_detect_left_edge_shadow,
|
||||
@@ -465,3 +466,101 @@ class TestCropDeterminism:
|
||||
assert np.array_equal(ref_crop, crop), (
|
||||
f"Run {i} produced different pixel output"
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tests: detect_page_splits — spine scoring logic
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _make_book_spread(h: int = 1616, w: int = 2288) -> np.ndarray:
    """Build a synthetic two-page landscape scan with two dark vertical bands.

    The image reproduces the layout behind the ad810209 failure:

    - bright paper (value 230) everywhere by default,
    - a narrow, dark valley centred on ``w // 2`` standing in for the spine,
    - a wider, slightly lighter valley centred on ``int(w * 0.35)`` standing
      in for an off-centre dark region such as a text column.

    Every column is constant over its height and over the three channels.

    Args:
        h: Image height in pixels.
        w: Image width in pixels.

    Returns:
        A ``(h, w, 3)`` ``uint8`` array.
    """
    paper = 230
    canvas = np.full((h, w, 3), paper, dtype=np.uint8)

    def valley(offset: int, reach: int, floor: int) -> int:
        # Linear ramp: ``floor`` at the band centre, ``paper`` at its edges.
        ramp = min(abs(offset) / reach, 1.0)
        return int(floor + (paper - floor) * ramp)

    # Spine shadow: 61 columns around the horizontal centre, darkest at 130.
    mid = w // 2
    spine_reach = 30
    for offset in range(-spine_reach, spine_reach + 1):
        canvas[:, mid + offset] = valley(offset, spine_reach, 130)

    # Off-centre band at about 35 % of the width: 161 columns, darkest at 140.
    band_mid = int(w * 0.35)
    band_reach = 80
    for offset in range(-band_reach, band_reach + 1):
        col = band_mid + offset
        # Keep whichever value is darker so an overlapping spine survives.
        canvas[:, col] = min(canvas[0, col, 0], valley(offset, band_reach, 140))

    return canvas
|
||||
|
||||
|
||||
class TestDetectPageSplits:
    """Tests for the spine-scoring logic of ``detect_page_splits``.

    The structural tests (page width, indices, coverage) first assert that a
    split was actually found on the synthetic spread. Without that assertion
    they would pass without checking anything whenever detection returned no
    pages, hiding exactly the regression they are meant to catch.
    """

    def test_portrait_image_returns_empty(self):
        """Portrait images (width < height * 1.15) should not be split."""
        img = np.full((1000, 800, 3), 200, dtype=np.uint8)
        assert detect_page_splits(img) == []

    def test_uniform_image_returns_empty(self):
        """Uniform brightness image should not detect any spine."""
        img = np.full((800, 1600, 3), 220, dtype=np.uint8)
        assert detect_page_splits(img) == []

    def test_prefers_centered_spine_over_wider_offcenter_dark(self):
        """Scoring should pick the centered narrow spine over a wider off-center dark area.

        This is the regression test for session ad810209 where the old algorithm
        picked the dark area at ~35% of the width instead of the spine at 50%.
        """
        img = _make_book_spread(h=1616, w=2288)
        pages = detect_page_splits(img)

        assert len(pages) == 2, f"Expected 2 pages, got {len(pages)}"

        # The first page is expected to start at x=0 (checked in
        # test_pages_cover_full_width), so its width is the split position.
        split_x = pages[0]["width"]
        # Derive the center from the image instead of repeating the width.
        center = img.shape[1] / 2

        assert abs(split_x - center) < 100, (
            f"Split at x={split_x}, expected near center {center:.0f}. "
            f"Old bug would have split at ~799."
        )

    def test_split_produces_two_reasonable_pages(self):
        """Both pages should be at least 15% of total width."""
        img = _make_book_spread()
        pages = detect_page_splits(img)

        # Fail loudly instead of skipping the checks when no split is found.
        assert len(pages) == 2, f"Expected 2 pages, got {len(pages)}"

        w = img.shape[1]
        for p in pages:
            assert p["width"] >= w * 0.15, (
                f"Page {p['page_index']} too narrow: {p['width']}px "
                f"(< {w * 0.15:.0f}px)"
            )

    def test_page_indices_sequential(self):
        """Page indices should be 0, 1, ..."""
        img = _make_book_spread()
        pages = detect_page_splits(img)

        # An empty result would make the index comparison trivially true.
        assert pages, "Expected the synthetic spread to be split into pages"

        indices = [p["page_index"] for p in pages]
        assert indices == list(range(len(pages)))

    def test_pages_cover_full_width(self):
        """Pages should cover the full image width without gaps or overlaps."""
        img = _make_book_spread()
        pages = detect_page_splits(img)

        # Fail loudly instead of skipping the checks when no split is found.
        assert len(pages) >= 2, f"Expected at least 2 pages, got {len(pages)}"

        w = img.shape[1]
        assert pages[0]["x"] == 0
        total_w = sum(p["width"] for p in pages)
        assert total_w == w, f"Total page width {total_w} != image width {w}"
|
||||
|
||||
Reference in New Issue
Block a user