fix(ocr-pipeline): improve page crop spine detection and cell assignment
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 25s
CI / test-python-klausur (push) Failing after 1m54s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 17s

1. page_crop: Score all dark runs by center-proximity × darkness ×
   narrowness instead of picking the widest. Fixes ad810209 where a
   wide dark area at 35% was chosen over the actual spine at 50%.

2. cv_words_first: Replace x-center-only word→column assignment with
   overlap-based three-pass strategy (overlap → midpoint-range → nearest).
   Fixes truncated German translations like "Schal" instead of
   "Schal - die Schals" in session 079cd0d9.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-24 09:23:30 +01:00
parent 9d34c5201e
commit 2a21127f01
3 changed files with 193 additions and 15 deletions

View File

@@ -15,6 +15,7 @@ import pytest
from page_crop import (
detect_and_crop_page,
detect_page_splits,
_detect_format,
_detect_edge_projection,
_detect_left_edge_shadow,
@@ -465,3 +466,101 @@ class TestCropDeterminism:
assert np.array_equal(ref_crop, crop), (
f"Run {i} produced different pixel output"
)
# ---------------------------------------------------------------------------
# Tests: detect_page_splits — spine scoring logic
# ---------------------------------------------------------------------------
def _make_book_spread(h: int = 1616, w: int = 2288) -> np.ndarray:
    """Create a synthetic landscape book spread (two pages side by side).

    Simulates the ad810209 failure case:
    - A narrow spine shadow near the center (~50% of width)
    - A wider dark area off-center (~35% of width), simulating a text column
    - Bright paper flanking the spine on both sides

    Args:
        h: Image height in pixels.
        w: Image width in pixels.

    Returns:
        A ``(h, w, 3)`` ``uint8`` array. Every row is identical and all three
        channels carry the same value.

    Columns of a dark valley that would fall outside ``[0, w)`` are skipped,
    so small widths neither wrap around to the opposite edge (negative
    indices) nor raise ``IndexError``.
    """
    paper = 230
    img = np.full((h, w, 3), paper, dtype=np.uint8)

    def _paint_valley(center: int, half_w: int, floor: int) -> None:
        """Darken a V-shaped valley: ``floor`` at ``center``, paper at the rim."""
        # Clamp the painted span to the image so out-of-range columns are
        # ignored instead of wrapping or overflowing.
        lo = max(center - half_w, 0)
        hi = min(center + half_w + 1, w)
        if lo >= hi:
            return
        dist = np.abs(np.arange(lo, hi) - center)
        # Linear ramp from `floor` (dist == 0) up to `paper` (dist == half_w);
        # the cast truncates toward zero exactly like int() on a float.
        profile = (
            floor + (paper - floor) * np.minimum(dist / half_w, 1.0)
        ).astype(np.uint8)
        # Keep the darker value so overlapping valleys never brighten
        # each other (e.g. the off-center area must not overwrite the spine).
        img[:, lo:hi] = np.minimum(
            img[:, lo:hi], profile[np.newaxis, :, np.newaxis]
        )

    # Spine shadow: ~60px wide valley centered at w/2, dipping 230 -> 130.
    _paint_valley(center=w // 2, half_w=30, floor=130)
    # Off-center dark area at 35% of width (x=800 for the default width):
    # ~160px wide, i.e. wider than the spine, but slightly less dark (140).
    _paint_valley(center=int(w * 0.35), half_w=80, floor=140)
    return img
class TestDetectPageSplits:
    """Tests for ``detect_page_splits`` on synthetic images.

    The book-spread tests all use ``_make_book_spread`` with its default
    size, for which the regression test below requires exactly two pages.
    The structural tests therefore assert that a split was found instead of
    silently passing when none is detected.
    """

    def test_portrait_image_returns_empty(self):
        """Portrait images (width < height * 1.15) should not be split."""
        img = np.full((1000, 800, 3), 200, dtype=np.uint8)
        assert detect_page_splits(img) == []

    def test_uniform_image_returns_empty(self):
        """Uniform brightness image should not detect any spine."""
        img = np.full((800, 1600, 3), 220, dtype=np.uint8)
        assert detect_page_splits(img) == []

    def test_prefers_centered_spine_over_wider_offcenter_dark(self):
        """Scoring should pick the centered narrow spine over a wider off-center dark area.

        This is the regression test for session ad810209 where the old algorithm
        picked x=799 (35%) instead of x=1144 (50%).
        """
        img = _make_book_spread(h=1616, w=2288)
        pages = detect_page_splits(img)
        assert len(pages) == 2, f"Expected 2 pages, got {len(pages)}"
        # The first page starts at x=0, so its width is the split point.
        split_x = pages[0]["width"]
        center = img.shape[1] / 2  # 1144 for the 2288px-wide spread
        assert abs(split_x - center) < 100, (
            f"Split at x={split_x}, expected near center {center:.0f}. "
            f"Old bug would have split at ~799."
        )

    def test_split_produces_two_reasonable_pages(self):
        """Both pages should be at least 15% of total width."""
        img = _make_book_spread()
        pages = detect_page_splits(img)
        # Fail loudly instead of skipping the width checks when no split is found.
        assert len(pages) == 2, f"Expected 2 pages, got {len(pages)}"
        w = img.shape[1]
        for p in pages:
            assert p["width"] >= w * 0.15, (
                f"Page {p['page_index']} too narrow: {p['width']}px "
                f"(< {w * 0.15:.0f}px)"
            )

    def test_page_indices_sequential(self):
        """Page indices should be 0, 1, ..."""
        img = _make_book_spread()
        pages = detect_page_splits(img)
        # An empty result would make the index check vacuous.
        assert pages, "Expected the synthetic book spread to be split"
        indices = [p["page_index"] for p in pages]
        assert indices == list(range(len(pages)))

    def test_pages_cover_full_width(self):
        """Pages should cover the full image width without gaps or overlaps."""
        img = _make_book_spread()
        pages = detect_page_splits(img)
        assert len(pages) >= 2, f"Expected at least 2 pages, got {len(pages)}"
        w = img.shape[1]
        assert pages[0]["x"] == 0
        # Each page must start exactly where the previous one ended; this
        # assumes pages are returned in left-to-right order.
        expected_x = 0
        for p in pages:
            assert p["x"] == expected_x, (
                f"Page {p['page_index']} starts at x={p['x']}, "
                f"expected {expected_x} (gap or overlap)"
            )
            expected_x += p["width"]
        total_w = sum(p["width"] for p in pages)
        assert total_w == w, f"Total page width {total_w} != image width {w}"