fix(ocr-pipeline): improve page crop spine detection and cell assignment
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 25s
CI / test-python-klausur (push) Failing after 1m54s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 17s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 25s
CI / test-python-klausur (push) Failing after 1m54s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 17s
1. page_crop: Score all dark runs by center-proximity × darkness × narrowness instead of picking the widest. Fixes ad810209 where a wide dark area at 35% was chosen over the actual spine at 50%. 2. cv_words_first: Replace x-center-only word→column assignment with overlap-based three-pass strategy (overlap → midpoint-range → nearest). Fixes truncated German translations like "Schal" instead of "Schal - die Schals" in session 079cd0d9. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -124,13 +124,43 @@ def _cluster_rows(
|
|||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
def _assign_word_to_column(word: Dict, columns: List[Dict]) -> int:
|
def _assign_word_to_column(word: Dict, columns: List[Dict]) -> int:
|
||||||
"""Return column index for a word based on its X-center."""
|
"""Return column index for a word based on overlap, then center, then nearest.
|
||||||
x_center = word['left'] + word['width'] / 2
|
|
||||||
|
Three-pass strategy (consistent with _assign_row_words_to_columns):
|
||||||
|
1. Overlap-based: assign to column with maximum horizontal overlap.
|
||||||
|
2. Midpoint-range: if no overlap, use midpoints between adjacent columns.
|
||||||
|
3. Nearest center: last resort fallback.
|
||||||
|
"""
|
||||||
|
w_left = word['left']
|
||||||
|
w_right = w_left + word['width']
|
||||||
|
w_center = w_left + word['width'] / 2
|
||||||
|
|
||||||
|
# Pass 1: overlap-based
|
||||||
|
best_col = -1
|
||||||
|
best_overlap = 0
|
||||||
for col in columns:
|
for col in columns:
|
||||||
if col['x_min'] <= x_center < col['x_max']:
|
overlap = max(0, min(w_right, col['x_max']) - max(w_left, col['x_min']))
|
||||||
|
if overlap > best_overlap:
|
||||||
|
best_overlap = overlap
|
||||||
|
best_col = col['index']
|
||||||
|
if best_col >= 0 and best_overlap > 0:
|
||||||
|
return best_col
|
||||||
|
|
||||||
|
# Pass 2: midpoint-range (non-overlapping assignment zones)
|
||||||
|
for ci, col in enumerate(columns):
|
||||||
|
if ci == 0:
|
||||||
|
assign_left = 0
|
||||||
|
else:
|
||||||
|
assign_left = (columns[ci - 1]['x_max'] + col['x_min']) / 2
|
||||||
|
if ci == len(columns) - 1:
|
||||||
|
assign_right = float('inf')
|
||||||
|
else:
|
||||||
|
assign_right = (col['x_max'] + columns[ci + 1]['x_min']) / 2
|
||||||
|
if assign_left <= w_center < assign_right:
|
||||||
return col['index']
|
return col['index']
|
||||||
# Fallback: nearest column
|
|
||||||
return min(columns, key=lambda c: abs((c['x_min'] + c['x_max']) / 2 - x_center))['index']
|
# Pass 3: nearest column center
|
||||||
|
return min(columns, key=lambda c: abs((c['x_min'] + c['x_max']) / 2 - w_center))['index']
|
||||||
|
|
||||||
|
|
||||||
def _assign_word_to_row(word: Dict, rows: List[Dict]) -> int:
|
def _assign_word_to_row(word: Dict, rows: List[Dict]) -> int:
|
||||||
|
|||||||
@@ -83,10 +83,9 @@ def detect_page_splits(
|
|||||||
darkest_val, spine_thresh)
|
darkest_val, spine_thresh)
|
||||||
return []
|
return []
|
||||||
|
|
||||||
# Find the contiguous dark region (spine area)
|
# Find ALL contiguous dark runs in the center region
|
||||||
is_dark = center_brightness < spine_thresh
|
is_dark = center_brightness < spine_thresh
|
||||||
# Find the widest dark run
|
dark_runs: list = [] # list of (start, end) pairs
|
||||||
best_start, best_end = 0, 0
|
|
||||||
run_start = -1
|
run_start = -1
|
||||||
for i in range(len(is_dark)):
|
for i in range(len(is_dark)):
|
||||||
if is_dark[i]:
|
if is_dark[i]:
|
||||||
@@ -94,20 +93,70 @@ def detect_page_splits(
|
|||||||
run_start = i
|
run_start = i
|
||||||
else:
|
else:
|
||||||
if run_start >= 0:
|
if run_start >= 0:
|
||||||
if i - run_start > best_end - best_start:
|
dark_runs.append((run_start, i))
|
||||||
best_start, best_end = run_start, i
|
|
||||||
run_start = -1
|
run_start = -1
|
||||||
if run_start >= 0 and len(is_dark) - run_start > best_end - best_start:
|
if run_start >= 0:
|
||||||
best_start, best_end = run_start, len(is_dark)
|
dark_runs.append((run_start, len(is_dark)))
|
||||||
|
|
||||||
spine_w = best_end - best_start
|
# Filter out runs that are too narrow (< 1% of image width)
|
||||||
if spine_w < w * 0.01:
|
min_spine_px = int(w * 0.01)
|
||||||
logger.debug("Spine too narrow: %dpx (< %dpx)", spine_w, int(w * 0.01))
|
dark_runs = [(s, e) for s, e in dark_runs if e - s >= min_spine_px]
|
||||||
|
|
||||||
|
if not dark_runs:
|
||||||
|
logger.debug("No dark runs wider than %dpx in center region", min_spine_px)
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
# Score each dark run: prefer centered, dark, narrow valleys
|
||||||
|
center_region_len = center_hi - center_lo
|
||||||
|
image_center_in_region = (w * 0.5 - center_lo) # x=50% mapped into region coords
|
||||||
|
best_score = -1.0
|
||||||
|
best_start, best_end = dark_runs[0]
|
||||||
|
|
||||||
|
for rs, re in dark_runs:
|
||||||
|
run_width = re - rs
|
||||||
|
run_center = (rs + re) / 2.0
|
||||||
|
|
||||||
|
# --- Factor 1: Proximity to image center (gaussian, sigma = 15% of region) ---
|
||||||
|
sigma = center_region_len * 0.15
|
||||||
|
dist = abs(run_center - image_center_in_region)
|
||||||
|
center_factor = float(np.exp(-0.5 * (dist / sigma) ** 2))
|
||||||
|
|
||||||
|
# --- Factor 2: Darkness (how dark is the valley relative to threshold) ---
|
||||||
|
run_brightness = float(np.mean(center_brightness[rs:re]))
|
||||||
|
# Normalize: 1.0 when run_brightness == 0, 0.0 when run_brightness == spine_thresh
|
||||||
|
darkness_factor = max(0.0, (spine_thresh - run_brightness) / spine_thresh)
|
||||||
|
|
||||||
|
# --- Factor 3: Narrowness bonus (spine shadows are narrow, not wide plateaus) ---
|
||||||
|
# Typical spine: 1-5% of image width. Penalise runs wider than ~8%.
|
||||||
|
width_frac = run_width / w
|
||||||
|
if width_frac <= 0.05:
|
||||||
|
narrowness_bonus = 1.0
|
||||||
|
elif width_frac <= 0.15:
|
||||||
|
narrowness_bonus = 1.0 - (width_frac - 0.05) / 0.10 # linear decay 1.0 → 0.0
|
||||||
|
else:
|
||||||
|
narrowness_bonus = 0.0
|
||||||
|
|
||||||
|
score = center_factor * darkness_factor * (0.3 + 0.7 * narrowness_bonus)
|
||||||
|
|
||||||
|
logger.debug(
|
||||||
|
"Dark run x=%d..%d (w=%d): center_f=%.3f dark_f=%.3f narrow_b=%.3f → score=%.4f",
|
||||||
|
center_lo + rs, center_lo + re, run_width,
|
||||||
|
center_factor, darkness_factor, narrowness_bonus, score,
|
||||||
|
)
|
||||||
|
|
||||||
|
if score > best_score:
|
||||||
|
best_score = score
|
||||||
|
best_start, best_end = rs, re
|
||||||
|
|
||||||
|
spine_w = best_end - best_start
|
||||||
spine_x = center_lo + best_start
|
spine_x = center_lo + best_start
|
||||||
spine_center = spine_x + spine_w // 2
|
spine_center = spine_x + spine_w // 2
|
||||||
|
|
||||||
|
logger.debug(
|
||||||
|
"Best spine candidate: x=%d..%d (w=%d), score=%.4f",
|
||||||
|
spine_x, spine_x + spine_w, spine_w, best_score,
|
||||||
|
)
|
||||||
|
|
||||||
# Verify: must have bright (paper) content on BOTH sides
|
# Verify: must have bright (paper) content on BOTH sides
|
||||||
left_brightness = float(np.mean(brightness_smooth[max(0, spine_x - w // 10):spine_x]))
|
left_brightness = float(np.mean(brightness_smooth[max(0, spine_x - w // 10):spine_x]))
|
||||||
right_end = center_lo + best_end
|
right_end = center_lo + best_end
|
||||||
|
|||||||
@@ -15,6 +15,7 @@ import pytest
|
|||||||
|
|
||||||
from page_crop import (
|
from page_crop import (
|
||||||
detect_and_crop_page,
|
detect_and_crop_page,
|
||||||
|
detect_page_splits,
|
||||||
_detect_format,
|
_detect_format,
|
||||||
_detect_edge_projection,
|
_detect_edge_projection,
|
||||||
_detect_left_edge_shadow,
|
_detect_left_edge_shadow,
|
||||||
@@ -465,3 +466,101 @@ class TestCropDeterminism:
|
|||||||
assert np.array_equal(ref_crop, crop), (
|
assert np.array_equal(ref_crop, crop), (
|
||||||
f"Run {i} produced different pixel output"
|
f"Run {i} produced different pixel output"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Tests: detect_page_splits — spine scoring logic
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _make_book_spread(h: int = 1616, w: int = 2288) -> np.ndarray:
|
||||||
|
"""Create a synthetic landscape book spread (two pages side by side).
|
||||||
|
|
||||||
|
Simulates the ad810209 failure case:
|
||||||
|
- A narrow spine shadow near the center (~50% of width)
|
||||||
|
- A wider dark area off-center (~35% of width), simulating a text column
|
||||||
|
- Bright paper flanking the spine on both sides
|
||||||
|
"""
|
||||||
|
img = np.full((h, w, 3), 230, dtype=np.uint8)
|
||||||
|
|
||||||
|
# --- Spine shadow: narrow dark valley centered at x = w/2 (1144) ---
|
||||||
|
spine_center = w // 2
|
||||||
|
spine_half_w = 30 # ~60px wide total
|
||||||
|
for x in range(spine_center - spine_half_w, spine_center + spine_half_w + 1):
|
||||||
|
dist = abs(x - spine_center)
|
||||||
|
# Brightness dips from 230 (paper) to 130 (spine)
|
||||||
|
brightness = int(130 + (230 - 130) * min(dist / spine_half_w, 1.0))
|
||||||
|
img[:, x] = brightness
|
||||||
|
|
||||||
|
# --- Off-center dark area at ~35% of width (x=799), wider than spine ---
|
||||||
|
dark_center = int(w * 0.35)
|
||||||
|
dark_half_w = 80 # ~160px wide total (wider than spine)
|
||||||
|
for x in range(dark_center - dark_half_w, dark_center + dark_half_w + 1):
|
||||||
|
dist = abs(x - dark_center)
|
||||||
|
# Brightness dips from 230 to 140 (slightly less dark than spine)
|
||||||
|
brightness = int(140 + (230 - 140) * min(dist / dark_half_w, 1.0))
|
||||||
|
img[:, x] = min(img[0, x, 0], brightness) # don't overwrite spine if overlapping
|
||||||
|
|
||||||
|
return img
|
||||||
|
|
||||||
|
|
||||||
|
class TestDetectPageSplits:
    """Regression tests for detect_page_splits spine-scoring logic."""

    def test_portrait_image_returns_empty(self):
        """Portrait images (width < height * 1.15) should not be split."""
        img = np.full((1000, 800, 3), 200, dtype=np.uint8)
        assert detect_page_splits(img) == []

    def test_uniform_image_returns_empty(self):
        """Uniform brightness image should not detect any spine."""
        img = np.full((800, 1600, 3), 220, dtype=np.uint8)
        assert detect_page_splits(img) == []

    def test_prefers_centered_spine_over_wider_offcenter_dark(self):
        """Scoring should pick the centered narrow spine over a wider off-center dark area.

        This is the regression test for session ad810209 where the old algorithm
        picked x=799 (35%) instead of x=1144 (50%).
        """
        img = _make_book_spread(h=1616, w=2288)
        pages = detect_page_splits(img)

        assert len(pages) == 2, f"Expected 2 pages, got {len(pages)}"

        # Split point should be near the center (x ~ 1144), not at ~799
        split_x = pages[0]["width"]  # pages[0] width = split point
        center = 2288 / 2  # 1144

        assert abs(split_x - center) < 100, (
            f"Split at x={split_x}, expected near center {center:.0f}. "
            f"Old bug would have split at ~799."
        )

    def test_split_produces_two_reasonable_pages(self):
        """Both pages should be at least 15% of total width."""
        img = _make_book_spread()
        pages = detect_page_splits(img)

        if len(pages) == 2:
            w = img.shape[1]
            for p in pages:
                assert p["width"] >= w * 0.15, (
                    f"Page {p['page_index']} too narrow: {p['width']}px "
                    f"(< {w * 0.15:.0f}px)"
                )

    def test_page_indices_sequential(self):
        """Page indices should be 0, 1, ..."""
        img = _make_book_spread()
        pages = detect_page_splits(img)
        if pages:
            indices = [p["page_index"] for p in pages]
            assert indices == list(range(len(pages)))

    def test_pages_cover_full_width(self):
        """Pages should cover the full image width without gaps or overlaps."""
        img = _make_book_spread()
        pages = detect_page_splits(img)
        if len(pages) >= 2:
            w = img.shape[1]
            assert pages[0]["x"] == 0
            total_w = sum(p["width"] for p in pages)
            assert total_w == w, f"Total page width {total_w} != image width {w}"
||||||
|
|||||||
Reference in New Issue
Block a user