From 2a21127f013cb5c1f2e821e70156d0085d6c52aa Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Tue, 24 Mar 2026 09:23:30 +0100 Subject: [PATCH] fix(ocr-pipeline): improve page crop spine detection and cell assignment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. page_crop: Score all dark runs by center-proximity × darkness × narrowness instead of picking the widest. Fixes ad810209 where a wide dark area at 35% was chosen over the actual spine at 50%. 2. cv_words_first: Replace x-center-only word→column assignment with overlap-based three-pass strategy (overlap → midpoint-range → nearest). Fixes truncated German translations like "Schal" instead of "Schal - die Schals" in session 079cd0d9. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_words_first.py | 40 +++++++- klausur-service/backend/page_crop.py | 69 +++++++++++-- .../backend/tests/test_page_crop.py | 99 +++++++++++++++++++ 3 files changed, 193 insertions(+), 15 deletions(-) diff --git a/klausur-service/backend/cv_words_first.py b/klausur-service/backend/cv_words_first.py index 723f62c..19f77cd 100644 --- a/klausur-service/backend/cv_words_first.py +++ b/klausur-service/backend/cv_words_first.py @@ -124,13 +124,43 @@ def _cluster_rows( # --------------------------------------------------------------------------- def _assign_word_to_column(word: Dict, columns: List[Dict]) -> int: - """Return column index for a word based on its X-center.""" - x_center = word['left'] + word['width'] / 2 + """Return column index for a word based on overlap, then center, then nearest. + + Three-pass strategy (consistent with _assign_row_words_to_columns): + 1. Overlap-based: assign to column with maximum horizontal overlap. + 2. Midpoint-range: if no overlap, use midpoints between adjacent columns. + 3. Nearest center: last resort fallback. 
+ """ + w_left = word['left'] + w_right = w_left + word['width'] + w_center = w_left + word['width'] / 2 + + # Pass 1: overlap-based + best_col = -1 + best_overlap = 0 for col in columns: - if col['x_min'] <= x_center < col['x_max']: + overlap = max(0, min(w_right, col['x_max']) - max(w_left, col['x_min'])) + if overlap > best_overlap: + best_overlap = overlap + best_col = col['index'] + if best_col >= 0 and best_overlap > 0: + return best_col + + # Pass 2: midpoint-range (non-overlapping assignment zones) + for ci, col in enumerate(columns): + if ci == 0: + assign_left = 0 + else: + assign_left = (columns[ci - 1]['x_max'] + col['x_min']) / 2 + if ci == len(columns) - 1: + assign_right = float('inf') + else: + assign_right = (col['x_max'] + columns[ci + 1]['x_min']) / 2 + if assign_left <= w_center < assign_right: return col['index'] - # Fallback: nearest column - return min(columns, key=lambda c: abs((c['x_min'] + c['x_max']) / 2 - x_center))['index'] + + # Pass 3: nearest column center + return min(columns, key=lambda c: abs((c['x_min'] + c['x_max']) / 2 - w_center))['index'] def _assign_word_to_row(word: Dict, rows: List[Dict]) -> int: diff --git a/klausur-service/backend/page_crop.py b/klausur-service/backend/page_crop.py index 7b63b9c..6caf979 100644 --- a/klausur-service/backend/page_crop.py +++ b/klausur-service/backend/page_crop.py @@ -83,10 +83,9 @@ def detect_page_splits( darkest_val, spine_thresh) return [] - # Find the contiguous dark region (spine area) + # Find ALL contiguous dark runs in the center region is_dark = center_brightness < spine_thresh - # Find the widest dark run - best_start, best_end = 0, 0 + dark_runs: list = [] # list of (start, end) pairs run_start = -1 for i in range(len(is_dark)): if is_dark[i]: @@ -94,20 +93,70 @@ def detect_page_splits( run_start = i else: if run_start >= 0: - if i - run_start > best_end - best_start: - best_start, best_end = run_start, i + dark_runs.append((run_start, i)) run_start = -1 - if run_start >= 0 and 
len(is_dark) - run_start > best_end - best_start: - best_start, best_end = run_start, len(is_dark) + if run_start >= 0: + dark_runs.append((run_start, len(is_dark))) - spine_w = best_end - best_start - if spine_w < w * 0.01: - logger.debug("Spine too narrow: %dpx (< %dpx)", spine_w, int(w * 0.01)) + # Filter out runs that are too narrow (< 1% of image width) + min_spine_px = int(w * 0.01) + dark_runs = [(s, e) for s, e in dark_runs if e - s >= min_spine_px] + + if not dark_runs: + logger.debug("No dark runs wider than %dpx in center region", min_spine_px) return [] + # Score each dark run: prefer centered, dark, narrow valleys + center_region_len = center_hi - center_lo + image_center_in_region = (w * 0.5 - center_lo) # x=50% mapped into region coords + best_score = -1.0 + best_start, best_end = dark_runs[0] + + for rs, re in dark_runs: + run_width = re - rs + run_center = (rs + re) / 2.0 + + # --- Factor 1: Proximity to image center (gaussian, sigma = 15% of region) --- + sigma = center_region_len * 0.15 + dist = abs(run_center - image_center_in_region) + center_factor = float(np.exp(-0.5 * (dist / sigma) ** 2)) + + # --- Factor 2: Darkness (how dark is the valley relative to threshold) --- + run_brightness = float(np.mean(center_brightness[rs:re])) + # Normalize: 1.0 when run_brightness == 0, 0.0 when run_brightness == spine_thresh + darkness_factor = max(0.0, (spine_thresh - run_brightness) / spine_thresh) + + # --- Factor 3: Narrowness bonus (spine shadows are narrow, not wide plateaus) --- + # Typical spine: 1-5% of image width. Full bonus up to 5%; decays linearly to zero at 15%. 
+ width_frac = run_width / w + if width_frac <= 0.05: + narrowness_bonus = 1.0 + elif width_frac <= 0.15: + narrowness_bonus = 1.0 - (width_frac - 0.05) / 0.10 # linear decay 1.0 → 0.0 + else: + narrowness_bonus = 0.0 + + score = center_factor * darkness_factor * (0.3 + 0.7 * narrowness_bonus) + + logger.debug( + "Dark run x=%d..%d (w=%d): center_f=%.3f dark_f=%.3f narrow_b=%.3f → score=%.4f", + center_lo + rs, center_lo + re, run_width, + center_factor, darkness_factor, narrowness_bonus, score, + ) + + if score > best_score: + best_score = score + best_start, best_end = rs, re + + spine_w = best_end - best_start spine_x = center_lo + best_start spine_center = spine_x + spine_w // 2 + logger.debug( + "Best spine candidate: x=%d..%d (w=%d), score=%.4f", + spine_x, spine_x + spine_w, spine_w, best_score, + ) + # Verify: must have bright (paper) content on BOTH sides left_brightness = float(np.mean(brightness_smooth[max(0, spine_x - w // 10):spine_x])) right_end = center_lo + best_end diff --git a/klausur-service/backend/tests/test_page_crop.py b/klausur-service/backend/tests/test_page_crop.py index d506486..b791ac4 100644 --- a/klausur-service/backend/tests/test_page_crop.py +++ b/klausur-service/backend/tests/test_page_crop.py @@ -15,6 +15,7 @@ import pytest from page_crop import ( detect_and_crop_page, + detect_page_splits, _detect_format, _detect_edge_projection, _detect_left_edge_shadow, @@ -465,3 +466,101 @@ class TestCropDeterminism: assert np.array_equal(ref_crop, crop), ( f"Run {i} produced different pixel output" ) + + +# --------------------------------------------------------------------------- +# Tests: detect_page_splits — spine scoring logic +# --------------------------------------------------------------------------- + +def _make_book_spread(h: int = 1616, w: int = 2288) -> np.ndarray: + """Create a synthetic landscape book spread (two pages side by side). 
+ + Simulates the ad810209 failure case: + - A narrow spine shadow near the center (~50% of width) + - A wider dark area off-center (~35% of width), simulating a text column + - Bright paper flanking the spine on both sides + """ + img = np.full((h, w, 3), 230, dtype=np.uint8) + + # --- Spine shadow: narrow dark valley centered at x = w/2 (1144) --- + spine_center = w // 2 + spine_half_w = 30 # ~60px wide total + for x in range(spine_center - spine_half_w, spine_center + spine_half_w + 1): + dist = abs(x - spine_center) + # Brightness dips from 230 (paper) to 130 (spine) + brightness = int(130 + (230 - 130) * min(dist / spine_half_w, 1.0)) + img[:, x] = brightness + + # --- Off-center dark area at ~35% of width (x=799), wider than spine --- + dark_center = int(w * 0.35) + dark_half_w = 80 # ~160px wide total (wider than spine) + for x in range(dark_center - dark_half_w, dark_center + dark_half_w + 1): + dist = abs(x - dark_center) + # Brightness dips from 230 to 140 (slightly less dark than spine) + brightness = int(140 + (230 - 140) * min(dist / dark_half_w, 1.0)) + img[:, x] = min(img[0, x, 0], brightness) # don't overwrite spine if overlapping + + return img + + +class TestDetectPageSplits: + def test_portrait_image_returns_empty(self): + """Portrait images (width < height * 1.15) should not be split.""" + img = np.full((1000, 800, 3), 200, dtype=np.uint8) + assert detect_page_splits(img) == [] + + def test_uniform_image_returns_empty(self): + """Uniform brightness image should not detect any spine.""" + img = np.full((800, 1600, 3), 220, dtype=np.uint8) + assert detect_page_splits(img) == [] + + def test_prefers_centered_spine_over_wider_offcenter_dark(self): + """Scoring should pick the centered narrow spine over a wider off-center dark area. + + This is the regression test for session ad810209 where the old algorithm + picked x=799 (35%) instead of x=1144 (50%). 
+ """ + img = _make_book_spread(h=1616, w=2288) + pages = detect_page_splits(img) + + assert len(pages) == 2, f"Expected 2 pages, got {len(pages)}" + + # Split point should be near the center (x ~ 1144), not at ~799 + split_x = pages[0]["width"] # pages[0] width = split point + center = 2288 / 2 # 1144 + + assert abs(split_x - center) < 100, ( + f"Split at x={split_x}, expected near center {center:.0f}. " + f"Old bug would have split at ~799." + ) + + def test_split_produces_two_reasonable_pages(self): + """Both pages should be at least 15% of total width.""" + img = _make_book_spread() + pages = detect_page_splits(img) + + if len(pages) == 2: + w = img.shape[1] + for p in pages: + assert p["width"] >= w * 0.15, ( + f"Page {p['page_index']} too narrow: {p['width']}px " + f"(< {w * 0.15:.0f}px)" + ) + + def test_page_indices_sequential(self): + """Page indices should be 0, 1, ...""" + img = _make_book_spread() + pages = detect_page_splits(img) + if pages: + indices = [p["page_index"] for p in pages] + assert indices == list(range(len(pages))) + + def test_pages_cover_full_width(self): + """Pages should cover the full image width without gaps or overlaps.""" + img = _make_book_spread() + pages = detect_page_splits(img) + if len(pages) >= 2: + w = img.shape[1] + assert pages[0]["x"] == 0 + total_w = sum(p["width"] for p in pages) + assert total_w == w, f"Total page width {total_w} != image width {w}"