feat(ocr-pipeline): generic header/footer detection via projection gap analysis
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 25s
CI / test-python-klausur (push) Failing after 1m48s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 16s

Replace the trivial top_y/bottom_y threshold check with horizontal
projection gap analysis that finds large whitespace gaps separating
header/footer content from the main body. This correctly detects
headers (e.g. "VOCABULARY" banners) and footers (page numbers) even
when _find_content_bounds includes them in the content area.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-02 16:13:48 +01:00
parent a052f73de3
commit f615c5f66d
3 changed files with 233 additions and 23 deletions

View File

@@ -34,6 +34,7 @@ from cv_vocab_pipeline import (
_find_content_bounds,
_filter_narrow_runs,
_build_margin_regions,
_detect_header_footer_gaps,
analyze_layout,
_group_words_into_lines,
match_lines_to_vocab,
@@ -989,6 +990,106 @@ class TestMarginRegions:
assert m.classification_method == 'content_bounds'
# =============================================
# Header/Footer Gap Detection
# =============================================
class TestHeaderFooterGapDetection:
    """Synthetic-image tests for _detect_header_footer_gaps().

    Each test paints full-width white bands (value 255) onto a black
    uint8 canvas to imitate rows of ink in an inverted binary page, then
    checks which header/footer split rows the detector reports.
    """

    def _make_inv(self, height: int, width: int, bands: list) -> np.ndarray:
        """Build an inverted binary image containing white horizontal bands.

        Args:
            height: Number of rows in the image.
            width: Number of columns in the image.
            bands: Iterable of (y_start, y_end) row ranges; rows in
                [y_start, y_end) are set to 255 across the full width.

        Returns:
            A (height, width) uint8 array that is 0 outside the bands.
        """
        canvas = np.zeros((height, width), dtype=np.uint8)
        for top, bottom in bands:
            # Slicing the first axis only fills every column of those rows.
            canvas[top:bottom] = 255
        return canvas

    def _make_body_with_lines(self, h, w, body_start, body_end,
                              line_h=15, gap_h=12):
        """Return bands imitating evenly spaced text lines in a body area.

        Lines of height ``line_h`` are stacked from ``body_start`` downward,
        separated by ``gap_h`` blank rows; a line is only emitted if it ends
        at or before ``body_end``. ``h`` and ``w`` are accepted but not used.

        Note: gap_h must be large enough to survive smoothing
        (kernel ~ h//200).
        """
        stride = line_h + gap_h
        lines = []
        top = body_start
        while top + line_h <= body_end:
            lines.append((top, top + line_h))
            top += stride
        return lines

    def test_header_gap_detected(self):
        """Top content, then a large gap, then the body → header_y in the gap."""
        h, w = 2000, 800
        # Header ink occupies rows 20-80; the body starts at row 300, leaving
        # a ~220px gap that dwarfs the 12px inter-line gaps. The body runs to
        # ~1990, i.e. close to the bottom edge, so no footer gap is built in.
        header_band = (20, 80)
        bands = [header_band] + self._make_body_with_lines(h, w, 300, 1990)
        inv = self._make_inv(h, w, bands)

        header_y, _ = _detect_header_footer_gaps(inv, w, h)

        assert header_y is not None
        assert 80 <= header_y <= 310

    def test_footer_gap_detected(self):
        """Body, then a large gap, then a page number → footer_y in the gap."""
        h, w = 2000, 800
        # The body starts at row 10 (near the top, so no header gap) and ends
        # around row 1600; the page number sits at rows 1880-1920, leaving a
        # gap of roughly 280px between them.
        page_number_band = (1880, 1920)
        bands = self._make_body_with_lines(h, w, 10, 1600)
        bands.append(page_number_band)
        inv = self._make_inv(h, w, bands)

        _, footer_y = _detect_header_footer_gaps(inv, w, h)

        assert footer_y is not None
        assert 1580 <= footer_y <= 1890

    def test_both_header_and_footer(self):
        """Header + gap + body lines + gap + footer → both splits reported."""
        h, w = 2000, 800
        # Layout: header 10-60, ~190px gap, body 250-1700, ~200px gap,
        # footer 1900-1970.
        header_band = (10, 60)
        footer_band = (1900, 1970)
        bands = [header_band]
        bands += self._make_body_with_lines(h, w, 250, 1700)
        bands.append(footer_band)
        inv = self._make_inv(h, w, bands)

        header_y, footer_y = _detect_header_footer_gaps(inv, w, h)

        assert header_y is not None
        assert footer_y is not None
        assert 60 <= header_y <= 260
        assert 1690 <= footer_y <= 1910

    def test_no_gaps_returns_none(self):
        """A page that is ink from top to bottom yields (None, None)."""
        h, w = 1000, 800
        # One band covering every row: there is no whitespace to find.
        inv = self._make_inv(h, w, [(0, 1000)])

        header_y, footer_y = _detect_header_footer_gaps(inv, w, h)

        assert header_y is None
        assert footer_y is None

    def test_small_gaps_ignored(self):
        """Gaps smaller than 2x median should be ignored."""
        h, w = 1000, 800
        # Regular line spacing only: 15px of content followed by a 5px gap,
        # repeated down the page, so no gap stands out from the rest.
        bands = [(row_start, row_start + 15)
                 for row_start in range(0, 1000, 20)]
        inv = self._make_inv(h, w, bands)

        header_y, footer_y = _detect_header_footer_gaps(inv, w, h)

        # All gaps are equal size, none > 2x median → no header/footer.
        assert header_y is None
        assert footer_y is None
# =============================================
# RUN TESTS
# =============================================