feat(ocr-pipeline): generic header/footer detection via projection gap analysis
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 25s
CI / test-python-klausur (push) Failing after 1m48s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 16s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 25s
CI / test-python-klausur (push) Failing after 1m48s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 16s
Replace the trivial top_y/bottom_y threshold check with a horizontal projection gap analysis that finds large whitespace gaps separating header/footer content from the main body. This correctly detects headers (e.g. "VOCABULARY" banners) and footers (e.g. page numbers) even when _find_content_bounds includes them in the content area.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -34,6 +34,7 @@ from cv_vocab_pipeline import (
|
||||
_find_content_bounds,
|
||||
_filter_narrow_runs,
|
||||
_build_margin_regions,
|
||||
_detect_header_footer_gaps,
|
||||
analyze_layout,
|
||||
_group_words_into_lines,
|
||||
match_lines_to_vocab,
|
||||
@@ -989,6 +990,106 @@ class TestMarginRegions:
|
||||
assert m.classification_method == 'content_bounds'
|
||||
|
||||
|
||||
# =============================================
|
||||
# Header/Footer Gap Detection
|
||||
# =============================================
|
||||
|
||||
class TestHeaderFooterGapDetection:
    """Exercise _detect_header_footer_gaps() on synthetic inverted images."""

    def _make_inv(self, height: int, width: int, bands: list) -> np.ndarray:
        """Build an inverted binary image containing white horizontal bands.

        Args:
            height: Number of rows in the image.
            width: Number of columns in the image.
            bands: (y_start, y_end) pairs; rows in [y_start, y_end) are set
                to 255 across the full width, every other pixel stays 0.

        Returns:
            A uint8 array of shape (height, width).
        """
        canvas = np.zeros((height, width), dtype=np.uint8)
        for top, bottom in bands:
            canvas[top:bottom, :] = 255
        return canvas

    def _make_body_with_lines(self, h, w, body_start, body_end,
                              line_h=15, gap_h=12):
        """Return bands that mimic evenly spaced text lines.

        Bands of height ``line_h`` are emitted from ``body_start`` onwards,
        separated by ``gap_h`` rows, for as long as a whole band still fits
        before ``body_end``. ``h`` and ``w`` are accepted but not used here.

        Note: gap_h must be large enough to survive smoothing
        (kernel ~ h//200).
        """
        stride = line_h + gap_h
        rows = []
        top = body_start
        while True:
            bottom = top + line_h
            if not bottom <= body_end:
                break
            rows.append((top, bottom))
            top += stride
        return rows

    def test_header_gap_detected(self):
        """Top content, then a large gap, then the body: header_y in the gap."""
        h, w = 2000, 800
        # Rows 20-80 hold the header; rows 80-300 are a 220px gap, far larger
        # than the 12px inter-line gaps. Body lines run from 300 to ~1990, so
        # the body reaches close to the bottom and leaves no footer gap.
        header_band = (20, 80)
        body_bands = self._make_body_with_lines(h, w, 300, 1990)
        inv = self._make_inv(h, w, [header_band] + body_bands)

        header_y, _footer_y = _detect_header_footer_gaps(inv, w, h)

        assert header_y is not None
        assert 80 <= header_y <= 310

    def test_footer_gap_detected(self):
        """Body, then a large gap, then a page number: footer_y in the gap."""
        h, w = 2000, 800
        # Body lines run from 10 to 1600, so the body starts close to the top
        # and leaves no header gap. A large gap (~280px) follows, then the
        # page number occupies rows 1880-1920.
        body_bands = self._make_body_with_lines(h, w, 10, 1600)
        page_number_band = (1880, 1920)
        inv = self._make_inv(h, w, body_bands + [page_number_band])

        _header_y, footer_y = _detect_header_footer_gaps(inv, w, h)

        assert footer_y is not None
        assert 1580 <= footer_y <= 1890

    def test_both_header_and_footer(self):
        """Header, gap, body lines, gap, footer: both boundaries detected."""
        h, w = 2000, 800
        # Header rows 10-60, then a 190px gap (60-250), body lines from 250
        # to 1700, a 200px gap (1700-1900), and a footer at rows 1900-1970.
        layout = [(10, 60)]
        layout.extend(self._make_body_with_lines(h, w, 250, 1700))
        layout.append((1900, 1970))
        inv = self._make_inv(h, w, layout)

        header_y, footer_y = _detect_header_footer_gaps(inv, w, h)

        assert header_y is not None
        assert footer_y is not None
        assert 60 <= header_y <= 260
        assert 1690 <= footer_y <= 1910

    def test_no_gaps_returns_none(self):
        """Uniform content over the whole page yields (None, None)."""
        h, w = 1000, 800
        # A single band covering every row: there is no gap anywhere.
        full_page = [(0, 1000)]
        inv = self._make_inv(h, w, full_page)

        header_y, footer_y = _detect_header_footer_gaps(inv, w, h)

        assert header_y is None
        assert footer_y is None

    def test_small_gaps_ignored(self):
        """Gaps smaller than 2x median should be ignored."""
        h, w = 1000, 800
        # Many small, evenly spaced gaps (like line spacing) with no large
        # outlier: 15px of content followed by a 5px gap, repeated every 20px.
        line_bands = [(top, top + 15) for top in range(0, 1000, 20)]
        inv = self._make_inv(h, w, line_bands)

        header_y, footer_y = _detect_header_footer_gaps(inv, w, h)

        # All gaps are equal size, none > 2x median, so no header/footer.
        assert header_y is None
        assert footer_y is None
|
||||
|
||||
|
||||
# =============================================
|
||||
# RUN TESTS
|
||||
# =============================================
|
||||
|
||||
Reference in New Issue
Block a user