feat(ocr-pipeline): distinguish header/footer vs margin_top/margin_bottom
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 29s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 2m0s
CI / test-python-agent-core (push) Successful in 18s
CI / test-nodejs-website (push) Successful in 19s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 29s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 2m0s
CI / test-python-agent-core (push) Successful in 18s
CI / test-nodejs-website (push) Successful in 19s
Check for actual ink content in the detected top/bottom regions:
- 'header'/'footer' when text is present (e.g. title, page number)
- 'margin_top'/'margin_bottom' when the region is an empty page margin

Also update all skip-type sets and color maps for the new types.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -35,6 +35,8 @@ from cv_vocab_pipeline import (
|
||||
_filter_narrow_runs,
|
||||
_build_margin_regions,
|
||||
_detect_header_footer_gaps,
|
||||
_region_has_content,
|
||||
_add_header_footer,
|
||||
analyze_layout,
|
||||
_group_words_into_lines,
|
||||
match_lines_to_vocab,
|
||||
@@ -340,7 +342,8 @@ class TestLayoutAnalysis:
|
||||
ocr_img = create_ocr_image(text_like_image)
|
||||
layout_img = create_layout_image(text_like_image)
|
||||
regions = analyze_layout(layout_img, ocr_img)
|
||||
valid_types = {'column_en', 'column_de', 'column_example', 'header', 'footer'}
|
||||
valid_types = {'column_en', 'column_de', 'column_example',
|
||||
'header', 'footer', 'margin_top', 'margin_bottom'}
|
||||
for r in regions:
|
||||
assert r.type in valid_types, f"Unexpected region type: {r.type}"
|
||||
|
||||
@@ -976,7 +979,7 @@ class TestMarginRegions:
|
||||
|
||||
def test_margins_in_skip_types(self):
|
||||
"""Verify margin types are in the skip set used by build_cell_grid."""
|
||||
skip = {'column_ignore', 'header', 'footer', 'page_ref', 'margin_left', 'margin_right'}
|
||||
skip = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'page_ref', 'margin_left', 'margin_right'}
|
||||
assert 'margin_left' in skip
|
||||
assert 'margin_right' in skip
|
||||
|
||||
@@ -1090,6 +1093,72 @@ class TestHeaderFooterGapDetection:
|
||||
assert footer_y is None
|
||||
|
||||
|
||||
class TestRegionContentCheck:
    """Cover _region_has_content() and the region type picked by _add_header_footer().

    Every test builds a synthetic mask in which full-width rows set to 255
    stand in for ink and rows left at 0 stand in for blank paper.
    """

    def _make_inv(self, height: int, width: int, bands: list) -> np.ndarray:
        """Return a (height, width) uint8 mask with each (y1, y2) row band set to 255.

        Bands follow slice semantics: row y1 is included, row y2 is not.
        """
        canvas = np.zeros((height, width), dtype=np.uint8)
        for row_start, row_stop in bands:
            canvas[row_start:row_stop, :] = 255
        return canvas

    def test_region_with_text_has_content(self):
        """A strip that overlaps an ink band is reported as having content."""
        mask = self._make_inv(1000, 800, [(10, 50)])
        has_ink = _region_has_content(mask, 0, 100)
        assert has_ink is True

    def test_empty_region_no_content(self):
        """A strip with ink only outside of it is reported as empty."""
        mask = self._make_inv(1000, 800, [(500, 600)])
        has_ink = _region_has_content(mask, 0, 100)
        assert has_ink is False

    def test_header_with_text_is_header(self):
        """Ink above the body start makes the top region a 'header'."""
        img_height, img_width = 1000, 800
        # Header text occupies rows 20-60; the body begins at row 200.
        mask = self._make_inv(img_height, img_width, [(20, 60), (200, 900)])
        collected: list = []
        # top_y=200 mimics content-bounds detection placing the body start there.
        _add_header_footer(
            collected,
            top_y=200,
            bottom_y=img_height,
            img_w=img_width,
            img_h=img_height,
            inv=mask,
        )
        top_kinds = [
            region.type
            for region in collected
            if region.type in ('header', 'margin_top')
        ]
        assert len(top_kinds) == 1
        assert top_kinds[0] == 'header'  # rows 20-60 carry ink

    def test_empty_top_is_margin_top(self):
        """No ink above the body start makes the top region a 'margin_top'."""
        img_height, img_width = 1000, 800
        # Only the body (rows 200-900) carries ink; the first 200 rows are blank.
        mask = self._make_inv(img_height, img_width, [(200, 900)])
        collected: list = []
        # top_y=200 mimics content-bounds detection placing the body start there.
        _add_header_footer(
            collected,
            top_y=200,
            bottom_y=img_height,
            img_w=img_width,
            img_h=img_height,
            inv=mask,
        )
        top_kinds = [
            region.type
            for region in collected
            if region.type in ('header', 'margin_top')
        ]
        assert len(top_kinds) == 1
        assert top_kinds[0] == 'margin_top'

    def test_empty_bottom_is_margin_bottom(self):
        """No ink below the body end makes the bottom region a 'margin_bottom'."""
        img_height, img_width = 1000, 800
        # Ink spans rows 50-700 only; everything below row 700 is blank.
        mask = self._make_inv(img_height, img_width, [(50, 700)])
        collected: list = []
        _add_header_footer(
            collected,
            top_y=50,
            bottom_y=700,
            img_w=img_width,
            img_h=img_height,
            inv=mask,
        )
        bottom_kinds = [
            region.type
            for region in collected
            if region.type in ('footer', 'margin_bottom')
        ]
        assert len(bottom_kinds) == 1
        assert bottom_kinds[0] == 'margin_bottom'

    def test_footer_with_page_number_is_footer(self):
        """Ink below the body end (e.g. a page number) makes the bottom region a 'footer'."""
        img_height, img_width = 1000, 800
        # Body spans rows 50-700; a page-number band sits at rows 900-930.
        mask = self._make_inv(img_height, img_width, [(50, 700), (900, 930)])
        collected: list = []
        _add_header_footer(
            collected,
            top_y=50,
            bottom_y=700,
            img_w=img_width,
            img_h=img_height,
            inv=mask,
        )
        bottom_kinds = [
            region.type
            for region in collected
            if region.type in ('footer', 'margin_bottom')
        ]
        assert len(bottom_kinds) == 1
        assert bottom_kinds[0] == 'footer'
|
||||
|
||||
|
||||
# =============================================
|
||||
# RUN TESTS
|
||||
# =============================================
|
||||
|
||||
Reference in New Issue
Block a user