fix(ocr-pipeline): skip edge-touching gaps in header/footer detection
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 25s
CI / test-python-klausur (push) Failing after 1m50s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 16s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 25s
CI / test-python-klausur (push) Failing after 1m50s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 16s
Gaps that extend to the image boundary (top/bottom edge) are not valid content separators — they typically represent dewarp padding. Only gaps with content on both sides qualify as header/footer boundaries.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -2593,6 +2593,9 @@ def _detect_header_footer_gaps(
|
||||
large_gap_threshold = median_gap * GAP_MULTIPLIER
|
||||
|
||||
# Step 6: Find largest qualifying gap in header / footer zones
|
||||
# A separator gap must have content on BOTH sides — edge-touching gaps
|
||||
# (e.g. dewarp padding at bottom) are not valid separators.
|
||||
EDGE_MARGIN = max(5, actual_h // 400)
|
||||
header_zone_limit = int(actual_h * HEADER_FOOTER_ZONE)
|
||||
footer_zone_start = int(actual_h * (1.0 - HEADER_FOOTER_ZONE))
|
||||
|
||||
@@ -2601,6 +2604,8 @@ def _detect_header_footer_gaps(
|
||||
|
||||
best_header_size = 0
|
||||
for gs, ge in raw_gaps:
|
||||
if gs <= EDGE_MARGIN:
|
||||
continue # skip gaps touching the top edge
|
||||
gap_mid = (gs + ge) / 2
|
||||
gap_size = ge - gs
|
||||
if gap_mid < header_zone_limit and gap_size > large_gap_threshold:
|
||||
@@ -2610,6 +2615,8 @@ def _detect_header_footer_gaps(
|
||||
|
||||
best_footer_size = 0
|
||||
for gs, ge in raw_gaps:
|
||||
if ge >= actual_h - EDGE_MARGIN:
|
||||
continue # skip gaps touching the bottom edge
|
||||
gap_mid = (gs + ge) / 2
|
||||
gap_size = ge - gs
|
||||
if gap_mid > footer_zone_start and gap_size > large_gap_threshold:
|
||||
|
||||
@@ -1092,6 +1092,17 @@ class TestHeaderFooterGapDetection:
|
||||
assert header_y is None
|
||||
assert footer_y is None
|
||||
|
||||
def test_edge_gaps_ignored_dewarp_padding(self):
    """A gap running to the bottom image edge must not be reported as a footer.

    The page is built with body lines between rows 10 and 1700 of a
    2000-row image, so the only large gap (1700 to 2000) has no content
    after it. Such a trailing gap is treated as dewarp padding rather
    than a separator, and the detector is expected to return no footer
    boundary.
    """
    img_h, img_w = 2000, 800

    # Body lines from 10 to 1700; everything below is empty up to the edge.
    line_bands = self._make_body_with_lines(img_h, img_w, 10, 1700)
    inv_img = self._make_inv(img_h, img_w, line_bands)

    header_y, footer_y = _detect_header_footer_gaps(inv_img, img_w, img_h)

    # The trailing gap touches the image edge, so it is not a valid separator.
    assert footer_y is None
|
||||
|
||||
|
||||
class TestRegionContentCheck:
|
||||
"""Tests for _region_has_content() and _add_header_footer() type selection."""
|
||||
|
||||
Reference in New Issue
Block a user