fix(ocr-pipeline): skip edge-touching gaps in header/footer detection
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 25s
CI / test-python-klausur (push) Failing after 1m50s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 16s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 25s
CI / test-python-klausur (push) Failing after 1m50s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 16s
Gaps that extend to the image boundary (top/bottom edge) are not valid content separators — they typically represent dewarp padding. Only gaps with content on both sides qualify as header/footer boundaries.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -2593,6 +2593,9 @@ def _detect_header_footer_gaps(
|
|||||||
large_gap_threshold = median_gap * GAP_MULTIPLIER
|
large_gap_threshold = median_gap * GAP_MULTIPLIER
|
||||||
|
|
||||||
# Step 6: Find largest qualifying gap in header / footer zones
|
# Step 6: Find largest qualifying gap in header / footer zones
|
||||||
|
# A separator gap must have content on BOTH sides — edge-touching gaps
|
||||||
|
# (e.g. dewarp padding at bottom) are not valid separators.
|
||||||
|
EDGE_MARGIN = max(5, actual_h // 400)
|
||||||
header_zone_limit = int(actual_h * HEADER_FOOTER_ZONE)
|
header_zone_limit = int(actual_h * HEADER_FOOTER_ZONE)
|
||||||
footer_zone_start = int(actual_h * (1.0 - HEADER_FOOTER_ZONE))
|
footer_zone_start = int(actual_h * (1.0 - HEADER_FOOTER_ZONE))
|
||||||
|
|
||||||
@@ -2601,6 +2604,8 @@ def _detect_header_footer_gaps(
|
|||||||
|
|
||||||
best_header_size = 0
|
best_header_size = 0
|
||||||
for gs, ge in raw_gaps:
|
for gs, ge in raw_gaps:
|
||||||
|
if gs <= EDGE_MARGIN:
|
||||||
|
continue # skip gaps touching the top edge
|
||||||
gap_mid = (gs + ge) / 2
|
gap_mid = (gs + ge) / 2
|
||||||
gap_size = ge - gs
|
gap_size = ge - gs
|
||||||
if gap_mid < header_zone_limit and gap_size > large_gap_threshold:
|
if gap_mid < header_zone_limit and gap_size > large_gap_threshold:
|
||||||
@@ -2610,6 +2615,8 @@ def _detect_header_footer_gaps(
|
|||||||
|
|
||||||
best_footer_size = 0
|
best_footer_size = 0
|
||||||
for gs, ge in raw_gaps:
|
for gs, ge in raw_gaps:
|
||||||
|
if ge >= actual_h - EDGE_MARGIN:
|
||||||
|
continue # skip gaps touching the bottom edge
|
||||||
gap_mid = (gs + ge) / 2
|
gap_mid = (gs + ge) / 2
|
||||||
gap_size = ge - gs
|
gap_size = ge - gs
|
||||||
if gap_mid > footer_zone_start and gap_size > large_gap_threshold:
|
if gap_mid > footer_zone_start and gap_size > large_gap_threshold:
|
||||||
|
|||||||
@@ -1092,6 +1092,17 @@ class TestHeaderFooterGapDetection:
|
|||||||
assert header_y is None
|
assert header_y is None
|
||||||
assert footer_y is None
|
assert footer_y is None
|
||||||
|
|
||||||
|
def test_edge_gaps_ignored_dewarp_padding(self):
|
||||||
|
"""Trailing gap at bottom edge (dewarp padding) should not be detected as footer."""
|
||||||
|
h, w = 2000, 800
|
||||||
|
# Body lines from 10 to 1700
|
||||||
|
bands = self._make_body_with_lines(h, w, 10, 1700)
|
||||||
|
# Gap from 1700 to 2000 = bottom edge padding (no content after)
|
||||||
|
inv = self._make_inv(h, w, bands)
|
||||||
|
header_y, footer_y = _detect_header_footer_gaps(inv, w, h)
|
||||||
|
# The trailing gap touches the image edge → not a valid separator
|
||||||
|
assert footer_y is None
|
||||||
|
|
||||||
|
|
||||||
class TestRegionContentCheck:
|
class TestRegionContentCheck:
|
||||||
"""Tests for _region_has_content() and _add_header_footer() type selection."""
|
"""Tests for _region_has_content() and _add_header_footer() type selection."""
|
||||||
|
|||||||
Reference in New Issue
Block a user