From 0532b2a79764564f7a69e5a3cdb5325dc205a0c6 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Mon, 2 Mar 2026 17:54:49 +0100 Subject: [PATCH] fix(ocr-pipeline): skip edge-touching gaps in header/footer detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Gaps that extend to the image boundary (top/bottom edge) are not valid content separators — they typically represent dewarp padding. Only gaps with content on both sides qualify as header/footer boundaries. Co-Authored-By: Claude Sonnet 4.6 --- klausur-service/backend/cv_vocab_pipeline.py | 7 +++++++ .../backend/tests/test_cv_vocab_pipeline.py | 11 +++++++++++ 2 files changed, 18 insertions(+) diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index 3a9dde9..2bd8bd1 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -2593,6 +2593,9 @@ def _detect_header_footer_gaps( large_gap_threshold = median_gap * GAP_MULTIPLIER # Step 6: Find largest qualifying gap in header / footer zones + # A separator gap must have content on BOTH sides — edge-touching gaps + # (e.g. dewarp padding at bottom) are not valid separators. + EDGE_MARGIN = max(5, actual_h // 400) header_zone_limit = int(actual_h * HEADER_FOOTER_ZONE) footer_zone_start = int(actual_h * (1.0 - HEADER_FOOTER_ZONE)) @@ -2601,6 +2604,8 @@ def _detect_header_footer_gaps( best_header_size = 0 for gs, ge in raw_gaps: + if gs <= EDGE_MARGIN: + continue # skip gaps touching the top edge gap_mid = (gs + ge) / 2 gap_size = ge - gs if gap_mid < header_zone_limit and gap_size > large_gap_threshold: @@ -2610,6 +2615,8 @@ def _detect_header_footer_gaps( best_footer_size = 0 for gs, ge in raw_gaps: + if ge >= actual_h - EDGE_MARGIN: + continue # skip gaps touching the bottom edge gap_mid = (gs + ge) / 2 gap_size = ge - gs if gap_mid > footer_zone_start and gap_size > large_gap_threshold: diff --git a/klausur-service/backend/tests/test_cv_vocab_pipeline.py b/klausur-service/backend/tests/test_cv_vocab_pipeline.py index 0ce25d7..4d764c7 100644 --- a/klausur-service/backend/tests/test_cv_vocab_pipeline.py +++ b/klausur-service/backend/tests/test_cv_vocab_pipeline.py @@ -1092,6 +1092,17 @@ class TestHeaderFooterGapDetection: assert header_y is None assert footer_y is None + def test_edge_gaps_ignored_dewarp_padding(self): + """Trailing gap at bottom edge (dewarp padding) should not be detected as footer.""" + h, w = 2000, 800 + # Body lines from 10 to 1700 + bands = self._make_body_with_lines(h, w, 10, 1700) + # Gap from 1700 to 2000 = bottom edge padding (no content after) + inv = self._make_inv(h, w, bands) + header_y, footer_y = _detect_header_footer_gaps(inv, w, h) + # The trailing gap touches the image edge → not a valid separator + assert footer_y is None + class TestRegionContentCheck: """Tests for _region_has_content() and _add_header_footer() type selection."""