fix(sub-columns): exclude header/footer words from alignment clustering
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 31s
CI / test-go-edu-search (push) Successful in 24s
CI / test-python-klausur (push) Failing after 1m51s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 18s

Header/footer words (page numbers, chapter titles) could pollute the
left-edge alignment bins and trigger false sub-column splits. Now
_detect_header_footer_gaps() runs early and its boundaries are passed
to _detect_sub_columns() to filter those words from clustering and
the split threshold check.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-03 07:33:54 +01:00
parent 3904ddb493
commit 6527beae03
3 changed files with 101 additions and 5 deletions

View File

@@ -1330,6 +1330,73 @@ class TestSubColumnDetection:
assert result[0].word_count == 3
assert result[1].word_count == 40
def test_header_words_excluded_from_alignment(self):
    """Passing ``header_y`` keeps header words out of alignment clustering.

    The column holds 3 header words at left=100 (top=5) and 40 content
    words at left=250 (top=200).

    * Called without ``header_y``, the detector is expected to split the
      column in two (3 outliers vs. 40 main words).
    * Called with ``top_y=0, header_y=50``, the detector is expected to
      return a single column that still reports all 43 words.
    """
    page_width = 1000
    page_top = 0

    def word(left, top, text):
        # Every word shares the same box size and confidence.
        return {'left': left, 'top': top, 'width': 50, 'height': 20,
                'text': text, 'conf': 90}

    # Header row sits at top=5, well above the header_y=50 used below.
    in_header = [word(100, 5, f"Ch.{i}") for i in range(3)]
    # Body words sit at top=200, below header_y=50.
    in_body = [word(250, 200, f"word{i}") for i in range(40)]

    column = self._make_geo(
        x=80,
        width=300,
        words=[*in_header, *in_body],
        content_w=page_width,
    )

    # Baseline: no header boundary given, the 3 words at left=100 trigger a split.
    unfiltered = _detect_sub_columns([column], page_width)
    assert len(unfiltered) == 2, "Should split without header filtering"

    # With header_y=50 only the 40 uniformly aligned words are clustered.
    filtered = _detect_sub_columns(
        [column], page_width, top_y=page_top, header_y=50
    )
    assert len(filtered) == 1, "Should NOT split with header words excluded"
    # The header words are only ignored for clustering, not removed from the geometry.
    assert filtered[0].word_count == 43
def test_footer_words_excluded_from_alignment(self):
    """Passing ``footer_y`` keeps footer words out of alignment clustering.

    Mirror image of the header scenario: 40 content words at left=250
    (top=200) plus 3 footer words at left=100 (top=900).

    * Called without ``footer_y``, the detector is expected to split the
      column in two.
    * Called with ``top_y=0, footer_y=800``, the detector is expected to
      return a single column that still reports all 43 words.
    """
    page_width = 1000
    page_top = 0

    def word(left, top, text):
        # Every word shares the same box size and confidence.
        return {'left': left, 'top': top, 'width': 50, 'height': 20,
                'text': text, 'conf': 90}

    # Body words sit at top=200, above the footer_y=800 used below.
    in_body = [word(250, 200, f"word{i}") for i in range(40)]
    # Footer row sits at top=900, past footer_y=800.
    in_footer = [word(100, 900, f"p.{i}") for i in range(3)]

    column = self._make_geo(
        x=80,
        width=300,
        words=[*in_body, *in_footer],
        content_w=page_width,
    )

    # Baseline: no footer boundary given, the 3 words at left=100 trigger a split.
    unfiltered = _detect_sub_columns([column], page_width)
    assert len(unfiltered) == 2, "Should split without footer filtering"

    # With footer_y=800 the footer words no longer take part in clustering.
    filtered = _detect_sub_columns(
        [column], page_width, top_y=page_top, footer_y=800
    )
    assert len(filtered) == 1, "Should NOT split with footer words excluded"
    assert filtered[0].word_count == 43
def test_header_footer_none_no_filtering(self):
    """Explicit ``header_y=None`` / ``footer_y=None`` must not filter anything.

    With 3 words at left=100 and 40 words at left=250, the detector is
    expected to split into a 3-word column followed by a 40-word column.
    """
    page_width = 1000

    # 3 page-reference style words at left=100, then 40 words at left=250.
    refs = [self._make_word(100, f"p.{59+i}") for i in range(3)]
    vocab = [self._make_word(250, f"word{i}") for i in range(40)]

    column = self._make_geo(
        x=80,
        width=300,
        words=[*refs, *vocab],
        content_w=page_width,
    )

    columns = _detect_sub_columns(
        [column], page_width, header_y=None, footer_y=None
    )

    assert len(columns) == 2, "Should still split with None header/footer"
    assert columns[0].word_count == 3
    assert columns[1].word_count == 40
class TestCellsToVocabEntriesPageRef:
"""Test that page_ref cells are mapped to source_page field."""