fix(tests): adjust word counts so 10% threshold works correctly

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-02 19:00:14 +01:00
parent 7252f9a956
commit 6e1a349eed

View File

@@ -1191,14 +1191,14 @@ class TestSubColumnDetection:
)
def test_sub_column_split_page_refs(self):
"""3 page-refs left + 30 vocab words right → split into 2.
"""3 page-refs left + 40 vocab words right → split into 2.
The leftmost bin with >= 10% of words (i.e. >= 4) is the vocab bin
The leftmost bin with >= 10% of words (>= 5) is the vocab bin
at left=250, so the 3 page-refs are outliers.
"""
content_w = 1000
page_words = [self._make_word(100, f"p.{59+i}") for i in range(3)]
vocab_words = [self._make_word(250, f"word{i}") for i in range(30)]
vocab_words = [self._make_word(250, f"word{i}") for i in range(40)]
all_words = page_words + vocab_words
geo = self._make_geo(x=80, width=300, words=all_words, content_w=content_w)
@@ -1209,7 +1209,7 @@ class TestSubColumnDetection:
right_col = result[1]
assert left_col.x < right_col.x
assert left_col.word_count == 3
assert right_col.word_count == 30
assert right_col.word_count == 40
assert left_col.index == 0
assert right_col.index == 1
@@ -1271,11 +1271,11 @@ class TestSubColumnDetection:
words1 = [self._make_word(50, f"de{i}") for i in range(10)]
geo1 = ColumnGeometry(index=0, x=30, y=50, width=200, height=500,
word_count=10, words=words1, width_ratio=0.2)
# Second column: will split (3 outliers + 30 main)
# Second column: will split (3 outliers + 40 main)
page_words = [self._make_word(400, f"p.{i}") for i in range(3)]
en_words = [self._make_word(550, f"en{i}") for i in range(30)]
en_words = [self._make_word(550, f"en{i}") for i in range(40)]
geo2 = ColumnGeometry(index=1, x=380, y=50, width=300, height=500,
word_count=33, words=page_words + en_words, width_ratio=0.3)
word_count=43, words=page_words + en_words, width_ratio=0.3)
result = _detect_sub_columns([geo1, geo2], content_w)
@@ -1283,7 +1283,7 @@ class TestSubColumnDetection:
assert [g.index for g in result] == [0, 1, 2]
assert result[0].word_count == 10
assert result[1].word_count == 3
assert result[2].word_count == 30
assert result[2].word_count == 40
def test_no_split_too_few_words(self):
"""Column with fewer than 5 words → no split attempted."""