diff --git a/klausur-service/backend/tests/test_cv_vocab_pipeline.py b/klausur-service/backend/tests/test_cv_vocab_pipeline.py index a13bedd..b95164b 100644 --- a/klausur-service/backend/tests/test_cv_vocab_pipeline.py +++ b/klausur-service/backend/tests/test_cv_vocab_pipeline.py @@ -1191,14 +1191,14 @@ class TestSubColumnDetection: ) def test_sub_column_split_page_refs(self): - """3 page-refs left + 30 vocab words right → split into 2. + """3 page-refs left + 40 vocab words right → split into 2. - The leftmost bin with >= 10% of words (i.e. >= 4) is the vocab bin + The leftmost bin with >= 10% of words (>= 5) is the vocab bin at left=250, so the 3 page-refs are outliers. """ content_w = 1000 page_words = [self._make_word(100, f"p.{59+i}") for i in range(3)] - vocab_words = [self._make_word(250, f"word{i}") for i in range(30)] + vocab_words = [self._make_word(250, f"word{i}") for i in range(40)] all_words = page_words + vocab_words geo = self._make_geo(x=80, width=300, words=all_words, content_w=content_w) @@ -1209,7 +1209,7 @@ class TestSubColumnDetection: right_col = result[1] assert left_col.x < right_col.x assert left_col.word_count == 3 - assert right_col.word_count == 30 + assert right_col.word_count == 40 assert left_col.index == 0 assert right_col.index == 1 @@ -1271,11 +1271,11 @@ class TestSubColumnDetection: words1 = [self._make_word(50, f"de{i}") for i in range(10)] geo1 = ColumnGeometry(index=0, x=30, y=50, width=200, height=500, word_count=10, words=words1, width_ratio=0.2) - # Second column: will split (3 outliers + 30 main) + # Second column: will split (3 outliers + 40 main) page_words = [self._make_word(400, f"p.{i}") for i in range(3)] - en_words = [self._make_word(550, f"en{i}") for i in range(30)] + en_words = [self._make_word(550, f"en{i}") for i in range(40)] geo2 = ColumnGeometry(index=1, x=380, y=50, width=300, height=500, - word_count=33, words=page_words + en_words, width_ratio=0.3) + word_count=43, words=page_words + en_words, width_ratio=0.3) result = _detect_sub_columns([geo1, geo2], content_w) @@ -1283,7 +1283,7 @@ class TestSubColumnDetection: assert [g.index for g in result] == [0, 1, 2] assert result[0].word_count == 10 assert result[1].word_count == 3 - assert result[2].word_count == 30 + assert result[2].word_count == 40 def test_no_split_too_few_words(self): """Column with fewer than 5 words → no split attempted."""