feat: box-aware column detection — exclude box content from global columns
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 29s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 2m4s
CI / test-python-agent-core (push) Successful in 18s
CI / test-nodejs-website (push) Successful in 19s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 29s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 2m4s
CI / test-python-agent-core (push) Successful in 18s
CI / test-nodejs-website (push) Successful in 19s
- Enrich column geometries with original full-page words (box-filtered) so _detect_sub_columns() finds narrow sub-columns across box boundaries
- Add inline marker guard: bullet points (1., 2., •) are not split into sub-columns (minimum gap check: 1.2× word height or 20px)
- Add box_rects parameter to build_grid_from_words() — words inside boxes are excluded from X-gap column clustering
- Pass box rects from zones to words_first grid builder
- Add 9 tests for box-aware column detection

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
174
klausur-service/backend/tests/test_box_column_awareness.py
Normal file
174
klausur-service/backend/tests/test_box_column_awareness.py
Normal file
@@ -0,0 +1,174 @@
|
||||
"""
|
||||
Tests for box-aware column detection.
|
||||
|
||||
Verifies that:
|
||||
1. Words inside boxes are excluded from column clustering (words_first)
|
||||
2. Column geometries are enriched with box-filtered original words (layout)
|
||||
3. Inline markers (bullet points) are not split into sub-columns
|
||||
|
||||
License: Apache 2.0
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
||||
|
||||
from cv_words_first import build_grid_from_words, _cluster_columns
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _word(text: str, left: int, top: int, width: int, height: int,
          conf: int = 90) -> dict:
    """Build a word dict to feed into the grid/column functions under test.

    Args:
        text: Text content of the word.
        left: X position of the word's left edge.
        top: Y position of the word's top edge.
        width: Horizontal extent of the word.
        height: Vertical extent of the word.
        conf: Value stored under the ``'conf'`` key (defaults to 90).

    Returns:
        A dict with the keys ``text``, ``left``, ``top``, ``width``,
        ``height`` and ``conf``.
    """
    return {
        'text': text, 'left': left, 'top': top,
        'width': width, 'height': height, 'conf': conf,
    }
|
||||
|
||||
|
||||
def _box(x: int, y: int, w: int, h: int) -> dict:
    """Build a box rectangle dict as passed via ``box_rects``.

    Args:
        x: X position of the box origin (left edge in the tests below).
        y: Y position of the box origin (top edge in the tests below).
        w: Box width, stored under the ``'width'`` key.
        h: Box height, stored under the ``'height'`` key.

    Returns:
        A dict with the keys ``x``, ``y``, ``width`` and ``height``.
    """
    return {'x': x, 'y': y, 'width': w, 'height': h}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tests: box filtering in build_grid_from_words
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestBoxAwareGridBuilding:
    """Words inside boxes should be excluded from column clustering.

    Every test calls ``build_grid_from_words(words, width, height, ...)``
    and inspects the returned ``(cells, cols)`` pair.
    """

    def test_no_boxes_unchanged(self):
        """Without boxes, all words should be used."""
        words = [
            _word("hello", 50, 100, 80, 20),
            _word("world", 300, 100, 80, 20),
        ]
        cells, _ = build_grid_from_words(words, 600, 400)
        assert len(cells) >= 2
        texts = {c['text'] for c in cells}
        assert 'hello' in texts
        assert 'world' in texts

    def test_box_words_excluded(self):
        """Words inside a box should not appear in the grid."""
        words = [
            _word("outside1", 50, 50, 80, 20),
            _word("outside2", 300, 50, 80, 20),
            _word("inside_box", 150, 250, 100, 20),  # inside box
        ]
        box = _box(100, 200, 300, 150)  # box from x=100..400, y=200..350
        cells, _ = build_grid_from_words(words, 600, 500, box_rects=[box])

        texts = {c['text'] for c in cells}
        assert 'outside1' in texts
        assert 'outside2' in texts
        assert 'inside_box' not in texts

    def test_all_words_in_box_returns_empty(self):
        """If all words are inside the box, return empty grid."""
        words = [
            _word("a", 150, 250, 30, 20),
            _word("b", 200, 250, 30, 20),
        ]
        box = _box(100, 200, 300, 150)
        cells, cols = build_grid_from_words(words, 600, 500, box_rects=[box])
        assert cells == []
        assert cols == []

    def test_multiple_boxes(self):
        """Words in multiple boxes should all be excluded."""
        words = [
            _word("content", 50, 50, 80, 20),
            _word("box1_word", 120, 220, 80, 20),
            _word("box2_word", 420, 220, 80, 20),
        ]
        boxes = [
            _box(100, 200, 200, 100),  # box1
            _box(400, 200, 200, 100),  # box2
        ]
        cells, _ = build_grid_from_words(words, 700, 400, box_rects=boxes)
        texts = {c['text'] for c in cells}
        assert texts == {'content'}

    def test_word_on_box_border_excluded(self):
        """A word exactly on the box boundary should be excluded."""
        words = [
            _word("content", 50, 50, 80, 20),
            # left/top edge coincide with the box origin; the word's
            # center (120, 210) lies inside the box
            _word("edge", 100, 200, 40, 20),
        ]
        box = _box(100, 200, 200, 100)
        cells, _ = build_grid_from_words(words, 600, 400, box_rects=[box])
        texts = {c['text'] for c in cells}
        assert 'edge' not in texts

    def test_columns_not_affected_by_box_words(self):
        """Box words should not create extra columns via X-gap analysis."""
        # Two columns of content words, plus a word in a box at a different X
        words = [
            _word("col1_a", 50, 50, 80, 20),
            _word("col1_b", 50, 100, 80, 20),
            _word("col2_a", 300, 50, 80, 20),
            _word("col2_b", 300, 100, 80, 20),
            # This box word is at X=500, would create a 3rd column if not filtered
            _word("box_far", 500, 250, 80, 20),
        ]
        box = _box(450, 200, 200, 150)
        _, cols = build_grid_from_words(words, 700, 500, box_rects=[box])
        # Should only have 2 columns (not 3)
        assert len(cols) <= 2
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tests: _cluster_columns with box-filtered words
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestClusterColumnsFiltering:
    """Verify column clustering works correctly with filtered words.

    Both tests call ``_cluster_columns(words, 600)`` directly and check
    only the number of columns returned.
    """

    def test_gap_detection_without_box_words(self):
        """Column gaps should be found from content words only."""
        content_words = [
            _word("a", 50, 50, 30, 20),
            _word("b", 50, 100, 30, 20),
            _word("c", 300, 50, 30, 20),
            _word("d", 300, 100, 30, 20),
        ]
        columns = _cluster_columns(content_words, 600)
        assert len(columns) == 2

    def test_single_column_when_words_close(self):
        """Close-together words should form a single column."""
        words = [
            _word("a", 50, 50, 80, 20),
            _word("b", 60, 100, 80, 20),
            _word("c", 55, 150, 80, 20),
        ]
        columns = _cluster_columns(words, 600)
        assert len(columns) == 1
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tests: inline marker guard (bullet points)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestInlineMarkerGuard:
    """Bullet points / numbering should NOT be split into sub-columns."""

    def test_concept_bullet_vs_page_ref(self):
        """Demonstrate the gap difference between bullets and page refs.

        Bullet points sit tight against the main text: the gap is small,
        and in the example below it is even negative (overlapping).
        Page references have a large gap (~50+ px).

        NOTE(review): this test only checks the arithmetic of the two
        example layouts; it does not call the sub-column detector itself.
        """
        # Bullet point scenario: "1." at left=50 (width 20), main text at left=65
        # Gap = 65 - (50+20) = -5 (overlapping or touching → no split)
        bullet_gap = 65 - (50 + 20)
        assert bullet_gap < 20  # very small gap

        # Page ref scenario: "p.55" at left=20 (width 40), main text at left=120
        # Gap = 120 - (20+40) = 60 (clear separation → split)
        pageref_gap = 120 - (20 + 40)
        assert pageref_gap > 30  # clear gap
|
||||
Reference in New Issue
Block a user