# breakpilot-lehrer/klausur-service/backend/tests/test_box_column_awareness.py

"""
Tests for box-aware column detection.

Verifies that:
1. Words inside boxes are excluded from column clustering (words_first)
2. Column geometries are enriched with box-filtered original words (layout)
3. Inline markers (bullet points) are not split into sub-columns

License: Apache 2.0
"""

import sys
import os

sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

from cv_words_first import build_grid_from_words, _cluster_columns


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _word(text: str, left: int, top: int, width: int, height: int,
          conf: int = 90) -> dict:
    return {
        'text': text, 'left': left, 'top': top,
        'width': width, 'height': height, 'conf': conf,
    }


def _box(x: int, y: int, w: int, h: int) -> dict:
    return {'x': x, 'y': y, 'width': w, 'height': h}


# ---------------------------------------------------------------------------
# Tests: box filtering in build_grid_from_words
# ---------------------------------------------------------------------------

class TestBoxAwareGridBuilding:
    """Grid building must leave out words that lie inside a box."""

    @staticmethod
    def _texts(cells) -> set:
        """Collect the 'text' value of every grid cell into a set."""
        return {cell['text'] for cell in cells}

    def test_no_boxes_unchanged(self):
        """With no box rectangles supplied, every word reaches the grid."""
        page_words = [
            _word("hello", 50, 100, 80, 20),
            _word("world", 300, 100, 80, 20),
        ]
        grid_cells, _columns = build_grid_from_words(page_words, 600, 400)
        assert len(grid_cells) >= 2
        found = self._texts(grid_cells)
        assert 'hello' in found
        assert 'world' in found

    def test_box_words_excluded(self):
        """A word placed inside the box is missing from the grid cells."""
        page_words = [
            _word("outside1", 50, 50, 80, 20),
            _word("outside2", 300, 50, 80, 20),
            # Lies within the box rectangle below.
            _word("inside_box", 150, 250, 100, 20),
        ]
        # The box covers x=100..400 and y=200..350.
        grid_cells, _columns = build_grid_from_words(
            page_words, 600, 500, box_rects=[_box(100, 200, 300, 150)],
        )

        found = self._texts(grid_cells)
        assert 'outside1' in found
        assert 'outside2' in found
        assert 'inside_box' not in found

    def test_all_words_in_box_returns_empty(self):
        """When every word sits inside the box, cells and columns are empty."""
        page_words = [
            _word("a", 150, 250, 30, 20),
            _word("b", 200, 250, 30, 20),
        ]
        grid_cells, columns = build_grid_from_words(
            page_words, 600, 500, box_rects=[_box(100, 200, 300, 150)],
        )
        assert grid_cells == []
        assert columns == []

    def test_multiple_boxes(self):
        """Words from each of several boxes are all dropped."""
        page_words = [
            _word("content", 50, 50, 80, 20),
            _word("box1_word", 120, 220, 80, 20),
            _word("box2_word", 420, 220, 80, 20),
        ]
        first_box = _box(100, 200, 200, 100)
        second_box = _box(400, 200, 200, 100)
        grid_cells, _columns = build_grid_from_words(
            page_words, 700, 400, box_rects=[first_box, second_box],
        )
        assert self._texts(grid_cells) == {'content'}

    def test_word_on_box_border_excluded(self):
        """A word starting exactly on the box boundary is still dropped."""
        page_words = [
            _word("content", 50, 50, 80, 20),
            # Left edge coincides with box.x; the word's center is inside.
            _word("edge", 100, 200, 40, 20),
        ]
        grid_cells, _columns = build_grid_from_words(
            page_words, 600, 400, box_rects=[_box(100, 200, 200, 100)],
        )
        assert 'edge' not in self._texts(grid_cells)

    def test_columns_not_affected_by_box_words(self):
        """A boxed word at a distant X must not add a column."""
        # Two columns of regular content ...
        content = [
            _word("col1_a", 50, 50, 80, 20),
            _word("col1_b", 50, 100, 80, 20),
            _word("col2_a", 300, 50, 80, 20),
            _word("col2_b", 300, 100, 80, 20),
        ]
        # ... plus one boxed word at X=500, which would open a third
        # column if it were not filtered out.
        boxed = [_word("box_far", 500, 250, 80, 20)]
        _grid_cells, columns = build_grid_from_words(
            content + boxed, 700, 500, box_rects=[_box(450, 200, 200, 150)],
        )
        # At most the two content columns may remain.
        assert len(columns) <= 2


# ---------------------------------------------------------------------------
# Tests: _cluster_columns with box-filtered words
# ---------------------------------------------------------------------------

class TestClusterColumnsFiltering:
    """Column clustering behaves as expected on already-filtered words."""

    def test_gap_detection_without_box_words(self):
        """Two X positions far apart yield exactly two columns."""
        left_column = [
            _word("a", 50, 50, 30, 20),
            _word("b", 50, 100, 30, 20),
        ]
        right_column = [
            _word("c", 300, 50, 30, 20),
            _word("d", 300, 100, 30, 20),
        ]
        clustered = _cluster_columns(left_column + right_column, 600)
        assert len(clustered) == 2

    def test_single_column_when_words_close(self):
        """Words with nearly identical X positions collapse into one column."""
        nearby_words = [
            _word("a", 50, 50, 80, 20),
            _word("b", 60, 100, 80, 20),
            _word("c", 55, 150, 80, 20),
        ]
        clustered = _cluster_columns(nearby_words, 600)
        assert len(clustered) == 1


# ---------------------------------------------------------------------------
# Tests: inline marker guard (bullet points)
# ---------------------------------------------------------------------------

class TestInlineMarkerGuard:
    """Inline markers (bullets / numbering) must NOT become sub-columns."""

    def test_concept_bullet_vs_page_ref(self):
        """Contrast the marker-to-text gap of a bullet with a page reference.

        A bullet sits within a few pixels of the text it introduces
        (~5-10px), whereas a page reference is separated from the main
        text by a wide gap (~50+ px).
        """
        # Bullet: "1." starts at left=50 with width 20, text starts at 65.
        # 65 - 70 = -5, i.e. overlapping or touching -> no split.
        marker_left, marker_width, text_left = 50, 20, 65
        bullet_gap = text_left - (marker_left + marker_width)
        assert bullet_gap < 20  # very small gap

        # Page ref: "p.55" starts at left=20 with width 40, text at 120.
        # 120 - 60 = 60, i.e. clear separation -> split.
        ref_left, ref_width, body_left = 20, 40, 120
        pageref_gap = body_left - (ref_left + ref_width)
        assert pageref_gap > 30  # clear gap