# breakpilot-lehrer/klausur-service/backend/tests/test_cv_words_first.py

"""Tests for cv_words_first.py — Words-First Grid Builder."""

import pytest
from cv_words_first import (
    _assign_word_to_column,
    _assign_word_to_row,
    _build_cells,
    _cluster_columns,
    _cluster_rows,
    build_grid_from_words,
)


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _word(text: str, left: int, top: int, width: int = 60, height: int = 20, conf: int = 90):
    """Build a synthetic OCR word record for use as test input.

    Args:
        text: Recognised text of the word.
        left: X coordinate of the box's left edge.
        top: Y coordinate of the box's top edge.
        width: Box width (defaults to 60).
        height: Box height (defaults to 20).
        conf: Confidence value attached to the word (defaults to 90).

    Returns:
        A dict with the keys 'text', 'left', 'top', 'width', 'height'
        and 'conf', in that order.
    """
    return dict(
        text=text,
        left=left,
        top=top,
        width=width,
        height=height,
        conf=conf,
    )


# ---------------------------------------------------------------------------
# _cluster_columns
# ---------------------------------------------------------------------------

class TestClusterColumns:
    """Expectations for _cluster_columns: column count and column types."""

    def test_single_column_freetext(self):
        """Evenly spread running text is expected to yield one 'column_text' column."""
        layout = [
            ("Hello", 50, 10),
            ("world", 120, 10),
            ("this", 50, 40),
            ("is", 120, 40),
            ("text", 190, 40),
        ]
        words = [_word(text, left, top) for text, left, top in layout]

        columns = _cluster_columns(words, img_w=400)

        assert len(columns) == 1
        assert columns[0]['type'] == 'column_text'

    def test_two_columns(self):
        """Two groups separated by a large horizontal gap are expected to give 2 columns."""
        layout = [
            ("apple", 20, 10),
            ("Apfel", 300, 10),
            ("dog", 20, 40),
            ("Hund", 300, 40),
        ]
        words = [_word(text, left, top) for text, left, top in layout]

        columns = _cluster_columns(words, img_w=500)

        assert len(columns) == 2
        first, second = columns
        assert first['type'] == 'column_1'
        assert second['type'] == 'column_2'

    def test_three_columns(self):
        """Three groups separated by wide gaps are expected to give 3 columns."""
        words = []
        # Each line: a narrow marker, then an English and a German word.
        for marker, english, german, top in (
            ("1", "apple", "Apfel", 10),
            ("2", "dog", "Hund", 40),
        ):
            words.extend([
                _word(marker, 10, top, width=20),
                _word(english, 100, top),
                _word(german, 400, top),
            ])

        columns = _cluster_columns(words, img_w=600)

        assert len(columns) == 3

    def test_empty_words(self):
        """An empty word list is expected to produce an empty column list."""
        columns = _cluster_columns([], img_w=500)
        assert columns == []


# ---------------------------------------------------------------------------
# _cluster_rows
# ---------------------------------------------------------------------------

class TestClusterRows:
    """Expectations for _cluster_rows: row count and top-to-bottom ordering."""

    def test_two_rows(self):
        """Words at two distinct Y levels are expected to form 2 ordered rows."""
        layout = [
            ("hello", 10, 20),
            ("world", 100, 25),
            ("foo", 10, 80),
            ("bar", 100, 82),
        ]
        words = [_word(text, left, top) for text, left, top in layout]

        clustered = _cluster_rows(words)

        assert len(clustered) == 2
        upper, lower = clustered[0], clustered[1]
        assert upper['y_min'] < lower['y_min']

    def test_single_row(self):
        """Words at nearly the same Y are expected to end up in a single row."""
        layout = [("a", 10, 50), ("b", 80, 52), ("c", 150, 51)]
        words = [_word(text, left, top) for text, left, top in layout]

        clustered = _cluster_rows(words)

        assert len(clustered) == 1

    def test_empty(self):
        """An empty word list is expected to produce an empty row list."""
        clustered = _cluster_rows([])
        assert clustered == []


# ---------------------------------------------------------------------------
# build_grid_from_words (integration)
# ---------------------------------------------------------------------------

class TestBuildGridFromWords:
    """Integration tests for build_grid_from_words.

    Each test feeds synthetic word dicts (see ``_word``) into the builder and
    checks the returned ``(cells, columns_meta)`` pair.
    """

    def test_two_column_vocab(self):
        """Simulate a 2-column vocabulary page with 3 rows."""
        words = [
            _word("apple", 50, 20),
            _word("Apfel", 400, 22),
            _word("dog", 50, 60),
            _word("Hund", 400, 62),
            _word("cat", 50, 100),
            _word("Katze", 400, 102),
        ]
        cells, cols_meta = build_grid_from_words(words, img_w=600, img_h=200)

        assert len(cols_meta) == 2
        assert len(cells) == 6  # 3 rows × 2 cols
        # Check cell_id format
        cell_ids = {c['cell_id'] for c in cells}
        assert 'R00_C0' in cell_ids
        assert 'R00_C1' in cell_ids

    def test_single_column_freetext(self):
        """Single-column text → 1 column, multiple rows."""
        words = [
            _word("Hello", 50, 20),
            _word("world", 120, 22),
            _word("Second", 50, 60),
            _word("line", 120, 62),
        ]
        cells, cols_meta = build_grid_from_words(words, img_w=300, img_h=150)

        assert len(cols_meta) == 1
        assert cols_meta[0]['type'] == 'column_text'
        assert len(cells) == 2  # 2 rows, 1 column each

    def test_empty_input(self):
        """No words → no cells and no column metadata."""
        cells, cols = build_grid_from_words([], img_w=500, img_h=500)
        assert cells == []
        assert cols == []

    def test_low_confidence_filtered(self):
        """Words below min_confidence are excluded."""
        words = [
            _word("good", 50, 20, conf=90),
            _word("bad", 200, 20, conf=10),
        ]
        cells, cols = build_grid_from_words(words, img_w=400, img_h=100, min_confidence=30)
        # Only the good word should produce a cell
        assert len(cells) == 1
        assert cells[0]['text'] == 'good'

    def test_bbox_pct_correct(self):
        """Check that bbox_pct is correctly computed from pixel coords."""
        words = [_word("test", 200, 100, width=100, height=30)]
        cells, _ = build_grid_from_words(words, img_w=1000, img_h=500)
        assert len(cells) == 1
        bp = cells[0]['bbox_pct']
        # Percentages come out of float division, so compare with a tolerance
        # instead of exact equality to stay robust against rounding noise.
        assert bp['x'] == pytest.approx(20.0)   # 200/1000*100
        assert bp['y'] == pytest.approx(20.0)   # 100/500*100
        assert bp['w'] == pytest.approx(10.0)   # 100/1000*100
        assert bp['h'] == pytest.approx(6.0)    # 30/500*100

    def test_columns_meta_format(self):
        """columns_meta has same keys as build_cell_grid_v2 output."""
        words = [
            _word("a", 50, 20),
            _word("b", 400, 20),
        ]
        _, cols_meta = build_grid_from_words(words, img_w=600, img_h=100)
        # Guard against a vacuous pass: the loop below checks nothing if the
        # builder returns no column metadata at all.
        assert cols_meta
        for col in cols_meta:
            for key in ('index', 'type', 'x', 'width'):
                assert key in col

    def test_word_boxes_included(self):
        """Each cell should contain word_boxes with percent coords."""
        words = [
            _word("hello", 50, 20),
            _word("world", 120, 22),
        ]
        cells, _ = build_grid_from_words(words, img_w=300, img_h=100)
        assert len(cells) == 1  # single row, single column
        wb = cells[0].get('word_boxes', [])
        assert len(wb) == 2
        for w in wb:
            assert 'left' in w
            assert 'top' in w
            assert 'text' in w

    def test_all_whitespace_filtered(self):
        """Words with only whitespace text are filtered out."""
        words = [
            _word("  ", 50, 20, conf=90),
            _word("hello", 200, 20, conf=90),
        ]
        cells, _ = build_grid_from_words(words, img_w=400, img_h=100)
        assert len(cells) == 1
        assert cells[0]['text'] == 'hello'