"""Tests for cv_words_first.py — Words-First Grid Builder.""" import pytest from cv_words_first import ( _assign_word_to_column, _assign_word_to_row, _build_cells, _cluster_columns, _cluster_rows, build_grid_from_words, ) # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- def _word(text: str, left: int, top: int, width: int = 60, height: int = 20, conf: int = 90): """Create a synthetic word dict.""" return { 'text': text, 'left': left, 'top': top, 'width': width, 'height': height, 'conf': conf, } # --------------------------------------------------------------------------- # _cluster_columns # --------------------------------------------------------------------------- class TestClusterColumns: def test_single_column_freetext(self): """Words spread evenly across page → 1 column (column_text).""" words = [ _word("Hello", 50, 10), _word("world", 120, 10), _word("this", 50, 40), _word("is", 120, 40), _word("text", 190, 40), ] cols = _cluster_columns(words, img_w=400) assert len(cols) == 1 assert cols[0]['type'] == 'column_text' def test_two_columns(self): """Two word groups with large X-gap → 2 columns.""" words = [ _word("apple", 20, 10), _word("Apfel", 300, 10), _word("dog", 20, 40), _word("Hund", 300, 40), ] cols = _cluster_columns(words, img_w=500) assert len(cols) == 2 assert cols[0]['type'] == 'column_1' assert cols[1]['type'] == 'column_2' def test_three_columns(self): """Three groups separated by wide gaps → 3 columns.""" words = [ _word("1", 10, 10, width=20), _word("apple", 100, 10), _word("Apfel", 400, 10), _word("2", 10, 40, width=20), _word("dog", 100, 40), _word("Hund", 400, 40), ] cols = _cluster_columns(words, img_w=600) assert len(cols) == 3 def test_empty_words(self): """No words → empty result.""" assert _cluster_columns([], img_w=500) == [] # --------------------------------------------------------------------------- # _cluster_rows # 
# ---------------------------------------------------------------------------


class TestClusterRows:
    """Tests for ``_cluster_rows``."""

    def test_two_rows(self):
        """Words at two Y-levels → 2 rows, ordered top to bottom."""
        words = [
            _word("hello", 10, 20),
            _word("world", 100, 25),
            _word("foo", 10, 80),
            _word("bar", 100, 82),
        ]
        rows = _cluster_rows(words)
        assert len(rows) == 2
        assert rows[0]['y_min'] < rows[1]['y_min']

    def test_single_row(self):
        """All words at (nearly) the same Y → 1 row."""
        words = [
            _word("a", 10, 50),
            _word("b", 80, 52),
            _word("c", 150, 51),
        ]
        rows = _cluster_rows(words)
        assert len(rows) == 1

    def test_empty(self):
        """No words → empty result."""
        assert _cluster_rows([]) == []


# ---------------------------------------------------------------------------
# build_grid_from_words (integration)
# ---------------------------------------------------------------------------


class TestBuildGridFromWords:
    """Integration tests for ``build_grid_from_words``."""

    def test_two_column_vocab(self):
        """Simulate a 2-column vocabulary page with 3 rows."""
        words = [
            _word("apple", 50, 20),
            _word("Apfel", 400, 22),
            _word("dog", 50, 60),
            _word("Hund", 400, 62),
            _word("cat", 50, 100),
            _word("Katze", 400, 102),
        ]
        cells, cols_meta = build_grid_from_words(words, img_w=600, img_h=200)
        assert len(cols_meta) == 2
        assert len(cells) == 6  # 3 rows × 2 cols

        # Check cell_id format
        cell_ids = {c['cell_id'] for c in cells}
        assert 'R00_C0' in cell_ids
        assert 'R00_C1' in cell_ids

    def test_single_column_freetext(self):
        """Single-column text → 1 column, multiple rows."""
        words = [
            _word("Hello", 50, 20),
            _word("world", 120, 22),
            _word("Second", 50, 60),
            _word("line", 120, 62),
        ]
        cells, cols_meta = build_grid_from_words(words, img_w=300, img_h=150)
        assert len(cols_meta) == 1
        assert cols_meta[0]['type'] == 'column_text'
        assert len(cells) == 2  # 2 rows, 1 column each

    def test_empty_input(self):
        """No words → no cells and no column metadata."""
        cells, cols = build_grid_from_words([], img_w=500, img_h=500)
        assert cells == []
        assert cols == []

    def test_low_confidence_filtered(self):
        """Words below min_confidence are excluded."""
        words = [
            _word("good", 50, 20, conf=90),
            _word("bad", 200, 20, conf=10),
        ]
        cells, cols = build_grid_from_words(
            words, img_w=400, img_h=100, min_confidence=30
        )
        # Only the good word should produce a cell
        assert len(cells) == 1
        assert cells[0]['text'] == 'good'

    def test_bbox_pct_correct(self):
        """Check that bbox_pct is correctly computed from pixel coords."""
        words = [_word("test", 200, 100, width=100, height=30)]
        cells, _ = build_grid_from_words(words, img_w=1000, img_h=500)
        assert len(cells) == 1
        bp = cells[0]['bbox_pct']
        # The expected percentages are pixel/size ratios; compare with a
        # tolerance because bit-exact float equality is brittle.
        assert bp['x'] == pytest.approx(20.0)  # 200/1000*100
        assert bp['y'] == pytest.approx(20.0)  # 100/500*100
        assert bp['w'] == pytest.approx(10.0)  # 100/1000*100
        assert bp['h'] == pytest.approx(6.0)   # 30/500*100

    def test_columns_meta_format(self):
        """columns_meta has same keys as build_cell_grid_v2 output."""
        words = [
            _word("a", 50, 20),
            _word("b", 400, 20),
        ]
        _, cols_meta = build_grid_from_words(words, img_w=600, img_h=100)
        for col in cols_meta:
            for key in ('index', 'type', 'x', 'width'):
                assert key in col

    def test_word_boxes_included(self):
        """Each cell should contain word_boxes with percent coords."""
        words = [
            _word("hello", 50, 20),
            _word("world", 120, 22),
        ]
        cells, _ = build_grid_from_words(words, img_w=300, img_h=100)
        assert len(cells) == 1  # single row, single column
        wb = cells[0].get('word_boxes', [])
        assert len(wb) == 2
        for w in wb:
            assert 'left' in w
            assert 'top' in w
            assert 'text' in w

    def test_all_whitespace_filtered(self):
        """Words with only whitespace text are filtered out."""
        words = [
            _word("   ", 50, 20, conf=90),
            _word("hello", 200, 20, conf=90),
        ]
        cells, _ = build_grid_from_words(words, img_w=400, img_h=100)
        assert len(cells) == 1
        assert cells[0]['text'] == 'hello'