feat: Words-First Grid Builder (bottom-up alternative zu cell_grid_v2)

Neuer Algorithmus in cv_words_first.py: Clustert Tesseract word_boxes direkt zu Spalten (X-Gap) und Zeilen (Y-Proximity), baut Zellen an Schnittpunkten. Keine Spalten-/Zeilenerkennung noetig.

- cv_words_first.py: _cluster_columns, _cluster_rows, _build_cells, build_grid_from_words
- ocr_pipeline_api.py: grid_method Parameter (v2|words_first) im /words Endpoint
- StepWordRecognition.tsx: Dropdown Toggle fuer Grid-Methode
- OCR-Pipeline.md: Doku v4.3.0 mit Words-First Algorithmus
- 15 Unit-Tests fuer cv_words_first

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-12 06:46:05 +01:00
parent 2fdf3ff868
commit ced5bb3dd3
6 changed files with 854 additions and 34 deletions
@@ -0,0 +1,214 @@
+"""Tests for cv_words_first.py — Words-First Grid Builder."""
+
+import pytest
+from cv_words_first import (
+    _assign_word_to_column,
+    _assign_word_to_row,
+    _build_cells,
+    _cluster_columns,
+    _cluster_rows,
+    build_grid_from_words,
+)
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def _word(text: str, left: int, top: int, width: int = 60, height: int = 20, conf: int = 90):
+    """Return a synthetic word-box dict for use as test input.
+
+    The result carries exactly the keys ``text``, ``left``, ``top``,
+    ``width``, ``height`` and ``conf``, each set to the argument of the
+    same name.
+    """
+    return dict(text=text, left=left, top=top, width=width, height=height, conf=conf)
+
+
+# ---------------------------------------------------------------------------
+# _cluster_columns
+# ---------------------------------------------------------------------------
+
+class TestClusterColumns:
+    """Exercise ``_cluster_columns`` on synthetic word layouts."""
+
+    def test_single_column_freetext(self):
+        """Evenly spread words are expected to form one ``column_text`` column."""
+        layout = [
+            ("Hello", 50, 10),
+            ("world", 120, 10),
+            ("this", 50, 40),
+            ("is", 120, 40),
+            ("text", 190, 40),
+        ]
+        boxes = [_word(text, left, top) for text, left, top in layout]
+        columns = _cluster_columns(boxes, img_w=400)
+        assert len(columns) == 1
+        assert columns[0]['type'] == 'column_text'
+
+    def test_two_columns(self):
+        """Two word groups split by a large X-gap are expected to give 2 columns."""
+        layout = [
+            ("apple", 20, 10),
+            ("Apfel", 300, 10),
+            ("dog", 20, 40),
+            ("Hund", 300, 40),
+        ]
+        boxes = [_word(text, left, top) for text, left, top in layout]
+        columns = _cluster_columns(boxes, img_w=500)
+        assert len(columns) == 2
+        first, second = columns
+        assert first['type'] == 'column_1'
+        assert second['type'] == 'column_2'
+
+    def test_three_columns(self):
+        """Three word groups split by wide gaps are expected to give 3 columns."""
+        boxes = []
+        # One entry per printed line: narrow number, English word, German word.
+        for number, top, english, german in (("1", 10, "apple", "Apfel"), ("2", 40, "dog", "Hund")):
+            boxes += [
+                _word(number, 10, top, width=20),
+                _word(english, 100, top),
+                _word(german, 400, top),
+            ]
+        columns = _cluster_columns(boxes, img_w=600)
+        assert len(columns) == 3
+
+    def test_empty_words(self):
+        """An empty word list is expected to give an empty column list."""
+        columns = _cluster_columns([], img_w=500)
+        assert columns == []
+
+
+# ---------------------------------------------------------------------------
+# _cluster_rows
+# ---------------------------------------------------------------------------
+
+class TestClusterRows:
+    """Exercise ``_cluster_rows`` on synthetic word layouts."""
+
+    def test_two_rows(self):
+        """Words at two distinct Y-levels are expected to give 2 rows, top one first."""
+        layout = [
+            ("hello", 10, 20),
+            ("world", 100, 25),
+            ("foo", 10, 80),
+            ("bar", 100, 82),
+        ]
+        boxes = [_word(text, left, top) for text, left, top in layout]
+        rows = _cluster_rows(boxes)
+        assert len(rows) == 2
+        upper, lower = rows
+        assert upper['y_min'] < lower['y_min']
+
+    def test_single_row(self):
+        """Words at nearly the same Y are expected to give a single row."""
+        layout = [("a", 10, 50), ("b", 80, 52), ("c", 150, 51)]
+        boxes = [_word(text, left, top) for text, left, top in layout]
+        rows = _cluster_rows(boxes)
+        assert len(rows) == 1
+
+    def test_empty(self):
+        """An empty word list is expected to give an empty row list."""
+        rows = _cluster_rows([])
+        assert rows == []
+
+
+# ---------------------------------------------------------------------------
+# build_grid_from_words (integration)
+# ---------------------------------------------------------------------------
+
+class TestBuildGridFromWords:
+    """Integration tests for ``build_grid_from_words``.
+
+    Each test feeds synthetic word dicts (see ``_word``) plus image
+    dimensions and inspects the returned ``(cells, columns_meta)`` pair.
+    """
+
+    def test_two_column_vocab(self):
+        """Simulate a 2-column vocabulary page with 3 rows."""
+        words = [
+            _word("apple", 50, 20),
+            _word("Apfel", 400, 22),
+            _word("dog", 50, 60),
+            _word("Hund", 400, 62),
+            _word("cat", 50, 100),
+            _word("Katze", 400, 102),
+        ]
+        cells, cols_meta = build_grid_from_words(words, img_w=600, img_h=200)
+
+        assert len(cols_meta) == 2
+        assert len(cells) == 6  # 3 rows x 2 cols
+        # Check cell_id format
+        cell_ids = {c['cell_id'] for c in cells}
+        assert 'R00_C0' in cell_ids
+        assert 'R00_C1' in cell_ids
+
+    def test_single_column_freetext(self):
+        """Single-column text is expected to give 1 column and multiple rows."""
+        words = [
+            _word("Hello", 50, 20),
+            _word("world", 120, 22),
+            _word("Second", 50, 60),
+            _word("line", 120, 62),
+        ]
+        cells, cols_meta = build_grid_from_words(words, img_w=300, img_h=150)
+
+        assert len(cols_meta) == 1
+        assert cols_meta[0]['type'] == 'column_text'
+        assert len(cells) == 2  # 2 rows, 1 column each
+
+    def test_empty_input(self):
+        """No words are expected to give empty cells and empty column metadata."""
+        cells, cols = build_grid_from_words([], img_w=500, img_h=500)
+        assert cells == []
+        assert cols == []
+
+    def test_low_confidence_filtered(self):
+        """Words below min_confidence are excluded."""
+        words = [
+            _word("good", 50, 20, conf=90),
+            _word("bad", 200, 20, conf=10),
+        ]
+        cells, cols = build_grid_from_words(words, img_w=400, img_h=100, min_confidence=30)
+        # Only the good word should produce a cell
+        assert len(cells) == 1
+        assert cells[0]['text'] == 'good'
+
+    def test_bbox_pct_correct(self):
+        """Check that bbox_pct is correctly computed from pixel coords."""
+        words = [_word("test", 200, 100, width=100, height=30)]
+        cells, _ = build_grid_from_words(words, img_w=1000, img_h=500)
+        assert len(cells) == 1
+        bp = cells[0]['bbox_pct']
+        # Percentages come from float division; compare with a tolerance so
+        # the test does not depend on the exact rounding of the computation.
+        assert bp['x'] == pytest.approx(20.0)   # 200/1000*100
+        assert bp['y'] == pytest.approx(20.0)   # 100/500*100
+        assert bp['w'] == pytest.approx(10.0)   # 100/1000*100
+        assert bp['h'] == pytest.approx(6.0)    # 30/500*100
+
+    def test_columns_meta_format(self):
+        """Every columns_meta entry carries the keys index, type, x and width."""
+        words = [
+            _word("a", 50, 20),
+            _word("b", 400, 20),
+        ]
+        _, cols_meta = build_grid_from_words(words, img_w=600, img_h=100)
+        # Guard against a vacuous pass: the key checks below never run
+        # if no column metadata is returned at all.
+        assert cols_meta
+        for col in cols_meta:
+            assert 'index' in col
+            assert 'type' in col
+            assert 'x' in col
+            assert 'width' in col
+
+    def test_word_boxes_included(self):
+        """Each cell should contain word_boxes with left, top and text keys."""
+        words = [
+            _word("hello", 50, 20),
+            _word("world", 120, 22),
+        ]
+        cells, _ = build_grid_from_words(words, img_w=300, img_h=100)
+        assert len(cells) == 1  # single row, single column
+        wb = cells[0].get('word_boxes', [])
+        assert len(wb) == 2
+        for w in wb:
+            assert 'left' in w
+            assert 'top' in w
+            assert 'text' in w
+
+    def test_all_whitespace_filtered(self):
+        """Words with only whitespace text are filtered out."""
+        words = [
+            _word("  ", 50, 20, conf=90),
+            _word("hello", 200, 20, conf=90),
+        ]
+        cells, _ = build_grid_from_words(words, img_w=400, img_h=100)
+        assert len(cells) == 1
+        assert cells[0]['text'] == 'hello'