refactor(word-step): make table fully generic and fix marker-only row filter

Frontend: Replace hardcoded EN/DE/Example vocab table with unified dynamic table driven by columns_used from backend. Labeling, confirmation, counts, and summary badges are now all cell-based instead of branching on isVocab.

Backend: Change _cells_to_vocab_entries() entry filter from checking only english/german/example to checking ANY mapped field. This preserves rows with only marker or source_page content, fixing the issue where marker sub-columns disappeared at the end of OCR processing.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-03 08:45:24 +01:00
parent dea3349b23
commit 4d428980c1
3 changed files with 231 additions and 308 deletions
@@ -1465,6 +1465,106 @@ class TestCellsToVocabEntriesPageRef:
        assert entries[0]['source_page'] == ''
        assert entries[0]['bbox_ref'] is None

+    def test_marker_only_row_included(self):
+        """A marker-only row is kept; a row with no text in any cell is dropped."""
+        from cv_vocab_pipeline import _cells_to_vocab_entries
+
+        cells = [
+            # Row 0: English text plus a marker
+            {
+                'row_index': 0,
+                'col_type': 'column_en',
+                'text': 'hello',
+                'bbox_pct': {'x': 10, 'y': 10, 'w': 30, 'h': 5},
+                'confidence': 95.0,
+                'ocr_engine': 'tesseract',
+            },
+            {
+                'row_index': 0,
+                'col_type': 'column_marker',
+                'text': '!',
+                'bbox_pct': {'x': 5, 'y': 10, 'w': 3, 'h': 5},
+                'confidence': 80.0,
+                'ocr_engine': 'tesseract',
+            },
+            # Row 1: marker text only; the column_en cell is empty
+            {
+                'row_index': 1,
+                'col_type': 'column_en',
+                'text': '',
+                'bbox_pct': {'x': 10, 'y': 20, 'w': 30, 'h': 5},
+                'confidence': 0.0,
+                'ocr_engine': 'tesseract',
+            },
+            {
+                'row_index': 1,
+                'col_type': 'column_marker',
+                'text': '!',
+                'bbox_pct': {'x': 5, 'y': 20, 'w': 3, 'h': 5},
+                'confidence': 70.0,
+                'ocr_engine': 'tesseract',
+            },
+            # Row 2: every cell is empty, so no entry is expected for it
+            {
+                'row_index': 2,
+                'col_type': 'column_en',
+                'text': '',
+                'bbox_pct': {'x': 10, 'y': 30, 'w': 30, 'h': 5},
+                'confidence': 0.0,
+                'ocr_engine': 'tesseract',
+            },
+            {
+                'row_index': 2,
+                'col_type': 'column_marker',
+                'text': '',
+                'bbox_pct': {'x': 5, 'y': 30, 'w': 3, 'h': 5},
+                'confidence': 0.0,
+                'ocr_engine': 'tesseract',
+            },
+        ]
+        columns_meta = [
+            {'type': 'column_en'}, {'type': 'column_marker'},
+        ]
+
+        entries = _cells_to_vocab_entries(cells, columns_meta)
+
+        # Rows 0 and 1 each have text in at least one cell and yield an entry;
+        # row 2 has no text at all, which leaves exactly two entries.
+        assert len(entries) == 2
+        assert entries[0]['english'] == 'hello'
+        assert entries[0]['marker'] == '!'
+        assert entries[1]['english'] == ''
+        assert entries[1]['marker'] == '!'
+
+    def test_page_ref_only_row_included(self):
+        """A row whose only text is in a page_ref cell is kept, with that text as source_page."""
+        from cv_vocab_pipeline import _cells_to_vocab_entries
+
+        cells = [
+            {
+                'row_index': 0,
+                'col_type': 'column_en',
+                'text': '',
+                'bbox_pct': {'x': 10, 'y': 10, 'w': 30, 'h': 5},
+                'confidence': 0.0,
+                'ocr_engine': 'tesseract',
+            },
+            {
+                'row_index': 0,
+                'col_type': 'page_ref',
+                'text': 'p.59',
+                'bbox_pct': {'x': 5, 'y': 10, 'w': 5, 'h': 5},
+                'confidence': 80.0,
+                'ocr_engine': 'tesseract',
+            },
+        ]
+        columns_meta = [{'type': 'column_en'}, {'type': 'page_ref'}]
+
+        entries = _cells_to_vocab_entries(cells, columns_meta)
+
+        assert len(entries) == 1
+        assert entries[0]['source_page'] == 'p.59'
+

 # =============================================
 # RUN TESTS