refactor(word-step): make table fully generic and fix marker-only row filter
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 24s
CI / test-go-edu-search (push) Successful in 24s
CI / test-python-klausur (push) Failing after 1m43s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 17s

Frontend: Replace hardcoded EN/DE/Example vocab table with unified dynamic
table driven by columns_used from backend. Labeling, confirmation, counts,
and summary badges are now all cell-based instead of branching on isVocab.

Backend: Change _cells_to_vocab_entries() entry filter from checking only
english/german/example to checking ANY mapped field. This preserves rows
with only marker or source_page content, fixing the issue where marker
sub-columns disappeared at the end of OCR processing.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-03 08:45:24 +01:00
parent dea3349b23
commit 4d428980c1
3 changed files with 231 additions and 308 deletions

View File

@@ -1465,6 +1465,106 @@ class TestCellsToVocabEntriesPageRef:
assert entries[0]['source_page'] == ''
assert entries[0]['bbox_ref'] is None
def test_marker_only_row_included(self):
    """A row whose only non-empty cell is the marker must still yield an entry.

    Three rows are fed in: one with english text plus a marker, one with
    a marker only, and one where every cell is empty. The test expects
    two entries back (the first two rows, in order).
    """
    from cv_vocab_pipeline import _cells_to_vocab_entries

    def build_cell(row, col_type, text, x, y, w, confidence):
        # Every fixture cell shares the same height and OCR engine.
        return {
            'row_index': row,
            'col_type': col_type,
            'text': text,
            'bbox_pct': {'x': x, 'y': y, 'w': w, 'h': 5},
            'confidence': confidence,
            'ocr_engine': 'tesseract',
        }

    # Row 0: english text together with a marker.
    row_with_text = [
        build_cell(0, 'column_en', 'hello', 10, 10, 30, 95.0),
        build_cell(0, 'column_marker', '!', 5, 10, 3, 80.0),
    ]
    # Row 1: empty english cell, marker present.
    row_marker_only = [
        build_cell(1, 'column_en', '', 10, 20, 30, 0.0),
        build_cell(1, 'column_marker', '!', 5, 20, 3, 70.0),
    ]
    # Row 2: nothing in any cell; expected to be dropped.
    row_blank = [
        build_cell(2, 'column_en', '', 10, 30, 30, 0.0),
        build_cell(2, 'column_marker', '', 5, 30, 3, 0.0),
    ]
    cells = row_with_text + row_marker_only + row_blank
    columns_meta = [{'type': 'column_en'}, {'type': 'column_marker'}]

    entries = _cells_to_vocab_entries(cells, columns_meta)

    # Only the blank row is filtered out.
    assert len(entries) == 2
    text_entry, marker_entry = entries
    assert text_entry['english'] == 'hello'
    assert text_entry['marker'] == '!'
    assert marker_entry['english'] == ''
    assert marker_entry['marker'] == '!'
def test_page_ref_only_row_included(self):
    """A row carrying nothing but page_ref text must still yield an entry.

    The single input row has an empty english cell and a page_ref cell
    reading 'p.59'; the test expects one entry whose source_page holds
    that text.
    """
    from cv_vocab_pipeline import _cells_to_vocab_entries

    # Fields that are identical for both cells of the fixture row.
    shared = {'row_index': 0, 'ocr_engine': 'tesseract'}
    empty_english = dict(
        shared,
        col_type='column_en',
        text='',
        bbox_pct={'x': 10, 'y': 10, 'w': 30, 'h': 5},
        confidence=0.0,
    )
    page_reference = dict(
        shared,
        col_type='page_ref',
        text='p.59',
        bbox_pct={'x': 5, 'y': 10, 'w': 5, 'h': 5},
        confidence=80.0,
    )
    columns_meta = [{'type': 'column_en'}, {'type': 'page_ref'}]

    entries = _cells_to_vocab_entries(
        [empty_english, page_reference], columns_meta
    )

    assert len(entries) == 1
    assert entries[0]['source_page'] == 'p.59'
# =============================================
# RUN TESTS