refactor(word-step): make table fully generic and fix marker-only row filter
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 24s
CI / test-go-edu-search (push) Successful in 24s
CI / test-python-klausur (push) Failing after 1m43s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 17s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 24s
CI / test-go-edu-search (push) Successful in 24s
CI / test-python-klausur (push) Failing after 1m43s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 17s
Frontend: Replace hardcoded EN/DE/Example vocab table with unified dynamic table driven by columns_used from backend. Labeling, confirmation, counts, and summary badges are now all cell-based instead of branching on isVocab.

Backend: Change _cells_to_vocab_entries() entry filter from checking only english/german/example to checking ANY mapped field. This preserves rows with only marker or source_page content, fixing the issue where marker sub-columns disappeared at the end of OCR processing.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1465,6 +1465,106 @@ class TestCellsToVocabEntriesPageRef:
|
||||
assert entries[0]['source_page'] == ''
|
||||
assert entries[0]['bbox_ref'] is None
|
||||
|
||||
def test_marker_only_row_included(self):
    """A row whose only non-empty cell is the marker must not be filtered out.

    Three rows are fed in, each with a ``column_en`` and a ``column_marker``
    cell: row 0 has both texts, row 1 has marker text only, row 2 has no
    text at all. Only the first two are expected in the result.
    """
    from cv_vocab_pipeline import _cells_to_vocab_entries

    def make_cell(row, col_type, text, x, w, confidence):
        # Rows are stacked vertically: y is 10, 20, 30 for rows 0, 1, 2.
        return {
            'row_index': row,
            'col_type': col_type,
            'text': text,
            'bbox_pct': {'x': x, 'y': 10 * (row + 1), 'w': w, 'h': 5},
            'confidence': confidence,
            'ocr_engine': 'tesseract',
        }

    # (row, english text, english confidence, marker text, marker confidence)
    row_specs = [
        (0, 'hello', 95.0, '!', 80.0),  # english + marker
        (1, '', 0.0, '!', 70.0),        # marker only
        (2, '', 0.0, '', 0.0),          # completely empty
    ]

    cells = []
    for row, en_text, en_conf, marker_text, marker_conf in row_specs:
        cells.append(make_cell(row, 'column_en', en_text, 10, 30, en_conf))
        cells.append(make_cell(row, 'column_marker', marker_text, 5, 3, marker_conf))

    columns_meta = [{'type': col} for col in ('column_en', 'column_marker')]

    entries = _cells_to_vocab_entries(cells, columns_meta)

    # Rows 0 and 1 survive (in order); the fully empty row 2 is dropped.
    observed = [(entry['english'], entry['marker']) for entry in entries]
    assert observed == [('hello', '!'), ('', '!')]
|
||||
|
||||
def test_page_ref_only_row_included(self):
    """A row carrying nothing but a page reference must still yield an entry.

    The single input row has an empty ``column_en`` cell and a ``page_ref``
    cell with text ``p.59``; the resulting entry is expected to expose that
    text under ``source_page``.
    """
    from cv_vocab_pipeline import _cells_to_vocab_entries

    # Fields shared by both cells of the single row.
    shared = {'row_index': 0, 'ocr_engine': 'tesseract'}

    empty_english_cell = dict(
        shared,
        col_type='column_en',
        text='',
        bbox_pct={'x': 10, 'y': 10, 'w': 30, 'h': 5},
        confidence=0.0,
    )
    page_ref_cell = dict(
        shared,
        col_type='page_ref',
        text='p.59',
        bbox_pct={'x': 5, 'y': 10, 'w': 5, 'h': 5},
        confidence=80.0,
    )

    cells = [empty_english_cell, page_ref_cell]
    columns_meta = [{'type': 'column_en'}, {'type': 'page_ref'}]

    entries = _cells_to_vocab_entries(cells, columns_meta)

    assert len(entries) == 1
    only_entry = entries[0]
    assert only_entry['source_page'] == 'p.59'
|
||||
|
||||
|
||||
# =============================================
|
||||
# RUN TESTS
|
||||
|
||||
Reference in New Issue
Block a user