From 4d428980c140e786cf34fbae8f6de8cae00dfb8e Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Tue, 3 Mar 2026 08:45:24 +0100 Subject: [PATCH] refactor(word-step): make table fully generic and fix marker-only row filter Frontend: Replace hardcoded EN/DE/Example vocab table with unified dynamic table driven by columns_used from backend. Labeling, confirmation, counts, and summary badges are now all cell-based instead of branching on isVocab. Backend: Change _cells_to_vocab_entries() entry filter from checking only english/german/example to checking ANY mapped field. This preserves rows with only marker or source_page content, fixing the issue where marker sub-columns disappeared at the end of OCR processing. Co-Authored-By: Claude Sonnet 4.6 --- .../ocr-pipeline/StepWordRecognition.tsx | 431 +++++------------- klausur-service/backend/cv_vocab_pipeline.py | 8 +- .../backend/tests/test_cv_vocab_pipeline.py | 100 ++++ 3 files changed, 231 insertions(+), 308 deletions(-) diff --git a/admin-lehrer/components/ocr-pipeline/StepWordRecognition.tsx b/admin-lehrer/components/ocr-pipeline/StepWordRecognition.tsx index 5590201..1d519d4 100644 --- a/admin-lehrer/components/ocr-pipeline/StepWordRecognition.tsx +++ b/admin-lehrer/components/ocr-pipeline/StepWordRecognition.tsx @@ -241,34 +241,27 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec )) } - // Step-through: confirm current entry + // Step-through: confirm current row (always cell-based) const confirmEntry = () => { - if (isVocab) { - setEditedEntries(prev => prev.map((e, i) => - i === activeIndex ? { ...e, status: e.status === 'edited' ? 'edited' : 'confirmed' } : e - )) - } else { - // Generic: confirm all cells in this row - const rowCells = getRowCells(activeIndex) - const cellIds = new Set(rowCells.map(c => c.cell_id)) - setEditedCells(prev => prev.map(c => - cellIds.has(c.cell_id) ? { ...c, status: c.status === 'edited' ? 'edited' : 'confirmed' } : c - )) - } - const maxIdx = isVocab ? editedEntries.length - 1 : getUniqueRowCount() - 1 + const rowCells = getRowCells(activeIndex) + const cellIds = new Set(rowCells.map(c => c.cell_id)) + setEditedCells(prev => prev.map(c => + cellIds.has(c.cell_id) ? { ...c, status: c.status === 'edited' ? 'edited' : 'confirmed' } : c + )) + const maxIdx = getUniqueRowCount() - 1 if (activeIndex < maxIdx) { setActiveIndex(activeIndex + 1) } } - // Step-through: skip current entry + // Step-through: skip current row const skipEntry = () => { - if (isVocab) { - setEditedEntries(prev => prev.map((e, i) => - i === activeIndex ? { ...e, status: 'skipped' as const } : e - )) - } - const maxIdx = isVocab ? editedEntries.length - 1 : getUniqueRowCount() - 1 + const rowCells = getRowCells(activeIndex) + const cellIds = new Set(rowCells.map(c => c.cell_id)) + setEditedCells(prev => prev.map(c => + cellIds.has(c.cell_id) ? { ...c, status: 'skipped' as const } : c + )) + const maxIdx = getUniqueRowCount() - 1 if (activeIndex < maxIdx) { setActiveIndex(activeIndex + 1) } @@ -351,11 +344,12 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec const columnsUsed = gridResult?.columns_used || [] const gridShape = gridResult?.grid_shape - // Counts for labeling progress - const confirmedCount = isVocab - ? editedEntries.filter(e => e.status === 'confirmed' || e.status === 'edited').length - : editedCells.filter(c => c.status === 'confirmed' || c.status === 'edited').length - const totalCount = isVocab ? editedEntries.length : getUniqueRowCount() + // Counts for labeling progress (always cell-based) + const confirmedRowIds = new Set( + editedCells.filter(c => c.status === 'confirmed' || c.status === 'edited').map(c => c.row_index) + ) + const confirmedCount = confirmedRowIds.size + const totalCount = getUniqueRowCount() // Group cells by row for generic table display const cellsByRow: Map = new Map() @@ -475,10 +469,8 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec

- {isVocab - ? `Ergebnis: ${summary.total_entries ?? 0} Vokabel-Eintraege erkannt` - : `Ergebnis: ${summary.non_empty_cells}/${summary.total_cells} Zellen mit Text` - } + Ergebnis: {summary.non_empty_cells}/{summary.total_cells} Zellen mit Text + ({sortedRowIndices.length} Zeilen, {columnsUsed.length} Spalten)

{gridResult.duration_seconds}s @@ -487,27 +479,14 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec {/* Summary badges */}
- {isVocab ? ( - <> - - EN: {summary.with_english ?? 0} - - - DE: {summary.with_german ?? 0} - - - ) : ( - <> - - Zellen: {summary.non_empty_cells}/{summary.total_cells} - - {columnsUsed.map((col, i) => ( - - C{col.index}: {colTypeLabel(col.type)} - - ))} - - )} + + Zellen: {summary.non_empty_cells}/{summary.total_cells} + + {columnsUsed.map((col, i) => ( + + C{col.index}: {colTypeLabel(col.type)} + + ))} {summary.low_confidence > 0 && ( Unsicher: {summary.low_confidence} @@ -517,103 +496,52 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec {/* Entry/Cell table */}
- {isVocab ? ( - /* Vocab table: EN/DE/Example columns + optional page_ref */ - (() => { - const hasPageRef = editedEntries.some(e => e.source_page) - return ( - - - - - {hasPageRef && } - - - - - - - - {editedEntries.map((entry, idx) => ( - { setActiveIndex(idx); setMode('labeling') }} - > - - {hasPageRef && ( - - )} - + + ) + })} + +
#SeiteEnglishDeutschExampleConf
{idx + 1} - {entry.source_page || ''} - - + {/* Unified dynamic table — columns driven by columns_used */} + + + + + {columnsUsed.map((col, i) => ( + + ))} + + + + + {sortedRowIndices.map((rowIdx, posIdx) => { + const rowCells = cellsByRow.get(rowIdx) || [] + const avgConf = rowCells.length + ? Math.round(rowCells.reduce((s, c) => s + c.confidence, 0) / rowCells.length) + : 0 + return ( + { setActiveIndex(posIdx); setMode('labeling') }} + > + + {columnsUsed.map((col) => { + const cell = rowCells.find(c => c.col_index === col.index) + return ( + - - - - - ))} - -
Zeile + {colTypeLabel(col.type)} + Conf
+ R{String(rowIdx).padStart(2, '0')} + + - - - - - {entry.confidence}% -
- ) - })() - ) : ( - /* Generic table: dynamic columns from columns_used */ - - - - - {columnsUsed.map((col, i) => ( - - ))} - - - - - {sortedRowIndices.map((rowIdx, posIdx) => { - const rowCells = cellsByRow.get(rowIdx) || [] - const avgConf = rowCells.length - ? Math.round(rowCells.reduce((s, c) => s + c.confidence, 0) / rowCells.length) - : 0 - return ( - { setActiveIndex(posIdx); setMode('labeling') }} - > - - {columnsUsed.map((col) => { - const cell = rowCells.find(c => c.col_index === col.index) - return ( - - ) - })} - - - ) - })} - -
Zeile - {colTypeLabel(col.type)} - Conf
- R{String(rowIdx).padStart(2, '0')} - - - - {avgConf}% -
- )} + ) + })} +
+ {avgConf}% +
@@ -682,15 +610,12 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec )} {/* Labeling mode */} - {mode === 'labeling' && (isVocab ? editedEntries.length > 0 : editedCells.length > 0) && ( + {mode === 'labeling' && editedCells.length > 0 && (
{/* Left 2/3: Image with highlighted active row */}
- {isVocab - ? `Eintrag ${activeIndex + 1} von ${editedEntries.length}` - : `Zeile ${activeIndex + 1} von ${getUniqueRowCount()}` - } + Zeile {activeIndex + 1} von {getUniqueRowCount()}
{/* eslint-disable-next-line @next/next/no-img-element */} @@ -699,19 +624,8 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec alt="Wort-Overlay" className="w-full h-auto" /> - {/* Highlight overlay for active row/entry */} - {isVocab && editedEntries[activeIndex]?.bbox && ( -
- )} - {!isVocab && (() => { + {/* Highlight overlay for active row */} + {(() => { const rowCells = getRowCells(activeIndex) return rowCells.map(cell => (
- {activeIndex + 1} / {isVocab ? editedEntries.length : getUniqueRowCount()} + {activeIndex + 1} / {getUniqueRowCount()}