refactor(word-step): make table fully generic and fix marker-only row filter

Frontend: Replace hardcoded EN/DE/Example vocab table with unified dynamic table driven by columns_used from backend. Labeling, confirmation, counts, and summary badges are now all cell-based instead of branching on isVocab. Backend: Change _cells_to_vocab_entries() entry filter from checking only english/german/example to checking ANY mapped field. This preserves rows with only marker or source_page content, fixing the issue where marker sub-columns disappeared at the end of OCR processing. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-03 08:45:24 +01:00
parent dea3349b23
commit 4d428980c1
3 changed files with 231 additions and 308 deletions
@@ -241,34 +241,27 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec
    ))
  }
-  // Step-through: confirm current entry
+  // Step-through: confirm current row (always cell-based)
  const confirmEntry = () => {
    // Confirmation is always cell-based: collect the ids of every cell that
    // belongs to the currently active row.
    const rowCells = getRowCells(activeIndex)
    const cellIds = new Set(rowCells.map(c => c.cell_id))

    // Mark those cells as confirmed, but keep 'edited' so a manual correction
    // is not downgraded to a plain confirmation.
    setEditedCells(prev => prev.map(c =>
      cellIds.has(c.cell_id) ? { ...c, status: c.status === 'edited' ? 'edited' : 'confirmed' } : c
    ))

    // Advance to the next row unless the active row is already the last one.
    const maxIdx = getUniqueRowCount() - 1
    if (activeIndex < maxIdx) {
      setActiveIndex(activeIndex + 1)
    }
  }
-  // Step-through: skip current entry
+  // Step-through: skip current row
  const skipEntry = () => {
-    if (isVocab) {
+    const rowCells = getRowCells(activeIndex)
-      setEditedEntries(prev => prev.map((e, i) =>
+    const cellIds = new Set(rowCells.map(c => c.cell_id))
-        i === activeIndex ? { ...e, status: 'skipped' as const } : e
+    setEditedCells(prev => prev.map(c =>
      cellIds.has(c.cell_id) ? { ...c, status: 'skipped' as const } : c
    ))
-    }
+    const maxIdx = getUniqueRowCount() - 1
    const maxIdx = isVocab ? editedEntries.length - 1 : getUniqueRowCount() - 1
    if (activeIndex < maxIdx) {
      setActiveIndex(activeIndex + 1)
    }
@@ -351,11 +344,12 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec
  const columnsUsed = gridResult?.columns_used || []
  const gridShape = gridResult?.grid_shape
-  // Counts for labeling progress
+  // Counts for labeling progress (always cell-based)
-  const confirmedCount = isVocab
+  const confirmedRowIds = new Set(
-    ? editedEntries.filter(e => e.status === 'confirmed' || e.status === 'edited').length
+    editedCells.filter(c => c.status === 'confirmed' || c.status === 'edited').map(c => c.row_index)
-    : editedCells.filter(c => c.status === 'confirmed' || c.status === 'edited').length
+  )
-  const totalCount = isVocab ? editedEntries.length : getUniqueRowCount()
+  const confirmedCount = confirmedRowIds.size
  const totalCount = getUniqueRowCount()
  // Group cells by row for generic table display
  const cellsByRow: Map<number, GridCell[]> = new Map()
@@ -475,10 +469,8 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec
            <div className="bg-white dark:bg-gray-800 rounded-xl border border-gray-200 dark:border-gray-700 p-4 space-y-3">
              <div className="flex items-center justify-between">
                <h4 className="text-sm font-medium text-gray-700 dark:text-gray-300">
-                  {isVocab
+                  Ergebnis: {summary.non_empty_cells}/{summary.total_cells} Zellen mit Text
-                    ? `Ergebnis: ${summary.total_entries ?? 0} Vokabel-Eintraege erkannt`
+                  ({sortedRowIndices.length} Zeilen, {columnsUsed.length} Spalten)
                    : `Ergebnis: ${summary.non_empty_cells}/${summary.total_cells} Zellen mit Text`
                  }
                </h4>
                <span className="text-xs text-gray-400">
                  {gridResult.duration_seconds}s
@@ -487,17 +479,6 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec
              {/* Summary badges */}
              <div className="flex gap-2 flex-wrap">
                {isVocab ? (
                  <>
                    <span className="px-2 py-0.5 rounded text-xs font-medium bg-blue-100 dark:bg-blue-900/30 text-blue-700 dark:text-blue-300">
                      EN: {summary.with_english ?? 0}
                    </span>
                    <span className="px-2 py-0.5 rounded text-xs font-medium bg-green-100 dark:bg-green-900/30 text-green-700 dark:text-green-300">
                      DE: {summary.with_german ?? 0}
                    </span>
                  </>
                ) : (
                  <>
                <span className="px-2 py-0.5 rounded text-xs font-medium bg-blue-100 dark:bg-blue-900/30 text-blue-700 dark:text-blue-300">
                  Zellen: {summary.non_empty_cells}/{summary.total_cells}
                </span>
@@ -506,8 +487,6 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec
                    C{col.index}: {colTypeLabel(col.type)}
                  </span>
                ))}
                  </>
                )}
                {summary.low_confidence > 0 && (
                  <span className="px-2 py-0.5 rounded text-xs font-medium bg-red-100 dark:bg-red-900/30 text-red-700 dark:text-red-300">
                    Unsicher: {summary.low_confidence}
@@ -517,57 +496,7 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec
              {/* Entry/Cell table */}
              <div className="max-h-80 overflow-y-auto">
-                {isVocab ? (
+                {/* Unified dynamic table — columns driven by columns_used */}
                  /* Vocab table: EN/DE/Example columns + optional page_ref */
                  (() => {
                    const hasPageRef = editedEntries.some(e => e.source_page)
                    return (
                      <table className="w-full text-xs">
                        <thead className="sticky top-0 bg-white dark:bg-gray-800">
                          <tr className="text-left text-gray-500 dark:text-gray-400 border-b dark:border-gray-700">
                            <th className="py-1 pr-2 w-8">#</th>
                            {hasPageRef && <th className="py-1 pr-2 w-12 text-gray-400">Seite</th>}
                            <th className="py-1 pr-2">English</th>
                            <th className="py-1 pr-2">Deutsch</th>
                            <th className="py-1 pr-2">Example</th>
                            <th className="py-1 w-12 text-right">Conf</th>
                          </tr>
                        </thead>
                        <tbody>
                          {editedEntries.map((entry, idx) => (
                            <tr
                              key={idx}
                              className={`border-b dark:border-gray-700/50 ${
                                idx === activeIndex ? 'bg-teal-50 dark:bg-teal-900/20' : ''
                              }`}
                              onClick={() => { setActiveIndex(idx); setMode('labeling') }}
                            >
                              <td className="py-1 pr-2 text-gray-400">{idx + 1}</td>
                              {hasPageRef && (
                                <td className="py-1 pr-2 font-mono text-gray-400 dark:text-gray-500">
                                  {entry.source_page || ''}
                                </td>
                              )}
                              <td className="py-1 pr-2 font-mono text-gray-700 dark:text-gray-300 cursor-pointer">
                                <MultilineText text={entry.english} />
                              </td>
                              <td className="py-1 pr-2 font-mono text-gray-700 dark:text-gray-300 cursor-pointer">
                                <MultilineText text={entry.german} />
                              </td>
                              <td className="py-1 pr-2 font-mono text-gray-500 dark:text-gray-400 cursor-pointer max-w-[200px]">
                                <MultilineText text={entry.example} />
                              </td>
                              <td className={`py-1 text-right font-mono ${confColor(entry.confidence)}`}>
                                {entry.confidence}%
                              </td>
                            </tr>
                          ))}
                        </tbody>
                      </table>
                    )
                  })()
                ) : (
                  /* Generic table: dynamic columns from columns_used */
                <table className="w-full text-xs">
                  <thead className="sticky top-0 bg-white dark:bg-gray-800">
                    <tr className="text-left text-gray-500 dark:text-gray-400 border-b dark:border-gray-700">
@@ -613,7 +542,6 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec
                    })}
                  </tbody>
                </table>
                )}
                <div ref={tableEndRef} />
              </div>
            </div>
@@ -682,15 +610,12 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec
      )}
      {/* Labeling mode */}
-      {mode === 'labeling' && (isVocab ? editedEntries.length > 0 : editedCells.length > 0) && (
+      {mode === 'labeling' && editedCells.length > 0 && (
        <div className="grid grid-cols-3 gap-4">
          {/* Left 2/3: Image with highlighted active row */}
          <div className="col-span-2">
            <div className="text-xs font-medium text-gray-500 dark:text-gray-400 mb-1">
-              {isVocab
+              Zeile {activeIndex + 1} von {getUniqueRowCount()}
                ? `Eintrag ${activeIndex + 1} von ${editedEntries.length}`
                : `Zeile ${activeIndex + 1} von ${getUniqueRowCount()}`
              }
            </div>
            <div className="border rounded-lg overflow-hidden dark:border-gray-700 bg-gray-50 dark:bg-gray-900 relative">
              {/* eslint-disable-next-line @next/next/no-img-element */}
@@ -699,19 +624,8 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec
                alt="Wort-Overlay"
                className="w-full h-auto"
              />
-              {/* Highlight overlay for active row/entry */}
+              {/* Highlight overlay for active row */}
-              {isVocab && editedEntries[activeIndex]?.bbox && (
+              {(() => {
                <div
                  className="absolute border-2 border-yellow-400 bg-yellow-400/10 pointer-events-none"
                  style={{
                    left: `${editedEntries[activeIndex].bbox.x}%`,
                    top: `${editedEntries[activeIndex].bbox.y}%`,
                    width: `${editedEntries[activeIndex].bbox.w}%`,
                    height: `${editedEntries[activeIndex].bbox.h}%`,
                  }}
                />
              )}
              {!isVocab && (() => {
                const rowCells = getRowCells(activeIndex)
                return rowCells.map(cell => (
                  <div
@@ -741,14 +655,14 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec
                Zurueck
              </button>
              <span className="text-xs text-gray-500">
-                {activeIndex + 1} / {isVocab ? editedEntries.length : getUniqueRowCount()}
+                {activeIndex + 1} / {getUniqueRowCount()}
              </span>
              <button
                onClick={() => setActiveIndex(Math.min(
-                  (isVocab ? editedEntries.length : getUniqueRowCount()) - 1,
+                  getUniqueRowCount() - 1,
                  activeIndex + 1
                ))}
-                disabled={activeIndex >= (isVocab ? editedEntries.length : getUniqueRowCount()) - 1}
+                disabled={activeIndex >= getUniqueRowCount() - 1}
                className="px-2 py-1 text-xs border rounded hover:bg-gray-50 dark:hover:bg-gray-700 dark:border-gray-600 disabled:opacity-30"
              >
                Weiter
@@ -757,17 +671,7 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec
            {/* Status badge */}
            <div className="flex items-center gap-2">
-              {isVocab && (
+              {(() => {
                <>
                  <span className={`px-2 py-0.5 rounded text-[10px] uppercase font-semibold ${statusBadge(editedEntries[activeIndex]?.status)}`}>
                    {editedEntries[activeIndex]?.status || 'pending'}
                  </span>
                  <span className={`text-xs font-mono ${confColor(editedEntries[activeIndex]?.confidence || 0)}`}>
                    {editedEntries[activeIndex]?.confidence}% Konfidenz
                  </span>
                </>
              )}
              {!isVocab && (() => {
                const rowCells = getRowCells(activeIndex)
                const avgConf = rowCells.length
                  ? Math.round(rowCells.reduce((s, c) => s + c.confidence, 0) / rowCells.length)
@@ -780,70 +684,11 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec
              })()}
            </div>
-            {/* Cell crops (vocab mode) */}
+            {/* Editable fields — one per column, driven by columns_used */}
            {isVocab && editedEntries[activeIndex]?.bbox_en && (
              <div>
                <div className="text-[10px] font-medium text-blue-500 mb-0.5">EN-Zelle</div>
                <div className="border rounded dark:border-gray-700 overflow-hidden bg-white dark:bg-gray-900 h-10 relative">
                  <CellCrop
                    imageUrl={dewarpedUrl}
                    bbox={editedEntries[activeIndex].bbox_en!}
                  />
                </div>
              </div>
            )}
            {isVocab && editedEntries[activeIndex]?.bbox_de && (
              <div>
                <div className="text-[10px] font-medium text-green-500 mb-0.5">DE-Zelle</div>
                <div className="border rounded dark:border-gray-700 overflow-hidden bg-white dark:bg-gray-900 h-10 relative">
                  <CellCrop
                    imageUrl={dewarpedUrl}
                    bbox={editedEntries[activeIndex].bbox_de!}
                  />
                </div>
              </div>
            )}
            {/* Editable fields */}
            <div className="space-y-2">
              {isVocab ? (
                /* Vocab mode: EN/DE/Example fields */
                <>
                  <div>
                    <label className="text-[10px] font-medium text-gray-500 dark:text-gray-400">English</label>
                    <textarea
                      ref={enRef as any}
                      rows={Math.max(1, (editedEntries[activeIndex]?.english || '').split('\n').length)}
                      value={editedEntries[activeIndex]?.english || ''}
                      onChange={(e) => updateEntry(activeIndex, 'english', e.target.value)}
                      className="w-full px-2 py-1.5 text-sm border rounded dark:bg-gray-700 dark:border-gray-600 font-mono resize-none"
                    />
                  </div>
                  <div>
                    <label className="text-[10px] font-medium text-gray-500 dark:text-gray-400">Deutsch</label>
                    <textarea
                      rows={Math.max(1, (editedEntries[activeIndex]?.german || '').split('\n').length)}
                      value={editedEntries[activeIndex]?.german || ''}
                      onChange={(e) => updateEntry(activeIndex, 'german', e.target.value)}
                      className="w-full px-2 py-1.5 text-sm border rounded dark:bg-gray-700 dark:border-gray-600 font-mono resize-none"
                    />
                  </div>
                  <div>
                    <label className="text-[10px] font-medium text-gray-500 dark:text-gray-400">Example</label>
                    <textarea
                      rows={Math.max(1, (editedEntries[activeIndex]?.example || '').split('\n').length)}
                      value={editedEntries[activeIndex]?.example || ''}
                      onChange={(e) => updateEntry(activeIndex, 'example', e.target.value)}
                      className="w-full px-2 py-1.5 text-sm border rounded dark:bg-gray-700 dark:border-gray-600 font-mono resize-none"
                    />
                  </div>
                </>
              ) : (
                /* Generic mode: one field per column */
                <>
              {(() => {
                const rowCells = getRowCells(activeIndex)
-                    return columnsUsed.map((col) => {
+                return columnsUsed.map((col, colIdx) => {
                  const cell = rowCells.find(c => c.col_index === col.index)
                  if (!cell) return null
                  return (
@@ -859,6 +704,7 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec
                        <CellCrop imageUrl={dewarpedUrl} bbox={cell.bbox_pct} />
                      </div>
                      <textarea
                        ref={colIdx === 0 ? enRef as any : undefined}
                        rows={Math.max(1, (cell.text || '').split('\n').length)}
                        value={cell.text || ''}
                        onChange={(e) => updateCell(cell.cell_id, e.target.value)}
@@ -868,8 +714,6 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec
                  )
                })
              })()}
                </>
              )}
            </div>
            {/* Action buttons */}
@@ -895,39 +739,15 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec
              <div>Ctrl+Up = Zurueck</div>
            </div>
-            {/* Entry/Row list (compact) */}
+            {/* Row list (compact) */}
            <div className="border-t dark:border-gray-700 pt-2 mt-2">
              <div className="text-[10px] font-medium text-gray-500 dark:text-gray-400 mb-1">
-                {isVocab ? 'Alle Eintraege' : 'Alle Zeilen'}
+                Alle Zeilen
              </div>
              <div className="max-h-48 overflow-y-auto space-y-0.5">
-                {isVocab ? (
+                {sortedRowIndices.map((rowIdx, posIdx) => {
                  editedEntries.map((entry, idx) => (
                    <div
                      key={idx}
                      onClick={() => setActiveIndex(idx)}
                      className={`flex items-center gap-1 px-2 py-1 rounded text-[10px] cursor-pointer transition-colors ${
                        idx === activeIndex
                          ? 'bg-teal-50 dark:bg-teal-900/30 border border-teal-200 dark:border-teal-700'
                          : 'hover:bg-gray-50 dark:hover:bg-gray-700/50'
                      }`}
                    >
                      <span className="w-4 text-right text-gray-400">{idx + 1}</span>
                      <span className={`w-2 h-2 rounded-full ${
                        entry.status === 'confirmed' ? 'bg-green-500' :
                        entry.status === 'edited' ? 'bg-blue-500' :
                        entry.status === 'skipped' ? 'bg-orange-400' :
                        'bg-gray-300 dark:bg-gray-600'
                      }`} />
                      <span className="truncate text-gray-600 dark:text-gray-400 font-mono">
                        {(entry.english || '\u2014').replace(/\n/g, ' ')} &rarr; {(entry.german || '\u2014').replace(/\n/g, ' ')}
                      </span>
                    </div>
                  ))
                ) : (
                  sortedRowIndices.map((rowIdx, posIdx) => {
                  const rowCells = cellsByRow.get(rowIdx) || []
-                    const firstText = rowCells.find(c => c.text)?.text || ''
+                  const textParts = rowCells.filter(c => c.text).map(c => c.text.replace(/\n/g, ' '))
                  return (
                    <div
                      key={rowIdx}
@@ -940,12 +760,11 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec
                    >
                      <span className="w-6 text-right text-gray-400 font-mono">R{String(rowIdx).padStart(2, '0')}</span>
                      <span className="truncate text-gray-600 dark:text-gray-400 font-mono">
-                          {firstText.replace(/\n/g, ' ').substring(0, 60) || '\u2014'}
+                        {textParts.join(' \u2192 ') || '\u2014'}
                      </span>
                    </div>
                  )
-                  })
+                })}
                )}
              </div>
            </div>
          </div>
@@ -4281,8 +4281,12 @@ def _cells_to_vocab_entries(
            sum(confidences) / len(confidences), 1
        ) if confidences else 0.0
-        # Only include if at least one vocab field has text
+        # Only include if at least one mapped field has text
-        if entry['english'] or entry['german'] or entry['example']:
+        has_content = any(
            entry.get(f)
            for f in col_type_to_field.values()
        )
        if has_content:
            entries.append(entry)
    return entries
@@ -1465,6 +1465,106 @@ class TestCellsToVocabEntriesPageRef:
        assert entries[0]['source_page'] == ''
        assert entries[0]['bbox_ref'] is None
    def test_marker_only_row_included(self):
        """A row whose only text sits in the marker column is not filtered out.

        Three rows are fed in: one with English text plus a marker, one with a
        marker only, and one with no text at all. Only the fully empty row is
        expected to be dropped.
        """
        from cv_vocab_pipeline import _cells_to_vocab_entries

        def build_cell(row, col_type, text, x, y, w, conf):
            # Every fixture cell shares the same height and OCR engine.
            return {
                'row_index': row,
                'col_type': col_type,
                'text': text,
                'bbox_pct': {'x': x, 'y': y, 'w': w, 'h': 5},
                'confidence': conf,
                'ocr_engine': 'tesseract',
            }

        # (row, english text, marker text, y offset, english conf, marker conf)
        layout = [
            (0, 'hello', '!', 10, 95.0, 80.0),  # english + marker
            (1, '', '!', 20, 0.0, 70.0),        # marker only
            (2, '', '', 30, 0.0, 0.0),          # completely empty
        ]
        cells = []
        for row, en_text, marker_text, y, en_conf, marker_conf in layout:
            cells.append(build_cell(row, 'column_en', en_text, 10, y, 30, en_conf))
            cells.append(build_cell(row, 'column_marker', marker_text, 5, y, 3, marker_conf))
        columns_meta = [{'type': name} for name in ('column_en', 'column_marker')]

        entries = _cells_to_vocab_entries(cells, columns_meta)

        # Rows 0 and 1 carry text in some mapped column; row 2 carries none.
        assert len(entries) == 2
        with_english, marker_only = entries[0], entries[1]
        assert with_english['english'] == 'hello'
        assert with_english['marker'] == '!'
        assert marker_only['english'] == ''
        assert marker_only['marker'] == '!'
    def test_page_ref_only_row_included(self):
        """A row whose only text is a page reference is still returned.

        The single input row has an empty English cell and a ``page_ref`` cell
        containing ``p.59``; the resulting entry must expose that text as
        ``source_page``.
        """
        from cv_vocab_pipeline import _cells_to_vocab_entries

        # (column type, text, bounding box in percent, confidence)
        specs = [
            ('column_en', '', {'x': 10, 'y': 10, 'w': 30, 'h': 5}, 0.0),
            ('page_ref', 'p.59', {'x': 5, 'y': 10, 'w': 5, 'h': 5}, 80.0),
        ]
        cells = [
            {
                'row_index': 0,
                'col_type': col_type,
                'text': text,
                'bbox_pct': bbox,
                'confidence': conf,
                'ocr_engine': 'tesseract',
            }
            for col_type, text, bbox, conf in specs
        ]
        columns_meta = [{'type': col_type} for col_type, _, _, _ in specs]

        entries = _cells_to_vocab_entries(cells, columns_meta)

        assert len(entries) == 1
        only_entry = entries[0]
        assert only_entry['source_page'] == 'p.59'
 # =============================================
 # RUN TESTS