fix: three OCR pipeline UX improvements

1. Rename Step 6 label to "Korrektur" (was "OCR-Zeichenkorrektur").
2. Move _fix_character_confusion from pipeline Step 1 into
   llm_review_entries_streaming so corrections are visible in the UI:
   char changes (| → I, 1 → I, 8 → B) are now emitted as a batch event
   (right after the meta event on the spell-checker path, before it on
   the LLM path), appearing in the corrections list.
3. StepReconstruction: all cells (including empty) are now rendered as
   editable inputs — removed the filter that hid empty cells from the editor.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-03 17:31:55 +01:00
parent f3d61a9394
commit d1c8075da2
3 changed files with 45 additions and 11 deletions
@@ -387,7 +387,7 @@ export function StepLlmReview({ sessionId, onNext }: StepLlmReviewProps) {
      <div className="flex items-center justify-between">
        <div>
          <h3 className="text-base font-medium text-gray-700 dark:text-gray-300">
-            Schritt 6: OCR-Zeichenkorrektur
+            Schritt 6: Korrektur
          </h3>
          <p className="text-xs text-gray-400 mt-0.5">
            {status === 'ready' && `${vocabEntries.length} Eintraege bereit zur Pruefung`}
@@ -405,7 +405,7 @@ export function StepLlmReview({ sessionId, onNext }: StepLlmReviewProps) {
          {status === 'ready' && (
            <button onClick={runReview}
              className="px-5 py-2 bg-teal-600 text-white rounded-lg hover:bg-teal-700 transition-colors text-sm font-medium">
-              Zeichenkorrektur starten
+              Korrektur starten
            </button>
          )}
          {status === 'running' && (
@@ -35,8 +35,7 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
  const [undoStack, setUndoStack] = useState<UndoAction[]>([])
  const [redoStack, setRedoStack] = useState<UndoAction[]>([])

-  // All cells including empty ones (for empty field highlighting)
-  const [allCells, setAllCells] = useState<EditableCell[]>([])
+  // (allCells removed — cells now contains all cells including empty ones)

  const containerRef = useRef<HTMLDivElement>(null)
  const imageRef = useRef<HTMLImageElement>(null)
@@ -82,8 +81,7 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
        colIndex: c.col_index,
      }))

-      setAllCells(allEditableCells)
-      setCells(allEditableCells.filter(c => c.text.trim() !== ''))
+      setCells(allEditableCells)
      setEditedTexts(new Map())
      setUndoStack([])
      setRedoStack([])
@@ -191,7 +189,7 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
  const emptyCellIds = useMemo(() => {
    const required = new Set(['column_en', 'column_de'])
    const ids = new Set<string>()
-    for (const cell of allCells) {
+    for (const cell of cells) {
      if (required.has(cell.colType) && !cell.text.trim()) {
        ids.add(cell.cellId)
      }
@@ -429,7 +427,7 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
          />

          {/* Empty field markers */}
-          {showEmptyHighlight && allCells
+          {showEmptyHighlight && cells
            .filter(c => emptyCellIds.has(c.cellId))
            .map(cell => (
              <div
@@ -4999,8 +4999,8 @@ def build_word_grid(
    # 0b. Merge multi-line continuation rows (lowercase EN, empty DE)
    entries = _merge_continuation_rows(entries)

-    # 1. Fix character confusion (I/1/l based on context)
-    entries = _fix_character_confusion(entries)
+    # 1. Character confusion (| → I, 1 → I, 8 → B) is now run in
+    #    llm_review_entries_streaming so changes are visible to the user in Step 6.

    # 2. Replace OCR'd phonetics with dictionary IPA
    entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
@@ -5913,14 +5913,50 @@ async def llm_review_entries_streaming(
    model: str = None,
    batch_size: int = _REVIEW_BATCH_SIZE,
 ):
-    """Async generator: yield SSE events. Uses spell-checker or LLM depending on REVIEW_ENGINE."""
+    """Async generator: yield SSE events. Uses spell-checker or LLM depending on REVIEW_ENGINE.
+
+    Phase 0 (always): Run _fix_character_confusion and emit any changes so they are
+    visible in the UI — this is the only place the fix now runs (it was removed from
+    step 1 of build_word_grid).
+    """
+    # --- Phase 0: Character confusion fix (| → I, 1 → I, 8 → B, etc.) ---
+    _CONF_FIELDS = ('english', 'german', 'example')
+    originals = [{f: e.get(f, '') for f in _CONF_FIELDS} for e in entries]
+    _fix_character_confusion(entries)  # modifies in-place, returns same list
+    char_changes = [
+        {'row_index': i, 'field': f, 'old': originals[i][f], 'new': entries[i].get(f, '')}
+        for i in range(len(entries))
+        for f in _CONF_FIELDS
+        if originals[i][f] != entries[i].get(f, '')
+    ]
+
    if REVIEW_ENGINE == "spell" and _SPELL_AVAILABLE:
+        # Inject char_changes as a batch right after the meta event from the spell checker
+        _meta_sent = False
        async for event in spell_review_entries_streaming(entries, batch_size):
            yield event
+            if not _meta_sent and event.get('type') == 'meta' and char_changes:
+                _meta_sent = True
+                yield {
+                    'type': 'batch',
+                    'changes': char_changes,
+                    'entries_reviewed': sorted({c['row_index'] for c in char_changes}),
+                    'progress': {'current': 0, 'total': len(entries)},
+                }
        return
+
    if REVIEW_ENGINE == "spell" and not _SPELL_AVAILABLE:
        logger.warning("REVIEW_ENGINE=spell but pyspellchecker not installed, using LLM")

+    # LLM path: emit char_changes first (before meta) so they appear in the UI
+    if char_changes:
+        yield {
+            'type': 'batch',
+            'changes': char_changes,
+            'entries_reviewed': sorted({c['row_index'] for c in char_changes}),
+            'progress': {'current': 0, 'total': len(entries)},
+        }
+
    model = model or OLLAMA_REVIEW_MODEL

    # Separate reviewable from skipped entries