fix: three OCR pipeline UX improvements

1. Rename the Step 6 label to "Korrektur" (was "OCR-Zeichenkorrektur").
2. Move _fix_character_confusion from pipeline Step 1 into llm_review_entries_streaming so the corrections are visible in the UI: character changes (| → I, 1 → I, 8 → B) are now emitted as a batch event and appear in the corrections list. On the spell-checker path the batch follows the meta event; on the LLM path it is emitted before the meta event.
3. StepReconstruction: all cells (including empty ones) are now rendered as editable inputs — the filter that hid empty cells from the editor was removed.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-03 17:31:55 +01:00
parent f3d61a9394
commit d1c8075da2
3 changed files with 45 additions and 11 deletions
@@ -387,7 +387,7 @@ export function StepLlmReview({ sessionId, onNext }: StepLlmReviewProps) {
      <div className="flex items-center justify-between">
        <div>
          <h3 className="text-base font-medium text-gray-700 dark:text-gray-300">
-            Schritt 6: OCR-Zeichenkorrektur
+            Schritt 6: Korrektur
          </h3>
          <p className="text-xs text-gray-400 mt-0.5">
            {status === 'ready' && `${vocabEntries.length} Eintraege bereit zur Pruefung`}
@@ -405,7 +405,7 @@ export function StepLlmReview({ sessionId, onNext }: StepLlmReviewProps) {
          {status === 'ready' && (
            <button onClick={runReview}
              className="px-5 py-2 bg-teal-600 text-white rounded-lg hover:bg-teal-700 transition-colors text-sm font-medium">
-              Zeichenkorrektur starten
+              Korrektur starten
            </button>
          )}
          {status === 'running' && (
@@ -35,8 +35,7 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
  const [undoStack, setUndoStack] = useState<UndoAction[]>([])
  const [redoStack, setRedoStack] = useState<UndoAction[]>([])
-  // All cells including empty ones (for empty field highlighting)
+  // (allCells removed — cells now contains all cells including empty ones)
-  const [allCells, setAllCells] = useState<EditableCell[]>([])
  const containerRef = useRef<HTMLDivElement>(null)
  const imageRef = useRef<HTMLImageElement>(null)
@@ -82,8 +81,7 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
        colIndex: c.col_index,
      }))
-      setAllCells(allEditableCells)
+      setCells(allEditableCells)
-      setCells(allEditableCells.filter(c => c.text.trim() !== ''))
      setEditedTexts(new Map())
      setUndoStack([])
      setRedoStack([])
@@ -191,7 +189,7 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
  const emptyCellIds = useMemo(() => {
    const required = new Set(['column_en', 'column_de'])
    const ids = new Set<string>()
-    for (const cell of allCells) {
+    for (const cell of cells) {
      if (required.has(cell.colType) && !cell.text.trim()) {
        ids.add(cell.cellId)
      }
@@ -429,7 +427,7 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
          />
          {/* Empty field markers */}
-          {showEmptyHighlight && allCells
+          {showEmptyHighlight && cells
            .filter(c => emptyCellIds.has(c.cellId))
            .map(cell => (
              <div
@@ -4999,8 +4999,8 @@ def build_word_grid(
    # 0b. Merge multi-line continuation rows (lowercase EN, empty DE)
    entries = _merge_continuation_rows(entries)
-    # 1. Fix character confusion (I/1/l based on context)
+    # 1. Character confusion (| → I, 1 → I, 8 → B) is now run in
-    entries = _fix_character_confusion(entries)
+    #    llm_review_entries_streaming so changes are visible to the user in Step 6.
    # 2. Replace OCR'd phonetics with dictionary IPA
    entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
@@ -5913,14 +5913,50 @@ async def llm_review_entries_streaming(
    model: str = None,
    batch_size: int = _REVIEW_BATCH_SIZE,
 ):
-    """Async generator: yield SSE events. Uses spell-checker or LLM depending on REVIEW_ENGINE."""
+    """Async generator: yield SSE events. Uses spell-checker or LLM depending on REVIEW_ENGINE.
    Phase 0 (always): Run _fix_character_confusion and emit any changes so they are
    visible in the UI — this is the only place the fix now runs (removed from Step 1
    of build_vocab_pipeline_streaming).
    """
    # --- Phase 0: Character confusion fix (| → I, 1 → I, 8 → B, etc.) ---
    _CONF_FIELDS = ('english', 'german', 'example')
    originals = [{f: e.get(f, '') for f in _CONF_FIELDS} for e in entries]
    _fix_character_confusion(entries)  # modifies in-place, returns same list
    char_changes = [
        {'row_index': i, 'field': f, 'old': originals[i][f], 'new': entries[i].get(f, '')}
        for i in range(len(entries))
        for f in _CONF_FIELDS
        if originals[i][f] != entries[i].get(f, '')
    ]
    if REVIEW_ENGINE == "spell" and _SPELL_AVAILABLE:
        # Inject char_changes as a batch right after the meta event from the spell checker
        _meta_sent = False
        async for event in spell_review_entries_streaming(entries, batch_size):
            yield event
            if not _meta_sent and event.get('type') == 'meta' and char_changes:
                _meta_sent = True
                yield {
                    'type': 'batch',
                    'changes': char_changes,
                    'entries_reviewed': sorted({c['row_index'] for c in char_changes}),
                    'progress': {'current': 0, 'total': len(entries)},
                }
        return
    if REVIEW_ENGINE == "spell" and not _SPELL_AVAILABLE:
        logger.warning("REVIEW_ENGINE=spell but pyspellchecker not installed, using LLM")
    # LLM path: emit char_changes first (before meta) so they appear in the UI
    if char_changes:
        yield {
            'type': 'batch',
            'changes': char_changes,
            'entries_reviewed': sorted({c['row_index'] for c in char_changes}),
            'progress': {'current': 0, 'total': len(entries)},
        }
    model = model or OLLAMA_REVIEW_MODEL
    # Separate reviewable from skipped entries