fix: three OCR pipeline UX improvements

1. Rename Step 6 label to "Korrektur" (was "OCR-Zeichenkorrektur") and
   its start button to "Korrektur starten" (was "Zeichenkorrektur starten")
2. Move _fix_character_confusion from pipeline Step 1 into
   llm_review_entries_streaming so corrections are visible in the UI:
   char changes (| → I, 1 → I, 8 → B) are now emitted as a batch event
   (right after the meta event on the spell-checker path, before the
   meta event on the LLM path), appearing in the corrections list
3. StepReconstruction: all cells (including empty) are now rendered as
   editable inputs — removed filter that hid empty cells from the editor

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-03 17:31:55 +01:00
parent f3d61a9394
commit d1c8075da2
3 changed files with 45 additions and 11 deletions

View File

@@ -387,7 +387,7 @@ export function StepLlmReview({ sessionId, onNext }: StepLlmReviewProps) {
<div className="flex items-center justify-between"> <div className="flex items-center justify-between">
<div> <div>
<h3 className="text-base font-medium text-gray-700 dark:text-gray-300"> <h3 className="text-base font-medium text-gray-700 dark:text-gray-300">
Schritt 6: OCR-Zeichenkorrektur Schritt 6: Korrektur
</h3> </h3>
<p className="text-xs text-gray-400 mt-0.5"> <p className="text-xs text-gray-400 mt-0.5">
{status === 'ready' && `${vocabEntries.length} Eintraege bereit zur Pruefung`} {status === 'ready' && `${vocabEntries.length} Eintraege bereit zur Pruefung`}
@@ -405,7 +405,7 @@ export function StepLlmReview({ sessionId, onNext }: StepLlmReviewProps) {
{status === 'ready' && ( {status === 'ready' && (
<button onClick={runReview} <button onClick={runReview}
className="px-5 py-2 bg-teal-600 text-white rounded-lg hover:bg-teal-700 transition-colors text-sm font-medium"> className="px-5 py-2 bg-teal-600 text-white rounded-lg hover:bg-teal-700 transition-colors text-sm font-medium">
Zeichenkorrektur starten Korrektur starten
</button> </button>
)} )}
{status === 'running' && ( {status === 'running' && (

View File

@@ -35,8 +35,7 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
const [undoStack, setUndoStack] = useState<UndoAction[]>([]) const [undoStack, setUndoStack] = useState<UndoAction[]>([])
const [redoStack, setRedoStack] = useState<UndoAction[]>([]) const [redoStack, setRedoStack] = useState<UndoAction[]>([])
// All cells including empty ones (for empty field highlighting) // (allCells removed — cells now contains all cells including empty ones)
const [allCells, setAllCells] = useState<EditableCell[]>([])
const containerRef = useRef<HTMLDivElement>(null) const containerRef = useRef<HTMLDivElement>(null)
const imageRef = useRef<HTMLImageElement>(null) const imageRef = useRef<HTMLImageElement>(null)
@@ -82,8 +81,7 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
colIndex: c.col_index, colIndex: c.col_index,
})) }))
setAllCells(allEditableCells) setCells(allEditableCells)
setCells(allEditableCells.filter(c => c.text.trim() !== ''))
setEditedTexts(new Map()) setEditedTexts(new Map())
setUndoStack([]) setUndoStack([])
setRedoStack([]) setRedoStack([])
@@ -191,7 +189,7 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
const emptyCellIds = useMemo(() => { const emptyCellIds = useMemo(() => {
const required = new Set(['column_en', 'column_de']) const required = new Set(['column_en', 'column_de'])
const ids = new Set<string>() const ids = new Set<string>()
for (const cell of allCells) { for (const cell of cells) {
if (required.has(cell.colType) && !cell.text.trim()) { if (required.has(cell.colType) && !cell.text.trim()) {
ids.add(cell.cellId) ids.add(cell.cellId)
} }
@@ -429,7 +427,7 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
/> />
{/* Empty field markers */} {/* Empty field markers */}
{showEmptyHighlight && allCells {showEmptyHighlight && cells
.filter(c => emptyCellIds.has(c.cellId)) .filter(c => emptyCellIds.has(c.cellId))
.map(cell => ( .map(cell => (
<div <div

View File

@@ -4999,8 +4999,8 @@ def build_word_grid(
# 0b. Merge multi-line continuation rows (lowercase EN, empty DE) # 0b. Merge multi-line continuation rows (lowercase EN, empty DE)
entries = _merge_continuation_rows(entries) entries = _merge_continuation_rows(entries)
# 1. Fix character confusion (I/1/l based on context) # 1. Character confusion (| → I, 1 → I, 8 → B) is now run in
entries = _fix_character_confusion(entries) # llm_review_entries_streaming so changes are visible to the user in Step 6.
# 2. Replace OCR'd phonetics with dictionary IPA # 2. Replace OCR'd phonetics with dictionary IPA
entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation) entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
@@ -5913,14 +5913,50 @@ async def llm_review_entries_streaming(
model: str = None, model: str = None,
batch_size: int = _REVIEW_BATCH_SIZE, batch_size: int = _REVIEW_BATCH_SIZE,
): ):
"""Async generator: yield SSE events. Uses spell-checker or LLM depending on REVIEW_ENGINE.""" """Async generator: yield SSE events. Uses spell-checker or LLM depending on REVIEW_ENGINE.
Phase 0 (always): Run _fix_character_confusion and emit any changes so they are
visible in the UI — this is the only place the fix now runs (removed from Step 1
of build_vocab_pipeline_streaming).
"""
# --- Phase 0: Character confusion fix (| → I, 1 → I, 8 → B, etc.) ---
_CONF_FIELDS = ('english', 'german', 'example')
originals = [{f: e.get(f, '') for f in _CONF_FIELDS} for e in entries]
_fix_character_confusion(entries) # modifies in-place, returns same list
char_changes = [
{'row_index': i, 'field': f, 'old': originals[i][f], 'new': entries[i].get(f, '')}
for i in range(len(entries))
for f in _CONF_FIELDS
if originals[i][f] != entries[i].get(f, '')
]
if REVIEW_ENGINE == "spell" and _SPELL_AVAILABLE: if REVIEW_ENGINE == "spell" and _SPELL_AVAILABLE:
# Inject char_changes as a batch right after the meta event from the spell checker
_meta_sent = False
async for event in spell_review_entries_streaming(entries, batch_size): async for event in spell_review_entries_streaming(entries, batch_size):
yield event yield event
if not _meta_sent and event.get('type') == 'meta' and char_changes:
_meta_sent = True
yield {
'type': 'batch',
'changes': char_changes,
'entries_reviewed': sorted({c['row_index'] for c in char_changes}),
'progress': {'current': 0, 'total': len(entries)},
}
return return
if REVIEW_ENGINE == "spell" and not _SPELL_AVAILABLE: if REVIEW_ENGINE == "spell" and not _SPELL_AVAILABLE:
logger.warning("REVIEW_ENGINE=spell but pyspellchecker not installed, using LLM") logger.warning("REVIEW_ENGINE=spell but pyspellchecker not installed, using LLM")
# LLM path: emit char_changes first (before meta) so they appear in the UI
if char_changes:
yield {
'type': 'batch',
'changes': char_changes,
'entries_reviewed': sorted({c['row_index'] for c in char_changes}),
'progress': {'current': 0, 'total': len(entries)},
}
model = model or OLLAMA_REVIEW_MODEL model = model or OLLAMA_REVIEW_MODEL
# Separate reviewable from skipped entries # Separate reviewable from skipped entries