fix: three OCR pipeline UX improvements

1. Rename Step 6 label to "Korrektur" (was "OCR-Zeichenkorrektur")
2. Move _fix_character_confusion from pipeline Step 1 (build_word_grid)
   into llm_review_entries_streaming so corrections are visible in the UI:
   char changes (| → I, 1 → I, 8 → B) are now emitted as a batch event
   that appears in the corrections list — right after the meta event on
   the spell-checker path, and before the meta event on the LLM path
3. StepReconstruction: all cells (including empty) are now rendered as
   editable inputs — removed filter that hid empty cells from the editor

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-03 17:31:55 +01:00
parent f3d61a9394
commit d1c8075da2
3 changed files with 45 additions and 11 deletions

View File

@@ -387,7 +387,7 @@ export function StepLlmReview({ sessionId, onNext }: StepLlmReviewProps) {
<div className="flex items-center justify-between">
<div>
<h3 className="text-base font-medium text-gray-700 dark:text-gray-300">
Schritt 6: OCR-Zeichenkorrektur
Schritt 6: Korrektur
</h3>
<p className="text-xs text-gray-400 mt-0.5">
{status === 'ready' && `${vocabEntries.length} Eintraege bereit zur Pruefung`}
@@ -405,7 +405,7 @@ export function StepLlmReview({ sessionId, onNext }: StepLlmReviewProps) {
{status === 'ready' && (
<button onClick={runReview}
className="px-5 py-2 bg-teal-600 text-white rounded-lg hover:bg-teal-700 transition-colors text-sm font-medium">
Zeichenkorrektur starten
Korrektur starten
</button>
)}
{status === 'running' && (

View File

@@ -35,8 +35,7 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
const [undoStack, setUndoStack] = useState<UndoAction[]>([])
const [redoStack, setRedoStack] = useState<UndoAction[]>([])
// All cells including empty ones (for empty field highlighting)
const [allCells, setAllCells] = useState<EditableCell[]>([])
// `cells` holds every cell, including empty ones (there is no separate allCells state)
const containerRef = useRef<HTMLDivElement>(null)
const imageRef = useRef<HTMLImageElement>(null)
@@ -82,8 +81,7 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
colIndex: c.col_index,
}))
setAllCells(allEditableCells)
setCells(allEditableCells.filter(c => c.text.trim() !== ''))
setCells(allEditableCells)
setEditedTexts(new Map())
setUndoStack([])
setRedoStack([])
@@ -191,7 +189,7 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
const emptyCellIds = useMemo(() => {
const required = new Set(['column_en', 'column_de'])
const ids = new Set<string>()
for (const cell of allCells) {
for (const cell of cells) {
if (required.has(cell.colType) && !cell.text.trim()) {
ids.add(cell.cellId)
}
@@ -429,7 +427,7 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
/>
{/* Empty field markers */}
{showEmptyHighlight && allCells
{showEmptyHighlight && cells
.filter(c => emptyCellIds.has(c.cellId))
.map(cell => (
<div

View File

@@ -4999,8 +4999,8 @@ def build_word_grid(
# 0b. Merge multi-line continuation rows (lowercase EN, empty DE)
entries = _merge_continuation_rows(entries)
# 1. Fix character confusion (I/1/l based on context)
entries = _fix_character_confusion(entries)
# 1. Character confusion (| → I, 1 → I, 8 → B) is now run in
# llm_review_entries_streaming so changes are visible to the user in Step 6.
# 2. Replace OCR'd phonetics with dictionary IPA
entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
@@ -5913,14 +5913,50 @@ async def llm_review_entries_streaming(
model: str = None,
batch_size: int = _REVIEW_BATCH_SIZE,
):
"""Async generator: yield SSE events. Uses spell-checker or LLM depending on REVIEW_ENGINE."""
"""Async generator: yield SSE events. Uses spell-checker or LLM depending on REVIEW_ENGINE.
Phase 0 (always): Run _fix_character_confusion and emit any changes so they are
visible in the UI — this is the only place the fix now runs (the call was removed
from Step 1 of build_word_grid).
"""
# --- Phase 0: Character confusion fix (| → I, 1 → I, 8 → B, etc.) ---
_CONF_FIELDS = ('english', 'german', 'example')
originals = [{f: e.get(f, '') for f in _CONF_FIELDS} for e in entries]
_fix_character_confusion(entries) # modifies in-place, returns same list
char_changes = [
{'row_index': i, 'field': f, 'old': originals[i][f], 'new': entries[i].get(f, '')}
for i in range(len(entries))
for f in _CONF_FIELDS
if originals[i][f] != entries[i].get(f, '')
]
if REVIEW_ENGINE == "spell" and _SPELL_AVAILABLE:
# Inject char_changes as a batch right after the meta event from the spell checker
_meta_sent = False
async for event in spell_review_entries_streaming(entries, batch_size):
yield event
if not _meta_sent and event.get('type') == 'meta' and char_changes:
_meta_sent = True
yield {
'type': 'batch',
'changes': char_changes,
'entries_reviewed': sorted({c['row_index'] for c in char_changes}),
'progress': {'current': 0, 'total': len(entries)},
}
return
if REVIEW_ENGINE == "spell" and not _SPELL_AVAILABLE:
logger.warning("REVIEW_ENGINE=spell but pyspellchecker not installed, using LLM")
# LLM path: emit char_changes first (before meta) so they appear in the UI
if char_changes:
yield {
'type': 'batch',
'changes': char_changes,
'entries_reviewed': sorted({c['row_index'] for c in char_changes}),
'progress': {'current': 0, 'total': len(entries)},
}
model = model or OLLAMA_REVIEW_MODEL
# Separate reviewable from skipped entries