fix: three OCR pipeline UX improvements
1. Rename Step 6 label to "Korrektur" (was "OCR-Zeichenkorrektur").
2. Move _fix_character_confusion from pipeline Step 1 into llm_review_entries_streaming so corrections are visible in the UI: character changes (| → I, 1 → I, 8 → B) are now emitted as a batch event and appear in the corrections list — right after the meta event on the spell-checker path, and before the meta event on the LLM path.
3. StepReconstruction: all cells (including empty ones) are now rendered as editable inputs — removed the filter that hid empty cells from the editor.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -387,7 +387,7 @@ export function StepLlmReview({ sessionId, onNext }: StepLlmReviewProps) {
|
||||
<div className="flex items-center justify-between">
|
||||
<div>
|
||||
<h3 className="text-base font-medium text-gray-700 dark:text-gray-300">
|
||||
Schritt 6: OCR-Zeichenkorrektur
|
||||
Schritt 6: Korrektur
|
||||
</h3>
|
||||
<p className="text-xs text-gray-400 mt-0.5">
|
||||
{status === 'ready' && `${vocabEntries.length} Eintraege bereit zur Pruefung`}
|
||||
@@ -405,7 +405,7 @@ export function StepLlmReview({ sessionId, onNext }: StepLlmReviewProps) {
|
||||
{status === 'ready' && (
|
||||
<button onClick={runReview}
|
||||
className="px-5 py-2 bg-teal-600 text-white rounded-lg hover:bg-teal-700 transition-colors text-sm font-medium">
|
||||
Zeichenkorrektur starten
|
||||
Korrektur starten
|
||||
</button>
|
||||
)}
|
||||
{status === 'running' && (
|
||||
|
||||
@@ -35,8 +35,7 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
|
||||
const [undoStack, setUndoStack] = useState<UndoAction[]>([])
|
||||
const [redoStack, setRedoStack] = useState<UndoAction[]>([])
|
||||
|
||||
// All cells including empty ones (for empty field highlighting)
|
||||
const [allCells, setAllCells] = useState<EditableCell[]>([])
|
||||
// (allCells removed — cells now contains all cells including empty ones)
|
||||
|
||||
const containerRef = useRef<HTMLDivElement>(null)
|
||||
const imageRef = useRef<HTMLImageElement>(null)
|
||||
@@ -82,8 +81,7 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
|
||||
colIndex: c.col_index,
|
||||
}))
|
||||
|
||||
setAllCells(allEditableCells)
|
||||
setCells(allEditableCells.filter(c => c.text.trim() !== ''))
|
||||
setCells(allEditableCells)
|
||||
setEditedTexts(new Map())
|
||||
setUndoStack([])
|
||||
setRedoStack([])
|
||||
@@ -191,7 +189,7 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
|
||||
const emptyCellIds = useMemo(() => {
|
||||
const required = new Set(['column_en', 'column_de'])
|
||||
const ids = new Set<string>()
|
||||
for (const cell of allCells) {
|
||||
for (const cell of cells) {
|
||||
if (required.has(cell.colType) && !cell.text.trim()) {
|
||||
ids.add(cell.cellId)
|
||||
}
|
||||
@@ -429,7 +427,7 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
|
||||
/>
|
||||
|
||||
{/* Empty field markers */}
|
||||
{showEmptyHighlight && allCells
|
||||
{showEmptyHighlight && cells
|
||||
.filter(c => emptyCellIds.has(c.cellId))
|
||||
.map(cell => (
|
||||
<div
|
||||
|
||||
@@ -4999,8 +4999,8 @@ def build_word_grid(
|
||||
# 0b. Merge multi-line continuation rows (lowercase EN, empty DE)
|
||||
entries = _merge_continuation_rows(entries)
|
||||
|
||||
# 1. Fix character confusion (I/1/l based on context)
|
||||
entries = _fix_character_confusion(entries)
|
||||
# 1. Character confusion (| → I, 1 → I, 8 → B) is now run in
|
||||
# llm_review_entries_streaming so changes are visible to the user in Step 6.
|
||||
|
||||
# 2. Replace OCR'd phonetics with dictionary IPA
|
||||
entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
|
||||
@@ -5913,14 +5913,50 @@ async def llm_review_entries_streaming(
|
||||
model: str = None,
|
||||
batch_size: int = _REVIEW_BATCH_SIZE,
|
||||
):
|
||||
"""Async generator: yield SSE events. Uses spell-checker or LLM depending on REVIEW_ENGINE."""
|
||||
"""Async generator: yield SSE events. Uses spell-checker or LLM depending on REVIEW_ENGINE.
|
||||
|
||||
Phase 0 (always): Run _fix_character_confusion and emit any changes so they are
|
||||
visible in the UI — this is the only place the fix now runs (removed from Step 1
|
||||
of build_vocab_pipeline_streaming).
|
||||
"""
|
||||
# --- Phase 0: Character confusion fix (| → I, 1 → I, 8 → B, etc.) ---
|
||||
_CONF_FIELDS = ('english', 'german', 'example')
|
||||
originals = [{f: e.get(f, '') for f in _CONF_FIELDS} for e in entries]
|
||||
_fix_character_confusion(entries) # modifies in-place, returns same list
|
||||
char_changes = [
|
||||
{'row_index': i, 'field': f, 'old': originals[i][f], 'new': entries[i].get(f, '')}
|
||||
for i in range(len(entries))
|
||||
for f in _CONF_FIELDS
|
||||
if originals[i][f] != entries[i].get(f, '')
|
||||
]
|
||||
|
||||
if REVIEW_ENGINE == "spell" and _SPELL_AVAILABLE:
|
||||
# Inject char_changes as a batch right after the meta event from the spell checker
|
||||
_meta_sent = False
|
||||
async for event in spell_review_entries_streaming(entries, batch_size):
|
||||
yield event
|
||||
if not _meta_sent and event.get('type') == 'meta' and char_changes:
|
||||
_meta_sent = True
|
||||
yield {
|
||||
'type': 'batch',
|
||||
'changes': char_changes,
|
||||
'entries_reviewed': sorted({c['row_index'] for c in char_changes}),
|
||||
'progress': {'current': 0, 'total': len(entries)},
|
||||
}
|
||||
return
|
||||
|
||||
if REVIEW_ENGINE == "spell" and not _SPELL_AVAILABLE:
|
||||
logger.warning("REVIEW_ENGINE=spell but pyspellchecker not installed, using LLM")
|
||||
|
||||
# LLM path: emit char_changes first (before meta) so they appear in the UI
|
||||
if char_changes:
|
||||
yield {
|
||||
'type': 'batch',
|
||||
'changes': char_changes,
|
||||
'entries_reviewed': sorted({c['row_index'] for c in char_changes}),
|
||||
'progress': {'current': 0, 'total': len(entries)},
|
||||
}
|
||||
|
||||
model = model or OLLAMA_REVIEW_MODEL
|
||||
|
||||
# Separate reviewable from skipped entries
|
||||
|
||||
Reference in New Issue
Block a user