feat(ocr-pipeline): add SSE streaming for word recognition (Step 5)

Cells now appear one-by-one in the UI as they are OCR'd, with a live
progress bar, instead of waiting for the full result.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-01 17:54:20 +01:00
parent a666e883da
commit 7f27783008
3 changed files with 506 additions and 93 deletions

View File

@@ -62,7 +62,11 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec
const [usedEngine, setUsedEngine] = useState<string>('')
const [pronunciation, setPronunciation] = useState<'british' | 'american'>('british')
// Streaming progress state
const [streamProgress, setStreamProgress] = useState<{ current: number; total: number } | null>(null)
const enRef = useRef<HTMLInputElement>(null)
const tableEndRef = useRef<HTMLDivElement>(null)
const isVocab = gridResult?.layout === 'vocab'
@@ -110,16 +114,107 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec
const eng = engine || ocrEngine
setDetecting(true)
setError(null)
setStreamProgress(null)
setEditedCells([])
setEditedEntries([])
setGridResult(null)
try {
const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/words?engine=${eng}&pronunciation=${pronunciation}`, {
method: 'POST',
})
const res = await fetch(
`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/words?stream=true&engine=${eng}&pronunciation=${pronunciation}`,
{ method: 'POST' },
)
if (!res.ok) {
const err = await res.json().catch(() => ({ detail: res.statusText }))
throw new Error(err.detail || 'Worterkennung fehlgeschlagen')
}
const data = await res.json()
applyGridResult(data)
// Read the response body incrementally so events can be handled as they arrive.
// NOTE(review): `res.body!` assumes the response always carries a body stream — confirm.
const reader = res.body!.getReader()
const decoder = new TextDecoder()
// Decoded text that has not yet been split into complete events.
let buffer = ''
// Values collected from earlier events; used to assemble the final GridResult on 'complete'.
let streamLayout: string | null = null
let streamColumnsUsed: GridResult['columns_used'] = []
let streamGridShape: GridResult['grid_shape'] | null = null
let streamCells: GridCell[] = []
while (true) {
const { done, value } = await reader.read()
// NOTE(review): when `done` is reported, whatever is still in `buffer` is dropped, and no
// statement after this loop applies a result — if no 'complete' event was received,
// applyGridResult is never called on this path.
if (done) break
// `stream: true` lets the decoder carry a multi-byte character split across two reads.
buffer += decoder.decode(value, { stream: true })
// Parse SSE events (separated by \n\n); an incomplete trailing event stays in `buffer`.
while (buffer.includes('\n\n')) {
const idx = buffer.indexOf('\n\n')
const chunk = buffer.slice(0, idx).trim()
buffer = buffer.slice(idx + 2)
// Only chunks that begin with a "data: " field are handled; anything else is skipped.
if (!chunk.startsWith('data: ')) continue
const dataStr = chunk.slice(6) // strip "data: "
let event: any
try {
event = JSON.parse(dataStr)
} catch {
// Skip a payload that is not valid JSON instead of aborting the whole stream.
continue
}
// 'meta': remember layout and grid shape, and seed an empty grid result.
if (event.type === 'meta') {
streamLayout = event.layout || 'generic'
streamGridShape = event.grid_shape || null
// Show partial grid result so UI renders structure
setGridResult(prev => ({
...prev,
layout: event.layout || 'generic',
grid_shape: event.grid_shape,
columns_used: [],
cells: [],
summary: { total_cells: event.grid_shape?.total_cells || 0, non_empty_cells: 0, low_confidence: 0 },
duration_seconds: 0,
ocr_engine: '',
} as GridResult))
}
// 'columns': record the columns and patch them into the grid result if one already exists.
if (event.type === 'columns') {
streamColumnsUsed = event.columns_used || []
setGridResult(prev => prev ? { ...prev, columns_used: streamColumnsUsed } : prev)
}
// 'cell': append the cell with status 'pending', then publish the new list and the progress.
if (event.type === 'cell') {
const cell: GridCell = { ...event.cell, status: 'pending' }
// A new array is built for every cell instead of pushing onto the existing one.
streamCells = [...streamCells, cell]
setEditedCells(streamCells)
setStreamProgress(event.progress)
// Auto-scroll table to bottom (deferred by 16 ms — presumably so the new row renders first)
setTimeout(() => tableEndRef.current?.scrollIntoView({ behavior: 'smooth', block: 'nearest' }), 16)
}
// 'complete': assemble the final result from the accumulated pieces plus the event's metadata.
if (event.type === 'complete') {
// Build final GridResult
const finalResult: GridResult = {
cells: streamCells,
grid_shape: streamGridShape || { rows: 0, cols: 0, total_cells: streamCells.length },
columns_used: streamColumnsUsed,
layout: streamLayout || 'generic',
// No image dimensions are read from the stream events here; both are set to 0.
image_width: 0,
image_height: 0,
duration_seconds: event.duration_seconds || 0,
ocr_engine: event.ocr_engine || '',
summary: event.summary || {},
}
// If vocab: apply post-processed entries from complete event
if (event.vocab_entries) {
finalResult.entries = event.vocab_entries
finalResult.vocab_entries = event.vocab_entries
finalResult.entry_count = event.vocab_entries.length
}
applyGridResult(finalResult)
setUsedEngine(event.ocr_engine || '')
// Clearing the progress state hides the progress bar.
setStreamProgress(null)
}
}
}
} catch (e) {
setError(e instanceof Error ? e.message : 'Unbekannter Fehler')
} finally {
@@ -288,11 +383,23 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec
return (
<div className="space-y-4">
{/* Loading */}
{/* Loading with streaming progress */}
{detecting && (
<div className="flex items-center gap-2 text-teal-600 dark:text-teal-400 text-sm">
<div className="animate-spin w-4 h-4 border-2 border-teal-500 border-t-transparent rounded-full" />
Worterkennung laeuft...
<div className="space-y-1">
<div className="flex items-center gap-2 text-teal-600 dark:text-teal-400 text-sm">
<div className="animate-spin w-4 h-4 border-2 border-teal-500 border-t-transparent rounded-full" />
{streamProgress
? `Zelle ${streamProgress.current}/${streamProgress.total} erkannt...`
: 'Worterkennung startet...'}
</div>
{streamProgress && streamProgress.total > 0 && (
<div className="w-full bg-gray-200 dark:bg-gray-700 rounded-full h-1.5">
<div
className="bg-teal-500 h-1.5 rounded-full transition-all duration-150"
style={{ width: `${(streamProgress.current / streamProgress.total) * 100}%` }}
/>
</div>
)}
</div>
)}
@@ -378,8 +485,8 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec
</div>
</div>
{/* Result summary */}
{gridResult && summary && (
{/* Result summary (only after streaming completes) */}
{gridResult && summary && !detecting && (
<div className="bg-white dark:bg-gray-800 rounded-xl border border-gray-200 dark:border-gray-700 p-4 space-y-3">
<div className="flex items-center justify-between">
<h4 className="text-sm font-medium text-gray-700 dark:text-gray-300">
@@ -511,6 +618,67 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec
</tbody>
</table>
)}
<div ref={tableEndRef} />
</div>
</div>
)}
{/* Streaming cell table (shown while detecting, before complete) */}
{detecting && editedCells.length > 0 && !gridResult?.summary?.non_empty_cells && (
<div className="bg-white dark:bg-gray-800 rounded-xl border border-gray-200 dark:border-gray-700 p-4 space-y-3">
<h4 className="text-sm font-medium text-gray-700 dark:text-gray-300">
Live: {editedCells.length} Zellen erkannt...
</h4>
<div className="max-h-80 overflow-y-auto">
<table className="w-full text-xs">
<thead className="sticky top-0 bg-white dark:bg-gray-800">
<tr className="text-left text-gray-500 dark:text-gray-400 border-b dark:border-gray-700">
<th className="py-1 pr-2 w-12">Zelle</th>
{columnsUsed.map((col, i) => (
<th key={i} className={`py-1 pr-2 ${colTypeColor(col.type)}`}>
{colTypeLabel(col.type)}
</th>
))}
<th className="py-1 w-12 text-right">Conf</th>
</tr>
</thead>
<tbody>
{(() => {
// Group the streamed cells by row_index so each table row is rendered from its own cells.
const liveByRow: Map<number, GridCell[]> = new Map()
for (const cell of editedCells) {
const existing = liveByRow.get(cell.row_index) || []
existing.push(cell)
liveByRow.set(cell.row_index, existing)
}
// Rows are rendered in ascending row_index order, independent of the order cells arrived in.
const liveSorted = [...liveByRow.keys()].sort((a, b) => a - b)
return liveSorted.map(rowIdx => {
const rowCells = liveByRow.get(rowIdx) || []
// Row confidence: rounded mean over the cells present for this row, or 0 when there are none.
const avgConf = rowCells.length
? Math.round(rowCells.reduce((s, c) => s + c.confidence, 0) / rowCells.length)
: 0
return (
<tr key={rowIdx} className="border-b dark:border-gray-700/50 animate-fade-in">
<td className="py-1 pr-2 text-gray-400 font-mono text-[10px]">
R{String(rowIdx).padStart(2, '0')}
</td>
{columnsUsed.map((col) => {
// Match this column's cell by col_index; a column with no cell yet renders an empty string.
const cell = rowCells.find(c => c.col_index === col.index)
return (
<td key={col.index} className="py-1 pr-2 font-mono text-gray-700 dark:text-gray-300">
<MultilineText text={cell?.text || ''} />
</td>
)
})}
<td className={`py-1 text-right font-mono ${confColor(avgConf)}`}>
{avgConf}%
</td>
</tr>
)
})
})()}
</tbody>
</table>
<div ref={tableEndRef} />
</div>
</div>
)}