feat(ocr-pipeline): add SSE streaming for word recognition (Step 5)
Cells now appear one-by-one in the UI as they are OCR'd, with a live progress bar, instead of waiting for the full result. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -62,7 +62,11 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec
|
|||||||
const [usedEngine, setUsedEngine] = useState<string>('')
|
const [usedEngine, setUsedEngine] = useState<string>('')
|
||||||
const [pronunciation, setPronunciation] = useState<'british' | 'american'>('british')
|
const [pronunciation, setPronunciation] = useState<'british' | 'american'>('british')
|
||||||
|
|
||||||
|
// Streaming progress state
|
||||||
|
const [streamProgress, setStreamProgress] = useState<{ current: number; total: number } | null>(null)
|
||||||
|
|
||||||
const enRef = useRef<HTMLInputElement>(null)
|
const enRef = useRef<HTMLInputElement>(null)
|
||||||
|
const tableEndRef = useRef<HTMLDivElement>(null)
|
||||||
|
|
||||||
const isVocab = gridResult?.layout === 'vocab'
|
const isVocab = gridResult?.layout === 'vocab'
|
||||||
|
|
||||||
@@ -110,16 +114,107 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec
|
|||||||
const eng = engine || ocrEngine
|
const eng = engine || ocrEngine
|
||||||
setDetecting(true)
|
setDetecting(true)
|
||||||
setError(null)
|
setError(null)
|
||||||
|
setStreamProgress(null)
|
||||||
|
setEditedCells([])
|
||||||
|
setEditedEntries([])
|
||||||
|
setGridResult(null)
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/words?engine=${eng}&pronunciation=${pronunciation}`, {
|
const res = await fetch(
|
||||||
method: 'POST',
|
`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/words?stream=true&engine=${eng}&pronunciation=${pronunciation}`,
|
||||||
})
|
{ method: 'POST' },
|
||||||
|
)
|
||||||
if (!res.ok) {
|
if (!res.ok) {
|
||||||
const err = await res.json().catch(() => ({ detail: res.statusText }))
|
const err = await res.json().catch(() => ({ detail: res.statusText }))
|
||||||
throw new Error(err.detail || 'Worterkennung fehlgeschlagen')
|
throw new Error(err.detail || 'Worterkennung fehlgeschlagen')
|
||||||
}
|
}
|
||||||
const data = await res.json()
|
|
||||||
applyGridResult(data)
|
const reader = res.body!.getReader()
|
||||||
|
const decoder = new TextDecoder()
|
||||||
|
let buffer = ''
|
||||||
|
let streamLayout: string | null = null
|
||||||
|
let streamColumnsUsed: GridResult['columns_used'] = []
|
||||||
|
let streamGridShape: GridResult['grid_shape'] | null = null
|
||||||
|
let streamCells: GridCell[] = []
|
||||||
|
|
||||||
|
while (true) {
|
||||||
|
const { done, value } = await reader.read()
|
||||||
|
if (done) break
|
||||||
|
buffer += decoder.decode(value, { stream: true })
|
||||||
|
|
||||||
|
// Parse SSE events (separated by \n\n)
|
||||||
|
while (buffer.includes('\n\n')) {
|
||||||
|
const idx = buffer.indexOf('\n\n')
|
||||||
|
const chunk = buffer.slice(0, idx).trim()
|
||||||
|
buffer = buffer.slice(idx + 2)
|
||||||
|
|
||||||
|
if (!chunk.startsWith('data: ')) continue
|
||||||
|
const dataStr = chunk.slice(6) // strip "data: "
|
||||||
|
|
||||||
|
let event: any
|
||||||
|
try {
|
||||||
|
event = JSON.parse(dataStr)
|
||||||
|
} catch {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
if (event.type === 'meta') {
|
||||||
|
streamLayout = event.layout || 'generic'
|
||||||
|
streamGridShape = event.grid_shape || null
|
||||||
|
// Show partial grid result so UI renders structure
|
||||||
|
setGridResult(prev => ({
|
||||||
|
...prev,
|
||||||
|
layout: event.layout || 'generic',
|
||||||
|
grid_shape: event.grid_shape,
|
||||||
|
columns_used: [],
|
||||||
|
cells: [],
|
||||||
|
summary: { total_cells: event.grid_shape?.total_cells || 0, non_empty_cells: 0, low_confidence: 0 },
|
||||||
|
duration_seconds: 0,
|
||||||
|
ocr_engine: '',
|
||||||
|
} as GridResult))
|
||||||
|
}
|
||||||
|
|
||||||
|
if (event.type === 'columns') {
|
||||||
|
streamColumnsUsed = event.columns_used || []
|
||||||
|
setGridResult(prev => prev ? { ...prev, columns_used: streamColumnsUsed } : prev)
|
||||||
|
}
|
||||||
|
|
||||||
|
if (event.type === 'cell') {
|
||||||
|
const cell: GridCell = { ...event.cell, status: 'pending' }
|
||||||
|
streamCells = [...streamCells, cell]
|
||||||
|
setEditedCells(streamCells)
|
||||||
|
setStreamProgress(event.progress)
|
||||||
|
// Auto-scroll table to bottom
|
||||||
|
setTimeout(() => tableEndRef.current?.scrollIntoView({ behavior: 'smooth', block: 'nearest' }), 16)
|
||||||
|
}
|
||||||
|
|
||||||
|
if (event.type === 'complete') {
|
||||||
|
// Build final GridResult
|
||||||
|
const finalResult: GridResult = {
|
||||||
|
cells: streamCells,
|
||||||
|
grid_shape: streamGridShape || { rows: 0, cols: 0, total_cells: streamCells.length },
|
||||||
|
columns_used: streamColumnsUsed,
|
||||||
|
layout: streamLayout || 'generic',
|
||||||
|
image_width: 0,
|
||||||
|
image_height: 0,
|
||||||
|
duration_seconds: event.duration_seconds || 0,
|
||||||
|
ocr_engine: event.ocr_engine || '',
|
||||||
|
summary: event.summary || {},
|
||||||
|
}
|
||||||
|
|
||||||
|
// If vocab: apply post-processed entries from complete event
|
||||||
|
if (event.vocab_entries) {
|
||||||
|
finalResult.entries = event.vocab_entries
|
||||||
|
finalResult.vocab_entries = event.vocab_entries
|
||||||
|
finalResult.entry_count = event.vocab_entries.length
|
||||||
|
}
|
||||||
|
|
||||||
|
applyGridResult(finalResult)
|
||||||
|
setUsedEngine(event.ocr_engine || '')
|
||||||
|
setStreamProgress(null)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
setError(e instanceof Error ? e.message : 'Unbekannter Fehler')
|
setError(e instanceof Error ? e.message : 'Unbekannter Fehler')
|
||||||
} finally {
|
} finally {
|
||||||
@@ -288,11 +383,23 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec
|
|||||||
|
|
||||||
return (
|
return (
|
||||||
<div className="space-y-4">
|
<div className="space-y-4">
|
||||||
{/* Loading */}
|
{/* Loading with streaming progress */}
|
||||||
{detecting && (
|
{detecting && (
|
||||||
|
<div className="space-y-1">
|
||||||
<div className="flex items-center gap-2 text-teal-600 dark:text-teal-400 text-sm">
|
<div className="flex items-center gap-2 text-teal-600 dark:text-teal-400 text-sm">
|
||||||
<div className="animate-spin w-4 h-4 border-2 border-teal-500 border-t-transparent rounded-full" />
|
<div className="animate-spin w-4 h-4 border-2 border-teal-500 border-t-transparent rounded-full" />
|
||||||
Worterkennung laeuft...
|
{streamProgress
|
||||||
|
? `Zelle ${streamProgress.current}/${streamProgress.total} erkannt...`
|
||||||
|
: 'Worterkennung startet...'}
|
||||||
|
</div>
|
||||||
|
{streamProgress && streamProgress.total > 0 && (
|
||||||
|
<div className="w-full bg-gray-200 dark:bg-gray-700 rounded-full h-1.5">
|
||||||
|
<div
|
||||||
|
className="bg-teal-500 h-1.5 rounded-full transition-all duration-150"
|
||||||
|
style={{ width: `${(streamProgress.current / streamProgress.total) * 100}%` }}
|
||||||
|
/>
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
</div>
|
</div>
|
||||||
)}
|
)}
|
||||||
|
|
||||||
@@ -378,8 +485,8 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec
|
|||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
{/* Result summary */}
|
{/* Result summary (only after streaming completes) */}
|
||||||
{gridResult && summary && (
|
{gridResult && summary && !detecting && (
|
||||||
<div className="bg-white dark:bg-gray-800 rounded-xl border border-gray-200 dark:border-gray-700 p-4 space-y-3">
|
<div className="bg-white dark:bg-gray-800 rounded-xl border border-gray-200 dark:border-gray-700 p-4 space-y-3">
|
||||||
<div className="flex items-center justify-between">
|
<div className="flex items-center justify-between">
|
||||||
<h4 className="text-sm font-medium text-gray-700 dark:text-gray-300">
|
<h4 className="text-sm font-medium text-gray-700 dark:text-gray-300">
|
||||||
@@ -511,6 +618,67 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec
|
|||||||
</tbody>
|
</tbody>
|
||||||
</table>
|
</table>
|
||||||
)}
|
)}
|
||||||
|
<div ref={tableEndRef} />
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
|
||||||
|
{/* Streaming cell table (shown while detecting, before complete) */}
|
||||||
|
{detecting && editedCells.length > 0 && !gridResult?.summary?.non_empty_cells && (
|
||||||
|
<div className="bg-white dark:bg-gray-800 rounded-xl border border-gray-200 dark:border-gray-700 p-4 space-y-3">
|
||||||
|
<h4 className="text-sm font-medium text-gray-700 dark:text-gray-300">
|
||||||
|
Live: {editedCells.length} Zellen erkannt...
|
||||||
|
</h4>
|
||||||
|
<div className="max-h-80 overflow-y-auto">
|
||||||
|
<table className="w-full text-xs">
|
||||||
|
<thead className="sticky top-0 bg-white dark:bg-gray-800">
|
||||||
|
<tr className="text-left text-gray-500 dark:text-gray-400 border-b dark:border-gray-700">
|
||||||
|
<th className="py-1 pr-2 w-12">Zelle</th>
|
||||||
|
{columnsUsed.map((col, i) => (
|
||||||
|
<th key={i} className={`py-1 pr-2 ${colTypeColor(col.type)}`}>
|
||||||
|
{colTypeLabel(col.type)}
|
||||||
|
</th>
|
||||||
|
))}
|
||||||
|
<th className="py-1 w-12 text-right">Conf</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
{(() => {
|
||||||
|
const liveByRow: Map<number, GridCell[]> = new Map()
|
||||||
|
for (const cell of editedCells) {
|
||||||
|
const existing = liveByRow.get(cell.row_index) || []
|
||||||
|
existing.push(cell)
|
||||||
|
liveByRow.set(cell.row_index, existing)
|
||||||
|
}
|
||||||
|
const liveSorted = [...liveByRow.keys()].sort((a, b) => a - b)
|
||||||
|
return liveSorted.map(rowIdx => {
|
||||||
|
const rowCells = liveByRow.get(rowIdx) || []
|
||||||
|
const avgConf = rowCells.length
|
||||||
|
? Math.round(rowCells.reduce((s, c) => s + c.confidence, 0) / rowCells.length)
|
||||||
|
: 0
|
||||||
|
return (
|
||||||
|
<tr key={rowIdx} className="border-b dark:border-gray-700/50 animate-fade-in">
|
||||||
|
<td className="py-1 pr-2 text-gray-400 font-mono text-[10px]">
|
||||||
|
R{String(rowIdx).padStart(2, '0')}
|
||||||
|
</td>
|
||||||
|
{columnsUsed.map((col) => {
|
||||||
|
const cell = rowCells.find(c => c.col_index === col.index)
|
||||||
|
return (
|
||||||
|
<td key={col.index} className="py-1 pr-2 font-mono text-gray-700 dark:text-gray-300">
|
||||||
|
<MultilineText text={cell?.text || ''} />
|
||||||
|
</td>
|
||||||
|
)
|
||||||
|
})}
|
||||||
|
<td className={`py-1 text-right font-mono ${confColor(avgConf)}`}>
|
||||||
|
{avgConf}%
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
)
|
||||||
|
})
|
||||||
|
})()}
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
<div ref={tableEndRef} />
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
)}
|
)}
|
||||||
|
|||||||
@@ -19,7 +19,7 @@ import io
|
|||||||
import logging
|
import logging
|
||||||
import time
|
import time
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from typing import List, Dict, Any, Optional, Tuple
|
from typing import Any, Dict, Generator, List, Optional, Tuple
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
@@ -3009,6 +3009,94 @@ def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str
|
|||||||
return _PHONETIC_BRACKET_RE.sub(replacer, text)
|
return _PHONETIC_BRACKET_RE.sub(replacer, text)
|
||||||
|
|
||||||
|
|
||||||
|
def _ocr_single_cell(
|
||||||
|
row_idx: int,
|
||||||
|
col_idx: int,
|
||||||
|
row: RowGeometry,
|
||||||
|
col: PageRegion,
|
||||||
|
ocr_img: np.ndarray,
|
||||||
|
img_bgr: Optional[np.ndarray],
|
||||||
|
img_w: int,
|
||||||
|
img_h: int,
|
||||||
|
use_rapid: bool,
|
||||||
|
engine_name: str,
|
||||||
|
lang: str,
|
||||||
|
lang_map: Dict[str, str],
|
||||||
|
) -> Dict[str, Any]:
|
||||||
|
"""OCR a single cell (column × row intersection) and return its dict."""
|
||||||
|
pad = 8 # pixels
|
||||||
|
cell_x = max(0, col.x - pad)
|
||||||
|
cell_y = max(0, row.y - pad)
|
||||||
|
cell_w = col.width + 2 * pad
|
||||||
|
cell_h = row.height + 2 * pad
|
||||||
|
|
||||||
|
# Clamp to image bounds
|
||||||
|
if cell_x + cell_w > img_w:
|
||||||
|
cell_w = img_w - cell_x
|
||||||
|
if cell_y + cell_h > img_h:
|
||||||
|
cell_h = img_h - cell_y
|
||||||
|
|
||||||
|
if cell_w <= 0 or cell_h <= 0:
|
||||||
|
return {
|
||||||
|
'cell_id': f"R{row_idx:02d}_C{col_idx}",
|
||||||
|
'row_index': row_idx,
|
||||||
|
'col_index': col_idx,
|
||||||
|
'col_type': col.type,
|
||||||
|
'text': '',
|
||||||
|
'confidence': 0.0,
|
||||||
|
'bbox_px': {'x': col.x, 'y': row.y, 'w': col.width, 'h': row.height},
|
||||||
|
'bbox_pct': {
|
||||||
|
'x': round(col.x / img_w * 100, 2),
|
||||||
|
'y': round(row.y / img_h * 100, 2),
|
||||||
|
'w': round(col.width / img_w * 100, 2),
|
||||||
|
'h': round(row.height / img_h * 100, 2),
|
||||||
|
},
|
||||||
|
'ocr_engine': engine_name,
|
||||||
|
}
|
||||||
|
|
||||||
|
cell_region = PageRegion(
|
||||||
|
type=col.type,
|
||||||
|
x=cell_x, y=cell_y,
|
||||||
|
width=cell_w, height=cell_h,
|
||||||
|
)
|
||||||
|
|
||||||
|
# OCR the cell
|
||||||
|
if use_rapid:
|
||||||
|
words = ocr_region_rapid(img_bgr, cell_region)
|
||||||
|
else:
|
||||||
|
cell_lang = lang_map.get(col.type, lang)
|
||||||
|
words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=6)
|
||||||
|
|
||||||
|
# Group into lines, then join in reading order
|
||||||
|
if words:
|
||||||
|
avg_h = sum(w['height'] for w in words) / len(words)
|
||||||
|
y_tol = max(10, int(avg_h * 0.5))
|
||||||
|
else:
|
||||||
|
y_tol = 15
|
||||||
|
text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
|
||||||
|
|
||||||
|
avg_conf = 0.0
|
||||||
|
if words:
|
||||||
|
avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
|
||||||
|
|
||||||
|
return {
|
||||||
|
'cell_id': f"R{row_idx:02d}_C{col_idx}",
|
||||||
|
'row_index': row_idx,
|
||||||
|
'col_index': col_idx,
|
||||||
|
'col_type': col.type,
|
||||||
|
'text': text,
|
||||||
|
'confidence': avg_conf,
|
||||||
|
'bbox_px': {'x': cell_x, 'y': cell_y, 'w': cell_w, 'h': cell_h},
|
||||||
|
'bbox_pct': {
|
||||||
|
'x': round(cell_x / img_w * 100, 2),
|
||||||
|
'y': round(cell_y / img_h * 100, 2),
|
||||||
|
'w': round(cell_w / img_w * 100, 2),
|
||||||
|
'h': round(cell_h / img_h * 100, 2),
|
||||||
|
},
|
||||||
|
'ocr_engine': engine_name,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def build_cell_grid(
|
def build_cell_grid(
|
||||||
ocr_img: np.ndarray,
|
ocr_img: np.ndarray,
|
||||||
column_regions: List[PageRegion],
|
column_regions: List[PageRegion],
|
||||||
@@ -3089,79 +3177,12 @@ def build_cell_grid(
|
|||||||
|
|
||||||
for row_idx, row in enumerate(content_rows):
|
for row_idx, row in enumerate(content_rows):
|
||||||
for col_idx, col in enumerate(relevant_cols):
|
for col_idx, col in enumerate(relevant_cols):
|
||||||
# Compute cell region: column x/width, row y/height
|
cell = _ocr_single_cell(
|
||||||
pad = 8 # pixels
|
row_idx, col_idx, row, col,
|
||||||
cell_x = max(0, col.x - pad)
|
ocr_img, img_bgr, img_w, img_h,
|
||||||
cell_y = max(0, row.y - pad)
|
use_rapid, engine_name, lang, lang_map,
|
||||||
cell_w = col.width + 2 * pad
|
|
||||||
cell_h = row.height + 2 * pad
|
|
||||||
|
|
||||||
# Clamp to image bounds
|
|
||||||
if cell_x + cell_w > img_w:
|
|
||||||
cell_w = img_w - cell_x
|
|
||||||
if cell_y + cell_h > img_h:
|
|
||||||
cell_h = img_h - cell_y
|
|
||||||
|
|
||||||
if cell_w <= 0 or cell_h <= 0:
|
|
||||||
cells.append({
|
|
||||||
'cell_id': f"R{row_idx:02d}_C{col_idx}",
|
|
||||||
'row_index': row_idx,
|
|
||||||
'col_index': col_idx,
|
|
||||||
'col_type': col.type,
|
|
||||||
'text': '',
|
|
||||||
'confidence': 0.0,
|
|
||||||
'bbox_px': {'x': col.x, 'y': row.y, 'w': col.width, 'h': row.height},
|
|
||||||
'bbox_pct': {
|
|
||||||
'x': round(col.x / img_w * 100, 2),
|
|
||||||
'y': round(row.y / img_h * 100, 2),
|
|
||||||
'w': round(col.width / img_w * 100, 2),
|
|
||||||
'h': round(row.height / img_h * 100, 2),
|
|
||||||
},
|
|
||||||
'ocr_engine': engine_name,
|
|
||||||
})
|
|
||||||
continue
|
|
||||||
|
|
||||||
cell_region = PageRegion(
|
|
||||||
type=col.type,
|
|
||||||
x=cell_x, y=cell_y,
|
|
||||||
width=cell_w, height=cell_h,
|
|
||||||
)
|
)
|
||||||
|
cells.append(cell)
|
||||||
# OCR the cell
|
|
||||||
if use_rapid:
|
|
||||||
words = ocr_region_rapid(img_bgr, cell_region)
|
|
||||||
else:
|
|
||||||
cell_lang = lang_map.get(col.type, lang)
|
|
||||||
words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=6)
|
|
||||||
|
|
||||||
# Group into lines, then join in reading order
|
|
||||||
if words:
|
|
||||||
avg_h = sum(w['height'] for w in words) / len(words)
|
|
||||||
y_tol = max(10, int(avg_h * 0.5))
|
|
||||||
else:
|
|
||||||
y_tol = 15
|
|
||||||
text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
|
|
||||||
|
|
||||||
avg_conf = 0.0
|
|
||||||
if words:
|
|
||||||
avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
|
|
||||||
|
|
||||||
cells.append({
|
|
||||||
'cell_id': f"R{row_idx:02d}_C{col_idx}",
|
|
||||||
'row_index': row_idx,
|
|
||||||
'col_index': col_idx,
|
|
||||||
'col_type': col.type,
|
|
||||||
'text': text,
|
|
||||||
'confidence': avg_conf,
|
|
||||||
'bbox_px': {'x': cell_x, 'y': cell_y, 'w': cell_w, 'h': cell_h},
|
|
||||||
'bbox_pct': {
|
|
||||||
'x': round(cell_x / img_w * 100, 2),
|
|
||||||
'y': round(cell_y / img_h * 100, 2),
|
|
||||||
'w': round(cell_w / img_w * 100, 2),
|
|
||||||
'h': round(cell_h / img_h * 100, 2),
|
|
||||||
},
|
|
||||||
'ocr_engine': engine_name,
|
|
||||||
})
|
|
||||||
|
|
||||||
logger.info(f"build_cell_grid: {len(cells)} cells from "
|
logger.info(f"build_cell_grid: {len(cells)} cells from "
|
||||||
f"{len(content_rows)} rows × {len(relevant_cols)} columns, "
|
f"{len(content_rows)} rows × {len(relevant_cols)} columns, "
|
||||||
@@ -3170,6 +3191,72 @@ def build_cell_grid(
|
|||||||
return cells, columns_meta
|
return cells, columns_meta
|
||||||
|
|
||||||
|
|
||||||
|
def build_cell_grid_streaming(
|
||||||
|
ocr_img: np.ndarray,
|
||||||
|
column_regions: List[PageRegion],
|
||||||
|
row_geometries: List[RowGeometry],
|
||||||
|
img_w: int,
|
||||||
|
img_h: int,
|
||||||
|
lang: str = "eng+deu",
|
||||||
|
ocr_engine: str = "auto",
|
||||||
|
img_bgr: Optional[np.ndarray] = None,
|
||||||
|
) -> Generator[Tuple[Dict[str, Any], List[Dict[str, Any]], int], None, None]:
|
||||||
|
"""Like build_cell_grid(), but yields each cell as it is OCR'd.
|
||||||
|
|
||||||
|
Yields:
|
||||||
|
(cell_dict, columns_meta, total_cells) for each cell.
|
||||||
|
"""
|
||||||
|
# Resolve engine choice (same as build_cell_grid)
|
||||||
|
use_rapid = False
|
||||||
|
if ocr_engine == "auto":
|
||||||
|
use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None
|
||||||
|
elif ocr_engine == "rapid":
|
||||||
|
if not RAPIDOCR_AVAILABLE:
|
||||||
|
logger.warning("RapidOCR requested but not available, falling back to Tesseract")
|
||||||
|
else:
|
||||||
|
use_rapid = True
|
||||||
|
|
||||||
|
engine_name = "rapid" if use_rapid else "tesseract"
|
||||||
|
|
||||||
|
content_rows = [r for r in row_geometries if r.row_type == 'content']
|
||||||
|
if not content_rows:
|
||||||
|
return
|
||||||
|
|
||||||
|
_skip_types = {'column_ignore', 'header', 'footer', 'page_ref'}
|
||||||
|
relevant_cols = [c for c in column_regions if c.type not in _skip_types]
|
||||||
|
if not relevant_cols:
|
||||||
|
return
|
||||||
|
|
||||||
|
relevant_cols.sort(key=lambda c: c.x)
|
||||||
|
|
||||||
|
columns_meta = [
|
||||||
|
{
|
||||||
|
'index': col_idx,
|
||||||
|
'type': col.type,
|
||||||
|
'x': col.x,
|
||||||
|
'width': col.width,
|
||||||
|
}
|
||||||
|
for col_idx, col in enumerate(relevant_cols)
|
||||||
|
]
|
||||||
|
|
||||||
|
lang_map = {
|
||||||
|
'column_en': 'eng',
|
||||||
|
'column_de': 'deu',
|
||||||
|
'column_example': 'eng+deu',
|
||||||
|
}
|
||||||
|
|
||||||
|
total_cells = len(content_rows) * len(relevant_cols)
|
||||||
|
|
||||||
|
for row_idx, row in enumerate(content_rows):
|
||||||
|
for col_idx, col in enumerate(relevant_cols):
|
||||||
|
cell = _ocr_single_cell(
|
||||||
|
row_idx, col_idx, row, col,
|
||||||
|
ocr_img, img_bgr, img_w, img_h,
|
||||||
|
use_rapid, engine_name, lang, lang_map,
|
||||||
|
)
|
||||||
|
yield cell, columns_meta, total_cells
|
||||||
|
|
||||||
|
|
||||||
def _cells_to_vocab_entries(
|
def _cells_to_vocab_entries(
|
||||||
cells: List[Dict[str, Any]],
|
cells: List[Dict[str, Any]],
|
||||||
columns_meta: List[Dict[str, Any]],
|
columns_meta: List[Dict[str, Any]],
|
||||||
|
|||||||
@@ -15,6 +15,7 @@ Lizenz: Apache 2.0
|
|||||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
import logging
|
import logging
|
||||||
import time
|
import time
|
||||||
import uuid
|
import uuid
|
||||||
@@ -24,8 +25,8 @@ from typing import Any, Dict, List, Optional
|
|||||||
|
|
||||||
import cv2
|
import cv2
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from fastapi import APIRouter, File, Form, HTTPException, UploadFile
|
from fastapi import APIRouter, File, Form, HTTPException, Request, UploadFile
|
||||||
from fastapi.responses import Response
|
from fastapi.responses import Response, StreamingResponse
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
|
||||||
from cv_vocab_pipeline import (
|
from cv_vocab_pipeline import (
|
||||||
@@ -39,6 +40,7 @@ from cv_vocab_pipeline import (
|
|||||||
analyze_layout,
|
analyze_layout,
|
||||||
analyze_layout_by_words,
|
analyze_layout_by_words,
|
||||||
build_cell_grid,
|
build_cell_grid,
|
||||||
|
build_cell_grid_streaming,
|
||||||
build_word_grid,
|
build_word_grid,
|
||||||
classify_column_types,
|
classify_column_types,
|
||||||
create_layout_image,
|
create_layout_image,
|
||||||
@@ -1023,12 +1025,19 @@ async def get_row_ground_truth(session_id: str):
|
|||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
@router.post("/sessions/{session_id}/words")
|
@router.post("/sessions/{session_id}/words")
|
||||||
async def detect_words(session_id: str, engine: str = "auto", pronunciation: str = "british"):
|
async def detect_words(
|
||||||
|
session_id: str,
|
||||||
|
request: Request,
|
||||||
|
engine: str = "auto",
|
||||||
|
pronunciation: str = "british",
|
||||||
|
stream: bool = False,
|
||||||
|
):
|
||||||
"""Build word grid from columns × rows, OCR each cell.
|
"""Build word grid from columns × rows, OCR each cell.
|
||||||
|
|
||||||
Query params:
|
Query params:
|
||||||
engine: 'auto' (default), 'tesseract', or 'rapid'
|
engine: 'auto' (default), 'tesseract', or 'rapid'
|
||||||
pronunciation: 'british' (default) or 'american' — for IPA dictionary lookup
|
pronunciation: 'british' (default) or 'american' — for IPA dictionary lookup
|
||||||
|
stream: false (default) for JSON response, true for SSE streaming
|
||||||
"""
|
"""
|
||||||
if session_id not in _cache:
|
if session_id not in _cache:
|
||||||
await _load_session_to_cache(session_id)
|
await _load_session_to_cache(session_id)
|
||||||
@@ -1049,12 +1058,6 @@ async def detect_words(session_id: str, engine: str = "auto", pronunciation: str
|
|||||||
if not row_result or not row_result.get("rows"):
|
if not row_result or not row_result.get("rows"):
|
||||||
raise HTTPException(status_code=400, detail="Row detection must be completed first")
|
raise HTTPException(status_code=400, detail="Row detection must be completed first")
|
||||||
|
|
||||||
t0 = time.time()
|
|
||||||
|
|
||||||
# Create binarized OCR image (for Tesseract)
|
|
||||||
ocr_img = create_ocr_image(dewarped_bgr)
|
|
||||||
img_h, img_w = dewarped_bgr.shape[:2]
|
|
||||||
|
|
||||||
# Convert column dicts back to PageRegion objects
|
# Convert column dicts back to PageRegion objects
|
||||||
col_regions = [
|
col_regions = [
|
||||||
PageRegion(
|
PageRegion(
|
||||||
@@ -1081,6 +1084,27 @@ async def detect_words(session_id: str, engine: str = "auto", pronunciation: str
|
|||||||
for r in row_result["rows"]
|
for r in row_result["rows"]
|
||||||
]
|
]
|
||||||
|
|
||||||
|
if stream:
|
||||||
|
return StreamingResponse(
|
||||||
|
_word_stream_generator(
|
||||||
|
session_id, cached, col_regions, row_geoms,
|
||||||
|
dewarped_bgr, engine, pronunciation, request,
|
||||||
|
),
|
||||||
|
media_type="text/event-stream",
|
||||||
|
headers={
|
||||||
|
"Cache-Control": "no-cache",
|
||||||
|
"Connection": "keep-alive",
|
||||||
|
"X-Accel-Buffering": "no",
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
# --- Non-streaming path (unchanged) ---
|
||||||
|
t0 = time.time()
|
||||||
|
|
||||||
|
# Create binarized OCR image (for Tesseract)
|
||||||
|
ocr_img = create_ocr_image(dewarped_bgr)
|
||||||
|
img_h, img_w = dewarped_bgr.shape[:2]
|
||||||
|
|
||||||
# Build generic cell grid
|
# Build generic cell grid
|
||||||
cells, columns_meta = build_cell_grid(
|
cells, columns_meta = build_cell_grid(
|
||||||
ocr_img, col_regions, row_geoms, img_w, img_h,
|
ocr_img, col_regions, row_geoms, img_w, img_h,
|
||||||
@@ -1154,6 +1178,140 @@ async def detect_words(session_id: str, engine: str = "auto", pronunciation: str
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
async def _word_stream_generator(
|
||||||
|
session_id: str,
|
||||||
|
cached: Dict[str, Any],
|
||||||
|
col_regions: List[PageRegion],
|
||||||
|
row_geoms: List[RowGeometry],
|
||||||
|
dewarped_bgr: np.ndarray,
|
||||||
|
engine: str,
|
||||||
|
pronunciation: str,
|
||||||
|
request: Request,
|
||||||
|
):
|
||||||
|
"""SSE generator that yields cell-by-cell OCR progress."""
|
||||||
|
t0 = time.time()
|
||||||
|
|
||||||
|
ocr_img = create_ocr_image(dewarped_bgr)
|
||||||
|
img_h, img_w = dewarped_bgr.shape[:2]
|
||||||
|
|
||||||
|
# Compute grid shape upfront for the meta event
|
||||||
|
n_content_rows = len([r for r in row_geoms if r.row_type == 'content'])
|
||||||
|
_skip_types = {'column_ignore', 'header', 'footer', 'page_ref'}
|
||||||
|
n_cols = len([c for c in col_regions if c.type not in _skip_types])
|
||||||
|
|
||||||
|
# Determine layout
|
||||||
|
col_types = {c.type for c in col_regions if c.type not in _skip_types}
|
||||||
|
is_vocab = bool(col_types & {'column_en', 'column_de'})
|
||||||
|
|
||||||
|
# Start streaming — first event: meta
|
||||||
|
columns_meta = None # will be set from first yield
|
||||||
|
total_cells = n_content_rows * n_cols
|
||||||
|
|
||||||
|
meta_event = {
|
||||||
|
"type": "meta",
|
||||||
|
"grid_shape": {"rows": n_content_rows, "cols": n_cols, "total_cells": total_cells},
|
||||||
|
"layout": "vocab" if is_vocab else "generic",
|
||||||
|
}
|
||||||
|
yield f"data: {json.dumps(meta_event)}\n\n"
|
||||||
|
|
||||||
|
# Stream cells one by one
|
||||||
|
all_cells: List[Dict[str, Any]] = []
|
||||||
|
cell_idx = 0
|
||||||
|
|
||||||
|
for cell, cols_meta, total in build_cell_grid_streaming(
|
||||||
|
ocr_img, col_regions, row_geoms, img_w, img_h,
|
||||||
|
ocr_engine=engine, img_bgr=dewarped_bgr,
|
||||||
|
):
|
||||||
|
if await request.is_disconnected():
|
||||||
|
logger.info(f"SSE: client disconnected during streaming for {session_id}")
|
||||||
|
return
|
||||||
|
|
||||||
|
if columns_meta is None:
|
||||||
|
columns_meta = cols_meta
|
||||||
|
# Send columns_used as part of first cell or update meta
|
||||||
|
meta_update = {
|
||||||
|
"type": "columns",
|
||||||
|
"columns_used": cols_meta,
|
||||||
|
}
|
||||||
|
yield f"data: {json.dumps(meta_update)}\n\n"
|
||||||
|
|
||||||
|
all_cells.append(cell)
|
||||||
|
cell_idx += 1
|
||||||
|
|
||||||
|
cell_event = {
|
||||||
|
"type": "cell",
|
||||||
|
"cell": cell,
|
||||||
|
"progress": {"current": cell_idx, "total": total},
|
||||||
|
}
|
||||||
|
yield f"data: {json.dumps(cell_event)}\n\n"
|
||||||
|
|
||||||
|
# All cells done — build final result
|
||||||
|
duration = time.time() - t0
|
||||||
|
if columns_meta is None:
|
||||||
|
columns_meta = []
|
||||||
|
|
||||||
|
used_engine = all_cells[0].get("ocr_engine", "tesseract") if all_cells else engine
|
||||||
|
|
||||||
|
word_result = {
|
||||||
|
"cells": all_cells,
|
||||||
|
"grid_shape": {
|
||||||
|
"rows": n_content_rows,
|
||||||
|
"cols": n_cols,
|
||||||
|
"total_cells": len(all_cells),
|
||||||
|
},
|
||||||
|
"columns_used": columns_meta,
|
||||||
|
"layout": "vocab" if is_vocab else "generic",
|
||||||
|
"image_width": img_w,
|
||||||
|
"image_height": img_h,
|
||||||
|
"duration_seconds": round(duration, 2),
|
||||||
|
"ocr_engine": used_engine,
|
||||||
|
"summary": {
|
||||||
|
"total_cells": len(all_cells),
|
||||||
|
"non_empty_cells": sum(1 for c in all_cells if c.get("text")),
|
||||||
|
"low_confidence": sum(1 for c in all_cells if 0 < c.get("confidence", 0) < 50),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
# Vocab post-processing
|
||||||
|
vocab_entries = None
|
||||||
|
if is_vocab:
|
||||||
|
entries = _cells_to_vocab_entries(all_cells, columns_meta)
|
||||||
|
entries = _fix_character_confusion(entries)
|
||||||
|
entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
|
||||||
|
entries = _split_comma_entries(entries)
|
||||||
|
entries = _attach_example_sentences(entries)
|
||||||
|
word_result["vocab_entries"] = entries
|
||||||
|
word_result["entries"] = entries
|
||||||
|
word_result["entry_count"] = len(entries)
|
||||||
|
word_result["summary"]["total_entries"] = len(entries)
|
||||||
|
word_result["summary"]["with_english"] = sum(1 for e in entries if e.get("english"))
|
||||||
|
word_result["summary"]["with_german"] = sum(1 for e in entries if e.get("german"))
|
||||||
|
vocab_entries = entries
|
||||||
|
|
||||||
|
# Persist to DB
|
||||||
|
await update_session_db(
|
||||||
|
session_id,
|
||||||
|
word_result=word_result,
|
||||||
|
current_step=5,
|
||||||
|
)
|
||||||
|
cached["word_result"] = word_result
|
||||||
|
|
||||||
|
logger.info(f"OCR Pipeline SSE: words session {session_id}: "
|
||||||
|
f"layout={word_result['layout']}, "
|
||||||
|
f"{len(all_cells)} cells ({duration:.2f}s)")
|
||||||
|
|
||||||
|
# Final complete event
|
||||||
|
complete_event = {
|
||||||
|
"type": "complete",
|
||||||
|
"summary": word_result["summary"],
|
||||||
|
"duration_seconds": round(duration, 2),
|
||||||
|
"ocr_engine": used_engine,
|
||||||
|
}
|
||||||
|
if vocab_entries is not None:
|
||||||
|
complete_event["vocab_entries"] = vocab_entries
|
||||||
|
yield f"data: {json.dumps(complete_event)}\n\n"
|
||||||
|
|
||||||
|
|
||||||
class WordGroundTruthRequest(BaseModel):
|
class WordGroundTruthRequest(BaseModel):
|
||||||
is_correct: bool
|
is_correct: bool
|
||||||
corrected_entries: Optional[List[Dict[str, Any]]] = None
|
corrected_entries: Optional[List[Dict[str, Any]]] = None
|
||||||
|
|||||||
Reference in New Issue
Block a user