feat(ocr-pipeline): 6 systematic improvements for robustness, performance & UX
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 37s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m57s
CI / test-python-agent-core (push) Successful in 19s
CI / test-nodejs-website (push) Successful in 21s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 37s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m57s
CI / test-python-agent-core (push) Successful in 19s
CI / test-nodejs-website (push) Successful in 21s
1. Unit tests: 76 new parametrized tests for noise filter, phonetic detection,
cell text cleaning, and row merging (116 total, all green)
2. Continuation-row merge: detect multi-line vocab entries where text wraps
(lowercase EN + empty DE) and merge into previous entry
3. Empty DE fallback: secondary PSM=7 OCR pass for cells missed by PSM=6
4. Batch-OCR: collect empty cells per column, run single Tesseract call on
column strip instead of per-cell (~66% fewer calls for 3+ empty cells)
5. StepReconstruction UI: font scaling via naturalHeight, empty EN/DE field
highlighting, undo/redo (Ctrl+Z), per-cell reset button
6. Session reprocess: POST /sessions/{id}/reprocess endpoint to re-run from
any step, with reprocess button on completed pipeline steps
Also fixes pre-existing dewarp_image tuple unpacking bug in run_cv_pipeline
and updates dewarp tests to match current (image, info) return signature.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -160,6 +160,29 @@ export default function OcrPipelinePage() {
|
|||||||
8: 'Validierung',
|
8: 'Validierung',
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Asks the backend to re-run the pipeline starting at the given UI step and,
// on success, moves the UI back to that step. Does nothing without a session
// or when the user declines the confirmation dialog. Failures are logged to
// the console only.
const reprocessFromStep = useCallback(async (uiStep: number) => {
  if (!sessionId) return

  // UI is 0-indexed, DB is 1-indexed
  const dbStep = uiStep + 1
  const stepLabel = stepNames[dbStep] || '?'
  const confirmed = confirm(`Ab Schritt ${dbStep} (${stepLabel}) neu verarbeiten? Nachfolgende Daten werden geloescht.`)
  if (!confirmed) return

  try {
    const endpoint = `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/reprocess`
    const response = await fetch(endpoint, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ from_step: dbStep }),
    })

    if (response.ok) {
      // Reset UI steps
      goToStep(uiStep)
      return
    }

    // Error body may not be JSON; fall back to an empty object so we can
    // still report the HTTP status.
    const payload = await response.json().catch(() => ({}))
    console.error('Reprocess failed:', payload.detail || response.status)
  } catch (err) {
    console.error('Reprocess error:', err)
  }
  // eslint-disable-next-line react-hooks/exhaustive-deps
}, [sessionId, goToStep])
|
||||||
|
|
||||||
const renderStep = () => {
|
const renderStep = () => {
|
||||||
switch (currentStep) {
|
switch (currentStep) {
|
||||||
case 0:
|
case 0:
|
||||||
@@ -291,7 +314,7 @@ export default function OcrPipelinePage() {
|
|||||||
</div>
|
</div>
|
||||||
)}
|
)}
|
||||||
|
|
||||||
<PipelineStepper steps={steps} currentStep={currentStep} onStepClick={handleStepClick} />
|
<PipelineStepper steps={steps} currentStep={currentStep} onStepClick={handleStepClick} onReprocess={sessionId ? reprocessFromStep : undefined} />
|
||||||
|
|
||||||
<div className="min-h-[400px]">{renderStep()}</div>
|
<div className="min-h-[400px]">{renderStep()}</div>
|
||||||
</div>
|
</div>
|
||||||
|
|||||||
@@ -6,9 +6,10 @@ interface PipelineStepperProps {
|
|||||||
steps: PipelineStep[]
|
steps: PipelineStep[]
|
||||||
currentStep: number
|
currentStep: number
|
||||||
onStepClick: (index: number) => void
|
onStepClick: (index: number) => void
|
||||||
|
onReprocess?: (index: number) => void
|
||||||
}
|
}
|
||||||
|
|
||||||
export function PipelineStepper({ steps, currentStep, onStepClick }: PipelineStepperProps) {
|
export function PipelineStepper({ steps, currentStep, onStepClick, onReprocess }: PipelineStepperProps) {
|
||||||
return (
|
return (
|
||||||
<div className="flex items-center justify-between px-4 py-3 bg-white dark:bg-gray-800 rounded-lg border border-gray-200 dark:border-gray-700">
|
<div className="flex items-center justify-between px-4 py-3 bg-white dark:bg-gray-800 rounded-lg border border-gray-200 dark:border-gray-700">
|
||||||
{steps.map((step, index) => {
|
{steps.map((step, index) => {
|
||||||
@@ -26,25 +27,37 @@ export function PipelineStepper({ steps, currentStep, onStepClick }: PipelineSte
|
|||||||
}`}
|
}`}
|
||||||
/>
|
/>
|
||||||
)}
|
)}
|
||||||
<button
|
<div className="relative group">
|
||||||
onClick={() => isClickable && onStepClick(index)}
|
<button
|
||||||
disabled={!isClickable}
|
onClick={() => isClickable && onStepClick(index)}
|
||||||
className={`flex items-center gap-1.5 px-3 py-1.5 rounded-full text-sm font-medium transition-all ${
|
disabled={!isClickable}
|
||||||
isActive
|
className={`flex items-center gap-1.5 px-3 py-1.5 rounded-full text-sm font-medium transition-all ${
|
||||||
? 'bg-teal-100 text-teal-700 dark:bg-teal-900/40 dark:text-teal-300 ring-2 ring-teal-400'
|
isActive
|
||||||
: isCompleted
|
? 'bg-teal-100 text-teal-700 dark:bg-teal-900/40 dark:text-teal-300 ring-2 ring-teal-400'
|
||||||
? 'bg-green-100 text-green-700 dark:bg-green-900/40 dark:text-green-300'
|
: isCompleted
|
||||||
: isFailed
|
? 'bg-green-100 text-green-700 dark:bg-green-900/40 dark:text-green-300'
|
||||||
? 'bg-red-100 text-red-700 dark:bg-red-900/40 dark:text-red-300'
|
: isFailed
|
||||||
: 'text-gray-400 dark:text-gray-500'
|
? 'bg-red-100 text-red-700 dark:bg-red-900/40 dark:text-red-300'
|
||||||
} ${isClickable ? 'cursor-pointer hover:opacity-80' : 'cursor-default'}`}
|
: 'text-gray-400 dark:text-gray-500'
|
||||||
>
|
} ${isClickable ? 'cursor-pointer hover:opacity-80' : 'cursor-default'}`}
|
||||||
<span className="text-base">
|
>
|
||||||
{isCompleted ? '✓' : isFailed ? '✗' : step.icon}
|
<span className="text-base">
|
||||||
</span>
|
{isCompleted ? '\u2713' : isFailed ? '\u2717' : step.icon}
|
||||||
<span className="hidden sm:inline">{step.name}</span>
|
</span>
|
||||||
<span className="sm:hidden">{index + 1}</span>
|
<span className="hidden sm:inline">{step.name}</span>
|
||||||
</button>
|
<span className="sm:hidden">{index + 1}</span>
|
||||||
|
</button>
|
||||||
|
{/* Reprocess button — shown on completed steps on hover */}
|
||||||
|
{isCompleted && onReprocess && (
|
||||||
|
<button
|
||||||
|
onClick={(e) => { e.stopPropagation(); onReprocess(index) }}
|
||||||
|
className="absolute -top-1 -right-1 w-4 h-4 bg-orange-500 text-white rounded-full text-[9px] leading-none opacity-0 group-hover:opacity-100 transition-opacity flex items-center justify-center"
|
||||||
|
title={`Ab hier neu verarbeiten`}
|
||||||
|
>
|
||||||
|
↻
|
||||||
|
</button>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
)
|
)
|
||||||
})}
|
})}
|
||||||
|
|||||||
@@ -20,13 +20,23 @@ interface EditableCell {
|
|||||||
colIndex: number
|
colIndex: number
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// One reversible text edit on a single cell: undo writes `oldText` back into
// the edited-text map for `cellId`, redo writes `newText`.
type UndoAction = { cellId: string; oldText: string; newText: string }
|
||||||
|
|
||||||
export function StepReconstruction({ sessionId, onNext }: StepReconstructionProps) {
|
export function StepReconstruction({ sessionId, onNext }: StepReconstructionProps) {
|
||||||
const [status, setStatus] = useState<'loading' | 'ready' | 'saving' | 'saved' | 'error'>('loading')
|
const [status, setStatus] = useState<'loading' | 'ready' | 'saving' | 'saved' | 'error'>('loading')
|
||||||
const [error, setError] = useState('')
|
const [error, setError] = useState('')
|
||||||
const [cells, setCells] = useState<EditableCell[]>([])
|
const [cells, setCells] = useState<EditableCell[]>([])
|
||||||
const [editedTexts, setEditedTexts] = useState<Map<string, string>>(new Map())
|
const [editedTexts, setEditedTexts] = useState<Map<string, string>>(new Map())
|
||||||
const [zoom, setZoom] = useState(100)
|
const [zoom, setZoom] = useState(100)
|
||||||
const [containerSize, setContainerSize] = useState<{ w: number; h: number } | null>(null)
|
const [imageNaturalH, setImageNaturalH] = useState(0)
|
||||||
|
const [showEmptyHighlight, setShowEmptyHighlight] = useState(true)
|
||||||
|
|
||||||
|
// Undo/Redo stacks
|
||||||
|
const [undoStack, setUndoStack] = useState<UndoAction[]>([])
|
||||||
|
const [redoStack, setRedoStack] = useState<UndoAction[]>([])
|
||||||
|
|
||||||
|
// All cells including empty ones (for empty field highlighting)
|
||||||
|
const [allCells, setAllCells] = useState<EditableCell[]>([])
|
||||||
|
|
||||||
const containerRef = useRef<HTMLDivElement>(null)
|
const containerRef = useRef<HTMLDivElement>(null)
|
||||||
const imageRef = useRef<HTMLImageElement>(null)
|
const imageRef = useRef<HTMLImageElement>(null)
|
||||||
@@ -38,16 +48,11 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
|
|||||||
// eslint-disable-next-line react-hooks/exhaustive-deps
|
// eslint-disable-next-line react-hooks/exhaustive-deps
|
||||||
}, [sessionId])
|
}, [sessionId])
|
||||||
|
|
||||||
// Track container size for font scaling
|
// Track image natural height for font scaling
|
||||||
useEffect(() => {
|
const handleImageLoad = useCallback(() => {
|
||||||
if (!containerRef.current) return
|
if (imageRef.current) {
|
||||||
const observer = new ResizeObserver((entries) => {
|
setImageNaturalH(imageRef.current.naturalHeight)
|
||||||
for (const entry of entries) {
|
}
|
||||||
setContainerSize({ w: entry.contentRect.width, h: entry.contentRect.height })
|
|
||||||
}
|
|
||||||
})
|
|
||||||
observer.observe(containerRef.current)
|
|
||||||
return () => observer.disconnect()
|
|
||||||
}, [])
|
}, [])
|
||||||
|
|
||||||
const loadSessionData = async () => {
|
const loadSessionData = async () => {
|
||||||
@@ -67,19 +72,21 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
|
|||||||
|
|
||||||
// Build editable cells from grid cells
|
// Build editable cells from grid cells
|
||||||
const gridCells: GridCell[] = wordResult.cells || []
|
const gridCells: GridCell[] = wordResult.cells || []
|
||||||
const editableCells: EditableCell[] = gridCells
|
const allEditableCells: EditableCell[] = gridCells.map(c => ({
|
||||||
.filter(c => c.text.trim() !== '')
|
cellId: c.cell_id,
|
||||||
.map(c => ({
|
text: c.text,
|
||||||
cellId: c.cell_id,
|
originalText: c.text,
|
||||||
text: c.text,
|
bboxPct: c.bbox_pct,
|
||||||
originalText: c.text,
|
colType: c.col_type,
|
||||||
bboxPct: c.bbox_pct,
|
rowIndex: c.row_index,
|
||||||
colType: c.col_type,
|
colIndex: c.col_index,
|
||||||
rowIndex: c.row_index,
|
}))
|
||||||
colIndex: c.col_index,
|
|
||||||
}))
|
|
||||||
|
|
||||||
setCells(editableCells)
|
setAllCells(allEditableCells)
|
||||||
|
setCells(allEditableCells.filter(c => c.text.trim() !== ''))
|
||||||
|
setEditedTexts(new Map())
|
||||||
|
setUndoStack([])
|
||||||
|
setRedoStack([])
|
||||||
setStatus('ready')
|
setStatus('ready')
|
||||||
} catch (e: unknown) {
|
} catch (e: unknown) {
|
||||||
setError(e instanceof Error ? e.message : String(e))
|
setError(e instanceof Error ? e.message : String(e))
|
||||||
@@ -89,12 +96,80 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
|
|||||||
|
|
||||||
// Records a new text value for a cell and pushes the change onto the undo
// history (clearing the redo history, since a fresh edit invalidates it).
//
// The previous text is read from the current render's state instead of from
// inside a state updater: updater functions must be pure (React may invoke
// them twice in Strict Mode), so calling other setters from within one would
// push duplicate undo entries.
const handleTextChange = useCallback((cellId: string, newText: string) => {
  const cell = cells.find(c => c.cellId === cellId)
  // Prefer the already-edited value; fall back to the loaded cell text.
  const prevText = editedTexts.get(cellId) ?? cell?.text ?? ''

  // Push to undo stack
  setUndoStack(stack => [...stack, { cellId, oldText: prevText, newText }])
  setRedoStack([]) // Clear redo on new edit

  setEditedTexts(prev => {
    const next = new Map(prev)
    next.set(cellId, newText)
    return next
  })
}, [cells, editedTexts])
|
||||||
|
|
||||||
|
// Reverts the most recent edit: pops it from the undo stack, pushes it onto
// the redo stack, and restores the cell's previous text.
//
// The action is read from the current `undoStack` state rather than inside a
// `setUndoStack` updater, so every updater below stays pure. Nesting setters
// inside an updater would run them twice under React Strict Mode and
// duplicate the redo entry.
const undo = useCallback(() => {
  const action = undoStack[undoStack.length - 1]
  if (!action) return // nothing to undo

  setUndoStack(stack => stack.slice(0, -1))
  setRedoStack(stack => [...stack, action])
  setEditedTexts(prev => {
    const next = new Map(prev)
    next.set(action.cellId, action.oldText)
    return next
  })
}, [undoStack])
|
||||||
|
|
||||||
|
// Re-applies the most recently undone edit: pops it from the redo stack,
// pushes it back onto the undo stack, and writes the edit's new text.
//
// The action is read from the current `redoStack` state rather than inside a
// `setRedoStack` updater, so every updater below stays pure. Nesting setters
// inside an updater would run them twice under React Strict Mode and
// duplicate the undo entry.
const redo = useCallback(() => {
  const action = redoStack[redoStack.length - 1]
  if (!action) return // nothing to redo

  setRedoStack(stack => stack.slice(0, -1))
  setUndoStack(stack => [...stack, action])
  setEditedTexts(prev => {
    const next = new Map(prev)
    next.set(action.cellId, action.newText)
    return next
  })
}, [redoStack])
|
||||||
|
|
||||||
|
// Discards any pending edit for the given cell by removing its entry from the
// edited-text map. Unknown cell ids are ignored.
const resetCell = useCallback((cellId: string) => {
  const isKnownCell = cells.some(candidate => candidate.cellId === cellId)
  if (!isKnownCell) return

  setEditedTexts(current => {
    const remaining = new Map(current)
    remaining.delete(cellId)
    return remaining
  })
}, [cells])
|
||||||
|
|
||||||
|
// Global keyboard shortcuts for undo/redo:
//   Ctrl/Cmd+Z        -> undo
//   Ctrl/Cmd+Shift+Z  -> redo
useEffect(() => {
  const handler = (e: KeyboardEvent) => {
    const hasModifier = e.metaKey || e.ctrlKey
    // With Shift held, `e.key` is reported as 'Z' (uppercase), so compare
    // case-insensitively; a strict `=== 'z'` check never matches the redo
    // shortcut.
    if (!hasModifier || e.key.toLowerCase() !== 'z') return

    e.preventDefault()
    if (e.shiftKey) {
      redo()
    } else {
      undo()
    }
  }
  document.addEventListener('keydown', handler)
  return () => document.removeEventListener('keydown', handler)
}, [undo, redo])
|
||||||
|
|
||||||
const getDisplayText = useCallback((cell: EditableCell): string => {
|
const getDisplayText = useCallback((cell: EditableCell): string => {
|
||||||
return editedTexts.get(cell.cellId) ?? cell.text
|
return editedTexts.get(cell.cellId) ?? cell.text
|
||||||
}, [editedTexts])
|
}, [editedTexts])
|
||||||
@@ -112,6 +187,18 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
|
|||||||
return count
|
return count
|
||||||
}, [cells, isEdited])
|
}, [cells, isEdited])
|
||||||
|
|
||||||
|
// Ids of cells in the EN or DE columns whose loaded text is blank
// (whitespace-only counts as blank). Computed from all cells, including the
// empty ones.
const emptyCellIds = useMemo(() => {
  const requiredColTypes: string[] = ['column_en', 'column_de']
  const blankRequired = allCells.filter(
    candidate => requiredColTypes.includes(candidate.colType) && candidate.text.trim() === '',
  )
  return new Set<string>(blankRequired.map(candidate => candidate.cellId))
}, [allCells])
|
||||||
|
|
||||||
// Sort cells for tab navigation: by row, then by column
|
// Sort cells for tab navigation: by row, then by column
|
||||||
const sortedCellIds = useMemo(() => {
|
const sortedCellIds = useMemo(() => {
|
||||||
return [...cells]
|
return [...cells]
|
||||||
@@ -181,6 +268,13 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
|
|||||||
return colors[colType] || 'border-gray-400/40 focus:border-gray-500'
|
return colors[colType] || 'border-gray-400/40 focus:border-gray-500'
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Derives an input font size (px) from a cell's bbox height, given as a
// percentage of the image height. Uses the image's natural height (falling
// back to 800 while it is still 0), applies the current zoom, and clamps the
// result to the 8–18 px range.
const getFontSize = useCallback((bboxH: number): number => {
  const referenceHeight = imageNaturalH || 800
  const unscaledPx = (bboxH / 100) * referenceHeight * 0.55
  const zoomedPx = unscaledPx * (zoom / 100)
  const cappedPx = Math.min(18, zoomedPx)
  return Math.max(8, cappedPx)
}, [imageNaturalH, zoom])
|
||||||
|
|
||||||
if (!sessionId) {
|
if (!sessionId) {
|
||||||
return <div className="text-center py-12 text-gray-400">Bitte zuerst eine Session auswaehlen.</div>
|
return <div className="text-center py-12 text-gray-400">Bitte zuerst eine Session auswaehlen.</div>
|
||||||
}
|
}
|
||||||
@@ -197,7 +291,7 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
|
|||||||
if (status === 'error') {
|
if (status === 'error') {
|
||||||
return (
|
return (
|
||||||
<div className="flex flex-col items-center justify-center py-12 text-center">
|
<div className="flex flex-col items-center justify-center py-12 text-center">
|
||||||
<div className="text-5xl mb-4">⚠️</div>
|
<div className="text-5xl mb-4">⚠️</div>
|
||||||
<h3 className="text-lg font-medium text-red-600 dark:text-red-400 mb-2">Fehler</h3>
|
<h3 className="text-lg font-medium text-red-600 dark:text-red-400 mb-2">Fehler</h3>
|
||||||
<p className="text-sm text-gray-500 dark:text-gray-400 max-w-lg mb-4">{error}</p>
|
<p className="text-sm text-gray-500 dark:text-gray-400 max-w-lg mb-4">{error}</p>
|
||||||
<div className="flex gap-3">
|
<div className="flex gap-3">
|
||||||
@@ -207,7 +301,7 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
|
|||||||
</button>
|
</button>
|
||||||
<button onClick={onNext}
|
<button onClick={onNext}
|
||||||
className="px-5 py-2 bg-gray-200 dark:bg-gray-700 text-gray-700 dark:text-gray-300 rounded-lg hover:bg-gray-300 dark:hover:bg-gray-600 transition-colors text-sm">
|
className="px-5 py-2 bg-gray-200 dark:bg-gray-700 text-gray-700 dark:text-gray-300 rounded-lg hover:bg-gray-300 dark:hover:bg-gray-600 transition-colors text-sm">
|
||||||
Ueberspringen →
|
Ueberspringen →
|
||||||
</button>
|
</button>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
@@ -217,14 +311,14 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
|
|||||||
if (status === 'saved') {
|
if (status === 'saved') {
|
||||||
return (
|
return (
|
||||||
<div className="flex flex-col items-center justify-center py-12 text-center">
|
<div className="flex flex-col items-center justify-center py-12 text-center">
|
||||||
<div className="text-5xl mb-4">✅</div>
|
<div className="text-5xl mb-4">✅</div>
|
||||||
<h3 className="text-lg font-medium text-gray-700 dark:text-gray-300 mb-2">Rekonstruktion gespeichert</h3>
|
<h3 className="text-lg font-medium text-gray-700 dark:text-gray-300 mb-2">Rekonstruktion gespeichert</h3>
|
||||||
<p className="text-sm text-gray-500 dark:text-gray-400 mb-6">
|
<p className="text-sm text-gray-500 dark:text-gray-400 mb-6">
|
||||||
{changedCount > 0 ? `${changedCount} Zellen wurden aktualisiert.` : 'Keine Aenderungen vorgenommen.'}
|
{changedCount > 0 ? `${changedCount} Zellen wurden aktualisiert.` : 'Keine Aenderungen vorgenommen.'}
|
||||||
</p>
|
</p>
|
||||||
<button onClick={onNext}
|
<button onClick={onNext}
|
||||||
className="px-6 py-2.5 bg-teal-600 text-white rounded-lg hover:bg-teal-700 transition-colors font-medium">
|
className="px-6 py-2.5 bg-teal-600 text-white rounded-lg hover:bg-teal-700 transition-colors font-medium">
|
||||||
Weiter →
|
Weiter →
|
||||||
</button>
|
</button>
|
||||||
</div>
|
</div>
|
||||||
)
|
)
|
||||||
@@ -239,16 +333,54 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
|
|||||||
Schritt 7: Rekonstruktion
|
Schritt 7: Rekonstruktion
|
||||||
</h3>
|
</h3>
|
||||||
<span className="text-xs text-gray-400">
|
<span className="text-xs text-gray-400">
|
||||||
{cells.length} Zellen · {changedCount} geaendert
|
{cells.length} Zellen · {changedCount} geaendert
|
||||||
|
{emptyCellIds.size > 0 && showEmptyHighlight && (
|
||||||
|
<span className="text-red-400 ml-1">· {emptyCellIds.size} leer</span>
|
||||||
|
)}
|
||||||
</span>
|
</span>
|
||||||
</div>
|
</div>
|
||||||
<div className="flex items-center gap-2">
|
<div className="flex items-center gap-2">
|
||||||
|
{/* Undo/Redo */}
|
||||||
|
<button
|
||||||
|
onClick={undo}
|
||||||
|
disabled={undoStack.length === 0}
|
||||||
|
className="px-2 py-1 text-xs border border-gray-300 dark:border-gray-600 rounded hover:bg-gray-50 dark:hover:bg-gray-700 disabled:opacity-30"
|
||||||
|
title="Rueckgaengig (Ctrl+Z)"
|
||||||
|
>
|
||||||
|
↩
|
||||||
|
</button>
|
||||||
|
<button
|
||||||
|
onClick={redo}
|
||||||
|
disabled={redoStack.length === 0}
|
||||||
|
className="px-2 py-1 text-xs border border-gray-300 dark:border-gray-600 rounded hover:bg-gray-50 dark:hover:bg-gray-700 disabled:opacity-30"
|
||||||
|
title="Wiederholen (Ctrl+Shift+Z)"
|
||||||
|
>
|
||||||
|
↪
|
||||||
|
</button>
|
||||||
|
|
||||||
|
<div className="w-px h-5 bg-gray-300 dark:bg-gray-600 mx-1" />
|
||||||
|
|
||||||
|
{/* Empty field toggle */}
|
||||||
|
<button
|
||||||
|
onClick={() => setShowEmptyHighlight(v => !v)}
|
||||||
|
className={`px-2 py-1 text-xs border rounded transition-colors ${
|
||||||
|
showEmptyHighlight
|
||||||
|
? 'border-red-300 bg-red-50 text-red-600 dark:border-red-700 dark:bg-red-900/30 dark:text-red-400'
|
||||||
|
: 'border-gray-300 dark:border-gray-600 hover:bg-gray-50 dark:hover:bg-gray-700'
|
||||||
|
}`}
|
||||||
|
title="Leere Pflichtfelder markieren"
|
||||||
|
>
|
||||||
|
Leer
|
||||||
|
</button>
|
||||||
|
|
||||||
|
<div className="w-px h-5 bg-gray-300 dark:bg-gray-600 mx-1" />
|
||||||
|
|
||||||
{/* Zoom controls */}
|
{/* Zoom controls */}
|
||||||
<button
|
<button
|
||||||
onClick={() => setZoom(z => Math.max(50, z - 25))}
|
onClick={() => setZoom(z => Math.max(50, z - 25))}
|
||||||
className="px-2 py-1 text-xs border border-gray-300 dark:border-gray-600 rounded hover:bg-gray-50 dark:hover:bg-gray-700"
|
className="px-2 py-1 text-xs border border-gray-300 dark:border-gray-600 rounded hover:bg-gray-50 dark:hover:bg-gray-700"
|
||||||
>
|
>
|
||||||
−
|
−
|
||||||
</button>
|
</button>
|
||||||
<span className="text-xs text-gray-500 w-10 text-center">{zoom}%</span>
|
<span className="text-xs text-gray-500 w-10 text-center">{zoom}%</span>
|
||||||
<button
|
<button
|
||||||
@@ -291,34 +423,63 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
|
|||||||
alt="Dewarped"
|
alt="Dewarped"
|
||||||
className="block"
|
className="block"
|
||||||
style={{ opacity: 0.3 }}
|
style={{ opacity: 0.3 }}
|
||||||
|
onLoad={handleImageLoad}
|
||||||
/>
|
/>
|
||||||
|
|
||||||
|
{/* Empty field markers */}
|
||||||
|
{showEmptyHighlight && allCells
|
||||||
|
.filter(c => emptyCellIds.has(c.cellId))
|
||||||
|
.map(cell => (
|
||||||
|
<div
|
||||||
|
key={`empty-${cell.cellId}`}
|
||||||
|
className="absolute border-2 border-dashed border-red-400/60 rounded pointer-events-none"
|
||||||
|
style={{
|
||||||
|
left: `${cell.bboxPct.x}%`,
|
||||||
|
top: `${cell.bboxPct.y}%`,
|
||||||
|
width: `${cell.bboxPct.w}%`,
|
||||||
|
height: `${cell.bboxPct.h}%`,
|
||||||
|
}}
|
||||||
|
/>
|
||||||
|
))}
|
||||||
|
|
||||||
{/* Editable text fields at bbox positions */}
|
{/* Editable text fields at bbox positions */}
|
||||||
{cells.map((cell) => {
|
{cells.map((cell) => {
|
||||||
const displayText = getDisplayText(cell)
|
const displayText = getDisplayText(cell)
|
||||||
const edited = isEdited(cell)
|
const edited = isEdited(cell)
|
||||||
|
|
||||||
return (
|
return (
|
||||||
<input
|
<div key={cell.cellId} className="absolute group" style={{
|
||||||
key={cell.cellId}
|
left: `${cell.bboxPct.x}%`,
|
||||||
id={`cell-${cell.cellId}`}
|
top: `${cell.bboxPct.y}%`,
|
||||||
type="text"
|
width: `${cell.bboxPct.w}%`,
|
||||||
value={displayText}
|
height: `${cell.bboxPct.h}%`,
|
||||||
onChange={(e) => handleTextChange(cell.cellId, e.target.value)}
|
}}>
|
||||||
onKeyDown={(e) => handleKeyDown(e, cell.cellId)}
|
<input
|
||||||
className={`absolute bg-transparent text-black dark:text-white border px-0.5 outline-none transition-colors ${
|
id={`cell-${cell.cellId}`}
|
||||||
colTypeColor(cell.colType)
|
type="text"
|
||||||
} ${edited ? 'border-green-500 bg-green-50/30 dark:bg-green-900/20' : ''}`}
|
value={displayText}
|
||||||
style={{
|
onChange={(e) => handleTextChange(cell.cellId, e.target.value)}
|
||||||
left: `${cell.bboxPct.x}%`,
|
onKeyDown={(e) => handleKeyDown(e, cell.cellId)}
|
||||||
top: `${cell.bboxPct.y}%`,
|
className={`w-full h-full bg-transparent text-black dark:text-white border px-0.5 outline-none transition-colors ${
|
||||||
width: `${cell.bboxPct.w}%`,
|
colTypeColor(cell.colType)
|
||||||
height: `${cell.bboxPct.h}%`,
|
} ${edited ? 'border-green-500 bg-green-50/30 dark:bg-green-900/20' : ''}`}
|
||||||
fontSize: `${Math.max(8, Math.min(16, (cell.bboxPct.h / 100) * (containerSize?.h || 800) * 0.6))}px`,
|
style={{
|
||||||
lineHeight: '1',
|
fontSize: `${getFontSize(cell.bboxPct.h)}px`,
|
||||||
}}
|
lineHeight: '1',
|
||||||
title={`${cell.cellId} (${cell.colType})`}
|
}}
|
||||||
/>
|
title={`${cell.cellId} (${cell.colType})`}
|
||||||
|
/>
|
||||||
|
{/* Per-cell reset button (X) — only shown for edited cells on hover */}
|
||||||
|
{edited && (
|
||||||
|
<button
|
||||||
|
onClick={() => resetCell(cell.cellId)}
|
||||||
|
className="absolute -top-1 -right-1 w-4 h-4 bg-red-500 text-white rounded-full text-[9px] leading-none opacity-0 group-hover:opacity-100 transition-opacity flex items-center justify-center"
|
||||||
|
title="Zuruecksetzen"
|
||||||
|
>
|
||||||
|
×
|
||||||
|
</button>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
)
|
)
|
||||||
})}
|
})}
|
||||||
</div>
|
</div>
|
||||||
@@ -336,7 +497,7 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
|
|||||||
}}
|
}}
|
||||||
className="px-6 py-2.5 bg-teal-600 text-white rounded-lg hover:bg-teal-700 transition-colors font-medium text-sm"
|
className="px-6 py-2.5 bg-teal-600 text-white rounded-lg hover:bg-teal-700 transition-colors font-medium text-sm"
|
||||||
>
|
>
|
||||||
{changedCount > 0 ? 'Speichern & Weiter →' : 'Weiter →'}
|
{changedCount > 0 ? 'Speichern & Weiter \u2192' : 'Weiter \u2192'}
|
||||||
</button>
|
</button>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|||||||
@@ -3503,6 +3503,21 @@ def _ocr_single_cell(
|
|||||||
)
|
)
|
||||||
used_engine = 'cell_ocr_fallback'
|
used_engine = 'cell_ocr_fallback'
|
||||||
|
|
||||||
|
# --- SECONDARY FALLBACK: PSM=7 (single line) for still-empty cells ---
|
||||||
|
if not text.strip() and _run_fallback and not use_rapid:
|
||||||
|
cell_lang = lang_map.get(col.type, lang)
|
||||||
|
psm7_words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=7)
|
||||||
|
if psm7_words:
|
||||||
|
psm7_words = [w for w in psm7_words if w.get('conf', 0) >= _MIN_WORD_CONF]
|
||||||
|
if psm7_words:
|
||||||
|
p7_text = _words_to_reading_order_text(psm7_words, y_tolerance_px=10)
|
||||||
|
if p7_text.strip():
|
||||||
|
text = p7_text
|
||||||
|
avg_conf = round(
|
||||||
|
sum(w['conf'] for w in psm7_words) / len(psm7_words), 1
|
||||||
|
)
|
||||||
|
used_engine = 'cell_ocr_psm7'
|
||||||
|
|
||||||
# --- NOISE FILTER: clear cells that contain only OCR artifacts ---
|
# --- NOISE FILTER: clear cells that contain only OCR artifacts ---
|
||||||
if text.strip():
|
if text.strip():
|
||||||
text = _clean_cell_text(text)
|
text = _clean_cell_text(text)
|
||||||
@@ -3628,6 +3643,79 @@ def build_cell_grid(
|
|||||||
)
|
)
|
||||||
cells.append(cell)
|
cells.append(cell)
|
||||||
|
|
||||||
|
# --- BATCH FALLBACK: re-OCR empty cells by column strip ---
|
||||||
|
# Collect cells that are still empty but have visible pixels.
|
||||||
|
# Instead of calling Tesseract once per cell (expensive), crop an entire
|
||||||
|
# column strip and run OCR once, then assign words to cells by Y position.
|
||||||
|
empty_by_col: Dict[int, List[int]] = {} # col_idx → [cell list indices]
|
||||||
|
for ci, cell in enumerate(cells):
|
||||||
|
if not cell['text'].strip() and cell.get('ocr_engine') != 'cell_ocr_psm7':
|
||||||
|
bpx = cell['bbox_px']
|
||||||
|
x, y, w, h = bpx['x'], bpx['y'], bpx['w'], bpx['h']
|
||||||
|
if w > 0 and h > 0 and ocr_img is not None:
|
||||||
|
crop = ocr_img[y:y + h, x:x + w]
|
||||||
|
if crop.size > 0:
|
||||||
|
dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
|
||||||
|
if dark_ratio > 0.005:
|
||||||
|
empty_by_col.setdefault(cell['col_index'], []).append(ci)
|
||||||
|
|
||||||
|
for col_idx, cell_indices in empty_by_col.items():
|
||||||
|
if len(cell_indices) < 3:
|
||||||
|
continue # Not worth batching for < 3 cells
|
||||||
|
|
||||||
|
# Find the column strip bounding box (union of all empty cell bboxes)
|
||||||
|
min_y = min(cells[ci]['bbox_px']['y'] for ci in cell_indices)
|
||||||
|
max_y_h = max(cells[ci]['bbox_px']['y'] + cells[ci]['bbox_px']['h'] for ci in cell_indices)
|
||||||
|
col_x = cells[cell_indices[0]]['bbox_px']['x']
|
||||||
|
col_w = cells[cell_indices[0]]['bbox_px']['w']
|
||||||
|
|
||||||
|
strip_region = PageRegion(
|
||||||
|
type=relevant_cols[col_idx].type,
|
||||||
|
x=col_x, y=min_y,
|
||||||
|
width=col_w, height=max_y_h - min_y,
|
||||||
|
)
|
||||||
|
strip_lang = lang_map.get(relevant_cols[col_idx].type, lang)
|
||||||
|
|
||||||
|
if use_rapid and img_bgr is not None:
|
||||||
|
strip_words = ocr_region_rapid(img_bgr, strip_region)
|
||||||
|
else:
|
||||||
|
strip_words = ocr_region(ocr_img, strip_region, lang=strip_lang, psm=6)
|
||||||
|
|
||||||
|
if not strip_words:
|
||||||
|
continue
|
||||||
|
|
||||||
|
strip_words = [w for w in strip_words if w.get('conf', 0) >= 30]
|
||||||
|
if not strip_words:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Assign words to cells by Y overlap
|
||||||
|
for ci in cell_indices:
|
||||||
|
cell_y = cells[ci]['bbox_px']['y']
|
||||||
|
cell_h = cells[ci]['bbox_px']['h']
|
||||||
|
cell_mid_y = cell_y + cell_h / 2
|
||||||
|
|
||||||
|
matched_words = [
|
||||||
|
w for w in strip_words
|
||||||
|
if abs((w['top'] + w['height'] / 2) - cell_mid_y) < cell_h * 0.8
|
||||||
|
]
|
||||||
|
if matched_words:
|
||||||
|
matched_words.sort(key=lambda w: w['left'])
|
||||||
|
batch_text = ' '.join(w['text'] for w in matched_words)
|
||||||
|
batch_text = _clean_cell_text(batch_text)
|
||||||
|
if batch_text.strip():
|
||||||
|
cells[ci]['text'] = batch_text
|
||||||
|
cells[ci]['confidence'] = round(
|
||||||
|
sum(w['conf'] for w in matched_words) / len(matched_words), 1
|
||||||
|
)
|
||||||
|
cells[ci]['ocr_engine'] = 'batch_column_ocr'
|
||||||
|
|
||||||
|
batch_filled = sum(1 for ci in cell_indices if cells[ci]['text'].strip())
|
||||||
|
if batch_filled > 0:
|
||||||
|
logger.info(
|
||||||
|
f"build_cell_grid: batch OCR filled {batch_filled}/{len(cell_indices)} "
|
||||||
|
f"empty cells in column {col_idx}"
|
||||||
|
)
|
||||||
|
|
||||||
logger.info(f"build_cell_grid: {len(cells)} cells from "
|
logger.info(f"build_cell_grid: {len(cells)} cells from "
|
||||||
f"{len(content_rows)} rows × {len(relevant_cols)} columns, "
|
f"{len(content_rows)} rows × {len(relevant_cols)} columns, "
|
||||||
f"engine={engine_name}")
|
f"engine={engine_name}")
|
||||||
@@ -3869,6 +3957,69 @@ def _merge_phonetic_continuation_rows(
|
|||||||
return merged
|
return merged
|
||||||
|
|
||||||
|
|
||||||
|
def _merge_continuation_rows(
|
||||||
|
entries: List[Dict[str, Any]],
|
||||||
|
) -> List[Dict[str, Any]]:
|
||||||
|
"""Merge multi-line vocabulary entries where text wraps to the next row.
|
||||||
|
|
||||||
|
A row is a continuation of the previous entry when:
|
||||||
|
- EN has text, but DE is empty
|
||||||
|
- EN starts with a lowercase letter (not a new vocab entry)
|
||||||
|
- Previous entry's EN does NOT end with a sentence terminator (.!?)
|
||||||
|
- The continuation text has fewer than 4 words (not an example sentence)
|
||||||
|
- The row was not already merged as phonetic
|
||||||
|
|
||||||
|
Example:
|
||||||
|
Row 5: EN="to put up" DE="aufstellen"
|
||||||
|
Row 6: EN="with sth." DE=""
|
||||||
|
→ Merged: EN="to put up with sth." DE="aufstellen"
|
||||||
|
"""
|
||||||
|
if len(entries) < 2:
|
||||||
|
return entries
|
||||||
|
|
||||||
|
merged: List[Dict[str, Any]] = []
|
||||||
|
for entry in entries:
|
||||||
|
en = (entry.get('english') or '').strip()
|
||||||
|
de = (entry.get('german') or '').strip()
|
||||||
|
|
||||||
|
if merged and en and not de:
|
||||||
|
# Check: not phonetic (already handled)
|
||||||
|
if _is_phonetic_only_text(en):
|
||||||
|
merged.append(entry)
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Check: starts with lowercase
|
||||||
|
first_alpha = next((c for c in en if c.isalpha()), '')
|
||||||
|
starts_lower = first_alpha and first_alpha.islower()
|
||||||
|
|
||||||
|
# Check: fewer than 4 words (not an example sentence)
|
||||||
|
word_count = len(en.split())
|
||||||
|
is_short = word_count < 4
|
||||||
|
|
||||||
|
# Check: previous entry doesn't end with sentence terminator
|
||||||
|
prev = merged[-1]
|
||||||
|
prev_en = (prev.get('english') or '').strip()
|
||||||
|
prev_ends_sentence = prev_en and prev_en[-1] in '.!?'
|
||||||
|
|
||||||
|
if starts_lower and is_short and not prev_ends_sentence:
|
||||||
|
# Merge into previous entry
|
||||||
|
prev['english'] = (prev_en + ' ' + en).strip()
|
||||||
|
# Merge example if present
|
||||||
|
ex = (entry.get('example') or '').strip()
|
||||||
|
if ex:
|
||||||
|
prev_ex = (prev.get('example') or '').strip()
|
||||||
|
prev['example'] = (prev_ex + ' ' + ex).strip() if prev_ex else ex
|
||||||
|
logger.debug(
|
||||||
|
f"Merged continuation row {entry.get('row_index')} "
|
||||||
|
f"into previous entry: {prev['english']!r}"
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
|
||||||
|
merged.append(entry)
|
||||||
|
|
||||||
|
return merged
|
||||||
|
|
||||||
|
|
||||||
def build_word_grid(
|
def build_word_grid(
|
||||||
ocr_img: np.ndarray,
|
ocr_img: np.ndarray,
|
||||||
column_regions: List[PageRegion],
|
column_regions: List[PageRegion],
|
||||||
@@ -3920,9 +4071,12 @@ def build_word_grid(
|
|||||||
# --- Post-processing pipeline (deterministic, no LLM) ---
|
# --- Post-processing pipeline (deterministic, no LLM) ---
|
||||||
n_raw = len(entries)
|
n_raw = len(entries)
|
||||||
|
|
||||||
# 0. Merge phonetic-only continuation rows into previous entry
|
# 0a. Merge phonetic-only continuation rows into previous entry
|
||||||
entries = _merge_phonetic_continuation_rows(entries)
|
entries = _merge_phonetic_continuation_rows(entries)
|
||||||
|
|
||||||
|
# 0b. Merge multi-line continuation rows (lowercase EN, empty DE)
|
||||||
|
entries = _merge_continuation_rows(entries)
|
||||||
|
|
||||||
# 1. Fix character confusion (I/1/l based on context)
|
# 1. Fix character confusion (I/1/l based on context)
|
||||||
entries = _fix_character_confusion(entries)
|
entries = _fix_character_confusion(entries)
|
||||||
|
|
||||||
@@ -4361,7 +4515,7 @@ async def run_cv_pipeline(
|
|||||||
# Stage 3: Dewarp
|
# Stage 3: Dewarp
|
||||||
if enable_dewarp:
|
if enable_dewarp:
|
||||||
t = time.time()
|
t = time.time()
|
||||||
img = dewarp_image(img)
|
img, _dewarp_info = dewarp_image(img)
|
||||||
result.stages['dewarp'] = round(time.time() - t, 2)
|
result.stages['dewarp'] = round(time.time() - t, 2)
|
||||||
|
|
||||||
# Stage 4: Dual image preparation
|
# Stage 4: Dual image preparation
|
||||||
|
|||||||
@@ -1623,6 +1623,69 @@ async def save_reconstruction(session_id: str, request: Request):
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/sessions/{session_id}/reprocess")
async def reprocess_session(session_id: str, request: Request):
    """Reset a session so the pipeline can be re-run from a given step.

    This endpoint does not execute any pipeline stage itself; it sets
    ``current_step`` and clears the stored results that depend on that step,
    in both the database and the in-memory cache.

    Body: {"from_step": 5}  (1-indexed step number, defaults to 1)

    Results cleared:
    - from_step <= 1: deskew_result, dewarp_result, column_result, row_result, word_result
    - from_step <= 2: dewarp_result, column_result, row_result, word_result
    - from_step <= 3: column_result, row_result, word_result
    - from_step <= 4: row_result, word_result
    - from_step <= 5: word_result
    - from_step == 6: only the llm_review / llm_corrections keys of word_result
    - from_step == 7: nothing (only current_step is updated)

    Returns:
        Dict with ``session_id``, ``from_step`` and ``cleared`` (names of the
        session fields that were written).

    Raises:
        HTTPException: 404 if the session does not exist; 400 if the body is
            not a JSON object or ``from_step`` is not an integer in 1..7.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    # A malformed body is a client error, not a server error.
    try:
        body = await request.json()
    except ValueError as exc:
        raise HTTPException(status_code=400, detail="Request body must be valid JSON") from exc
    if not isinstance(body, dict):
        raise HTTPException(status_code=400, detail="Request body must be a JSON object")

    from_step = body.get("from_step", 1)
    # bool is a subclass of int, so JSON true/false must be rejected explicitly.
    if isinstance(from_step, bool) or not isinstance(from_step, int) or not 1 <= from_step <= 7:
        raise HTTPException(status_code=400, detail="from_step must be between 1 and 7")

    update_kwargs: Dict[str, Any] = {"current_step": from_step}

    # Clear downstream data based on from_step
    if from_step <= 5:
        update_kwargs["word_result"] = None
    elif from_step == 6:
        # Only drop the LLM review data; strip it from a copy so the loaded
        # session object is not modified before the update is persisted.
        word_result = session.get("word_result")
        if word_result:
            word_result = dict(word_result)
            word_result.pop("llm_review", None)
            word_result.pop("llm_corrections", None)
            update_kwargs["word_result"] = word_result

    if from_step <= 4:
        update_kwargs["row_result"] = None
    if from_step <= 3:
        update_kwargs["column_result"] = None
    if from_step <= 2:
        update_kwargs["dewarp_result"] = None
    if from_step <= 1:
        update_kwargs["deskew_result"] = None

    await update_session_db(session_id, **update_kwargs)

    # Mirror the same field values into the in-memory cache entry, if any.
    if session_id in _cache:
        for key, value in update_kwargs.items():
            _cache[session_id][key] = value

    logger.info(f"Session {session_id} reprocessing from step {from_step}")

    return {
        "session_id": session_id,
        "from_step": from_step,
        "cleared": [k for k in update_kwargs if k != "current_step"],
    }
|
||||||
|
|
||||||
|
|
||||||
async def _get_rows_overlay(session_id: str) -> Response:
|
async def _get_rows_overlay(session_id: str) -> Response:
|
||||||
"""Generate dewarped image with row bands drawn on it."""
|
"""Generate dewarped image with row bands drawn on it."""
|
||||||
session = await get_session_db(session_id)
|
session = await get_session_db(session_id)
|
||||||
|
|||||||
@@ -9,6 +9,9 @@ Tests cover:
|
|||||||
- Stage 5: Layout analysis (content bounds, projection profiles, column detection)
|
- Stage 5: Layout analysis (content bounds, projection profiles, column detection)
|
||||||
- Stage 6: Multi-pass OCR region handling
|
- Stage 6: Multi-pass OCR region handling
|
||||||
- Stage 7: Line grouping and vocabulary matching
|
- Stage 7: Line grouping and vocabulary matching
|
||||||
|
- Noise filter functions (_is_noise_tail_token, _clean_cell_text)
|
||||||
|
- Phonetic detection (_is_phonetic_only_text)
|
||||||
|
- Phonetic & continuation row merging
|
||||||
- Orchestrator (run_cv_pipeline)
|
- Orchestrator (run_cv_pipeline)
|
||||||
|
|
||||||
DSGVO Note: All tests run locally with synthetic data. No external API calls.
|
DSGVO Note: All tests run locally with synthetic data. No external API calls.
|
||||||
@@ -36,6 +39,11 @@ from cv_vocab_pipeline import (
|
|||||||
CV2_AVAILABLE,
|
CV2_AVAILABLE,
|
||||||
TESSERACT_AVAILABLE,
|
TESSERACT_AVAILABLE,
|
||||||
CV_PIPELINE_AVAILABLE,
|
CV_PIPELINE_AVAILABLE,
|
||||||
|
_is_noise_tail_token,
|
||||||
|
_clean_cell_text,
|
||||||
|
_is_phonetic_only_text,
|
||||||
|
_merge_phonetic_continuation_rows,
|
||||||
|
_merge_continuation_rows,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -202,16 +210,28 @@ class TestDeskew:
|
|||||||
|
|
||||||
@pytest.mark.skipif(not CV2_AVAILABLE, reason="OpenCV not available")
|
@pytest.mark.skipif(not CV2_AVAILABLE, reason="OpenCV not available")
|
||||||
class TestDewarp:
|
class TestDewarp:
|
||||||
"""Test dewarp (pass-through) stage."""
|
"""Test dewarp stage (returns (image, info) tuple)."""
|
||||||
|
|
||||||
def test_dewarp_passthrough(self, white_image):
|
def test_dewarp_returns_tuple(self, white_image):
|
||||||
"""Current dewarp should return the same image (pass-through)."""
|
"""dewarp_image must return (image, dewarp_info) tuple."""
|
||||||
result = dewarp_image(white_image)
|
result = dewarp_image(white_image)
|
||||||
np.testing.assert_array_equal(result, white_image)
|
assert isinstance(result, tuple)
|
||||||
|
assert len(result) == 2
|
||||||
|
img_out, info = result
|
||||||
|
assert isinstance(img_out, np.ndarray)
|
||||||
|
assert isinstance(info, dict)
|
||||||
|
assert "shear_degrees" in info
|
||||||
|
|
||||||
def test_dewarp_preserves_shape(self, text_like_image):
|
def test_dewarp_preserves_shape(self, text_like_image):
|
||||||
result = dewarp_image(text_like_image)
|
"""Output image should have same shape as input."""
|
||||||
assert result.shape == text_like_image.shape
|
img_out, _ = dewarp_image(text_like_image)
|
||||||
|
assert img_out.shape == text_like_image.shape
|
||||||
|
|
||||||
|
def test_dewarp_white_image_no_correction(self, white_image):
|
||||||
|
"""A uniform white image should get no shear correction."""
|
||||||
|
img_out, info = dewarp_image(white_image)
|
||||||
|
assert abs(info["shear_degrees"]) < 0.5
|
||||||
|
assert img_out.shape == white_image.shape
|
||||||
|
|
||||||
|
|
||||||
# =============================================
|
# =============================================
|
||||||
@@ -561,6 +581,268 @@ class TestStageIntegration:
|
|||||||
assert layout_img.shape[:2] == corrected.shape[:2]
|
assert layout_img.shape[:2] == corrected.shape[:2]
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================
|
||||||
|
# NOISE FILTER TESTS
|
||||||
|
# =============================================
|
||||||
|
|
||||||
|
class TestNoiseFilter:
    """Checks which trailing tokens _is_noise_tail_token flags as OCR noise."""

    # Tokens the filter is expected to keep (function returns False).
    _KEPT_TOKENS = [
        # Hyphenated compounds, including a dangling hyphen
        "money-saver",
        "under-",
        "well-known",
        # Dictionary-style parentheses, balanced or cut off
        "Schild(chen)",
        "(Salat-)Gurke",
        "(auf)",
        "(on)",
        "selbst)",
        "(wir",
        "Tanz(veranstaltung)",
        "(zer)brechen",
        # Pieces of phonetic transcription in square brackets
        "serva]",
        "['mani",
        "[eg]",
        "[maus]",
        # Ordinary words, with and without a trailing full stop
        "cupcakes.",
        "sister.",
        "mice",
        # Abbreviations
        "e.g.",
        "sth.",
        "usw.",
        "adj.",
        # Ellipsis, ASCII and single-character form
        "...",
        "\u2026",
        # Short everyday words
        "the",
        "cat",
        "big",
        "run",
        "set",
        "ago",
    ]

    # Tokens the filter is expected to drop (function returns True).
    _NOISE_TOKENS = [
        # Symbols, digits and letter/symbol or letter/digit mixes
        "B|",
        "3d",
        "x7",
        ")",
        "|",
        "@",
        "3",
        # One- and two-letter fragments that are not words
        "ee",
        "k",
        "zz",
        "qq",
        # Empty and whitespace-only input
        "",
        "  ",
    ]

    @pytest.mark.parametrize("token", _KEPT_TOKENS)
    def test_keep_real_tokens(self, token):
        """Genuine words, dictionary punctuation and phonetic brackets survive."""
        verdict = _is_noise_tail_token(token)
        assert verdict is False, f"Should keep {token!r}"

    @pytest.mark.parametrize("token", _NOISE_TOKENS)
    def test_filter_noise_tokens(self, token):
        """Fragments produced by OCR noise are rejected."""
        verdict = _is_noise_tail_token(token)
        assert verdict is True, f"Should filter {token!r}"
|
||||||
|
|
||||||
|
|
||||||
|
class TestCleanCellText:
    """End-to-end checks for _clean_cell_text (raw cell text in, cleaned text out)."""

    def test_empty_returns_empty(self):
        # Both the empty string and whitespace-only input clean to "".
        for raw in ("", "   "):
            assert _clean_cell_text(raw) == ""

    def test_real_word_unchanged(self):
        assert "cupcakes" == _clean_cell_text("cupcakes")

    def test_strips_trailing_noise(self):
        """A noise token at the end of the cell is dropped."""
        assert _clean_cell_text("cupcakes B|") == "cupcakes"

    def test_keeps_trailing_real_word(self):
        """A real word at the end of the cell stays."""
        assert _clean_cell_text("big cat") == "big cat"

    def test_abbreviation_kept(self):
        """A cell holding only a known abbreviation is not emptied."""
        assert _clean_cell_text("e.g.") == "e.g."

    def test_pure_garbage_cleared(self):
        """A cell containing nothing but OCR garbage becomes empty."""
        assert _clean_cell_text("3d |x") == ""

    def test_compound_word_preserved(self):
        """Hyphenated compounds pass through untouched."""
        assert _clean_cell_text("money-saver") == "money-saver"

    def test_parenthesized_word_preserved(self):
        assert "(Salat-)Gurke" == _clean_cell_text("(Salat-)Gurke")

    def test_multiple_trailing_noise(self):
        """Every trailing noise token is removed, not just the last one."""
        assert _clean_cell_text("achieve 3 |") == "achieve"
|
||||||
|
|
||||||
|
|
||||||
|
class TestPhoneticOnlyText:
    """Checks _is_phonetic_only_text, which spots cells holding only a transcription."""

    # (cell text, expected verdict)
    _CASES = [
        # Whole cell is a bracketed transcription → True
        ("['mani serva]", True),
        ("[dɑːns]", True),
        ("[\"a:mand]", True),
        ("['wɜːkʃɒp]", True),
        # Closing-bracket fragment that still carries a word → False
        ("serva]", False),
        # Headword present, plain word, or empty text → False
        ("almond ['a:mand]", False),
        ("Mandel", False),
        ("cupcakes", False),
        ("", False),
        ("achieve", False),
        ("money-saver ['mani]", False),
    ]

    @pytest.mark.parametrize("text,expected", _CASES)
    def test_phonetic_detection(self, text, expected):
        verdict = _is_phonetic_only_text(text)
        assert verdict is expected, \
            f"_is_phonetic_only_text({text!r}) should be {expected}"
|
||||||
|
|
||||||
|
|
||||||
|
class TestMergePhoneticContinuationRows:
    """Checks that _merge_phonetic_continuation_rows folds transcription rows upward."""

    def test_empty_list(self):
        assert [] == _merge_phonetic_continuation_rows([])

    def test_single_entry(self):
        only = {"english": "cat", "german": "Katze", "example": ""}
        out = _merge_phonetic_continuation_rows([only])
        assert len(out) == 1
        assert out[0]["english"] == "cat"

    def test_merges_phonetic_row(self):
        """A transcription-only row is appended to the entry above it."""
        headword = {"english": "money-saver", "german": "Sparfuchs", "example": "", "row_index": 0}
        phonetic = {"english": "['mani serva]", "german": "", "example": "", "row_index": 1}
        out = _merge_phonetic_continuation_rows([headword, phonetic])
        assert len(out) == 1
        top = out[0]
        assert top["english"] == "money-saver ['mani serva]"
        assert top["german"] == "Sparfuchs"

    def test_no_merge_when_de_present(self):
        """A phonetic-looking EN cell stays separate when its DE cell has text."""
        rows = [
            {"english": "cat", "german": "Katze", "example": ""},
            {"english": "[kæt]", "german": "some text", "example": ""},
        ]
        assert len(_merge_phonetic_continuation_rows(rows)) == 2

    def test_no_merge_regular_rows(self):
        """Two ordinary vocabulary rows are left as two entries."""
        rows = [
            {"english": "cat", "german": "Katze", "example": ""},
            {"english": "dog", "german": "Hund", "example": ""},
        ]
        assert len(_merge_phonetic_continuation_rows(rows)) == 2

    def test_merges_example_too(self):
        """Example text on the phonetic row moves to the merged entry as well."""
        headword = {"english": "dance", "german": "tanzen", "example": "", "row_index": 0}
        phonetic = {"english": "[dɑːns]", "german": "", "example": "Let's dance.", "row_index": 1}
        out = _merge_phonetic_continuation_rows([headword, phonetic])
        assert len(out) == 1
        top = out[0]
        assert top["english"] == "dance [dɑːns]"
        assert top["example"] == "Let's dance."
|
||||||
|
|
||||||
|
|
||||||
|
class TestMergeContinuationRows:
    """Test _merge_continuation_rows for multi-line entry merging."""

    def test_empty_list(self):
        assert _merge_continuation_rows([]) == []

    def test_single_entry_returned_unchanged(self):
        """A lone row has no predecessor to merge into, even with empty DE."""
        entries = [
            {"english": "with somebody", "german": "", "example": "", "row_index": 0},
        ]
        result = _merge_continuation_rows(entries)
        assert result == entries

    def test_no_merge_independent_rows(self):
        """Rows with both EN and DE should not be merged."""
        entries = [
            {"english": "cat", "german": "Katze", "example": "", "row_index": 0},
            {"english": "dog", "german": "Hund", "example": "", "row_index": 1},
        ]
        result = _merge_continuation_rows(entries)
        assert len(result) == 2

    def test_merge_lowercase_continuation(self):
        """Lowercase EN with empty DE should merge into previous."""
        entries = [
            {"english": "to put up", "german": "aufstellen", "example": "", "row_index": 0},
            {"english": "with sth.", "german": "", "example": "", "row_index": 1},
        ]
        result = _merge_continuation_rows(entries)
        assert len(result) == 1
        assert result[0]["english"] == "to put up with sth."
        assert result[0]["german"] == "aufstellen"

    def test_merge_chained_continuations(self):
        """Several consecutive continuation rows all fold into the same entry."""
        entries = [
            {"english": "to get", "german": "mit jdm. auskommen", "example": "", "row_index": 0},
            {"english": "along with", "german": "", "example": "", "row_index": 1},
            {"english": "somebody", "german": "", "example": "", "row_index": 2},
        ]
        result = _merge_continuation_rows(entries)
        assert len(result) == 1
        assert result[0]["english"] == "to get along with somebody"
        assert result[0]["german"] == "mit jdm. auskommen"

    def test_merge_appends_example_text(self):
        """Example text on the continuation row is appended to the previous example."""
        entries = [
            {"english": "to look", "german": "sich freuen auf", "example": "I look forward", "row_index": 0},
            {"english": "forward to", "german": "", "example": "to the trip.", "row_index": 1},
        ]
        result = _merge_continuation_rows(entries)
        assert len(result) == 1
        assert result[0]["english"] == "to look forward to"
        assert result[0]["example"] == "I look forward to the trip."

    def test_merge_adopts_example_when_previous_has_none(self):
        """With no previous example, the continuation's example is taken as-is."""
        entries = [
            {"english": "to put up", "german": "ertragen", "example": "", "row_index": 0},
            {"english": "with something", "german": "", "example": "Put up with it.", "row_index": 1},
        ]
        result = _merge_continuation_rows(entries)
        assert len(result) == 1
        assert result[0]["example"] == "Put up with it."

    def test_no_merge_phonetic_only_row(self):
        """Phonetic-only rows are left alone by this pass, even though they start lowercase."""
        entries = [
            {"english": "dance", "german": "tanzen", "example": "", "row_index": 0},
            {"english": "[dɑːns]", "german": "", "example": "", "row_index": 1},
        ]
        result = _merge_continuation_rows(entries)
        assert len(result) == 2
        assert result[0]["english"] == "dance"
        assert result[1]["english"] == "[dɑːns]"

    def test_no_merge_uppercase_start(self):
        """EN starting with uppercase and empty DE is likely its own entry, not a continuation."""
        entries = [
            {"english": "cat", "german": "Katze", "example": "", "row_index": 0},
            {"english": "Dog", "german": "", "example": "", "row_index": 1},
        ]
        result = _merge_continuation_rows(entries)
        assert len(result) == 2

    def test_no_merge_when_previous_ends_with_period(self):
        """If previous entry ends with sentence terminator, next is not continuation."""
        entries = [
            {"english": "That's great.", "german": "Das ist toll.", "example": "", "row_index": 0},
            {"english": "really nice", "german": "", "example": "", "row_index": 1},
        ]
        result = _merge_continuation_rows(entries)
        assert len(result) == 2

    def test_no_merge_long_text(self):
        """Text with 4+ words is likely an example sentence, not continuation."""
        entries = [
            {"english": "achieve", "german": "erreichen", "example": "", "row_index": 0},
            {"english": "she achieved her goals", "german": "", "example": "", "row_index": 1},
        ]
        result = _merge_continuation_rows(entries)
        assert len(result) == 2

    def test_first_entry_not_merged(self):
        """First entry with empty DE should not crash (no previous) and stays separate."""
        entries = [
            {"english": "something", "german": "", "example": "", "row_index": 0},
            {"english": "cat", "german": "Katze", "example": "", "row_index": 1},
        ]
        result = _merge_continuation_rows(entries)
        assert len(result) == 2
        assert result[0]["english"] == "something"
        assert result[1]["english"] == "cat"
|
||||||
|
|
||||||
|
|
||||||
# =============================================
|
# =============================================
|
||||||
# RUN TESTS
|
# RUN TESTS
|
||||||
# =============================================
|
# =============================================
|
||||||
|
|||||||
Reference in New Issue
Block a user