feat(ocr-pipeline): add Step 5 word recognition (grid from columns × rows)

Backend: build_word_grid() intersects column regions with content rows,
OCRs each cell with language-specific Tesseract, and returns vocabulary
entries with percent-based bounding boxes. New endpoints: POST /words,
GET /image/words-overlay, ground-truth save/retrieve for words.
Frontend: StepWordRecognition with overview + step-through labeling modes,
goToStep callback for row correction feedback loop.
MkDocs: OCR Pipeline documentation added.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-02-28 02:18:29 +01:00
parent 47dc2e6f7a
commit 954103cdf2
9 changed files with 1429 additions and 21 deletions

View File

@@ -1,19 +1,602 @@
'use client'
export function StepWordRecognition() {
return (
<div className="flex flex-col items-center justify-center py-16 text-center">
<div className="text-5xl mb-4">🔤</div>
<h3 className="text-lg font-medium text-gray-700 dark:text-gray-300 mb-2">
Schritt 4: Worterkennung
</h3>
<p className="text-gray-500 dark:text-gray-400 max-w-md">
OCR mit Bounding Boxes fuer jedes erkannte Wort.
Dieser Schritt wird in einer zukuenftigen Version implementiert.
</p>
<div className="mt-6 px-4 py-2 bg-amber-100 dark:bg-amber-900/30 text-amber-700 dark:text-amber-400 rounded-full text-sm font-medium">
Kommt bald
import { useCallback, useEffect, useRef, useState } from 'react'
import type { WordResult, WordEntry, WordGroundTruth } from '@/app/(admin)/ai/ocr-pipeline/types'
const KLAUSUR_API = '/klausur-api'
/** Props for the word-recognition step component. */
interface StepWordRecognitionProps {
/**
 * Pipeline session whose word grid is loaded or detected. While this is `null`
 * the component issues no requests and only renders a placeholder asking the
 * user to finish the earlier steps.
 */
sessionId: string | null
/** Called when the user presses "Weiter" in the controls bar. */
onNext: () => void
/**
 * Jumps to another pipeline step. This component calls it with `3` from the
 * "Zeilen korrigieren (Step 4)" button.
 * NOTE(review): the index looks zero-based (3 = Step 4) — confirm against the parent.
 */
goToStep: (step: number) => void
}
/**
 * Extracts a displayable message from a failed API response.
 *
 * @param res - The non-OK response.
 * @param fallback - Message used when the body carries no non-empty string `detail`.
 * @returns The body's `detail` when it is a non-empty string, otherwise `fallback`.
 */
async function readErrorDetail(res: Response, fallback: string): Promise<string> {
  // A non-JSON body falls back to the HTTP status text.
  const body: unknown = await res.json().catch(() => ({ detail: res.statusText }))
  if (typeof body === 'object' && body !== null && 'detail' in body) {
    const detail = (body as { detail: unknown }).detail
    // `detail` may be a structured value (e.g. a list of validation errors);
    // only a plain string is suitable for direct display.
    if (typeof detail === 'string' && detail !== '') return detail
  }
  return fallback
}

/** Text colour classes for a confidence percentage: >= 70 green, >= 50 yellow, otherwise red. */
function confColor(conf: number): string {
  if (conf >= 70) return 'text-green-600 dark:text-green-400'
  if (conf >= 50) return 'text-yellow-600 dark:text-yellow-400'
  return 'text-red-600 dark:text-red-400'
}

/** Badge classes per entry status. */
const STATUS_BADGE_CLASSES: Record<string, string> = {
  pending: 'bg-gray-100 dark:bg-gray-700 text-gray-500',
  confirmed: 'bg-green-100 dark:bg-green-900/30 text-green-700 dark:text-green-400',
  edited: 'bg-blue-100 dark:bg-blue-900/30 text-blue-700 dark:text-blue-400',
  skipped: 'bg-orange-100 dark:bg-orange-900/30 text-orange-700 dark:text-orange-400',
}

/** Badge classes for a status; a missing or unknown status uses the "pending" style. */
function statusBadge(status?: string): string {
  return STATUS_BADGE_CLASSES[status || 'pending'] || STATUS_BADGE_CLASSES.pending
}

/**
 * Word-recognition step of the OCR pipeline.
 *
 * On mount (and whenever `sessionId` changes) the session is fetched; a stored
 * `word_result` is shown directly, otherwise word detection is started via
 * `POST .../words`. The result can be inspected in an overview (overlay image,
 * summary, entry table) or corrected entry by entry in labeling mode, and a
 * ground-truth verdict can be posted to `.../ground-truth/words`.
 *
 * @param sessionId - Session to work on; `null` renders a placeholder only.
 * @param onNext - Invoked by the "Weiter" button in the controls bar.
 * @param goToStep - Invoked with `3` by the "Zeilen korrigieren (Step 4)" button.
 */
export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRecognitionProps) {
  const [wordResult, setWordResult] = useState<WordResult | null>(null)
  const [detecting, setDetecting] = useState(false)
  const [error, setError] = useState<string | null>(null)
  const [gtNotes, setGtNotes] = useState('')
  const [gtSaved, setGtSaved] = useState(false)
  // Step-through labeling state
  const [activeIndex, setActiveIndex] = useState(0)
  const [editedEntries, setEditedEntries] = useState<WordEntry[]>([])
  const [mode, setMode] = useState<'overview' | 'labeling'>('overview')
  // Cache-buster for the overlay image. It changes only when a result is loaded,
  // so ordinary re-renders (e.g. typing in a field) do not refetch the image.
  const [overlayVersion, setOverlayVersion] = useState(0)
  const enRef = useRef<HTMLInputElement>(null)

  // Resets the editable copy of the entries; entries without a status start as 'pending'.
  const initEntries = useCallback((entries: WordEntry[]) => {
    setEditedEntries(entries.map(e => ({ ...e, status: e.status || 'pending' })))
    setActiveIndex(0)
    setOverlayVersion(Date.now())
  }, [])

  // Runs word detection on the server and replaces the current result.
  const runAutoDetection = useCallback(async () => {
    if (!sessionId) return
    setDetecting(true)
    setError(null)
    try {
      const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/words`, {
        method: 'POST',
      })
      if (!res.ok) {
        throw new Error(await readErrorDetail(res, 'Worterkennung fehlgeschlagen'))
      }
      const data: WordResult = await res.json()
      setWordResult(data)
      initEntries(data.entries)
    } catch (e) {
      setError(e instanceof Error ? e.message : 'Unbekannter Fehler')
    } finally {
      setDetecting(false)
    }
  }, [sessionId, initEntries])

  // Load a stored result for the session, or start detection when there is none.
  useEffect(() => {
    if (!sessionId) return
    // Aborted on cleanup so a superseded effect run (session switch, unmount,
    // StrictMode remount) neither writes stale state nor starts a second detection.
    const controller = new AbortController()
    const fetchSession = async () => {
      try {
        const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}`, {
          signal: controller.signal,
        })
        if (res.ok) {
          const info = await res.json()
          if (controller.signal.aborted) return
          if (info.word_result) {
            setWordResult(info.word_result)
            initEntries(info.word_result.entries)
            return
          }
        }
      } catch (e) {
        if (controller.signal.aborted) return
        console.error('Failed to fetch session info:', e)
      }
      if (controller.signal.aborted) return
      void runAutoDetection()
    }
    void fetchSession()
    return () => controller.abort()
  }, [sessionId, initEntries, runAutoDetection])

  // Posts the ground-truth verdict; corrected entries are sent only when the result is marked wrong.
  const handleGroundTruth = useCallback(async (isCorrect: boolean) => {
    if (!sessionId) return
    const gt: WordGroundTruth = {
      is_correct: isCorrect,
      corrected_entries: isCorrect ? undefined : editedEntries,
      notes: gtNotes || undefined,
    }
    setError(null)
    try {
      const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/ground-truth/words`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify(gt),
      })
      // fetch() resolves on HTTP errors too; only report success for a 2xx response.
      if (!res.ok) {
        throw new Error(await readErrorDetail(res, 'Ground Truth konnte nicht gespeichert werden'))
      }
      setGtSaved(true)
    } catch (e) {
      console.error('Ground truth save failed:', e)
      setError(e instanceof Error ? e.message : 'Unbekannter Fehler')
    }
  }, [sessionId, gtNotes, editedEntries])

  // Step-through: update entry field
  const updateEntry = (index: number, field: 'english' | 'german' | 'example', value: string) => {
    setEditedEntries(prev => prev.map((e, i) =>
      i === index ? { ...e, [field]: value, status: 'edited' as const } : e
    ))
  }

  // Step-through: confirm current entry (an already edited entry keeps its 'edited' status)
  const confirmEntry = () => {
    setEditedEntries(prev => prev.map((e, i) =>
      i === activeIndex ? { ...e, status: e.status === 'edited' ? 'edited' : 'confirmed' } : e
    ))
    if (activeIndex < editedEntries.length - 1) {
      setActiveIndex(activeIndex + 1)
    }
  }

  // Step-through: skip current entry
  const skipEntry = () => {
    setEditedEntries(prev => prev.map((e, i) =>
      i === activeIndex ? { ...e, status: 'skipped' as const } : e
    ))
    if (activeIndex < editedEntries.length - 1) {
      setActiveIndex(activeIndex + 1)
    }
  }

  // Focus english input when active entry changes in labeling mode
  useEffect(() => {
    if (mode === 'labeling' && enRef.current) {
      enRef.current.focus()
    }
  }, [activeIndex, mode])

  // Keyboard shortcuts in labeling mode
  useEffect(() => {
    if (mode !== 'labeling') return
    const handler = (e: KeyboardEvent) => {
      // Keys pressed while an IME composition is active belong to the composition.
      if (e.isComposing) return
      if (e.key === 'Enter' && !e.shiftKey) {
        // Leave Enter to focused buttons, links and multi-line/select controls so
        // they can still be operated by keyboard.
        const target = e.target
        if (target instanceof HTMLElement && target.closest('button, a, textarea, select')) return
        e.preventDefault()
        confirmEntry()
      } else if (e.key === 'ArrowDown' && e.ctrlKey) {
        e.preventDefault()
        skipEntry()
      } else if (e.key === 'ArrowUp' && e.ctrlKey) {
        e.preventDefault()
        if (activeIndex > 0) setActiveIndex(activeIndex - 1)
      }
    }
    window.addEventListener('keydown', handler)
    return () => window.removeEventListener('keydown', handler)
    // confirmEntry/skipEntry are recreated each render; the listed state covers what they read.
    // eslint-disable-next-line react-hooks/exhaustive-deps
  }, [mode, activeIndex, editedEntries])

  if (!sessionId) {
    return (
      <div className="flex flex-col items-center justify-center py-16 text-center">
        <div className="text-5xl mb-4">🔤</div>
        <h3 className="text-lg font-medium text-gray-700 dark:text-gray-300 mb-2">
          Schritt 5: Worterkennung
        </h3>
        <p className="text-gray-500 dark:text-gray-400 max-w-md">
          Bitte zuerst Schritte 1-4 abschliessen.
        </p>
      </div>
    )
  }

  const overlayUrl = `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image/words-overlay`
  const dewarpedUrl = `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image/dewarped`
  const overlaySrc = `${overlayUrl}?t=${overlayVersion}`

  const summary = wordResult?.summary
  const confirmedCount = editedEntries.filter(e => e.status === 'confirmed' || e.status === 'edited').length
  const totalCount = editedEntries.length
  const activeEntry = editedEntries[activeIndex]

  const modeButtonClass = (active: boolean) =>
    `px-3 py-1.5 text-xs rounded-lg font-medium transition-colors ${
      active
        ? 'bg-teal-600 text-white'
        : 'bg-gray-100 dark:bg-gray-700 text-gray-600 dark:text-gray-300 hover:bg-gray-200 dark:hover:bg-gray-600'
    }`

  return (
    <div className="space-y-4">
      {/* Loading */}
      {detecting && (
        <div className="flex items-center gap-2 text-teal-600 dark:text-teal-400 text-sm">
          <div className="animate-spin w-4 h-4 border-2 border-teal-500 border-t-transparent rounded-full" />
          Worterkennung laeuft...
        </div>
      )}

      {/* Mode toggle */}
      {wordResult && (
        <div className="flex items-center gap-2">
          <button
            onClick={() => setMode('overview')}
            className={modeButtonClass(mode === 'overview')}
          >
            Uebersicht
          </button>
          <button
            onClick={() => setMode('labeling')}
            className={modeButtonClass(mode === 'labeling')}
          >
            Labeling ({confirmedCount}/{totalCount})
          </button>
        </div>
      )}

      {/* Overview mode: side-by-side images + entry list */}
      {mode === 'overview' && (
        <>
          {/* Images: overlay vs clean */}
          <div className="grid grid-cols-2 gap-4">
            <div>
              <div className="text-xs font-medium text-gray-500 dark:text-gray-400 mb-1">
                Mit Grid-Overlay
              </div>
              <div className="border rounded-lg overflow-hidden dark:border-gray-700 bg-gray-50 dark:bg-gray-900">
                {wordResult ? (
                  // eslint-disable-next-line @next/next/no-img-element
                  <img
                    src={overlaySrc}
                    alt="Wort-Overlay"
                    className="w-full h-auto"
                  />
                ) : (
                  <div className="aspect-[3/4] flex items-center justify-center text-gray-400 text-sm">
                    {detecting ? 'Erkenne Woerter...' : 'Keine Daten'}
                  </div>
                )}
              </div>
            </div>
            <div>
              <div className="text-xs font-medium text-gray-500 dark:text-gray-400 mb-1">
                Entzerrtes Bild
              </div>
              <div className="border rounded-lg overflow-hidden dark:border-gray-700 bg-gray-50 dark:bg-gray-900">
                {/* eslint-disable-next-line @next/next/no-img-element */}
                <img
                  src={dewarpedUrl}
                  alt="Entzerrt"
                  className="w-full h-auto"
                />
              </div>
            </div>
          </div>

          {/* Result summary */}
          {wordResult && summary && (
            <div className="bg-white dark:bg-gray-800 rounded-xl border border-gray-200 dark:border-gray-700 p-4 space-y-3">
              <div className="flex items-center justify-between">
                <h4 className="text-sm font-medium text-gray-700 dark:text-gray-300">
                  Ergebnis: {summary.total_entries} Eintraege erkannt
                </h4>
                <span className="text-xs text-gray-400">
                  {wordResult.duration_seconds}s
                </span>
              </div>

              {/* Summary badges */}
              <div className="flex gap-2 flex-wrap">
                <span className="px-2 py-0.5 rounded text-xs font-medium bg-blue-100 dark:bg-blue-900/30 text-blue-700 dark:text-blue-300">
                  EN: {summary.with_english}
                </span>
                <span className="px-2 py-0.5 rounded text-xs font-medium bg-green-100 dark:bg-green-900/30 text-green-700 dark:text-green-300">
                  DE: {summary.with_german}
                </span>
                {summary.low_confidence > 0 && (
                  <span className="px-2 py-0.5 rounded text-xs font-medium bg-red-100 dark:bg-red-900/30 text-red-700 dark:text-red-300">
                    Unsicher: {summary.low_confidence}
                  </span>
                )}
              </div>

              {/* Entry table */}
              <div className="max-h-80 overflow-y-auto">
                <table className="w-full text-xs">
                  <thead className="sticky top-0 bg-white dark:bg-gray-800">
                    <tr className="text-left text-gray-500 dark:text-gray-400 border-b dark:border-gray-700">
                      <th className="py-1 pr-2 w-8">#</th>
                      <th className="py-1 pr-2">English</th>
                      <th className="py-1 pr-2">Deutsch</th>
                      <th className="py-1 pr-2">Example</th>
                      <th className="py-1 w-12 text-right">Conf</th>
                    </tr>
                  </thead>
                  <tbody>
                    {editedEntries.map((entry, idx) => (
                      <tr
                        key={idx}
                        className={`border-b dark:border-gray-700/50 ${
                          idx === activeIndex ? 'bg-teal-50 dark:bg-teal-900/20' : ''
                        }`}
                        onClick={() => { setActiveIndex(idx); setMode('labeling') }}
                      >
                        <td className="py-1 pr-2 text-gray-400">{idx + 1}</td>
                        <td className="py-1 pr-2 font-mono text-gray-700 dark:text-gray-300 cursor-pointer">
                          {entry.english || <span className="text-gray-300 dark:text-gray-600">—</span>}
                        </td>
                        <td className="py-1 pr-2 font-mono text-gray-700 dark:text-gray-300 cursor-pointer">
                          {entry.german || <span className="text-gray-300 dark:text-gray-600">—</span>}
                        </td>
                        <td className="py-1 pr-2 font-mono text-gray-500 dark:text-gray-400 cursor-pointer max-w-[200px] truncate">
                          {entry.example || <span className="text-gray-300 dark:text-gray-600">—</span>}
                        </td>
                        <td className={`py-1 text-right font-mono ${confColor(entry.confidence)}`}>
                          {entry.confidence}%
                        </td>
                      </tr>
                    ))}
                  </tbody>
                </table>
              </div>
            </div>
          )}
        </>
      )}

      {/* Labeling mode: image crop + editable fields */}
      {mode === 'labeling' && editedEntries.length > 0 && (
        <div className="grid grid-cols-3 gap-4">
          {/* Left 2/3: Image with highlighted active row */}
          <div className="col-span-2">
            <div className="text-xs font-medium text-gray-500 dark:text-gray-400 mb-1">
              Eintrag {activeIndex + 1} von {editedEntries.length}
            </div>
            <div className="border rounded-lg overflow-hidden dark:border-gray-700 bg-gray-50 dark:bg-gray-900 relative">
              {/* eslint-disable-next-line @next/next/no-img-element */}
              <img
                src={overlaySrc}
                alt="Wort-Overlay"
                className="w-full h-auto"
              />
              {/* Highlight overlay for active entry bbox */}
              {activeEntry?.bbox && (
                <div
                  className="absolute border-2 border-yellow-400 bg-yellow-400/10 pointer-events-none"
                  style={{
                    left: `${activeEntry.bbox.x}%`,
                    top: `${activeEntry.bbox.y}%`,
                    width: `${activeEntry.bbox.w}%`,
                    height: `${activeEntry.bbox.h}%`,
                  }}
                />
              )}
            </div>
          </div>

          {/* Right 1/3: Editable entry fields */}
          <div className="space-y-3">
            {/* Navigation */}
            <div className="flex items-center justify-between">
              <button
                onClick={() => setActiveIndex(Math.max(0, activeIndex - 1))}
                disabled={activeIndex === 0}
                className="px-2 py-1 text-xs border rounded hover:bg-gray-50 dark:hover:bg-gray-700 dark:border-gray-600 disabled:opacity-30"
              >
                Zurueck
              </button>
              <span className="text-xs text-gray-500">{activeIndex + 1} / {editedEntries.length}</span>
              <button
                onClick={() => setActiveIndex(Math.min(editedEntries.length - 1, activeIndex + 1))}
                disabled={activeIndex >= editedEntries.length - 1}
                className="px-2 py-1 text-xs border rounded hover:bg-gray-50 dark:hover:bg-gray-700 dark:border-gray-600 disabled:opacity-30"
              >
                Weiter
              </button>
            </div>

            {/* Status badge */}
            <div className="flex items-center gap-2">
              <span className={`px-2 py-0.5 rounded text-[10px] uppercase font-semibold ${statusBadge(activeEntry?.status)}`}>
                {activeEntry?.status || 'pending'}
              </span>
              <span className={`text-xs font-mono ${confColor(activeEntry?.confidence ?? 0)}`}>
                {activeEntry?.confidence}% Konfidenz
              </span>
            </div>

            {/* Cell crops */}
            {activeEntry?.bbox_en && (
              <div>
                <div className="text-[10px] font-medium text-blue-500 mb-0.5">EN-Zelle</div>
                <div className="border rounded dark:border-gray-700 overflow-hidden bg-white dark:bg-gray-900 h-10 relative">
                  <CellCrop
                    imageUrl={dewarpedUrl}
                    bbox={activeEntry.bbox_en}
                  />
                </div>
              </div>
            )}
            {activeEntry?.bbox_de && (
              <div>
                <div className="text-[10px] font-medium text-green-500 mb-0.5">DE-Zelle</div>
                <div className="border rounded dark:border-gray-700 overflow-hidden bg-white dark:bg-gray-900 h-10 relative">
                  <CellCrop
                    imageUrl={dewarpedUrl}
                    bbox={activeEntry.bbox_de}
                  />
                </div>
              </div>
            )}

            {/* Editable fields */}
            <div className="space-y-2">
              <div>
                <label className="text-[10px] font-medium text-gray-500 dark:text-gray-400">English</label>
                <input
                  ref={enRef}
                  type="text"
                  value={activeEntry?.english || ''}
                  onChange={(e) => updateEntry(activeIndex, 'english', e.target.value)}
                  className="w-full px-2 py-1.5 text-sm border rounded dark:bg-gray-700 dark:border-gray-600 font-mono"
                />
              </div>
              <div>
                <label className="text-[10px] font-medium text-gray-500 dark:text-gray-400">Deutsch</label>
                <input
                  type="text"
                  value={activeEntry?.german || ''}
                  onChange={(e) => updateEntry(activeIndex, 'german', e.target.value)}
                  className="w-full px-2 py-1.5 text-sm border rounded dark:bg-gray-700 dark:border-gray-600 font-mono"
                />
              </div>
              <div>
                <label className="text-[10px] font-medium text-gray-500 dark:text-gray-400">Example</label>
                <input
                  type="text"
                  value={activeEntry?.example || ''}
                  onChange={(e) => updateEntry(activeIndex, 'example', e.target.value)}
                  className="w-full px-2 py-1.5 text-sm border rounded dark:bg-gray-700 dark:border-gray-600 font-mono"
                />
              </div>
            </div>

            {/* Action buttons */}
            <div className="flex gap-2">
              <button
                onClick={confirmEntry}
                className="flex-1 px-3 py-1.5 text-xs bg-green-600 text-white rounded-lg hover:bg-green-700 font-medium"
              >
                Bestaetigen (Enter)
              </button>
              <button
                onClick={skipEntry}
                className="px-3 py-1.5 text-xs border rounded-lg hover:bg-gray-50 dark:hover:bg-gray-700 dark:border-gray-600"
              >
                Skip
              </button>
            </div>

            {/* Shortcuts hint */}
            <div className="text-[10px] text-gray-400 space-y-0.5">
              <div>Enter = Bestaetigen & weiter</div>
              <div>Ctrl+↓ = Ueberspringen</div>
              <div>Ctrl+↑ = Zurueck</div>
            </div>

            {/* Entry list (compact) */}
            <div className="border-t dark:border-gray-700 pt-2 mt-2">
              <div className="text-[10px] font-medium text-gray-500 dark:text-gray-400 mb-1">
                Alle Eintraege
              </div>
              <div className="max-h-48 overflow-y-auto space-y-0.5">
                {editedEntries.map((entry, idx) => (
                  <div
                    key={idx}
                    onClick={() => setActiveIndex(idx)}
                    className={`flex items-center gap-1 px-2 py-1 rounded text-[10px] cursor-pointer transition-colors ${
                      idx === activeIndex
                        ? 'bg-teal-50 dark:bg-teal-900/30 border border-teal-200 dark:border-teal-700'
                        : 'hover:bg-gray-50 dark:hover:bg-gray-700/50'
                    }`}
                  >
                    <span className="w-4 text-right text-gray-400">{idx + 1}</span>
                    <span className={`w-2 h-2 rounded-full ${
                      entry.status === 'confirmed' ? 'bg-green-500' :
                      entry.status === 'edited' ? 'bg-blue-500' :
                      entry.status === 'skipped' ? 'bg-orange-400' :
                      'bg-gray-300 dark:bg-gray-600'
                    }`} />
                    <span className="truncate text-gray-600 dark:text-gray-400 font-mono">
                      {entry.english || '—'} {entry.german || '—'}
                    </span>
                  </div>
                ))}
              </div>
            </div>
          </div>
        </div>
      )}

      {/* Controls */}
      {wordResult && (
        <div className="bg-white dark:bg-gray-800 rounded-xl border border-gray-200 dark:border-gray-700 p-4 space-y-3">
          <div className="flex items-center gap-3 flex-wrap">
            <button
              onClick={() => runAutoDetection()}
              disabled={detecting}
              className="px-3 py-1.5 text-xs border rounded-lg hover:bg-gray-50 dark:hover:bg-gray-700 dark:border-gray-600 disabled:opacity-50"
            >
              Erneut erkennen
            </button>
            <button
              onClick={() => goToStep(3)}
              className="px-3 py-1.5 text-xs border rounded-lg hover:bg-gray-50 dark:hover:bg-gray-700 dark:border-gray-600 text-orange-600 dark:text-orange-400 border-orange-300 dark:border-orange-700"
            >
              Zeilen korrigieren (Step 4)
            </button>
            <div className="flex-1" />

            {/* Ground truth */}
            {!gtSaved ? (
              <>
                <input
                  type="text"
                  placeholder="Notizen (optional)"
                  value={gtNotes}
                  onChange={(e) => setGtNotes(e.target.value)}
                  className="px-2 py-1 text-xs border rounded dark:bg-gray-700 dark:border-gray-600 w-48"
                />
                <button
                  onClick={() => handleGroundTruth(true)}
                  className="px-3 py-1.5 text-xs bg-green-600 text-white rounded-lg hover:bg-green-700"
                >
                  Korrekt
                </button>
                <button
                  onClick={() => handleGroundTruth(false)}
                  className="px-3 py-1.5 text-xs bg-red-600 text-white rounded-lg hover:bg-red-700"
                >
                  Fehlerhaft
                </button>
              </>
            ) : (
              <span className="text-xs text-green-600 dark:text-green-400">
                Ground Truth gespeichert
              </span>
            )}
            <button
              onClick={onNext}
              className="px-4 py-1.5 text-xs bg-teal-600 text-white rounded-lg hover:bg-teal-700 font-medium"
            >
              Weiter
            </button>
          </div>
        </div>
      )}

      {error && (
        <div className="p-3 bg-red-50 dark:bg-red-900/20 text-red-600 dark:text-red-400 rounded-lg text-sm">
          {error}
        </div>
      )}
    </div>
  )
}
/**
 * CellCrop: Shows a cropped portion of the dewarped image based on percent bbox.
 *
 * The image is rendered enlarged inside a clipping container and shifted with
 * `transform: translate()`. Translate percentages resolve against the image's
 * own rendered size, so shifting by `-bbox.x% / -bbox.y%` puts the cell's
 * top-left corner at the container's top-left corner regardless of the
 * image's aspect ratio or the container's height.
 *
 * @param imageUrl - URL of the full image to crop from.
 * @param bbox - Cell rectangle in percent of the image (x/y = top-left, w/h = size).
 */
function CellCrop({ imageUrl, bbox }: { imageUrl: string; bbox: { x: number; y: number; w: number; h: number } }) {
  // Scale factor: how much to zoom into the cell
  const scaleX = 100 / bbox.w
  const scaleY = 100 / bbox.h
  const rawScale = Math.min(scaleX, scaleY, 8) // Cap zoom at 8x
  // A degenerate bbox (NaN or negative size) would yield an unusable width; show the image unscaled instead.
  const scale = Number.isFinite(rawScale) && rawScale > 0 ? rawScale : 1
  return (
    <div className="w-full h-full relative overflow-hidden">
      {/* Decorative: the surrounding label already names the cell. */}
      {/* eslint-disable-next-line @next/next/no-img-element */}
      <img
        src={imageUrl}
        alt=""
        // max-w-none lets the image grow beyond the container width.
        className="absolute top-0 left-0 max-w-none h-auto"
        style={{
          width: `${scale * 100}%`,
          transform: `translate(${-bbox.x}%, ${-bbox.y}%)`,
        }}
      />
    </div>
  )
}