feat(ocr-pipeline): add Step 5 word recognition (grid from columns × rows)

Backend: build_word_grid() intersects column regions with content rows,
OCRs each cell with language-specific Tesseract, and returns vocabulary
entries with percent-based bounding boxes. New endpoints: POST /words,
GET /image/words-overlay, ground-truth save/retrieve for words.
Frontend: StepWordRecognition with overview + step-through labeling modes,
goToStep callback for row correction feedback loop.
MkDocs: OCR Pipeline documentation added.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-02-28 02:18:29 +01:00
parent 47dc2e6f7a
commit 954103cdf2
9 changed files with 1429 additions and 21 deletions

View File

@@ -112,6 +112,16 @@ export default function OcrPipelinePage() {
}
}
// Jump directly to an arbitrary step and recompute every step's status:
// everything before the target is completed, the target is active, the rest pending.
const goToStep = (step: number) => {
  setCurrentStep(step)
  setSteps((prev) =>
    prev.map((item, idx) => {
      const status = idx < step ? 'completed' : idx === step ? 'active' : 'pending'
      return { ...item, status }
    }),
  )
}
const handleNext = () => {
if (currentStep < steps.length - 1) {
setSteps((prev) =>
@@ -161,7 +171,7 @@ export default function OcrPipelinePage() {
case 3:
return <StepRowDetection sessionId={sessionId} onNext={handleNext} />
case 4:
return <StepWordRecognition />
return <StepWordRecognition sessionId={sessionId} onNext={handleNext} goToStep={goToStep} />
case 5:
return <StepCoordinates />
case 6:

View File

@@ -29,6 +29,7 @@ export interface SessionInfo {
dewarp_result?: DewarpResult
column_result?: ColumnResult
row_result?: RowResult
word_result?: WordResult
}
export interface DeskewResult {
@@ -116,6 +117,46 @@ export interface RowGroundTruth {
notes?: string
}
/**
 * Bounding box expressed in percent (0-100) of the dewarped image size,
 * so the frontend can position overlays independently of image resolution.
 */
export interface WordBbox {
  x: number // left edge, percent of image width
  y: number // top edge, percent of image height
  w: number // width, percent of image width
  h: number // height, percent of image height
}
/** One recognized vocabulary entry (one content row of the word grid). */
export interface WordEntry {
  row_index: number // index within the content rows (header/footer rows excluded)
  english: string // OCR text of the EN cell ('' when empty/not recognized)
  german: string // OCR text of the DE cell
  example: string // OCR text of the example cell
  confidence: number // OCR confidence in percent (0-100)
  bbox: WordBbox // full row bounds, percent-based
  bbox_en: WordBbox | null // EN cell bounds; null when the column is absent
  bbox_de: WordBbox | null // DE cell bounds; null when the column is absent
  bbox_ex: WordBbox | null // example cell bounds; null when the column is absent
  // Labeling state used by the frontend step-through editor only.
  status?: 'pending' | 'confirmed' | 'edited' | 'skipped'
}
/** Result payload of POST /sessions/{id}/words (Step 5 word recognition). */
export interface WordResult {
  entries: WordEntry[] // one entry per content row
  entry_count: number // equals entries.length, precomputed by the backend
  image_width: number // dewarped image width in pixels
  image_height: number // dewarped image height in pixels
  duration_seconds: number // backend processing time
  summary: {
    total_entries: number // all recognized entries
    with_english: number // entries with a non-empty EN cell
    with_german: number // entries with a non-empty DE cell
    low_confidence: number // entries below the backend confidence threshold
  }
}
/** Ground-truth feedback for Step 5, sent to /ground-truth/words. */
export interface WordGroundTruth {
  is_correct: boolean // true = result accepted as-is
  corrected_entries?: WordEntry[] // manual corrections; only sent when is_correct is false
  notes?: string // optional free-text reviewer note
}
export const PIPELINE_STEPS: PipelineStep[] = [
{ id: 'deskew', name: 'Begradigung', icon: '📐', status: 'pending' },
{ id: 'dewarp', name: 'Entzerrung', icon: '🔧', status: 'pending' },

View File

@@ -1,19 +1,602 @@
'use client'
export function StepWordRecognition() {
return (
<div className="flex flex-col items-center justify-center py-16 text-center">
<div className="text-5xl mb-4">🔤</div>
<h3 className="text-lg font-medium text-gray-700 dark:text-gray-300 mb-2">
Schritt 4: Worterkennung
</h3>
<p className="text-gray-500 dark:text-gray-400 max-w-md">
OCR mit Bounding Boxes fuer jedes erkannte Wort.
Dieser Schritt wird in einer zukuenftigen Version implementiert.
</p>
<div className="mt-6 px-4 py-2 bg-amber-100 dark:bg-amber-900/30 text-amber-700 dark:text-amber-400 rounded-full text-sm font-medium">
Kommt bald
import { useCallback, useEffect, useRef, useState } from 'react'
import type { WordResult, WordEntry, WordGroundTruth } from '@/app/(admin)/ai/ocr-pipeline/types'
const KLAUSUR_API = '/klausur-api'
interface StepWordRecognitionProps {
  /** Active pipeline session id; null until a session has been created. */
  sessionId: string | null
  /** Advance the stepper to the next pipeline step. */
  onNext: () => void
  /** Jump to an arbitrary step (0-based), e.g. back to row correction (step 3). */
  goToStep: (step: number) => void
}
/**
 * Step 5: word recognition (grid of columns × rows, OCR per cell).
 *
 * Two UI modes:
 *  - 'overview': grid-overlay image next to the clean dewarped image plus a
 *    table of all recognized entries.
 *  - 'labeling': step-through editor for one entry at a time, with per-cell
 *    image crops and keyboard shortcuts.
 *
 * Fixes over the previous revision:
 *  - handleGroundTruth now checks `res.ok`; before, a 4xx/5xx response still
 *    flipped the UI to "Ground Truth gespeichert" (only network failures hit
 *    the catch block).
 *  - The overlay image cache-buster is a piece of state bumped only when a new
 *    word result arrives; previously `Date.now()` was evaluated inside JSX on
 *    every render, so each keystroke in labeling mode refetched the image.
 */
export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRecognitionProps) {
  const [wordResult, setWordResult] = useState<WordResult | null>(null)
  const [detecting, setDetecting] = useState(false)
  const [error, setError] = useState<string | null>(null)
  const [gtNotes, setGtNotes] = useState('')
  const [gtSaved, setGtSaved] = useState(false)
  // Cache-buster for the overlay image; bumped only when a fresh result arrives.
  const [overlayStamp, setOverlayStamp] = useState(() => Date.now())
  // Step-through labeling state
  const [activeIndex, setActiveIndex] = useState(0)
  const [editedEntries, setEditedEntries] = useState<WordEntry[]>([])
  const [mode, setMode] = useState<'overview' | 'labeling'>('overview')
  const enRef = useRef<HTMLInputElement>(null)

  // On mount / session change: reuse a stored word result if the session has
  // one, otherwise kick off automatic detection.
  useEffect(() => {
    if (!sessionId) return
    const fetchSession = async () => {
      try {
        const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}`)
        if (res.ok) {
          const info = await res.json()
          if (info.word_result) {
            setWordResult(info.word_result)
            initEntries(info.word_result.entries)
            setOverlayStamp(Date.now())
            return
          }
        }
      } catch (e) {
        console.error('Failed to fetch session info:', e)
      }
      runAutoDetection()
    }
    fetchSession()
    // eslint-disable-next-line react-hooks/exhaustive-deps
  }, [sessionId])

  // Seed the editable entry list; entries without a status default to 'pending'.
  const initEntries = (entries: WordEntry[]) => {
    setEditedEntries(entries.map(e => ({ ...e, status: e.status || 'pending' })))
    setActiveIndex(0)
  }

  // Run (or re-run) word recognition on the backend.
  const runAutoDetection = useCallback(async () => {
    if (!sessionId) return
    setDetecting(true)
    setError(null)
    try {
      const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/words`, {
        method: 'POST',
      })
      if (!res.ok) {
        const err = await res.json().catch(() => ({ detail: res.statusText }))
        throw new Error(err.detail || 'Worterkennung fehlgeschlagen')
      }
      const data: WordResult = await res.json()
      setWordResult(data)
      initEntries(data.entries)
      setOverlayStamp(Date.now())
    } catch (e) {
      setError(e instanceof Error ? e.message : 'Unbekannter Fehler')
    } finally {
      setDetecting(false)
    }
  }, [sessionId])

  // Persist ground-truth feedback for this step.
  const handleGroundTruth = useCallback(async (isCorrect: boolean) => {
    if (!sessionId) return
    const gt: WordGroundTruth = {
      is_correct: isCorrect,
      corrected_entries: isCorrect ? undefined : editedEntries,
      notes: gtNotes || undefined,
    }
    try {
      const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/ground-truth/words`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify(gt),
      })
      // Only report success for a 2xx response (bug fix, see component docs).
      if (!res.ok) {
        throw new Error(`HTTP ${res.status}`)
      }
      setGtSaved(true)
    } catch (e) {
      console.error('Ground truth save failed:', e)
      setError('Ground Truth konnte nicht gespeichert werden')
    }
  }, [sessionId, gtNotes, editedEntries])

  // Step-through: update one text field of an entry; marks the entry 'edited'.
  const updateEntry = (index: number, field: 'english' | 'german' | 'example', value: string) => {
    setEditedEntries(prev => prev.map((e, i) =>
      i === index ? { ...e, [field]: value, status: 'edited' as const } : e
    ))
  }

  // Step-through: confirm current entry and advance (edited entries keep 'edited').
  const confirmEntry = () => {
    setEditedEntries(prev => prev.map((e, i) =>
      i === activeIndex ? { ...e, status: e.status === 'edited' ? 'edited' : 'confirmed' } : e
    ))
    if (activeIndex < editedEntries.length - 1) {
      setActiveIndex(activeIndex + 1)
    }
  }

  // Step-through: skip current entry and advance.
  const skipEntry = () => {
    setEditedEntries(prev => prev.map((e, i) =>
      i === activeIndex ? { ...e, status: 'skipped' as const } : e
    ))
    if (activeIndex < editedEntries.length - 1) {
      setActiveIndex(activeIndex + 1)
    }
  }

  // Focus english input when active entry changes in labeling mode
  useEffect(() => {
    if (mode === 'labeling' && enRef.current) {
      enRef.current.focus()
    }
  }, [activeIndex, mode])

  // Keyboard shortcuts in labeling mode (Enter fires even while an input is
  // focused — intentional, see the shortcuts hint rendered below).
  useEffect(() => {
    if (mode !== 'labeling') return
    const handler = (e: KeyboardEvent) => {
      if (e.key === 'Enter' && !e.shiftKey) {
        e.preventDefault()
        confirmEntry()
      } else if (e.key === 'Tab' && !e.shiftKey) {
        // Let Tab move between fields naturally unless on last field
      } else if (e.key === 'ArrowDown' && e.ctrlKey) {
        e.preventDefault()
        skipEntry()
      } else if (e.key === 'ArrowUp' && e.ctrlKey) {
        e.preventDefault()
        if (activeIndex > 0) setActiveIndex(activeIndex - 1)
      }
    }
    window.addEventListener('keydown', handler)
    return () => window.removeEventListener('keydown', handler)
    // eslint-disable-next-line react-hooks/exhaustive-deps
  }, [mode, activeIndex, editedEntries])

  // Guard: this step needs a session produced by steps 1-4.
  if (!sessionId) {
    return (
      <div className="flex flex-col items-center justify-center py-16 text-center">
        <div className="text-5xl mb-4">🔤</div>
        <h3 className="text-lg font-medium text-gray-700 dark:text-gray-300 mb-2">
          Schritt 5: Worterkennung
        </h3>
        <p className="text-gray-500 dark:text-gray-400 max-w-md">
          Bitte zuerst Schritte 1-4 abschliessen.
        </p>
      </div>
    )
  }

  const overlayUrl = `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image/words-overlay`
  const dewarpedUrl = `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image/dewarped`

  // Traffic-light text color for a confidence percentage.
  const confColor = (conf: number) => {
    if (conf >= 70) return 'text-green-600 dark:text-green-400'
    if (conf >= 50) return 'text-yellow-600 dark:text-yellow-400'
    return 'text-red-600 dark:text-red-400'
  }

  // Badge classes for an entry status (falls back to 'pending').
  const statusBadge = (status?: string) => {
    const map: Record<string, string> = {
      pending: 'bg-gray-100 dark:bg-gray-700 text-gray-500',
      confirmed: 'bg-green-100 dark:bg-green-900/30 text-green-700 dark:text-green-400',
      edited: 'bg-blue-100 dark:bg-blue-900/30 text-blue-700 dark:text-blue-400',
      skipped: 'bg-orange-100 dark:bg-orange-900/30 text-orange-700 dark:text-orange-400',
    }
    return map[status || 'pending'] || map.pending
  }

  const summary = wordResult?.summary
  const confirmedCount = editedEntries.filter(e => e.status === 'confirmed' || e.status === 'edited').length
  const totalCount = editedEntries.length

  return (
    <div className="space-y-4">
      {/* Loading */}
      {detecting && (
        <div className="flex items-center gap-2 text-teal-600 dark:text-teal-400 text-sm">
          <div className="animate-spin w-4 h-4 border-2 border-teal-500 border-t-transparent rounded-full" />
          Worterkennung laeuft...
        </div>
      )}

      {/* Mode toggle */}
      {wordResult && (
        <div className="flex items-center gap-2">
          <button
            onClick={() => setMode('overview')}
            className={`px-3 py-1.5 text-xs rounded-lg font-medium transition-colors ${
              mode === 'overview'
                ? 'bg-teal-600 text-white'
                : 'bg-gray-100 dark:bg-gray-700 text-gray-600 dark:text-gray-300 hover:bg-gray-200 dark:hover:bg-gray-600'
            }`}
          >
            Uebersicht
          </button>
          <button
            onClick={() => setMode('labeling')}
            className={`px-3 py-1.5 text-xs rounded-lg font-medium transition-colors ${
              mode === 'labeling'
                ? 'bg-teal-600 text-white'
                : 'bg-gray-100 dark:bg-gray-700 text-gray-600 dark:text-gray-300 hover:bg-gray-200 dark:hover:bg-gray-600'
            }`}
          >
            Labeling ({confirmedCount}/{totalCount})
          </button>
        </div>
      )}

      {/* Overview mode: side-by-side images + entry list */}
      {mode === 'overview' && (
        <>
          {/* Images: overlay vs clean */}
          <div className="grid grid-cols-2 gap-4">
            <div>
              <div className="text-xs font-medium text-gray-500 dark:text-gray-400 mb-1">
                Mit Grid-Overlay
              </div>
              <div className="border rounded-lg overflow-hidden dark:border-gray-700 bg-gray-50 dark:bg-gray-900">
                {wordResult ? (
                  // eslint-disable-next-line @next/next/no-img-element
                  <img
                    src={`${overlayUrl}?t=${overlayStamp}`}
                    alt="Wort-Overlay"
                    className="w-full h-auto"
                  />
                ) : (
                  <div className="aspect-[3/4] flex items-center justify-center text-gray-400 text-sm">
                    {detecting ? 'Erkenne Woerter...' : 'Keine Daten'}
                  </div>
                )}
              </div>
            </div>
            <div>
              <div className="text-xs font-medium text-gray-500 dark:text-gray-400 mb-1">
                Entzerrtes Bild
              </div>
              <div className="border rounded-lg overflow-hidden dark:border-gray-700 bg-gray-50 dark:bg-gray-900">
                {/* eslint-disable-next-line @next/next/no-img-element */}
                <img
                  src={dewarpedUrl}
                  alt="Entzerrt"
                  className="w-full h-auto"
                />
              </div>
            </div>
          </div>

          {/* Result summary */}
          {wordResult && summary && (
            <div className="bg-white dark:bg-gray-800 rounded-xl border border-gray-200 dark:border-gray-700 p-4 space-y-3">
              <div className="flex items-center justify-between">
                <h4 className="text-sm font-medium text-gray-700 dark:text-gray-300">
                  Ergebnis: {summary.total_entries} Eintraege erkannt
                </h4>
                <span className="text-xs text-gray-400">
                  {wordResult.duration_seconds}s
                </span>
              </div>

              {/* Summary badges */}
              <div className="flex gap-2 flex-wrap">
                <span className="px-2 py-0.5 rounded text-xs font-medium bg-blue-100 dark:bg-blue-900/30 text-blue-700 dark:text-blue-300">
                  EN: {summary.with_english}
                </span>
                <span className="px-2 py-0.5 rounded text-xs font-medium bg-green-100 dark:bg-green-900/30 text-green-700 dark:text-green-300">
                  DE: {summary.with_german}
                </span>
                {summary.low_confidence > 0 && (
                  <span className="px-2 py-0.5 rounded text-xs font-medium bg-red-100 dark:bg-red-900/30 text-red-700 dark:text-red-300">
                    Unsicher: {summary.low_confidence}
                  </span>
                )}
              </div>

              {/* Entry table; clicking a row jumps into labeling mode */}
              <div className="max-h-80 overflow-y-auto">
                <table className="w-full text-xs">
                  <thead className="sticky top-0 bg-white dark:bg-gray-800">
                    <tr className="text-left text-gray-500 dark:text-gray-400 border-b dark:border-gray-700">
                      <th className="py-1 pr-2 w-8">#</th>
                      <th className="py-1 pr-2">English</th>
                      <th className="py-1 pr-2">Deutsch</th>
                      <th className="py-1 pr-2">Example</th>
                      <th className="py-1 w-12 text-right">Conf</th>
                    </tr>
                  </thead>
                  <tbody>
                    {editedEntries.map((entry, idx) => (
                      <tr
                        key={idx}
                        className={`border-b dark:border-gray-700/50 ${
                          idx === activeIndex ? 'bg-teal-50 dark:bg-teal-900/20' : ''
                        }`}
                        onClick={() => { setActiveIndex(idx); setMode('labeling') }}
                      >
                        <td className="py-1 pr-2 text-gray-400">{idx + 1}</td>
                        <td className="py-1 pr-2 font-mono text-gray-700 dark:text-gray-300 cursor-pointer">
                          {entry.english || <span className="text-gray-300 dark:text-gray-600"></span>}
                        </td>
                        <td className="py-1 pr-2 font-mono text-gray-700 dark:text-gray-300 cursor-pointer">
                          {entry.german || <span className="text-gray-300 dark:text-gray-600"></span>}
                        </td>
                        <td className="py-1 pr-2 font-mono text-gray-500 dark:text-gray-400 cursor-pointer max-w-[200px] truncate">
                          {entry.example || <span className="text-gray-300 dark:text-gray-600"></span>}
                        </td>
                        <td className={`py-1 text-right font-mono ${confColor(entry.confidence)}`}>
                          {entry.confidence}%
                        </td>
                      </tr>
                    ))}
                  </tbody>
                </table>
              </div>
            </div>
          )}
        </>
      )}

      {/* Labeling mode: image crop + editable fields */}
      {mode === 'labeling' && editedEntries.length > 0 && (
        <div className="grid grid-cols-3 gap-4">
          {/* Left 2/3: Image with highlighted active row */}
          <div className="col-span-2">
            <div className="text-xs font-medium text-gray-500 dark:text-gray-400 mb-1">
              Eintrag {activeIndex + 1} von {editedEntries.length}
            </div>
            <div className="border rounded-lg overflow-hidden dark:border-gray-700 bg-gray-50 dark:bg-gray-900 relative">
              {/* eslint-disable-next-line @next/next/no-img-element */}
              <img
                src={`${overlayUrl}?t=${overlayStamp}`}
                alt="Wort-Overlay"
                className="w-full h-auto"
              />
              {/* Highlight overlay for active entry bbox (percent-based) */}
              {editedEntries[activeIndex]?.bbox && (
                <div
                  className="absolute border-2 border-yellow-400 bg-yellow-400/10 pointer-events-none"
                  style={{
                    left: `${editedEntries[activeIndex].bbox.x}%`,
                    top: `${editedEntries[activeIndex].bbox.y}%`,
                    width: `${editedEntries[activeIndex].bbox.w}%`,
                    height: `${editedEntries[activeIndex].bbox.h}%`,
                  }}
                />
              )}
            </div>
          </div>

          {/* Right 1/3: Editable entry fields */}
          <div className="space-y-3">
            {/* Navigation */}
            <div className="flex items-center justify-between">
              <button
                onClick={() => setActiveIndex(Math.max(0, activeIndex - 1))}
                disabled={activeIndex === 0}
                className="px-2 py-1 text-xs border rounded hover:bg-gray-50 dark:hover:bg-gray-700 dark:border-gray-600 disabled:opacity-30"
              >
                Zurueck
              </button>
              <span className="text-xs text-gray-500">{activeIndex + 1} / {editedEntries.length}</span>
              <button
                onClick={() => setActiveIndex(Math.min(editedEntries.length - 1, activeIndex + 1))}
                disabled={activeIndex >= editedEntries.length - 1}
                className="px-2 py-1 text-xs border rounded hover:bg-gray-50 dark:hover:bg-gray-700 dark:border-gray-600 disabled:opacity-30"
              >
                Weiter
              </button>
            </div>

            {/* Status badge */}
            <div className="flex items-center gap-2">
              <span className={`px-2 py-0.5 rounded text-[10px] uppercase font-semibold ${statusBadge(editedEntries[activeIndex]?.status)}`}>
                {editedEntries[activeIndex]?.status || 'pending'}
              </span>
              <span className={`text-xs font-mono ${confColor(editedEntries[activeIndex]?.confidence || 0)}`}>
                {editedEntries[activeIndex]?.confidence}% Konfidenz
              </span>
            </div>

            {/* Cell crops */}
            {editedEntries[activeIndex]?.bbox_en && (
              <div>
                <div className="text-[10px] font-medium text-blue-500 mb-0.5">EN-Zelle</div>
                <div className="border rounded dark:border-gray-700 overflow-hidden bg-white dark:bg-gray-900 h-10 relative">
                  <CellCrop
                    imageUrl={dewarpedUrl}
                    bbox={editedEntries[activeIndex].bbox_en!}
                  />
                </div>
              </div>
            )}
            {editedEntries[activeIndex]?.bbox_de && (
              <div>
                <div className="text-[10px] font-medium text-green-500 mb-0.5">DE-Zelle</div>
                <div className="border rounded dark:border-gray-700 overflow-hidden bg-white dark:bg-gray-900 h-10 relative">
                  <CellCrop
                    imageUrl={dewarpedUrl}
                    bbox={editedEntries[activeIndex].bbox_de!}
                  />
                </div>
              </div>
            )}

            {/* Editable fields */}
            <div className="space-y-2">
              <div>
                <label className="text-[10px] font-medium text-gray-500 dark:text-gray-400">English</label>
                <input
                  ref={enRef}
                  type="text"
                  value={editedEntries[activeIndex]?.english || ''}
                  onChange={(e) => updateEntry(activeIndex, 'english', e.target.value)}
                  className="w-full px-2 py-1.5 text-sm border rounded dark:bg-gray-700 dark:border-gray-600 font-mono"
                />
              </div>
              <div>
                <label className="text-[10px] font-medium text-gray-500 dark:text-gray-400">Deutsch</label>
                <input
                  type="text"
                  value={editedEntries[activeIndex]?.german || ''}
                  onChange={(e) => updateEntry(activeIndex, 'german', e.target.value)}
                  className="w-full px-2 py-1.5 text-sm border rounded dark:bg-gray-700 dark:border-gray-600 font-mono"
                />
              </div>
              <div>
                <label className="text-[10px] font-medium text-gray-500 dark:text-gray-400">Example</label>
                <input
                  type="text"
                  value={editedEntries[activeIndex]?.example || ''}
                  onChange={(e) => updateEntry(activeIndex, 'example', e.target.value)}
                  className="w-full px-2 py-1.5 text-sm border rounded dark:bg-gray-700 dark:border-gray-600 font-mono"
                />
              </div>
            </div>

            {/* Action buttons */}
            <div className="flex gap-2">
              <button
                onClick={confirmEntry}
                className="flex-1 px-3 py-1.5 text-xs bg-green-600 text-white rounded-lg hover:bg-green-700 font-medium"
              >
                Bestaetigen (Enter)
              </button>
              <button
                onClick={skipEntry}
                className="px-3 py-1.5 text-xs border rounded-lg hover:bg-gray-50 dark:hover:bg-gray-700 dark:border-gray-600"
              >
                Skip
              </button>
            </div>

            {/* Shortcuts hint */}
            <div className="text-[10px] text-gray-400 space-y-0.5">
              <div>Enter = Bestaetigen & weiter</div>
              <div>Ctrl+ = Ueberspringen</div>
              <div>Ctrl+ = Zurueck</div>
            </div>

            {/* Entry list (compact) */}
            <div className="border-t dark:border-gray-700 pt-2 mt-2">
              <div className="text-[10px] font-medium text-gray-500 dark:text-gray-400 mb-1">
                Alle Eintraege
              </div>
              <div className="max-h-48 overflow-y-auto space-y-0.5">
                {editedEntries.map((entry, idx) => (
                  <div
                    key={idx}
                    onClick={() => setActiveIndex(idx)}
                    className={`flex items-center gap-1 px-2 py-1 rounded text-[10px] cursor-pointer transition-colors ${
                      idx === activeIndex
                        ? 'bg-teal-50 dark:bg-teal-900/30 border border-teal-200 dark:border-teal-700'
                        : 'hover:bg-gray-50 dark:hover:bg-gray-700/50'
                    }`}
                  >
                    <span className="w-4 text-right text-gray-400">{idx + 1}</span>
                    <span className={`w-2 h-2 rounded-full ${
                      entry.status === 'confirmed' ? 'bg-green-500' :
                      entry.status === 'edited' ? 'bg-blue-500' :
                      entry.status === 'skipped' ? 'bg-orange-400' :
                      'bg-gray-300 dark:bg-gray-600'
                    }`} />
                    <span className="truncate text-gray-600 dark:text-gray-400 font-mono">
                      {entry.english || '—'} {entry.german || '—'}
                    </span>
                  </div>
                ))}
              </div>
            </div>
          </div>
        </div>
      )}

      {/* Controls */}
      {wordResult && (
        <div className="bg-white dark:bg-gray-800 rounded-xl border border-gray-200 dark:border-gray-700 p-4 space-y-3">
          <div className="flex items-center gap-3 flex-wrap">
            <button
              onClick={() => runAutoDetection()}
              disabled={detecting}
              className="px-3 py-1.5 text-xs border rounded-lg hover:bg-gray-50 dark:hover:bg-gray-700 dark:border-gray-600 disabled:opacity-50"
            >
              Erneut erkennen
            </button>
            <button
              onClick={() => goToStep(3)}
              className="px-3 py-1.5 text-xs border rounded-lg hover:bg-gray-50 dark:hover:bg-gray-700 dark:border-gray-600 text-orange-600 dark:text-orange-400 border-orange-300 dark:border-orange-700"
            >
              Zeilen korrigieren (Step 4)
            </button>
            <div className="flex-1" />

            {/* Ground truth */}
            {!gtSaved ? (
              <>
                <input
                  type="text"
                  placeholder="Notizen (optional)"
                  value={gtNotes}
                  onChange={(e) => setGtNotes(e.target.value)}
                  className="px-2 py-1 text-xs border rounded dark:bg-gray-700 dark:border-gray-600 w-48"
                />
                <button
                  onClick={() => handleGroundTruth(true)}
                  className="px-3 py-1.5 text-xs bg-green-600 text-white rounded-lg hover:bg-green-700"
                >
                  Korrekt
                </button>
                <button
                  onClick={() => handleGroundTruth(false)}
                  className="px-3 py-1.5 text-xs bg-red-600 text-white rounded-lg hover:bg-red-700"
                >
                  Fehlerhaft
                </button>
              </>
            ) : (
              <span className="text-xs text-green-600 dark:text-green-400">
                Ground Truth gespeichert
              </span>
            )}
            <button
              onClick={onNext}
              className="px-4 py-1.5 text-xs bg-teal-600 text-white rounded-lg hover:bg-teal-700 font-medium"
            >
              Weiter
            </button>
          </div>
        </div>
      )}

      {error && (
        <div className="p-3 bg-red-50 dark:bg-red-900/20 text-red-600 dark:text-red-400 rounded-lg text-sm">
          {error}
        </div>
      )}
    </div>
  )
}
/**
 * CellCrop: shows a cropped portion of the dewarped image based on a
 * percent bbox. Uses CSS background positioning for efficient cropping.
 *
 * Bug fix: CSS `background-position` percentages are NOT linear offsets —
 * `P%` aligns the point at P% of the *image* with the point at P% of the
 * *container*. With background-size = s·100%, placing the bbox's left edge
 * (x% of the image) at the container origin requires
 *     P = x · s / (s − 1)    (for s > 1; at s ≤ 1 the image fits, use 0).
 * The previous `-x * s` formula was only correct for the top-left cell.
 *
 * NOTE(review): the single-value background-size scales width only (height
 * auto); the vertical formula assumes the container roughly preserves the
 * cell's aspect ratio — confirm against actual cell/container dimensions.
 */
function CellCrop({ imageUrl, bbox }: { imageUrl: string; bbox: { x: number; y: number; w: number; h: number } }) {
  // Zoom so the cell fills the container; cap at 8x to avoid extreme blur.
  const scaleX = 100 / bbox.w
  const scaleY = 100 / bbox.h
  const scale = Math.min(scaleX, scaleY, 8)
  // Convert a percent offset into the equivalent background-position percentage.
  const toPosition = (offsetPercent: number) =>
    scale > 1 ? (offsetPercent * scale) / (scale - 1) : 0
  return (
    <div
      className="w-full h-full"
      style={{
        backgroundImage: `url(${imageUrl})`,
        backgroundSize: `${scale * 100}%`,
        backgroundPosition: `${toPosition(bbox.x)}% ${toPosition(bbox.y)}%`,
        backgroundRepeat: 'no-repeat',
      }}
    />
  )
}

View File

@@ -0,0 +1,373 @@
# OCR Pipeline - Schrittweise Seitenrekonstruktion
**Version:** 1.0.0
**Status:** In Entwicklung
**URL:** https://macmini:3002/ai/ocr-pipeline
## Uebersicht
Die OCR Pipeline zerlegt den OCR-Prozess in **8 einzelne Schritte**, um eingescannte Vokabelseiten Wort fuer Wort zu rekonstruieren. Jeder Schritt kann individuell geprueft, korrigiert und mit Ground-Truth-Daten versehen werden.
**Ziel:** 10 Vokabelseiten fehlerfrei rekonstruieren.
### Pipeline-Schritte
| Schritt | Name | Beschreibung | Status |
|---------|------|--------------|--------|
| 1 | Begradigung (Deskew) | Scan begradigen (Hough Lines + Word Alignment) | Implementiert |
| 2 | Entzerrung (Dewarp) | Buchwoelbung entzerren (Vertikalkanten-Analyse) | Implementiert |
| 3 | Spaltenerkennung | Unsichtbare Spalten finden (Projektionsprofile) | Implementiert |
| 4 | Zeilenerkennung | Horizontale Zeilen + Kopf-/Fusszeilen-Klassifikation | Implementiert |
| 5 | Worterkennung | Grid aus Spalten x Zeilen, OCR pro Zelle | Implementiert |
| 6 | Koordinatenzuweisung | Exakte Positionen innerhalb Zellen | Geplant |
| 7 | Seitenrekonstruktion | Seite nachbauen aus Koordinaten | Geplant |
| 8 | Ground Truth Validierung | Gesamtpruefung aller Schritte | Geplant |
---
## Architektur
```
Admin-Lehrer (Next.js) klausur-service (FastAPI :8086)
┌────────────────────┐ ┌─────────────────────────────┐
│ /ai/ocr-pipeline │ │ /api/v1/ocr-pipeline/ │
│ │ REST │ │
│ PipelineStepper │◄────────►│ Sessions CRUD │
│ StepDeskew │ │ Image Serving │
│ StepDewarp │ │ Deskew/Dewarp/Columns/Rows │
│ StepColumnDetection│ │ Word Recognition │
│ StepRowDetection │ │ Ground Truth │
│ StepWordRecognition│ │ Overlay Images │
└────────────────────┘ └─────────────────────────────┘
┌─────────────────────┐
│ PostgreSQL │
│ ocr_pipeline_sessions│
│ (Images + JSONB) │
└─────────────────────┘
```
### Dateistruktur
```
klausur-service/backend/
├── ocr_pipeline_api.py # FastAPI Router (alle Endpoints)
├── ocr_pipeline_session_store.py # PostgreSQL Persistence
├── cv_vocab_pipeline.py # Computer Vision Algorithmen
└── migrations/
├── 002_ocr_pipeline_sessions.sql # Basis-Schema
├── 003_add_row_result.sql # Row-Result Spalte
└── 004_add_word_result.sql # Word-Result Spalte
admin-lehrer/
├── app/(admin)/ai/ocr-pipeline/
│ ├── page.tsx # Haupt-Page mit Session-Management
│ └── types.ts # TypeScript Interfaces
└── components/ocr-pipeline/
├── PipelineStepper.tsx # Fortschritts-Stepper
├── StepDeskew.tsx # Schritt 1
├── StepDewarp.tsx # Schritt 2
├── StepColumnDetection.tsx # Schritt 3
├── StepRowDetection.tsx # Schritt 4
├── StepWordRecognition.tsx # Schritt 5
├── StepCoordinates.tsx # Schritt 6 (Platzhalter)
├── StepReconstruction.tsx # Schritt 7 (Platzhalter)
└── StepGroundTruth.tsx # Schritt 8 (Platzhalter)
```
---
## API-Referenz
Alle Endpoints unter `/api/v1/ocr-pipeline/`.
### Sessions
| Methode | Pfad | Beschreibung |
|---------|------|--------------|
| `POST` | `/sessions` | Neue Session erstellen (Bild hochladen) |
| `GET` | `/sessions` | Alle Sessions auflisten |
| `GET` | `/sessions/{id}` | Session-Info mit allen Step-Results |
| `PUT` | `/sessions/{id}` | Session umbenennen |
| `DELETE` | `/sessions/{id}` | Session loeschen |
### Bilder
| Methode | Pfad | Beschreibung |
|---------|------|--------------|
| `GET` | `/sessions/{id}/image/original` | Originalbild |
| `GET` | `/sessions/{id}/image/deskewed` | Begradigtes Bild |
| `GET` | `/sessions/{id}/image/dewarped` | Entzerrtes Bild |
| `GET` | `/sessions/{id}/image/binarized` | Binarisiertes Bild |
| `GET` | `/sessions/{id}/image/columns-overlay` | Spalten-Overlay |
| `GET` | `/sessions/{id}/image/rows-overlay` | Zeilen-Overlay |
| `GET` | `/sessions/{id}/image/words-overlay` | Wort-Grid-Overlay |
### Schritt 1: Begradigung
| Methode | Pfad | Beschreibung |
|---------|------|--------------|
| `POST` | `/sessions/{id}/deskew` | Automatische Begradigung |
| `POST` | `/sessions/{id}/deskew/manual` | Manuelle Winkelkorrektur |
| `POST` | `/sessions/{id}/ground-truth/deskew` | Ground Truth speichern |
### Schritt 2: Entzerrung
| Methode | Pfad | Beschreibung |
|---------|------|--------------|
| `POST` | `/sessions/{id}/dewarp` | Automatische Entzerrung |
| `POST` | `/sessions/{id}/dewarp/manual` | Manueller Scherwinkel |
| `POST` | `/sessions/{id}/ground-truth/dewarp` | Ground Truth speichern |
### Schritt 3: Spalten
| Methode | Pfad | Beschreibung |
|---------|------|--------------|
| `POST` | `/sessions/{id}/columns` | Automatische Spaltenerkennung |
| `POST` | `/sessions/{id}/columns/manual` | Manuelle Spalten-Definition |
| `POST` | `/sessions/{id}/ground-truth/columns` | Ground Truth speichern |
### Schritt 4: Zeilen
| Methode | Pfad | Beschreibung |
|---------|------|--------------|
| `POST` | `/sessions/{id}/rows` | Automatische Zeilenerkennung |
| `POST` | `/sessions/{id}/rows/manual` | Manuelle Zeilen-Definition |
| `POST` | `/sessions/{id}/ground-truth/rows` | Ground Truth speichern |
| `GET` | `/sessions/{id}/ground-truth/rows` | Ground Truth abrufen |
### Schritt 5: Worterkennung
| Methode | Pfad | Beschreibung |
|---------|------|--------------|
| `POST` | `/sessions/{id}/words` | Wort-Grid aus Spalten x Zeilen erstellen |
| `POST` | `/sessions/{id}/ground-truth/words` | Ground Truth speichern |
| `GET` | `/sessions/{id}/ground-truth/words` | Ground Truth abrufen |
---
## Schritt 5: Worterkennung (Detail)
### Algorithmus: `build_word_grid()`
Schritt 5 nutzt die Ergebnisse von Schritt 3 (Spalten) und Schritt 4 (Zeilen), um ein Grid zu erstellen und jede Zelle per OCR auszulesen.
```
Spalten (Step 3): column_en | column_de | column_example
───────────┼─────────────┼────────────────
Zeilen (Step 4): R0 │ hello │ hallo │ Hello, World!
R1 │ world │ Welt │ The whole world
R2 │ book │ Buch │ Read a book
───────────┼─────────────┼────────────────
```
**Ablauf:**
1. **Filterung**: Nur `content`-Zeilen (kein Header/Footer) und relevante Spalten (`column_en`, `column_de`, `column_example`)
2. **Zell-Bildung**: Pro content-Zeile x pro relevante Spalte eine `PageRegion` berechnen
3. **OCR**: `ocr_region()` mit PSM 7 (Single Line) pro Zelle aufrufen
4. **Sprache**: `eng` fuer EN-Spalte, `deu` fuer DE-Spalte, `eng+deu` fuer Beispiele
5. **Gruppierung**: Zellen zu Vokabel-Eintraegen zusammenfuehren
### Response-Format
```json
{
"entries": [
{
"row_index": 0,
"english": "hello",
"german": "hallo",
"example": "Hello, how are you?",
"confidence": 85.3,
"bbox": {"x": 5.2, "y": 12.1, "w": 90.0, "h": 2.8},
"bbox_en": {"x": 5.2, "y": 12.1, "w": 30.0, "h": 2.8},
"bbox_de": {"x": 35.5, "y": 12.1, "w": 25.0, "h": 2.8},
"bbox_ex": {"x": 61.0, "y": 12.1, "w": 34.2, "h": 2.8}
}
],
"entry_count": 25,
"image_width": 2480,
"image_height": 3508,
"duration_seconds": 3.2,
"summary": {
"total_entries": 25,
"with_english": 24,
"with_german": 22,
"low_confidence": 3
}
}
```
!!! info "Bounding Boxes in Prozent"
Alle `bbox`-Werte sind Prozent (0-100) relativ zur Bildgroesse.
Das erleichtert die Darstellung im Frontend unabhaengig von der Bildaufloesung.
### Frontend: StepWordRecognition
Die Komponente bietet zwei Modi:
**Uebersicht-Modus:**
- Zwei Bilder nebeneinander: Grid-Overlay vs. sauberes Bild
- Tabelle aller erkannten Eintraege mit Konfidenz-Werten
- Klick auf Eintrag wechselt zum Labeling-Modus
**Labeling-Modus (Step-Through):**
- Links (2/3): Bild mit hervorgehobenem aktiven Eintrag (gelber Rahmen)
- Rechts (1/3): Zell-Ausschnitte + editierbare Felder (English, Deutsch, Example)
- Tastaturkuerzel:
- `Enter` = Bestaetigen und weiter
- `Ctrl+Pfeil runter` = Ueberspringen
- `Ctrl+Pfeil hoch` = Zurueck
**Feedback-Loop:**
- "Zeilen korrigieren" springt zurueck zu Schritt 4
- Nach Korrektur der Zeilen kann Schritt 5 erneut ausgefuehrt werden
---
## Datenbank-Schema
```sql
CREATE TABLE ocr_pipeline_sessions (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
name VARCHAR(255),
filename VARCHAR(255),
status VARCHAR(50) DEFAULT 'active',
current_step INT DEFAULT 1,
-- Bilder (BYTEA)
original_png BYTEA,
deskewed_png BYTEA,
binarized_png BYTEA,
dewarped_png BYTEA,
-- Step-Results (JSONB)
deskew_result JSONB,
dewarp_result JSONB,
column_result JSONB,
row_result JSONB,
word_result JSONB,
-- Ground Truth + Meta
ground_truth JSONB,
auto_shear_degrees REAL,
created_at TIMESTAMP DEFAULT NOW(),
updated_at TIMESTAMP DEFAULT NOW()
);
```
### Migrationen
| Datei | Beschreibung |
|-------|--------------|
| `002_ocr_pipeline_sessions.sql` | Basis-Schema (Steps 1-3) |
| `003_add_row_result.sql` | `row_result JSONB` fuer Step 4 |
| `004_add_word_result.sql` | `word_result JSONB` fuer Step 5 |
---
## TypeScript Interfaces
Die wichtigsten Typen in `types.ts`:
```typescript
interface WordEntry {
row_index: number
english: string
german: string
example: string
confidence: number
bbox: WordBbox // Gesamte Zeile
bbox_en: WordBbox | null // EN-Zelle
bbox_de: WordBbox | null // DE-Zelle
bbox_ex: WordBbox | null // Example-Zelle
status?: 'pending' | 'confirmed' | 'edited' | 'skipped'
}
interface WordResult {
entries: WordEntry[]
entry_count: number
image_width: number
image_height: number
duration_seconds: number
summary: {
total_entries: number
with_english: number
with_german: number
low_confidence: number
}
}
```
---
## Ground Truth System
Jeder Schritt kann mit Ground-Truth-Feedback versehen werden:
```json
{
"is_correct": false,
"corrected_entries": [...],
"notes": "Zeile 5 falsch erkannt",
"saved_at": "2026-02-28T10:30:00"
}
```
Ground-Truth-Daten werden in der `ground_truth` JSONB-Spalte gespeichert, gruppiert nach Schritt:
```json
{
"deskew": { "is_correct": true, ... },
"dewarp": { "is_correct": true, ... },
"columns": { "is_correct": false, ... },
"rows": { "is_correct": true, ... },
"words": { "is_correct": false, ... }
}
```
---
## Deployment
```bash
# 1. Git push
git push origin main && git push gitea main
# 2. Mac Mini pull + build
ssh macmini "cd /Users/benjaminadmin/Projekte/breakpilot-lehrer && git pull --no-rebase origin main"
# klausur-service (Backend)
ssh macmini "cd /Users/benjaminadmin/Projekte/breakpilot-lehrer && \
/usr/local/bin/docker compose build --no-cache klausur-service && \
/usr/local/bin/docker compose up -d klausur-service"
# admin-lehrer (Frontend)
ssh macmini "cd /Users/benjaminadmin/Projekte/breakpilot-lehrer && \
/usr/local/bin/docker compose build --no-cache admin-lehrer && \
/usr/local/bin/docker compose up -d admin-lehrer"
# 3. Migration ausfuehren
ssh macmini "/usr/local/bin/docker exec bp-lehrer-klausur-service \
python -c \"import asyncio; from ocr_pipeline_session_store import *; asyncio.run(init_ocr_pipeline_tables())\""
# 4. Testen unter:
# https://macmini:3002/ai/ocr-pipeline
```
---
## Aenderungshistorie
| Datum | Version | Aenderung |
|-------|---------|----------|
| 2026-02-28 | 1.0.0 | Schritt 5 (Worterkennung) implementiert |
| 2026-02-22 | 0.4.0 | Schritt 4 (Zeilenerkennung) implementiert |
| 2026-02-20 | 0.3.0 | Schritt 3 (Spaltenerkennung) mit Typ-Klassifikation |
| 2026-02-15 | 0.2.0 | Schritt 2 (Entzerrung/Dewarp) |
| 2026-02-12 | 0.1.0 | Schritt 1 (Begradigung/Deskew) + Session-Management |

View File

@@ -2169,6 +2169,142 @@ def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Li
return regions
# =============================================================================
# Pipeline Step 5: Word Grid from Columns × Rows
# =============================================================================
def build_word_grid(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
) -> List[Dict[str, Any]]:
    """Build a word grid by intersecting columns and rows, then OCR each cell.

    Each (content row × vocabulary column) intersection is OCRed as a single
    text line (Tesseract psm=7) with a language matched to the column type.

    Args:
        ocr_img: Binarized full-page image.
        column_regions: Classified columns from Step 3 (PageRegion list).
        row_geometries: Rows from Step 4 (RowGeometry list).
        img_w: Image width in pixels.
        img_h: Image height in pixels.
        lang: Fallback Tesseract language for unmapped column types.

    Returns:
        List of entry dicts with english/german/example text, a per-entry
        mean OCR confidence, and bounding boxes in percent of image size.
        Rows whose cells are all empty after OCR are omitted.
    """
    # Only content rows carry vocabulary; header/footer rows are skipped.
    content_rows = [r for r in row_geometries if r.row_type == 'content']
    if not content_rows:
        logger.warning("build_word_grid: no content rows found")
        return []

    # Columns that can hold vocabulary data; anything else is ignored.
    VOCAB_COLUMN_TYPES = {'column_en', 'column_de', 'column_example'}
    relevant_cols = [c for c in column_regions if c.type in VOCAB_COLUMN_TYPES]
    if not relevant_cols:
        logger.warning("build_word_grid: no relevant vocabulary columns found")
        return []

    # Process columns left-to-right for deterministic cell order.
    relevant_cols.sort(key=lambda c: c.x)

    # Language-specific OCR per column type.
    lang_map = {
        'column_en': 'eng',
        'column_de': 'deu',
        'column_example': 'eng+deu',
    }

    entries: List[Dict[str, Any]] = []
    for row_idx, row in enumerate(content_rows):
        entry: Dict[str, Any] = {
            'row_index': row_idx,
            'english': '',
            'german': '',
            'example': '',
            'confidence': 0.0,
            # Full-row bbox in percent of image dimensions.
            'bbox': {
                'x': round(row.x / img_w * 100, 2),
                'y': round(row.y / img_h * 100, 2),
                'w': round(row.width / img_w * 100, 2),
                'h': round(row.height / img_h * 100, 2),
            },
            'bbox_en': None,
            'bbox_de': None,
            'bbox_ex': None,
        }
        confidences: List[float] = []
        for col in relevant_cols:
            # Cell = column x/width ∩ row y/height, clamped to image bounds.
            # Clamping via corner coordinates also shrinks the cell when the
            # origin is negative — the previous max(0, x) alone shifted the
            # cell right without reducing its width, OCRing the wrong span.
            x0 = max(0, col.x)
            y0 = max(0, row.y)
            x1 = min(img_w, col.x + col.width)
            y1 = min(img_h, row.y + row.height)
            cell_w = x1 - x0
            cell_h = y1 - y0
            if cell_w <= 0 or cell_h <= 0:
                continue

            cell_region = PageRegion(
                type=col.type,
                x=x0, y=y0,
                width=cell_w, height=cell_h,
            )
            cell_lang = lang_map.get(col.type, lang)
            # psm=7: treat the cell as one single text line.
            words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=7)

            # Reading order: left to right.
            words.sort(key=lambda w: w['left'])
            text = ' '.join(w['text'] for w in words)
            if words:
                avg_conf = sum(w['conf'] for w in words) / len(words)
                confidences.append(avg_conf)

            # Cell bbox in percent (same convention as the row bbox).
            cell_bbox = {
                'x': round(x0 / img_w * 100, 2),
                'y': round(y0 / img_h * 100, 2),
                'w': round(cell_w / img_w * 100, 2),
                'h': round(cell_h / img_h * 100, 2),
            }
            if col.type == 'column_en':
                entry['english'] = text
                entry['bbox_en'] = cell_bbox
            elif col.type == 'column_de':
                entry['german'] = text
                entry['bbox_de'] = cell_bbox
            elif col.type == 'column_example':
                entry['example'] = text
                entry['bbox_ex'] = cell_bbox

        # Mean confidence over cells that produced at least one word.
        entry['confidence'] = round(
            sum(confidences) / len(confidences), 1
        ) if confidences else 0.0

        # Drop rows where OCR found nothing in any column.
        if entry['english'] or entry['german'] or entry['example']:
            entries.append(entry)

    logger.info(f"build_word_grid: {len(entries)} entries from "
                f"{len(content_rows)} content rows × {len(relevant_cols)} columns")
    return entries
# =============================================================================
# Stage 6: Multi-Pass OCR
# =============================================================================

View File

@@ -0,0 +1,4 @@
-- Migration 004: Add word_result column for OCR Pipeline Step 5
-- Stores the word recognition grid result (entries with english/german/example + bboxes)
-- IF NOT EXISTS keeps this migration idempotent on repeated runs.
ALTER TABLE ocr_pipeline_sessions ADD COLUMN IF NOT EXISTS word_result JSONB;

View File

@@ -29,8 +29,11 @@ from fastapi.responses import Response
from pydantic import BaseModel
from cv_vocab_pipeline import (
PageRegion,
RowGeometry,
analyze_layout,
analyze_layout_by_words,
build_word_grid,
classify_column_types,
create_layout_image,
create_ocr_image,
@@ -261,6 +264,10 @@ async def get_session_info(session_id: str):
result["dewarp_result"] = session["dewarp_result"]
if session.get("column_result"):
result["column_result"] = session["column_result"]
if session.get("row_result"):
result["row_result"] = session["row_result"]
if session.get("word_result"):
result["word_result"] = session["word_result"]
return result
@@ -291,7 +298,7 @@ async def delete_session(session_id: str):
@router.get("/sessions/{session_id}/image/{image_type}")
async def get_image(session_id: str, image_type: str):
"""Serve session images: original, deskewed, dewarped, binarized, columns-overlay, or rows-overlay."""
valid_types = {"original", "deskewed", "dewarped", "binarized", "columns-overlay", "rows-overlay"}
valid_types = {"original", "deskewed", "dewarped", "binarized", "columns-overlay", "rows-overlay", "words-overlay"}
if image_type not in valid_types:
raise HTTPException(status_code=400, detail=f"Unknown image type: {image_type}")
@@ -301,6 +308,9 @@ async def get_image(session_id: str, image_type: str):
if image_type == "rows-overlay":
return await _get_rows_overlay(session_id)
if image_type == "words-overlay":
return await _get_words_overlay(session_id)
# Try cache first for fast serving
cached = _cache.get(session_id)
if cached:
@@ -992,6 +1002,153 @@ async def get_row_ground_truth(session_id: str):
}
# ---------------------------------------------------------------------------
# Word Recognition Endpoints (Step 5)
# ---------------------------------------------------------------------------
@router.post("/sessions/{session_id}/words")
async def detect_words(session_id: str):
    """Build word grid from columns × rows, OCR each cell."""
    # Make sure the session images are resident in the in-memory cache.
    if session_id not in _cache:
        await _load_session_to_cache(session_id)
    cached = _get_cached(session_id)

    # Step 5 operates on the dewarped page; bail out if Step 2 never ran.
    dewarped_bgr = cached.get("dewarped_bgr")
    if dewarped_bgr is None:
        raise HTTPException(status_code=400, detail="Dewarp must be completed before word detection")

    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    # Both prerequisite steps (columns, rows) must have produced results.
    column_result = session.get("column_result")
    row_result = session.get("row_result")
    if not column_result or not column_result.get("columns"):
        raise HTTPException(status_code=400, detail="Column detection must be completed first")
    if not row_result or not row_result.get("rows"):
        raise HTTPException(status_code=400, detail="Row detection must be completed first")

    t_start = time.time()

    # Binarize for OCR and capture pixel dimensions for percent bboxes.
    ocr_img = create_ocr_image(dewarped_bgr)
    img_h, img_w = dewarped_bgr.shape[:2]

    # Rehydrate the persisted column dicts into PageRegion objects.
    col_regions: List[PageRegion] = []
    for c in column_result["columns"]:
        col_regions.append(PageRegion(
            type=c["type"],
            x=c["x"],
            y=c["y"],
            width=c["width"],
            height=c["height"],
            classification_confidence=c.get("classification_confidence", 1.0),
            classification_method=c.get("classification_method", ""),
        ))

    # Rehydrate the persisted row dicts into RowGeometry objects.
    row_geoms: List[RowGeometry] = []
    for r in row_result["rows"]:
        row_geoms.append(RowGeometry(
            index=r["index"],
            x=r["x"],
            y=r["y"],
            width=r["width"],
            height=r["height"],
            word_count=r.get("word_count", 0),
            words=[],
            row_type=r.get("row_type", "content"),
            gap_before=r.get("gap_before", 0),
        ))

    # Intersect columns × rows and OCR each cell.
    entries = build_word_grid(ocr_img, col_regions, row_geoms, img_w, img_h)
    duration = time.time() - t_start

    # Aggregate counts shown in the frontend overview.
    summary = {
        "total_entries": len(entries),
        "with_english": sum(1 for e in entries if e.get("english")),
        "with_german": sum(1 for e in entries if e.get("german")),
        "low_confidence": sum(1 for e in entries if e.get("confidence", 0) < 50),
    }
    word_result = {
        "entries": entries,
        "entry_count": len(entries),
        "image_width": img_w,
        "image_height": img_h,
        "duration_seconds": round(duration, 2),
        "summary": summary,
    }

    # Persist result and advance the pipeline to step 5; mirror into cache.
    await update_session_db(
        session_id,
        word_result=word_result,
        current_step=5,
    )
    cached["word_result"] = word_result

    logger.info(f"OCR Pipeline: words session {session_id}: "
                f"{len(entries)} entries ({duration:.2f}s), summary: {summary}")
    return {
        "session_id": session_id,
        **word_result,
    }
class WordGroundTruthRequest(BaseModel):
    # Ground-truth feedback payload for Step 5 (word recognition).
    # True when the automatically detected word grid was fully correct.
    is_correct: bool
    # Manually corrected vocabulary entries (same shape as word_result
    # entries); typically only provided when is_correct is False.
    corrected_entries: Optional[List[Dict[str, Any]]] = None
    # Free-form reviewer note (e.g. which row/cell was misrecognized).
    notes: Optional[str] = None
@router.post("/sessions/{session_id}/ground-truth/words")
async def save_word_ground_truth(session_id: str, req: WordGroundTruthRequest):
    """Save ground truth feedback for the word recognition step."""
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    # Snapshot the automatic result alongside the human feedback so later
    # analysis can compare auto vs. corrected output.
    # NOTE(review): datetime.utcnow() is deprecated since Python 3.12;
    # migrating to datetime.now(timezone.utc) would change the saved_at
    # format for every step, so it should be done for all steps at once.
    record = {
        "is_correct": req.is_correct,
        "corrected_entries": req.corrected_entries,
        "notes": req.notes,
        "saved_at": datetime.utcnow().isoformat(),
        "word_result": session.get("word_result"),
    }

    # Merge into the per-step ground-truth map under the "words" key.
    ground_truth = session.get("ground_truth") or {}
    ground_truth["words"] = record
    await update_session_db(session_id, ground_truth=ground_truth)

    # Keep the in-memory cache consistent with the database.
    if session_id in _cache:
        _cache[session_id]["ground_truth"] = ground_truth
    return {"session_id": session_id, "ground_truth": record}
@router.get("/sessions/{session_id}/ground-truth/words")
async def get_word_ground_truth(session_id: str):
    """Retrieve saved ground truth for word recognition."""
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    # Ground truth is stored per step; only the "words" entry matters here.
    saved = (session.get("ground_truth") or {}).get("words")
    if not saved:
        raise HTTPException(status_code=404, detail="No word ground truth saved")

    # Return both the human-labeled and the automatic result for comparison.
    return {
        "session_id": session_id,
        "words_gt": saved,
        "words_auto": session.get("word_result"),
    }
async def _get_rows_overlay(session_id: str) -> Response:
"""Generate dewarped image with row bands drawn on it."""
session = await get_session_db(session_id)
@@ -1049,3 +1206,106 @@ async def _get_rows_overlay(session_id: str) -> Response:
raise HTTPException(status_code=500, detail="Failed to encode overlay image")
return Response(content=result_png.tobytes(), media_type="image/png")
async def _get_words_overlay(session_id: str) -> Response:
    """Generate dewarped image with word grid cells drawn on it.

    Renders on top of the dewarped page:
      - vertical divider lines at both edges of each vocabulary column,
      - horizontal divider lines at the top of each content row,
      - for every recognized cell: a semi-transparent column-colored fill,
        a confidence-colored border, and a truncated OCR text label.

    Raises:
        HTTPException: 404 if the session, word data, or dewarped image is
            missing; 500 if the PNG cannot be decoded or encoded.

    Returns:
        PNG image response.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
    # Word detection (Step 5) must have run and produced entries.
    word_result = session.get("word_result")
    if not word_result or not word_result.get("entries"):
        raise HTTPException(status_code=404, detail="No word data available")
    # Column/row results are optional here: divider lines are simply skipped
    # when they are absent.
    column_result = session.get("column_result")
    row_result = session.get("row_result")
    # Load dewarped image
    dewarped_png = await get_session_image(session_id, "dewarped")
    if not dewarped_png:
        raise HTTPException(status_code=404, detail="Dewarped image not available")
    arr = np.frombuffer(dewarped_png, dtype=np.uint8)
    img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
    if img is None:
        raise HTTPException(status_code=500, detail="Failed to decode image")
    img_h, img_w = img.shape[:2]
    # Color map for cell types (BGR)
    cell_colors = {
        "column_en": (255, 180, 0),  # Blue
        "column_de": (0, 200, 0),  # Green
        "column_example": (0, 140, 255),  # Orange
    }
    # Snapshot taken BEFORE any lines/borders are drawn: cell fills painted
    # onto this copy blend against the clean page, not over divider lines.
    overlay = img.copy()
    # Draw column divider lines (vertical)
    if column_result and column_result.get("columns"):
        for col in column_result["columns"]:
            col_type = col.get("type", "")
            if col_type in cell_colors:
                # Left and right edge of the column, full image height.
                cx = col["x"]
                cv2.line(img, (cx, 0), (cx, img_h), cell_colors[col_type], 1)
                cx_end = col["x"] + col["width"]
                cv2.line(img, (cx_end, 0), (cx_end, img_h), cell_colors[col_type], 1)
    # Draw row divider lines (horizontal) for content rows
    if row_result and row_result.get("rows"):
        for row in row_result["rows"]:
            if row.get("row_type") == "content":
                # Gray line at the top edge of each content row.
                ry = row["y"]
                cv2.line(img, (0, ry), (img_w, ry), (180, 180, 180), 1)
    # Draw entry cells with text labels
    entries = word_result["entries"]
    for entry in entries:
        conf = entry.get("confidence", 0)
        # Color by confidence: green > 70, yellow 50-70, red < 50
        if conf >= 70:
            text_color = (0, 180, 0)
        elif conf >= 50:
            text_color = (0, 180, 220)
        else:
            text_color = (0, 0, 220)
        # One pass per vocabulary field; cells without bbox or text are
        # skipped (e.g. rows where only one column produced OCR output).
        for bbox_key, field_key, col_type in [
            ("bbox_en", "english", "column_en"),
            ("bbox_de", "german", "column_de"),
            ("bbox_ex", "example", "column_example"),
        ]:
            bbox = entry.get(bbox_key)
            text = entry.get(field_key, "")
            if not bbox or not text:
                continue
            # Convert percent to pixels (bboxes are stored in percent).
            bx = int(bbox["x"] / 100 * img_w)
            by = int(bbox["y"] / 100 * img_h)
            bw = int(bbox["w"] / 100 * img_w)
            bh = int(bbox["h"] / 100 * img_h)
            color = cell_colors.get(col_type, (200, 200, 200))
            # Semi-transparent fill (drawn on the snapshot, blended below)
            cv2.rectangle(overlay, (bx, by), (bx + bw, by + bh), color, -1)
            # Border (drawn directly on img, so it stays near full opacity)
            cv2.rectangle(img, (bx, by), (bx + bw, by + bh), text_color, 1)
            # Text label (truncate if too long)
            label = text[:30] if len(text) > 30 else text
            font_scale = 0.35
            cv2.putText(img, label, (bx + 3, by + bh - 4),
                        cv2.FONT_HERSHEY_SIMPLEX, font_scale, text_color, 1)
    # Blend overlay at 10% opacity; dst=img writes the result in place.
    cv2.addWeighted(overlay, 0.1, img, 0.9, 0, img)
    success, result_png = cv2.imencode(".png", img)
    if not success:
        raise HTTPException(status_code=500, detail="Failed to encode overlay image")
    return Response(content=result_png.tobytes(), media_type="image/png")

View File

@@ -80,7 +80,7 @@ async def create_session_db(
) VALUES ($1, $2, $3, $4, 'active', 1)
RETURNING id, name, filename, status, current_step,
deskew_result, dewarp_result, column_result, row_result,
ground_truth, auto_shear_degrees,
word_result, ground_truth, auto_shear_degrees,
created_at, updated_at
""", uuid.UUID(session_id), name, filename, original_png)
@@ -94,7 +94,7 @@ async def get_session_db(session_id: str) -> Optional[Dict[str, Any]]:
row = await conn.fetchrow("""
SELECT id, name, filename, status, current_step,
deskew_result, dewarp_result, column_result, row_result,
ground_truth, auto_shear_degrees,
word_result, ground_truth, auto_shear_degrees,
created_at, updated_at
FROM ocr_pipeline_sessions WHERE id = $1
""", uuid.UUID(session_id))
@@ -136,10 +136,10 @@ async def update_session_db(session_id: str, **kwargs) -> Optional[Dict[str, Any
'name', 'filename', 'status', 'current_step',
'original_png', 'deskewed_png', 'binarized_png', 'dewarped_png',
'deskew_result', 'dewarp_result', 'column_result', 'row_result',
'ground_truth', 'auto_shear_degrees',
'word_result', 'ground_truth', 'auto_shear_degrees',
}
jsonb_fields = {'deskew_result', 'dewarp_result', 'column_result', 'row_result', 'ground_truth'}
jsonb_fields = {'deskew_result', 'dewarp_result', 'column_result', 'row_result', 'word_result', 'ground_truth'}
for key, value in kwargs.items():
if key in allowed_fields:
@@ -164,7 +164,7 @@ async def update_session_db(session_id: str, **kwargs) -> Optional[Dict[str, Any
WHERE id = ${param_idx}
RETURNING id, name, filename, status, current_step,
deskew_result, dewarp_result, column_result, row_result,
ground_truth, auto_shear_degrees,
word_result, ground_truth, auto_shear_degrees,
created_at, updated_at
""", *values)
@@ -220,7 +220,7 @@ def _row_to_dict(row: asyncpg.Record) -> Dict[str, Any]:
result[key] = result[key].isoformat()
# JSONB → parsed (asyncpg returns str for JSONB)
for key in ['deskew_result', 'dewarp_result', 'column_result', 'row_result', 'ground_truth']:
for key in ['deskew_result', 'dewarp_result', 'column_result', 'row_result', 'word_result', 'ground_truth']:
if key in result and result[key] is not None:
if isinstance(result[key], str):
result[key] = json.loads(result[key])

View File

@@ -65,6 +65,7 @@ nav:
- BYOEH Architektur: services/klausur-service/BYOEH-Architecture.md
- BYOEH Developer Guide: services/klausur-service/BYOEH-Developer-Guide.md
- NiBiS Pipeline: services/klausur-service/NiBiS-Ingestion-Pipeline.md
- OCR Pipeline: services/klausur-service/OCR-Pipeline.md
- OCR Labeling: services/klausur-service/OCR-Labeling-Spec.md
- OCR Vergleich: services/klausur-service/OCR-Compare.md
- RAG Admin: services/klausur-service/RAG-Admin-Spec.md