diff --git a/admin-lehrer/app/(admin)/ai/ocr-pipeline/page.tsx b/admin-lehrer/app/(admin)/ai/ocr-pipeline/page.tsx index fdd202b..4e45621 100644 --- a/admin-lehrer/app/(admin)/ai/ocr-pipeline/page.tsx +++ b/admin-lehrer/app/(admin)/ai/ocr-pipeline/page.tsx @@ -112,6 +112,16 @@ export default function OcrPipelinePage() { } } + const goToStep = (step: number) => { + setCurrentStep(step) + setSteps((prev) => + prev.map((s, i) => ({ + ...s, + status: i < step ? 'completed' : i === step ? 'active' : 'pending', + })), + ) + } + const handleNext = () => { if (currentStep < steps.length - 1) { setSteps((prev) => @@ -161,7 +171,7 @@ export default function OcrPipelinePage() { case 3: return case 4: - return + return case 5: return case 6: diff --git a/admin-lehrer/app/(admin)/ai/ocr-pipeline/types.ts b/admin-lehrer/app/(admin)/ai/ocr-pipeline/types.ts index c1acb40..a8a1f3a 100644 --- a/admin-lehrer/app/(admin)/ai/ocr-pipeline/types.ts +++ b/admin-lehrer/app/(admin)/ai/ocr-pipeline/types.ts @@ -29,6 +29,7 @@ export interface SessionInfo { dewarp_result?: DewarpResult column_result?: ColumnResult row_result?: RowResult + word_result?: WordResult } export interface DeskewResult { @@ -116,6 +117,46 @@ export interface RowGroundTruth { notes?: string } +export interface WordBbox { + x: number + y: number + w: number + h: number +} + +export interface WordEntry { + row_index: number + english: string + german: string + example: string + confidence: number + bbox: WordBbox + bbox_en: WordBbox | null + bbox_de: WordBbox | null + bbox_ex: WordBbox | null + status?: 'pending' | 'confirmed' | 'edited' | 'skipped' +} + +export interface WordResult { + entries: WordEntry[] + entry_count: number + image_width: number + image_height: number + duration_seconds: number + summary: { + total_entries: number + with_english: number + with_german: number + low_confidence: number + } +} + +export interface WordGroundTruth { + is_correct: boolean + corrected_entries?: WordEntry[] + notes?: string 
+} + export const PIPELINE_STEPS: PipelineStep[] = [ { id: 'deskew', name: 'Begradigung', icon: 'πŸ“', status: 'pending' }, { id: 'dewarp', name: 'Entzerrung', icon: 'πŸ”§', status: 'pending' }, diff --git a/admin-lehrer/components/ocr-pipeline/StepWordRecognition.tsx b/admin-lehrer/components/ocr-pipeline/StepWordRecognition.tsx index 5c6b67c..76842c3 100644 --- a/admin-lehrer/components/ocr-pipeline/StepWordRecognition.tsx +++ b/admin-lehrer/components/ocr-pipeline/StepWordRecognition.tsx @@ -1,19 +1,602 @@ 'use client' -export function StepWordRecognition() { - return ( -
-
πŸ”€
-

- Schritt 4: Worterkennung -

-

- OCR mit Bounding Boxes fuer jedes erkannte Wort. - Dieser Schritt wird in einer zukuenftigen Version implementiert. -

-
- Kommt bald +import { useCallback, useEffect, useRef, useState } from 'react' +import type { WordResult, WordEntry, WordGroundTruth } from '@/app/(admin)/ai/ocr-pipeline/types' + +const KLAUSUR_API = '/klausur-api' + +interface StepWordRecognitionProps { + sessionId: string | null + onNext: () => void + goToStep: (step: number) => void +} + +export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRecognitionProps) { + const [wordResult, setWordResult] = useState(null) + const [detecting, setDetecting] = useState(false) + const [error, setError] = useState(null) + const [gtNotes, setGtNotes] = useState('') + const [gtSaved, setGtSaved] = useState(false) + + // Step-through labeling state + const [activeIndex, setActiveIndex] = useState(0) + const [editedEntries, setEditedEntries] = useState([]) + const [mode, setMode] = useState<'overview' | 'labeling'>('overview') + + const enRef = useRef(null) + + useEffect(() => { + if (!sessionId) return + + const fetchSession = async () => { + try { + const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}`) + if (res.ok) { + const info = await res.json() + if (info.word_result) { + setWordResult(info.word_result) + initEntries(info.word_result.entries) + return + } + } + } catch (e) { + console.error('Failed to fetch session info:', e) + } + runAutoDetection() + } + + fetchSession() + // eslint-disable-next-line react-hooks/exhaustive-deps + }, [sessionId]) + + const initEntries = (entries: WordEntry[]) => { + setEditedEntries(entries.map(e => ({ ...e, status: e.status || 'pending' }))) + setActiveIndex(0) + } + + const runAutoDetection = useCallback(async () => { + if (!sessionId) return + setDetecting(true) + setError(null) + try { + const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/words`, { + method: 'POST', + }) + if (!res.ok) { + const err = await res.json().catch(() => ({ detail: res.statusText })) + throw new Error(err.detail || 
'Worterkennung fehlgeschlagen') + } + const data: WordResult = await res.json() + setWordResult(data) + initEntries(data.entries) + } catch (e) { + setError(e instanceof Error ? e.message : 'Unbekannter Fehler') + } finally { + setDetecting(false) + } + }, [sessionId]) + + const handleGroundTruth = useCallback(async (isCorrect: boolean) => { + if (!sessionId) return + const gt: WordGroundTruth = { + is_correct: isCorrect, + corrected_entries: isCorrect ? undefined : editedEntries, + notes: gtNotes || undefined, + } + try { + await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/ground-truth/words`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify(gt), + }) + setGtSaved(true) + } catch (e) { + console.error('Ground truth save failed:', e) + } + }, [sessionId, gtNotes, editedEntries]) + + // Step-through: update entry field + const updateEntry = (index: number, field: 'english' | 'german' | 'example', value: string) => { + setEditedEntries(prev => prev.map((e, i) => + i === index ? { ...e, [field]: value, status: 'edited' as const } : e + )) + } + + // Step-through: confirm current entry + const confirmEntry = () => { + setEditedEntries(prev => prev.map((e, i) => + i === activeIndex ? { ...e, status: e.status === 'edited' ? 'edited' : 'confirmed' } : e + )) + if (activeIndex < editedEntries.length - 1) { + setActiveIndex(activeIndex + 1) + } + } + + // Step-through: skip current entry + const skipEntry = () => { + setEditedEntries(prev => prev.map((e, i) => + i === activeIndex ? 
{ ...e, status: 'skipped' as const } : e + )) + if (activeIndex < editedEntries.length - 1) { + setActiveIndex(activeIndex + 1) + } + } + + // Focus english input when active entry changes in labeling mode + useEffect(() => { + if (mode === 'labeling' && enRef.current) { + enRef.current.focus() + } + }, [activeIndex, mode]) + + // Keyboard shortcuts in labeling mode + useEffect(() => { + if (mode !== 'labeling') return + const handler = (e: KeyboardEvent) => { + if (e.key === 'Enter' && !e.shiftKey) { + e.preventDefault() + confirmEntry() + } else if (e.key === 'Tab' && !e.shiftKey) { + // Let Tab move between fields naturally unless on last field + } else if (e.key === 'ArrowDown' && e.ctrlKey) { + e.preventDefault() + skipEntry() + } else if (e.key === 'ArrowUp' && e.ctrlKey) { + e.preventDefault() + if (activeIndex > 0) setActiveIndex(activeIndex - 1) + } + } + window.addEventListener('keydown', handler) + return () => window.removeEventListener('keydown', handler) + // eslint-disable-next-line react-hooks/exhaustive-deps + }, [mode, activeIndex, editedEntries]) + + if (!sessionId) { + return ( +
+
πŸ”€
+

+ Schritt 5: Worterkennung +

+

+ Bitte zuerst Schritte 1-4 abschliessen. +

+ ) + } + + const overlayUrl = `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image/words-overlay` + const dewarpedUrl = `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image/dewarped` + + const confColor = (conf: number) => { + if (conf >= 70) return 'text-green-600 dark:text-green-400' + if (conf >= 50) return 'text-yellow-600 dark:text-yellow-400' + return 'text-red-600 dark:text-red-400' + } + + const statusBadge = (status?: string) => { + const map: Record = { + pending: 'bg-gray-100 dark:bg-gray-700 text-gray-500', + confirmed: 'bg-green-100 dark:bg-green-900/30 text-green-700 dark:text-green-400', + edited: 'bg-blue-100 dark:bg-blue-900/30 text-blue-700 dark:text-blue-400', + skipped: 'bg-orange-100 dark:bg-orange-900/30 text-orange-700 dark:text-orange-400', + } + return map[status || 'pending'] || map.pending + } + + const summary = wordResult?.summary + const confirmedCount = editedEntries.filter(e => e.status === 'confirmed' || e.status === 'edited').length + const totalCount = editedEntries.length + + return ( +
+ {/* Loading */} + {detecting && ( +
+
+ Worterkennung laeuft... +
+ )} + + {/* Mode toggle */} + {wordResult && ( +
+ + +
+ )} + + {/* Overview mode: side-by-side images + entry list */} + {mode === 'overview' && ( + <> + {/* Images: overlay vs clean */} +
+
+
+ Mit Grid-Overlay +
+
+ {wordResult ? ( + // eslint-disable-next-line @next/next/no-img-element + Wort-Overlay + ) : ( +
+ {detecting ? 'Erkenne Woerter...' : 'Keine Daten'} +
+ )} +
+
+
+
+ Entzerrtes Bild +
+
+ {/* eslint-disable-next-line @next/next/no-img-element */} + Entzerrt +
+
+
+ + {/* Result summary */} + {wordResult && summary && ( +
+
+

+ Ergebnis: {summary.total_entries} Eintraege erkannt +

+ + {wordResult.duration_seconds}s + +
+ + {/* Summary badges */} +
+ + EN: {summary.with_english} + + + DE: {summary.with_german} + + {summary.low_confidence > 0 && ( + + Unsicher: {summary.low_confidence} + + )} +
+ + {/* Entry table */} +
+ + + + + + + + + + + + {editedEntries.map((entry, idx) => ( + { setActiveIndex(idx); setMode('labeling') }} + > + + + + + + + ))} + +
#EnglishDeutschExampleConf
{idx + 1} + {entry.english || β€”} + + {entry.german || β€”} + + {entry.example || β€”} + + {entry.confidence}% +
+
+
+ )} + + )} + + {/* Labeling mode: image crop + editable fields */} + {mode === 'labeling' && editedEntries.length > 0 && ( +
+ {/* Left 2/3: Image with highlighted active row */} +
+
+ Eintrag {activeIndex + 1} von {editedEntries.length} +
+
+ {/* eslint-disable-next-line @next/next/no-img-element */} + Wort-Overlay + {/* Highlight overlay for active entry bbox */} + {editedEntries[activeIndex]?.bbox && ( +
+ )} +
+
+ + {/* Right 1/3: Editable entry fields */} +
+ {/* Navigation */} +
+ + {activeIndex + 1} / {editedEntries.length} + +
+ + {/* Status badge */} +
+ + {editedEntries[activeIndex]?.status || 'pending'} + + + {editedEntries[activeIndex]?.confidence}% Konfidenz + +
+ + {/* Cell crops */} + {editedEntries[activeIndex]?.bbox_en && ( +
+
EN-Zelle
+
+ +
+
+ )} + {editedEntries[activeIndex]?.bbox_de && ( +
+
DE-Zelle
+
+ +
+
+ )} + + {/* Editable fields */} +
+
+ + updateEntry(activeIndex, 'english', e.target.value)} + className="w-full px-2 py-1.5 text-sm border rounded dark:bg-gray-700 dark:border-gray-600 font-mono" + /> +
+
+ + updateEntry(activeIndex, 'german', e.target.value)} + className="w-full px-2 py-1.5 text-sm border rounded dark:bg-gray-700 dark:border-gray-600 font-mono" + /> +
+
+ + updateEntry(activeIndex, 'example', e.target.value)} + className="w-full px-2 py-1.5 text-sm border rounded dark:bg-gray-700 dark:border-gray-600 font-mono" + /> +
+
+ + {/* Action buttons */} +
+ + +
+ + {/* Shortcuts hint */} +
+
Enter = Bestaetigen & weiter
+
Ctrl+↓ = Ueberspringen
+
Ctrl+↑ = Zurueck
+
+ + {/* Entry list (compact) */} +
+
+ Alle Eintraege +
+
+ {editedEntries.map((entry, idx) => ( +
setActiveIndex(idx)} + className={`flex items-center gap-1 px-2 py-1 rounded text-[10px] cursor-pointer transition-colors ${ + idx === activeIndex + ? 'bg-teal-50 dark:bg-teal-900/30 border border-teal-200 dark:border-teal-700' + : 'hover:bg-gray-50 dark:hover:bg-gray-700/50' + }`} + > + {idx + 1} + + + {entry.english || 'β€”'} β†’ {entry.german || 'β€”'} + +
+ ))} +
+
+
+
+ )} + + {/* Controls */} + {wordResult && ( +
+
+ + + + +
+ + {/* Ground truth */} + {!gtSaved ? ( + <> + setGtNotes(e.target.value)} + className="px-2 py-1 text-xs border rounded dark:bg-gray-700 dark:border-gray-600 w-48" + /> + + + + ) : ( + + Ground Truth gespeichert + + )} + + +
+
+ )} + + {error && ( +
+ {error} +
+ )}
) } + + +/** + * CellCrop: Shows a cropped portion of the dewarped image based on percent bbox. + * Uses CSS background-image + background-position for efficient cropping. + */ +function CellCrop({ imageUrl, bbox }: { imageUrl: string; bbox: { x: number; y: number; w: number; h: number } }) { + // Scale factor: how much to zoom into the cell + const scaleX = 100 / bbox.w + const scaleY = 100 / bbox.h + const scale = Math.min(scaleX, scaleY, 8) // Cap zoom at 8x + + return ( +
+ ) +} diff --git a/docs-src/services/klausur-service/OCR-Pipeline.md b/docs-src/services/klausur-service/OCR-Pipeline.md new file mode 100644 index 0000000..111cdcf --- /dev/null +++ b/docs-src/services/klausur-service/OCR-Pipeline.md @@ -0,0 +1,373 @@ +# OCR Pipeline - Schrittweise Seitenrekonstruktion + +**Version:** 1.0.0 +**Status:** In Entwicklung +**URL:** https://macmini:3002/ai/ocr-pipeline + +## Uebersicht + +Die OCR Pipeline zerlegt den OCR-Prozess in **8 einzelne Schritte**, um eingescannte Vokabelseiten Wort fuer Wort zu rekonstruieren. Jeder Schritt kann individuell geprueft, korrigiert und mit Ground-Truth-Daten versehen werden. + +**Ziel:** 10 Vokabelseiten fehlerfrei rekonstruieren. + +### Pipeline-Schritte + +| Schritt | Name | Beschreibung | Status | +|---------|------|--------------|--------| +| 1 | Begradigung (Deskew) | Scan begradigen (Hough Lines + Word Alignment) | Implementiert | +| 2 | Entzerrung (Dewarp) | Buchwoelbung entzerren (Vertikalkanten-Analyse) | Implementiert | +| 3 | Spaltenerkennung | Unsichtbare Spalten finden (Projektionsprofile) | Implementiert | +| 4 | Zeilenerkennung | Horizontale Zeilen + Kopf-/Fusszeilen-Klassifikation | Implementiert | +| 5 | Worterkennung | Grid aus Spalten x Zeilen, OCR pro Zelle | Implementiert | +| 6 | Koordinatenzuweisung | Exakte Positionen innerhalb Zellen | Geplant | +| 7 | Seitenrekonstruktion | Seite nachbauen aus Koordinaten | Geplant | +| 8 | Ground Truth Validierung | Gesamtpruefung aller Schritte | Geplant | + +--- + +## Architektur + +``` +Admin-Lehrer (Next.js) klausur-service (FastAPI :8086) +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ /ai/ocr-pipeline β”‚ β”‚ /api/v1/ocr-pipeline/ β”‚ +β”‚ β”‚ REST β”‚ β”‚ +β”‚ PipelineStepper │◄────────►│ Sessions CRUD β”‚ +β”‚ StepDeskew β”‚ β”‚ Image Serving β”‚ +β”‚ StepDewarp β”‚ β”‚ Deskew/Dewarp/Columns/Rows β”‚ +β”‚ 
StepColumnDetectionβ”‚ β”‚ Word Recognition β”‚ +β”‚ StepRowDetection β”‚ β”‚ Ground Truth β”‚ +β”‚ StepWordRecognitionβ”‚ β”‚ Overlay Images β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β–Ό + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ PostgreSQL β”‚ + β”‚ ocr_pipeline_sessionsβ”‚ + β”‚ (Images + JSONB) β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +### Dateistruktur + +``` +klausur-service/backend/ +β”œβ”€β”€ ocr_pipeline_api.py # FastAPI Router (alle Endpoints) +β”œβ”€β”€ ocr_pipeline_session_store.py # PostgreSQL Persistence +β”œβ”€β”€ cv_vocab_pipeline.py # Computer Vision Algorithmen +└── migrations/ + β”œβ”€β”€ 002_ocr_pipeline_sessions.sql # Basis-Schema + β”œβ”€β”€ 003_add_row_result.sql # Row-Result Spalte + └── 004_add_word_result.sql # Word-Result Spalte + +admin-lehrer/ +β”œβ”€β”€ app/(admin)/ai/ocr-pipeline/ +β”‚ β”œβ”€β”€ page.tsx # Haupt-Page mit Session-Management +β”‚ └── types.ts # TypeScript Interfaces +└── components/ocr-pipeline/ + β”œβ”€β”€ PipelineStepper.tsx # Fortschritts-Stepper + β”œβ”€β”€ StepDeskew.tsx # Schritt 1 + β”œβ”€β”€ StepDewarp.tsx # Schritt 2 + β”œβ”€β”€ StepColumnDetection.tsx # Schritt 3 + β”œβ”€β”€ StepRowDetection.tsx # Schritt 4 + β”œβ”€β”€ StepWordRecognition.tsx # Schritt 5 + β”œβ”€β”€ StepCoordinates.tsx # Schritt 6 (Platzhalter) + β”œβ”€β”€ StepReconstruction.tsx # Schritt 7 (Platzhalter) + └── StepGroundTruth.tsx # Schritt 8 (Platzhalter) +``` + +--- + +## API-Referenz + +Alle Endpoints unter `/api/v1/ocr-pipeline/`. 
+ +### Sessions + +| Methode | Pfad | Beschreibung | +|---------|------|--------------| +| `POST` | `/sessions` | Neue Session erstellen (Bild hochladen) | +| `GET` | `/sessions` | Alle Sessions auflisten | +| `GET` | `/sessions/{id}` | Session-Info mit allen Step-Results | +| `PUT` | `/sessions/{id}` | Session umbenennen | +| `DELETE` | `/sessions/{id}` | Session loeschen | + +### Bilder + +| Methode | Pfad | Beschreibung | +|---------|------|--------------| +| `GET` | `/sessions/{id}/image/original` | Originalbild | +| `GET` | `/sessions/{id}/image/deskewed` | Begradigtes Bild | +| `GET` | `/sessions/{id}/image/dewarped` | Entzerrtes Bild | +| `GET` | `/sessions/{id}/image/binarized` | Binarisiertes Bild | +| `GET` | `/sessions/{id}/image/columns-overlay` | Spalten-Overlay | +| `GET` | `/sessions/{id}/image/rows-overlay` | Zeilen-Overlay | +| `GET` | `/sessions/{id}/image/words-overlay` | Wort-Grid-Overlay | + +### Schritt 1: Begradigung + +| Methode | Pfad | Beschreibung | +|---------|------|--------------| +| `POST` | `/sessions/{id}/deskew` | Automatische Begradigung | +| `POST` | `/sessions/{id}/deskew/manual` | Manuelle Winkelkorrektur | +| `POST` | `/sessions/{id}/ground-truth/deskew` | Ground Truth speichern | + +### Schritt 2: Entzerrung + +| Methode | Pfad | Beschreibung | +|---------|------|--------------| +| `POST` | `/sessions/{id}/dewarp` | Automatische Entzerrung | +| `POST` | `/sessions/{id}/dewarp/manual` | Manueller Scherungswinkel | +| `POST` | `/sessions/{id}/ground-truth/dewarp` | Ground Truth speichern | + +### Schritt 3: Spalten + +| Methode | Pfad | Beschreibung | +|---------|------|--------------| +| `POST` | `/sessions/{id}/columns` | Automatische Spaltenerkennung | +| `POST` | `/sessions/{id}/columns/manual` | Manuelle Spalten-Definition | +| `POST` | `/sessions/{id}/ground-truth/columns` | Ground Truth speichern | + +### Schritt 4: Zeilen + +| Methode | Pfad | Beschreibung | +|---------|------|--------------| +| `POST` | 
`/sessions/{id}/rows` | Automatische Zeilenerkennung | +| `POST` | `/sessions/{id}/rows/manual` | Manuelle Zeilen-Definition | +| `POST` | `/sessions/{id}/ground-truth/rows` | Ground Truth speichern | +| `GET` | `/sessions/{id}/ground-truth/rows` | Ground Truth abrufen | + +### Schritt 5: Worterkennung + +| Methode | Pfad | Beschreibung | +|---------|------|--------------| +| `POST` | `/sessions/{id}/words` | Wort-Grid aus Spalten x Zeilen erstellen | +| `POST` | `/sessions/{id}/ground-truth/words` | Ground Truth speichern | +| `GET` | `/sessions/{id}/ground-truth/words` | Ground Truth abrufen | + +--- + +## Schritt 5: Worterkennung (Detail) + +### Algorithmus: `build_word_grid()` + +Schritt 5 nutzt die Ergebnisse von Schritt 3 (Spalten) und Schritt 4 (Zeilen), um ein Grid zu erstellen und jede Zelle per OCR auszulesen. + +``` +Spalten (Step 3): column_en | column_de | column_example + ───────────┼─────────────┼──────────────── +Zeilen (Step 4): R0 β”‚ hello β”‚ hallo β”‚ Hello, World! + R1 β”‚ world β”‚ Welt β”‚ The whole world + R2 β”‚ book β”‚ Buch β”‚ Read a book + ───────────┼─────────────┼──────────────── +``` + +**Ablauf:** + +1. **Filterung**: Nur `content`-Zeilen (kein Header/Footer) und relevante Spalten (`column_en`, `column_de`, `column_example`) +2. **Zell-Bildung**: Pro content-Zeile x pro relevante Spalte eine `PageRegion` berechnen +3. **OCR**: `ocr_region()` mit PSM 7 (Single Line) pro Zelle aufrufen +4. **Sprache**: `eng` fuer EN-Spalte, `deu` fuer DE-Spalte, `eng+deu` fuer Beispiele +5. 
**Gruppierung**: Zellen zu Vokabel-Eintraegen zusammenfuehren + +### Response-Format + +```json +{ + "entries": [ + { + "row_index": 0, + "english": "hello", + "german": "hallo", + "example": "Hello, how are you?", + "confidence": 85.3, + "bbox": {"x": 5.2, "y": 12.1, "w": 90.0, "h": 2.8}, + "bbox_en": {"x": 5.2, "y": 12.1, "w": 30.0, "h": 2.8}, + "bbox_de": {"x": 35.5, "y": 12.1, "w": 25.0, "h": 2.8}, + "bbox_ex": {"x": 61.0, "y": 12.1, "w": 34.2, "h": 2.8} + } + ], + "entry_count": 25, + "image_width": 2480, + "image_height": 3508, + "duration_seconds": 3.2, + "summary": { + "total_entries": 25, + "with_english": 24, + "with_german": 22, + "low_confidence": 3 + } +} +``` + +!!! info "Bounding Boxes in Prozent" + Alle `bbox`-Werte sind Prozent (0-100) relativ zur Bildgroesse. + Das erleichtert die Darstellung im Frontend unabhaengig von der Bildaufloesung. + +### Frontend: StepWordRecognition + +Die Komponente bietet zwei Modi: + +**Uebersicht-Modus:** + +- Zwei Bilder nebeneinander: Grid-Overlay vs. 
sauberes Bild +- Tabelle aller erkannten Eintraege mit Konfidenz-Werten +- Klick auf Eintrag wechselt zum Labeling-Modus + +**Labeling-Modus (Step-Through):** + +- Links (2/3): Bild mit hervorgehobenem aktiven Eintrag (gelber Rahmen) +- Rechts (1/3): Zell-Ausschnitte + editierbare Felder (English, Deutsch, Example) +- Tastaturkuerzel: + - `Enter` = Bestaetigen und weiter + - `Ctrl+Pfeil runter` = Ueberspringen + - `Ctrl+Pfeil hoch` = Zurueck + +**Feedback-Loop:** + +- "Zeilen korrigieren" springt zurueck zu Schritt 4 +- Nach Korrektur der Zeilen kann Schritt 5 erneut ausgefuehrt werden + +--- + +## Datenbank-Schema + +```sql +CREATE TABLE ocr_pipeline_sessions ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + name VARCHAR(255), + filename VARCHAR(255), + status VARCHAR(50) DEFAULT 'active', + current_step INT DEFAULT 1, + + -- Bilder (BYTEA) + original_png BYTEA, + deskewed_png BYTEA, + binarized_png BYTEA, + dewarped_png BYTEA, + + -- Step-Results (JSONB) + deskew_result JSONB, + dewarp_result JSONB, + column_result JSONB, + row_result JSONB, + word_result JSONB, + + -- Ground Truth + Meta + ground_truth JSONB, + auto_shear_degrees REAL, + created_at TIMESTAMP DEFAULT NOW(), + updated_at TIMESTAMP DEFAULT NOW() +); +``` + +### Migrationen + +| Datei | Beschreibung | +|-------|--------------| +| `002_ocr_pipeline_sessions.sql` | Basis-Schema (Steps 1-3) | +| `003_add_row_result.sql` | `row_result JSONB` fuer Step 4 | +| `004_add_word_result.sql` | `word_result JSONB` fuer Step 5 | + +--- + +## TypeScript Interfaces + +Die wichtigsten Typen in `types.ts`: + +```typescript +interface WordEntry { + row_index: number + english: string + german: string + example: string + confidence: number + bbox: WordBbox // Gesamte Zeile + bbox_en: WordBbox | null // EN-Zelle + bbox_de: WordBbox | null // DE-Zelle + bbox_ex: WordBbox | null // Example-Zelle + status?: 'pending' | 'confirmed' | 'edited' | 'skipped' +} + +interface WordResult { + entries: WordEntry[] + entry_count: 
number + image_width: number + image_height: number + duration_seconds: number + summary: { + total_entries: number + with_english: number + with_german: number + low_confidence: number + } +} +``` + +--- + +## Ground Truth System + +Jeder Schritt kann mit Ground-Truth-Feedback versehen werden: + +```json +{ + "is_correct": false, + "corrected_entries": [...], + "notes": "Zeile 5 falsch erkannt", + "saved_at": "2026-02-28T10:30:00" +} +``` + +Ground-Truth-Daten werden in der `ground_truth` JSONB-Spalte gespeichert, gruppiert nach Schritt: + +```json +{ + "deskew": { "is_correct": true, ... }, + "dewarp": { "is_correct": true, ... }, + "columns": { "is_correct": false, ... }, + "rows": { "is_correct": true, ... }, + "words": { "is_correct": false, ... } +} +``` + +--- + +## Deployment + +```bash +# 1. Git push +git push origin main && git push gitea main + +# 2. Mac Mini pull + build +ssh macmini "cd /Users/benjaminadmin/Projekte/breakpilot-lehrer && git pull --no-rebase origin main" + +# klausur-service (Backend) +ssh macmini "cd /Users/benjaminadmin/Projekte/breakpilot-lehrer && \ + /usr/local/bin/docker compose build --no-cache klausur-service && \ + /usr/local/bin/docker compose up -d klausur-service" + +# admin-lehrer (Frontend) +ssh macmini "cd /Users/benjaminadmin/Projekte/breakpilot-lehrer && \ + /usr/local/bin/docker compose build --no-cache admin-lehrer && \ + /usr/local/bin/docker compose up -d admin-lehrer" + +# 3. Migration ausfuehren +ssh macmini "/usr/local/bin/docker exec bp-lehrer-klausur-service \ + python -c \"import asyncio; from ocr_pipeline_session_store import *; asyncio.run(init_ocr_pipeline_tables())\"" + +# 4. 
Testen unter: +# https://macmini:3002/ai/ocr-pipeline +``` + +--- + +## Aenderungshistorie + +| Datum | Version | Aenderung | +|-------|---------|----------| +| 2026-02-28 | 1.0.0 | Schritt 5 (Worterkennung) implementiert | +| 2026-02-22 | 0.4.0 | Schritt 4 (Zeilenerkennung) implementiert | +| 2026-02-20 | 0.3.0 | Schritt 3 (Spaltenerkennung) mit Typ-Klassifikation | +| 2026-02-15 | 0.2.0 | Schritt 2 (Entzerrung/Dewarp) | +| 2026-02-12 | 0.1.0 | Schritt 1 (Begradigung/Deskew) + Session-Management | diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index 4e25f67..5acaf1c 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -2169,6 +2169,142 @@ def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Li return regions +# ============================================================================= +# Pipeline Step 5: Word Grid from Columns Γ— Rows +# ============================================================================= + +def build_word_grid( + ocr_img: np.ndarray, + column_regions: List[PageRegion], + row_geometries: List[RowGeometry], + img_w: int, + img_h: int, + lang: str = "eng+deu", +) -> List[Dict[str, Any]]: + """Build a word grid by intersecting columns and rows, then OCR each cell. + + Args: + ocr_img: Binarized full-page image. + column_regions: Classified columns from Step 3 (PageRegion list). + row_geometries: Rows from Step 4 (RowGeometry list). + img_w: Image width in pixels. + img_h: Image height in pixels. + lang: Default Tesseract language. + + Returns: + List of entry dicts with english/german/example text and bbox info (percent). 
+ """ + # Filter to content rows only (skip header/footer) + content_rows = [r for r in row_geometries if r.row_type == 'content'] + if not content_rows: + logger.warning("build_word_grid: no content rows found") + return [] + + # Map column types to roles + VOCAB_COLUMN_TYPES = {'column_en', 'column_de', 'column_example'} + relevant_cols = [c for c in column_regions if c.type in VOCAB_COLUMN_TYPES] + if not relevant_cols: + logger.warning("build_word_grid: no relevant vocabulary columns found") + return [] + + # Sort columns left-to-right + relevant_cols.sort(key=lambda c: c.x) + + # Choose OCR language per column type + lang_map = { + 'column_en': 'eng', + 'column_de': 'deu', + 'column_example': 'eng+deu', + } + + entries: List[Dict[str, Any]] = [] + + for row_idx, row in enumerate(content_rows): + entry: Dict[str, Any] = { + 'row_index': row_idx, + 'english': '', + 'german': '', + 'example': '', + 'confidence': 0.0, + 'bbox': { + 'x': round(row.x / img_w * 100, 2), + 'y': round(row.y / img_h * 100, 2), + 'w': round(row.width / img_w * 100, 2), + 'h': round(row.height / img_h * 100, 2), + }, + 'bbox_en': None, + 'bbox_de': None, + 'bbox_ex': None, + } + + confidences: List[float] = [] + + for col in relevant_cols: + # Compute cell region: column x/width, row y/height + cell_x = col.x + cell_y = row.y + cell_w = col.width + cell_h = row.height + + # Clamp to image bounds + cell_x = max(0, cell_x) + cell_y = max(0, cell_y) + if cell_x + cell_w > img_w: + cell_w = img_w - cell_x + if cell_y + cell_h > img_h: + cell_h = img_h - cell_y + + if cell_w <= 0 or cell_h <= 0: + continue + + cell_region = PageRegion( + type=col.type, + x=cell_x, y=cell_y, + width=cell_w, height=cell_h, + ) + + cell_lang = lang_map.get(col.type, lang) + words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=7) + + # Sort words by x position, join to text + words.sort(key=lambda w: w['left']) + text = ' '.join(w['text'] for w in words) + if words: + avg_conf = sum(w['conf'] for w in 
words) / len(words) + confidences.append(avg_conf) + + # Bbox in percent + cell_bbox = { + 'x': round(cell_x / img_w * 100, 2), + 'y': round(cell_y / img_h * 100, 2), + 'w': round(cell_w / img_w * 100, 2), + 'h': round(cell_h / img_h * 100, 2), + } + + if col.type == 'column_en': + entry['english'] = text + entry['bbox_en'] = cell_bbox + elif col.type == 'column_de': + entry['german'] = text + entry['bbox_de'] = cell_bbox + elif col.type == 'column_example': + entry['example'] = text + entry['bbox_ex'] = cell_bbox + + entry['confidence'] = round( + sum(confidences) / len(confidences), 1 + ) if confidences else 0.0 + + # Only include if at least one field has text + if entry['english'] or entry['german'] or entry['example']: + entries.append(entry) + + logger.info(f"build_word_grid: {len(entries)} entries from " + f"{len(content_rows)} content rows Γ— {len(relevant_cols)} columns") + + return entries + + # ============================================================================= # Stage 6: Multi-Pass OCR # ============================================================================= diff --git a/klausur-service/backend/migrations/004_add_word_result.sql b/klausur-service/backend/migrations/004_add_word_result.sql new file mode 100644 index 0000000..0b6b1b9 --- /dev/null +++ b/klausur-service/backend/migrations/004_add_word_result.sql @@ -0,0 +1,4 @@ +-- Migration 004: Add word_result column for OCR Pipeline Step 5 +-- Stores the word recognition grid result (entries with english/german/example + bboxes) + +ALTER TABLE ocr_pipeline_sessions ADD COLUMN IF NOT EXISTS word_result JSONB; diff --git a/klausur-service/backend/ocr_pipeline_api.py b/klausur-service/backend/ocr_pipeline_api.py index ecea61d..db83087 100644 --- a/klausur-service/backend/ocr_pipeline_api.py +++ b/klausur-service/backend/ocr_pipeline_api.py @@ -29,8 +29,11 @@ from fastapi.responses import Response from pydantic import BaseModel from cv_vocab_pipeline import ( + PageRegion, + RowGeometry, 
analyze_layout, analyze_layout_by_words, + build_word_grid, classify_column_types, create_layout_image, create_ocr_image, @@ -261,6 +264,10 @@ async def get_session_info(session_id: str): result["dewarp_result"] = session["dewarp_result"] if session.get("column_result"): result["column_result"] = session["column_result"] + if session.get("row_result"): + result["row_result"] = session["row_result"] + if session.get("word_result"): + result["word_result"] = session["word_result"] return result @@ -291,7 +298,7 @@ async def delete_session(session_id: str): @router.get("/sessions/{session_id}/image/{image_type}") async def get_image(session_id: str, image_type: str): """Serve session images: original, deskewed, dewarped, binarized, columns-overlay, or rows-overlay.""" - valid_types = {"original", "deskewed", "dewarped", "binarized", "columns-overlay", "rows-overlay"} + valid_types = {"original", "deskewed", "dewarped", "binarized", "columns-overlay", "rows-overlay", "words-overlay"} if image_type not in valid_types: raise HTTPException(status_code=400, detail=f"Unknown image type: {image_type}") @@ -301,6 +308,9 @@ async def get_image(session_id: str, image_type: str): if image_type == "rows-overlay": return await _get_rows_overlay(session_id) + if image_type == "words-overlay": + return await _get_words_overlay(session_id) + # Try cache first for fast serving cached = _cache.get(session_id) if cached: @@ -992,6 +1002,153 @@ async def get_row_ground_truth(session_id: str): } +# --------------------------------------------------------------------------- +# Word Recognition Endpoints (Step 5) +# --------------------------------------------------------------------------- + +@router.post("/sessions/{session_id}/words") +async def detect_words(session_id: str): + """Build word grid from columns Γ— rows, OCR each cell.""" + if session_id not in _cache: + await _load_session_to_cache(session_id) + cached = _get_cached(session_id) + + dewarped_bgr = cached.get("dewarped_bgr") 
+ if dewarped_bgr is None: + raise HTTPException(status_code=400, detail="Dewarp must be completed before word detection") + + session = await get_session_db(session_id) + if not session: + raise HTTPException(status_code=404, detail=f"Session {session_id} not found") + + column_result = session.get("column_result") + row_result = session.get("row_result") + if not column_result or not column_result.get("columns"): + raise HTTPException(status_code=400, detail="Column detection must be completed first") + if not row_result or not row_result.get("rows"): + raise HTTPException(status_code=400, detail="Row detection must be completed first") + + t0 = time.time() + + # Create binarized OCR image + ocr_img = create_ocr_image(dewarped_bgr) + img_h, img_w = dewarped_bgr.shape[:2] + + # Convert column dicts back to PageRegion objects + col_regions = [ + PageRegion( + type=c["type"], + x=c["x"], y=c["y"], + width=c["width"], height=c["height"], + classification_confidence=c.get("classification_confidence", 1.0), + classification_method=c.get("classification_method", ""), + ) + for c in column_result["columns"] + ] + + # Convert row dicts back to RowGeometry objects + row_geoms = [ + RowGeometry( + index=r["index"], + x=r["x"], y=r["y"], + width=r["width"], height=r["height"], + word_count=r.get("word_count", 0), + words=[], + row_type=r.get("row_type", "content"), + gap_before=r.get("gap_before", 0), + ) + for r in row_result["rows"] + ] + + # Build word grid + entries = build_word_grid(ocr_img, col_regions, row_geoms, img_w, img_h) + duration = time.time() - t0 + + # Build summary + summary = { + "total_entries": len(entries), + "with_english": sum(1 for e in entries if e.get("english")), + "with_german": sum(1 for e in entries if e.get("german")), + "low_confidence": sum(1 for e in entries if e.get("confidence", 0) < 50), + } + + word_result = { + "entries": entries, + "entry_count": len(entries), + "image_width": img_w, + "image_height": img_h, + "duration_seconds": 
round(duration, 2), + "summary": summary, + } + + # Persist to DB + await update_session_db( + session_id, + word_result=word_result, + current_step=5, + ) + + cached["word_result"] = word_result + + logger.info(f"OCR Pipeline: words session {session_id}: " + f"{len(entries)} entries ({duration:.2f}s), summary: {summary}") + + return { + "session_id": session_id, + **word_result, + } + + +class WordGroundTruthRequest(BaseModel): + is_correct: bool + corrected_entries: Optional[List[Dict[str, Any]]] = None + notes: Optional[str] = None + + +@router.post("/sessions/{session_id}/ground-truth/words") +async def save_word_ground_truth(session_id: str, req: WordGroundTruthRequest): + """Save ground truth feedback for the word recognition step.""" + session = await get_session_db(session_id) + if not session: + raise HTTPException(status_code=404, detail=f"Session {session_id} not found") + + ground_truth = session.get("ground_truth") or {} + gt = { + "is_correct": req.is_correct, + "corrected_entries": req.corrected_entries, + "notes": req.notes, + "saved_at": datetime.utcnow().isoformat(), + "word_result": session.get("word_result"), + } + ground_truth["words"] = gt + + await update_session_db(session_id, ground_truth=ground_truth) + + if session_id in _cache: + _cache[session_id]["ground_truth"] = ground_truth + + return {"session_id": session_id, "ground_truth": gt} + + +@router.get("/sessions/{session_id}/ground-truth/words") +async def get_word_ground_truth(session_id: str): + """Retrieve saved ground truth for word recognition.""" + session = await get_session_db(session_id) + if not session: + raise HTTPException(status_code=404, detail=f"Session {session_id} not found") + + ground_truth = session.get("ground_truth") or {} + words_gt = ground_truth.get("words") + if not words_gt: + raise HTTPException(status_code=404, detail="No word ground truth saved") + + return { + "session_id": session_id, + "words_gt": words_gt, + "words_auto": session.get("word_result"), + 
} + + async def _get_rows_overlay(session_id: str) -> Response: """Generate dewarped image with row bands drawn on it.""" session = await get_session_db(session_id) @@ -1049,3 +1206,106 @@ async def _get_rows_overlay(session_id: str) -> Response: raise HTTPException(status_code=500, detail="Failed to encode overlay image") return Response(content=result_png.tobytes(), media_type="image/png") + + +async def _get_words_overlay(session_id: str) -> Response: + """Generate dewarped image with word grid cells drawn on it.""" + session = await get_session_db(session_id) + if not session: + raise HTTPException(status_code=404, detail=f"Session {session_id} not found") + + word_result = session.get("word_result") + if not word_result or not word_result.get("entries"): + raise HTTPException(status_code=404, detail="No word data available") + + column_result = session.get("column_result") + row_result = session.get("row_result") + + # Load dewarped image + dewarped_png = await get_session_image(session_id, "dewarped") + if not dewarped_png: + raise HTTPException(status_code=404, detail="Dewarped image not available") + + arr = np.frombuffer(dewarped_png, dtype=np.uint8) + img = cv2.imdecode(arr, cv2.IMREAD_COLOR) + if img is None: + raise HTTPException(status_code=500, detail="Failed to decode image") + + img_h, img_w = img.shape[:2] + + # Color map for cell types (BGR) + cell_colors = { + "column_en": (255, 180, 0), # Blue + "column_de": (0, 200, 0), # Green + "column_example": (0, 140, 255), # Orange + } + + overlay = img.copy() + + # Draw column divider lines (vertical) + if column_result and column_result.get("columns"): + for col in column_result["columns"]: + col_type = col.get("type", "") + if col_type in cell_colors: + cx = col["x"] + cv2.line(img, (cx, 0), (cx, img_h), cell_colors[col_type], 1) + cx_end = col["x"] + col["width"] + cv2.line(img, (cx_end, 0), (cx_end, img_h), cell_colors[col_type], 1) + + # Draw row divider lines (horizontal) for content rows + if 
row_result and row_result.get("rows"): + for row in row_result["rows"]: + if row.get("row_type") == "content": + ry = row["y"] + cv2.line(img, (0, ry), (img_w, ry), (180, 180, 180), 1) + + # Draw entry cells with text labels + entries = word_result["entries"] + for entry in entries: + conf = entry.get("confidence", 0) + # Color by confidence: green > 70, yellow 50-70, red < 50 + if conf >= 70: + text_color = (0, 180, 0) + elif conf >= 50: + text_color = (0, 180, 220) + else: + text_color = (0, 0, 220) + + for bbox_key, field_key, col_type in [ + ("bbox_en", "english", "column_en"), + ("bbox_de", "german", "column_de"), + ("bbox_ex", "example", "column_example"), + ]: + bbox = entry.get(bbox_key) + text = entry.get(field_key, "") + if not bbox or not text: + continue + + # Convert percent to pixels + bx = int(bbox["x"] / 100 * img_w) + by = int(bbox["y"] / 100 * img_h) + bw = int(bbox["w"] / 100 * img_w) + bh = int(bbox["h"] / 100 * img_h) + + color = cell_colors.get(col_type, (200, 200, 200)) + + # Semi-transparent fill + cv2.rectangle(overlay, (bx, by), (bx + bw, by + bh), color, -1) + + # Border + cv2.rectangle(img, (bx, by), (bx + bw, by + bh), text_color, 1) + + # Text label (truncate if too long) + label = text[:30] if len(text) > 30 else text + font_scale = 0.35 + cv2.putText(img, label, (bx + 3, by + bh - 4), + cv2.FONT_HERSHEY_SIMPLEX, font_scale, text_color, 1) + + # Blend overlay at 10% opacity + cv2.addWeighted(overlay, 0.1, img, 0.9, 0, img) + + success, result_png = cv2.imencode(".png", img) + if not success: + raise HTTPException(status_code=500, detail="Failed to encode overlay image") + + return Response(content=result_png.tobytes(), media_type="image/png") diff --git a/klausur-service/backend/ocr_pipeline_session_store.py b/klausur-service/backend/ocr_pipeline_session_store.py index 9670763..84343c6 100644 --- a/klausur-service/backend/ocr_pipeline_session_store.py +++ b/klausur-service/backend/ocr_pipeline_session_store.py @@ -80,7 +80,7 @@ async 
def create_session_db( ) VALUES ($1, $2, $3, $4, 'active', 1) RETURNING id, name, filename, status, current_step, deskew_result, dewarp_result, column_result, row_result, - ground_truth, auto_shear_degrees, + word_result, ground_truth, auto_shear_degrees, created_at, updated_at """, uuid.UUID(session_id), name, filename, original_png) @@ -94,7 +94,7 @@ async def get_session_db(session_id: str) -> Optional[Dict[str, Any]]: row = await conn.fetchrow(""" SELECT id, name, filename, status, current_step, deskew_result, dewarp_result, column_result, row_result, - ground_truth, auto_shear_degrees, + word_result, ground_truth, auto_shear_degrees, created_at, updated_at FROM ocr_pipeline_sessions WHERE id = $1 """, uuid.UUID(session_id)) @@ -136,10 +136,10 @@ async def update_session_db(session_id: str, **kwargs) -> Optional[Dict[str, Any 'name', 'filename', 'status', 'current_step', 'original_png', 'deskewed_png', 'binarized_png', 'dewarped_png', 'deskew_result', 'dewarp_result', 'column_result', 'row_result', - 'ground_truth', 'auto_shear_degrees', + 'word_result', 'ground_truth', 'auto_shear_degrees', } - jsonb_fields = {'deskew_result', 'dewarp_result', 'column_result', 'row_result', 'ground_truth'} + jsonb_fields = {'deskew_result', 'dewarp_result', 'column_result', 'row_result', 'word_result', 'ground_truth'} for key, value in kwargs.items(): if key in allowed_fields: @@ -164,7 +164,7 @@ async def update_session_db(session_id: str, **kwargs) -> Optional[Dict[str, Any WHERE id = ${param_idx} RETURNING id, name, filename, status, current_step, deskew_result, dewarp_result, column_result, row_result, - ground_truth, auto_shear_degrees, + word_result, ground_truth, auto_shear_degrees, created_at, updated_at """, *values) @@ -220,7 +220,7 @@ def _row_to_dict(row: asyncpg.Record) -> Dict[str, Any]: result[key] = result[key].isoformat() # JSONB β†’ parsed (asyncpg returns str for JSONB) - for key in ['deskew_result', 'dewarp_result', 'column_result', 'row_result', 
'ground_truth']: + for key in ['deskew_result', 'dewarp_result', 'column_result', 'row_result', 'word_result', 'ground_truth']: if key in result and result[key] is not None: if isinstance(result[key], str): result[key] = json.loads(result[key]) diff --git a/mkdocs.yml b/mkdocs.yml index e1ccb8f..d3de9c4 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -65,6 +65,7 @@ nav: - BYOEH Architektur: services/klausur-service/BYOEH-Architecture.md - BYOEH Developer Guide: services/klausur-service/BYOEH-Developer-Guide.md - NiBiS Pipeline: services/klausur-service/NiBiS-Ingestion-Pipeline.md + - OCR Pipeline: services/klausur-service/OCR-Pipeline.md - OCR Labeling: services/klausur-service/OCR-Labeling-Spec.md - OCR Vergleich: services/klausur-service/OCR-Compare.md - RAG Admin: services/klausur-service/RAG-Admin-Spec.md