diff --git a/admin-lehrer/app/(admin)/ai/ocr-ground-truth/page.tsx b/admin-lehrer/app/(admin)/ai/ocr-ground-truth/page.tsx new file mode 100644 index 0000000..44e6fb4 --- /dev/null +++ b/admin-lehrer/app/(admin)/ai/ocr-ground-truth/page.tsx @@ -0,0 +1,580 @@ +'use client' + +/** + * Ground-Truth Review Workflow + * + * Efficient mass-review of OCR sessions: + * - Session queue with auto-advance + * - Split-view: original image left, grid right + * - Confidence highlighting on cells + * - Quick-accept per row + * - Inline cell editing + * - Batch mark as ground truth + * - Progress tracking + */ + +import { useState, useEffect, useCallback, useRef } from 'react' +import { PagePurpose } from '@/components/common/PagePurpose' +import { AIToolsSidebarResponsive } from '@/components/ai/AIToolsSidebar' + +const KLAUSUR_API = '/klausur-api' + +// --------------------------------------------------------------------------- +// Types +// --------------------------------------------------------------------------- + +interface Session { + id: string + name: string + filename: string + status: string + created_at: string + document_category: string | null + has_ground_truth: boolean +} + +interface GridZone { + zone_id: string + zone_type: string + columns: Array<{ col_index: number; col_type: string; header: string }> + rows: Array<{ row_index: number; is_header: boolean }> + cells: GridCell[] +} + +interface GridCell { + cell_id: string + row_index: number + col_index: number + col_type: string + text: string + confidence?: number + is_bold?: boolean +} + +interface GridResult { + zones: GridZone[] + summary?: { + total_zones: number + total_columns: number + total_rows: number + total_cells: number + } +} + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +function confidenceColor(conf: number | undefined): string { + if (conf === undefined) return '' 
+ if (conf >= 80) return 'bg-emerald-50' + if (conf >= 50) return 'bg-amber-50' + return 'bg-red-50' +} + +function confidenceBorder(conf: number | undefined): string { + if (conf === undefined) return 'border-slate-200' + if (conf >= 80) return 'border-emerald-200' + if (conf >= 50) return 'border-amber-300' + return 'border-red-300' +} + +// --------------------------------------------------------------------------- +// Component +// --------------------------------------------------------------------------- + +export default function GroundTruthReviewPage() { + // Session list & queue + const [allSessions, setAllSessions] = useState([]) + const [filter, setFilter] = useState<'all' | 'unreviewed' | 'reviewed'>('unreviewed') + const [currentIdx, setCurrentIdx] = useState(0) + const [loading, setLoading] = useState(true) + + // Current session data + const [grid, setGrid] = useState(null) + const [loadingGrid, setLoadingGrid] = useState(false) + const [editingCell, setEditingCell] = useState(null) + const [editText, setEditText] = useState('') + const [acceptedRows, setAcceptedRows] = useState>(new Set()) + const [zoom, setZoom] = useState(100) + + // Batch operations + const [selectedSessions, setSelectedSessions] = useState>(new Set()) + const [marking, setMarking] = useState(false) + const [markResult, setMarkResult] = useState(null) + + // Stats + const [reviewedCount, setReviewedCount] = useState(0) + const [totalCount, setTotalCount] = useState(0) + + const imageRef = useRef(null) + + // Load all sessions + const loadSessions = useCallback(async () => { + setLoading(true) + try { + const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions?limit=200`) + if (!res.ok) return + const data = await res.json() + const sessions: Session[] = (data.sessions || []).map((s: any) => ({ + id: s.id, + name: s.name || '', + filename: s.filename || '', + status: s.status || 'active', + created_at: s.created_at || '', + document_category: s.document_category || 
null, + has_ground_truth: !!(s.ground_truth && s.ground_truth.build_grid_reference), + })) + setAllSessions(sessions) + setTotalCount(sessions.length) + setReviewedCount(sessions.filter(s => s.has_ground_truth).length) + } catch (e) { + console.error('Failed to load sessions:', e) + } finally { + setLoading(false) + } + }, []) + + useEffect(() => { loadSessions() }, [loadSessions]) + + // Filtered sessions + const filteredSessions = allSessions.filter(s => { + if (filter === 'unreviewed') return !s.has_ground_truth && s.status === 'active' + if (filter === 'reviewed') return s.has_ground_truth + return true + }) + + const currentSession = filteredSessions[currentIdx] || null + + // Load grid for current session + const loadGrid = useCallback(async (sessionId: string) => { + setLoadingGrid(true) + setGrid(null) + setAcceptedRows(new Set()) + setEditingCell(null) + try { + const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/grid-editor`) + if (res.ok) { + const data = await res.json() + setGrid(data.grid || data) + } + } catch (e) { + console.error('Failed to load grid:', e) + } finally { + setLoadingGrid(false) + } + }, []) + + useEffect(() => { + if (currentSession) loadGrid(currentSession.id) + }, [currentSession, loadGrid]) + + // Navigation + const goNext = () => { + if (currentIdx < filteredSessions.length - 1) setCurrentIdx(currentIdx + 1) + } + const goPrev = () => { + if (currentIdx > 0) setCurrentIdx(currentIdx - 1) + } + + // Accept row + const acceptRow = (zoneId: string, rowIdx: number) => { + const key = `${zoneId}-${rowIdx}` + setAcceptedRows(prev => new Set([...prev, key])) + } + + // Edit cell + const startEdit = (cell: GridCell) => { + setEditingCell(cell.cell_id) + setEditText(cell.text) + } + + const saveEdit = async () => { + if (!editingCell || !currentSession) return + try { + await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${currentSession.id}/update-cell`, { + method: 'POST', + headers: { 'Content-Type': 
'application/json' }, + body: JSON.stringify({ cell_id: editingCell, text: editText }), + }) + // Update local state + if (grid) { + const newGrid = { ...grid } + for (const zone of newGrid.zones) { + for (const cell of zone.cells) { + if (cell.cell_id === editingCell) { + cell.text = editText + } + } + } + setGrid(newGrid) + } + } catch (e) { + console.error('Failed to save cell:', e) + } + setEditingCell(null) + } + + // Mark as ground truth + const markGroundTruth = async (sessionId: string) => { + setMarking(true) + setMarkResult(null) + try { + const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/mark-ground-truth`, { + method: 'POST', + }) + if (res.ok) { + setMarkResult('success') + // Update local session state + setAllSessions(prev => prev.map(s => + s.id === sessionId ? { ...s, has_ground_truth: true } : s + )) + setReviewedCount(prev => prev + 1) + } else { + setMarkResult('error') + } + } catch { + setMarkResult('error') + } finally { + setMarking(false) + } + } + + // Batch mark + const batchMark = async () => { + setMarking(true) + let success = 0 + for (const sid of selectedSessions) { + try { + const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sid}/mark-ground-truth`, { + method: 'POST', + }) + if (res.ok) success++ + } catch { /* skip */ } + } + setAllSessions(prev => prev.map(s => + selectedSessions.has(s.id) ? { ...s, has_ground_truth: true } : s + )) + setReviewedCount(prev => prev + success) + setSelectedSessions(new Set()) + setMarking(false) + setMarkResult(`${success} Sessions als Ground Truth markiert`) + setTimeout(() => setMarkResult(null), 3000) + } + + // All cells for current grid + const allCells = grid?.zones?.flatMap(z => z.cells) || [] + const lowConfCells = allCells.filter(c => (c.confidence ?? 100) < 50) + + const imageUrl = currentSession + ? `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${currentSession.id}/image/original` + : null + + return ( + +
+ + + {/* Progress Bar */} +
+
+

Ground Truth Review

+ + {reviewedCount} von {totalCount} geprueft ({totalCount > 0 ? Math.round(reviewedCount / totalCount * 100) : 0}%) + +
+
+
0 ? (reviewedCount / totalCount) * 100 : 0}%` }} + /> +
+
+ + {/* Filter + Queue */} +
+
+ {(['unreviewed', 'reviewed', 'all'] as const).map(f => ( + + ))} +
+ + {/* Navigation */} +
+ + + {filteredSessions.length > 0 ? `${currentIdx + 1} / ${filteredSessions.length}` : '—'} + + +
+ + {/* Batch mark button */} + {selectedSessions.size > 0 && ( + + )} +
+ + {/* Toast */} + {markResult && ( +
+ {markResult === 'success' ? 'Als Ground Truth markiert!' : markResult === 'error' ? 'Fehler beim Markieren' : markResult} +
+ )} + + {/* Main Content: Split View */} + {loading ? ( +
Lade Sessions...
+ ) : !currentSession ? ( +
+

Keine Sessions in dieser Ansicht

+
+ ) : ( +
+ {/* Left: Original Image */} +
+
+ + {currentSession.name || currentSession.filename} + +
+ + {zoom}% + +
+
+
+ {imageUrl && ( + Original scan + )} +
+
+ + {/* Right: Grid Review */} +
+
+
+ + {allCells.length} Zellen + + {lowConfCells.length > 0 && ( + + {lowConfCells.length} niedrige Konfidenz + + )} +
+
+ {!currentSession.has_ground_truth && ( + + )} + {currentSession.has_ground_truth && ( + + Ground Truth + + )} + +
+
+ + {/* Grid Content */} +
+ {loadingGrid ? ( +
+ + + + + Lade Grid... +
+ ) : !grid || !grid.zones ? ( +
+ Kein Grid vorhanden. Bitte zuerst die Pipeline ausfuehren. +
+ ) : ( +
+ {grid.zones.map((zone, zi) => ( +
+ {/* Zone header */} +
+ Zone {zi + 1} ({zone.zone_type}) + {zone.columns?.length > 0 && ( + + {zone.columns.map(c => c.col_type.replace('column_', '')).join(' | ')} + + )} +
+ + {/* Group cells by row */} + {Array.from(new Set(zone.cells.map(c => c.row_index))) + .sort((a, b) => a - b) + .map(rowIdx => { + const rowCells = zone.cells + .filter(c => c.row_index === rowIdx) + .sort((a, b) => a.col_index - b.col_index) + const rowKey = `${zone.zone_id || zi}-${rowIdx}` + const isAccepted = acceptedRows.has(rowKey) + + return ( +
+ {/* Quick accept button */} + + + {/* Cells */} +
+ {rowCells.map(cell => ( +
!isAccepted && startEdit(cell)} + title={`Konfidenz: ${cell.confidence ?? '?'}% | ${cell.col_type}`} + > + {editingCell === cell.cell_id ? ( + setEditText(e.target.value)} + onBlur={saveEdit} + onKeyDown={e => { + if (e.key === 'Enter') saveEdit() + if (e.key === 'Escape') setEditingCell(null) + }} + className="w-full bg-transparent outline-none text-sm" + /> + ) : ( + + {cell.text || '(leer)'} + + )} +
+ ))} +
+
+ ) + })} +
+ ))} +
+ )} +
+
+
+ )} + + {/* Session List (collapsed) */} + {filteredSessions.length > 1 && ( +
+ + Session-Liste ({filteredSessions.length}) + +
+ {filteredSessions.map((s, idx) => ( +
setCurrentIdx(idx)} + > + { + e.stopPropagation() + setSelectedSessions(prev => { + const next = new Set(prev) + if (next.has(s.id)) next.delete(s.id) + else next.add(s.id) + return next + }) + }} + className="rounded border-slate-300" + /> + + {s.name || s.filename || s.id} + {s.document_category && ( + {s.document_category} + )} +
+ ))} +
+
+ )} +
+ + ) +} diff --git a/admin-lehrer/app/(admin)/ai/ocr-regression/page.tsx b/admin-lehrer/app/(admin)/ai/ocr-regression/page.tsx new file mode 100644 index 0000000..c2252d1 --- /dev/null +++ b/admin-lehrer/app/(admin)/ai/ocr-regression/page.tsx @@ -0,0 +1,391 @@ +'use client' + +/** + * OCR Regression Dashboard + * + * Shows all ground-truth sessions, runs regression tests, + * displays pass/fail results with diff details, and shows history. + */ + +import { useState, useEffect, useCallback } from 'react' +import { PagePurpose } from '@/components/common/PagePurpose' +import { AIToolsSidebarResponsive } from '@/components/ai/AIToolsSidebar' + +const KLAUSUR_API = '/klausur-api' + +// --------------------------------------------------------------------------- +// Types +// --------------------------------------------------------------------------- + +interface GTSession { + session_id: string + name: string + filename: string + document_category: string | null + pipeline: string | null + saved_at: string | null + summary: { + total_zones: number + total_columns: number + total_rows: number + total_cells: number + } +} + +interface DiffSummary { + structural_changes: number + cells_missing: number + cells_added: number + text_changes: number + col_type_changes: number +} + +interface RegressionResult { + session_id: string + name: string + status: 'pass' | 'fail' | 'error' + error?: string + diff_summary?: DiffSummary + reference_summary?: Record + current_summary?: Record + structural_diffs?: Array<{ field: string; reference: number; current: number }> + cell_diffs?: Array<{ type: string; cell_id: string; reference?: string; current?: string }> +} + +interface RegressionRun { + id: string + run_at: string + status: string + total: number + passed: number + failed: number + errors: number + duration_ms: number + triggered_by: string +} + +// --------------------------------------------------------------------------- +// Helpers +// 
--------------------------------------------------------------------------- + +function StatusBadge({ status }: { status: string }) { + const cls = + status === 'pass' + ? 'bg-emerald-100 text-emerald-800 border-emerald-200' + : status === 'fail' + ? 'bg-red-100 text-red-800 border-red-200' + : 'bg-amber-100 text-amber-800 border-amber-200' + return ( + + {status === 'pass' ? 'Pass' : status === 'fail' ? 'Fail' : 'Error'} + + ) +} + +function formatDate(iso: string | null) { + if (!iso) return '—' + return new Date(iso).toLocaleString('de-DE', { + day: '2-digit', month: '2-digit', year: 'numeric', + hour: '2-digit', minute: '2-digit', + }) +} + +// --------------------------------------------------------------------------- +// Component +// --------------------------------------------------------------------------- + +export default function OCRRegressionPage() { + const [sessions, setSessions] = useState([]) + const [results, setResults] = useState([]) + const [history, setHistory] = useState([]) + const [running, setRunning] = useState(false) + const [overallStatus, setOverallStatus] = useState(null) + const [durationMs, setDurationMs] = useState(null) + const [expandedSession, setExpandedSession] = useState(null) + const [tab, setTab] = useState<'current' | 'history'>('current') + + // Load ground-truth sessions + const loadSessions = useCallback(async () => { + try { + const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/ground-truth-sessions`) + if (res.ok) { + const data = await res.json() + setSessions(data.sessions || []) + } + } catch (e) { + console.error('Failed to load GT sessions:', e) + } + }, []) + + // Load history + const loadHistory = useCallback(async () => { + try { + const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/regression/history?limit=20`) + if (res.ok) { + const data = await res.json() + setHistory(data.runs || []) + } + } catch (e) { + console.error('Failed to load history:', e) + } + }, []) + + useEffect(() => { + 
loadSessions() + loadHistory() + }, [loadSessions, loadHistory]) + + // Run all regressions + const runAll = async () => { + setRunning(true) + setResults([]) + setOverallStatus(null) + setDurationMs(null) + try { + const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/regression/run?triggered_by=manual`, { + method: 'POST', + }) + if (res.ok) { + const data = await res.json() + setResults(data.results || []) + setOverallStatus(data.status) + setDurationMs(data.duration_ms) + loadHistory() + } + } catch (e) { + console.error('Regression run failed:', e) + setOverallStatus('error') + } finally { + setRunning(false) + } + } + + const totalPass = results.filter(r => r.status === 'pass').length + const totalFail = results.filter(r => r.status === 'fail').length + const totalError = results.filter(r => r.status === 'error').length + + return ( + +
+ + + {/* Header + Run Button */} +
+
+

OCR Regression Tests

+

+ {sessions.length} Ground-Truth Session{sessions.length !== 1 ? 's' : ''} +

+
+ +
+ + {/* Overall Result Banner */} + {overallStatus && ( +
+
+
+ + + {totalPass} bestanden, {totalFail} fehlgeschlagen, {totalError} Fehler + +
+ {durationMs !== null && ( + {(durationMs / 1000).toFixed(1)}s + )} +
+
+ )} + + {/* Tabs */} +
+ +
+ + {/* Current Results Tab */} + {tab === 'current' && ( +
+ {results.length === 0 && !running && ( +
+

Keine Ergebnisse

+

Klicken Sie "Alle Tests starten" um die Regression zu laufen.

+
+ )} + {results.map(r => ( +
+
setExpandedSession(expandedSession === r.session_id ? null : r.session_id)} + > +
+ + {r.name || r.session_id} +
+
+ {r.diff_summary && ( + + {r.diff_summary.text_changes} Text, {r.diff_summary.structural_changes} Struktur + + )} + {r.error && {r.error}} + + + +
+
+ + {/* Expanded Details */} + {expandedSession === r.session_id && r.status === 'fail' && ( +
+ {/* Structural Diffs */} + {r.structural_diffs && r.structural_diffs.length > 0 && ( +
+

Strukturelle Aenderungen

+
+ {r.structural_diffs.map((d, i) => ( +
+ {d.field}: {d.reference} → {d.current} +
+ ))} +
+
+ )} + {/* Cell Diffs */} + {r.cell_diffs && r.cell_diffs.length > 0 && ( +
+

+ Zellen-Aenderungen ({r.cell_diffs.length}) +

+
+ {r.cell_diffs.slice(0, 50).map((d, i) => ( +
+ + {d.type} + {' '} + {d.cell_id} + {d.reference && ( + <> + {' '}{d.reference} + + )} + {d.current && ( + <> + {' '}{d.current} + + )} +
+ ))} + {r.cell_diffs.length > 50 && ( +

... und {r.cell_diffs.length - 50} weitere

+ )} +
+
+ )} +
+ )} +
+ ))} + + {/* Ground Truth Sessions Overview (when no results yet) */} + {results.length === 0 && sessions.length > 0 && ( +
+

Ground-Truth Sessions

+
+ {sessions.map(s => ( +
+
+ {s.name || s.session_id} + {s.filename} +
+
+ {s.summary.total_cells} Zellen, {s.summary.total_zones} Zonen + {s.pipeline && {s.pipeline}} +
+
+ ))} +
+
+ )} +
+ )} + + {/* History Tab */} + {tab === 'history' && ( +
+ {history.length === 0 ? ( +

Noch keine Laeufe aufgezeichnet.

+ ) : ( + + + + + + + + + + + + + + {history.map(run => ( + + + + + + + + + + ))} + +
DatumStatusGesamtPassFailDauerTrigger
{formatDate(run.run_at)}{run.total}{run.passed}{run.failed + run.errors}{(run.duration_ms / 1000).toFixed(1)}s{run.triggered_by}
+ )} +
+ )} +
+
+ ) +} diff --git a/admin-lehrer/lib/navigation.ts b/admin-lehrer/lib/navigation.ts index 618b6cd..5bcd4fc 100644 --- a/admin-lehrer/lib/navigation.ts +++ b/admin-lehrer/lib/navigation.ts @@ -182,6 +182,24 @@ export const navigation: NavCategory[] = [ // ----------------------------------------------------------------------- // KI-Anwendungen: Endnutzer-orientierte KI-Module // ----------------------------------------------------------------------- + { + id: 'ocr-regression', + name: 'OCR Regression', + href: '/ai/ocr-regression', + description: 'Regressions-Tests & Ground Truth', + purpose: 'Regressions-Tests fuer die OCR-Pipeline ausfuehren. Zeigt Pass/Fail pro Ground-Truth Session, Diff-Details und Verlauf vergangener Laeufe.', + audience: ['Entwickler', 'QA'], + subgroup: 'KI-Werkzeuge', + }, + { + id: 'ocr-ground-truth', + name: 'Ground Truth Review', + href: '/ai/ocr-ground-truth', + description: 'Ground Truth pruefen & markieren', + purpose: 'Effiziente Massenpruefung von OCR-Sessions. Split-View mit Confidence-Highlighting, Quick-Accept und Batch-Markierung als Ground Truth.', + audience: ['Entwickler', 'QA'], + subgroup: 'KI-Werkzeuge', + }, { id: 'agents', name: 'Agent Management', diff --git a/docs-src/development/regression-testing.md b/docs-src/development/regression-testing.md new file mode 100644 index 0000000..a92e414 --- /dev/null +++ b/docs-src/development/regression-testing.md @@ -0,0 +1,166 @@ +# OCR Pipeline Regression Testing + +**Stand:** 2026-03-23 + +--- + +## Uebersicht + +Das Regression Framework stellt sicher, dass Aenderungen an der OCR-Pipeline keine bestehenden +Ergebnisse verschlechtern. Ground-Truth Sessions dienen als Referenz — nach jeder Code-Aenderung +wird die Pipeline neu ausgefuehrt und das Ergebnis mit der Referenz verglichen. + +--- + +## Ground Truth markieren + +### Via Admin-UI (empfohlen) + +1. Oeffne die OCR Pipeline: [/ai/ocr-pipeline](https://macmini:3002/ai/ocr-pipeline) +2. 
Lade eine Session und fuehre alle Pipeline-Schritte aus +3. Pruefe das Ergebnis im Grid Editor (Schritt 10) +4. Korrigiere Fehler falls noetig (Inline-Edit) +5. Klicke **"Als Ground Truth markieren"** + +### Via API + +```bash +# Bestehende Session als Ground Truth markieren +curl -X POST "http://macmini:8086/api/v1/ocr-pipeline/sessions/{session_id}/mark-ground-truth" + +# Ground Truth entfernen +curl -X DELETE "http://macmini:8086/api/v1/ocr-pipeline/sessions/{session_id}/mark-ground-truth" + +# Alle Ground-Truth Sessions auflisten +curl "http://macmini:8086/api/v1/ocr-pipeline/ground-truth-sessions" +``` + +### Via Ground-Truth Review UI + +Fuer die Massenpruefung von 50-100 Sessions: + +1. Oeffne [/ai/ocr-ground-truth](https://macmini:3002/ai/ocr-ground-truth) +2. Filter auf "Offen" (ungeprueft) +3. Split-View: Bild links, Grid rechts pruefen +4. Korrekte Zeilen mit Haekchen bestaetigen +5. Fehler inline korrigieren +6. "Markieren & Weiter" fuer naechste Session + +--- + +## Regression ausfuehren + +### Via Shell-Script (CI/CD) + +```bash +# Standard: macmini:8086 +./scripts/run-regression.sh + +# Custom URL +./scripts/run-regression.sh http://localhost:8086 + +# Exit-Codes: +# 0 = alle bestanden +# 1 = Fehler gefunden +# 2 = Verbindungsfehler +``` + +### Via Admin-UI + +1. Oeffne [/ai/ocr-regression](https://macmini:3002/ai/ocr-regression) +2. Klicke **"Alle Tests starten"** +3. 
Ergebnis: Pass/Fail pro Session mit Diff-Details + +### Via API + +```bash +# Alle Ground-Truth Sessions testen +curl -X POST "http://macmini:8086/api/v1/ocr-pipeline/regression/run?triggered_by=script" + +# Einzelne Session testen +curl -X POST "http://macmini:8086/api/v1/ocr-pipeline/sessions/{session_id}/regression/run" + +# Verlauf abrufen +curl "http://macmini:8086/api/v1/ocr-pipeline/regression/history?limit=20" +``` + +--- + +## Ergebnisse lesen + +### Diff-Typen + +| Typ | Beschreibung | +|-----|-------------| +| `structural_changes` | Anzahl Zonen, Spalten oder Zeilen hat sich geaendert | +| `text_change` | Text einer Zelle hat sich geaendert | +| `cell_missing` | Zelle war in der Referenz, fehlt jetzt | +| `cell_added` | Neue Zelle die in der Referenz nicht existierte | +| `col_type_change` | Spaltentyp einer Zelle hat sich geaendert | + +### Status-Bewertung + +- **pass**: Keine Diffs → Code-Aenderung hat keine Auswirkung +- **fail**: Diffs gefunden → pruefen ob gewollt (Feature) oder ungewollt (Regression) +- **error**: Pipeline-Fehler → Build oder Config-Problem + +### Verlauf + +Alle Laeufe werden in der Tabelle `regression_runs` persistiert: + +```sql +SELECT id, run_at, status, total, passed, failed, errors, duration_ms, triggered_by +FROM regression_runs +ORDER BY run_at DESC +LIMIT 10; +``` + +--- + +## Best Practices + +### Ground-Truth Sessions waehlen + +Decke verschiedene Seitentypen ab: + +- Woerterbuchseiten (2-3 Spalten, IPA-Klammern) +- Uebungsseiten (Tabellen, Checkboxen) +- Seiten mit Illustrationen +- Seiten ohne IPA (reines Deutsch-Vokabular) +- Verschiedene Verlage und Layouts + +### Workflow vor jedem Commit + +```bash +# 1. Regression laufen lassen +./scripts/run-regression.sh + +# 2. Bei Failure: Diff pruefen +# - Gewollte Aenderung? → Ground Truth aktualisieren +# - Ungewollte Regression? → Code fixen + +# 3. Bei Pass: Commit +git add . && git commit -m "fix: ..." 
+``` + +--- + +## Datenbank-Schema + +```sql +CREATE TABLE regression_runs ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + run_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(), + status VARCHAR(20) NOT NULL, -- pass, fail, error + total INT NOT NULL DEFAULT 0, + passed INT NOT NULL DEFAULT 0, + failed INT NOT NULL DEFAULT 0, + errors INT NOT NULL DEFAULT 0, + duration_ms INT, + results JSONB NOT NULL DEFAULT '[]', -- Detail-Ergebnisse pro Session + triggered_by VARCHAR(50) DEFAULT 'manual' +); +``` + +Ground-Truth Referenzen werden im `ground_truth` JSONB-Feld der +`ocr_pipeline_sessions` Tabelle gespeichert. diff --git a/docs-src/services/klausur-service/OCR-Pipeline.md b/docs-src/services/klausur-service/OCR-Pipeline.md index e84473b..1a454e3 100644 --- a/docs-src/services/klausur-service/OCR-Pipeline.md +++ b/docs-src/services/klausur-service/OCR-Pipeline.md @@ -1,7 +1,7 @@ # OCR Pipeline - Schrittweise Seitenrekonstruktion -**Version:** 4.7.0 -**Status:** Produktiv (Schritte 1–10 + Grid Editor implementiert) +**Version:** 5.0.0 +**Status:** Produktiv (Schritte 1–10 + Grid Editor + Regression Framework) **URL:** https://macmini:3002/ai/ocr-pipeline ## Uebersicht @@ -1197,6 +1197,62 @@ des Headwords der vorherigen Zeile). Diese werden von PaddleOCR als garbled Text 4. Schlaegt IPA im Britfone-Woerterbuch nach 5. Beruecksichtigt alle Wortteile (z.B. "close sth. down" → `[klˈəʊz dˈaʊn]`) +### Compound Word IPA Decomposition (Step 5e) + +Zusammengesetzte Woerter wie "schoolbag" oder "blackbird" haben oft keinen eigenen +IPA-Eintrag im Woerterbuch. Die Funktion `_decompose_compound()` zerlegt sie: + +1. Probiere jede Teilungsposition (min. 3 Zeichen pro Teil) +2. Wenn beide Teile im Woerterbuch stehen → IPA verketten +3. 
Waehle die Teilung mit dem laengsten ersten Teil + +**Beispiele:** + +| Eingabe | Zerlegung | IPA | +|---------|-----------|-----| +| schoolbag | school + bag | skˈuːl + bæɡ | +| blackbird | black + bird | blæk + bˈɜːd | +| ice-cream | ice + cream | aɪs + kɹˈiːm | + +### Trailing Garbled Fragment Removal (Step 5f) + +Nach korrekt erkanntem IPA (z.B. `seat [sˈiːt]`) haengt OCR manchmal +eine garbled Kopie der IPA-Transkription an: `seat [sˈiːt] belt si:t belt`. + +**`_strip_post_bracket_garbled()`** erkennt und entfernt diese: + +1. Alles nach dem letzten `]` scannen +2. Woerter mit IPA-Markern (`:`, `ə`, `ɪ` etc.) → garbled, entfernen +3. Echte Woerter (Woerterbuch, Deutsch, Delimiter) → behalten +4. **Multi-Wort-Headword:** "belt" ist ein echtes Wort, aber wenn danach + garbled IPA kommt, wird nur "belt" behalten, der Rest entfernt + +### Regression Framework (Step 5g) + +Ground-Truth Sessions koennen als Referenz markiert werden. Nach jeder +Code-Aenderung vergleicht `POST /regression/run` die aktuelle Pipeline-Ausgabe +mit den gespeicherten Referenzen: + +- **Strukturelle Diffs:** Zonen, Spalten, Zeilen (Anzahl-Aenderungen) +- **Zellen-Diffs:** Text-Aenderungen, fehlende/neue Zellen, col_type-Aenderungen +- **Persistenz:** Ergebnisse in `regression_runs` Tabelle fuer Trend-Analyse +- **Shell-Script:** `scripts/run-regression.sh` fuer CI-Integration + +Admin-UI: [/ai/ocr-regression](https://macmini:3002/ai/ocr-regression) + +### Ground Truth Review Workflow (Step 5h) + +Admin-UI fuer effiziente Massenpruefung von Sessions: + +- **Split-View:** Original-Bild links, erkannter Grid rechts +- **Confidence-Highlighting:** Niedrige Konfidenz rot hervorgehoben +- **Quick-Accept:** Korrekte Zeilen mit einem Klick bestaetigen +- **Inline-Edit:** Text direkt im Grid korrigieren +- **Session-Queue:** Automatisch naechste Session laden +- **Batch-Mark:** Mehrere Sessions gleichzeitig als Ground Truth markieren + +Admin-UI: 
[/ai/ocr-ground-truth](https://macmini:3002/ai/ocr-ground-truth) + ### `en_col_type` Erkennung Die Erkennung der Englisch-Headword-Spalte nutzt **Bracket-IPA-Pattern-Count** @@ -1536,6 +1592,7 @@ cd klausur-service/backend && pytest tests/test_paddle_kombi.py -v # 36 Tests | Datum | Version | Aenderung | |-------|---------|----------| +| 2026-03-23 | 5.0.0 | **Phase 1 Sprint 1:** Compound-IPA-Zerlegung (`_decompose_compound`), Trailing-Garbled-Fragment-Entfernung (Multi-Wort-Headwords), Regression Framework mit DB-Persistenz + History + Shell-Script, Ground-Truth Review Workflow UI, Page-Crop Determinismus verifiziert. Admin-Seiten: `/ai/ocr-regression`, `/ai/ocr-ground-truth`. | | 2026-03-20 | 4.7.0 | Grid Editor: Zone Merging ueber Bilder (`image_overlays`), Heading Detection (Farbe + Hoehe), Ghost-Filter (borderless-aware), Oversized Word Box Removal, IPA Phonetic Correction (Britfone), IPA Continuation Detection, `en_col_type` via Bracket-Count. 27 Tests. | | 2026-03-16 | 4.6.0 | Strukturerkennung (Schritt 8): Region-basierte Grafikerkennung (`cv_graphic_detect.py`) mit Zwei-Pass-Verfahren (Farbregionen + schwarze Illustrationen), Wort-Ueberlappungs-Filter, Box/Zonen/Farb-Analyse. Schritt laeuft nach Worterkennung. | | 2026-03-12 | 4.5.0 | Kombi-Modus (PaddleOCR + Tesseract): Beide Engines laufen parallel, Koordinaten werden IoU-basiert gematcht und confidence-gewichtet gemittelt. Ungematchte Tesseract-Woerter (Bullets, Symbole) werden hinzugefuegt. 3er-Toggle in OCR Overlay. | diff --git a/klausur-service/backend/cv_ocr_engines.py b/klausur-service/backend/cv_ocr_engines.py index 0e96432..3a3efa2 100644 --- a/klausur-service/backend/cv_ocr_engines.py +++ b/klausur-service/backend/cv_ocr_engines.py @@ -1032,6 +1032,37 @@ def _text_has_garbled_ipa(text: str) -> bool: return False +def _decompose_compound(word: str, pronunciation: str = 'british') -> Optional[str]: + """Try to decompose a compound word and concatenate IPA for each part. + + E.g. 
"schoolbag" → "school"+"bag" → IPA for both concatenated. + Only returns IPA if ALL parts are found in the dictionary. + + Tries splits at every position (min 3 chars per part) and picks the + split where the first part is longest. + """ + if not IPA_AVAILABLE: + return None + lower = word.lower().strip() + if len(lower) < 6: + return None # too short for a compound + + best_ipa = None + best_first_len = 0 + + for split_pos in range(3, len(lower) - 2): # min 3 chars each part + first = lower[:split_pos] + second = lower[split_pos:] + ipa_first = _lookup_ipa(first, pronunciation) + ipa_second = _lookup_ipa(second, pronunciation) + if ipa_first and ipa_second: + if split_pos > best_first_len: + best_first_len = split_pos + best_ipa = ipa_first + ipa_second + + return best_ipa + + def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str: """Insert IPA pronunciation for English words that have no brackets at all. @@ -1077,6 +1108,10 @@ def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str: # Fallback: try without hyphens (e.g. "second-hand" → "secondhand") if not ipa and '-' in clean: ipa = _lookup_ipa(clean.replace('-', ''), pronunciation) + # Fallback 0b: compound word decomposition + # E.g. "schoolbag" → "school"+"bag" → concatenated IPA + if not ipa: + ipa = _decompose_compound(clean, pronunciation) # Fallback 1: IPA-marker split for merged tokens where OCR # joined headword with its IPA (e.g. "schoolbagsku:lbæg"). # Find the first IPA marker character (:, æ, ɪ, etc.), walk @@ -1098,6 +1133,9 @@ def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str: headword = w[:split] ocr_ipa = w[split:] hw_ipa = _lookup_ipa(headword, pronunciation) + if not hw_ipa: + # Try compound decomposition for the headword part + hw_ipa = _decompose_compound(headword, pronunciation) if hw_ipa: words[i] = f"{headword} [{hw_ipa}]" else: @@ -1197,6 +1235,12 @@ def _strip_post_bracket_garbled( E.g. 
``sea [sˈiː] si:`` → ``sea [sˈiː]`` ``seat [sˈiːt] si:t`` → ``seat [sˈiːt]`` + ``seat [sˈiːt] belt si:t belt`` → ``seat [sˈiːt] belt`` + + For multi-word headwords like "seat belt", a real English word ("belt") + may be followed by garbled IPA duplicates. We detect this by checking + whether the sequence after a real word contains IPA markers (`:`, `ə`, + etc.) — if so, everything from the first garbled token onward is stripped. """ if ']' not in text: return text @@ -1207,6 +1251,8 @@ def _strip_post_bracket_garbled( after = text[last_bracket + 1:].strip() if not after: return text + + _IPA_MARKER_CHARS = set(':əɪɛɒʊʌæɑɔʃʒθðŋˈˌ') after_words = after.split() kept: List[str] = [] for idx, w in enumerate(after_words): @@ -1215,17 +1261,42 @@ def _strip_post_bracket_garbled( kept.extend(after_words[idx:]) break # Contains IPA markers (length mark, IPA chars) — garbled, skip - if ':' in w or any(c in w for c in 'əɪɛɒʊʌæɑɔʃʒθðŋˈˌ'): + if any(c in w for c in _IPA_MARKER_CHARS): + # Everything from here is garbled IPA — stop scanning + # but look ahead: if any remaining words are real English + # words WITHOUT IPA markers, they might be a different headword + # following. Only skip the contiguous garbled run. 
continue clean = re.sub(r'[^a-zA-Z]', '', w) # Uppercase — likely German, keep rest if clean and clean[0].isupper(): kept.extend(after_words[idx:]) break - # Known English word — keep rest + # Known English word — keep it, but check if followed by garbled IPA + # (multi-word headword case like "seat [siːt] belt si:t belt") if clean and len(clean) >= 2 and _lookup_ipa(clean, pronunciation): - kept.extend(after_words[idx:]) - break + # Peek ahead: if next word has IPA markers, the rest is garbled + remaining = after_words[idx + 1:] + has_garbled_after = any( + any(c in rw for c in _IPA_MARKER_CHARS) + for rw in remaining + ) + if has_garbled_after: + # Keep this real word but stop — rest is garbled duplication + kept.append(w) + # Still scan for delimiters/German in the remaining words + for ridx, rw in enumerate(remaining): + if rw in ('–', '—', '-', '/', '|', ',', ';'): + kept.extend(remaining[ridx:]) + break + rclean = re.sub(r'[^a-zA-Z]', '', rw) + if rclean and rclean[0].isupper(): + kept.extend(remaining[ridx:]) + break + break + else: + kept.extend(after_words[idx:]) + break # Unknown short word — likely garbled, skip if kept: return before + ' ' + ' '.join(kept) diff --git a/klausur-service/backend/migrations/008_regression_runs.sql b/klausur-service/backend/migrations/008_regression_runs.sql new file mode 100644 index 0000000..f071086 --- /dev/null +++ b/klausur-service/backend/migrations/008_regression_runs.sql @@ -0,0 +1,18 @@ +-- Migration 008: Regression test run history +-- Stores results of regression test runs for trend analysis. 
+ +CREATE TABLE IF NOT EXISTS regression_runs ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + run_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(), + status VARCHAR(20) NOT NULL, -- 'pass', 'fail', 'error' + total INT NOT NULL DEFAULT 0, + passed INT NOT NULL DEFAULT 0, + failed INT NOT NULL DEFAULT 0, + errors INT NOT NULL DEFAULT 0, + duration_ms INT, + results JSONB NOT NULL DEFAULT '[]', + triggered_by VARCHAR(50) DEFAULT 'manual' -- 'manual', 'script', 'ci' +); + +CREATE INDEX IF NOT EXISTS idx_regression_runs_run_at + ON regression_runs (run_at DESC); diff --git a/klausur-service/backend/ocr_pipeline_regression.py b/klausur-service/backend/ocr_pipeline_regression.py index 47f206f..7636609 100644 --- a/klausur-service/backend/ocr_pipeline_regression.py +++ b/klausur-service/backend/ocr_pipeline_regression.py @@ -8,7 +8,11 @@ Lizenz: Apache 2.0 DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. """ +import json import logging +import os +import time +import uuid from datetime import datetime, timezone from typing import Any, Dict, List, Optional @@ -16,6 +20,7 @@ from fastapi import APIRouter, HTTPException, Query from grid_editor_api import _build_grid_core from ocr_pipeline_session_store import ( + get_pool, get_session_db, list_ground_truth_sessions_db, update_session_db, @@ -26,6 +31,60 @@ logger = logging.getLogger(__name__) router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["regression"]) +# --------------------------------------------------------------------------- +# DB persistence for regression runs +# --------------------------------------------------------------------------- + +async def _init_regression_table(): + """Ensure regression_runs table exists (idempotent).""" + pool = await get_pool() + async with pool.acquire() as conn: + migration_path = os.path.join( + os.path.dirname(__file__), + "migrations/008_regression_runs.sql", + ) + if os.path.exists(migration_path): + with open(migration_path, "r") as f: + sql = f.read() + await 
conn.execute(sql) + + +async def _persist_regression_run( + status: str, + summary: dict, + results: list, + duration_ms: int, + triggered_by: str = "manual", +) -> str: + """Save a regression run to the database. Returns the run ID.""" + try: + await _init_regression_table() + pool = await get_pool() + run_id = str(uuid.uuid4()) + async with pool.acquire() as conn: + await conn.execute( + """ + INSERT INTO regression_runs + (id, status, total, passed, failed, errors, duration_ms, results, triggered_by) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8::jsonb, $9) + """, + run_id, + status, + summary.get("total", 0), + summary.get("passed", 0), + summary.get("failed", 0), + summary.get("errors", 0), + duration_ms, + json.dumps(results), + triggered_by, + ) + logger.info("Regression run %s persisted: %s", run_id, status) + return run_id + except Exception as e: + logger.warning("Failed to persist regression run: %s", e) + return "" + + # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- @@ -299,8 +358,11 @@ async def run_single_regression(session_id: str): @router.post("/regression/run") -async def run_all_regressions(): +async def run_all_regressions( + triggered_by: str = Query("manual", description="Who triggered: manual, script, ci"), +): """Re-run build_grid for ALL ground-truth sessions and compare.""" + start_time = time.monotonic() sessions = await list_ground_truth_sessions_db() if not sessions: @@ -370,19 +432,105 @@ async def run_all_regressions(): results.append(entry) overall = "pass" if failed == 0 and errors == 0 else "fail" + duration_ms = int((time.monotonic() - start_time) * 1000) + + summary = { + "total": len(results), + "passed": passed, + "failed": failed, + "errors": errors, + } logger.info( - "Regression suite: %s — %d passed, %d failed, %d errors (of %d)", - overall, passed, failed, errors, len(results), + "Regression suite: %s — %d 
passed, %d failed, %d errors (of %d) in %dms", + overall, passed, failed, errors, len(results), duration_ms, + ) + + # Persist to DB + run_id = await _persist_regression_run( + status=overall, + summary=summary, + results=results, + duration_ms=duration_ms, + triggered_by=triggered_by, ) return { "status": overall, + "run_id": run_id, + "duration_ms": duration_ms, "results": results, - "summary": { - "total": len(results), - "passed": passed, - "failed": failed, - "errors": errors, - }, + "summary": summary, } + + +@router.get("/regression/history") +async def get_regression_history( + limit: int = Query(20, ge=1, le=100), +): + """Get recent regression run history from the database.""" + try: + await _init_regression_table() + pool = await get_pool() + async with pool.acquire() as conn: + rows = await conn.fetch( + """ + SELECT id, run_at, status, total, passed, failed, errors, + duration_ms, triggered_by + FROM regression_runs + ORDER BY run_at DESC + LIMIT $1 + """, + limit, + ) + return { + "runs": [ + { + "id": str(row["id"]), + "run_at": row["run_at"].isoformat() if row["run_at"] else None, + "status": row["status"], + "total": row["total"], + "passed": row["passed"], + "failed": row["failed"], + "errors": row["errors"], + "duration_ms": row["duration_ms"], + "triggered_by": row["triggered_by"], + } + for row in rows + ], + "count": len(rows), + } + except Exception as e: + logger.warning("Failed to fetch regression history: %s", e) + return {"runs": [], "count": 0, "error": str(e)} + + +@router.get("/regression/history/{run_id}") +async def get_regression_run_detail(run_id: str): + """Get detailed results of a specific regression run.""" + try: + await _init_regression_table() + pool = await get_pool() + async with pool.acquire() as conn: + row = await conn.fetchrow( + "SELECT * FROM regression_runs WHERE id = $1", + run_id, + ) + if not row: + raise HTTPException(status_code=404, detail="Run not found") + return { + "id": str(row["id"]), + "run_at": 
row["run_at"].isoformat() if row["run_at"] else None, + "status": row["status"], + "total": row["total"], + "passed": row["passed"], + "failed": row["failed"], + "errors": row["errors"], + "duration_ms": row["duration_ms"], + "triggered_by": row["triggered_by"], + "results": json.loads(row["results"]) if row["results"] else [], + } + except HTTPException: + raise + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) diff --git a/klausur-service/backend/tests/test_cell_phonetics.py b/klausur-service/backend/tests/test_cell_phonetics.py index 4918497..3ea2b74 100644 --- a/klausur-service/backend/tests/test_cell_phonetics.py +++ b/klausur-service/backend/tests/test_cell_phonetics.py @@ -57,6 +57,63 @@ class TestInsertMissingIpa: result = _insert_missing_ipa("Anstecknadel", "british") assert result == "Anstecknadel" + def test_compound_word_schoolbag_gets_ipa(self): + """R07: Compound word 'schoolbag' should get decomposed IPA (school+bag).""" + from cv_ocr_engines import _insert_missing_ipa + result = _insert_missing_ipa("schoolbag", "british") + assert "[" in result and "]" in result + assert result.startswith("schoolbag [") + + def test_compound_word_blackbird(self): + """Compound word 'blackbird' should get decomposed IPA.""" + from cv_ocr_engines import _insert_missing_ipa + result = _insert_missing_ipa("blackbird", "british") + assert "[" in result and "]" in result + + def test_compound_word_too_short(self): + """Words shorter than 6 chars should not attempt compound decomposition.""" + from cv_ocr_engines import _decompose_compound + assert _decompose_compound("bag", "british") is None + + def test_decompose_compound_direct(self): + """Direct test of _decompose_compound for known compounds.""" + from cv_ocr_engines import _decompose_compound + # schoolbag = school + bag — both should be in dictionary + result = _decompose_compound("schoolbag", "british") + assert result is not None + + +class TestStripPostBracketGarbled: + """Tests for 
_strip_post_bracket_garbled — trailing garbled IPA removal.""" + + def test_simple_trailing_garbled(self): + """R21-simple: 'sea [sˈiː] si:' → trailing IPA marker removed.""" + from cv_ocr_engines import _strip_post_bracket_garbled + result = _strip_post_bracket_garbled("sea [sˈiː] si:") + assert "si:" not in result + assert result.startswith("sea [sˈiː]") + + def test_multi_word_trailing_garbled(self): + """R21: 'seat [sˈiːt] belt si:t belt' → keep 'belt', remove garbled.""" + from cv_ocr_engines import _strip_post_bracket_garbled + result = _strip_post_bracket_garbled("seat [sˈiːt] belt si:t belt") + assert "belt" in result # real word kept + assert "si:t" not in result # garbled removed + # Should contain "seat [sˈiːt] belt" but not the garbled duplication + assert result.count("belt") == 1 + + def test_delimiter_after_bracket_kept(self): + """Delimiters after IPA bracket are kept.""" + from cv_ocr_engines import _strip_post_bracket_garbled + result = _strip_post_bracket_garbled("dance [dˈɑːns] – tanzen") + assert "– tanzen" in result + + def test_german_after_bracket_kept(self): + """German words (uppercase) after IPA bracket are kept.""" + from cv_ocr_engines import _strip_post_bracket_garbled + result = _strip_post_bracket_garbled("badge [bædʒ] Abzeichen") + assert "Abzeichen" in result + class TestFixCellPhonetics: """Tests for fix_cell_phonetics function.""" diff --git a/klausur-service/backend/tests/test_page_crop.py b/klausur-service/backend/tests/test_page_crop.py index 061e73e..d506486 100644 --- a/klausur-service/backend/tests/test_page_crop.py +++ b/klausur-service/backend/tests/test_page_crop.py @@ -415,3 +415,53 @@ class TestDetectAndCropPage: assert 0 <= pct["y"] <= 100 assert 0 < pct["width"] <= 100 assert 0 < pct["height"] <= 100 + + +class TestCropDeterminism: + """A3: Verify that page crop produces identical results across N runs.""" + + @pytest.mark.parametrize("image_factory,desc", [ + ( + lambda: _make_image_with_content(800, 600, (100, 700, 
80, 520)), + "standard content", + ), + ( + lambda: _make_book_scan(1000, 800), + "book scan with spine shadow", + ), + ]) + def test_determinism_10_runs(self, image_factory, desc): + """Same image must produce identical crops in 10 consecutive runs.""" + img = image_factory() + results = [] + for _ in range(10): + cropped, result = detect_and_crop_page(img.copy()) + results.append({ + "crop_applied": result["crop_applied"], + "cropped_size": result["cropped_size"], + "border_fractions": result["border_fractions"], + "shape": cropped.shape, + }) + + first = results[0] + for i, r in enumerate(results[1:], 1): + assert r["crop_applied"] == first["crop_applied"], ( + f"Run {i} crop_applied differs from run 0 ({desc})" + ) + assert r["cropped_size"] == first["cropped_size"], ( + f"Run {i} cropped_size differs from run 0 ({desc})" + ) + assert r["shape"] == first["shape"], ( + f"Run {i} output shape differs from run 0 ({desc})" + ) + + def test_determinism_pixel_identical(self): + """Crop output pixels must be identical across runs.""" + img = _make_image_with_content(800, 600, (100, 700, 80, 520)) + ref_crop, _ = detect_and_crop_page(img.copy()) + + for i in range(5): + crop, _ = detect_and_crop_page(img.copy()) + assert np.array_equal(ref_crop, crop), ( + f"Run {i} produced different pixel output" + ) diff --git a/mkdocs.yml b/mkdocs.yml index c5b1d46..8409a72 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -84,5 +84,6 @@ nav: - Zeugnis-System: architecture/zeugnis-system.md - Entwicklung: - Testing: development/testing.md + - Regression Testing: development/regression-testing.md - Dokumentation: development/documentation.md - CI/CD Pipeline: development/ci-cd-pipeline.md diff --git a/scripts/benchmark-trocr.py b/scripts/benchmark-trocr.py new file mode 100755 index 0000000..e0abd8b --- /dev/null +++ b/scripts/benchmark-trocr.py @@ -0,0 +1,163 @@ +#!/usr/bin/env python3 +""" +TrOCR Baseline Benchmark — measures PyTorch TrOCR performance. 
+
+Metrics:
+- RAM usage (RSS) before and after model load
+- Inference time per line (min, max, mean, p50, p95)
+- Model size on disk
+
+Output: JSON report to stdout (redirect to file for Sprint 2 comparison).
+
+Usage:
+    python scripts/benchmark-trocr.py [--model trocr-base-printed] [--runs 10]
+    python scripts/benchmark-trocr.py > benchmark-trocr-baseline.json
+"""
+
+import argparse
+import json
+import os
+import sys
+import time
+from datetime import datetime
+
+# Add backend to path for imports
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'klausur-service', 'backend'))
+
+
+def get_rss_mb():
+    """Get peak process RSS (ru_maxrss high-water mark, not current usage) in MB."""
+    import resource
+    # resource.getrusage returns KB on Linux, bytes on macOS
+    usage = resource.getrusage(resource.RUSAGE_SELF)
+    rss = usage.ru_maxrss
+    if sys.platform == 'darwin':
+        return rss / (1024 * 1024)  # bytes to MB on macOS
+    return rss / 1024  # KB to MB on Linux
+
+
+def get_model_size_mb(model_name):
+    """Estimate model size from HuggingFace cache."""
+    cache_dir = os.path.expanduser("~/.cache/huggingface/hub")
+    total = 0
+    model_dir_pattern = model_name.replace('/', '--')
+    for root, dirs, files in os.walk(cache_dir):
+        if model_dir_pattern in root:
+            for f in files:
+                total += os.path.getsize(os.path.join(root, f))
+    return total / (1024 * 1024)  # bytes to MB
+
+
+def benchmark_trocr(model_name: str = "microsoft/trocr-base-printed", num_runs: int = 10):
+    """Run TrOCR benchmark and return results dict."""
+    import numpy as np
+    from PIL import Image
+
+    rss_before = get_rss_mb()
+
+    # Load model
+    print(f"Loading model: {model_name}", file=sys.stderr)
+    load_start = time.monotonic()
+
+    try:
+        from transformers import TrOCRProcessor, VisionEncoderDecoderModel
+        processor = TrOCRProcessor.from_pretrained(model_name)
+        model = VisionEncoderDecoderModel.from_pretrained(model_name)
+        model.eval()
+    except Exception as e:
+        return {"error": f"Failed to load model: {e}"}
+
+    load_time = 
time.monotonic() - load_start + rss_after_load = get_rss_mb() + model_size = get_model_size_mb(model_name) + + print(f"Model loaded in {load_time:.1f}s, RSS: {rss_after_load:.0f}MB", file=sys.stderr) + + # Create synthetic test images (text line images) + test_images = [] + for i in range(num_runs): + # Create a simple white image with black text-like content + # In production, these would be real cropped text lines + w, h = 384, 48 # typical TrOCR input size + img = Image.new('RGB', (w, h), 'white') + # Add some variation + pixels = img.load() + # Simple dark region to simulate text + for x in range(50 + i * 10, 200 + i * 5): + for y in range(10, 38): + pixels[x, y] = (30, 30, 30) + test_images.append(img) + + # Warm-up run (not counted) + print("Warm-up...", file=sys.stderr) + import torch + with torch.no_grad(): + pixel_values = processor(images=test_images[0], return_tensors="pt").pixel_values + _ = model.generate(pixel_values, max_new_tokens=50) + + # Benchmark runs + print(f"Running {num_runs} inference passes...", file=sys.stderr) + times_ms = [] + for i, img in enumerate(test_images): + start = time.monotonic() + with torch.no_grad(): + pixel_values = processor(images=img, return_tensors="pt").pixel_values + generated_ids = model.generate(pixel_values, max_new_tokens=50) + text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] + elapsed_ms = (time.monotonic() - start) * 1000 + times_ms.append(elapsed_ms) + print(f" Run {i+1}/{num_runs}: {elapsed_ms:.0f}ms -> '{text[:30]}'", file=sys.stderr) + + rss_after_inference = get_rss_mb() + + # Compute stats + times_sorted = sorted(times_ms) + p50_idx = len(times_sorted) // 2 + p95_idx = int(len(times_sorted) * 0.95) + + report = { + "benchmark": "trocr-baseline", + "timestamp": datetime.utcnow().isoformat() + "Z", + "model": model_name, + "backend": "pytorch", + "quantization": "float32", + "num_runs": num_runs, + "model_size_mb": round(model_size, 1), + "ram_mb": { + "before_load": 
round(rss_before, 1), + "after_load": round(rss_after_load, 1), + "after_inference": round(rss_after_inference, 1), + "model_delta": round(rss_after_load - rss_before, 1), + }, + "load_time_seconds": round(load_time, 2), + "inference_ms": { + "min": round(min(times_ms), 1), + "max": round(max(times_ms), 1), + "mean": round(sum(times_ms) / len(times_ms), 1), + "p50": round(times_sorted[p50_idx], 1), + "p95": round(times_sorted[min(p95_idx, len(times_sorted) - 1)], 1), + }, + "times_ms": [round(t, 1) for t in times_ms], + "platform": { + "python": sys.version.split()[0], + "os": sys.platform, + }, + } + + return report + + +def main(): + parser = argparse.ArgumentParser(description="TrOCR Baseline Benchmark") + parser.add_argument("--model", default="microsoft/trocr-base-printed", + help="HuggingFace model name") + parser.add_argument("--runs", type=int, default=10, + help="Number of inference runs") + args = parser.parse_args() + + report = benchmark_trocr(model_name=args.model, num_runs=args.runs) + print(json.dumps(report, indent=2)) + + +if __name__ == "__main__": + main() diff --git a/scripts/run-regression.sh b/scripts/run-regression.sh new file mode 100755 index 0000000..e0b378d --- /dev/null +++ b/scripts/run-regression.sh @@ -0,0 +1,61 @@ +#!/usr/bin/env bash +# Run OCR pipeline regression tests and exit non-zero on failure. 
+# +# Usage: +# ./scripts/run-regression.sh # default: macmini:8086 +# ./scripts/run-regression.sh http://localhost:8086 +# +# Exit codes: +# 0 = all pass +# 1 = failures or errors +# 2 = connection error + +set -euo pipefail + +BASE_URL="${1:-http://macmini:8086}" +ENDPOINT="${BASE_URL}/api/v1/ocr-pipeline/regression/run?triggered_by=script" + +echo "=== OCR Pipeline Regression Suite ===" +echo "Endpoint: ${ENDPOINT}" +echo "" + +RESPONSE=$(curl -sf -X POST "${ENDPOINT}" -H "Content-Type: application/json" 2>&1) || { + echo "ERROR: Could not reach ${ENDPOINT}" + exit 2 +} + +STATUS=$(echo "${RESPONSE}" | python3 -c "import sys,json; print(json.load(sys.stdin)['status'])") +TOTAL=$(echo "${RESPONSE}" | python3 -c "import sys,json; s=json.load(sys.stdin)['summary']; print(s['total'])") +PASSED=$(echo "${RESPONSE}" | python3 -c "import sys,json; s=json.load(sys.stdin)['summary']; print(s['passed'])") +FAILED=$(echo "${RESPONSE}" | python3 -c "import sys,json; s=json.load(sys.stdin)['summary']; print(s['failed'])") +ERRORS=$(echo "${RESPONSE}" | python3 -c "import sys,json; s=json.load(sys.stdin)['summary']; print(s['errors'])") +DURATION=$(echo "${RESPONSE}" | python3 -c "import sys,json; print(json.load(sys.stdin).get('duration_ms', '?'))") + +echo "Status: ${STATUS}" +echo "Total: ${TOTAL}" +echo "Passed: ${PASSED}" +echo "Failed: ${FAILED}" +echo "Errors: ${ERRORS}" +echo "Duration: ${DURATION}ms" +echo "" + +if [ "${STATUS}" = "pass" ]; then + echo "PASS — All regression tests passed." + exit 0 +else + echo "FAIL — Regression failures detected!" 
+ # Print failure details + echo "${RESPONSE}" | python3 -c " +import sys, json +data = json.load(sys.stdin) +for r in data.get('results', []): + if r['status'] != 'pass': + print(f\" {r['status'].upper()}: {r.get('name', r['session_id'])}\") + if 'error' in r: + print(f\" Error: {r['error']}\") + ds = r.get('diff_summary', {}) + if ds: + print(f\" Structural: {ds.get('structural_changes', 0)}, Text: {ds.get('text_changes', 0)}, Missing: {ds.get('cells_missing', 0)}, Added: {ds.get('cells_added', 0)}\") +" + exit 1 +fi