'use client'

/**
 * Quality Dashboard - BQAS (Breakpilot Quality Assurance System)
 *
 * Comprehensive quality-assurance dashboard covering:
 * - Golden test suite results
 * - Synthetic test generation
 * - Regression tracking
 * - Test run history
 * - RAG correction tests
 */

import { useState, useEffect, useCallback } from 'react'
import AdminLayout from '@/components/admin/AdminLayout'

// Types

/**
 * Shape of a single evaluated test case.
 * NOTE(review): not referenced by any code visible in this file.
 */
interface TestResult {
  test_id: string
  test_name: string
  passed: boolean
  composite_score: number
  intent_accuracy: number
  faithfulness: number
  relevance: number
  coherence: number
  safety: string
  reasoning: string
  expected_intent: string
  detected_intent: string
}

/** One row of the test-run history as returned under `runs` by the `/api/v1/bqas/runs` endpoint. */
interface TestRun {
  id: number
  timestamp: string
  git_commit: string
  golden_score: number
  synthetic_score: number
  total_tests: number
  passed_tests: number
  failed_tests: number
  duration_seconds: number
}

/**
 * Aggregated metrics of one test suite (golden, synthetic or RAG).
 *
 * How the dashboard renders the values: `avg_intent_accuracy` is printed with a
 * trailing `%`, `safety_pass_rate` is multiplied by 100 before printing, and the
 * per-intent scores are drawn as bars scaled by `score / 5`.
 */
interface BQASMetrics {
  total_tests: number
  passed_tests: number
  failed_tests: number
  avg_intent_accuracy: number
  avg_faithfulness: number
  avg_relevance: number
  avg_coherence: number
  safety_pass_rate: number
  avg_composite_score: number
  // Intent name -> numeric score. The type arguments were missing (a bare `Record`
  // does not compile); IntentScoresChart subtracts the values and calls toFixed on
  // them, so they are numbers keyed by string.
  scores_by_intent: Record<string, number>
  failed_test_ids: string[]
}

/** Score time series returned by the `/api/v1/bqas/trend` endpoint; `dates` and `scores` are parallel arrays. */
interface TrendData {
  dates: string[]
  scores: number[]
  trend: 'improving' | 'stable' | 'declining' | 'insufficient_data'
}

// API Configuration

/** Base URL of the voice service; falls back to the local default when the env variable is unset or empty. */
const VOICE_SERVICE_URL = process.env.NEXT_PUBLIC_VOICE_SERVICE_URL || 'http://localhost:8091'

// Components

/**
 * Small statistic tile with a title, a value, an optional subtitle and an
 * optional trend indicator.
 *
 * @param title    Label of the metric.
 * @param value    Value to display.
 * @param subtitle Optional secondary text.
 * @param trend    Optional trend direction; selects one entry of `trendIcons`.
 * @param color    Color theme of the tile, defaults to `'blue'`.
 */
function MetricCard({
  title,
  value,
  subtitle,
  trend,
  color = 'blue',
}: {
  title: string
  value: string | number
  subtitle?: string
  trend?: 'up' | 'down' | 'stable'
  color?: 'blue' | 'green' | 'red' | 'yellow' | 'purple'
}) {
  // Background/border class pair per color theme.
  const colorClasses = {
    blue: 'bg-blue-50 border-blue-200',
    green: 'bg-emerald-50 border-emerald-200',
    red: 'bg-red-50 border-red-200',
    yellow: 'bg-amber-50 border-amber-200',
    purple: 'bg-purple-50 border-purple-200',
  }
  // NOTE(review): the icon markup for each direction is absent in this copy of the file.
  const trendIcons = {
    up: ( ),
    down: ( ),
    stable: ( ),
  }
  return (

{title}

{value}

{subtitle &&

{subtitle}

}
{trend &&
{trendIcons[trend]}
}
) }

/**
 * Card summarising one test suite: title, description, pass-rate bar, counters
 * and the time of the last run.
 *
 * @param title       Suite name.
 * @param description Short description of the suite.
 * @param metrics     Latest metrics of the suite; the metrics section is only rendered when present.
 * @param onRun       Callback supplied by the parent for starting the suite.
 * @param isRunning   Whether a run of this suite is currently in progress.
 * @param lastRun     Timestamp of the last run; rendered via `new Date(lastRun)` in de-DE format.
 */
function TestSuiteCard({
  title,
  description,
  metrics,
  onRun,
  isRunning,
  lastRun,
}: {
  title: string
  description: string
  metrics?: BQASMetrics
  onRun: () => void
  isRunning: boolean
  lastRun?: string
}) {
  // Pass rate in percent. A suite that reports zero tests must yield 0 instead of
  // 0 / 0 = NaN, which would otherwise be rendered as "NaN%" and used as bar width.
  const passRate =
    metrics && metrics.total_tests > 0 ? (metrics.passed_tests / metrics.total_tests) * 100 : 0
  return (

{title}

{description}

{metrics && (
Pass Rate {passRate.toFixed(1)}%
= 80 ? 'bg-emerald-500' : passRate >= 60 ? 'bg-amber-500' : 'bg-red-500' }`} style={{ width: `${passRate}%` }} />

{metrics.total_tests}

Tests

{metrics.passed_tests}

Bestanden

{metrics.failed_tests}

Fehlgeschlagen

Durchschnittlicher Score: {metrics.avg_composite_score.toFixed(2)}

)} {lastRun && (

Letzter Lauf: {new Date(lastRun).toLocaleString('de-DE')}

)}
) }

/**
 * Line chart of the score trend with axis labels and a textual trend indicator.
 *
 * @param data Parallel `dates` / `scores` arrays plus the trend classification.
 *             When `data` is falsy or has no dates, a placeholder message is shown.
 */
function TrendChart({ data }: { data: TrendData }) {
  if (!data || data.dates.length === 0) { return (
Keine Trend-Daten verfuegbar
) }
  // The vertical axis always spans at least 0..5 and widens if scores fall outside that range.
  const maxScore = Math.max(...data.scores, 5)
  const minScore = Math.min(...data.scores, 0)
  // `|| 1` is a defensive fallback so the division below never uses 0 or NaN as divisor.
  const range = maxScore - minScore || 1
  // In the point calculations below, x is the index mapped to 0..100 (the `|| 1`
  // avoids dividing by zero for a single data point) and y is the score mapped to
  // 0..100 and inverted, so higher scores are drawn closer to the top.
  // Only the first five dates are rendered as x-axis labels.
  // NOTE(review): the trend label maps every value other than 'improving' and
  // 'declining' to 'Stabil', including 'insufficient_data' - confirm this is intended.
  return (
{/* Y-Axis Labels */}
{maxScore.toFixed(1)} {((maxScore + minScore) / 2).toFixed(1)} {minScore.toFixed(1)}
{/* Chart Area */}
{/* Grid Lines */} {/* Line Chart */} { const x = (i / (data.scores.length - 1 || 1)) * 100 const y = 100 - ((score - minScore) / range) * 100 return `${x},${y}` }) .join(' ')} /> {/* Data Points */} {data.scores.map((score, i) => { const x = (i / (data.scores.length - 1 || 1)) * 100 const y = 100 - ((score - minScore) / range) * 100 return })}
{/* X-Axis Labels */}
{data.dates.slice(0, 5).map((date, i) => ( {new Date(date).toLocaleDateString('de-DE', { day: '2-digit', month: '2-digit' })} ))}
{/* Trend Indicator */}
{data.trend === 'improving' ? 'Verbessernd' : data.trend === 'declining' ? 'Verschlechternd' : 'Stabil'}
) }

/**
 * Table of past test runs (id, timestamp, commit, golden score, test counts, duration).
 *
 * @param runs Runs to list, rendered in the order given; an empty array shows a
 *             placeholder message instead of the table.
 */
function TestRunsTable({ runs }: { runs: TestRun[] }) {
  // Empty state: nothing to tabulate.
  if (runs.length === 0) { return (
Keine Test-Laeufe vorhanden
) }
  return (
{runs.map((run) => ( ))}
ID Zeitpunkt Commit Golden Score Tests Bestanden Dauer
#{run.id} {new Date(run.timestamp).toLocaleString('de-DE')} {run.git_commit?.slice(0, 7) || '-'} = 4 ? 'text-emerald-600' : run.golden_score >= 3 ? 'text-amber-600' : 'text-red-600' }`} > {run.golden_score.toFixed(2)} {run.total_tests} {run.passed_tests} / {run.failed_tests} {run.duration_seconds.toFixed(1)}s
) }

/**
 * Horizontal bar list of scores per intent, highest score first.
 *
 * @param scores Map of intent name to numeric score. Bars are scaled by `score / 5`
 *               and colored by the thresholds 4 and 3. An empty map shows a
 *               placeholder message.
 */
function IntentScoresChart({ scores }: { scores: Record<string, number> }) {
  // The parameter type was a bare `Record`, which does not compile; the values are
  // subtracted and formatted with toFixed below, so they are numbers keyed by string.
  // Object.entries returns a fresh array, so sorting does not mutate `scores`.
  const entries = Object.entries(scores).sort((a, b) => b[1] - a[1])
  if (entries.length === 0) { return (
Keine Intent-Scores verfuegbar
) }
  return (
{entries.map(([intent, score]) => (
{intent.replace(/_/g, ' ')} = 4 ? 'text-emerald-600' : score >= 3 ? 'text-amber-600' : 'text-red-600' }`} > {score.toFixed(2)}
= 4 ? 'bg-emerald-500' : score >= 3 ? 'bg-amber-500' : 'bg-red-500' }`} style={{ width: `${(score / 5) * 100}%` }} />
))}
) }

/**
 * List of failed test ids.
 *
 * @param testIds       Ids of the failed tests; an empty array shows a success message.
 * @param onViewDetails Optional callback taking a test id; the per-row details
 *                      control is only rendered when it is provided.
 */
function FailedTestsList({ testIds, onViewDetails }: { testIds: string[]; onViewDetails?: (id: string) => void }) {
  // Empty state: every test passed.
  if (testIds.length === 0) { return (
Alle Tests bestanden!
) }
  return (
{testIds.map((testId) => (
{testId}
{onViewDetails && ( )}
))}
) }

// Scheduler Status Component
/**
 * Status tile for one scheduler component.
 *
 * @param title       Name of the component.
 * @param status      Current state; selects the color scheme below.
 * @param description Short explanatory text.
 * @param icon        Icon node rendered inside the tile.
 */
function SchedulerStatusCard({
  title,
  status,
  description,
  icon,
}: {
  title: string
  status: 'active' | 'inactive' | 'warning' | 'unknown'
  description: string
  icon: React.ReactNode
}) {
  // Background/border/text classes per status.
  // NOTE(review): presumably applied to the tile container - the markup is not visible here.
  const statusColors = {
    active: 'bg-emerald-100 border-emerald-200 text-emerald-700',
    inactive: 'bg-slate-100 border-slate-200 text-slate-700',
    warning: 'bg-amber-100 border-amber-200 text-amber-700',
    unknown: 'bg-slate-100 border-slate-200 text-slate-500',
  }
  // Solid background class per status.
  // NOTE(review): presumably used for a small status dot - confirm against the markup.
  const statusBadges = {
    active: 'bg-emerald-500',
    inactive: 'bg-slate-400',
    warning: 'bg-amber-500',
    unknown: 'bg-slate-300',
  }
  return (
{icon}

{title}

{description}

) }

// Main Component
/**
 * BQAS quality dashboard page.
 *
 * Loads test runs, trend data and the latest suite metrics from the voice
 * service on mount, offers handlers that trigger the golden, synthetic and RAG
 * suites, and renders one of six tabs.
 *
 * The state hooks carry explicit type arguments: without them `useState(null)`
 * and `useState([])` infer `null` and `never[]`, so assigning an error message,
 * fetched runs or metrics would not type-check.
 */
export default function QualityDashboard() {
  const [activeTab, setActiveTab] = useState<'overview' | 'golden' | 'rag' | 'synthetic' | 'history' | 'scheduler'>('overview')
  const [isLoading, setIsLoading] = useState(true)
  const [error, setError] = useState<string | null>(null)

  // Data states
  const [goldenMetrics, setGoldenMetrics] = useState<BQASMetrics | null>(null)
  const [syntheticMetrics, setSyntheticMetrics] = useState<BQASMetrics | null>(null)
  const [ragMetrics, setRagMetrics] = useState<BQASMetrics | null>(null)
  const [testRuns, setTestRuns] = useState<TestRun[]>([])
  const [trendData, setTrendData] = useState<TrendData | null>(null)

  // Running states
  const [isRunningGolden, setIsRunningGolden] = useState(false)
  const [isRunningSynthetic, setIsRunningSynthetic] = useState(false)
  const [isRunningRag, setIsRunningRag] = useState(false)

  // Fetch data
  // Performs the three requests one after another. A thrown error (e.g. network
  // failure) sets the error banner text; the loading flag is always cleared.
  // NOTE(review): a non-OK HTTP status is skipped silently - the corresponding
  // state keeps its previous value and no error is shown. Confirm this is intended.
  const fetchData = useCallback(async () => {
    setIsLoading(true)
    setError(null)

    try {
      // Fetch test runs
      const runsResponse = await fetch(`${VOICE_SERVICE_URL}/api/v1/bqas/runs`)
      if (runsResponse.ok) {
        const runsData = await runsResponse.json()
        setTestRuns(runsData.runs || [])
      }

      // Fetch trend data
      const trendResponse = await fetch(`${VOICE_SERVICE_URL}/api/v1/bqas/trend?days=30`)
      if (trendResponse.ok) {
        const trend = await trendResponse.json()
        setTrendData(trend)
      }

      // Fetch latest metrics
      const metricsResponse = await fetch(`${VOICE_SERVICE_URL}/api/v1/bqas/latest-metrics`)
      if (metricsResponse.ok) {
        const metrics = await metricsResponse.json()
        setGoldenMetrics(metrics.golden || null)
        setSyntheticMetrics(metrics.synthetic || null)
        setRagMetrics(metrics.rag || null)
      }
    } catch (err) {
      console.error('Failed to fetch BQAS data:', err)
      setError('Verbindung zum Voice-Service fehlgeschlagen')
    } finally {
      setIsLoading(false)
    }
  }, [])

  // Initial load; fetchData is stable because its dependency list is empty.
  useEffect(() => {
    fetchData()
  }, [fetchData])

  // Run test suites
  // Each handler POSTs to the suite's run endpoint, stores the returned metrics
  // and then reloads all dashboard data. Failures are only logged to the console.
  const runGoldenTests = async () => {
    setIsRunningGolden(true)
    try {
      const response = await fetch(`${VOICE_SERVICE_URL}/api/v1/bqas/run/golden`, {
        method: 'POST',
      })
      if (response.ok) {
        const result = await response.json()
        setGoldenMetrics(result.metrics)
        await fetchData()
      }
    } catch (err) {
      console.error('Failed to run golden tests:', err)
    } finally {
      setIsRunningGolden(false)
    }
  }

  const runSyntheticTests = async () => {
    setIsRunningSynthetic(true)
    try {
      const response = await fetch(`${VOICE_SERVICE_URL}/api/v1/bqas/run/synthetic`, {
        method: 'POST',
      })
      if (response.ok) {
        const result = await response.json()
        setSyntheticMetrics(result.metrics)
        await fetchData()
      }
    } catch (err) {
      console.error('Failed to run synthetic tests:', err)
    } finally {
      setIsRunningSynthetic(false)
    }
  }

  const runRagTests = async () => {
    setIsRunningRag(true)
    try {
      const response = await fetch(`${VOICE_SERVICE_URL}/api/v1/bqas/run/rag`, {
        method: 'POST',
      })
      if (response.ok) {
        const result = await response.json()
        setRagMetrics(result.metrics)
        await fetchData()
      }
    } catch (err) {
      console.error('Failed to run RAG tests:', err)
    } finally {
      setIsRunningRag(false)
    }
  }

  // Tab content
  const renderTabContent = () => {
    switch (activeTab) {
      case 'overview':
        return (
{/* Quick Stats */}
{/* Trend Chart */}

Score-Trend (30 Tage)

{/* Test Suites Grid */}
) case 'golden': return (

Golden Test Suite

Validierte Referenz-Tests gegen definierte Erwartungen

{goldenMetrics && ( <> {/* Metrics Overview */}

{goldenMetrics.total_tests}

Tests

{goldenMetrics.passed_tests}

Bestanden

{goldenMetrics.failed_tests}

Fehlgeschlagen

{goldenMetrics.avg_intent_accuracy.toFixed(0)}%

Intent Accuracy

{goldenMetrics.avg_composite_score.toFixed(2)}

Composite Score

{/* Intent Scores & Failed Tests */}

Scores nach Intent

Fehlgeschlagene Tests

)}
) case 'rag': return (

RAG/Korrektur Test Suite

Erwartungshorizont-Retrieval, Operatoren-Alignment, Citations

{ragMetrics ? ( <> {/* RAG Metrics */}

{ragMetrics.total_tests}

Tests

{ragMetrics.avg_faithfulness.toFixed(2)}

Faithfulness

{ragMetrics.avg_relevance.toFixed(2)}

Relevance

{(ragMetrics.safety_pass_rate * 100).toFixed(0)}%

Safety Pass

{/* RAG Categories */}

RAG Kategorien

Fehlgeschlagene Tests

) : (

Noch keine RAG-Test-Ergebnisse

Klicke "Tests starten" um die RAG-Suite auszufuehren

)}
{/* RAG Test Categories Explanation */}

Test-Kategorien

{[ { name: 'EH Retrieval', desc: 'Korrektes Abrufen von Erwartungshorizont-Passagen', color: 'blue' }, { name: 'Operator Alignment', desc: 'Passende Operatoren fuer Abitur-Aufgaben', color: 'purple' }, { name: 'Hallucination Control', desc: 'Keine erfundenen Fakten oder Inhalte', color: 'red' }, { name: 'Citation Enforcement', desc: 'Quellenangaben bei EH-Bezuegen', color: 'green' }, { name: 'Privacy Compliance', desc: 'Keine PII-Leaks, DSGVO-Konformitaet', color: 'amber' }, { name: 'Namespace Isolation', desc: 'Strikte Trennung zwischen Lehrern', color: 'slate' }, ].map((cat) => (

{cat.name}

{cat.desc}

))}
) case 'synthetic': return (

Synthetic Test Suite

LLM-generierte Variationen fuer Robustheit-Tests

{syntheticMetrics ? ( <>

{syntheticMetrics.total_tests}

Generierte Tests

{syntheticMetrics.passed_tests}

Bestanden

{syntheticMetrics.avg_composite_score.toFixed(2)}

Avg Score

{syntheticMetrics.avg_coherence.toFixed(2)}

Coherence

Intent-Variationen

Fehlgeschlagene Tests

) : (

Noch keine synthetischen Tests ausgefuehrt

Klicke "Tests starten" um Variationen zu generieren

)}
) case 'history': return (

Test Run Historie

) case 'scheduler': return (
{/* Status Overview */}
} /> } /> } />
{/* Quick Actions */}

Quick Actions

{/* GitHub Actions vs Local - Comparison */}

GitHub Actions Alternative

Der lokale BQAS Scheduler ersetzt GitHub Actions und bietet DSGVO-konforme, vollstaendig lokale Test-Ausfuehrung.

Feature GitHub Actions Lokaler Scheduler
Taegliche Tests (07:00) schedule: cron macOS launchd
Push-basierte Tests on: push Git post-commit Hook
PR-basierte Tests on: pull_request Nicht moeglich
Regression-Check API-Call Identischer API-Call
Benachrichtigungen GitHub Issues Desktop/Slack/Email
DSGVO-Konformitaet Daten bei GitHub (US) 100% lokal
Offline-Faehig Nein Ja
{/* Configuration Details */}

Konfiguration

{/* launchd Configuration */}

launchd Job

{`# ~/Library/LaunchAgents/com.breakpilot.bqas.plist
Label: com.breakpilot.bqas
Schedule: 07:00 taeglich
Script: /voice-service/scripts/run_bqas.sh
Logs: /var/log/bqas/`}
{/* Environment Variables */}

Umgebungsvariablen

BQAS_SERVICE_URL http://localhost:8091
BQAS_REGRESSION_THRESHOLD 0.1
BQAS_NOTIFY_DESKTOP true
BQAS_NOTIFY_SLACK false
{/* Detailed Explanation */}

Detaillierte Erklaerung

Warum ein lokaler Scheduler?

Der lokale BQAS Scheduler wurde entwickelt, um die gleiche Funktionalitaet wie GitHub Actions zu bieten, aber mit dem entscheidenden Vorteil, dass alle Daten zu 100% auf dem lokalen Mac Mini verbleiben. Dies ist besonders wichtig fuer DSGVO-Konformitaet, da keine Schuelerdaten oder Testergebnisse an externe Server uebertragen werden.

Komponenten

  • run_bqas.sh - Hauptscript das pytest ausfuehrt, Regression-Checks macht und Benachrichtigungen versendet
  • launchd Job - macOS-nativer Scheduler der das Script taeglich um 07:00 Uhr startet
  • Git Hook - post-commit Hook der bei Aenderungen im voice-service automatisch Quick-Tests startet
  • Notifier - Python-Modul das Desktop-, Slack- und E-Mail-Benachrichtigungen versendet

Installation

./voice-service/scripts/install_bqas_scheduler.sh install

Vorteile gegenueber GitHub Actions

  • 100% DSGVO-konform - alle Daten bleiben lokal
  • Keine Internet-Abhaengigkeit - funktioniert auch offline
  • Keine GitHub-Kosten fuer private Repositories
  • Schnellere Ausfuehrung ohne Cloud-Overhead
  • Volle Kontrolle ueber Scheduling und Benachrichtigungen
) default: return null } } return ( {/* Error Banner */} {error && (
{error}
)} {/* Loading State */} {isLoading && (
)} {/* Main Content */} {!isLoading && ( <> {/* Tabs */}
{/* Tab Content */} {renderTabContent()} )} {/* Footer Info */}
Voice Service: {VOICE_SERVICE_URL}
) }