A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.
This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).
Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
286 lines
8.9 KiB
Python
286 lines
8.9 KiB
Python
"""
|
|
BQAS Test Runner
|
|
|
|
Proxy zu den BQAS-Endpoints im Voice-Service.
|
|
"""
|
|
|
|
import logging
import time
from dataclasses import dataclass, field
from datetime import datetime
from typing import Dict, Optional

import httpx
|
|
|
|
|
|
@dataclass
class BQASResult:
    """
    Outcome of a single BQAS test-suite run.

    Only ``suite_type`` is required. Every other field has a neutral default
    (zero, empty container, empty string), so a result describing a failed
    or aborted run can be built from the suite name plus whatever is known.
    """

    # Which suite produced this result: "golden", "rag", "synthetic"
    suite_type: str

    # Test counters
    total_tests: int = 0
    passed_tests: int = 0
    failed_tests: int = 0

    # Aggregate score and run time in seconds
    avg_score: float = 0.0
    duration_seconds: float = 0.0

    # Mutable fields use factories so instances never share one container
    metrics: Dict = field(default_factory=dict)
    failed_test_ids: list = field(default_factory=list)

    # Free-form text: raw response dump, error message or demo-mode notice
    raw_output: str = ""
|
|
|
|
|
|
class BQASRunner:
    """
    Runner for BQAS test suites.

    Acts as a proxy: every call is forwarded to the BQAS endpoints of the
    voice service (``http://localhost:8091`` unless another base URL is
    given). When the service cannot be used, the methods fall back to
    hard-coded demo data; each such fallback is logged as a warning so demo
    numbers are not mistaken for real measurements.
    """

    VOICE_SERVICE_URL = "http://localhost:8091"

    # Timeout in seconds for the read-only GET endpoints.
    _QUERY_TIMEOUT = 10.0

    # Marker returned by _fetch_json when no usable response was obtained.
    # A dedicated object is used because a decoded JSON body may itself be
    # None.
    _UNAVAILABLE = object()

    _log = logging.getLogger(__name__)

    def __init__(self, api_base: Optional[str] = None):
        """
        Args:
            api_base: Base URL of the voice service. Falls back to
                ``VOICE_SERVICE_URL`` when omitted or empty.
        """
        self.api_base = api_base or self.VOICE_SERVICE_URL

    async def run_golden(self, timeout: int = 120) -> BQASResult:
        """
        Run the golden test suite.

        Args:
            timeout: Request timeout in seconds.

        Returns:
            BQASResult with all metrics.
        """
        return await self._run_suite("golden", timeout)

    async def run_rag(self, timeout: int = 120) -> BQASResult:
        """
        Run the RAG test suite.

        Args:
            timeout: Request timeout in seconds.

        Returns:
            BQASResult with all metrics.
        """
        return await self._run_suite("rag", timeout)

    async def run_synthetic(self, timeout: int = 300) -> BQASResult:
        """
        Run the synthetic test suite.

        Takes longer because of LLM generation, hence the larger default
        timeout.

        Args:
            timeout: Request timeout in seconds.

        Returns:
            BQASResult with all metrics.
        """
        return await self._run_suite("synthetic", timeout)

    async def _run_suite(self, suite_type: str, timeout: int) -> BQASResult:
        """
        Trigger one suite on the voice service and wrap the response.

        Never raises for ordinary failures; the outcome is encoded in the
        returned BQASResult instead:

        * HTTP 200: counters and scores taken from the ``metrics`` object
          of the response body.
        * Any other status: empty result whose ``raw_output`` carries the
          status code and response text.
        * Timeout: empty result with a timeout message.
        * Connection error: demo data from ``_get_demo_result``.
        * Anything else: empty result with the exception text.
        """
        # Monotonic clock: the elapsed time must not be affected by
        # wall-clock adjustments during a long-running suite.
        started = time.monotonic()

        def elapsed() -> float:
            return time.monotonic() - started

        try:
            async with httpx.AsyncClient(timeout=float(timeout)) as client:
                response = await client.post(
                    f"{self.api_base}/api/v1/bqas/run/{suite_type}",
                )

                if response.status_code != 200:
                    return BQASResult(
                        suite_type=suite_type,
                        duration_seconds=elapsed(),
                        raw_output=f"HTTP {response.status_code}: {response.text}",
                    )

                data = response.json()
                metrics = data.get("metrics", {})

                return BQASResult(
                    suite_type=suite_type,
                    total_tests=metrics.get("total_tests", 0),
                    passed_tests=metrics.get("passed_tests", 0),
                    failed_tests=metrics.get("failed_tests", 0),
                    avg_score=metrics.get("avg_composite_score", 0.0),
                    duration_seconds=elapsed(),
                    metrics=metrics,
                    failed_test_ids=metrics.get("failed_test_ids", []),
                    raw_output=str(data),
                )

        except httpx.TimeoutException:
            return BQASResult(
                suite_type=suite_type,
                duration_seconds=elapsed(),
                raw_output=f"Timeout nach {timeout} Sekunden",
            )

        except httpx.ConnectError as exc:
            # Demo data when the service is not reachable.
            self._log.warning(
                "BQAS suite %r: voice service not reachable (%s); "
                "returning demo data",
                suite_type,
                exc,
            )
            return self._get_demo_result(suite_type)

        except Exception as e:
            # Deliberate catch-all: callers expect a result object, never
            # an exception (e.g. for a malformed response body).
            return BQASResult(
                suite_type=suite_type,
                duration_seconds=elapsed(),
                raw_output=str(e),
            )

    def _get_demo_result(self, suite_type: str) -> BQASResult:
        """
        Return canned demo data for when the service is not reachable.

        ``"golden"`` and ``"rag"`` have their own data sets; every other
        suite type receives the synthetic data set.
        """
        if suite_type == "golden":
            return BQASResult(
                suite_type=suite_type,
                total_tests=97,
                passed_tests=89,
                failed_tests=8,
                avg_score=4.15,
                duration_seconds=45.2,
                metrics={
                    "avg_intent_accuracy": 91.7,
                    "avg_faithfulness": 4.2,
                    "avg_relevance": 4.1,
                    "avg_coherence": 4.3,
                    "safety_pass_rate": 0.98,
                },
                failed_test_ids=[
                    "GT-023",
                    "GT-045",
                    "GT-067",
                    "GT-072",
                    "GT-081",
                    "GT-089",
                    "GT-092",
                    "GT-095",
                ],
                raw_output="Demo-Modus: Voice-Service nicht erreichbar",
            )

        if suite_type == "rag":
            return BQASResult(
                suite_type=suite_type,
                total_tests=20,
                passed_tests=18,
                failed_tests=2,
                avg_score=4.25,
                duration_seconds=62.1,
                metrics={
                    "avg_faithfulness": 4.3,
                    "avg_relevance": 4.2,
                    "citation_accuracy": 0.92,
                },
                failed_test_ids=["RAG-EH-003", "RAG-HAL-002"],
                raw_output="Demo-Modus: Voice-Service nicht erreichbar",
            )

        # synthetic (also the fallback for any other suite type)
        return BQASResult(
            suite_type=suite_type,
            total_tests=50,
            passed_tests=45,
            failed_tests=5,
            avg_score=3.95,
            duration_seconds=180.5,
            metrics={
                "avg_robustness": 3.8,
                "avg_coherence": 4.1,
            },
            failed_test_ids=["SYN-001", "SYN-015", "SYN-023", "SYN-041", "SYN-048"],
            raw_output="Demo-Modus: Voice-Service nicht erreichbar",
        )

    async def _fetch_json(self, path: str, params: Optional[Dict] = None):
        """
        GET ``path`` on the voice service and decode the JSON body.

        Args:
            path: Endpoint path appended to ``api_base``.
            params: Optional query parameters.

        Returns:
            The decoded body on HTTP 200, otherwise ``_UNAVAILABLE``.
            Ordinary errors are logged and swallowed, because all callers
            fall back to demo data.
        """
        try:
            async with httpx.AsyncClient(timeout=self._QUERY_TIMEOUT) as client:
                response = await client.get(
                    f"{self.api_base}{path}",
                    params=params,
                )

                if response.status_code == 200:
                    return response.json()

                self._log.warning(
                    "BQAS GET %s returned HTTP %s; using demo data",
                    path,
                    response.status_code,
                )

        except Exception as exc:
            # Deliberate best-effort: an unavailable or misbehaving service
            # must not break the caller, but the fallback is made visible.
            self._log.warning(
                "BQAS GET %s failed (%s); using demo data", path, exc
            )

        return self._UNAVAILABLE

    async def get_latest_metrics(self) -> Optional[Dict]:
        """
        Fetch the most recent metrics from the voice service.

        Returns:
            The decoded response body on HTTP 200; otherwise demo data
            keyed by suite name.
        """
        payload = await self._fetch_json("/api/v1/bqas/latest-metrics")
        if payload is not self._UNAVAILABLE:
            return payload

        # Demo data
        return {
            "golden": {
                "total_tests": 97,
                "passed_tests": 89,
                "failed_tests": 8,
                "avg_composite_score": 4.15,
                "last_run": datetime.now().isoformat(),
            },
            "rag": {
                "total_tests": 20,
                "passed_tests": 18,
                "failed_tests": 2,
                "avg_composite_score": 4.25,
                "last_run": datetime.now().isoformat(),
            },
            "synthetic": None,
        }

    async def get_trend(self, days: int = 30) -> Optional[Dict]:
        """
        Fetch trend data.

        Args:
            days: Number of days, sent as the ``days`` query parameter.

        Returns:
            The decoded response body on HTTP 200; otherwise demo trend
            data.
        """
        payload = await self._fetch_json("/api/v1/bqas/trend", {"days": days})
        if payload is not self._UNAVAILABLE:
            return payload

        # Demo data
        return {
            "dates": ["2026-01-02", "2026-01-09", "2026-01-16", "2026-01-23", "2026-01-30"],
            "scores": [3.9, 4.0, 4.1, 4.15, 4.15],
            "trend": "improving",
        }

    async def get_runs(self, limit: int = 20) -> list:
        """
        Fetch the most recent test runs.

        Args:
            limit: Maximum number of runs, sent as the ``limit`` query
                parameter.

        Returns:
            The ``runs`` entry of the response body (empty list when the
            key is missing); demo runs when the service is unavailable or
            the body is not a JSON object.
        """
        payload = await self._fetch_json("/api/v1/bqas/runs", {"limit": limit})
        if isinstance(payload, dict):
            return payload.get("runs", [])

        # Demo data
        return [
            {
                "id": 1,
                "timestamp": "2026-01-30T07:00:00Z",
                "git_commit": "abc1234",
                "golden_score": 4.15,
                "total_tests": 97,
                "passed_tests": 89,
                "failed_tests": 8,
                "duration_seconds": 45.2,
            },
            {
                "id": 2,
                "timestamp": "2026-01-29T07:00:00Z",
                "git_commit": "def5678",
                "golden_score": 4.12,
                "total_tests": 97,
                "passed_tests": 88,
                "failed_tests": 9,
                "duration_seconds": 44.8,
            },
        ]
|