[split-required] Split remaining 500-680 LOC files (final batch)

Numbers read as original LOC → resulting file count; the split pattern is sketched after the lists.

website (17 pages + 3 components):
- multiplayer/wizard, middleware/wizard+test-wizard, communication
- builds/wizard, staff-search, voice, sbom/wizard
- foerderantrag, mail/tasks, tools/communication, sbom
- compliance/evidence, uni-crawler, brandbook (already done)
- CollectionsTab, IngestionTab, RiskHeatmap

backend-lehrer (5 files):
- letters_api (641 → 2), certificates_api (636 → 2)
- alerts_agent/db/models (636 → 3)
- llm_gateway/communication_service (614 → 2)
- game/database (already done in a prior batch)

klausur-service (2 files):
- hybrid_vocab_extractor (664 → 2)
- frontend: api.ts (620 → 3), EHUploadWizard (591 → 2)

voice-service (3 files):
- bqas/rag_judge (618 → 3), runner (529 → 2)
- enhanced_task_orchestrator (519 → 2)

studio-v2 (6 files):
- korrektur/[klausurId] (578 → 4), fairness (569 → 2)
- AlertsWizard (552 → 2), OnboardingWizard (513 → 2)
- korrektur/api.ts (506 → 3), geo-lernwelt (501 → 2)
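
Every split in this batch follows the same mechanical pattern, illustrated by the runner.py diff below: stateless helpers move into a sibling module, and the original file re-imports them so callers and tests see no API change. A minimal sketch with bodies elided — the module and function names are taken from the diff, everything else is illustrative:

    # runner_golden.py -- extracted, stateless helpers
    from typing import Any

    async def load_golden_tests() -> list[dict[str, Any]]:
        ...  # read the YAML suites from tests/bqas/golden_tests/

    def create_error_result(test_case: dict[str, Any], error: str):
        ...  # build a failed TestResult for a test that crashed

    # runner.py -- keeps BQASRunner and its singleton, imports the helpers back
    from bqas.runner_golden import load_golden_tests, create_error_result

    class BQASRunner:
        async def run_golden_suite(self):
            test_cases = await load_golden_tests()  # was: self._load_golden_tests()
            ...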

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
commit 451365a312 (parent b4613e26f3)
Author: Benjamin Admin
Date: 2026-04-25 08:56:45 +02:00
115 changed files with 10694 additions and 13839 deletions

bqas/runner.py

@@ -1,11 +1,12 @@
"""
BQAS Test Runner - Executes Golden, RAG, and Synthetic test suites
Split into:
- runner_golden.py: Test loading, simulation helpers, error result creation
- runner.py (this file): BQASRunner class, singleton
"""
import yaml
import asyncio
import structlog
import httpx
from pathlib import Path
from typing import List, Dict, Any, Optional
from datetime import datetime
from dataclasses import dataclass, field
@@ -15,6 +16,13 @@ from bqas.judge import LLMJudge
from bqas.rag_judge import RAGJudge
from bqas.metrics import TestResult, BQASMetrics
from bqas.synthetic_generator import SyntheticGenerator
from bqas.runner_golden import (
load_golden_tests,
load_rag_tests,
simulate_response,
create_error_result,
simulate_rag_response,
)
logger = structlog.get_logger(__name__)
@@ -61,87 +69,42 @@ class BQASRunner:
# ================================
async def run_golden_suite(self, git_commit: Optional[str] = None) -> TestRun:
"""
Run the golden test suite.
Loads test cases from YAML files and evaluates each one.
"""
"""Run the golden test suite."""
logger.info("Starting Golden Suite run")
start_time = datetime.utcnow()
# Load all golden test cases
test_cases = await self._load_golden_tests()
test_cases = await load_golden_tests()
logger.info(f"Loaded {len(test_cases)} golden test cases")
# Run all tests
results = []
for i, test_case in enumerate(test_cases):
try:
result = await self._run_golden_test(test_case)
results.append(result)
if (i + 1) % 10 == 0:
logger.info(f"Progress: {i + 1}/{len(test_cases)} tests completed")
except Exception as e:
logger.error(f"Test {test_case.get('id')} failed with error", error=str(e))
# Create a failed result
results.append(self._create_error_result(test_case, str(e)))
results.append(create_error_result(test_case, str(e)))
# Calculate metrics
metrics = BQASMetrics.from_results(results)
duration = (datetime.utcnow() - start_time).total_seconds()
# Record run
self._run_counter += 1
run = TestRun(
id=self._run_counter,
suite="golden",
timestamp=start_time,
git_commit=git_commit,
metrics=metrics,
results=results,
id=self._run_counter, suite="golden", timestamp=start_time,
git_commit=git_commit, metrics=metrics, results=results,
duration_seconds=duration,
)
self._test_runs.insert(0, run)
logger.info(
"Golden Suite completed",
total=metrics.total_tests,
passed=metrics.passed_tests,
failed=metrics.failed_tests,
score=metrics.avg_composite_score,
duration=f"{duration:.1f}s",
"Golden Suite completed", total=metrics.total_tests,
passed=metrics.passed_tests, failed=metrics.failed_tests,
score=metrics.avg_composite_score, duration=f"{duration:.1f}s",
)
return run
async def _load_golden_tests(self) -> List[Dict[str, Any]]:
"""Load all golden test cases from YAML files."""
tests = []
golden_dir = Path(__file__).parent.parent / "tests" / "bqas" / "golden_tests"
yaml_files = [
"intent_tests.yaml",
"edge_cases.yaml",
"workflow_tests.yaml",
]
for filename in yaml_files:
filepath = golden_dir / filename
if filepath.exists():
try:
with open(filepath, 'r', encoding='utf-8') as f:
data = yaml.safe_load(f)
if data and 'tests' in data:
for test in data['tests']:
test['source_file'] = filename
tests.extend(data['tests'])
except Exception as e:
logger.warning(f"Failed to load {filename}", error=str(e))
return tests
async def _run_golden_test(self, test_case: Dict[str, Any]) -> TestResult:
"""Run a single golden test case."""
test_id = test_case.get('id', 'UNKNOWN')
@@ -150,38 +113,19 @@ class BQASRunner:
expected_intent = test_case.get('expected_intent', '')
min_score = test_case.get('min_score', self.config.min_golden_score)
# Get response from voice service (or simulate)
detected_intent, response = await self._get_voice_response(user_input, expected_intent)
# Evaluate with judge
result = await self.judge.evaluate_test_case(
test_id=test_id,
test_name=test_name,
user_input=user_input,
expected_intent=expected_intent,
detected_intent=detected_intent,
response=response,
min_score=min_score,
test_id=test_id, test_name=test_name, user_input=user_input,
expected_intent=expected_intent, detected_intent=detected_intent,
response=response, min_score=min_score,
)
return result
async def _get_voice_response(
self,
user_input: str,
expected_intent: str
) -> tuple[str, str]:
"""
Get response from voice service.
For now, simulates responses since the full voice pipeline
might not be available. In production, this would call the
actual voice service endpoints.
"""
async def _get_voice_response(self, user_input: str, expected_intent: str) -> tuple[str, str]:
"""Get response from voice service."""
try:
client = await self._get_client()
# Try to call the voice service intent detection
response = await client.post(
f"{self.config.voice_service_url}/api/v1/tasks",
json={
@@ -191,231 +135,71 @@ class BQASRunner:
},
timeout=10.0,
)
if response.status_code == 200:
data = response.json()
return data.get('detected_intent', expected_intent), data.get('response', f"Verarbeite: {user_input}")
except Exception as e:
logger.debug(f"Voice service call failed, using simulation", error=str(e))
# Simulate response based on expected intent
return self._simulate_response(user_input, expected_intent)
def _simulate_response(self, user_input: str, expected_intent: str) -> tuple[str, str]:
"""Simulate voice service response for testing without live service."""
# Simulate realistic detected intent (90% correct for golden tests)
import random
if random.random() < 0.90:
detected_intent = expected_intent
else:
# Simulate occasional misclassification
intents = ["student_observation", "reminder", "worksheet_generate", "parent_letter", "smalltalk"]
detected_intent = random.choice([i for i in intents if i != expected_intent])
# Generate simulated response
responses = {
"student_observation": f"Notiz wurde gespeichert: {user_input}",
"reminder": f"Erinnerung erstellt: {user_input}",
"worksheet_generate": f"Arbeitsblatt wird generiert basierend auf: {user_input}",
"homework_check": f"Hausaufgabenkontrolle eingetragen: {user_input}",
"parent_letter": f"Elternbrief-Entwurf erstellt: {user_input}",
"class_message": f"Nachricht an Klasse vorbereitet: {user_input}",
"quiz_generate": f"Quiz wird erstellt: {user_input}",
"quick_activity": f"Einstiegsaktivitaet geplant: {user_input}",
"canvas_edit": f"Aenderung am Canvas wird ausgefuehrt: {user_input}",
"canvas_layout": f"Layout wird angepasst: {user_input}",
"operator_checklist": f"Operatoren-Checkliste geladen: {user_input}",
"eh_passage": f"EH-Passage gefunden: {user_input}",
"feedback_suggest": f"Feedback-Vorschlag: {user_input}",
"reminder_schedule": f"Erinnerung geplant: {user_input}",
"task_summary": f"Aufgabenuebersicht: {user_input}",
"conference_topic": f"Konferenzthema notiert: {user_input}",
"correction_note": f"Korrekturnotiz gespeichert: {user_input}",
"worksheet_differentiate": f"Differenzierung wird erstellt: {user_input}",
}
response = responses.get(detected_intent, f"Verstanden: {user_input}")
return detected_intent, response
def _create_error_result(self, test_case: Dict[str, Any], error: str) -> TestResult:
"""Create a failed test result due to error."""
return TestResult(
test_id=test_case.get('id', 'UNKNOWN'),
test_name=test_case.get('name', 'Error'),
user_input=test_case.get('input', ''),
expected_intent=test_case.get('expected_intent', ''),
detected_intent='error',
response='',
intent_accuracy=0,
faithfulness=1,
relevance=1,
coherence=1,
safety='fail',
composite_score=0.0,
passed=False,
reasoning=f"Test execution error: {error}",
timestamp=datetime.utcnow(),
duration_ms=0,
)
return simulate_response(user_input, expected_intent)
# ================================
# RAG Suite Runner
# ================================
async def run_rag_suite(self, git_commit: Optional[str] = None) -> TestRun:
"""
Run the RAG/Correction test suite.
Tests EH retrieval, operator alignment, hallucination control, etc.
"""
"""Run the RAG/Correction test suite."""
logger.info("Starting RAG Suite run")
start_time = datetime.utcnow()
# Load RAG test cases
test_cases = await self._load_rag_tests()
test_cases = await load_rag_tests()
logger.info(f"Loaded {len(test_cases)} RAG test cases")
# Run all tests
results = []
for i, test_case in enumerate(test_cases):
try:
result = await self._run_rag_test(test_case)
service_response = await simulate_rag_response(test_case)
result = await self.rag_judge.evaluate_rag_test_case(
test_case=test_case, service_response=service_response,
)
results.append(result)
if (i + 1) % 5 == 0:
logger.info(f"Progress: {i + 1}/{len(test_cases)} RAG tests completed")
except Exception as e:
logger.error(f"RAG test {test_case.get('id')} failed", error=str(e))
results.append(self._create_error_result(test_case, str(e)))
results.append(create_error_result(test_case, str(e)))
# Calculate metrics
metrics = BQASMetrics.from_results(results)
duration = (datetime.utcnow() - start_time).total_seconds()
# Record run
self._run_counter += 1
run = TestRun(
id=self._run_counter,
suite="rag",
timestamp=start_time,
git_commit=git_commit,
metrics=metrics,
results=results,
id=self._run_counter, suite="rag", timestamp=start_time,
git_commit=git_commit, metrics=metrics, results=results,
duration_seconds=duration,
)
self._test_runs.insert(0, run)
logger.info(
"RAG Suite completed",
total=metrics.total_tests,
passed=metrics.passed_tests,
score=metrics.avg_composite_score,
"RAG Suite completed", total=metrics.total_tests,
passed=metrics.passed_tests, score=metrics.avg_composite_score,
duration=f"{duration:.1f}s",
)
return run
async def _load_rag_tests(self) -> List[Dict[str, Any]]:
"""Load RAG test cases from YAML."""
tests = []
rag_file = Path(__file__).parent.parent / "tests" / "bqas" / "golden_tests" / "golden_rag_correction_v1.yaml"
if rag_file.exists():
try:
with open(rag_file, 'r', encoding='utf-8') as f:
# Handle YAML documents separated by ---
documents = list(yaml.safe_load_all(f))
for doc in documents:
if doc and 'tests' in doc:
tests.extend(doc['tests'])
if doc and 'edge_cases' in doc:
tests.extend(doc['edge_cases'])
except Exception as e:
logger.warning(f"Failed to load RAG tests", error=str(e))
return tests
async def _run_rag_test(self, test_case: Dict[str, Any]) -> TestResult:
"""Run a single RAG test case."""
# Simulate service response for RAG tests
service_response = await self._simulate_rag_response(test_case)
# Evaluate with RAG judge
result = await self.rag_judge.evaluate_rag_test_case(
test_case=test_case,
service_response=service_response,
)
return result
async def _simulate_rag_response(self, test_case: Dict[str, Any]) -> Dict[str, Any]:
"""Simulate RAG service response."""
category = test_case.get('category', '')
input_data = test_case.get('input', {})
expected = test_case.get('expected', {})
# Simulate responses based on category
if category == 'eh_retrieval':
concepts = expected.get('must_contain_concepts', [])
passage = f"Der Erwartungshorizont sieht folgende Aspekte vor: {', '.join(concepts[:3])}. "
passage += "Diese muessen im Rahmen der Aufgabenbearbeitung beruecksichtigt werden."
return {
"passage": passage,
"source": "EH_Deutsch_Abitur_2024_NI.pdf",
"relevance_score": 0.85,
}
elif category == 'operator_alignment':
operator = input_data.get('operator', '')
afb = expected.get('afb_level', 'II')
actions = expected.get('expected_actions', [])
return {
"operator": operator,
"definition": f"'{operator}' gehoert zu Anforderungsbereich {afb}. Erwartete Handlungen: {', '.join(actions[:2])}.",
"afb_level": afb,
}
elif category == 'hallucination_control':
return {
"response": "Basierend auf den verfuegbaren Informationen kann ich folgendes feststellen...",
"grounded": True,
}
elif category == 'privacy_compliance':
return {
"response": "Die Arbeit zeigt folgende Merkmale... [anonymisiert]",
"contains_pii": False,
}
elif category == 'namespace_isolation':
return {
"response": "Zugriff nur auf Daten im eigenen Namespace.",
"namespace_violation": False,
}
return {"response": "Simulated response", "success": True}
# ================================
# Synthetic Suite Runner
# ================================
async def run_synthetic_suite(self, git_commit: Optional[str] = None) -> TestRun:
"""
Run the synthetic test suite.
Generates test variations using LLM and evaluates them.
"""
"""Run the synthetic test suite."""
logger.info("Starting Synthetic Suite run")
start_time = datetime.utcnow()
# Generate synthetic tests
all_variations = await self.synthetic_generator.generate_all_intents(
count_per_intent=self.config.synthetic_count_per_intent
)
# Flatten variations
test_cases = []
for intent, variations in all_variations.items():
for i, v in enumerate(variations):
@@ -431,45 +215,33 @@ class BQASRunner:
logger.info(f"Generated {len(test_cases)} synthetic test cases")
# Run all tests
results = []
for i, test_case in enumerate(test_cases):
try:
result = await self._run_golden_test(test_case) # Same logic as golden
result = await self._run_golden_test(test_case)
results.append(result)
if (i + 1) % 20 == 0:
logger.info(f"Progress: {i + 1}/{len(test_cases)} synthetic tests completed")
except Exception as e:
logger.error(f"Synthetic test {test_case.get('id')} failed", error=str(e))
results.append(self._create_error_result(test_case, str(e)))
results.append(create_error_result(test_case, str(e)))
# Calculate metrics
metrics = BQASMetrics.from_results(results)
duration = (datetime.utcnow() - start_time).total_seconds()
# Record run
self._run_counter += 1
run = TestRun(
id=self._run_counter,
suite="synthetic",
timestamp=start_time,
git_commit=git_commit,
metrics=metrics,
results=results,
id=self._run_counter, suite="synthetic", timestamp=start_time,
git_commit=git_commit, metrics=metrics, results=results,
duration_seconds=duration,
)
self._test_runs.insert(0, run)
logger.info(
"Synthetic Suite completed",
total=metrics.total_tests,
passed=metrics.passed_tests,
score=metrics.avg_composite_score,
"Synthetic Suite completed", total=metrics.total_tests,
passed=metrics.passed_tests, score=metrics.avg_composite_score,
duration=f"{duration:.1f}s",
)
return run
# ================================
@@ -483,20 +255,17 @@ class BQASRunner:
def get_latest_metrics(self) -> Dict[str, Optional[BQASMetrics]]:
"""Get latest metrics for each suite."""
result = {"golden": None, "rag": None, "synthetic": None}
for run in self._test_runs:
if result[run.suite] is None:
result[run.suite] = run.metrics
if all(v is not None for v in result.values()):
break
return result
async def health_check(self) -> Dict[str, Any]:
"""Check health of BQAS components."""
judge_ok = await self.judge.health_check()
rag_judge_ok = await self.rag_judge.health_check()
return {
"judge_available": judge_ok,
"rag_judge_available": rag_judge_ok,