Files
breakpilot-lehrer/voice-service/bqas/runner.py
Benjamin Admin 451365a312 [split-required] Split remaining 500-680 LOC files (final batch)
website (17 pages + 3 components):
- multiplayer/wizard, middleware/wizard+test-wizard, communication
- builds/wizard, staff-search, voice, sbom/wizard
- foerderantrag, mail/tasks, tools/communication, sbom
- compliance/evidence, uni-crawler, brandbook (already done)
- CollectionsTab, IngestionTab, RiskHeatmap

backend-lehrer (5 files):
- letters_api (641 → 2), certificates_api (636 → 2)
- alerts_agent/db/models (636 → 3)
- llm_gateway/communication_service (614 → 2)
- game/database already done in prior batch

klausur-service (2 files):
- hybrid_vocab_extractor (664 → 2)
- klausur-service/frontend: api.ts (620 → 3), EHUploadWizard (591 → 2)

voice-service (3 files):
- bqas/rag_judge (618 → 3), runner (529 → 2)
- enhanced_task_orchestrator (519 → 2)

studio-v2 (6 files):
- korrektur/[klausurId] (578 → 4), fairness (569 → 2)
- AlertsWizard (552 → 2), OnboardingWizard (513 → 2)
- korrektur/api.ts (506 → 3), geo-lernwelt (501 → 2)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-25 08:56:45 +02:00

299 lines
11 KiB
Python

"""
BQAS Test Runner - Executes Golden, RAG, and Synthetic test suites
Split into:
- runner_golden.py: Test loading, simulation helpers, error result creation
- runner.py (this file): BQASRunner class, singleton
"""
import structlog
import httpx
from typing import List, Dict, Any, Optional
from datetime import datetime
from dataclasses import dataclass, field
from bqas.config import BQASConfig
from bqas.judge import LLMJudge
from bqas.rag_judge import RAGJudge
from bqas.metrics import TestResult, BQASMetrics
from bqas.synthetic_generator import SyntheticGenerator
from bqas.runner_golden import (
load_golden_tests,
load_rag_tests,
simulate_response,
create_error_result,
simulate_rag_response,
)
# Module-level structured logger named after this module.
logger = structlog.get_logger(__name__)
@dataclass
class TestRun:
    """Record of a complete test run."""
    # Monotonically increasing run id assigned by the runner instance.
    id: int
    suite: str  # golden, rag, synthetic
    # Wall-clock start of the run (naive UTC, produced via datetime.utcnow()).
    timestamp: datetime
    # Git commit the run was executed against, if known.
    git_commit: Optional[str]
    # Aggregated metrics computed from `results`.
    metrics: BQASMetrics
    # Per-test-case judge results in execution order.
    results: List[TestResult]
    # Total suite duration in seconds.
    duration_seconds: float
class BQASRunner:
    """
    Main test runner for BQAS test suites.

    Executes:
    - Golden Suite: Pre-defined golden test cases from YAML
    - RAG Suite: RAG/Correction quality tests
    - Synthetic Suite: LLM-generated test variations

    Completed runs are kept in memory, newest first, for the API layer.
    """

    def __init__(self, config: Optional[BQASConfig] = None):
        # Fall back to environment-derived config when none is injected.
        self.config = config or BQASConfig.from_env()
        self.judge = LLMJudge(self.config)
        self.rag_judge = RAGJudge(self.config)
        self.synthetic_generator = SyntheticGenerator(self.config)
        self._http_client: Optional[httpx.AsyncClient] = None
        self._test_runs: List[TestRun] = []  # newest run first
        self._run_counter = 0  # source of TestRun.id

    async def _get_client(self) -> httpx.AsyncClient:
        """Get or create the shared HTTP client for voice service calls."""
        if self._http_client is None:
            self._http_client = httpx.AsyncClient(timeout=30.0)
        return self._http_client

    def _record_run(
        self,
        suite: str,
        start_time: datetime,
        git_commit: Optional[str],
        results: List[TestResult],
    ) -> TestRun:
        """Aggregate results into a TestRun, store it newest-first, and return it.

        Shared tail of all three suite runners; the metrics aggregation,
        run-id assignment, and history bookkeeping were previously duplicated
        verbatim in each of them.
        """
        metrics = BQASMetrics.from_results(results)
        # NOTE(review): utcnow() is naive and deprecated since Python 3.12;
        # consider datetime.now(timezone.utc) once downstream consumers of
        # TestRun.timestamp are confirmed to handle tz-aware values.
        duration = (datetime.utcnow() - start_time).total_seconds()
        self._run_counter += 1
        run = TestRun(
            id=self._run_counter, suite=suite, timestamp=start_time,
            git_commit=git_commit, metrics=metrics, results=results,
            duration_seconds=duration,
        )
        self._test_runs.insert(0, run)
        return run

    # ================================
    # Golden Suite Runner
    # ================================
    async def run_golden_suite(self, git_commit: Optional[str] = None) -> TestRun:
        """Run the golden test suite and return the recorded TestRun."""
        logger.info("Starting Golden Suite run")
        start_time = datetime.utcnow()
        test_cases = await load_golden_tests()
        logger.info(f"Loaded {len(test_cases)} golden test cases")
        results = []
        for i, test_case in enumerate(test_cases):
            try:
                result = await self._run_golden_test(test_case)
                results.append(result)
                if (i + 1) % 10 == 0:
                    logger.info(f"Progress: {i + 1}/{len(test_cases)} tests completed")
            except Exception as e:
                # One failing test must not abort the suite; record an error result.
                logger.error(f"Test {test_case.get('id')} failed with error", error=str(e))
                results.append(create_error_result(test_case, str(e)))
        run = self._record_run("golden", start_time, git_commit, results)
        logger.info(
            "Golden Suite completed", total=run.metrics.total_tests,
            passed=run.metrics.passed_tests, failed=run.metrics.failed_tests,
            score=run.metrics.avg_composite_score,
            duration=f"{run.duration_seconds:.1f}s",
        )
        return run

    async def _run_golden_test(self, test_case: Dict[str, Any]) -> TestResult:
        """Run a single golden-style test case and judge the response.

        Also reused by the synthetic suite, which produces dicts of the
        same shape.
        """
        test_id = test_case.get('id', 'UNKNOWN')
        test_name = test_case.get('name', '')
        user_input = test_case.get('input', '')
        expected_intent = test_case.get('expected_intent', '')
        # Per-test threshold overrides the configured default.
        min_score = test_case.get('min_score', self.config.min_golden_score)
        detected_intent, response = await self._get_voice_response(user_input, expected_intent)
        return await self.judge.evaluate_test_case(
            test_id=test_id, test_name=test_name, user_input=user_input,
            expected_intent=expected_intent, detected_intent=detected_intent,
            response=response, min_score=min_score,
        )

    async def _get_voice_response(self, user_input: str, expected_intent: str) -> tuple:
        """Return (detected_intent, response) from the live voice service.

        Falls back to a local simulation when the service is unreachable or
        answers with a non-200 status, so suites can run without a live
        deployment.
        """
        try:
            client = await self._get_client()
            response = await client.post(
                f"{self.config.voice_service_url}/api/v1/tasks",
                json={
                    "type": "intent_detection",
                    "input": user_input,
                    "namespace_id": "test_namespace",
                },
                timeout=10.0,
            )
            if response.status_code == 200:
                data = response.json()
                return data.get('detected_intent', expected_intent), data.get('response', f"Verarbeite: {user_input}")
        except Exception as e:
            # Best-effort call: fall through to simulation below.
            logger.debug("Voice service call failed, using simulation", error=str(e))
        return simulate_response(user_input, expected_intent)

    # ================================
    # RAG Suite Runner
    # ================================
    async def run_rag_suite(self, git_commit: Optional[str] = None) -> TestRun:
        """Run the RAG/Correction test suite and return the recorded TestRun."""
        logger.info("Starting RAG Suite run")
        start_time = datetime.utcnow()
        test_cases = await load_rag_tests()
        logger.info(f"Loaded {len(test_cases)} RAG test cases")
        results = []
        for i, test_case in enumerate(test_cases):
            try:
                service_response = await simulate_rag_response(test_case)
                result = await self.rag_judge.evaluate_rag_test_case(
                    test_case=test_case, service_response=service_response,
                )
                results.append(result)
                if (i + 1) % 5 == 0:
                    logger.info(f"Progress: {i + 1}/{len(test_cases)} RAG tests completed")
            except Exception as e:
                logger.error(f"RAG test {test_case.get('id')} failed", error=str(e))
                results.append(create_error_result(test_case, str(e)))
        run = self._record_run("rag", start_time, git_commit, results)
        logger.info(
            "RAG Suite completed", total=run.metrics.total_tests,
            passed=run.metrics.passed_tests,
            score=run.metrics.avg_composite_score,
            duration=f"{run.duration_seconds:.1f}s",
        )
        return run

    # ================================
    # Synthetic Suite Runner
    # ================================
    async def run_synthetic_suite(self, git_commit: Optional[str] = None) -> TestRun:
        """Generate synthetic variations, run them, and return the recorded TestRun."""
        logger.info("Starting Synthetic Suite run")
        start_time = datetime.utcnow()
        all_variations = await self.synthetic_generator.generate_all_intents(
            count_per_intent=self.config.synthetic_count_per_intent
        )
        # Flatten generator output into golden-style test-case dicts so the
        # cases can be executed via _run_golden_test.
        test_cases = []
        for intent, variations in all_variations.items():
            for i, v in enumerate(variations):
                test_cases.append({
                    'id': f"SYN-{intent.upper()[:4]}-{i+1:03d}",
                    'name': f"Synthetic {intent} #{i+1}",
                    'input': v.input,
                    'expected_intent': v.expected_intent,
                    'slots': v.slots,
                    'source': v.source,
                    'min_score': self.config.min_synthetic_score,
                })
        logger.info(f"Generated {len(test_cases)} synthetic test cases")
        results = []
        for i, test_case in enumerate(test_cases):
            try:
                result = await self._run_golden_test(test_case)
                results.append(result)
                if (i + 1) % 20 == 0:
                    logger.info(f"Progress: {i + 1}/{len(test_cases)} synthetic tests completed")
            except Exception as e:
                logger.error(f"Synthetic test {test_case.get('id')} failed", error=str(e))
                results.append(create_error_result(test_case, str(e)))
        run = self._record_run("synthetic", start_time, git_commit, results)
        logger.info(
            "Synthetic Suite completed", total=run.metrics.total_tests,
            passed=run.metrics.passed_tests,
            score=run.metrics.avg_composite_score,
            duration=f"{run.duration_seconds:.1f}s",
        )
        return run

    # ================================
    # Utility Methods
    # ================================
    def get_test_runs(self, limit: int = 20) -> List[TestRun]:
        """Get the most recent test runs (history is stored newest-first)."""
        return self._test_runs[:limit]

    def get_latest_metrics(self) -> Dict[str, Optional[BQASMetrics]]:
        """Get the latest metrics per suite; None for suites never run."""
        result: Dict[str, Optional[BQASMetrics]] = {
            "golden": None, "rag": None, "synthetic": None,
        }
        # History is newest-first, so the first hit per suite is the latest.
        for run in self._test_runs:
            if result[run.suite] is None:
                result[run.suite] = run.metrics
            if all(v is not None for v in result.values()):
                break
        return result

    async def health_check(self) -> Dict[str, Any]:
        """Check health of BQAS components and report effective config."""
        judge_ok = await self.judge.health_check()
        rag_judge_ok = await self.rag_judge.health_check()
        return {
            "judge_available": judge_ok,
            "rag_judge_available": rag_judge_ok,
            "test_runs_count": len(self._test_runs),
            "config": {
                "ollama_url": self.config.ollama_base_url,
                "judge_model": self.config.judge_model,
            }
        }

    async def close(self):
        """Cleanup resources (judges, generator, and the HTTP client)."""
        await self.judge.close()
        await self.rag_judge.close()
        await self.synthetic_generator.close()
        if self._http_client:
            await self._http_client.aclose()
            self._http_client = None
# Process-wide singleton used by the API layer.
_runner_instance: Optional[BQASRunner] = None


def get_runner() -> BQASRunner:
    """Return the global BQASRunner, creating it lazily on first use."""
    global _runner_instance
    if _runner_instance is not None:
        return _runner_instance
    _runner_instance = BQASRunner()
    return _runner_instance