website (17 pages + 3 components): - multiplayer/wizard, middleware/wizard+test-wizard, communication - builds/wizard, staff-search, voice, sbom/wizard - foerderantrag, mail/tasks, tools/communication, sbom - compliance/evidence, uni-crawler, brandbook (already done) - CollectionsTab, IngestionTab, RiskHeatmap backend-lehrer (5 files): - letters_api (641 → 2), certificates_api (636 → 2) - alerts_agent/db/models (636 → 3) - llm_gateway/communication_service (614 → 2) - game/database already done in prior batch klausur-service (2 files): - hybrid_vocab_extractor (664 → 2) - klausur-service/frontend: api.ts (620 → 3), EHUploadWizard (591 → 2) voice-service (3 files): - bqas/rag_judge (618 → 3), runner (529 → 2) - enhanced_task_orchestrator (519 → 2) studio-v2 (6 files): - korrektur/[klausurId] (578 → 4), fairness (569 → 2) - AlertsWizard (552 → 2), OnboardingWizard (513 → 2) - korrektur/api.ts (506 → 3), geo-lernwelt (501 → 2) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
299 lines
11 KiB
Python
299 lines
11 KiB
Python
"""
|
|
BQAS Test Runner - Executes Golden, RAG, and Synthetic test suites
|
|
|
|
Split into:
|
|
- runner_golden.py: Test loading, simulation helpers, error result creation
|
|
- runner.py (this file): BQASRunner class, singleton
|
|
"""
|
|
import structlog
|
|
import httpx
|
|
from typing import List, Dict, Any, Optional
|
|
from datetime import datetime
|
|
from dataclasses import dataclass, field
|
|
|
|
from bqas.config import BQASConfig
|
|
from bqas.judge import LLMJudge
|
|
from bqas.rag_judge import RAGJudge
|
|
from bqas.metrics import TestResult, BQASMetrics
|
|
from bqas.synthetic_generator import SyntheticGenerator
|
|
from bqas.runner_golden import (
|
|
load_golden_tests,
|
|
load_rag_tests,
|
|
simulate_response,
|
|
create_error_result,
|
|
simulate_rag_response,
|
|
)
|
|
|
|
logger = structlog.get_logger(__name__)
|
|
|
|
|
|
@dataclass
|
|
class TestRun:
|
|
"""Record of a complete test run."""
|
|
id: int
|
|
suite: str # golden, rag, synthetic
|
|
timestamp: datetime
|
|
git_commit: Optional[str]
|
|
metrics: BQASMetrics
|
|
results: List[TestResult]
|
|
duration_seconds: float
|
|
|
|
|
|
class BQASRunner:
|
|
"""
|
|
Main test runner for BQAS test suites.
|
|
|
|
Executes:
|
|
- Golden Suite: Pre-defined golden test cases from YAML
|
|
- RAG Suite: RAG/Correction quality tests
|
|
- Synthetic Suite: LLM-generated test variations
|
|
"""
|
|
|
|
def __init__(self, config: Optional[BQASConfig] = None):
|
|
self.config = config or BQASConfig.from_env()
|
|
self.judge = LLMJudge(self.config)
|
|
self.rag_judge = RAGJudge(self.config)
|
|
self.synthetic_generator = SyntheticGenerator(self.config)
|
|
self._http_client: Optional[httpx.AsyncClient] = None
|
|
self._test_runs: List[TestRun] = []
|
|
self._run_counter = 0
|
|
|
|
async def _get_client(self) -> httpx.AsyncClient:
|
|
"""Get or create HTTP client for voice service calls."""
|
|
if self._http_client is None:
|
|
self._http_client = httpx.AsyncClient(timeout=30.0)
|
|
return self._http_client
|
|
|
|
# ================================
|
|
# Golden Suite Runner
|
|
# ================================
|
|
|
|
async def run_golden_suite(self, git_commit: Optional[str] = None) -> TestRun:
|
|
"""Run the golden test suite."""
|
|
logger.info("Starting Golden Suite run")
|
|
start_time = datetime.utcnow()
|
|
|
|
test_cases = await load_golden_tests()
|
|
logger.info(f"Loaded {len(test_cases)} golden test cases")
|
|
|
|
results = []
|
|
for i, test_case in enumerate(test_cases):
|
|
try:
|
|
result = await self._run_golden_test(test_case)
|
|
results.append(result)
|
|
if (i + 1) % 10 == 0:
|
|
logger.info(f"Progress: {i + 1}/{len(test_cases)} tests completed")
|
|
except Exception as e:
|
|
logger.error(f"Test {test_case.get('id')} failed with error", error=str(e))
|
|
results.append(create_error_result(test_case, str(e)))
|
|
|
|
metrics = BQASMetrics.from_results(results)
|
|
duration = (datetime.utcnow() - start_time).total_seconds()
|
|
|
|
self._run_counter += 1
|
|
run = TestRun(
|
|
id=self._run_counter, suite="golden", timestamp=start_time,
|
|
git_commit=git_commit, metrics=metrics, results=results,
|
|
duration_seconds=duration,
|
|
)
|
|
self._test_runs.insert(0, run)
|
|
|
|
logger.info(
|
|
"Golden Suite completed", total=metrics.total_tests,
|
|
passed=metrics.passed_tests, failed=metrics.failed_tests,
|
|
score=metrics.avg_composite_score, duration=f"{duration:.1f}s",
|
|
)
|
|
return run
|
|
|
|
async def _run_golden_test(self, test_case: Dict[str, Any]) -> TestResult:
|
|
"""Run a single golden test case."""
|
|
test_id = test_case.get('id', 'UNKNOWN')
|
|
test_name = test_case.get('name', '')
|
|
user_input = test_case.get('input', '')
|
|
expected_intent = test_case.get('expected_intent', '')
|
|
min_score = test_case.get('min_score', self.config.min_golden_score)
|
|
|
|
detected_intent, response = await self._get_voice_response(user_input, expected_intent)
|
|
|
|
result = await self.judge.evaluate_test_case(
|
|
test_id=test_id, test_name=test_name, user_input=user_input,
|
|
expected_intent=expected_intent, detected_intent=detected_intent,
|
|
response=response, min_score=min_score,
|
|
)
|
|
return result
|
|
|
|
async def _get_voice_response(self, user_input: str, expected_intent: str) -> tuple:
|
|
"""Get response from voice service."""
|
|
try:
|
|
client = await self._get_client()
|
|
response = await client.post(
|
|
f"{self.config.voice_service_url}/api/v1/tasks",
|
|
json={
|
|
"type": "intent_detection",
|
|
"input": user_input,
|
|
"namespace_id": "test_namespace",
|
|
},
|
|
timeout=10.0,
|
|
)
|
|
if response.status_code == 200:
|
|
data = response.json()
|
|
return data.get('detected_intent', expected_intent), data.get('response', f"Verarbeite: {user_input}")
|
|
except Exception as e:
|
|
logger.debug(f"Voice service call failed, using simulation", error=str(e))
|
|
|
|
return simulate_response(user_input, expected_intent)
|
|
|
|
# ================================
|
|
# RAG Suite Runner
|
|
# ================================
|
|
|
|
async def run_rag_suite(self, git_commit: Optional[str] = None) -> TestRun:
|
|
"""Run the RAG/Correction test suite."""
|
|
logger.info("Starting RAG Suite run")
|
|
start_time = datetime.utcnow()
|
|
|
|
test_cases = await load_rag_tests()
|
|
logger.info(f"Loaded {len(test_cases)} RAG test cases")
|
|
|
|
results = []
|
|
for i, test_case in enumerate(test_cases):
|
|
try:
|
|
service_response = await simulate_rag_response(test_case)
|
|
result = await self.rag_judge.evaluate_rag_test_case(
|
|
test_case=test_case, service_response=service_response,
|
|
)
|
|
results.append(result)
|
|
if (i + 1) % 5 == 0:
|
|
logger.info(f"Progress: {i + 1}/{len(test_cases)} RAG tests completed")
|
|
except Exception as e:
|
|
logger.error(f"RAG test {test_case.get('id')} failed", error=str(e))
|
|
results.append(create_error_result(test_case, str(e)))
|
|
|
|
metrics = BQASMetrics.from_results(results)
|
|
duration = (datetime.utcnow() - start_time).total_seconds()
|
|
|
|
self._run_counter += 1
|
|
run = TestRun(
|
|
id=self._run_counter, suite="rag", timestamp=start_time,
|
|
git_commit=git_commit, metrics=metrics, results=results,
|
|
duration_seconds=duration,
|
|
)
|
|
self._test_runs.insert(0, run)
|
|
|
|
logger.info(
|
|
"RAG Suite completed", total=metrics.total_tests,
|
|
passed=metrics.passed_tests, score=metrics.avg_composite_score,
|
|
duration=f"{duration:.1f}s",
|
|
)
|
|
return run
|
|
|
|
# ================================
|
|
# Synthetic Suite Runner
|
|
# ================================
|
|
|
|
async def run_synthetic_suite(self, git_commit: Optional[str] = None) -> TestRun:
|
|
"""Run the synthetic test suite."""
|
|
logger.info("Starting Synthetic Suite run")
|
|
start_time = datetime.utcnow()
|
|
|
|
all_variations = await self.synthetic_generator.generate_all_intents(
|
|
count_per_intent=self.config.synthetic_count_per_intent
|
|
)
|
|
|
|
test_cases = []
|
|
for intent, variations in all_variations.items():
|
|
for i, v in enumerate(variations):
|
|
test_cases.append({
|
|
'id': f"SYN-{intent.upper()[:4]}-{i+1:03d}",
|
|
'name': f"Synthetic {intent} #{i+1}",
|
|
'input': v.input,
|
|
'expected_intent': v.expected_intent,
|
|
'slots': v.slots,
|
|
'source': v.source,
|
|
'min_score': self.config.min_synthetic_score,
|
|
})
|
|
|
|
logger.info(f"Generated {len(test_cases)} synthetic test cases")
|
|
|
|
results = []
|
|
for i, test_case in enumerate(test_cases):
|
|
try:
|
|
result = await self._run_golden_test(test_case)
|
|
results.append(result)
|
|
if (i + 1) % 20 == 0:
|
|
logger.info(f"Progress: {i + 1}/{len(test_cases)} synthetic tests completed")
|
|
except Exception as e:
|
|
logger.error(f"Synthetic test {test_case.get('id')} failed", error=str(e))
|
|
results.append(create_error_result(test_case, str(e)))
|
|
|
|
metrics = BQASMetrics.from_results(results)
|
|
duration = (datetime.utcnow() - start_time).total_seconds()
|
|
|
|
self._run_counter += 1
|
|
run = TestRun(
|
|
id=self._run_counter, suite="synthetic", timestamp=start_time,
|
|
git_commit=git_commit, metrics=metrics, results=results,
|
|
duration_seconds=duration,
|
|
)
|
|
self._test_runs.insert(0, run)
|
|
|
|
logger.info(
|
|
"Synthetic Suite completed", total=metrics.total_tests,
|
|
passed=metrics.passed_tests, score=metrics.avg_composite_score,
|
|
duration=f"{duration:.1f}s",
|
|
)
|
|
return run
|
|
|
|
# ================================
|
|
# Utility Methods
|
|
# ================================
|
|
|
|
def get_test_runs(self, limit: int = 20) -> List[TestRun]:
|
|
"""Get recent test runs."""
|
|
return self._test_runs[:limit]
|
|
|
|
def get_latest_metrics(self) -> Dict[str, Optional[BQASMetrics]]:
|
|
"""Get latest metrics for each suite."""
|
|
result = {"golden": None, "rag": None, "synthetic": None}
|
|
for run in self._test_runs:
|
|
if result[run.suite] is None:
|
|
result[run.suite] = run.metrics
|
|
if all(v is not None for v in result.values()):
|
|
break
|
|
return result
|
|
|
|
async def health_check(self) -> Dict[str, Any]:
|
|
"""Check health of BQAS components."""
|
|
judge_ok = await self.judge.health_check()
|
|
rag_judge_ok = await self.rag_judge.health_check()
|
|
return {
|
|
"judge_available": judge_ok,
|
|
"rag_judge_available": rag_judge_ok,
|
|
"test_runs_count": len(self._test_runs),
|
|
"config": {
|
|
"ollama_url": self.config.ollama_base_url,
|
|
"judge_model": self.config.judge_model,
|
|
}
|
|
}
|
|
|
|
async def close(self):
|
|
"""Cleanup resources."""
|
|
await self.judge.close()
|
|
await self.rag_judge.close()
|
|
await self.synthetic_generator.close()
|
|
if self._http_client:
|
|
await self._http_client.aclose()
|
|
self._http_client = None
|
|
|
|
|
|
# Singleton instance for the API
|
|
_runner_instance: Optional[BQASRunner] = None
|
|
|
|
|
|
def get_runner() -> BQASRunner:
|
|
"""Get or create the global BQASRunner instance."""
|
|
global _runner_instance
|
|
if _runner_instance is None:
|
|
_runner_instance = BQASRunner()
|
|
return _runner_instance
|