""" BQAS Test Runner - Executes Golden, RAG, and Synthetic test suites Split into: - runner_golden.py: Test loading, simulation helpers, error result creation - runner.py (this file): BQASRunner class, singleton """ import structlog import httpx from typing import List, Dict, Any, Optional from datetime import datetime from dataclasses import dataclass, field from bqas.config import BQASConfig from bqas.judge import LLMJudge from bqas.rag_judge import RAGJudge from bqas.metrics import TestResult, BQASMetrics from bqas.synthetic_generator import SyntheticGenerator from bqas.runner_golden import ( load_golden_tests, load_rag_tests, simulate_response, create_error_result, simulate_rag_response, ) logger = structlog.get_logger(__name__) @dataclass class TestRun: """Record of a complete test run.""" id: int suite: str # golden, rag, synthetic timestamp: datetime git_commit: Optional[str] metrics: BQASMetrics results: List[TestResult] duration_seconds: float class BQASRunner: """ Main test runner for BQAS test suites. Executes: - Golden Suite: Pre-defined golden test cases from YAML - RAG Suite: RAG/Correction quality tests - Synthetic Suite: LLM-generated test variations """ def __init__(self, config: Optional[BQASConfig] = None): self.config = config or BQASConfig.from_env() self.judge = LLMJudge(self.config) self.rag_judge = RAGJudge(self.config) self.synthetic_generator = SyntheticGenerator(self.config) self._http_client: Optional[httpx.AsyncClient] = None self._test_runs: List[TestRun] = [] self._run_counter = 0 async def _get_client(self) -> httpx.AsyncClient: """Get or create HTTP client for voice service calls.""" if self._http_client is None: self._http_client = httpx.AsyncClient(timeout=30.0) return self._http_client # ================================ # Golden Suite Runner # ================================ async def run_golden_suite(self, git_commit: Optional[str] = None) -> TestRun: """Run the golden test suite.""" logger.info("Starting Golden Suite run") start_time = datetime.utcnow() test_cases = await load_golden_tests() logger.info(f"Loaded {len(test_cases)} golden test cases") results = [] for i, test_case in enumerate(test_cases): try: result = await self._run_golden_test(test_case) results.append(result) if (i + 1) % 10 == 0: logger.info(f"Progress: {i + 1}/{len(test_cases)} tests completed") except Exception as e: logger.error(f"Test {test_case.get('id')} failed with error", error=str(e)) results.append(create_error_result(test_case, str(e))) metrics = BQASMetrics.from_results(results) duration = (datetime.utcnow() - start_time).total_seconds() self._run_counter += 1 run = TestRun( id=self._run_counter, suite="golden", timestamp=start_time, git_commit=git_commit, metrics=metrics, results=results, duration_seconds=duration, ) self._test_runs.insert(0, run) logger.info( "Golden Suite completed", total=metrics.total_tests, passed=metrics.passed_tests, failed=metrics.failed_tests, score=metrics.avg_composite_score, duration=f"{duration:.1f}s", ) return run async def _run_golden_test(self, test_case: Dict[str, Any]) -> TestResult: """Run a single golden test case.""" test_id = test_case.get('id', 'UNKNOWN') test_name = test_case.get('name', '') user_input = test_case.get('input', '') expected_intent = test_case.get('expected_intent', '') min_score = test_case.get('min_score', self.config.min_golden_score) detected_intent, response = await self._get_voice_response(user_input, expected_intent) result = await self.judge.evaluate_test_case( test_id=test_id, test_name=test_name, user_input=user_input, expected_intent=expected_intent, detected_intent=detected_intent, response=response, min_score=min_score, ) return result async def _get_voice_response(self, user_input: str, expected_intent: str) -> tuple: """Get response from voice service.""" try: client = await self._get_client() response = await client.post( f"{self.config.voice_service_url}/api/v1/tasks", json={ "type": "intent_detection", "input": user_input, "namespace_id": "test_namespace", }, timeout=10.0, ) if response.status_code == 200: data = response.json() return data.get('detected_intent', expected_intent), data.get('response', f"Verarbeite: {user_input}") except Exception as e: logger.debug(f"Voice service call failed, using simulation", error=str(e)) return simulate_response(user_input, expected_intent) # ================================ # RAG Suite Runner # ================================ async def run_rag_suite(self, git_commit: Optional[str] = None) -> TestRun: """Run the RAG/Correction test suite.""" logger.info("Starting RAG Suite run") start_time = datetime.utcnow() test_cases = await load_rag_tests() logger.info(f"Loaded {len(test_cases)} RAG test cases") results = [] for i, test_case in enumerate(test_cases): try: service_response = await simulate_rag_response(test_case) result = await self.rag_judge.evaluate_rag_test_case( test_case=test_case, service_response=service_response, ) results.append(result) if (i + 1) % 5 == 0: logger.info(f"Progress: {i + 1}/{len(test_cases)} RAG tests completed") except Exception as e: logger.error(f"RAG test {test_case.get('id')} failed", error=str(e)) results.append(create_error_result(test_case, str(e))) metrics = BQASMetrics.from_results(results) duration = (datetime.utcnow() - start_time).total_seconds() self._run_counter += 1 run = TestRun( id=self._run_counter, suite="rag", timestamp=start_time, git_commit=git_commit, metrics=metrics, results=results, duration_seconds=duration, ) self._test_runs.insert(0, run) logger.info( "RAG Suite completed", total=metrics.total_tests, passed=metrics.passed_tests, score=metrics.avg_composite_score, duration=f"{duration:.1f}s", ) return run # ================================ # Synthetic Suite Runner # ================================ async def run_synthetic_suite(self, git_commit: Optional[str] = None) -> TestRun: """Run the synthetic test suite.""" logger.info("Starting Synthetic Suite run") start_time = datetime.utcnow() all_variations = await self.synthetic_generator.generate_all_intents( count_per_intent=self.config.synthetic_count_per_intent ) test_cases = [] for intent, variations in all_variations.items(): for i, v in enumerate(variations): test_cases.append({ 'id': f"SYN-{intent.upper()[:4]}-{i+1:03d}", 'name': f"Synthetic {intent} #{i+1}", 'input': v.input, 'expected_intent': v.expected_intent, 'slots': v.slots, 'source': v.source, 'min_score': self.config.min_synthetic_score, }) logger.info(f"Generated {len(test_cases)} synthetic test cases") results = [] for i, test_case in enumerate(test_cases): try: result = await self._run_golden_test(test_case) results.append(result) if (i + 1) % 20 == 0: logger.info(f"Progress: {i + 1}/{len(test_cases)} synthetic tests completed") except Exception as e: logger.error(f"Synthetic test {test_case.get('id')} failed", error=str(e)) results.append(create_error_result(test_case, str(e))) metrics = BQASMetrics.from_results(results) duration = (datetime.utcnow() - start_time).total_seconds() self._run_counter += 1 run = TestRun( id=self._run_counter, suite="synthetic", timestamp=start_time, git_commit=git_commit, metrics=metrics, results=results, duration_seconds=duration, ) self._test_runs.insert(0, run) logger.info( "Synthetic Suite completed", total=metrics.total_tests, passed=metrics.passed_tests, score=metrics.avg_composite_score, duration=f"{duration:.1f}s", ) return run # ================================ # Utility Methods # ================================ def get_test_runs(self, limit: int = 20) -> List[TestRun]: """Get recent test runs.""" return self._test_runs[:limit] def get_latest_metrics(self) -> Dict[str, Optional[BQASMetrics]]: """Get latest metrics for each suite.""" result = {"golden": None, "rag": None, "synthetic": None} for run in self._test_runs: if result[run.suite] is None: result[run.suite] = run.metrics if all(v is not None for v in result.values()): break return result async def health_check(self) -> Dict[str, Any]: """Check health of BQAS components.""" judge_ok = await self.judge.health_check() rag_judge_ok = await self.rag_judge.health_check() return { "judge_available": judge_ok, "rag_judge_available": rag_judge_ok, "test_runs_count": len(self._test_runs), "config": { "ollama_url": self.config.ollama_base_url, "judge_model": self.config.judge_model, } } async def close(self): """Cleanup resources.""" await self.judge.close() await self.rag_judge.close() await self.synthetic_generator.close() if self._http_client: await self._http_client.aclose() self._http_client = None # Singleton instance for the API _runner_instance: Optional[BQASRunner] = None def get_runner() -> BQASRunner: """Get or create the global BQASRunner instance.""" global _runner_instance if _runner_instance is None: _runner_instance = BQASRunner() return _runner_instance