#!/usr/bin/env python3
"""
BQAS Runner Script

Run BQAS tests and generate reports.
"""

import asyncio
import argparse
import sys
import json
from pathlib import Path
from datetime import datetime

# Add parent to path so the local ``bqas`` package is importable when this
# file is executed directly as a script.
sys.path.insert(0, str(Path(__file__).parent.parent))

from bqas.judge import LLMJudge
from bqas.config import BQASConfig
from bqas.regression_tracker import RegressionTracker
from bqas.synthetic_generator import SyntheticGenerator
from bqas.backlog_generator import BacklogGenerator
from bqas.metrics import BQASMetrics, TestResult


async def run_golden_suite(config: BQASConfig, judge: LLMJudge) -> list:
    """Run the golden test suite.

    Loads every ``*.yaml`` file under ``tests/bqas/golden_tests``, evaluates
    each entry in its ``tests`` and ``edge_cases`` sections with the LLM
    judge, and prints a pass/fail line per test.

    Args:
        config: BQAS configuration (currently unused here; kept for a
            signature parallel with ``run_synthetic_tests``).
        judge: The LLM judge used to score each test case.

    Returns:
        List of judge result objects, one per evaluated test.
    """
    import yaml  # local import: only needed when the golden suite runs

    results = []
    golden_dir = Path(__file__).parent.parent / "tests" / "bqas" / "golden_tests"

    for yaml_file in golden_dir.glob("*.yaml"):
        print(f"\nšŸ“‹ Loading {yaml_file.name}...")
        with open(yaml_file) as f:
            data = yaml.safe_load(f)

        tests = data.get("tests", []) + data.get("edge_cases", [])
        for test in tests:
            test_id = test.get("id", "UNKNOWN")
            print(f" Testing {test_id}...", end=" ", flush=True)

            result = await judge.evaluate_test_case(
                test_id=test_id,
                test_name=test.get("name", ""),
                user_input=test.get("input", ""),
                expected_intent=test.get("expected_intent", "unknown"),
                detected_intent=test.get("expected_intent", "unknown"),  # Mock for now
                response="Verstanden.",
                min_score=test.get("min_score", 3.5),
            )
            results.append(result)

            if result.passed:
                print(f"āœ… {result.composite_score:.2f}")
            else:
                print(f"āŒ {result.composite_score:.2f} ({result.reasoning[:50]})")

    return results


async def run_synthetic_tests(
    config: BQASConfig,
    judge: LLMJudge,
    generator: SyntheticGenerator,
) -> list:
    """Run synthetic tests.

    Generates five input variations per hard-coded intent and evaluates
    each with the LLM judge, printing a pass/fail line per test.

    Args:
        config: BQAS configuration (unused here; kept for signature parity).
        judge: The LLM judge used to score each synthetic case.
        generator: Source of synthetic input variations.

    Returns:
        List of judge result objects, one per synthetic test.
    """
    results = []
    print("\nšŸ”„ Generating synthetic tests...")

    intents = ["student_observation", "worksheet_generate", "reminder"]
    for intent in intents:
        print(f"\n Intent: {intent}")
        # NOTE(review): calls the generator's private fallback path directly;
        # confirm whether a public generation method should be used instead.
        variations = generator._generate_fallback(intent, count=5)

        for i, var in enumerate(variations):
            # e.g. "SYN-STUD-001" for the first student_observation variation.
            test_id = f"SYN-{intent[:4].upper()}-{i+1:03d}"
            print(f" {test_id}...", end=" ", flush=True)

            result = await judge.evaluate_test_case(
                test_id=test_id,
                test_name=f"Synthetic {intent}",
                user_input=var.input,
                expected_intent=var.expected_intent,
                detected_intent=var.expected_intent,
                response="Verstanden.",
                min_score=3.0,
            )
            results.append(result)

            if result.passed:
                print(f"āœ… {result.composite_score:.2f}")
            else:
                print(f"āŒ {result.composite_score:.2f}")

    return results


def generate_report(
    golden_metrics: BQASMetrics,
    synthetic_metrics: BQASMetrics,
    output_path: Path,
):
    """Generate HTML report.

    Writes a self-contained HTML summary of the golden and synthetic runs
    (totals, pass/fail counts, average scores, and per-intent scores) to
    ``output_path``.

    Args:
        golden_metrics: Aggregated metrics for the golden suite.
        synthetic_metrics: Aggregated metrics for the synthetic tests.
        output_path: Destination file for the HTML report.
    """
    # Build the per-intent table rows up front: referencing the loop
    # variables inside the big template f-string would be a NameError.
    intent_rows = "".join(
        f"<tr><td>{k}</td><td>{v:.3f}</td></tr>"
        for k, v in golden_metrics.scores_by_intent.items()
    )

    # NOTE(review): the original markup was garbled in extraction; this is a
    # minimal valid reconstruction carrying the same data points. The
    # "Failed Tests" section had no visible content in the original —
    # confirm whether failed tests should be listed here.
    html = f"""<!DOCTYPE html>
<html>
<head><title>BQAS Report - {datetime.now().strftime('%Y-%m-%d %H:%M')}</title></head>
<body>
<h1>BQAS Test Report</h1>

<h2>Golden Suite</h2>
<p>Total: {golden_metrics.total_tests}</p>
<p>Passed: {golden_metrics.passed_tests}</p>
<p>Failed: {golden_metrics.failed_tests}</p>
<p>Avg Score: {golden_metrics.avg_composite_score:.3f}</p>

<h2>Synthetic Tests</h2>
<p>Total: {synthetic_metrics.total_tests}</p>
<p>Passed: {synthetic_metrics.passed_tests}</p>
<p>Failed: {synthetic_metrics.failed_tests}</p>
<p>Avg Score: {synthetic_metrics.avg_composite_score:.3f}</p>

<h2>Scores by Intent</h2>
<table>
<tr><th>Intent</th><th>Score</th></tr>
{intent_rows}
</table>

<h2>Failed Tests</h2>
</body>
</html>
"""
    output_path.write_text(html)
    print(f"\nšŸ“Š Report saved to: {output_path}")


async def main():
    """CLI entry point: parse args, run suites, record and report results.

    Exits with status 1 if the judge is unreachable or any test failed.
    """
    parser = argparse.ArgumentParser(description="BQAS Test Runner")
    parser.add_argument("--all", action="store_true", help="Run all tests")
    parser.add_argument("--golden", action="store_true", help="Run golden suite only")
    parser.add_argument("--synthetic", action="store_true", help="Run synthetic tests only")
    parser.add_argument("--check-regression", action="store_true", help="Check for regression")
    parser.add_argument("--threshold", type=float, default=0.1, help="Regression threshold")
    parser.add_argument("--create-issues", action="store_true", help="Create GitHub issues for failures")
    parser.add_argument("--report", action="store_true", help="Generate HTML report")
    parser.add_argument("--output", type=str, default="bqas_report.html", help="Report output path")
    args = parser.parse_args()

    # Default to --all if no specific test type selected
    if not (args.golden or args.synthetic or args.check_regression):
        args.all = True

    print("=" * 60)
    print("BQAS - Breakpilot Quality Assurance System")
    print("=" * 60)

    config = BQASConfig.from_env()
    judge = LLMJudge(config=config)
    tracker = RegressionTracker(config=config)
    generator = SyntheticGenerator(config=config)
    backlog = BacklogGenerator(config=config)

    # Check if judge is available before doing any work.
    print("\nšŸ” Checking LLM availability...")
    is_available = await judge.health_check()
    if not is_available:
        print("āŒ LLM Judge not available. Make sure Ollama is running with the model.")
        print(f" Expected model: {config.judge_model}")
        print(f" Ollama URL: {config.ollama_base_url}")
        sys.exit(1)
    print("āœ… LLM Judge available")

    golden_results = []
    synthetic_results = []

    # Run the selected suites.
    if args.all or args.golden:
        print("\n" + "=" * 60)
        print("Running Golden Suite")
        print("=" * 60)
        golden_results = await run_golden_suite(config, judge)

    if args.all or args.synthetic:
        print("\n" + "=" * 60)
        print("Running Synthetic Tests")
        print("=" * 60)
        synthetic_results = await run_synthetic_tests(config, judge, generator)

    # Aggregate metrics for both suites (empty result lists are allowed).
    golden_metrics = BQASMetrics.from_results(golden_results)
    synthetic_metrics = BQASMetrics.from_results(synthetic_results)

    # Print summary (golden suite only; synthetic is covered by the report).
    print("\n" + golden_metrics.summary())

    # Record run so regression checks have a history to compare against.
    if golden_results:
        run = tracker.record_run(golden_metrics, synthetic_metrics.avg_composite_score)
        print(f"\nšŸ“ Run recorded: #{run.id}")

    # Check regression against the recorded history.
    if args.check_regression:
        print("\nšŸ” Checking for regression...")
        is_regression, delta, msg = tracker.check_regression(
            golden_metrics.avg_composite_score,
            args.threshold,
        )
        print(f" {msg}")

        if is_regression and args.create_issues:
            print("\nšŸ“® Creating regression alert...")
            runs = tracker.get_last_runs(1)
            if runs:
                # NOTE(review): baseline is reconstructed as current + delta;
                # verify the sign convention of ``delta`` returned by
                # check_regression (current - baseline would need "- delta").
                url = await backlog.create_regression_alert(
                    golden_metrics.avg_composite_score,
                    golden_metrics.avg_composite_score + delta,
                    delta,
                    runs[0],
                )
                if url:
                    print(f" Issue created: {url}")

    # Create a backlog issue listing the failed golden tests.
    if args.create_issues and golden_metrics.failed_tests > 0:
        print("\nšŸ“® Creating issue for test failures...")
        failed = [r for r in golden_results if not r.passed]
        runs = tracker.get_last_runs(1)
        if runs:
            url = await backlog.create_issue(
                runs[0],
                golden_metrics,
                failed,
            )
            if url:
                print(f" Issue created: {url}")

    # Generate HTML report.
    if args.report:
        generate_report(
            golden_metrics,
            synthetic_metrics,
            Path(args.output),
        )

    # Cleanup async clients.
    await judge.close()
    await generator.close()

    # Exit with error code if any tests failed (for CI integration).
    if golden_metrics.failed_tests > 0 or synthetic_metrics.failed_tests > 0:
        sys.exit(1)


if __name__ == "__main__":
    asyncio.run(main())