#!/usr/bin/env python3
"""
BQAS Runner Script
Run BQAS tests and generate reports
"""
import asyncio
import argparse
import sys
import json
from pathlib import Path
from datetime import datetime
# Add parent to path
sys.path.insert(0, str(Path(__file__).parent.parent))
from bqas.judge import LLMJudge
from bqas.config import BQASConfig
from bqas.regression_tracker import RegressionTracker
from bqas.synthetic_generator import SyntheticGenerator
from bqas.backlog_generator import BacklogGenerator
from bqas.metrics import BQASMetrics, TestResult
async def run_golden_suite(config: "BQASConfig", judge: "LLMJudge") -> list:
    """Run the golden test suite.

    Loads every ``*.yaml`` suite file under ``tests/bqas/golden_tests``
    (both its ``tests`` and ``edge_cases`` sections) and evaluates each
    case through the LLM judge.

    Args:
        config: Active BQAS configuration (unused here; kept for signature
            parity with the other runners).
        judge: Judge used to score each test case.

    Returns:
        list: One judge result per test case, in deterministic file order.
    """
    results = []
    golden_dir = Path(__file__).parent.parent / "tests" / "bqas" / "golden_tests"
    # sorted() makes the run order deterministic; glob order is filesystem-dependent.
    for yaml_file in sorted(golden_dir.glob("*.yaml")):
        # Imported lazily so the function works without PyYAML installed when
        # no suite files exist (module imports are cached, so loop cost is nil).
        import yaml

        print(f"\nLoading {yaml_file.name}...")
        with open(yaml_file) as f:
            # An empty YAML file parses to None; treat it as "no tests".
            data = yaml.safe_load(f) or {}
        tests = data.get("tests", []) + data.get("edge_cases", [])
        for test in tests:
            test_id = test.get("id", "UNKNOWN")
            print(f" Testing {test_id}...", end=" ", flush=True)
            result = await judge.evaluate_test_case(
                test_id=test_id,
                test_name=test.get("name", ""),
                user_input=test.get("input", ""),
                expected_intent=test.get("expected_intent", "unknown"),
                detected_intent=test.get("expected_intent", "unknown"),  # Mock for now
                response="Verstanden.",
                min_score=test.get("min_score", 3.5),
            )
            results.append(result)
            if result.passed:
                print(f"PASS {result.composite_score:.2f}")
            else:
                print(f"FAIL {result.composite_score:.2f} ({result.reasoning[:50]})")
    return results
async def run_synthetic_tests(
    config: "BQASConfig",
    judge: "LLMJudge",
    generator: "SyntheticGenerator",
) -> list:
    """Run synthetic tests.

    Generates fallback variations for a fixed set of intents and scores
    each one through the LLM judge.

    Args:
        config: Active BQAS configuration (unused here; kept for signature
            parity with the other runners).
        judge: Judge used to score each generated case.
        generator: Source of synthetic input variations.

    Returns:
        list: One judge result per generated variation.
    """
    results = []
    print("\nGenerating synthetic tests...")
    intents = ["student_observation", "worksheet_generate", "reminder"]
    for intent in intents:
        print(f"\n Intent: {intent}")
        variations = generator._generate_fallback(intent, count=5)
        for i, var in enumerate(variations):
            test_id = f"SYN-{intent[:4].upper()}-{i+1:03d}"
            print(f" {test_id}...", end=" ", flush=True)
            result = await judge.evaluate_test_case(
                test_id=test_id,
                test_name=f"Synthetic {intent}",
                user_input=var.input,
                expected_intent=var.expected_intent,
                # Mocked: detection echoes the expected intent for now.
                detected_intent=var.expected_intent,
                response="Verstanden.",
                min_score=3.0,
            )
            results.append(result)
            if result.passed:
                print(f"PASS {result.composite_score:.2f}")
            else:
                print(f"FAIL {result.composite_score:.2f}")
    return results
def generate_report(
    golden_metrics: "BQASMetrics",
    synthetic_metrics: "BQASMetrics",
    output_path: Path,
):
    """Generate HTML report.

    Renders a summary of the golden and synthetic runs (totals, pass/fail
    counts, average scores, per-intent scores, and up to 20 failed test IDs)
    and writes it to *output_path*.

    Args:
        golden_metrics: Aggregated metrics of the golden suite run.
        synthetic_metrics: Aggregated metrics of the synthetic run.
        output_path: File the report is written to.
    """
    # Pre-build the repeated sections; keeping joins out of the template
    # avoids backslashes inside f-string expressions (invalid before 3.12).
    intent_rows = "".join(
        f"| {k} | {v:.3f} |\n" for k, v in golden_metrics.scores_by_intent.items()
    )
    failed_list = "".join(f"- {tid}\n" for tid in golden_metrics.failed_test_ids[:20])
    html = f"""
BQAS Report - {datetime.now().strftime('%Y-%m-%d %H:%M')}
BQAS Test Report
Golden Suite
Total: {golden_metrics.total_tests}
Passed: {golden_metrics.passed_tests}
Failed: {golden_metrics.failed_tests}
Avg Score: {golden_metrics.avg_composite_score:.3f}
Synthetic Tests
Total: {synthetic_metrics.total_tests}
Passed: {synthetic_metrics.passed_tests}
Failed: {synthetic_metrics.failed_tests}
Avg Score: {synthetic_metrics.avg_composite_score:.3f}
Scores by Intent
| Intent | Score |
{intent_rows}
Failed Tests
{failed_list}
"""
    # Explicit UTF-8: the default encoding is locale-dependent and can fail
    # or garble non-ASCII content on some platforms.
    output_path.write_text(html, encoding="utf-8")
    print(f"\nReport saved to: {output_path}")
async def main():
    """CLI entry point: run the selected BQAS suites and post-process results.

    Parses command-line flags, verifies the LLM judge is reachable, runs the
    golden and/or synthetic suites, records the run, and optionally checks
    for regressions, files GitHub issues, and writes an HTML report.
    Exits non-zero when the judge is unavailable or any test failed.
    """
    parser = argparse.ArgumentParser(description="BQAS Test Runner")
    parser.add_argument("--all", action="store_true", help="Run all tests")
    parser.add_argument("--golden", action="store_true", help="Run golden suite only")
    parser.add_argument("--synthetic", action="store_true", help="Run synthetic tests only")
    parser.add_argument("--check-regression", action="store_true", help="Check for regression")
    parser.add_argument("--threshold", type=float, default=0.1, help="Regression threshold")
    parser.add_argument("--create-issues", action="store_true", help="Create GitHub issues for failures")
    parser.add_argument("--report", action="store_true", help="Generate HTML report")
    parser.add_argument("--output", type=str, default="bqas_report.html", help="Report output path")
    args = parser.parse_args()

    # Default to --all if no specific test type selected
    if not (args.golden or args.synthetic or args.check_regression):
        args.all = True

    print("=" * 60)
    print("BQAS - Breakpilot Quality Assurance System")
    print("=" * 60)

    config = BQASConfig.from_env()
    judge = LLMJudge(config=config)
    tracker = RegressionTracker(config=config)
    generator = SyntheticGenerator(config=config)
    backlog = BacklogGenerator(config=config)

    # Fail fast when the judge model cannot be reached.
    print("\nChecking LLM availability...")
    is_available = await judge.health_check()
    if not is_available:
        print("ERROR: LLM Judge not available. Make sure Ollama is running with the model.")
        print(f" Expected model: {config.judge_model}")
        print(f" Ollama URL: {config.ollama_base_url}")
        sys.exit(1)
    print("LLM Judge available")

    golden_results = []
    synthetic_results = []

    # Run the requested suites.
    if args.all or args.golden:
        print("\n" + "=" * 60)
        print("Running Golden Suite")
        print("=" * 60)
        golden_results = await run_golden_suite(config, judge)
    if args.all or args.synthetic:
        print("\n" + "=" * 60)
        print("Running Synthetic Tests")
        print("=" * 60)
        synthetic_results = await run_synthetic_tests(config, judge, generator)

    # Aggregate metrics for each suite (works on empty result lists too).
    golden_metrics = BQASMetrics.from_results(golden_results)
    synthetic_metrics = BQASMetrics.from_results(synthetic_results)

    # Print summary
    print("\n" + golden_metrics.summary())

    # Persist the run so later invocations can detect regressions.
    if golden_results:
        run = tracker.record_run(golden_metrics, synthetic_metrics.avg_composite_score)
        print(f"\nRun recorded: #{run.id}")

    # Compare the current score against history when requested.
    if args.check_regression:
        print("\nChecking for regression...")
        is_regression, delta, msg = tracker.check_regression(
            golden_metrics.avg_composite_score,
            args.threshold,
        )
        print(f" {msg}")
        if is_regression and args.create_issues:
            print("\nCreating regression alert...")
            runs = tracker.get_last_runs(1)
            if runs:
                url = await backlog.create_regression_alert(
                    golden_metrics.avg_composite_score,
                    golden_metrics.avg_composite_score + delta,
                    delta,
                    runs[0],
                )
                if url:
                    print(f" Issue created: {url}")

    # File a single issue summarizing golden-suite failures.
    if args.create_issues and golden_metrics.failed_tests > 0:
        print("\nCreating issue for test failures...")
        failed = [r for r in golden_results if not r.passed]
        runs = tracker.get_last_runs(1)
        if runs:
            url = await backlog.create_issue(
                runs[0],
                golden_metrics,
                failed,
            )
            if url:
                print(f" Issue created: {url}")

    # Generate report
    if args.report:
        generate_report(
            golden_metrics,
            synthetic_metrics,
            Path(args.output),
        )

    # Release the judge's and generator's client resources.
    await judge.close()
    await generator.close()

    # Non-zero exit signals CI that at least one test failed.
    if golden_metrics.failed_tests > 0 or synthetic_metrics.failed_tests > 0:
        sys.exit(1)
if __name__ == "__main__":
    # Script entry point: drive the async main() on a fresh event loop.
    asyncio.run(main())