#!/usr/bin/env python3
|
|
"""
|
|
BQAS Runner Script
|
|
Run BQAS tests and generate reports
|
|
"""
|
|
import asyncio
|
|
import argparse
|
|
import sys
|
|
import json
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
|
|
# Add parent to path
|
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
|
|
from bqas.judge import LLMJudge
|
|
from bqas.config import BQASConfig
|
|
from bqas.regression_tracker import RegressionTracker
|
|
from bqas.synthetic_generator import SyntheticGenerator
|
|
from bqas.backlog_generator import BacklogGenerator
|
|
from bqas.metrics import BQASMetrics, TestResult
|
|
|
|
|
|
async def run_golden_suite(config: BQASConfig, judge: LLMJudge) -> list:
    """Run every golden-suite YAML test case through the LLM judge.

    Each ``*.yaml`` file under ``tests/bqas/golden_tests`` is loaded, its
    ``tests`` and ``edge_cases`` entries are combined, and every case is
    scored by the judge.  The detected intent is mocked as the expected
    intent for now, so this exercises judge scoring, not intent detection.

    Args:
        config: BQAS configuration (accepted for a uniform runner signature).
        judge: LLM judge used to evaluate each case.

    Returns:
        A list of per-case evaluation result objects.
    """
    import yaml

    outcomes = []
    suite_dir = Path(__file__).parent.parent / "tests" / "bqas" / "golden_tests"

    for suite_path in suite_dir.glob("*.yaml"):
        print(f"\n📋 Loading {suite_path.name}...")

        with open(suite_path) as fh:
            payload = yaml.safe_load(fh)

        cases = payload.get("tests", []) + payload.get("edge_cases", [])

        for case in cases:
            case_id = case.get("id", "UNKNOWN")
            print(f"  Testing {case_id}...", end=" ", flush=True)

            outcome = await judge.evaluate_test_case(
                test_id=case_id,
                test_name=case.get("name", ""),
                user_input=case.get("input", ""),
                expected_intent=case.get("expected_intent", "unknown"),
                detected_intent=case.get("expected_intent", "unknown"),  # Mock for now
                response="Verstanden.",
                min_score=case.get("min_score", 3.5),
            )
            outcomes.append(outcome)

            if outcome.passed:
                print(f"✅ {outcome.composite_score:.2f}")
            else:
                print(f"❌ {outcome.composite_score:.2f} ({outcome.reasoning[:50]})")

    return outcomes
|
|
|
|
|
|
async def run_synthetic_tests(
    config: BQASConfig,
    judge: LLMJudge,
    generator: SyntheticGenerator,
) -> list:
    """Generate synthetic input variations and score them with the judge.

    For a fixed set of intents, five variations each are produced via the
    generator's fallback path (no LLM round-trip — NOTE: this reaches into
    the private ``_generate_fallback`` helper) and evaluated with the
    expected intent mocked as the detected intent.

    Args:
        config: BQAS configuration (accepted for a uniform runner signature).
        judge: LLM judge used to evaluate each synthetic case.
        generator: Source of synthetic input variations.

    Returns:
        A list of per-case evaluation result objects.
    """
    collected = []

    print("\n🔄 Generating synthetic tests...")

    for intent in ("student_observation", "worksheet_generate", "reminder"):
        print(f"\n  Intent: {intent}")
        variants = generator._generate_fallback(intent, count=5)

        # 1-based numbering feeds the zero-padded test id (e.g. SYN-STUD-001).
        for seq, variant in enumerate(variants, start=1):
            case_id = f"SYN-{intent[:4].upper()}-{seq:03d}"
            print(f"    {case_id}...", end=" ", flush=True)

            outcome = await judge.evaluate_test_case(
                test_id=case_id,
                test_name=f"Synthetic {intent}",
                user_input=variant.input,
                expected_intent=variant.expected_intent,
                detected_intent=variant.expected_intent,
                response="Verstanden.",
                min_score=3.0,
            )
            collected.append(outcome)

            if outcome.passed:
                print(f"✅ {outcome.composite_score:.2f}")
            else:
                print(f"❌ {outcome.composite_score:.2f}")

    return collected
|
|
|
|
|
|
def generate_report(
    golden_metrics: BQASMetrics,
    synthetic_metrics: BQASMetrics,
    output_path: Path,
):
    """Render an HTML summary report for a BQAS run.

    Improvements over the naive renderer: dynamic values coming from test
    data (intent names, failed-test IDs) are HTML-escaped so unusual
    characters cannot break the markup, the document declares its charset,
    and the file is written as UTF-8 explicitly instead of relying on the
    platform's locale encoding.

    Args:
        golden_metrics: Aggregated metrics from the golden suite.
        synthetic_metrics: Aggregated metrics from the synthetic tests.
        output_path: Destination file for the rendered HTML.
    """
    # Function-scoped import matches the file's convention (cf. yaml in
    # the golden-suite runner).
    from html import escape

    # Pre-render dynamic fragments, escaping anything that originates from
    # test data rather than from our own fixed formatting.
    intent_rows = "".join(
        f"<tr><td>{escape(str(intent))}</td><td>{score:.3f}</td></tr>"
        for intent, score in golden_metrics.scores_by_intent.items()
    )
    # Cap the failure list at 20 entries to keep the report readable.
    failed_items = "".join(
        f"<li>{escape(str(tid))}</li>" for tid in golden_metrics.failed_test_ids[:20]
    )

    html = f"""<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>BQAS Report - {datetime.now().strftime('%Y-%m-%d %H:%M')}</title>
<style>
body {{ font-family: sans-serif; margin: 20px; }}
h1 {{ color: #333; }}
.summary {{ display: flex; gap: 20px; margin-bottom: 20px; }}
.card {{ background: #f5f5f5; padding: 20px; border-radius: 8px; }}
.passed {{ color: #22c55e; }}
.failed {{ color: #ef4444; }}
table {{ border-collapse: collapse; width: 100%; }}
th, td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
th {{ background: #f0f0f0; }}
</style>
</head>
<body>
<h1>BQAS Test Report</h1>

<div class="summary">
<div class="card">
<h3>Golden Suite</h3>
<p>Total: {golden_metrics.total_tests}</p>
<p class="passed">Passed: {golden_metrics.passed_tests}</p>
<p class="failed">Failed: {golden_metrics.failed_tests}</p>
<p>Avg Score: {golden_metrics.avg_composite_score:.3f}</p>
</div>

<div class="card">
<h3>Synthetic Tests</h3>
<p>Total: {synthetic_metrics.total_tests}</p>
<p class="passed">Passed: {synthetic_metrics.passed_tests}</p>
<p class="failed">Failed: {synthetic_metrics.failed_tests}</p>
<p>Avg Score: {synthetic_metrics.avg_composite_score:.3f}</p>
</div>
</div>

<h2>Scores by Intent</h2>
<table>
<tr><th>Intent</th><th>Score</th></tr>
{intent_rows}
</table>

<h2>Failed Tests</h2>
<ul>
{failed_items}
</ul>

<footer>
<p>Generated: {datetime.now().isoformat()}</p>
</footer>
</body>
</html>"""

    # Explicit UTF-8: Path.write_text defaults to the locale encoding, which
    # breaks on e.g. Windows/cp1252 for non-ASCII intent names.
    output_path.write_text(html, encoding="utf-8")
    print(f"\n📊 Report saved to: {output_path}")
|
|
|
|
|
|
async def main() -> None:
    """CLI entry point for the BQAS test runner.

    Parses flags, verifies the LLM judge is reachable, runs the selected
    suites, records metrics, optionally checks for regressions and files
    GitHub issues, renders an HTML report, and exits with status 1 when
    any test failed.
    """
    parser = argparse.ArgumentParser(description="BQAS Test Runner")
    parser.add_argument("--all", action="store_true", help="Run all tests")
    parser.add_argument("--golden", action="store_true", help="Run golden suite only")
    parser.add_argument("--synthetic", action="store_true", help="Run synthetic tests only")
    parser.add_argument("--check-regression", action="store_true", help="Check for regression")
    parser.add_argument("--threshold", type=float, default=0.1, help="Regression threshold")
    parser.add_argument("--create-issues", action="store_true", help="Create GitHub issues for failures")
    parser.add_argument("--report", action="store_true", help="Generate HTML report")
    parser.add_argument("--output", type=str, default="bqas_report.html", help="Report output path")

    args = parser.parse_args()

    # Default to --all if no specific test type selected.
    # NOTE(review): --report/--create-issues alone also fall through to a
    # full run, since only the three flags below suppress the default.
    if not (args.golden or args.synthetic or args.check_regression):
        args.all = True

    print("=" * 60)
    print("BQAS - Breakpilot Quality Assurance System")
    print("=" * 60)

    # Every collaborator is configured from environment variables.
    config = BQASConfig.from_env()
    judge = LLMJudge(config=config)
    tracker = RegressionTracker(config=config)
    generator = SyntheticGenerator(config=config)
    backlog = BacklogGenerator(config=config)

    # Check if judge is available — fail fast before running any suite.
    print("\n🔍 Checking LLM availability...")
    is_available = await judge.health_check()
    if not is_available:
        print("❌ LLM Judge not available. Make sure Ollama is running with the model.")
        print(f"  Expected model: {config.judge_model}")
        print(f"  Ollama URL: {config.ollama_base_url}")
        sys.exit(1)
    print("✅ LLM Judge available")

    golden_results = []
    synthetic_results = []

    # Run tests
    if args.all or args.golden:
        print("\n" + "=" * 60)
        print("Running Golden Suite")
        print("=" * 60)
        golden_results = await run_golden_suite(config, judge)

    if args.all or args.synthetic:
        print("\n" + "=" * 60)
        print("Running Synthetic Tests")
        print("=" * 60)
        synthetic_results = await run_synthetic_tests(config, judge, generator)

    # Calculate metrics (empty result lists still yield a metrics object,
    # e.g. when only --check-regression was requested).
    golden_metrics = BQASMetrics.from_results(golden_results)
    synthetic_metrics = BQASMetrics.from_results(synthetic_results)

    # Print summary
    print("\n" + golden_metrics.summary())

    # Record run — only when the golden suite actually executed.
    if golden_results:
        run = tracker.record_run(golden_metrics, synthetic_metrics.avg_composite_score)
        print(f"\n📝 Run recorded: #{run.id}")

    # Check regression against the tracker's recorded history.
    if args.check_regression:
        print("\n🔍 Checking for regression...")
        is_regression, delta, msg = tracker.check_regression(
            golden_metrics.avg_composite_score,
            args.threshold,
        )
        print(f"  {msg}")

        if is_regression and args.create_issues:
            print("\n📮 Creating regression alert...")
            runs = tracker.get_last_runs(1)
            if runs:
                # NOTE(review): the previous score is reconstructed as
                # current + delta — confirm delta's sign convention in
                # RegressionTracker.check_regression.
                url = await backlog.create_regression_alert(
                    golden_metrics.avg_composite_score,
                    golden_metrics.avg_composite_score + delta,
                    delta,
                    runs[0],
                )
                if url:
                    print(f"  Issue created: {url}")

    # Create issues for failures
    if args.create_issues and golden_metrics.failed_tests > 0:
        print("\n📮 Creating issue for test failures...")
        failed = [r for r in golden_results if not r.passed]
        runs = tracker.get_last_runs(1)
        if runs:
            url = await backlog.create_issue(
                runs[0],
                golden_metrics,
                failed,
            )
            if url:
                print(f"  Issue created: {url}")

    # Generate report
    if args.report:
        generate_report(
            golden_metrics,
            synthetic_metrics,
            Path(args.output),
        )

    # Cleanup: close async clients before the event loop shuts down.
    await judge.close()
    await generator.close()

    # Exit with error code if tests failed
    if golden_metrics.failed_tests > 0 or synthetic_metrics.failed_tests > 0:
        sys.exit(1)
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry: asyncio.run creates and tears down the event loop
    # around the async main().
    asyncio.run(main())
|