#!/usr/bin/env python3
|
|
"""
|
|
BQAS Runner Script
|
|
Run BQAS tests and generate reports
|
|
"""
|
|
import asyncio
|
|
import argparse
|
|
import sys
|
|
import json
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
|
|
# Add parent to path
|
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
|
|
from bqas.judge import LLMJudge
|
|
from bqas.config import BQASConfig
|
|
from bqas.regression_tracker import RegressionTracker
|
|
from bqas.synthetic_generator import SyntheticGenerator
|
|
from bqas.backlog_generator import BacklogGenerator
|
|
from bqas.metrics import BQASMetrics, TestResult
|
|
|
|
|
|
async def run_golden_suite(config: BQASConfig, judge: LLMJudge) -> list:
    """Run every golden-suite YAML test case through the LLM judge.

    Each ``*.yaml`` file under ``tests/bqas/golden_tests`` is loaded, its
    ``tests`` and ``edge_cases`` entries are combined, and every case is
    scored by the judge.  The detected intent is mocked as the expected
    intent for now, so this exercises judge scoring, not intent detection.

    Args:
        config: BQAS configuration (accepted for a uniform runner signature).
        judge: LLM judge used to evaluate each case.

    Returns:
        A list of per-case evaluation result objects.
    """
    import yaml

    outcomes = []
    suite_dir = Path(__file__).parent.parent / "tests" / "bqas" / "golden_tests"

    for suite_path in suite_dir.glob("*.yaml"):
        print(f"\n📋 Loading {suite_path.name}...")

        with open(suite_path) as fh:
            payload = yaml.safe_load(fh)

        cases = payload.get("tests", []) + payload.get("edge_cases", [])

        for case in cases:
            case_id = case.get("id", "UNKNOWN")
            print(f"  Testing {case_id}...", end=" ", flush=True)

            outcome = await judge.evaluate_test_case(
                test_id=case_id,
                test_name=case.get("name", ""),
                user_input=case.get("input", ""),
                expected_intent=case.get("expected_intent", "unknown"),
                detected_intent=case.get("expected_intent", "unknown"),  # Mock for now
                response="Verstanden.",
                min_score=case.get("min_score", 3.5),
            )
            outcomes.append(outcome)

            if outcome.passed:
                print(f"✅ {outcome.composite_score:.2f}")
            else:
                print(f"❌ {outcome.composite_score:.2f} ({outcome.reasoning[:50]})")

    return outcomes
|
|
|
|
|
|
async def run_synthetic_tests(
    config: BQASConfig,
    judge: LLMJudge,
    generator: SyntheticGenerator,
) -> list:
    """Generate synthetic input variations and score them with the judge.

    For a fixed set of intents, five variations each are produced via the
    generator's fallback path (no LLM round-trip — NOTE: this reaches into
    the private ``_generate_fallback`` helper) and evaluated with the
    expected intent mocked as the detected intent.

    Args:
        config: BQAS configuration (accepted for a uniform runner signature).
        judge: LLM judge used to evaluate each synthetic case.
        generator: Source of synthetic input variations.

    Returns:
        A list of per-case evaluation result objects.
    """
    collected = []

    print("\n🔄 Generating synthetic tests...")

    for intent in ("student_observation", "worksheet_generate", "reminder"):
        print(f"\n  Intent: {intent}")
        variants = generator._generate_fallback(intent, count=5)

        # 1-based numbering feeds the zero-padded test id (e.g. SYN-STUD-001).
        for seq, variant in enumerate(variants, start=1):
            case_id = f"SYN-{intent[:4].upper()}-{seq:03d}"
            print(f"    {case_id}...", end=" ", flush=True)

            outcome = await judge.evaluate_test_case(
                test_id=case_id,
                test_name=f"Synthetic {intent}",
                user_input=variant.input,
                expected_intent=variant.expected_intent,
                detected_intent=variant.expected_intent,
                response="Verstanden.",
                min_score=3.0,
            )
            collected.append(outcome)

            if outcome.passed:
                print(f"✅ {outcome.composite_score:.2f}")
            else:
                print(f"❌ {outcome.composite_score:.2f}")

    return collected
|
|
|
|
|
|
def generate_report(
    golden_metrics: BQASMetrics,
    synthetic_metrics: BQASMetrics,
    output_path: Path,
):
    """Render an HTML summary report for a BQAS run.

    Improvements over the naive renderer: dynamic values coming from test
    data (intent names, failed-test IDs) are HTML-escaped so unusual
    characters cannot break the markup, the document declares its charset,
    and the file is written as UTF-8 explicitly instead of relying on the
    platform's locale encoding.

    Args:
        golden_metrics: Aggregated metrics from the golden suite.
        synthetic_metrics: Aggregated metrics from the synthetic tests.
        output_path: Destination file for the rendered HTML.
    """
    # Function-scoped import matches the file's convention (cf. yaml in
    # the golden-suite runner).
    from html import escape

    # Pre-render dynamic fragments, escaping anything that originates from
    # test data rather than from our own fixed formatting.
    intent_rows = "".join(
        f"<tr><td>{escape(str(intent))}</td><td>{score:.3f}</td></tr>"
        for intent, score in golden_metrics.scores_by_intent.items()
    )
    # Cap the failure list at 20 entries to keep the report readable.
    failed_items = "".join(
        f"<li>{escape(str(tid))}</li>" for tid in golden_metrics.failed_test_ids[:20]
    )

    html = f"""<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>BQAS Report - {datetime.now().strftime('%Y-%m-%d %H:%M')}</title>
<style>
body {{ font-family: sans-serif; margin: 20px; }}
h1 {{ color: #333; }}
.summary {{ display: flex; gap: 20px; margin-bottom: 20px; }}
.card {{ background: #f5f5f5; padding: 20px; border-radius: 8px; }}
.passed {{ color: #22c55e; }}
.failed {{ color: #ef4444; }}
table {{ border-collapse: collapse; width: 100%; }}
th, td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
th {{ background: #f0f0f0; }}
</style>
</head>
<body>
<h1>BQAS Test Report</h1>

<div class="summary">
<div class="card">
<h3>Golden Suite</h3>
<p>Total: {golden_metrics.total_tests}</p>
<p class="passed">Passed: {golden_metrics.passed_tests}</p>
<p class="failed">Failed: {golden_metrics.failed_tests}</p>
<p>Avg Score: {golden_metrics.avg_composite_score:.3f}</p>
</div>

<div class="card">
<h3>Synthetic Tests</h3>
<p>Total: {synthetic_metrics.total_tests}</p>
<p class="passed">Passed: {synthetic_metrics.passed_tests}</p>
<p class="failed">Failed: {synthetic_metrics.failed_tests}</p>
<p>Avg Score: {synthetic_metrics.avg_composite_score:.3f}</p>
</div>
</div>

<h2>Scores by Intent</h2>
<table>
<tr><th>Intent</th><th>Score</th></tr>
{intent_rows}
</table>

<h2>Failed Tests</h2>
<ul>
{failed_items}
</ul>

<footer>
<p>Generated: {datetime.now().isoformat()}</p>
</footer>
</body>
</html>"""

    # Explicit UTF-8: Path.write_text defaults to the locale encoding, which
    # breaks on e.g. Windows/cp1252 for non-ASCII intent names.
    output_path.write_text(html, encoding="utf-8")
    print(f"\n📊 Report saved to: {output_path}")
|
|
|
|
|
|
async def main() -> None:
    """CLI entry point for the BQAS test runner.

    Parses flags, verifies the LLM judge is reachable, runs the selected
    suites, records metrics, optionally checks for regressions and files
    GitHub issues, renders an HTML report, and exits with status 1 when
    any test failed.
    """
    parser = argparse.ArgumentParser(description="BQAS Test Runner")
    parser.add_argument("--all", action="store_true", help="Run all tests")
    parser.add_argument("--golden", action="store_true", help="Run golden suite only")
    parser.add_argument("--synthetic", action="store_true", help="Run synthetic tests only")
    parser.add_argument("--check-regression", action="store_true", help="Check for regression")
    parser.add_argument("--threshold", type=float, default=0.1, help="Regression threshold")
    parser.add_argument("--create-issues", action="store_true", help="Create GitHub issues for failures")
    parser.add_argument("--report", action="store_true", help="Generate HTML report")
    parser.add_argument("--output", type=str, default="bqas_report.html", help="Report output path")

    args = parser.parse_args()

    # Default to --all if no specific test type selected.
    # NOTE(review): --report/--create-issues alone also fall through to a
    # full run, since only the three flags below suppress the default.
    if not (args.golden or args.synthetic or args.check_regression):
        args.all = True

    print("=" * 60)
    print("BQAS - Breakpilot Quality Assurance System")
    print("=" * 60)

    # Every collaborator is configured from environment variables.
    config = BQASConfig.from_env()
    judge = LLMJudge(config=config)
    tracker = RegressionTracker(config=config)
    generator = SyntheticGenerator(config=config)
    backlog = BacklogGenerator(config=config)

    # Check if judge is available — fail fast before running any suite.
    print("\n🔍 Checking LLM availability...")
    is_available = await judge.health_check()
    if not is_available:
        print("❌ LLM Judge not available. Make sure Ollama is running with the model.")
        print(f"  Expected model: {config.judge_model}")
        print(f"  Ollama URL: {config.ollama_base_url}")
        sys.exit(1)
    print("✅ LLM Judge available")

    golden_results = []
    synthetic_results = []

    # Run tests
    if args.all or args.golden:
        print("\n" + "=" * 60)
        print("Running Golden Suite")
        print("=" * 60)
        golden_results = await run_golden_suite(config, judge)

    if args.all or args.synthetic:
        print("\n" + "=" * 60)
        print("Running Synthetic Tests")
        print("=" * 60)
        synthetic_results = await run_synthetic_tests(config, judge, generator)

    # Calculate metrics (empty result lists still yield a metrics object,
    # e.g. when only --check-regression was requested).
    golden_metrics = BQASMetrics.from_results(golden_results)
    synthetic_metrics = BQASMetrics.from_results(synthetic_results)

    # Print summary
    print("\n" + golden_metrics.summary())

    # Record run — only when the golden suite actually executed.
    if golden_results:
        run = tracker.record_run(golden_metrics, synthetic_metrics.avg_composite_score)
        print(f"\n📝 Run recorded: #{run.id}")

    # Check regression against the tracker's recorded history.
    if args.check_regression:
        print("\n🔍 Checking for regression...")
        is_regression, delta, msg = tracker.check_regression(
            golden_metrics.avg_composite_score,
            args.threshold,
        )
        print(f"  {msg}")

        if is_regression and args.create_issues:
            print("\n📮 Creating regression alert...")
            runs = tracker.get_last_runs(1)
            if runs:
                # NOTE(review): the previous score is reconstructed as
                # current + delta — confirm delta's sign convention in
                # RegressionTracker.check_regression.
                url = await backlog.create_regression_alert(
                    golden_metrics.avg_composite_score,
                    golden_metrics.avg_composite_score + delta,
                    delta,
                    runs[0],
                )
                if url:
                    print(f"  Issue created: {url}")

    # Create issues for failures
    if args.create_issues and golden_metrics.failed_tests > 0:
        print("\n📮 Creating issue for test failures...")
        failed = [r for r in golden_results if not r.passed]
        runs = tracker.get_last_runs(1)
        if runs:
            url = await backlog.create_issue(
                runs[0],
                golden_metrics,
                failed,
            )
            if url:
                print(f"  Issue created: {url}")

    # Generate report
    if args.report:
        generate_report(
            golden_metrics,
            synthetic_metrics,
            Path(args.output),
        )

    # Cleanup: close async clients before the event loop shuts down.
    await judge.close()
    await generator.close()

    # Exit with error code if tests failed
    if golden_metrics.failed_tests > 0 or synthetic_metrics.failed_tests > 0:
        sys.exit(1)
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry: asyncio.run creates and tears down the event loop
    # around the async main().
    asyncio.run(main())
|