feat: voice-service von lehrer nach core verschoben, Pipeline erweitert (voice, BQAS, embedding, night-scheduler)
This commit is contained in:
286
voice-service/scripts/run_bqas.py
Executable file
286
voice-service/scripts/run_bqas.py
Executable file
@@ -0,0 +1,286 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
BQAS Runner Script
|
||||
Run BQAS tests and generate reports
|
||||
"""
|
||||
import asyncio
|
||||
import argparse
|
||||
import sys
|
||||
import json
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
|
||||
# Add parent to path
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
from bqas.judge import LLMJudge
|
||||
from bqas.config import BQASConfig
|
||||
from bqas.regression_tracker import RegressionTracker
|
||||
from bqas.synthetic_generator import SyntheticGenerator
|
||||
from bqas.backlog_generator import BacklogGenerator
|
||||
from bqas.metrics import BQASMetrics, TestResult
|
||||
|
||||
|
||||
async def run_golden_suite(config: BQASConfig, judge: LLMJudge) -> list:
    """Run the golden test suite and return one result per test case.

    Loads every ``*.yaml`` file under ``tests/bqas/golden_tests``, evaluates
    the regular ``tests`` plus ``edge_cases`` entries with the LLM judge,
    and prints a per-test pass/fail line as it goes.

    Args:
        config: BQAS configuration (unused here; kept for signature symmetry
            with the other runners).
        judge: LLM judge used to score each test case.

    Returns:
        List of judge evaluation results, one per test case.
    """
    import yaml

    results = []
    golden_dir = Path(__file__).parent.parent / "tests" / "bqas" / "golden_tests"

    # sorted() makes the run order deterministic; bare glob() order is
    # filesystem-dependent, which made diffs between runs noisy.
    for yaml_file in sorted(golden_dir.glob("*.yaml")):
        print(f"\n📋 Loading {yaml_file.name}...")

        # Explicit UTF-8: the test fixtures contain non-ASCII (German) text
        # and must not depend on the platform's default encoding.
        with open(yaml_file, encoding="utf-8") as f:
            data = yaml.safe_load(f)

        tests = data.get("tests", []) + data.get("edge_cases", [])

        for test in tests:
            test_id = test.get("id", "UNKNOWN")
            print(f" Testing {test_id}...", end=" ", flush=True)

            result = await judge.evaluate_test_case(
                test_id=test_id,
                test_name=test.get("name", ""),
                user_input=test.get("input", ""),
                expected_intent=test.get("expected_intent", "unknown"),
                detected_intent=test.get("expected_intent", "unknown"),  # Mock for now
                response="Verstanden.",
                min_score=test.get("min_score", 3.5),
            )

            results.append(result)

            if result.passed:
                print(f"✅ {result.composite_score:.2f}")
            else:
                print(f"❌ {result.composite_score:.2f} ({result.reasoning[:50]})")

    return results
|
||||
|
||||
|
||||
async def run_synthetic_tests(
    config: BQASConfig,
    judge: LLMJudge,
    generator: SyntheticGenerator,
) -> list:
    """Generate synthetic input variations per intent and score each one.

    For a fixed set of intents, asks the generator for fallback variations,
    evaluates every variation with the judge, and prints a pass/fail line
    per case.

    Returns:
        List of judge evaluation results, one per synthetic variation.
    """
    collected = []

    print("\n🔄 Generating synthetic tests...")

    for intent in ("student_observation", "worksheet_generate", "reminder"):
        print(f"\n Intent: {intent}")

        variations = generator._generate_fallback(intent, count=5)

        for index, variation in enumerate(variations, start=1):
            case_id = f"SYN-{intent[:4].upper()}-{index:03d}"
            print(f" {case_id}...", end=" ", flush=True)

            outcome = await judge.evaluate_test_case(
                test_id=case_id,
                test_name=f"Synthetic {intent}",
                user_input=variation.input,
                expected_intent=variation.expected_intent,
                detected_intent=variation.expected_intent,
                response="Verstanden.",
                min_score=3.0,
            )

            collected.append(outcome)

            # Same output as the verbose if/else: marker, space, score.
            marker = "✅" if outcome.passed else "❌"
            print(f"{marker} {outcome.composite_score:.2f}")

    return collected
|
||||
|
||||
|
||||
def generate_report(
    golden_metrics: BQASMetrics,
    synthetic_metrics: BQASMetrics,
    output_path: Path,
):
    """Render a self-contained HTML report and write it to *output_path*.

    The report shows summary cards for the golden and synthetic runs,
    a per-intent score table, and up to 20 failed test ids.

    Args:
        golden_metrics: Aggregated metrics from the golden suite run.
        synthetic_metrics: Aggregated metrics from the synthetic run.
        output_path: File path the HTML document is written to.
    """
    # NOTE(review): intent names and test ids are interpolated unescaped —
    # fine for trusted internal data, revisit if inputs become external.
    html = f"""<!DOCTYPE html>
<html>
<head>
    <title>BQAS Report - {datetime.now().strftime('%Y-%m-%d %H:%M')}</title>
    <style>
        body {{ font-family: sans-serif; margin: 20px; }}
        h1 {{ color: #333; }}
        .summary {{ display: flex; gap: 20px; margin-bottom: 20px; }}
        .card {{ background: #f5f5f5; padding: 20px; border-radius: 8px; }}
        .passed {{ color: #22c55e; }}
        .failed {{ color: #ef4444; }}
        table {{ border-collapse: collapse; width: 100%; }}
        th, td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
        th {{ background: #f0f0f0; }}
    </style>
</head>
<body>
    <h1>BQAS Test Report</h1>

    <div class="summary">
        <div class="card">
            <h3>Golden Suite</h3>
            <p>Total: {golden_metrics.total_tests}</p>
            <p class="passed">Passed: {golden_metrics.passed_tests}</p>
            <p class="failed">Failed: {golden_metrics.failed_tests}</p>
            <p>Avg Score: {golden_metrics.avg_composite_score:.3f}</p>
        </div>

        <div class="card">
            <h3>Synthetic Tests</h3>
            <p>Total: {synthetic_metrics.total_tests}</p>
            <p class="passed">Passed: {synthetic_metrics.passed_tests}</p>
            <p class="failed">Failed: {synthetic_metrics.failed_tests}</p>
            <p>Avg Score: {synthetic_metrics.avg_composite_score:.3f}</p>
        </div>
    </div>

    <h2>Scores by Intent</h2>
    <table>
        <tr><th>Intent</th><th>Score</th></tr>
        {''.join(f"<tr><td>{k}</td><td>{v:.3f}</td></tr>" for k, v in golden_metrics.scores_by_intent.items())}
    </table>

    <h2>Failed Tests</h2>
    <ul>
        {''.join(f"<li>{tid}</li>" for tid in golden_metrics.failed_test_ids[:20])}
    </ul>

    <footer>
        <p>Generated: {datetime.now().isoformat()}</p>
    </footer>
</body>
</html>"""

    # Explicit UTF-8: test ids / intents may contain non-ASCII characters,
    # and the platform default codec must not decide whether this raises.
    output_path.write_text(html, encoding="utf-8")
    print(f"\n📊 Report saved to: {output_path}")
|
||||
|
||||
|
||||
async def main():
    """CLI entry point: parse args, run selected suites, report, and exit.

    Exits with status 1 when the LLM judge is unreachable or any test
    failed. Judge/generator network resources are always released, even
    when a run aborts early.
    """
    parser = argparse.ArgumentParser(description="BQAS Test Runner")
    parser.add_argument("--all", action="store_true", help="Run all tests")
    parser.add_argument("--golden", action="store_true", help="Run golden suite only")
    parser.add_argument("--synthetic", action="store_true", help="Run synthetic tests only")
    parser.add_argument("--check-regression", action="store_true", help="Check for regression")
    parser.add_argument("--threshold", type=float, default=0.1, help="Regression threshold")
    parser.add_argument("--create-issues", action="store_true", help="Create GitHub issues for failures")
    parser.add_argument("--report", action="store_true", help="Generate HTML report")
    parser.add_argument("--output", type=str, default="bqas_report.html", help="Report output path")

    args = parser.parse_args()

    # Default to --all if no specific test type selected
    if not (args.golden or args.synthetic or args.check_regression):
        args.all = True

    print("=" * 60)
    print("BQAS - Breakpilot Quality Assurance System")
    print("=" * 60)

    config = BQASConfig.from_env()
    judge = LLMJudge(config=config)
    tracker = RegressionTracker(config=config)
    generator = SyntheticGenerator(config=config)
    backlog = BacklogGenerator(config=config)

    # try/finally so the judge/generator sessions are closed even when the
    # run aborts (exception or the health-check sys.exit below) — the
    # original only cleaned up on the full-success path.
    try:
        # Check if judge is available
        print("\n🔍 Checking LLM availability...")
        is_available = await judge.health_check()
        if not is_available:
            print("❌ LLM Judge not available. Make sure Ollama is running with the model.")
            print(f" Expected model: {config.judge_model}")
            print(f" Ollama URL: {config.ollama_base_url}")
            sys.exit(1)
        print("✅ LLM Judge available")

        golden_results = []
        synthetic_results = []

        # Run tests
        if args.all or args.golden:
            print("\n" + "=" * 60)
            print("Running Golden Suite")
            print("=" * 60)
            golden_results = await run_golden_suite(config, judge)

        if args.all or args.synthetic:
            print("\n" + "=" * 60)
            print("Running Synthetic Tests")
            print("=" * 60)
            synthetic_results = await run_synthetic_tests(config, judge, generator)

        # Calculate metrics
        golden_metrics = BQASMetrics.from_results(golden_results)
        synthetic_metrics = BQASMetrics.from_results(synthetic_results)

        # Print summary
        print("\n" + golden_metrics.summary())

        # Record run
        if golden_results:
            run = tracker.record_run(golden_metrics, synthetic_metrics.avg_composite_score)
            print(f"\n📝 Run recorded: #{run.id}")

        # Check regression
        if args.check_regression:
            print("\n🔍 Checking for regression...")
            is_regression, delta, msg = tracker.check_regression(
                golden_metrics.avg_composite_score,
                args.threshold,
            )
            print(f" {msg}")

            if is_regression and args.create_issues:
                print("\n📮 Creating regression alert...")
                runs = tracker.get_last_runs(1)
                if runs:
                    url = await backlog.create_regression_alert(
                        golden_metrics.avg_composite_score,
                        golden_metrics.avg_composite_score + delta,
                        delta,
                        runs[0],
                    )
                    if url:
                        print(f" Issue created: {url}")

        # Create issues for failures
        if args.create_issues and golden_metrics.failed_tests > 0:
            print("\n📮 Creating issue for test failures...")
            failed = [r for r in golden_results if not r.passed]
            runs = tracker.get_last_runs(1)
            if runs:
                url = await backlog.create_issue(
                    runs[0],
                    golden_metrics,
                    failed,
                )
                if url:
                    print(f" Issue created: {url}")

        # Generate report
        if args.report:
            generate_report(
                golden_metrics,
                synthetic_metrics,
                Path(args.output),
            )
    finally:
        # Cleanup — always runs, including on sys.exit / exceptions above.
        await judge.close()
        await generator.close()

    # Exit with error code if tests failed
    if golden_metrics.failed_tests > 0 or synthetic_metrics.failed_tests > 0:
        sys.exit(1)
|
||||
|
||||
|
||||
# Script entry point: drive the async main() under a fresh event loop.
if __name__ == "__main__":
    asyncio.run(main())
|
||||
Reference in New Issue
Block a user