breakpilot-pwa/voice-service/scripts/run_bqas.py
Commit 21a844cb8a (Benjamin Admin): fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.

This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).

Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Committed 2026-02-09 09:51:32 +01:00

287 lines · 9.2 KiB · Python · executable file

#!/usr/bin/env python3
"""
BQAS Runner Script
Run BQAS tests and generate reports
"""

import asyncio
import argparse
import sys
import json
from pathlib import Path
from datetime import datetime

# Add parent to path so the local bqas package is importable when run as a script
sys.path.insert(0, str(Path(__file__).parent.parent))

from bqas.judge import LLMJudge
from bqas.config import BQASConfig
from bqas.regression_tracker import RegressionTracker
from bqas.synthetic_generator import SyntheticGenerator
from bqas.backlog_generator import BacklogGenerator
from bqas.metrics import BQASMetrics, TestResult


async def run_golden_suite(config: BQASConfig, judge: LLMJudge) -> list:
    """Run the golden test suite."""
    import yaml

    results = []
    golden_dir = Path(__file__).parent.parent / "tests" / "bqas" / "golden_tests"

    for yaml_file in golden_dir.glob("*.yaml"):
        print(f"\n📋 Loading {yaml_file.name}...")
        with open(yaml_file) as f:
            data = yaml.safe_load(f)

        tests = data.get("tests", []) + data.get("edge_cases", [])
        for test in tests:
            test_id = test.get("id", "UNKNOWN")
            print(f" Testing {test_id}...", end=" ", flush=True)

            result = await judge.evaluate_test_case(
                test_id=test_id,
                test_name=test.get("name", ""),
                user_input=test.get("input", ""),
                expected_intent=test.get("expected_intent", "unknown"),
                detected_intent=test.get("expected_intent", "unknown"),  # Mock for now
                response="Verstanden.",
                min_score=test.get("min_score", 3.5),
            )
            results.append(result)

            if result.passed:
                print(f"{result.composite_score:.2f}")
            else:
                print(f"{result.composite_score:.2f} ({result.reasoning[:50]})")

    return results
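
# For reference, a golden test YAML file consumed by run_golden_suite() above is
# assumed to look roughly like the sketch below. The key names (tests, edge_cases,
# id, name, input, expected_intent, min_score) are inferred from the loader; the
# concrete IDs and values are purely illustrative.
#
# tests:
#   - id: GOLD-OBS-001
#     name: "Simple student observation"
#     input: "Notiere: Lena hat heute sehr konzentriert gearbeitet."
#     expected_intent: student_observation
#     min_score: 3.5
# edge_cases:
#   - id: GOLD-EDGE-001
#     name: "Ambiguous reminder"
#     input: "Erinnere mich an morgen."
#     expected_intent: reminder
#     min_score: 3.0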


async def run_synthetic_tests(
    config: BQASConfig,
    judge: LLMJudge,
    generator: SyntheticGenerator,
) -> list:
    """Run synthetic tests."""
    results = []
    print("\n🔄 Generating synthetic tests...")

    intents = ["student_observation", "worksheet_generate", "reminder"]
    for intent in intents:
        print(f"\n Intent: {intent}")
        variations = generator._generate_fallback(intent, count=5)

        for i, var in enumerate(variations):
            test_id = f"SYN-{intent[:4].upper()}-{i+1:03d}"
            print(f" {test_id}...", end=" ", flush=True)

            result = await judge.evaluate_test_case(
                test_id=test_id,
                test_name=f"Synthetic {intent}",
                user_input=var.input,
                expected_intent=var.expected_intent,
                detected_intent=var.expected_intent,
                response="Verstanden.",
                min_score=3.0,
            )
            results.append(result)

            if result.passed:
                print(f"{result.composite_score:.2f}")
            else:
                print(f"{result.composite_score:.2f}")

    return results
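
# Note: generator._generate_fallback() is assumed to return objects exposing at
# least .input and .expected_intent, since those are the only attributes
# run_synthetic_tests() reads above; the concrete class lives in
# bqas.synthetic_generator and is not shown here.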


def generate_report(
    golden_metrics: BQASMetrics,
    synthetic_metrics: BQASMetrics,
    output_path: Path,
):
    """Generate HTML report."""
    html = f"""<!DOCTYPE html>
<html>
<head>
  <title>BQAS Report - {datetime.now().strftime('%Y-%m-%d %H:%M')}</title>
  <style>
    body {{ font-family: sans-serif; margin: 20px; }}
    h1 {{ color: #333; }}
    .summary {{ display: flex; gap: 20px; margin-bottom: 20px; }}
    .card {{ background: #f5f5f5; padding: 20px; border-radius: 8px; }}
    .passed {{ color: #22c55e; }}
    .failed {{ color: #ef4444; }}
    table {{ border-collapse: collapse; width: 100%; }}
    th, td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
    th {{ background: #f0f0f0; }}
  </style>
</head>
<body>
  <h1>BQAS Test Report</h1>
  <div class="summary">
    <div class="card">
      <h3>Golden Suite</h3>
      <p>Total: {golden_metrics.total_tests}</p>
      <p class="passed">Passed: {golden_metrics.passed_tests}</p>
      <p class="failed">Failed: {golden_metrics.failed_tests}</p>
      <p>Avg Score: {golden_metrics.avg_composite_score:.3f}</p>
    </div>
    <div class="card">
      <h3>Synthetic Tests</h3>
      <p>Total: {synthetic_metrics.total_tests}</p>
      <p class="passed">Passed: {synthetic_metrics.passed_tests}</p>
      <p class="failed">Failed: {synthetic_metrics.failed_tests}</p>
      <p>Avg Score: {synthetic_metrics.avg_composite_score:.3f}</p>
    </div>
  </div>
  <h2>Scores by Intent</h2>
  <table>
    <tr><th>Intent</th><th>Score</th></tr>
    {''.join(f"<tr><td>{k}</td><td>{v:.3f}</td></tr>" for k, v in golden_metrics.scores_by_intent.items())}
  </table>
  <h2>Failed Tests</h2>
  <ul>
    {''.join(f"<li>{tid}</li>" for tid in golden_metrics.failed_test_ids[:20])}
  </ul>
  <footer>
    <p>Generated: {datetime.now().isoformat()}</p>
  </footer>
</body>
</html>"""
    output_path.write_text(html)
    print(f"\n📊 Report saved to: {output_path}")


async def main():
    parser = argparse.ArgumentParser(description="BQAS Test Runner")
    parser.add_argument("--all", action="store_true", help="Run all tests")
    parser.add_argument("--golden", action="store_true", help="Run golden suite only")
    parser.add_argument("--synthetic", action="store_true", help="Run synthetic tests only")
    parser.add_argument("--check-regression", action="store_true", help="Check for regression")
    parser.add_argument("--threshold", type=float, default=0.1, help="Regression threshold")
    parser.add_argument("--create-issues", action="store_true", help="Create GitHub issues for failures")
    parser.add_argument("--report", action="store_true", help="Generate HTML report")
    parser.add_argument("--output", type=str, default="bqas_report.html", help="Report output path")
    args = parser.parse_args()

    # Default to --all if no specific test type selected
    if not (args.golden or args.synthetic or args.check_regression):
        args.all = True

    print("=" * 60)
    print("BQAS - Breakpilot Quality Assurance System")
    print("=" * 60)

    config = BQASConfig.from_env()
    judge = LLMJudge(config=config)
    tracker = RegressionTracker(config=config)
    generator = SyntheticGenerator(config=config)
    backlog = BacklogGenerator(config=config)

    # Check if judge is available
    print("\n🔍 Checking LLM availability...")
    is_available = await judge.health_check()
    if not is_available:
        print("❌ LLM Judge not available. Make sure Ollama is running with the model.")
        print(f" Expected model: {config.judge_model}")
        print(f" Ollama URL: {config.ollama_base_url}")
        sys.exit(1)
    print("✅ LLM Judge available")

    golden_results = []
    synthetic_results = []

    # Run tests
    if args.all or args.golden:
        print("\n" + "=" * 60)
        print("Running Golden Suite")
        print("=" * 60)
        golden_results = await run_golden_suite(config, judge)

    if args.all or args.synthetic:
        print("\n" + "=" * 60)
        print("Running Synthetic Tests")
        print("=" * 60)
        synthetic_results = await run_synthetic_tests(config, judge, generator)

    # Calculate metrics
    golden_metrics = BQASMetrics.from_results(golden_results)
    synthetic_metrics = BQASMetrics.from_results(synthetic_results)

    # Print summary
    print("\n" + golden_metrics.summary())

    # Record run
    if golden_results:
        run = tracker.record_run(golden_metrics, synthetic_metrics.avg_composite_score)
        print(f"\n📝 Run recorded: #{run.id}")

    # Check regression
    if args.check_regression:
        print("\n🔍 Checking for regression...")
        is_regression, delta, msg = tracker.check_regression(
            golden_metrics.avg_composite_score,
            args.threshold,
        )
        print(f" {msg}")

        if is_regression and args.create_issues:
            print("\n📮 Creating regression alert...")
            runs = tracker.get_last_runs(1)
            if runs:
                url = await backlog.create_regression_alert(
                    golden_metrics.avg_composite_score,
                    golden_metrics.avg_composite_score + delta,
                    delta,
                    runs[0],
                )
                if url:
                    print(f" Issue created: {url}")

    # Create issues for failures
    if args.create_issues and golden_metrics.failed_tests > 0:
        print("\n📮 Creating issue for test failures...")
        failed = [r for r in golden_results if not r.passed]
        runs = tracker.get_last_runs(1)
        if runs:
            url = await backlog.create_issue(
                runs[0],
                golden_metrics,
                failed,
            )
            if url:
                print(f" Issue created: {url}")

    # Generate report
    if args.report:
        generate_report(
            golden_metrics,
            synthetic_metrics,
            Path(args.output),
        )

    # Cleanup
    await judge.close()
    await generator.close()

    # Exit with error code if tests failed
    if golden_metrics.failed_tests > 0 or synthetic_metrics.failed_tests > 0:
        sys.exit(1)


if __name__ == "__main__":
    asyncio.run(main())