refactor: voice-service entfernt (verschoben nach breakpilot-core)
This commit is contained in:
@@ -1,77 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
    <!--
        BQAS Local Scheduler: launchd plist

        Runs the BQAS tests every day at 07:00.

        Install:
            cp com.breakpilot.bqas.plist ~/Library/LaunchAgents/
            launchctl load ~/Library/LaunchAgents/com.breakpilot.bqas.plist

        Uninstall:
            launchctl unload ~/Library/LaunchAgents/com.breakpilot.bqas.plist
            rm ~/Library/LaunchAgents/com.breakpilot.bqas.plist

        Manual run:
            launchctl start com.breakpilot.bqas

        Check status:
            launchctl list | grep bqas
    -->

    <key>Label</key>
    <string>com.breakpilot.bqas</string>

    <key>ProgramArguments</key>
    <array>
        <string>/Users/benjaminadmin/Projekte/breakpilot-pwa/voice-service/scripts/run_bqas.sh</string>
    </array>

    <!-- Daily at 07:00 -->
    <key>StartCalendarInterval</key>
    <dict>
        <key>Hour</key>
        <integer>7</integer>
        <key>Minute</key>
        <integer>0</integer>
    </dict>

    <!-- Log output; the directory must already exist and be writable -->
    <key>StandardOutPath</key>
    <string>/var/log/bqas/stdout.log</string>

    <key>StandardErrorPath</key>
    <string>/var/log/bqas/stderr.log</string>

    <!-- Do not run when the job is loaded (for example at login) -->
    <key>RunAtLoad</key>
    <false/>

    <!-- Environment variables -->
    <key>EnvironmentVariables</key>
    <dict>
        <key>PATH</key>
        <string>/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin</string>
        <key>HOME</key>
        <string>/Users/benjaminadmin</string>
        <!-- Optional: override the service URL -->
        <!-- <key>BQAS_SERVICE_URL</key>
        <string>http://localhost:8091</string> -->
    </dict>

    <!-- Working directory -->
    <key>WorkingDirectory</key>
    <string>/Users/benjaminadmin/Projekte/breakpilot-pwa/voice-service</string>

    <!-- Resource limits (optional) -->
    <key>ProcessType</key>
    <string>Background</string>

    <!--
        Intended as a 30 minute timeout.
        NOTE(review): launchd.plist(5) describes TimeOut as an idle time-out
        hint rather than an execution limit; confirm it has the intended effect.
    -->
    <key>TimeOut</key>
    <integer>1800</integer>
</dict>
</plist>
|
||||
@@ -1,318 +0,0 @@
|
||||
#!/bin/bash
# BQAS scheduler installation script.
# Installs a launchd job that runs the BQAS tests daily at 07:00 and a Git
# post-commit hook.

set -e

# Configuration
VOICE_SERVICE_DIR="/Users/benjaminadmin/Projekte/breakpilot-pwa/voice-service"
PLIST_NAME="com.breakpilot.bqas"
PLIST_PATH="${HOME}/Library/LaunchAgents/${PLIST_NAME}.plist"
LOG_DIR="/var/log/bqas"
GIT_HOOKS_DIR="/Users/benjaminadmin/Projekte/breakpilot-pwa/.git/hooks"

# ANSI colour escapes (expanded later by `echo -e`)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
BLUE='\033[0;34m'
NC='\033[0m'
|
||||
|
||||
# Print a colour-tagged message to stdout.
# Globals:   RED, GREEN, YELLOW, BLUE, NC (read)
# Arguments: $1 - level (INFO, SUCCESS, WARNING or ERROR)
#            $2 - message text
# Outputs:   one line on stdout; any other level prints nothing
log() {
    local level=$1
    local message=$2

    case "$level" in
        INFO)    echo -e "${BLUE}[INFO]${NC} ${message}" ;;
        SUCCESS) echo -e "${GREEN}[SUCCESS]${NC} ${message}" ;;
        WARNING) echo -e "${YELLOW}[WARNING]${NC} ${message}" ;;
        ERROR)   echo -e "${RED}[ERROR]${NC} ${message}" ;;
    esac
}
|
||||
|
||||
# Arguments: the first positional parameter selects the action (default: install)
ACTION=${1:-install}

# Print the command overview to stdout.
show_usage() {
    echo "Usage: $0 [install|uninstall|status|test]"
    echo ""
    echo "Commands:"
    echo " install Installiert launchd Job und Git Hook"
    echo " uninstall Entfernt launchd Job und Git Hook"
    echo " status Zeigt aktuellen Status"
    echo " test Fuehrt BQAS Tests manuell aus"
}
|
||||
|
||||
# Create the log directory with sudo and hand ownership to the current user.
# Globals: LOG_DIR (read), USER (read)
# Does nothing except log when the directory already exists.
create_log_directory() {
    log "INFO" "Erstelle Log-Verzeichnis..."

    if [ -d "$LOG_DIR" ]; then
        log "INFO" "Log-Verzeichnis existiert bereits"
        return 0
    fi

    # USER is not guaranteed to be set in every environment; fall back to
    # the effective user name so chown never receives an empty owner.
    local owner=${USER:-$(id -un)}

    sudo mkdir -p "$LOG_DIR"
    sudo chown "$owner" "$LOG_DIR"
    log "SUCCESS" "Log-Verzeichnis erstellt: $LOG_DIR"
}
|
||||
|
||||
# Write the launchd plist for the daily 07:00 BQAS run to PLIST_PATH.
# Globals: PLIST_PATH, PLIST_NAME, VOICE_SERVICE_DIR, LOG_DIR, HOME (read)
create_plist() {
    log "INFO" "Erstelle launchd plist..."

    # The target directory may be missing, which would make the redirect fail.
    mkdir -p "$(dirname "$PLIST_PATH")"

    # Unquoted delimiter: the ${...} references below are expanded now.
    cat > "$PLIST_PATH" << EOF
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
    <key>Label</key>
    <string>${PLIST_NAME}</string>

    <key>ProgramArguments</key>
    <array>
        <string>${VOICE_SERVICE_DIR}/scripts/run_bqas.sh</string>
    </array>

    <key>StartCalendarInterval</key>
    <dict>
        <key>Hour</key>
        <integer>7</integer>
        <key>Minute</key>
        <integer>0</integer>
    </dict>

    <key>StandardOutPath</key>
    <string>${LOG_DIR}/stdout.log</string>

    <key>StandardErrorPath</key>
    <string>${LOG_DIR}/stderr.log</string>

    <key>RunAtLoad</key>
    <false/>

    <key>EnvironmentVariables</key>
    <dict>
        <key>PATH</key>
        <string>/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin</string>
        <key>HOME</key>
        <string>${HOME}</string>
    </dict>

    <key>WorkingDirectory</key>
    <string>${VOICE_SERVICE_DIR}</string>
</dict>
</plist>
EOF

    log "SUCCESS" "plist erstellt: $PLIST_PATH"
}
|
||||
|
||||
# (Re)load the launchd job described by PLIST_PATH.
# Globals: PLIST_PATH (read)
load_plist() {
    log "INFO" "Lade launchd Job..."

    # Unload first in case the job is already loaded; failure here is expected
    # on a fresh install and is deliberately ignored.
    launchctl unload "$PLIST_PATH" 2>/dev/null || true

    launchctl load "$PLIST_PATH"
    log "SUCCESS" "launchd Job geladen"
}
|
||||
|
||||
# Unload the launchd job and delete its plist file, if the file exists.
# Globals: PLIST_PATH (read)
unload_plist() {
    log "INFO" "Entlade launchd Job..."

    if [ ! -f "$PLIST_PATH" ]; then
        log "INFO" "Kein launchd Job gefunden"
        return 0
    fi

    # The job may not be loaded at all; that is not an error here.
    launchctl unload "$PLIST_PATH" 2>/dev/null || true
    rm -f "$PLIST_PATH"
    log "SUCCESS" "launchd Job entfernt"
}
|
||||
|
||||
# Install the BQAS post-commit hook into GIT_HOOKS_DIR.
# Globals: GIT_HOOKS_DIR (read)
# Returns: 1 when the hooks directory does not exist, 0 otherwise.
create_git_hook() {
    log "INFO" "Erstelle Git post-commit Hook..."

    if [ ! -d "$GIT_HOOKS_DIR" ]; then
        log "WARNING" "Git hooks Verzeichnis nicht gefunden: $GIT_HOOKS_DIR"
        return 1
    fi

    local hook_path="${GIT_HOOKS_DIR}/post-commit"

    # Back up only a foreign hook. On a repeated install the existing file is
    # our own BQAS hook; copying it would overwrite the backup of the user's
    # original hook, which uninstall later restores.
    if [ -f "$hook_path" ] && ! grep -q "BQAS" "$hook_path" 2>/dev/null; then
        cp "$hook_path" "${hook_path}.backup"
        log "INFO" "Bestehender Hook gesichert"
    fi

    # Quoted delimiter: the hook text is written literally, nothing expands.
    # The hook compares HEAD~1 with HEAD explicitly (a bare `git diff HEAD~1`
    # would also report uncommitted working-tree changes) and falls back to
    # `git diff-tree --root` for the very first commit.
    cat > "$hook_path" << 'EOF'
#!/bin/bash
# BQAS Post-Commit Hook
# Fuehrt schnelle Tests aus wenn voice-service geaendert wurde

# Nur ausfuehren wenn voice-service geaendert wurde
if { git diff --name-only HEAD~1 HEAD 2>/dev/null || git diff-tree --no-commit-id --name-only -r --root HEAD 2>/dev/null; } | grep -q "^voice-service/"; then
    echo ""
    echo "voice-service geaendert - starte BQAS Quick Check..."
    echo ""

    # Async ausfuehren (im Hintergrund)
    VOICE_SERVICE_DIR="/Users/benjaminadmin/Projekte/breakpilot-pwa/voice-service"

    if [ -f "${VOICE_SERVICE_DIR}/scripts/run_bqas.sh" ]; then
        nohup "${VOICE_SERVICE_DIR}/scripts/run_bqas.sh" --quick > /dev/null 2>&1 &
        echo "BQAS Quick Check gestartet (PID: $!)"
        echo "Logs: /var/log/bqas/bqas.log"
    fi
fi
EOF

    chmod +x "$hook_path"
    log "SUCCESS" "Git Hook erstellt: $hook_path"
}
|
||||
|
||||
# Remove the BQAS post-commit hook and restore a backed-up previous hook.
# Globals: GIT_HOOKS_DIR (read)
# A hook that does not contain the marker "BQAS" is left untouched.
remove_git_hook() {
    log "INFO" "Entferne Git post-commit Hook..."

    local hook_path="${GIT_HOOKS_DIR}/post-commit"

    if [ ! -f "$hook_path" ]; then
        log "INFO" "Kein Git Hook gefunden"
        return 0
    fi

    # Only delete the hook if it is ours.
    if ! grep -q "BQAS" "$hook_path" 2>/dev/null; then
        log "WARNING" "Hook gehoert nicht zu BQAS, uebersprungen"
        return 0
    fi

    rm -f "$hook_path"

    # Put the previously saved hook back, if there is one.
    if [ -f "${hook_path}.backup" ]; then
        mv "${hook_path}.backup" "$hook_path"
        log "INFO" "Vorheriger Hook wiederhergestellt"
    fi

    log "SUCCESS" "Git Hook entfernt"
}
|
||||
|
||||
# Print a status report: launchd job, plist file, Git hook and log directory.
# Globals: PLIST_NAME, PLIST_PATH, GIT_HOOKS_DIR, LOG_DIR, RED, GREEN, NC (read)
show_status() {
    local hook_path="${GIT_HOOKS_DIR}/post-commit"

    echo ""
    echo "=========================================="
    echo "BQAS Scheduler Status"
    echo "=========================================="
    echo ""

    # launchd job. -F matches the label literally; without it the dots in
    # the label would act as regex wildcards.
    echo "launchd Job:"
    if launchctl list | grep -qF -- "$PLIST_NAME"; then
        echo -e " ${GREEN}✓${NC} Geladen"
        launchctl list "$PLIST_NAME" 2>/dev/null || true
    else
        echo -e " ${RED}✗${NC} Nicht geladen"
    fi
    echo ""

    # plist file
    echo "plist Datei:"
    if [ -f "$PLIST_PATH" ]; then
        echo -e " ${GREEN}✓${NC} Vorhanden: $PLIST_PATH"
    else
        echo -e " ${RED}✗${NC} Nicht vorhanden"
    fi
    echo ""

    # Git hook (counts as installed only when it carries the BQAS marker)
    echo "Git Hook:"
    if [ -f "$hook_path" ] && grep -q "BQAS" "$hook_path" 2>/dev/null; then
        echo -e " ${GREEN}✓${NC} Installiert: $hook_path"
    else
        echo -e " ${RED}✗${NC} Nicht installiert"
    fi
    echo ""

    # Log directory, plus the most recent log line when a log file exists
    echo "Log-Verzeichnis:"
    if [ -d "$LOG_DIR" ]; then
        echo -e " ${GREEN}✓${NC} Vorhanden: $LOG_DIR"
        if [ -f "${LOG_DIR}/bqas.log" ]; then
            echo " Letzter Eintrag:"
            tail -n 1 "${LOG_DIR}/bqas.log" 2>/dev/null || echo " (leer)"
        fi
    else
        echo -e " ${RED}✗${NC} Nicht vorhanden"
    fi
    echo ""

    # Schedule
    echo "Zeitplan: Taeglich um 07:00 Uhr"
    echo ""
}
|
||||
|
||||
# Run the complete installation: log directory, plist, launchd job, Git hook.
# Globals: LOG_DIR (read)
do_install() {
    log "INFO" "=========================================="
    log "INFO" "BQAS Scheduler Installation"
    log "INFO" "=========================================="

    create_log_directory
    create_plist
    load_plist

    # create_git_hook returns 1 when the hooks directory is missing and only
    # logs a warning for it. Under `set -e` a bare call would abort the whole
    # install at that point, so the failure is handled explicitly.
    create_git_hook || log "WARNING" "Git Hook wurde nicht installiert"

    echo ""
    log "SUCCESS" "Installation abgeschlossen!"
    echo ""
    echo "Naechste Schritte:"
    echo " 1. Manueller Test: $0 test"
    echo " 2. Status pruefen: $0 status"
    echo " 3. Logs anschauen: tail -f ${LOG_DIR}/bqas.log"
    echo ""
}
|
||||
|
||||
# Remove the launchd job and the Git hook; the log directory is kept.
# Globals: LOG_DIR (read)
do_uninstall() {
    log "INFO" "=========================================="
    log "INFO" "BQAS Scheduler Deinstallation"
    log "INFO" "=========================================="

    unload_plist
    remove_git_hook

    echo ""
    log "SUCCESS" "Deinstallation abgeschlossen!"
    echo ""
    # Logs are left in place on purpose; tell the user how to remove them.
    echo "Log-Verzeichnis wurde nicht entfernt: $LOG_DIR"
    echo "Zum Entfernen: sudo rm -rf $LOG_DIR"
    echo ""
}
|
||||
|
||||
# Run the BQAS runner script once, in the foreground.
# Globals: VOICE_SERVICE_DIR (read)
# Exits the script with status 1 when run_bqas.sh does not exist.
do_test() {
    local runner="${VOICE_SERVICE_DIR}/scripts/run_bqas.sh"

    log "INFO" "Starte BQAS Tests manuell..."
    echo ""

    if [ ! -f "$runner" ]; then
        log "ERROR" "run_bqas.sh nicht gefunden!"
        exit 1
    fi

    "$runner"
}
|
||||
|
||||
# Main logic: dispatch on the requested action; anything else prints the
# usage text and exits with status 1.
case "$ACTION" in
    install)
        do_install
        ;;
    uninstall)
        do_uninstall
        ;;
    status)
        show_status
        ;;
    test)
        do_test
        ;;
    *)
        show_usage
        exit 1
        ;;
esac
|
||||
@@ -1,53 +0,0 @@
|
||||
#!/bin/bash
# BQAS post-commit hook
# =====================
#
# Runs the BQAS quick tests automatically when a commit changes files under
# voice-service/.
#
# Installation:
#   cp post-commit.hook /path/to/.git/hooks/post-commit
#   chmod +x /path/to/.git/hooks/post-commit
#
# Or use the installation script:
#   ./scripts/install_bqas_scheduler.sh install

# Configuration
VOICE_SERVICE_DIR="/Users/benjaminadmin/Projekte/breakpilot-pwa/voice-service"
RUN_ASYNC=true  # run in the background (recommended)

# Colours
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
NC='\033[0m'

# Files changed by the commit that was just created. The two commits are
# compared explicitly: `git diff HEAD~1` on its own diffs against the working
# tree and would also list uncommitted edits. For the first commit HEAD~1 does
# not exist, so fall back to `git diff-tree --root`.
changed_files=$(git diff --name-only HEAD~1 HEAD 2>/dev/null \
    || git diff-tree --no-commit-id --name-only -r --root HEAD 2>/dev/null \
    || true)

if echo "$changed_files" | grep -q "^voice-service/"; then
    echo ""
    echo -e "${YELLOW}[BQAS]${NC} voice-service geaendert - starte Quick Check..."

    # Path of the runner script
    BQAS_SCRIPT="${VOICE_SERVICE_DIR}/scripts/run_bqas.sh"

    if [ -f "$BQAS_SCRIPT" ]; then
        if [ "$RUN_ASYNC" = true ]; then
            # Asynchronous: detach the runner and discard its console output
            nohup "$BQAS_SCRIPT" --quick > /dev/null 2>&1 &
            pid=$!
            echo -e "${GREEN}[BQAS]${NC} Quick Check gestartet (PID: $pid)"
            echo " Logs: /var/log/bqas/bqas.log"
        else
            # Synchronous: the hook waits until the quick check has finished
            "$BQAS_SCRIPT" --quick
        fi
    else
        echo -e "${YELLOW}[BQAS]${NC} run_bqas.sh nicht gefunden, uebersprungen"
    fi

    echo ""
fi

# Always report success, whatever the quick check did
exit 0
|
||||
@@ -1,286 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
BQAS Runner Script
|
||||
Run BQAS tests and generate reports
|
||||
"""
|
||||
import asyncio
|
||||
import argparse
|
||||
import sys
|
||||
import json
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
|
||||
# Add parent to path
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
from bqas.judge import LLMJudge
|
||||
from bqas.config import BQASConfig
|
||||
from bqas.regression_tracker import RegressionTracker
|
||||
from bqas.synthetic_generator import SyntheticGenerator
|
||||
from bqas.backlog_generator import BacklogGenerator
|
||||
from bqas.metrics import BQASMetrics, TestResult
|
||||
|
||||
|
||||
async def run_golden_suite(config: BQASConfig, judge: LLMJudge) -> list:
    """Run every test case from the golden-suite YAML files.

    The YAML files are read from ``tests/bqas/golden_tests`` relative to the
    parent of this script's directory. Each file may provide a ``tests`` and an
    ``edge_cases`` list; both are evaluated.

    Args:
        config: BQAS configuration (not used by this function).
        judge: Judge whose ``evaluate_test_case`` coroutine scores each case.

    Returns:
        The list of results returned by ``judge.evaluate_test_case``.
    """
    import yaml

    results = []
    golden_dir = Path(__file__).parent.parent / "tests" / "bqas" / "golden_tests"

    # Sorted so that the run order does not depend on directory listing order.
    for yaml_file in sorted(golden_dir.glob("*.yaml")):
        print(f"\n📋 Loading {yaml_file.name}...")

        with open(yaml_file, encoding="utf-8") as f:
            # An empty file parses to None; treat it as a file without tests.
            data = yaml.safe_load(f) or {}

        # A key that is present but empty also yields None, hence `or []`.
        tests = (data.get("tests") or []) + (data.get("edge_cases") or [])

        for test in tests:
            test_id = test.get("id", "UNKNOWN")
            print(f" Testing {test_id}...", end=" ", flush=True)

            result = await judge.evaluate_test_case(
                test_id=test_id,
                test_name=test.get("name", ""),
                user_input=test.get("input", ""),
                expected_intent=test.get("expected_intent", "unknown"),
                # Mock for now: the expected intent is passed as the detected one.
                detected_intent=test.get("expected_intent", "unknown"),
                response="Verstanden.",
                min_score=test.get("min_score", 3.5),
            )

            results.append(result)

            if result.passed:
                print(f"✅ {result.composite_score:.2f}")
            else:
                print(f"❌ {result.composite_score:.2f} ({result.reasoning[:50]})")

    return results
|
||||
|
||||
|
||||
async def run_synthetic_tests(
    config: BQASConfig,
    judge: LLMJudge,
    generator: SyntheticGenerator,
) -> list:
    """Generate and evaluate synthetic test cases for a fixed set of intents.

    For each intent five variations are requested from
    ``generator._generate_fallback`` and scored with a minimum score of 3.0.

    Args:
        config: BQAS configuration (not used by this function).
        judge: Judge whose ``evaluate_test_case`` coroutine scores each case.
        generator: Source of the synthetic variations.

    Returns:
        The list of results returned by ``judge.evaluate_test_case``.
    """
    results = []

    print("\n🔄 Generating synthetic tests...")

    intents = ["student_observation", "worksheet_generate", "reminder"]

    for intent in intents:
        print(f"\n Intent: {intent}")
        variations = generator._generate_fallback(intent, count=5)

        for i, var in enumerate(variations):
            # Id format: SYN-<first four letters of the intent>-<1-based index>
            test_id = f"SYN-{intent[:4].upper()}-{i+1:03d}"
            print(f" {test_id}...", end=" ", flush=True)

            result = await judge.evaluate_test_case(
                test_id=test_id,
                test_name=f"Synthetic {intent}",
                user_input=var.input,
                expected_intent=var.expected_intent,
                # The expected intent doubles as the detected intent here.
                detected_intent=var.expected_intent,
                response="Verstanden.",
                min_score=3.0,
            )

            results.append(result)

            if result.passed:
                print(f"✅ {result.composite_score:.2f}")
            else:
                print(f"❌ {result.composite_score:.2f}")

    return results
|
||||
|
||||
|
||||
def generate_report(
    golden_metrics: BQASMetrics,
    synthetic_metrics: BQASMetrics,
    output_path: Path,
):
    """Write an HTML summary of a BQAS run to ``output_path``.

    The report shows totals for the golden and synthetic suites, the golden
    scores by intent, and up to 20 failed golden test ids.

    Args:
        golden_metrics: Metrics of the golden suite.
        synthetic_metrics: Metrics of the synthetic tests.
        output_path: File the HTML document is written to (UTF-8).
    """
    from html import escape

    now = datetime.now()

    # Intent names and test ids come from test data, so escape them before
    # embedding them in markup.
    intent_rows = "".join(
        f"<tr><td>{escape(str(k))}</td><td>{v:.3f}</td></tr>"
        for k, v in golden_metrics.scores_by_intent.items()
    )
    failed_items = "".join(
        f"<li>{escape(str(tid))}</li>" for tid in golden_metrics.failed_test_ids[:20]
    )

    document = f"""<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>BQAS Report - {now.strftime('%Y-%m-%d %H:%M')}</title>
<style>
body {{ font-family: sans-serif; margin: 20px; }}
h1 {{ color: #333; }}
.summary {{ display: flex; gap: 20px; margin-bottom: 20px; }}
.card {{ background: #f5f5f5; padding: 20px; border-radius: 8px; }}
.passed {{ color: #22c55e; }}
.failed {{ color: #ef4444; }}
table {{ border-collapse: collapse; width: 100%; }}
th, td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
th {{ background: #f0f0f0; }}
</style>
</head>
<body>
<h1>BQAS Test Report</h1>

<div class="summary">
<div class="card">
<h3>Golden Suite</h3>
<p>Total: {golden_metrics.total_tests}</p>
<p class="passed">Passed: {golden_metrics.passed_tests}</p>
<p class="failed">Failed: {golden_metrics.failed_tests}</p>
<p>Avg Score: {golden_metrics.avg_composite_score:.3f}</p>
</div>

<div class="card">
<h3>Synthetic Tests</h3>
<p>Total: {synthetic_metrics.total_tests}</p>
<p class="passed">Passed: {synthetic_metrics.passed_tests}</p>
<p class="failed">Failed: {synthetic_metrics.failed_tests}</p>
<p>Avg Score: {synthetic_metrics.avg_composite_score:.3f}</p>
</div>
</div>

<h2>Scores by Intent</h2>
<table>
<tr><th>Intent</th><th>Score</th></tr>
{intent_rows}
</table>

<h2>Failed Tests</h2>
<ul>
{failed_items}
</ul>

<footer>
<p>Generated: {now.isoformat()}</p>
</footer>
</body>
</html>"""

    # Explicit encoding so the result does not depend on the platform default
    # and matches the declared charset.
    output_path.write_text(document, encoding="utf-8")
    print(f"\n📊 Report saved to: {output_path}")
|
||||
|
||||
|
||||
async def main():
    """Command-line entry point: run the selected suites and report on them.

    Parses the command line, checks that the LLM judge is reachable, runs the
    golden and/or synthetic suites, records the run, and optionally checks for
    regression, creates issues and writes an HTML report. Exits with status 1
    when the judge is unavailable or any test failed.
    """
    parser = argparse.ArgumentParser(description="BQAS Test Runner")
    parser.add_argument("--all", action="store_true", help="Run all tests")
    parser.add_argument("--golden", action="store_true", help="Run golden suite only")
    parser.add_argument("--synthetic", action="store_true", help="Run synthetic tests only")
    parser.add_argument("--check-regression", action="store_true", help="Check for regression")
    parser.add_argument("--threshold", type=float, default=0.1, help="Regression threshold")
    parser.add_argument("--create-issues", action="store_true", help="Create GitHub issues for failures")
    parser.add_argument("--report", action="store_true", help="Generate HTML report")
    parser.add_argument("--output", type=str, default="bqas_report.html", help="Report output path")

    args = parser.parse_args()

    # Default to --all if no specific test type selected
    if not (args.golden or args.synthetic or args.check_regression):
        args.all = True

    print("=" * 60)
    print("BQAS - Breakpilot Quality Assurance System")
    print("=" * 60)

    config = BQASConfig.from_env()
    judge = LLMJudge(config=config)
    tracker = RegressionTracker(config=config)
    generator = SyntheticGenerator(config=config)
    backlog = BacklogGenerator(config=config)

    # try/finally so judge and generator are closed on every exit path,
    # including the early sys.exit below and any exception raised mid-run.
    try:
        # Check if judge is available
        print("\n🔍 Checking LLM availability...")
        is_available = await judge.health_check()
        if not is_available:
            print("❌ LLM Judge not available. Make sure Ollama is running with the model.")
            print(f" Expected model: {config.judge_model}")
            print(f" Ollama URL: {config.ollama_base_url}")
            sys.exit(1)
        print("✅ LLM Judge available")

        golden_results = []
        synthetic_results = []

        # Run tests
        if args.all or args.golden:
            print("\n" + "=" * 60)
            print("Running Golden Suite")
            print("=" * 60)
            golden_results = await run_golden_suite(config, judge)

        if args.all or args.synthetic:
            print("\n" + "=" * 60)
            print("Running Synthetic Tests")
            print("=" * 60)
            synthetic_results = await run_synthetic_tests(config, judge, generator)

        # Calculate metrics
        golden_metrics = BQASMetrics.from_results(golden_results)
        synthetic_metrics = BQASMetrics.from_results(synthetic_results)

        # Print summary
        print("\n" + golden_metrics.summary())

        # Record run (only when the golden suite produced results)
        if golden_results:
            run = tracker.record_run(golden_metrics, synthetic_metrics.avg_composite_score)
            print(f"\n📝 Run recorded: #{run.id}")

        # Check regression
        if args.check_regression:
            print("\n🔍 Checking for regression...")
            is_regression, delta, msg = tracker.check_regression(
                golden_metrics.avg_composite_score,
                args.threshold,
            )
            print(f" {msg}")

            if is_regression and args.create_issues:
                print("\n📮 Creating regression alert...")
                runs = tracker.get_last_runs(1)
                if runs:
                    # NOTE(review): the second argument is current score + delta;
                    # confirm the sign convention of `delta` against
                    # RegressionTracker.check_regression.
                    url = await backlog.create_regression_alert(
                        golden_metrics.avg_composite_score,
                        golden_metrics.avg_composite_score + delta,
                        delta,
                        runs[0],
                    )
                    if url:
                        print(f" Issue created: {url}")

        # Create issues for failures
        if args.create_issues and golden_metrics.failed_tests > 0:
            print("\n📮 Creating issue for test failures...")
            failed = [r for r in golden_results if not r.passed]
            runs = tracker.get_last_runs(1)
            if runs:
                url = await backlog.create_issue(
                    runs[0],
                    golden_metrics,
                    failed,
                )
                if url:
                    print(f" Issue created: {url}")

        # Generate report
        if args.report:
            generate_report(
                golden_metrics,
                synthetic_metrics,
                Path(args.output),
            )
    finally:
        # Cleanup; the nested finally closes the generator even if closing the
        # judge raises.
        try:
            await judge.close()
        finally:
            await generator.close()

    # Exit with error code if tests failed
    if golden_metrics.failed_tests > 0 or synthetic_metrics.failed_tests > 0:
        sys.exit(1)


if __name__ == "__main__":
    asyncio.run(main())
|
||||
@@ -1,270 +0,0 @@
|
||||
#!/bin/bash
# BQAS local runner: a local alternative to GitHub Actions.
# Runs the BQAS tests and sends notifications about the outcome.

set -e

# Configuration
VOICE_SERVICE_DIR="/Users/benjaminadmin/Projekte/breakpilot-pwa/voice-service"
VOICE_SERVICE_URL="${BQAS_SERVICE_URL:-http://localhost:8091}"
LOG_DIR="/var/log/bqas"
LOG_FILE="${LOG_DIR}/bqas.log"
REGRESSION_THRESHOLD="${BQAS_REGRESSION_THRESHOLD:-0.1}"

# Colours for console output (expanded by `echo -e`)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Option defaults; changed by the command-line flags
QUICK_MODE=false
GOLDEN_ONLY=false
RAG_ONLY=false
SILENT=false
|
||||
|
||||
# Print the option and environment-variable overview to stdout.
usage() {
    echo "Usage: $0 [OPTIONS]"
    echo ""
    echo "Options:"
    echo " --quick Nur schnelle Golden Tests (fuer Git Hooks)"
    echo " --golden Nur Golden Suite"
    echo " --rag Nur RAG Suite"
    echo " --silent Keine Desktop-Benachrichtigungen"
    echo " --help Diese Hilfe anzeigen"
    echo ""
    echo "Umgebungsvariablen:"
    echo " BQAS_SERVICE_URL Voice Service URL (default: http://localhost:8091)"
    echo " BQAS_REGRESSION_THRESHOLD Regression Schwelle (default: 0.1)"
}
|
||||
|
||||
# Parse the command-line flags into the option globals.
# Globals:   QUICK_MODE, GOLDEN_ONLY, RAG_ONLY, SILENT (written)
# Arguments: the script's command-line arguments
# Exits 0 after --help and 1 on an unknown option (both print the usage text).
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --quick)
                QUICK_MODE=true
                ;;
            --golden)
                GOLDEN_ONLY=true
                ;;
            --rag)
                RAG_ONLY=true
                ;;
            --silent)
                SILENT=true
                ;;
            --help)
                usage
                exit 0
                ;;
            *)
                echo "Unbekannte Option: $1"
                usage
                exit 1
                ;;
        esac
        shift
    done
}

parse_args "$@"
|
||||
|
||||
# Logging helper: append a timestamped line to the log file and print a
# colour-tagged line on the console.
# Globals:   LOG_DIR, LOG_FILE, RED, GREEN, YELLOW, BLUE, NC (read)
# Arguments: $1 - level (INFO, SUCCESS, WARNING or ERROR)
#            $2 - message text
log() {
    local level=$1
    local message=$2
    local timestamp
    # Declared separately so `local` does not mask the exit status of date.
    timestamp=$(date '+%Y-%m-%d %H:%M:%S')

    # The file is written only when the log directory already exists; this
    # function does not create it. Writing is best effort: an unwritable log
    # file must not abort the whole run under `set -e`.
    if [ -d "$LOG_DIR" ]; then
        { echo "${timestamp} [${level}] ${message}" >> "$LOG_FILE"; } 2>/dev/null || true
    fi

    # Console output; unknown levels are written to the file only
    case "$level" in
        INFO)
            echo -e "${BLUE}[INFO]${NC} ${message}"
            ;;
        SUCCESS)
            echo -e "${GREEN}[SUCCESS]${NC} ${message}"
            ;;
        WARNING)
            echo -e "${YELLOW}[WARNING]${NC} ${message}"
            ;;
        ERROR)
            echo -e "${RED}[ERROR]${NC} ${message}"
            ;;
    esac
}
|
||||
|
||||
# Send a macOS desktop notification through osascript.
# Globals:   SILENT (read) - when "true" nothing is sent
# Arguments: $1 - title
#            $2 - message
#            $3 - "true" to mark the notification as an error (adds the
#                 "Basso" sound); default false
# Failures of osascript are ignored on purpose (best effort).
notify() {
    local title=$1
    local message=$2
    local is_error=${3:-false}

    if [ "$SILENT" = true ]; then
        return
    fi

    # Both values are embedded in AppleScript string literals, so escape
    # backslashes first and then double quotes; otherwise a quote in the text
    # would end the literal and break the script.
    title=${title//\\/\\\\}
    title=${title//\"/\\\"}
    message=${message//\\/\\\\}
    message=${message//\"/\\\"}

    local script="display notification \"${message}\" with title \"${title}\""
    if [ "$is_error" = true ]; then
        script="${script} sound name \"Basso\""
    fi

    osascript -e "$script" 2>/dev/null || true
}
|
||||
|
||||
# Call the Python notifier, if it exists in the voice-service checkout.
# Globals:   VOICE_SERVICE_DIR (read)
# Arguments: $1 - status, $2 - message, $3 - details
# Errors from the notifier are ignored on purpose (best effort).
notify_python() {
    local status=$1
    local message=$2
    local details=$3
    local notifier="${VOICE_SERVICE_DIR}/bqas/notifier.py"

    if [ ! -f "$notifier" ]; then
        return 0
    fi

    python3 "$notifier" \
        --status "$status" \
        --message "$message" \
        --details "$details" 2>/dev/null || true
}
|
||||
|
||||
# Check whether the voice service answers on its /health endpoint.
# Globals: VOICE_SERVICE_URL (read)
# Returns: 0 when the endpoint answers with HTTP 200, 1 otherwise.
check_service() {
    log "INFO" "Pruefe Voice Service Verfuegbarkeit..."

    local health_url="${VOICE_SERVICE_URL}/health"
    local response

    # Time limits keep a hanging service from blocking the run indefinitely.
    # On any curl failure the status code is reported as 000.
    response=$(curl -s -o /dev/null -w "%{http_code}" \
        --connect-timeout 5 --max-time 10 \
        "$health_url" 2>/dev/null) || response="000"

    if [ "$response" = "200" ]; then
        log "SUCCESS" "Voice Service erreichbar"
        return 0
    fi

    log "WARNING" "Voice Service nicht erreichbar (HTTP $response)"
    return 1
}
|
||||
|
||||
# Ask the voice service whether the score regressed.
# Globals: VOICE_SERVICE_URL, REGRESSION_THRESHOLD (read)
# Returns: 0 when no regression is reported, 1 when a regression is reported
#          or the request failed.
check_regression() {
    log "INFO" "Pruefe auf Score-Regression..."

    local regression_url="${VOICE_SERVICE_URL}/api/v1/bqas/regression-check?threshold=${REGRESSION_THRESHOLD}"
    local response

    # -f makes curl fail on HTTP error responses; without it an error page
    # would be parsed below and reported as "no regression".
    response=$(curl -sf "$regression_url" 2>/dev/null) || {
        log "WARNING" "Regression-Check fehlgeschlagen"
        return 1
    }

    # Python prints the JSON boolean as True/False; a body that cannot be
    # parsed counts as "no regression".
    local is_regression
    is_regression=$(echo "$response" | python3 -c "import sys,json; print(json.load(sys.stdin).get('is_regression', False))" 2>/dev/null) || is_regression="False"

    if [ "$is_regression" != "True" ]; then
        log "SUCCESS" "Keine Regression erkannt"
        return 0
    fi

    local delta
    delta=$(echo "$response" | python3 -c "import sys,json; print(json.load(sys.stdin).get('delta', 0))" 2>/dev/null) || delta="unknown"
    log "ERROR" "Regression erkannt! Score-Abfall: ${delta}"
    return 1
}
|
||||
|
||||
# Run one pytest target and log the outcome.
# Globals:   VOICE_SERVICE_DIR, LOG_FILE (read)
# Arguments: $1 - label used in log messages
#            $2 - pytest path, relative to VOICE_SERVICE_DIR
#            $3... - optional extra pytest arguments (for example -k "not slow")
# Returns:   0 when pytest (and the log pipeline) succeeded, 1 otherwise.
run_tests() {
    local test_type=$1
    local test_path=$2
    local -a extra_args=("${@:3}")

    log "INFO" "Starte ${test_type} Tests..."

    # errexit is inactive when the caller uses `run_tests ... || ...`, so a
    # failing cd has to be handled explicitly.
    cd "$VOICE_SERVICE_DIR" || {
        log "ERROR" "Verzeichnis nicht gefunden: ${VOICE_SERVICE_DIR}"
        return 1
    }

    # Activate the virtualenv if there is one
    if [ -f "venv/bin/activate" ]; then
        source venv/bin/activate
    fi

    # pipefail (scoped to the subshell) makes the pipeline report pytest's
    # exit status; without it only tee's status would be tested and failing
    # tests would be logged as passed.
    if (
        set -o pipefail
        python3 -m pytest "$test_path" "${extra_args[@]}" -v --tb=short 2>&1 \
            | tee -a "$LOG_FILE"
    ); then
        log "SUCCESS" "${test_type} Tests bestanden"
        return 0
    fi

    log "ERROR" "${test_type} Tests fehlgeschlagen"
    return 1
}

# Main logic: run the selected suites, summarise and notify.
# Globals: QUICK_MODE, RAG_ONLY, GOLDEN_ONLY (read)
# Returns: 0 when everything passed, 1 otherwise.
main() {
    local start_time
    start_time=$(date +%s)
    local golden_exit=0
    local rag_exit=0
    local regression_exit=0
    local service_available=false

    log "INFO" "=========================================="
    log "INFO" "BQAS Local Runner gestartet"
    log "INFO" "=========================================="

    # Service check (optional; the tests can also run offline)
    if check_service; then
        service_available=true
    fi

    if [ "$QUICK_MODE" = true ]; then
        # Quick mode: fast golden tests only. The -k filter is passed as
        # separate arguments; embedding it in the path string would hand
        # pytest a single, non-existent path.
        log "INFO" "Quick Mode - nur schnelle Golden Tests"
        run_tests "Golden (Quick)" "tests/bqas/test_golden.py" -k "not slow" || golden_exit=1
    else
        # Full run
        if [ "$RAG_ONLY" = false ]; then
            run_tests "Golden" "tests/bqas/test_golden.py" || golden_exit=1
        fi

        if [ "$GOLDEN_ONLY" = false ]; then
            run_tests "RAG" "tests/bqas/test_rag.py" || rag_exit=1
        fi

        # Regression check only when the service is reachable
        if [ "$service_available" = true ]; then
            check_regression || regression_exit=1
        fi
    fi

    # Summary
    local end_time
    end_time=$(date +%s)
    local duration=$((end_time - start_time))

    log "INFO" "=========================================="
    log "INFO" "BQAS Run abgeschlossen (${duration}s)"
    log "INFO" "=========================================="

    # Determine the overall result
    local total_failures=$((golden_exit + rag_exit + regression_exit))

    if [ "$total_failures" -eq 0 ]; then
        log "SUCCESS" "Alle Tests bestanden!"
        notify "BQAS" "Alle Tests bestanden" false
        notify_python "success" "Alle Tests bestanden" "Dauer: ${duration}s"
        return 0
    fi

    local failure_details=""
    if [ "$golden_exit" -ne 0 ]; then
        failure_details="${failure_details}Golden Tests fehlgeschlagen. "
    fi
    if [ "$rag_exit" -ne 0 ]; then
        failure_details="${failure_details}RAG Tests fehlgeschlagen. "
    fi
    if [ "$regression_exit" -ne 0 ]; then
        failure_details="${failure_details}Regression erkannt. "
    fi

    log "ERROR" "Tests fehlgeschlagen: ${failure_details}"
    notify "BQAS Alert" "$failure_details" true
    notify_python "failure" "Tests fehlgeschlagen" "$failure_details"
    return 1
}
|
||||
|
||||
# Script ausfuehren
|
||||
main
|
||||
Reference in New Issue
Block a user