feat: voice-service von lehrer nach core verschoben, Pipeline erweitert (voice, BQAS, embedding, night-scheduler)

This commit is contained in:
Benjamin Boenisch
2026-02-15 13:26:06 +01:00
parent a7e4500ea6
commit 1089c73b46
59 changed files with 12921 additions and 20 deletions

View File

@@ -0,0 +1,340 @@
"""
Regression Tracker
Tracks test scores over time to detect quality regressions
"""
import sqlite3
import json
import subprocess
import structlog
from datetime import datetime, timedelta
from typing import List, Optional, Tuple, Dict, Any
from dataclasses import dataclass, asdict
from pathlib import Path
from bqas.config import BQASConfig
from bqas.metrics import BQASMetrics
logger = structlog.get_logger(__name__)
@dataclass
class TestRun:
    """Record of a single BQAS test run, as persisted in SQLite.

    Mutable defaults (``failures``, ``metadata``) and the timestamp use a
    ``None`` sentinel and are materialized in ``__post_init__`` so every
    instance owns fresh objects (a shared mutable default would leak state
    across instances).
    """

    # Database row id; None until the run has been persisted.
    id: Optional[int] = None
    # When the run happened; filled with "now" (naive UTC) if not given.
    timestamp: Optional[datetime] = None
    git_commit: str = ""
    git_branch: str = ""
    # Average composite score over the golden test set.
    golden_score: float = 0.0
    # Optional score from synthetic tests (0.0 when not run).
    synthetic_score: float = 0.0
    total_tests: int = 0
    passed_tests: int = 0
    failed_tests: int = 0
    # Ids of the failed tests; [] after __post_init__.
    failures: Optional[List[str]] = None
    duration_seconds: float = 0.0
    # Free-form extras (e.g. per-intent scores); {} after __post_init__.
    metadata: Optional[Dict[str, Any]] = None

    def __post_init__(self) -> None:
        # Replace None sentinels with per-instance values.
        if self.timestamp is None:
            # NOTE(review): naive UTC on purpose — timestamps are stored and
            # compared as ISO strings elsewhere in this module, so they must
            # all share the same (naive) format for lexicographic ordering.
            self.timestamp = datetime.utcnow()
        if self.failures is None:
            self.failures = []
        if self.metadata is None:
            self.metadata = {}
class RegressionTracker:
    """
    Tracks BQAS test scores over time.
    Features:
    - SQLite persistence
    - Regression detection
    - Trend analysis
    - Alerting
    """

    def __init__(self, config: Optional[BQASConfig] = None):
        """Create a tracker backed by the SQLite file from *config*.

        Args:
            config: Optional configuration; falls back to environment config.
        """
        self.config = config or BQASConfig.from_env()
        self.db_path = Path(self.config.db_path)
        # Robustness: sqlite3.connect cannot create intermediate directories.
        self.db_path.parent.mkdir(parents=True, exist_ok=True)
        self._init_db()

    def _init_db(self) -> None:
        """Create the test_runs table and its timestamp index if missing."""
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS test_runs (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    timestamp TEXT NOT NULL,
                    git_commit TEXT,
                    git_branch TEXT,
                    golden_score REAL,
                    synthetic_score REAL,
                    total_tests INTEGER,
                    passed_tests INTEGER,
                    failed_tests INTEGER,
                    failures TEXT,
                    duration_seconds REAL,
                    metadata TEXT
                )
            """)
            cursor.execute("""
                CREATE INDEX IF NOT EXISTS idx_timestamp
                ON test_runs(timestamp)
            """)
            conn.commit()
        finally:
            # Always release the connection, even if DDL fails.
            conn.close()

    def _get_git_info(self) -> Tuple[str, str]:
        """Return (short commit hash, branch name), or ("unknown", "unknown").

        Best-effort: any failure (no git binary, not a repo, detached
        worktree) is swallowed on purpose so recording a run never fails
        just because git metadata is unavailable.
        """
        try:
            commit = subprocess.check_output(
                ["git", "rev-parse", "HEAD"],
                stderr=subprocess.DEVNULL,
            ).decode().strip()[:8]
            branch = subprocess.check_output(
                ["git", "rev-parse", "--abbrev-ref", "HEAD"],
                stderr=subprocess.DEVNULL,
            ).decode().strip()
            return commit, branch
        except Exception:
            return "unknown", "unknown"

    def record_run(self, metrics: BQASMetrics, synthetic_score: float = 0.0) -> TestRun:
        """
        Record a test run.
        Args:
            metrics: Aggregated metrics from the test run
            synthetic_score: Optional synthetic test score
        Returns:
            Recorded TestRun (with its database id filled in)
        """
        git_commit, git_branch = self._get_git_info()
        run = TestRun(
            timestamp=metrics.timestamp,
            git_commit=git_commit,
            git_branch=git_branch,
            golden_score=metrics.avg_composite_score,
            synthetic_score=synthetic_score,
            total_tests=metrics.total_tests,
            passed_tests=metrics.passed_tests,
            failed_tests=metrics.failed_tests,
            failures=metrics.failed_test_ids,
            duration_seconds=metrics.total_duration_ms / 1000,
            metadata={"scores_by_intent": metrics.scores_by_intent},
        )
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()
            cursor.execute("""
                INSERT INTO test_runs (
                    timestamp, git_commit, git_branch, golden_score,
                    synthetic_score, total_tests, passed_tests, failed_tests,
                    failures, duration_seconds, metadata
                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """, (
                run.timestamp.isoformat(),
                run.git_commit,
                run.git_branch,
                run.golden_score,
                run.synthetic_score,
                run.total_tests,
                run.passed_tests,
                run.failed_tests,
                # Lists/dicts are stored as JSON text columns.
                json.dumps(run.failures),
                run.duration_seconds,
                json.dumps(run.metadata),
            ))
            run.id = cursor.lastrowid
            conn.commit()
        finally:
            conn.close()
        logger.info(
            "Test run recorded",
            run_id=run.id,
            score=run.golden_score,
            passed=run.passed_tests,
            failed=run.failed_tests,
        )
        return run

    @staticmethod
    def _row_to_run(row: Tuple) -> TestRun:
        """Deserialize one test_runs row (SELECT column order) into a TestRun."""
        return TestRun(
            id=row[0],
            timestamp=datetime.fromisoformat(row[1]),
            git_commit=row[2],
            git_branch=row[3],
            golden_score=row[4],
            synthetic_score=row[5],
            total_tests=row[6],
            passed_tests=row[7],
            failed_tests=row[8],
            failures=json.loads(row[9]) if row[9] else [],
            duration_seconds=row[10],
            metadata=json.loads(row[11]) if row[11] else {},
        )

    def get_last_runs(self, n: int = 5) -> List[TestRun]:
        """Get the last N test runs, newest first."""
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()
            cursor.execute("""
                SELECT id, timestamp, git_commit, git_branch, golden_score,
                       synthetic_score, total_tests, passed_tests, failed_tests,
                       failures, duration_seconds, metadata
                FROM test_runs
                ORDER BY timestamp DESC
                LIMIT ?
            """, (n,))
            return [self._row_to_run(row) for row in cursor.fetchall()]
        finally:
            conn.close()

    def get_runs_since(self, days: int = 30) -> List[TestRun]:
        """Get all runs in the last N days, oldest first."""
        # ISO-8601 strings of naive UTC timestamps compare lexicographically,
        # so a plain string >= works here.
        since = datetime.utcnow() - timedelta(days=days)
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()
            cursor.execute("""
                SELECT id, timestamp, git_commit, git_branch, golden_score,
                       synthetic_score, total_tests, passed_tests, failed_tests,
                       failures, duration_seconds, metadata
                FROM test_runs
                WHERE timestamp >= ?
                ORDER BY timestamp ASC
            """, (since.isoformat(),))
            return [self._row_to_run(row) for row in cursor.fetchall()]
        finally:
            conn.close()

    def check_regression(
        self,
        current_score: float,
        threshold: Optional[float] = None,
    ) -> Tuple[bool, float, str]:
        """
        Check if current score indicates a regression.
        Args:
            current_score: Current test run score
            threshold: Optional threshold override
        Returns:
            (is_regression, delta, message)
        """
        # Use `is None`, not `or`: an explicit threshold of 0.0 must be
        # honored rather than silently replaced by the configured default.
        if threshold is None:
            threshold = self.config.regression_threshold
        last_runs = self.get_last_runs(n=5)
        if len(last_runs) < 2:
            return False, 0.0, "Not enough historical data"
        # Compare against the average of the recent runs to smooth out noise.
        avg_score = sum(r.golden_score for r in last_runs) / len(last_runs)
        delta = avg_score - current_score
        if delta > threshold:
            msg = f"Regression detected: score dropped from {avg_score:.3f} to {current_score:.3f} (delta: {delta:.3f})"
            logger.warning(msg)
            return True, delta, msg
        return False, delta, f"Score stable: {current_score:.3f} (avg: {avg_score:.3f}, delta: {delta:.3f})"

    def get_trend(self, days: int = 30) -> Dict[str, Any]:
        """
        Get score trend for the last N days.
        Returns:
            Dictionary with dates, scores, and trend direction
        """
        runs = self.get_runs_since(days)
        if not runs:
            return {
                "dates": [],
                "scores": [],
                "trend": "unknown",
                "avg_score": 0.0,
            }
        dates = [r.timestamp.isoformat() for r in runs]
        scores = [r.golden_score for r in runs]
        avg_score = sum(scores) / len(scores)
        # Determine trend by comparing the first three vs. last three scores.
        # NOTE(review): with 3-5 runs the two windows overlap, which biases
        # toward "stable" — acceptable for a coarse signal.
        if len(scores) >= 3:
            recent = scores[-3:]
            older = scores[:3]
            recent_avg = sum(recent) / len(recent)
            older_avg = sum(older) / len(older)
            if recent_avg > older_avg + 0.05:
                trend = "improving"
            elif recent_avg < older_avg - 0.05:
                trend = "declining"
            else:
                trend = "stable"
        else:
            trend = "insufficient_data"
        return {
            "dates": dates,
            "scores": scores,
            "trend": trend,
            "avg_score": round(avg_score, 3),
            "min_score": round(min(scores), 3),
            "max_score": round(max(scores), 3),
        }

    def get_failing_intents(self, n: int = 5) -> Dict[str, float]:
        """Get intents with lowest average scores over the last N runs.

        Returns:
            Mapping of intent -> average score, sorted worst first.
        """
        runs = self.get_last_runs(n)
        intent_scores: Dict[str, List[float]] = {}
        for run in runs:
            if "scores_by_intent" in run.metadata:
                for intent, score in run.metadata["scores_by_intent"].items():
                    intent_scores.setdefault(intent, []).append(score)
        # Average per intent, then sort ascending so the worst come first.
        avg_scores = {
            intent: sum(scores) / len(scores)
            for intent, scores in intent_scores.items()
        }
        return dict(sorted(avg_scores.items(), key=lambda x: x[1]))