feat: voice-service von lehrer nach core verschoben, Pipeline erweitert (voice, BQAS, embedding, night-scheduler)
This commit is contained in:
340
voice-service/bqas/regression_tracker.py
Normal file
340
voice-service/bqas/regression_tracker.py
Normal file
@@ -0,0 +1,340 @@
|
||||
"""
|
||||
Regression Tracker
|
||||
Tracks test scores over time to detect quality regressions
|
||||
"""
|
||||
import sqlite3
|
||||
import json
|
||||
import subprocess
|
||||
import structlog
|
||||
from datetime import datetime, timedelta
|
||||
from typing import List, Optional, Tuple, Dict, Any
|
||||
from dataclasses import dataclass, asdict
|
||||
from pathlib import Path
|
||||
|
||||
from bqas.config import BQASConfig
|
||||
from bqas.metrics import BQASMetrics
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
|
||||
|
||||
@dataclass
class TestRun:
    """Record of a single test run.

    All fields default to "empty" values so a run can be constructed
    incrementally. Mutable defaults use ``field(default_factory=...)``
    to avoid the shared-mutable-default pitfall; ``__post_init__``
    additionally normalizes an explicitly passed ``None`` (as older
    callers may do) to a concrete value.
    """

    id: Optional[int] = None  # DB row id; set after INSERT
    timestamp: Optional[datetime] = None  # run time (UTC); filled in __post_init__
    git_commit: str = ""  # short commit hash, "unknown" outside a checkout
    git_branch: str = ""
    golden_score: float = 0.0
    synthetic_score: float = 0.0
    total_tests: int = 0
    passed_tests: int = 0
    failed_tests: int = 0
    failures: List[str] = field(default_factory=list)  # ids of failed tests
    duration_seconds: float = 0.0
    metadata: Dict[str, Any] = field(default_factory=dict)  # e.g. scores_by_intent

    def __post_init__(self):
        # Normalize None (including explicitly passed None) so downstream
        # code never has to None-check these fields.
        if self.timestamp is None:
            self.timestamp = datetime.utcnow()
        if self.failures is None:
            self.failures = []
        if self.metadata is None:
            self.metadata = {}
|
||||
|
||||
|
||||
class RegressionTracker:
    """
    Tracks BQAS test scores over time.

    Features:
    - SQLite persistence (single ``test_runs`` table)
    - Regression detection against the recent-run average
    - Trend analysis over a time window
    - Alerting (via structlog warnings)
    """

    def __init__(self, config: Optional[BQASConfig] = None):
        """Create a tracker backed by the SQLite DB named in *config*.

        Args:
            config: BQAS configuration; defaults to environment-derived config.
        """
        self.config = config or BQASConfig.from_env()
        self.db_path = Path(self.config.db_path)
        self._init_db()

    def _init_db(self):
        """Initialize SQLite database (idempotent: CREATE IF NOT EXISTS)."""
        conn = sqlite3.connect(self.db_path)
        try:
            conn.execute("""
                CREATE TABLE IF NOT EXISTS test_runs (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    timestamp TEXT NOT NULL,
                    git_commit TEXT,
                    git_branch TEXT,
                    golden_score REAL,
                    synthetic_score REAL,
                    total_tests INTEGER,
                    passed_tests INTEGER,
                    failed_tests INTEGER,
                    failures TEXT,
                    duration_seconds REAL,
                    metadata TEXT
                )
            """)
            conn.execute("""
                CREATE INDEX IF NOT EXISTS idx_timestamp
                ON test_runs(timestamp)
            """)
            conn.commit()
        finally:
            # Always release the connection, even if DDL fails.
            conn.close()

    def _get_git_info(self) -> Tuple[str, str]:
        """Get current git commit (short hash) and branch.

        Returns:
            (commit, branch); ``("unknown", "unknown")`` when git is
            unavailable or the cwd is not a checkout.
        """
        try:
            commit = subprocess.check_output(
                ["git", "rev-parse", "HEAD"],
                stderr=subprocess.DEVNULL,
            ).decode().strip()[:8]

            branch = subprocess.check_output(
                ["git", "rev-parse", "--abbrev-ref", "HEAD"],
                stderr=subprocess.DEVNULL,
            ).decode().strip()

            return commit, branch
        except Exception:
            # Best-effort metadata: never fail a run over missing git info.
            return "unknown", "unknown"

    @staticmethod
    def _row_to_run(row) -> TestRun:
        """Convert a SELECT row (fixed column order, see queries) to a TestRun."""
        return TestRun(
            id=row[0],
            timestamp=datetime.fromisoformat(row[1]),
            git_commit=row[2],
            git_branch=row[3],
            golden_score=row[4],
            synthetic_score=row[5],
            total_tests=row[6],
            passed_tests=row[7],
            failed_tests=row[8],
            failures=json.loads(row[9]) if row[9] else [],
            duration_seconds=row[10],
            metadata=json.loads(row[11]) if row[11] else {},
        )

    def record_run(self, metrics: BQASMetrics, synthetic_score: float = 0.0) -> TestRun:
        """
        Record a test run.

        Args:
            metrics: Aggregated metrics from the test run
            synthetic_score: Optional synthetic test score

        Returns:
            Recorded TestRun (with its DB id filled in)
        """
        git_commit, git_branch = self._get_git_info()

        run = TestRun(
            timestamp=metrics.timestamp,
            git_commit=git_commit,
            git_branch=git_branch,
            golden_score=metrics.avg_composite_score,
            synthetic_score=synthetic_score,
            total_tests=metrics.total_tests,
            passed_tests=metrics.passed_tests,
            failed_tests=metrics.failed_tests,
            failures=metrics.failed_test_ids,
            duration_seconds=metrics.total_duration_ms / 1000,
            metadata={"scores_by_intent": metrics.scores_by_intent},
        )

        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.execute("""
                INSERT INTO test_runs (
                    timestamp, git_commit, git_branch, golden_score,
                    synthetic_score, total_tests, passed_tests, failed_tests,
                    failures, duration_seconds, metadata
                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """, (
                run.timestamp.isoformat(),
                run.git_commit,
                run.git_branch,
                run.golden_score,
                run.synthetic_score,
                run.total_tests,
                run.passed_tests,
                run.failed_tests,
                json.dumps(run.failures),
                run.duration_seconds,
                json.dumps(run.metadata),
            ))
            run.id = cursor.lastrowid
            conn.commit()
        finally:
            conn.close()

        logger.info(
            "Test run recorded",
            run_id=run.id,
            score=run.golden_score,
            passed=run.passed_tests,
            failed=run.failed_tests,
        )

        return run

    def get_last_runs(self, n: int = 5) -> List[TestRun]:
        """Get the last N test runs (most recent first)."""
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.execute("""
                SELECT id, timestamp, git_commit, git_branch, golden_score,
                       synthetic_score, total_tests, passed_tests, failed_tests,
                       failures, duration_seconds, metadata
                FROM test_runs
                ORDER BY timestamp DESC
                LIMIT ?
            """, (n,))
            return [self._row_to_run(row) for row in cursor.fetchall()]
        finally:
            conn.close()

    def get_runs_since(self, days: int = 30) -> List[TestRun]:
        """Get all runs in the last N days (oldest first)."""
        since = datetime.utcnow() - timedelta(days=days)

        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.execute("""
                SELECT id, timestamp, git_commit, git_branch, golden_score,
                       synthetic_score, total_tests, passed_tests, failed_tests,
                       failures, duration_seconds, metadata
                FROM test_runs
                WHERE timestamp >= ?
                ORDER BY timestamp ASC
            """, (since.isoformat(),))
            return [self._row_to_run(row) for row in cursor.fetchall()]
        finally:
            conn.close()

    def check_regression(
        self,
        current_score: float,
        threshold: Optional[float] = None,
    ) -> Tuple[bool, float, str]:
        """
        Check if current score indicates a regression.

        Compares *current_score* against the average of the last 5
        recorded runs; a positive delta above *threshold* is a regression.

        Args:
            current_score: Current test run score
            threshold: Optional threshold override

        Returns:
            (is_regression, delta, message)
        """
        threshold = threshold or self.config.regression_threshold
        last_runs = self.get_last_runs(n=5)

        if len(last_runs) < 2:
            return False, 0.0, "Not enough historical data"

        # Calculate average of last runs
        avg_score = sum(r.golden_score for r in last_runs) / len(last_runs)
        delta = avg_score - current_score

        if delta > threshold:
            msg = f"Regression detected: score dropped from {avg_score:.3f} to {current_score:.3f} (delta: {delta:.3f})"
            logger.warning(msg)
            return True, delta, msg

        return False, delta, f"Score stable: {current_score:.3f} (avg: {avg_score:.3f}, delta: {delta:.3f})"

    def get_trend(self, days: int = 30) -> Dict[str, Any]:
        """
        Get score trend for the last N days.

        Returns:
            Dictionary with dates, scores, trend direction, and
            avg/min/max score (min/max omitted when there are no runs).
        """
        runs = self.get_runs_since(days)

        if not runs:
            return {
                "dates": [],
                "scores": [],
                "trend": "unknown",
                "avg_score": 0.0,
            }

        dates = [r.timestamp.isoformat() for r in runs]
        scores = [r.golden_score for r in runs]
        avg_score = sum(scores) / len(scores)

        # Determine trend by comparing the first 3 vs the last 3 scores.
        # NOTE(review): with 3-5 runs the windows overlap — presumably
        # acceptable for a coarse signal; confirm intent.
        if len(scores) >= 3:
            recent_avg = sum(scores[-3:]) / len(scores[-3:])
            older_avg = sum(scores[:3]) / len(scores[:3])

            if recent_avg > older_avg + 0.05:
                trend = "improving"
            elif recent_avg < older_avg - 0.05:
                trend = "declining"
            else:
                trend = "stable"
        else:
            trend = "insufficient_data"

        return {
            "dates": dates,
            "scores": scores,
            "trend": trend,
            "avg_score": round(avg_score, 3),
            "min_score": round(min(scores), 3),
            "max_score": round(max(scores), 3),
        }

    def get_failing_intents(self, n: int = 5) -> Dict[str, float]:
        """Get intents with lowest average scores over the last *n* runs.

        Returns:
            Mapping of intent -> average score, sorted worst to best.
        """
        runs = self.get_last_runs(n)

        # Collect every per-intent score seen across the recent runs.
        intent_scores: Dict[str, List[float]] = {}
        for run in runs:
            if "scores_by_intent" in run.metadata:
                for intent, score in run.metadata["scores_by_intent"].items():
                    intent_scores.setdefault(intent, []).append(score)

        # Calculate averages and sort
        avg_scores = {
            intent: sum(scores) / len(scores)
            for intent, scores in intent_scores.items()
        }

        # Return sorted from worst to best
        return dict(sorted(avg_scores.items(), key=lambda x: x[1]))
|
||||
Reference in New Issue
Block a user