This repository has been archived on 2026-02-15. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
breakpilot-pwa/voice-service/bqas/regression_tracker.py
Benjamin Admin 21a844cb8a fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.

This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).

Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-09 09:51:32 +01:00

341 lines
10 KiB
Python

"""
Regression Tracker
Tracks test scores over time to detect quality regressions
"""
import json
import sqlite3
import subprocess
from dataclasses import asdict, dataclass, field
from datetime import datetime, timedelta
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import structlog

from bqas.config import BQASConfig
from bqas.metrics import BQASMetrics
logger = structlog.get_logger(__name__)
@dataclass
class TestRun:
    """Record of a single test run persisted in the regression database."""

    # Database row id; None until the run has been inserted by record_run().
    id: Optional[int] = None
    # Timestamp of the run; defaults to "now" (UTC) when omitted or None.
    timestamp: Optional[datetime] = None
    # Short (8-char) git commit hash and branch name, or "unknown".
    git_commit: str = ""
    git_branch: str = ""
    # Average composite score over the golden test set.
    golden_score: float = 0.0
    # Optional score from synthetic tests.
    synthetic_score: float = 0.0
    total_tests: int = 0
    passed_tests: int = 0
    failed_tests: int = 0
    # IDs of failed tests. default_factory gives each instance a fresh list
    # (the original used a None sentinel to avoid the shared-mutable-default trap).
    failures: List[str] = field(default_factory=list)
    duration_seconds: float = 0.0
    # Extra run data, e.g. {"scores_by_intent": {...}}; fresh dict per instance.
    metadata: Dict[str, Any] = field(default_factory=dict)

    def __post_init__(self) -> None:
        # Callers may still pass explicit None for these fields; normalize so
        # downstream code can rely on non-None values.
        if self.timestamp is None:
            self.timestamp = datetime.utcnow()
        if self.failures is None:
            self.failures = []
        if self.metadata is None:
            self.metadata = {}
class RegressionTracker:
    """
    Tracks BQAS test scores over time.

    Features:
    - SQLite persistence
    - Regression detection
    - Trend analysis
    - Alerting
    """

    # SELECT column list shared by all queries; order must match the
    # positional access in _row_to_run().
    _COLUMNS = (
        "id, timestamp, git_commit, git_branch, golden_score, "
        "synthetic_score, total_tests, passed_tests, failed_tests, "
        "failures, duration_seconds, metadata"
    )

    def __init__(self, config: Optional["BQASConfig"] = None):
        """
        Args:
            config: BQAS configuration; read from the environment when omitted.
        """
        self.config = config or BQASConfig.from_env()
        self.db_path = Path(self.config.db_path)
        # Robustness: sqlite3.connect() cannot create missing parent
        # directories, so ensure they exist before touching the database.
        self.db_path.parent.mkdir(parents=True, exist_ok=True)
        self._init_db()

    def _init_db(self) -> None:
        """Create the test_runs table and its timestamp index if missing."""
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS test_runs (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    timestamp TEXT NOT NULL,
                    git_commit TEXT,
                    git_branch TEXT,
                    golden_score REAL,
                    synthetic_score REAL,
                    total_tests INTEGER,
                    passed_tests INTEGER,
                    failed_tests INTEGER,
                    failures TEXT,
                    duration_seconds REAL,
                    metadata TEXT
                )
            """)
            cursor.execute("""
                CREATE INDEX IF NOT EXISTS idx_timestamp
                ON test_runs(timestamp)
            """)
            conn.commit()
        finally:
            # Fix: the original leaked the connection if the DDL raised.
            conn.close()

    def _get_git_info(self) -> Tuple[str, str]:
        """Return (short commit hash, branch name) of the current checkout.

        Returns ("unknown", "unknown") when git is unavailable or the
        working directory is not a repository.
        """
        try:
            commit = subprocess.check_output(
                ["git", "rev-parse", "HEAD"],
                stderr=subprocess.DEVNULL,
            ).decode().strip()[:8]
            branch = subprocess.check_output(
                ["git", "rev-parse", "--abbrev-ref", "HEAD"],
                stderr=subprocess.DEVNULL,
            ).decode().strip()
            return commit, branch
        except Exception:
            # Best-effort metadata only -- never let git problems break a run.
            return "unknown", "unknown"

    @staticmethod
    def _row_to_run(row) -> "TestRun":
        """Build a TestRun from a row selected with _COLUMNS ordering.

        Extracted helper: the original duplicated this mapping verbatim in
        get_last_runs() and get_runs_since().
        """
        return TestRun(
            id=row[0],
            timestamp=datetime.fromisoformat(row[1]),
            git_commit=row[2],
            git_branch=row[3],
            golden_score=row[4],
            synthetic_score=row[5],
            total_tests=row[6],
            passed_tests=row[7],
            failed_tests=row[8],
            failures=json.loads(row[9]) if row[9] else [],
            duration_seconds=row[10],
            metadata=json.loads(row[11]) if row[11] else {},
        )

    def record_run(self, metrics: "BQASMetrics", synthetic_score: float = 0.0) -> "TestRun":
        """
        Record a test run.

        Args:
            metrics: Aggregated metrics from the test run
            synthetic_score: Optional synthetic test score

        Returns:
            Recorded TestRun (with its database id filled in)
        """
        git_commit, git_branch = self._get_git_info()
        run = TestRun(
            timestamp=metrics.timestamp,
            git_commit=git_commit,
            git_branch=git_branch,
            golden_score=metrics.avg_composite_score,
            synthetic_score=synthetic_score,
            total_tests=metrics.total_tests,
            passed_tests=metrics.passed_tests,
            failed_tests=metrics.failed_tests,
            failures=metrics.failed_test_ids,
            duration_seconds=metrics.total_duration_ms / 1000,
            metadata={"scores_by_intent": metrics.scores_by_intent},
        )
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()
            cursor.execute("""
                INSERT INTO test_runs (
                    timestamp, git_commit, git_branch, golden_score,
                    synthetic_score, total_tests, passed_tests, failed_tests,
                    failures, duration_seconds, metadata
                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """, (
                run.timestamp.isoformat(),
                run.git_commit,
                run.git_branch,
                run.golden_score,
                run.synthetic_score,
                run.total_tests,
                run.passed_tests,
                run.failed_tests,
                json.dumps(run.failures),
                run.duration_seconds,
                json.dumps(run.metadata),
            ))
            run.id = cursor.lastrowid
            conn.commit()
        finally:
            # Close even on insert failure (original leaked the handle).
            conn.close()
        logger.info(
            "Test run recorded",
            run_id=run.id,
            score=run.golden_score,
            passed=run.passed_tests,
            failed=run.failed_tests,
        )
        return run

    def get_last_runs(self, n: int = 5) -> List["TestRun"]:
        """Get the last N test runs, newest first."""
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()
            # _COLUMNS is a class constant, not user input -- safe to inline.
            cursor.execute(
                f"""
                SELECT {self._COLUMNS}
                FROM test_runs
                ORDER BY timestamp DESC
                LIMIT ?
                """,
                (n,),
            )
            return [self._row_to_run(row) for row in cursor.fetchall()]
        finally:
            conn.close()

    def get_runs_since(self, days: int = 30) -> List["TestRun"]:
        """Get all runs in the last N days, oldest first."""
        since = datetime.utcnow() - timedelta(days=days)
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()
            # ISO-8601 strings sort lexicographically in timestamp order,
            # so string comparison against the stored TEXT column is correct.
            cursor.execute(
                f"""
                SELECT {self._COLUMNS}
                FROM test_runs
                WHERE timestamp >= ?
                ORDER BY timestamp ASC
                """,
                (since.isoformat(),),
            )
            return [self._row_to_run(row) for row in cursor.fetchall()]
        finally:
            conn.close()

    def check_regression(
        self,
        current_score: float,
        threshold: Optional[float] = None,
    ) -> Tuple[bool, float, str]:
        """
        Check if current score indicates a regression.

        Args:
            current_score: Current test run score
            threshold: Optional threshold override

        Returns:
            (is_regression, delta, message)
        """
        # Fix: the original used `threshold or self.config...`, which
        # silently discarded an explicit threshold=0.0 override.
        if threshold is None:
            threshold = self.config.regression_threshold
        last_runs = self.get_last_runs(n=5)
        if len(last_runs) < 2:
            return False, 0.0, "Not enough historical data"
        # Compare the current score against the average of recent runs.
        avg_score = sum(r.golden_score for r in last_runs) / len(last_runs)
        delta = avg_score - current_score
        if delta > threshold:
            msg = f"Regression detected: score dropped from {avg_score:.3f} to {current_score:.3f} (delta: {delta:.3f})"
            logger.warning(msg)
            return True, delta, msg
        return False, delta, f"Score stable: {current_score:.3f} (avg: {avg_score:.3f}, delta: {delta:.3f})"

    def get_trend(self, days: int = 30) -> Dict[str, Any]:
        """
        Get score trend for the last N days.

        Returns:
            Dictionary with dates, scores, and trend direction
            ("improving" / "declining" / "stable" / "insufficient_data",
            or "unknown" when no runs exist; min/max keys are present
            only when there is at least one run).
        """
        runs = self.get_runs_since(days)
        if not runs:
            return {
                "dates": [],
                "scores": [],
                "trend": "unknown",
                "avg_score": 0.0,
            }
        dates = [r.timestamp.isoformat() for r in runs]
        scores = [r.golden_score for r in runs]
        avg_score = sum(scores) / len(scores)
        # Trend: compare the three oldest runs with the three newest, with a
        # 0.05 dead band so small score noise doesn't flip the direction.
        if len(scores) >= 3:
            recent = scores[-3:]
            older = scores[:3]
            recent_avg = sum(recent) / len(recent)
            older_avg = sum(older) / len(older)
            if recent_avg > older_avg + 0.05:
                trend = "improving"
            elif recent_avg < older_avg - 0.05:
                trend = "declining"
            else:
                trend = "stable"
        else:
            trend = "insufficient_data"
        return {
            "dates": dates,
            "scores": scores,
            "trend": trend,
            "avg_score": round(avg_score, 3),
            "min_score": round(min(scores), 3),
            "max_score": round(max(scores), 3),
        }

    def get_failing_intents(self, n: int = 5) -> Dict[str, float]:
        """Get per-intent average scores over the last N runs, worst first."""
        runs = self.get_last_runs(n)
        intent_scores: Dict[str, List[float]] = {}
        for run in runs:
            for intent, score in run.metadata.get("scores_by_intent", {}).items():
                intent_scores.setdefault(intent, []).append(score)
        avg_scores = {
            intent: sum(scores) / len(scores)
            for intent, scores in intent_scores.items()
        }
        # Ascending sort => lowest-scoring (worst) intents come first.
        return dict(sorted(avg_scores.items(), key=lambda item: item[1]))