feat: voice-service von lehrer nach core verschoben, Pipeline erweitert (voice, BQAS, embedding, night-scheduler)
This commit is contained in:
340
voice-service/bqas/regression_tracker.py
Normal file
340
voice-service/bqas/regression_tracker.py
Normal file
@@ -0,0 +1,340 @@
|
||||
"""
|
||||
Regression Tracker
|
||||
Tracks test scores over time to detect quality regressions
|
||||
"""
|
||||
import sqlite3
|
||||
import json
|
||||
import subprocess
|
||||
import structlog
|
||||
from datetime import datetime, timedelta
|
||||
from typing import List, Optional, Tuple, Dict, Any
|
||||
from dataclasses import dataclass, asdict
|
||||
from pathlib import Path
|
||||
|
||||
from bqas.config import BQASConfig
|
||||
from bqas.metrics import BQASMetrics
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
|
||||
|
||||
@dataclass
class TestRun:
    """Record of a single test run.

    All fields default to "empty" values so a run can be constructed
    incrementally. Mutable defaults use ``field(default_factory=...)``
    to avoid the shared-mutable-default pitfall; ``__post_init__``
    additionally normalizes an explicitly passed ``None`` (as older
    callers may do) to a concrete value.
    """

    id: Optional[int] = None  # DB row id; set after INSERT
    timestamp: Optional[datetime] = None  # run time (UTC); filled in __post_init__
    git_commit: str = ""  # short commit hash, "unknown" outside a checkout
    git_branch: str = ""
    golden_score: float = 0.0
    synthetic_score: float = 0.0
    total_tests: int = 0
    passed_tests: int = 0
    failed_tests: int = 0
    failures: List[str] = field(default_factory=list)  # ids of failed tests
    duration_seconds: float = 0.0
    metadata: Dict[str, Any] = field(default_factory=dict)  # e.g. scores_by_intent

    def __post_init__(self):
        # Normalize None (including explicitly passed None) so downstream
        # code never has to None-check these fields.
        if self.timestamp is None:
            self.timestamp = datetime.utcnow()
        if self.failures is None:
            self.failures = []
        if self.metadata is None:
            self.metadata = {}
|
||||
|
||||
|
||||
class RegressionTracker:
    """
    Tracks BQAS test scores over time.

    Features:
    - SQLite persistence (single ``test_runs`` table)
    - Regression detection against the recent-run average
    - Trend analysis over a time window
    - Alerting (via structlog warnings)
    """

    def __init__(self, config: Optional[BQASConfig] = None):
        """Create a tracker backed by the SQLite DB named in *config*.

        Args:
            config: BQAS configuration; defaults to environment-derived config.
        """
        self.config = config or BQASConfig.from_env()
        self.db_path = Path(self.config.db_path)
        self._init_db()

    def _init_db(self):
        """Initialize SQLite database (idempotent: CREATE IF NOT EXISTS)."""
        conn = sqlite3.connect(self.db_path)
        try:
            conn.execute("""
                CREATE TABLE IF NOT EXISTS test_runs (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    timestamp TEXT NOT NULL,
                    git_commit TEXT,
                    git_branch TEXT,
                    golden_score REAL,
                    synthetic_score REAL,
                    total_tests INTEGER,
                    passed_tests INTEGER,
                    failed_tests INTEGER,
                    failures TEXT,
                    duration_seconds REAL,
                    metadata TEXT
                )
            """)
            conn.execute("""
                CREATE INDEX IF NOT EXISTS idx_timestamp
                ON test_runs(timestamp)
            """)
            conn.commit()
        finally:
            # Always release the connection, even if DDL fails.
            conn.close()

    def _get_git_info(self) -> Tuple[str, str]:
        """Get current git commit (short hash) and branch.

        Returns:
            (commit, branch); ``("unknown", "unknown")`` when git is
            unavailable or the cwd is not a checkout.
        """
        try:
            commit = subprocess.check_output(
                ["git", "rev-parse", "HEAD"],
                stderr=subprocess.DEVNULL,
            ).decode().strip()[:8]

            branch = subprocess.check_output(
                ["git", "rev-parse", "--abbrev-ref", "HEAD"],
                stderr=subprocess.DEVNULL,
            ).decode().strip()

            return commit, branch
        except Exception:
            # Best-effort metadata: never fail a run over missing git info.
            return "unknown", "unknown"

    @staticmethod
    def _row_to_run(row) -> TestRun:
        """Convert a SELECT row (fixed column order, see queries) to a TestRun."""
        return TestRun(
            id=row[0],
            timestamp=datetime.fromisoformat(row[1]),
            git_commit=row[2],
            git_branch=row[3],
            golden_score=row[4],
            synthetic_score=row[5],
            total_tests=row[6],
            passed_tests=row[7],
            failed_tests=row[8],
            failures=json.loads(row[9]) if row[9] else [],
            duration_seconds=row[10],
            metadata=json.loads(row[11]) if row[11] else {},
        )

    def record_run(self, metrics: BQASMetrics, synthetic_score: float = 0.0) -> TestRun:
        """
        Record a test run.

        Args:
            metrics: Aggregated metrics from the test run
            synthetic_score: Optional synthetic test score

        Returns:
            Recorded TestRun (with its DB id filled in)
        """
        git_commit, git_branch = self._get_git_info()

        run = TestRun(
            timestamp=metrics.timestamp,
            git_commit=git_commit,
            git_branch=git_branch,
            golden_score=metrics.avg_composite_score,
            synthetic_score=synthetic_score,
            total_tests=metrics.total_tests,
            passed_tests=metrics.passed_tests,
            failed_tests=metrics.failed_tests,
            failures=metrics.failed_test_ids,
            duration_seconds=metrics.total_duration_ms / 1000,
            metadata={"scores_by_intent": metrics.scores_by_intent},
        )

        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.execute("""
                INSERT INTO test_runs (
                    timestamp, git_commit, git_branch, golden_score,
                    synthetic_score, total_tests, passed_tests, failed_tests,
                    failures, duration_seconds, metadata
                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """, (
                run.timestamp.isoformat(),
                run.git_commit,
                run.git_branch,
                run.golden_score,
                run.synthetic_score,
                run.total_tests,
                run.passed_tests,
                run.failed_tests,
                json.dumps(run.failures),
                run.duration_seconds,
                json.dumps(run.metadata),
            ))
            run.id = cursor.lastrowid
            conn.commit()
        finally:
            conn.close()

        logger.info(
            "Test run recorded",
            run_id=run.id,
            score=run.golden_score,
            passed=run.passed_tests,
            failed=run.failed_tests,
        )

        return run

    def get_last_runs(self, n: int = 5) -> List[TestRun]:
        """Get the last N test runs (most recent first)."""
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.execute("""
                SELECT id, timestamp, git_commit, git_branch, golden_score,
                       synthetic_score, total_tests, passed_tests, failed_tests,
                       failures, duration_seconds, metadata
                FROM test_runs
                ORDER BY timestamp DESC
                LIMIT ?
            """, (n,))
            return [self._row_to_run(row) for row in cursor.fetchall()]
        finally:
            conn.close()

    def get_runs_since(self, days: int = 30) -> List[TestRun]:
        """Get all runs in the last N days (oldest first)."""
        since = datetime.utcnow() - timedelta(days=days)

        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.execute("""
                SELECT id, timestamp, git_commit, git_branch, golden_score,
                       synthetic_score, total_tests, passed_tests, failed_tests,
                       failures, duration_seconds, metadata
                FROM test_runs
                WHERE timestamp >= ?
                ORDER BY timestamp ASC
            """, (since.isoformat(),))
            return [self._row_to_run(row) for row in cursor.fetchall()]
        finally:
            conn.close()

    def check_regression(
        self,
        current_score: float,
        threshold: Optional[float] = None,
    ) -> Tuple[bool, float, str]:
        """
        Check if current score indicates a regression.

        Compares *current_score* against the average of the last 5
        recorded runs; a positive delta above *threshold* is a regression.

        Args:
            current_score: Current test run score
            threshold: Optional threshold override

        Returns:
            (is_regression, delta, message)
        """
        threshold = threshold or self.config.regression_threshold
        last_runs = self.get_last_runs(n=5)

        if len(last_runs) < 2:
            return False, 0.0, "Not enough historical data"

        # Calculate average of last runs
        avg_score = sum(r.golden_score for r in last_runs) / len(last_runs)
        delta = avg_score - current_score

        if delta > threshold:
            msg = f"Regression detected: score dropped from {avg_score:.3f} to {current_score:.3f} (delta: {delta:.3f})"
            logger.warning(msg)
            return True, delta, msg

        return False, delta, f"Score stable: {current_score:.3f} (avg: {avg_score:.3f}, delta: {delta:.3f})"

    def get_trend(self, days: int = 30) -> Dict[str, Any]:
        """
        Get score trend for the last N days.

        Returns:
            Dictionary with dates, scores, trend direction, and
            avg/min/max score (min/max omitted when there are no runs).
        """
        runs = self.get_runs_since(days)

        if not runs:
            return {
                "dates": [],
                "scores": [],
                "trend": "unknown",
                "avg_score": 0.0,
            }

        dates = [r.timestamp.isoformat() for r in runs]
        scores = [r.golden_score for r in runs]
        avg_score = sum(scores) / len(scores)

        # Determine trend by comparing the first 3 vs the last 3 scores.
        # NOTE(review): with 3-5 runs the windows overlap — presumably
        # acceptable for a coarse signal; confirm intent.
        if len(scores) >= 3:
            recent_avg = sum(scores[-3:]) / len(scores[-3:])
            older_avg = sum(scores[:3]) / len(scores[:3])

            if recent_avg > older_avg + 0.05:
                trend = "improving"
            elif recent_avg < older_avg - 0.05:
                trend = "declining"
            else:
                trend = "stable"
        else:
            trend = "insufficient_data"

        return {
            "dates": dates,
            "scores": scores,
            "trend": trend,
            "avg_score": round(avg_score, 3),
            "min_score": round(min(scores), 3),
            "max_score": round(max(scores), 3),
        }

    def get_failing_intents(self, n: int = 5) -> Dict[str, float]:
        """Get intents with lowest average scores over the last *n* runs.

        Returns:
            Mapping of intent -> average score, sorted worst to best.
        """
        runs = self.get_last_runs(n)

        # Collect every per-intent score seen across the recent runs.
        intent_scores: Dict[str, List[float]] = {}
        for run in runs:
            if "scores_by_intent" in run.metadata:
                for intent, score in run.metadata["scores_by_intent"].items():
                    intent_scores.setdefault(intent, []).append(score)

        # Calculate averages and sort
        avg_scores = {
            intent: sum(scores) / len(scores)
            for intent, scores in intent_scores.items()
        }

        # Return sorted from worst to best
        return dict(sorted(avg_scores.items(), key=lambda x: x[1]))
|
||||
Reference in New Issue
Block a user