This repository has been archived on 2026-02-15. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
breakpilot-pwa/voice-service/tests/bqas/test_regression.py
Benjamin Admin 21a844cb8a fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.

This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).

Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-09 09:51:32 +01:00

208 lines
7.2 KiB
Python

"""
Regression Tests
Tests for regression tracking and alerting
"""
import pytest
import tempfile
from datetime import datetime, timedelta, timezone
from pathlib import Path
from bqas.regression_tracker import RegressionTracker, TestRun
from bqas.metrics import BQASMetrics, TestResult
from bqas.config import BQASConfig
class TestRegressionTracker:
    """Tests for regression tracking: recording runs, history retrieval,
    regression detection, and trend analysis."""

    @staticmethod
    def _make_metrics(**overrides) -> BQASMetrics:
        """Build a BQASMetrics with sensible defaults, overridable per test.

        Centralizes the repeated 14-field constructor call so each test
        specifies only the fields it actually cares about.
        """
        fields = dict(
            total_tests=10,
            passed_tests=10,
            failed_tests=0,
            avg_intent_accuracy=90.0,
            avg_faithfulness=4.5,
            avg_relevance=4.5,
            avg_coherence=4.5,
            safety_pass_rate=1.0,
            avg_composite_score=4.5,
            scores_by_intent={},
            failed_test_ids=[],
            total_duration_ms=1000,
            timestamp=datetime.now(timezone.utc),
        )
        fields.update(overrides)
        return BQASMetrics(**fields)

    @pytest.fixture
    def temp_tracker(self):
        """Yield a RegressionTracker backed by a throwaway SQLite file.

        The temp file handle is closed immediately so SQLite can open the
        path (required on Windows, where an open handle blocks reopening),
        and cleanup is guarded by try/finally so the file is removed even
        if tracker construction raises.
        """
        f = tempfile.NamedTemporaryFile(suffix=".db", delete=False)
        f.close()
        try:
            config = BQASConfig(db_path=f.name)
            yield RegressionTracker(config=config)
        finally:
            Path(f.name).unlink(missing_ok=True)

    def test_record_run(self, temp_tracker: RegressionTracker):
        """Recording a run persists it and mirrors the metrics fields."""
        metrics = self._make_metrics(
            passed_tests=8,
            failed_tests=2,
            avg_intent_accuracy=85.0,
            avg_faithfulness=4.2,
            avg_relevance=4.0,
            avg_coherence=4.1,
            avg_composite_score=4.0,
            scores_by_intent={"student_observation": 4.2, "worksheet_generate": 3.8},
            failed_test_ids=["INT-001", "INT-002"],
            total_duration_ms=5000,
        )
        run = temp_tracker.record_run(metrics)
        assert run.id is not None
        assert run.golden_score == 4.0
        assert run.total_tests == 10
        assert run.passed_tests == 8

    def test_get_last_runs(self, temp_tracker: RegressionTracker):
        """get_last_runs returns the n most recent runs, newest first."""
        # Record five runs with steadily degrading results.
        for i in range(5):
            temp_tracker.record_run(
                self._make_metrics(
                    passed_tests=10 - i,
                    failed_tests=i,
                    avg_intent_accuracy=90.0 - i * 5,
                    avg_faithfulness=4.5 - i * 0.1,
                    avg_relevance=4.5 - i * 0.1,
                    avg_coherence=4.5 - i * 0.1,
                    avg_composite_score=4.5 - i * 0.1,
                )
            )
        runs = temp_tracker.get_last_runs(n=3)
        assert len(runs) == 3
        # Most recent run (i == 4, passed_tests == 6) should be first.
        assert runs[0].passed_tests == 6

    def test_check_regression_no_data(self, temp_tracker: RegressionTracker):
        """With no history, no regression can be declared."""
        is_regression, delta, msg = temp_tracker.check_regression(4.0)
        assert not is_regression
        assert "Not enough historical data" in msg

    def test_check_regression_stable(self, temp_tracker: RegressionTracker):
        """A score matching the historical baseline is not a regression."""
        for _ in range(5):
            temp_tracker.record_run(self._make_metrics())  # stable 4.5 baseline
        is_regression, delta, msg = temp_tracker.check_regression(4.5)
        assert not is_regression
        assert abs(delta) < 0.1

    def test_check_regression_detected(self, temp_tracker: RegressionTracker):
        """A score well below the historical baseline triggers a regression."""
        for _ in range(5):
            temp_tracker.record_run(self._make_metrics())  # stable 4.5 baseline
        # 4.0 against a 4.5 baseline: delta of 0.5 exceeds the threshold.
        is_regression, delta, msg = temp_tracker.check_regression(4.0)
        assert is_regression
        assert delta > 0.1
        assert "Regression detected" in msg

    def test_get_trend(self, temp_tracker: RegressionTracker):
        """get_trend reports one point per run plus a trend label."""
        # Record five runs with steadily improving scores.
        for i in range(5):
            temp_tracker.record_run(
                self._make_metrics(
                    avg_intent_accuracy=80.0 + i * 5,
                    avg_faithfulness=4.0 + i * 0.1,
                    avg_relevance=4.0 + i * 0.1,
                    avg_coherence=4.0 + i * 0.1,
                    avg_composite_score=4.0 + i * 0.1,
                )
            )
        trend = temp_tracker.get_trend(days=30)
        assert len(trend["dates"]) == 5
        assert len(trend["scores"]) == 5
        assert trend["trend"] in ["improving", "stable", "declining", "insufficient_data"]
class TestRegressionAlerts:
    """Tests for regression alerting."""

    def test_failing_intents(self):
        """Intents with persistently low scores are reported as failing.

        The temp DB handle is closed before the tracker opens the path
        (required on Windows), and cleanup sits in a finally block: the
        original version unlinked the file only after the assertions, so
        any assertion failure leaked the temp database.
        """
        f = tempfile.NamedTemporaryFile(suffix=".db", delete=False)
        f.close()
        try:
            config = BQASConfig(db_path=f.name)
            tracker = RegressionTracker(config=config)
            # Record three runs carrying per-intent scores.
            for _ in range(3):
                metrics = BQASMetrics(
                    total_tests=10,
                    passed_tests=8,
                    failed_tests=2,
                    avg_intent_accuracy=85.0,
                    avg_faithfulness=4.0,
                    avg_relevance=4.0,
                    avg_coherence=4.0,
                    safety_pass_rate=1.0,
                    avg_composite_score=4.0,
                    scores_by_intent={
                        "student_observation": 4.5,
                        "worksheet_generate": 3.2,  # deliberately low
                        "parent_letter": 4.0,
                    },
                    failed_test_ids=[],
                    total_duration_ms=1000,
                    timestamp=datetime.now(timezone.utc),
                )
                tracker.record_run(metrics)
            failing = tracker.get_failing_intents()
            assert "worksheet_generate" in failing
            assert failing["worksheet_generate"] < failing["student_observation"]
        finally:
            # Runs even when an assertion fails, so the temp DB never leaks.
            Path(f.name).unlink(missing_ok=True)