Files
breakpilot-lehrer/voice-service/tests/bqas/test_regression.py
Benjamin Admin 9912997187
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m55s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 18s
refactor: Jitsi/Matrix/Voice von Core übernommen, Camunda/BPMN gelöscht, Kommunikation-Nav
- Voice-Service von Core nach Lehrer verschoben (bp-lehrer-voice-service)
- 4 Jitsi-Services + 2 Synapse-Services in docker-compose.yml aufgenommen
- Camunda komplett gelöscht: workflow pages, workflow-config.ts, bpmn-js deps
- CAMUNDA_URL aus backend-lehrer environment entfernt
- Sidebar: Kategorie "Compliance SDK" + "Katalogverwaltung" entfernt
- Sidebar: Neue Kategorie "Kommunikation" mit Video & Chat, Voice Service, Alerts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-05 17:01:47 +01:00

208 lines
7.2 KiB
Python

"""
Regression Tests
Tests for regression tracking and alerting
"""
import pytest
import tempfile
from datetime import datetime, timedelta, timezone
from pathlib import Path
from bqas.regression_tracker import RegressionTracker, TestRun
from bqas.metrics import BQASMetrics, TestResult
from bqas.config import BQASConfig
class TestRegressionTracker:
    """Tests for recording runs, checking regressions and reporting trends."""

    @staticmethod
    def _make_metrics(**overrides):
        """Build a ``BQASMetrics`` describing an all-passing run.

        The defaults describe 10 of 10 tests passing with a composite score
        of 4.5. Any keyword argument replaces the field of the same name, so
        each test only spells out the values it actually cares about.

        Args:
            **overrides: ``BQASMetrics`` fields to set instead of the defaults.

        Returns:
            A freshly constructed ``BQASMetrics`` instance.
        """
        # Built inside the call so the mutable defaults ({} / []) are never
        # shared between two metrics objects.
        fields = {
            "total_tests": 10,
            "passed_tests": 10,
            "failed_tests": 0,
            "avg_intent_accuracy": 90.0,
            "avg_faithfulness": 4.5,
            "avg_relevance": 4.5,
            "avg_coherence": 4.5,
            "safety_pass_rate": 1.0,
            "avg_composite_score": 4.5,
            "scores_by_intent": {},
            "failed_test_ids": [],
            "total_duration_ms": 1000,
        }
        fields.update(overrides)
        # Stamp at construction time unless the caller supplied a timestamp.
        fields.setdefault("timestamp", datetime.now(timezone.utc))
        return BQASMetrics(**fields)

    @pytest.fixture
    def temp_tracker(self):
        """Yield a tracker backed by a temporary database file.

        The file is removed once the test finishes, and also when building
        the tracker itself fails.
        """
        # Only the file name is needed. Leaving the ``with`` block closes the
        # handle before the tracker touches the path; an open temp file can
        # be neither reopened by name nor deleted on Windows.
        with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as handle:
            db_file = handle.name

        try:
            yield RegressionTracker(config=BQASConfig(db_path=db_file))
        finally:
            # Cleanup
            Path(db_file).unlink(missing_ok=True)

    def test_record_run(self, temp_tracker: RegressionTracker):
        """A recorded run gets an id and mirrors the submitted metrics."""
        metrics = self._make_metrics(
            passed_tests=8,
            failed_tests=2,
            avg_intent_accuracy=85.0,
            avg_faithfulness=4.2,
            avg_relevance=4.0,
            avg_coherence=4.1,
            avg_composite_score=4.0,
            scores_by_intent={"student_observation": 4.2, "worksheet_generate": 3.8},
            failed_test_ids=["INT-001", "INT-002"],
            total_duration_ms=5000,
        )

        run = temp_tracker.record_run(metrics)

        assert run.id is not None
        assert run.golden_score == 4.0
        assert run.total_tests == 10
        assert run.passed_tests == 8

    def test_get_last_runs(self, temp_tracker: RegressionTracker):
        """``get_last_runs(n)`` returns n runs with the newest one first."""
        # Record five runs, each slightly worse than the one before.
        for i in range(5):
            score = 4.5 - i * 0.1
            temp_tracker.record_run(
                self._make_metrics(
                    passed_tests=10 - i,
                    failed_tests=i,
                    avg_intent_accuracy=90.0 - i * 5,
                    avg_faithfulness=score,
                    avg_relevance=score,
                    avg_coherence=score,
                    avg_composite_score=score,
                )
            )

        runs = temp_tracker.get_last_runs(n=3)

        assert len(runs) == 3
        # Most recent should be first: the last run recorded had i == 4.
        assert runs[0].passed_tests == 6

    def test_check_regression_no_data(self, temp_tracker: RegressionTracker):
        """With no recorded runs the check reports no regression."""
        is_regression, delta, msg = temp_tracker.check_regression(4.0)

        assert not is_regression
        assert "Not enough historical data" in msg

    def test_check_regression_stable(self, temp_tracker: RegressionTracker):
        """A score equal to the recorded history is not a regression."""
        # Record stable runs
        for _ in range(5):
            temp_tracker.record_run(self._make_metrics())

        # Check with same score
        is_regression, delta, msg = temp_tracker.check_regression(4.5)

        assert not is_regression
        assert abs(delta) < 0.1

    def test_check_regression_detected(self, temp_tracker: RegressionTracker):
        """A score well below the recorded history is flagged."""
        # Record good runs
        for _ in range(5):
            temp_tracker.record_run(self._make_metrics())

        # Check with significantly lower score
        is_regression, delta, msg = temp_tracker.check_regression(4.0)

        assert is_regression
        assert delta > 0.1
        assert "Regression detected" in msg

    def test_get_trend(self, temp_tracker: RegressionTracker):
        """The trend report contains one date and one score per run."""
        # Record improving runs
        for i in range(5):
            score = 4.0 + i * 0.1
            temp_tracker.record_run(
                self._make_metrics(
                    avg_intent_accuracy=80.0 + i * 5,
                    avg_faithfulness=score,
                    avg_relevance=score,
                    avg_coherence=score,
                    avg_composite_score=score,
                )
            )

        trend = temp_tracker.get_trend(days=30)

        assert len(trend["dates"]) == 5
        assert len(trend["scores"]) == 5
        assert trend["trend"] in ["improving", "stable", "declining", "insufficient_data"]
class TestRegressionAlerts:
    """Tests for regression alerting."""

    def test_failing_intents(self):
        """The consistently low-scoring intent is reported as failing."""
        # Only the file name is needed. Leaving the ``with`` block closes the
        # handle before the tracker touches the path; an open temp file can
        # be neither reopened by name nor deleted on Windows.
        with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as handle:
            db_file = handle.name

        try:
            tracker = RegressionTracker(config=BQASConfig(db_path=db_file))

            # Record runs with intent scores
            for _ in range(3):
                metrics = BQASMetrics(
                    total_tests=10,
                    passed_tests=8,
                    failed_tests=2,
                    avg_intent_accuracy=85.0,
                    avg_faithfulness=4.0,
                    avg_relevance=4.0,
                    avg_coherence=4.0,
                    safety_pass_rate=1.0,
                    avg_composite_score=4.0,
                    scores_by_intent={
                        "student_observation": 4.5,
                        "worksheet_generate": 3.2,  # Low
                        "parent_letter": 4.0,
                    },
                    failed_test_ids=[],
                    total_duration_ms=1000,
                    timestamp=datetime.now(timezone.utc),
                )
                tracker.record_run(metrics)

            failing = tracker.get_failing_intents()

            assert "worksheet_generate" in failing
            # NOTE(review): this lookup assumes the returned mapping also
            # contains the well-scoring "student_observation" intent, not only
            # failing ones -- confirm against RegressionTracker.
            assert failing["worksheet_generate"] < failing["student_observation"]
        finally:
            # Remove the database even when an assertion above fails.
            Path(db_file).unlink(missing_ok=True)