"""
Regression Tests

Tests for regression tracking and alerting.
"""
import tempfile
from datetime import datetime, timedelta, timezone
from pathlib import Path

import pytest

from bqas.config import BQASConfig
from bqas.metrics import BQASMetrics, TestResult
from bqas.regression_tracker import RegressionTracker, TestRun

class TestRegressionTracker:
    """Tests for regression tracking."""

    @staticmethod
    def _make_metrics(
        *,
        passed_tests: int = 10,
        failed_tests: int = 0,
        avg_intent_accuracy: float = 90.0,
        avg_faithfulness: float = 4.5,
        avg_relevance: float = 4.5,
        avg_coherence: float = 4.5,
        avg_composite_score: float = 4.5,
        scores_by_intent=None,
        failed_test_ids=None,
        total_duration_ms: int = 1000,
    ) -> BQASMetrics:
        """Build a BQASMetrics fixture with sensible defaults.

        total_tests is derived from passed + failed; the timestamp is always
        a fresh timezone-aware UTC "now" so recorded runs are ordered.
        """
        return BQASMetrics(
            total_tests=passed_tests + failed_tests,
            passed_tests=passed_tests,
            failed_tests=failed_tests,
            avg_intent_accuracy=avg_intent_accuracy,
            avg_faithfulness=avg_faithfulness,
            avg_relevance=avg_relevance,
            avg_coherence=avg_coherence,
            safety_pass_rate=1.0,
            avg_composite_score=avg_composite_score,
            scores_by_intent=scores_by_intent if scores_by_intent is not None else {},
            failed_test_ids=failed_test_ids if failed_test_ids is not None else [],
            total_duration_ms=total_duration_ms,
            timestamp=datetime.now(timezone.utc),
        )

    @pytest.fixture
    def temp_tracker(self):
        """Create a tracker backed by a temporary SQLite database.

        The tempfile handle is closed *before* the tracker opens the path
        (opening a still-open NamedTemporaryFile fails on Windows), and the
        file is removed in a finally block so cleanup also runs when the
        consuming test raises.
        """
        with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f:
            db_path = f.name
        try:
            config = BQASConfig(db_path=db_path)
            yield RegressionTracker(config=config)
        finally:
            # Cleanup
            Path(db_path).unlink(missing_ok=True)

    def test_record_run(self, temp_tracker: RegressionTracker):
        """Test recording a test run."""
        metrics = self._make_metrics(
            passed_tests=8,
            failed_tests=2,
            avg_intent_accuracy=85.0,
            avg_faithfulness=4.2,
            avg_relevance=4.0,
            avg_coherence=4.1,
            avg_composite_score=4.0,
            scores_by_intent={"student_observation": 4.2, "worksheet_generate": 3.8},
            failed_test_ids=["INT-001", "INT-002"],
            total_duration_ms=5000,
        )

        run = temp_tracker.record_run(metrics)

        assert run.id is not None
        assert run.golden_score == 4.0
        assert run.total_tests == 10
        assert run.passed_tests == 8

    def test_get_last_runs(self, temp_tracker: RegressionTracker):
        """Test retrieving last runs."""
        # Record five runs with progressively fewer passes: 10, 9, 8, 7, 6.
        for i in range(5):
            temp_tracker.record_run(
                self._make_metrics(
                    passed_tests=10 - i,
                    failed_tests=i,
                    avg_intent_accuracy=90.0 - i * 5,
                    avg_faithfulness=4.5 - i * 0.1,
                    avg_relevance=4.5 - i * 0.1,
                    avg_coherence=4.5 - i * 0.1,
                    avg_composite_score=4.5 - i * 0.1,
                )
            )

        runs = temp_tracker.get_last_runs(n=3)
        assert len(runs) == 3

        # Most recent should be first
        assert runs[0].passed_tests == 6  # Last recorded

    def test_check_regression_no_data(self, temp_tracker: RegressionTracker):
        """Test regression check with no historical data."""
        is_regression, delta, msg = temp_tracker.check_regression(4.0)

        assert not is_regression
        assert "Not enough historical data" in msg

    def test_check_regression_stable(self, temp_tracker: RegressionTracker):
        """Test regression check with stable scores."""
        # Record five identical good runs as the baseline.
        for _ in range(5):
            temp_tracker.record_run(self._make_metrics())

        # Check with same score
        is_regression, delta, msg = temp_tracker.check_regression(4.5)

        assert not is_regression
        assert abs(delta) < 0.1

    def test_check_regression_detected(self, temp_tracker: RegressionTracker):
        """Test regression detection."""
        # Record good runs (composite 4.5 baseline).
        for _ in range(5):
            temp_tracker.record_run(self._make_metrics())

        # Check with significantly lower score
        is_regression, delta, msg = temp_tracker.check_regression(4.0)

        assert is_regression
        assert delta > 0.1
        assert "Regression detected" in msg

    def test_get_trend(self, temp_tracker: RegressionTracker):
        """Test trend calculation."""
        # Record improving runs: composite climbs 4.0 -> 4.4.
        for i in range(5):
            temp_tracker.record_run(
                self._make_metrics(
                    avg_intent_accuracy=80.0 + i * 5,
                    avg_faithfulness=4.0 + i * 0.1,
                    avg_relevance=4.0 + i * 0.1,
                    avg_coherence=4.0 + i * 0.1,
                    avg_composite_score=4.0 + i * 0.1,
                )
            )

        trend = temp_tracker.get_trend(days=30)

        assert len(trend["dates"]) == 5
        assert len(trend["scores"]) == 5
        assert trend["trend"] in ["improving", "stable", "declining", "insufficient_data"]
class TestRegressionAlerts:
    """Tests for regression alerting."""

    def test_failing_intents(self):
        """Test identification of failing intents."""
        # Close the tempfile handle before the tracker opens the path
        # (required on Windows); clean up in a finally block so the temp
        # database is removed even when an assertion fails.
        with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f:
            db_path = f.name
        try:
            config = BQASConfig(db_path=db_path)
            tracker = RegressionTracker(config=config)

            # Record runs with intent scores
            for _ in range(3):
                metrics = BQASMetrics(
                    total_tests=10,
                    passed_tests=8,
                    failed_tests=2,
                    avg_intent_accuracy=85.0,
                    avg_faithfulness=4.0,
                    avg_relevance=4.0,
                    avg_coherence=4.0,
                    safety_pass_rate=1.0,
                    avg_composite_score=4.0,
                    scores_by_intent={
                        "student_observation": 4.5,
                        "worksheet_generate": 3.2,  # Low
                        "parent_letter": 4.0,
                    },
                    failed_test_ids=[],
                    total_duration_ms=1000,
                    timestamp=datetime.now(timezone.utc),
                )
                tracker.record_run(metrics)

            failing = tracker.get_failing_intents()

            assert "worksheet_generate" in failing
            assert failing["worksheet_generate"] < failing["student_observation"]
        finally:
            Path(db_path).unlink(missing_ok=True)