""" Regression Tests Tests for regression tracking and alerting """ import pytest import tempfile from datetime import datetime, timedelta, timezone from pathlib import Path from bqas.regression_tracker import RegressionTracker, TestRun from bqas.metrics import BQASMetrics, TestResult from bqas.config import BQASConfig class TestRegressionTracker: """Tests for regression tracking.""" @pytest.fixture def temp_tracker(self): """Create a tracker with temporary database.""" with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f: config = BQASConfig(db_path=f.name) tracker = RegressionTracker(config=config) yield tracker # Cleanup Path(f.name).unlink(missing_ok=True) def test_record_run(self, temp_tracker: RegressionTracker): """Test recording a test run.""" metrics = BQASMetrics( total_tests=10, passed_tests=8, failed_tests=2, avg_intent_accuracy=85.0, avg_faithfulness=4.2, avg_relevance=4.0, avg_coherence=4.1, safety_pass_rate=1.0, avg_composite_score=4.0, scores_by_intent={"student_observation": 4.2, "worksheet_generate": 3.8}, failed_test_ids=["INT-001", "INT-002"], total_duration_ms=5000, timestamp=datetime.now(timezone.utc), ) run = temp_tracker.record_run(metrics) assert run.id is not None assert run.golden_score == 4.0 assert run.total_tests == 10 assert run.passed_tests == 8 def test_get_last_runs(self, temp_tracker: RegressionTracker): """Test retrieving last runs.""" # Record multiple runs for i in range(5): metrics = BQASMetrics( total_tests=10, passed_tests=10 - i, failed_tests=i, avg_intent_accuracy=90.0 - i * 5, avg_faithfulness=4.5 - i * 0.1, avg_relevance=4.5 - i * 0.1, avg_coherence=4.5 - i * 0.1, safety_pass_rate=1.0, avg_composite_score=4.5 - i * 0.1, scores_by_intent={}, failed_test_ids=[], total_duration_ms=1000, timestamp=datetime.now(timezone.utc), ) temp_tracker.record_run(metrics) runs = temp_tracker.get_last_runs(n=3) assert len(runs) == 3 # Most recent should be first assert runs[0].passed_tests == 6 # Last recorded def test_check_regression_no_data(self, temp_tracker: RegressionTracker): """Test regression check with no historical data.""" is_regression, delta, msg = temp_tracker.check_regression(4.0) assert not is_regression assert "Not enough historical data" in msg def test_check_regression_stable(self, temp_tracker: RegressionTracker): """Test regression check with stable scores.""" # Record stable runs for _ in range(5): metrics = BQASMetrics( total_tests=10, passed_tests=10, failed_tests=0, avg_intent_accuracy=90.0, avg_faithfulness=4.5, avg_relevance=4.5, avg_coherence=4.5, safety_pass_rate=1.0, avg_composite_score=4.5, scores_by_intent={}, failed_test_ids=[], total_duration_ms=1000, timestamp=datetime.now(timezone.utc), ) temp_tracker.record_run(metrics) # Check with same score is_regression, delta, msg = temp_tracker.check_regression(4.5) assert not is_regression assert abs(delta) < 0.1 def test_check_regression_detected(self, temp_tracker: RegressionTracker): """Test regression detection.""" # Record good runs for _ in range(5): metrics = BQASMetrics( total_tests=10, passed_tests=10, failed_tests=0, avg_intent_accuracy=90.0, avg_faithfulness=4.5, avg_relevance=4.5, avg_coherence=4.5, safety_pass_rate=1.0, avg_composite_score=4.5, scores_by_intent={}, failed_test_ids=[], total_duration_ms=1000, timestamp=datetime.now(timezone.utc), ) temp_tracker.record_run(metrics) # Check with significantly lower score is_regression, delta, msg = temp_tracker.check_regression(4.0) assert is_regression assert delta > 0.1 assert "Regression detected" in msg def test_get_trend(self, temp_tracker: RegressionTracker): """Test trend calculation.""" # Record improving runs for i in range(5): metrics = BQASMetrics( total_tests=10, passed_tests=10, failed_tests=0, avg_intent_accuracy=80.0 + i * 5, avg_faithfulness=4.0 + i * 0.1, avg_relevance=4.0 + i * 0.1, avg_coherence=4.0 + i * 0.1, safety_pass_rate=1.0, avg_composite_score=4.0 + i * 0.1, scores_by_intent={}, failed_test_ids=[], total_duration_ms=1000, timestamp=datetime.now(timezone.utc), ) temp_tracker.record_run(metrics) trend = temp_tracker.get_trend(days=30) assert len(trend["dates"]) == 5 assert len(trend["scores"]) == 5 assert trend["trend"] in ["improving", "stable", "declining", "insufficient_data"] class TestRegressionAlerts: """Tests for regression alerting.""" def test_failing_intents(self): """Test identification of failing intents.""" with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f: config = BQASConfig(db_path=f.name) tracker = RegressionTracker(config=config) # Record runs with intent scores for _ in range(3): metrics = BQASMetrics( total_tests=10, passed_tests=8, failed_tests=2, avg_intent_accuracy=85.0, avg_faithfulness=4.0, avg_relevance=4.0, avg_coherence=4.0, safety_pass_rate=1.0, avg_composite_score=4.0, scores_by_intent={ "student_observation": 4.5, "worksheet_generate": 3.2, # Low "parent_letter": 4.0, }, failed_test_ids=[], total_duration_ms=1000, timestamp=datetime.now(timezone.utc), ) tracker.record_run(metrics) failing = tracker.get_failing_intents() assert "worksheet_generate" in failing assert failing["worksheet_generate"] < failing["student_observation"] Path(f.name).unlink(missing_ok=True)