This repository has been archived on 2026-02-15. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
breakpilot-pwa/voice-service/tests/bqas/test_regression.py
BreakPilot Dev 19855efacc
Some checks failed
Tests / Go Tests (push) Has been cancelled
Tests / Python Tests (push) Has been cancelled
Tests / Integration Tests (push) Has been cancelled
Tests / Go Lint (push) Has been cancelled
Tests / Python Lint (push) Has been cancelled
Tests / Security Scan (push) Has been cancelled
Tests / All Checks Passed (push) Has been cancelled
Security Scanning / Secret Scanning (push) Has been cancelled
Security Scanning / Dependency Vulnerability Scan (push) Has been cancelled
Security Scanning / Go Security Scan (push) Has been cancelled
Security Scanning / Python Security Scan (push) Has been cancelled
Security Scanning / Node.js Security Scan (push) Has been cancelled
Security Scanning / Docker Image Security (push) Has been cancelled
Security Scanning / Security Summary (push) Has been cancelled
CI/CD Pipeline / Go Tests (push) Has been cancelled
CI/CD Pipeline / Python Tests (push) Has been cancelled
CI/CD Pipeline / Website Tests (push) Has been cancelled
CI/CD Pipeline / Linting (push) Has been cancelled
CI/CD Pipeline / Security Scan (push) Has been cancelled
CI/CD Pipeline / Docker Build & Push (push) Has been cancelled
CI/CD Pipeline / Integration Tests (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / CI Summary (push) Has been cancelled
ci/woodpecker/manual/build-ci-image Pipeline was successful
ci/woodpecker/manual/main Pipeline failed
feat: BreakPilot PWA - Full codebase (clean push without large binaries)
All services: admin-v2, studio-v2, website, ai-compliance-sdk,
consent-service, klausur-service, voice-service, and infrastructure.
Large PDFs and compiled binaries excluded via .gitignore.
2026-02-11 13:25:58 +01:00

208 lines
7.2 KiB
Python

"""
Regression Tests
Tests for regression tracking and alerting
"""
import pytest
import tempfile
from datetime import datetime, timedelta, timezone
from pathlib import Path
from bqas.regression_tracker import RegressionTracker, TestRun
from bqas.metrics import BQASMetrics, TestResult
from bqas.config import BQASConfig
class TestRegressionTracker:
    """Tests for regression tracking.

    Every test receives a fresh ``RegressionTracker`` from the
    ``temp_tracker`` fixture, configured with its own temporary ``.db`` path.
    """

    @staticmethod
    def _make_metrics(**overrides):
        """Build a ``BQASMetrics`` for a fully passing run.

        The defaults describe a 10/10 run with a composite score of 4.5,
        empty per-intent scores and a timestamp of "now" in UTC. Keyword
        arguments replace individual fields, so each test only spells out
        the values it actually cares about.
        """
        # The dict is rebuilt on every call so the mutable defaults
        # (``scores_by_intent``, ``failed_test_ids``) are never shared
        # between two metrics objects.
        fields = {
            "total_tests": 10,
            "passed_tests": 10,
            "failed_tests": 0,
            "avg_intent_accuracy": 90.0,
            "avg_faithfulness": 4.5,
            "avg_relevance": 4.5,
            "avg_coherence": 4.5,
            "safety_pass_rate": 1.0,
            "avg_composite_score": 4.5,
            "scores_by_intent": {},
            "failed_test_ids": [],
            "total_duration_ms": 1000,
            "timestamp": datetime.now(timezone.utc),
        }
        fields.update(overrides)
        return BQASMetrics(**fields)

    @pytest.fixture
    def temp_tracker(self):
        """Create a tracker with temporary database.

        Yields a ``RegressionTracker`` whose config points at a unique
        temporary ``.db`` file; the file is removed again on teardown.
        """
        # Only reserve a unique path here. The handle is closed before the
        # tracker touches the file: keeping it open blocks re-opening and
        # unlinking the path on Windows.
        with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f:
            db_path = f.name
        try:
            config = BQASConfig(db_path=db_path)
            yield RegressionTracker(config=config)
        finally:
            # Cleanup, also when constructing the tracker itself failed.
            Path(db_path).unlink(missing_ok=True)

    def test_record_run(self, temp_tracker: RegressionTracker):
        """Test recording a test run."""
        metrics = self._make_metrics(
            passed_tests=8,
            failed_tests=2,
            avg_intent_accuracy=85.0,
            avg_faithfulness=4.2,
            avg_relevance=4.0,
            avg_coherence=4.1,
            avg_composite_score=4.0,
            scores_by_intent={
                "student_observation": 4.2,
                "worksheet_generate": 3.8,
            },
            failed_test_ids=["INT-001", "INT-002"],
            total_duration_ms=5000,
        )

        run = temp_tracker.record_run(metrics)

        assert run.id is not None
        assert run.golden_score == 4.0
        assert run.total_tests == 10
        assert run.passed_tests == 8

    def test_get_last_runs(self, temp_tracker: RegressionTracker):
        """Test retrieving last runs."""
        # Record multiple runs, each slightly worse than the previous one.
        for i in range(5):
            score = 4.5 - i * 0.1
            temp_tracker.record_run(
                self._make_metrics(
                    passed_tests=10 - i,
                    failed_tests=i,
                    avg_intent_accuracy=90.0 - i * 5,
                    avg_faithfulness=score,
                    avg_relevance=score,
                    avg_coherence=score,
                    avg_composite_score=score,
                )
            )

        runs = temp_tracker.get_last_runs(n=3)

        assert len(runs) == 3
        # Most recent should be first: the last recorded run (i == 4)
        # passed 10 - 4 = 6 tests.
        assert runs[0].passed_tests == 6

    def test_check_regression_no_data(self, temp_tracker: RegressionTracker):
        """Test regression check with no historical data."""
        is_regression, _, msg = temp_tracker.check_regression(4.0)

        assert not is_regression
        assert "Not enough historical data" in msg

    def test_check_regression_stable(self, temp_tracker: RegressionTracker):
        """Test regression check with stable scores."""
        # Record stable runs (the helper defaults: composite score 4.5).
        for _ in range(5):
            temp_tracker.record_run(self._make_metrics())

        # Check with same score
        is_regression, delta, _ = temp_tracker.check_regression(4.5)

        assert not is_regression
        assert abs(delta) < 0.1

    def test_check_regression_detected(self, temp_tracker: RegressionTracker):
        """Test regression detection."""
        # Record good runs (the helper defaults: composite score 4.5).
        for _ in range(5):
            temp_tracker.record_run(self._make_metrics())

        # Check with significantly lower score
        is_regression, delta, msg = temp_tracker.check_regression(4.0)

        assert is_regression
        assert delta > 0.1
        assert "Regression detected" in msg

    def test_get_trend(self, temp_tracker: RegressionTracker):
        """Test trend calculation."""
        # Record improving runs
        for i in range(5):
            score = 4.0 + i * 0.1
            temp_tracker.record_run(
                self._make_metrics(
                    avg_intent_accuracy=80.0 + i * 5,
                    avg_faithfulness=score,
                    avg_relevance=score,
                    avg_coherence=score,
                    avg_composite_score=score,
                )
            )

        trend = temp_tracker.get_trend(days=30)

        assert len(trend["dates"]) == 5
        assert len(trend["scores"]) == 5
        assert trend["trend"] in [
            "improving",
            "stable",
            "declining",
            "insufficient_data",
        ]
class TestRegressionAlerts:
    """Tests for regression alerting."""

    def test_failing_intents(self):
        """Test identification of failing intents.

        Records three identical runs in which ``worksheet_generate`` scores
        clearly below the other intents, then checks that the tracker
        reports it via ``get_failing_intents()``.
        """
        # Only reserve a unique path; the handle is closed before the tracker
        # opens the file, because an open handle blocks re-opening and
        # unlinking the path on Windows.
        with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f:
            db_path = f.name

        try:
            config = BQASConfig(db_path=db_path)
            tracker = RegressionTracker(config=config)

            # Record runs with intent scores
            for _ in range(3):
                metrics = BQASMetrics(
                    total_tests=10,
                    passed_tests=8,
                    failed_tests=2,
                    avg_intent_accuracy=85.0,
                    avg_faithfulness=4.0,
                    avg_relevance=4.0,
                    avg_coherence=4.0,
                    safety_pass_rate=1.0,
                    avg_composite_score=4.0,
                    scores_by_intent={
                        "student_observation": 4.5,
                        "worksheet_generate": 3.2,  # Low
                        "parent_letter": 4.0,
                    },
                    failed_test_ids=[],
                    total_duration_ms=1000,
                    timestamp=datetime.now(timezone.utc),
                )
                tracker.record_run(metrics)

            failing = tracker.get_failing_intents()

            assert "worksheet_generate" in failing
            assert failing["worksheet_generate"] < failing["student_observation"]
        finally:
            # Remove the temp database even when an assertion above fails;
            # previously a failing test left the file behind.
            Path(db_path).unlink(missing_ok=True)