fix: Restore all files lost during destructive rebase

A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.

This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).

Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-02-09 09:51:32 +01:00
parent f7487ee240
commit bfdaf63ba9
2009 changed files with 749983 additions and 1731 deletions

View File

@@ -0,0 +1,207 @@
"""
Regression Tests
Tests for regression tracking and alerting
"""
import pytest
import tempfile
from datetime import datetime, timedelta, timezone
from pathlib import Path
from bqas.regression_tracker import RegressionTracker, TestRun
from bqas.metrics import BQASMetrics, TestResult
from bqas.config import BQASConfig
class TestRegressionTracker:
    """Tests for regression tracking: recording runs, retrieval, regression
    checks, and trend calculation."""

    @staticmethod
    def _metrics(**overrides) -> BQASMetrics:
        """Build a BQASMetrics with stable, all-passing defaults.

        Keyword *overrides* replace any default field. ``scores_by_intent``
        and ``failed_test_ids`` default to fresh empty containers (never
        shared mutable defaults), and the timestamp is always "now" in UTC.
        """
        fields = {
            "total_tests": 10,
            "passed_tests": 10,
            "failed_tests": 0,
            "avg_intent_accuracy": 90.0,
            "avg_faithfulness": 4.5,
            "avg_relevance": 4.5,
            "avg_coherence": 4.5,
            "safety_pass_rate": 1.0,
            "avg_composite_score": 4.5,
            "scores_by_intent": {},
            "failed_test_ids": [],
            "total_duration_ms": 1000,
        }
        fields.update(overrides)
        return BQASMetrics(timestamp=datetime.now(timezone.utc), **fields)

    @pytest.fixture
    def temp_tracker(self):
        """Yield a RegressionTracker backed by a throwaway SQLite file.

        The file is created with ``delete=False`` so the tracker can open it
        by path; it is unlinked during fixture teardown even if the test
        body fails.
        """
        with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f:
            config = BQASConfig(db_path=f.name)
            tracker = RegressionTracker(config=config)
            try:
                yield tracker
            finally:
                # Cleanup: remove the temp db regardless of test outcome.
                Path(f.name).unlink(missing_ok=True)

    def test_record_run(self, temp_tracker: RegressionTracker):
        """Recording a run persists it and echoes back the key fields."""
        metrics = self._metrics(
            passed_tests=8,
            failed_tests=2,
            avg_intent_accuracy=85.0,
            avg_faithfulness=4.2,
            avg_relevance=4.0,
            avg_coherence=4.1,
            avg_composite_score=4.0,
            scores_by_intent={"student_observation": 4.2, "worksheet_generate": 3.8},
            failed_test_ids=["INT-001", "INT-002"],
            total_duration_ms=5000,
        )
        run = temp_tracker.record_run(metrics)
        assert run.id is not None
        assert run.golden_score == 4.0
        assert run.total_tests == 10
        assert run.passed_tests == 8

    def test_get_last_runs(self, temp_tracker: RegressionTracker):
        """get_last_runs(n) returns the n most recent runs, newest first."""
        # Record five runs with steadily degrading results.
        for i in range(5):
            temp_tracker.record_run(
                self._metrics(
                    passed_tests=10 - i,
                    failed_tests=i,
                    avg_intent_accuracy=90.0 - i * 5,
                    avg_faithfulness=4.5 - i * 0.1,
                    avg_relevance=4.5 - i * 0.1,
                    avg_coherence=4.5 - i * 0.1,
                    avg_composite_score=4.5 - i * 0.1,
                )
            )
        runs = temp_tracker.get_last_runs(n=3)
        assert len(runs) == 3
        # Most recent should be first: the last recorded run had 10 - 4 = 6 passes.
        assert runs[0].passed_tests == 6

    def test_check_regression_no_data(self, temp_tracker: RegressionTracker):
        """With no historical runs, check_regression must not flag anything."""
        is_regression, delta, msg = temp_tracker.check_regression(4.0)
        assert not is_regression
        assert "Not enough historical data" in msg

    def test_check_regression_stable(self, temp_tracker: RegressionTracker):
        """A score matching the historical baseline is not a regression."""
        for _ in range(5):
            temp_tracker.record_run(self._metrics())
        # Check with the same composite score the history was built from.
        is_regression, delta, msg = temp_tracker.check_regression(4.5)
        assert not is_regression
        assert abs(delta) < 0.1

    def test_check_regression_detected(self, temp_tracker: RegressionTracker):
        """A score well below the historical baseline triggers a regression."""
        for _ in range(5):
            temp_tracker.record_run(self._metrics())
        # 4.0 vs a 4.5 baseline is a 0.5 drop — clearly past the threshold.
        is_regression, delta, msg = temp_tracker.check_regression(4.0)
        assert is_regression
        assert delta > 0.1
        assert "Regression detected" in msg

    def test_get_trend(self, temp_tracker: RegressionTracker):
        """get_trend reports one point per run plus a trend label."""
        # Record five runs with steadily improving scores.
        for i in range(5):
            temp_tracker.record_run(
                self._metrics(
                    avg_intent_accuracy=80.0 + i * 5,
                    avg_faithfulness=4.0 + i * 0.1,
                    avg_relevance=4.0 + i * 0.1,
                    avg_coherence=4.0 + i * 0.1,
                    avg_composite_score=4.0 + i * 0.1,
                )
            )
        trend = temp_tracker.get_trend(days=30)
        assert len(trend["dates"]) == 5
        assert len(trend["scores"]) == 5
        assert trend["trend"] in ["improving", "stable", "declining", "insufficient_data"]
class TestRegressionAlerts:
    """Tests for regression alerting."""

    def test_failing_intents(self):
        """Intents with consistently low scores are reported by
        get_failing_intents, keyed by intent name with their score."""
        with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f:
            config = BQASConfig(db_path=f.name)
            tracker = RegressionTracker(config=config)
            try:
                # Record several runs where one intent scores noticeably low.
                for _ in range(3):
                    metrics = BQASMetrics(
                        total_tests=10,
                        passed_tests=8,
                        failed_tests=2,
                        avg_intent_accuracy=85.0,
                        avg_faithfulness=4.0,
                        avg_relevance=4.0,
                        avg_coherence=4.0,
                        safety_pass_rate=1.0,
                        avg_composite_score=4.0,
                        scores_by_intent={
                            "student_observation": 4.5,
                            "worksheet_generate": 3.2,  # Low
                            "parent_letter": 4.0,
                        },
                        failed_test_ids=[],
                        total_duration_ms=1000,
                        timestamp=datetime.now(timezone.utc),
                    )
                    tracker.record_run(metrics)
                failing = tracker.get_failing_intents()
                assert "worksheet_generate" in failing
                assert failing["worksheet_generate"] < failing["student_observation"]
            finally:
                # Remove the temp db even when an assertion above fails;
                # without the finally the file leaked on test failure.
                Path(f.name).unlink(missing_ok=True)