# NOTE(restore): A previous `git pull --rebase origin main` dropped 177 local
# commits, losing 3400+ files across admin-v2, backend, studio-v2, website,
# klausur-service, and many other services. The partial restore attempt
# (660295e2) only recovered some files. This file was restored from the
# pre-rebase ref 98933f5e while preserving post-rebase additions
# (night-scheduler, night-mode UI, NightModeWidget dashboard integration).
#
# Restored features include:
# - AI Module Sidebar (FAB), OCR Labeling, OCR Compare
# - GPU Dashboard, RAG Pipeline, Magic Help
# - Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
# - Companion, Zeugnisse-Crawler, Screen Flow
# - Full backend, studio-v2, website, klausur-service
# - All compliance SDKs, agent-core, voice-service
# - CI/CD configs, documentation, scripts
#
# Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
"""
|
|
Golden Suite Tests
|
|
Tests against validated reference test cases
|
|
"""
|
|
import pytest
|
|
from typing import Dict, Any, List
|
|
|
|
from bqas.judge import LLMJudge
|
|
from bqas.metrics import TestResult, BQASMetrics
|
|
|
|
|
|
class TestGoldenSuite:
    """Tests using the golden test suite.

    Every async test first checks that the LLM judge backend is reachable
    and skips (rather than fails) when it is not, so the suite can run in
    environments without Ollama.
    """

    @pytest.mark.asyncio
    async def test_judge_available(self, llm_judge: LLMJudge):
        """Verify the LLM judge is available; skip with a hint otherwise."""
        is_available = await llm_judge.health_check()
        if not is_available:
            pytest.skip("LLM judge not available (Ollama not running or model not loaded)")

    @pytest.mark.asyncio
    async def test_single_intent_evaluation(self, llm_judge: LLMJudge):
        """Evaluate one correctly-detected intent against fixed quality floors."""
        is_available = await llm_judge.health_check()
        if not is_available:
            pytest.skip("LLM judge not available")

        result = await llm_judge.evaluate(
            user_input="Notiz zu Max: heute wiederholt gestoert",
            detected_intent="student_observation",
            response="Verstanden, ich habe mir das notiert.",
            expected_intent="student_observation",
        )

        # Minimum acceptable judge scores for a correct, well-formed answer.
        assert result.intent_accuracy >= 80
        assert result.faithfulness >= 3
        assert result.relevance >= 3
        assert result.coherence >= 3
        assert result.safety == "pass"
        assert result.composite_score >= 3.5

    @pytest.mark.asyncio
    @pytest.mark.parametrize("test_case", [
        {
            "id": "INT-001",
            "input": "Notiz zu Max: heute wiederholt gestoert",
            "expected_intent": "student_observation",
            "min_score": 3.5,
        },
        {
            "id": "INT-007",
            "input": "Nimm Vokabeln Lektion 4, mach 3 Lueckentexte",
            "expected_intent": "worksheet_generate",
            "min_score": 3.5,
        },
        {
            "id": "INT-013",
            "input": "Neutraler Elternbrief wegen wiederholter Stoerungen",
            "expected_intent": "parent_letter",
            "min_score": 3.5,
        },
    ], ids=lambda t: t["id"])
    async def test_sample_golden_cases(
        self,
        llm_judge: LLMJudge,
        voice_service_client,
        test_case: Dict[str, Any],
    ):
        """Run sample golden cases end-to-end: intent service, then judge.

        If the voice service (or its intent endpoint) is unavailable, fall
        back to the case's expected values so the judge itself is still
        exercised.
        """
        is_available = await llm_judge.health_check()
        if not is_available:
            pytest.skip("LLM judge not available")

        # Call voice service intent endpoint
        try:
            response = await voice_service_client.post(
                "/api/v1/intent",
                json={"text": test_case["input"]},
            )

            if response.status_code != 200:
                # Service might not have this endpoint - use mock
                detected_intent = test_case["expected_intent"]
                response_text = "Verstanden."
            else:
                result = response.json()
                detected_intent = result.get("intent", "unknown")
                response_text = result.get("response", "Verstanden.")

        except Exception:
            # Use expected values for testing judge itself
            detected_intent = test_case["expected_intent"]
            response_text = "Verstanden."

        # Evaluate with judge
        judge_result = await llm_judge.evaluate(
            user_input=test_case["input"],
            detected_intent=detected_intent,
            response=response_text,
            expected_intent=test_case["expected_intent"],
        )

        # Fix: resolve the default once. The original failure message indexed
        # test_case["min_score"] directly while the assertion used
        # .get("min_score", 3.5), so a case without the key would raise
        # KeyError instead of showing the diagnostic.
        min_score = test_case.get("min_score", 3.5)
        assert judge_result.composite_score >= min_score, \
            f"Score {judge_result.composite_score} < {min_score}: {judge_result.reasoning}"
class TestIntentAccuracy:
    """Tests for intent detection accuracy.

    Each test feeds several phrasing variants of one intent to the judge
    with the (assumed correct) detected intent and requires an accuracy
    score of at least 70 per pattern.
    """

    @pytest.mark.asyncio
    async def test_student_observation_patterns(self, llm_judge: LLMJudge):
        """Test student observation intent patterns."""
        is_available = await llm_judge.health_check()
        if not is_available:
            pytest.skip("LLM judge not available")

        patterns = [
            "Notiz zu Lisa: sehr aufmerksam heute",
            "Beobachtung Tim: braucht Hilfe bei Bruchrechnung",
            "Anna hat heute wiederholt gestört",
        ]

        for pattern in patterns:
            result = await llm_judge.evaluate(
                user_input=pattern,
                detected_intent="student_observation",
                response="Notiz gespeichert.",
                expected_intent="student_observation",
            )

            assert result.intent_accuracy >= 70, f"Failed for: {pattern}"

    @pytest.mark.asyncio
    async def test_worksheet_generation_patterns(self, llm_judge: LLMJudge):
        """Test worksheet generation intent patterns."""
        is_available = await llm_judge.health_check()
        if not is_available:
            pytest.skip("LLM judge not available")

        patterns = [
            "Erstelle Arbeitsblatt zu Bruchrechnung",
            "Mach mir 5 Aufgaben zu Vokabeln",
            "Ich brauche ein Uebungsblatt fuer Prozentrechnung",
        ]

        for pattern in patterns:
            result = await llm_judge.evaluate(
                user_input=pattern,
                detected_intent="worksheet_generate",
                response="Ich erstelle das Arbeitsblatt.",
                expected_intent="worksheet_generate",
            )

            assert result.intent_accuracy >= 70, f"Failed for: {pattern}"
class TestMetrics:
    """Tests for metrics calculation (no LLM judge required)."""

    def test_metrics_from_results(self, sample_test_result: TestResult):
        """Aggregating a single passing result yields matching counts/averages."""
        results = [sample_test_result]
        metrics = BQASMetrics.from_results(results)

        assert metrics.total_tests == 1
        assert metrics.passed_tests == 1
        assert metrics.failed_tests == 0
        # With one result, the average equals that result's score.
        assert metrics.avg_composite_score == sample_test_result.composite_score

    def test_metrics_empty_results(self):
        """Empty input produces zeroed metrics rather than raising."""
        metrics = BQASMetrics.from_results([])

        assert metrics.total_tests == 0
        assert metrics.passed_tests == 0
        assert metrics.avg_composite_score == 0.0

    def test_metrics_summary(self, sample_test_result: TestResult):
        """The text summary contains the header and the key counters."""
        metrics = BQASMetrics.from_results([sample_test_result])
        summary = metrics.summary()

        assert "BQAS Test Run Summary" in summary
        assert "Total Tests: 1" in summary
        assert "Passed: 1" in summary