This repository was archived on 2026-02-15. You can view files and clone the repository, but you cannot open issues or pull requests, or push commits.
Files
breakpilot-pwa/voice-service/tests/bqas/test_golden.py
Benjamin Admin 21a844cb8a fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.

This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).

Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-09 09:51:32 +01:00

188 lines
6.5 KiB
Python

"""
Golden Suite Tests
Tests against validated reference test cases
"""
import pytest
from typing import Dict, Any, List
from bqas.judge import LLMJudge
from bqas.metrics import TestResult, BQASMetrics
class TestGoldenSuite:
    """Tests using the golden test suite.

    Every test first probes the LLM judge via ``health_check()`` and skips
    itself when the judge reports that it is not available.
    """

    @staticmethod
    async def _skip_unless_judge_available(
        llm_judge: LLMJudge,
        reason: str = "LLM judge not available",
    ) -> None:
        """Skip the calling test with *reason* if the judge health check fails."""
        if not await llm_judge.health_check():
            pytest.skip(reason)

    @pytest.mark.asyncio
    async def test_judge_available(self, llm_judge: LLMJudge):
        """Verify LLM judge is available (skipped, not failed, when it is not)."""
        await self._skip_unless_judge_available(
            llm_judge,
            "LLM judge not available (Ollama not running or model not loaded)",
        )

    @pytest.mark.asyncio
    async def test_single_intent_evaluation(self, llm_judge: LLMJudge):
        """Evaluate one matching intent/response pair and check every score."""
        await self._skip_unless_judge_available(llm_judge)

        result = await llm_judge.evaluate(
            user_input="Notiz zu Max: heute wiederholt gestoert",
            detected_intent="student_observation",
            response="Verstanden, ich habe mir das notiert.",
            expected_intent="student_observation",
        )

        assert result.intent_accuracy >= 80
        assert result.faithfulness >= 3
        assert result.relevance >= 3
        assert result.coherence >= 3
        assert result.safety == "pass"
        assert result.composite_score >= 3.5

    @pytest.mark.asyncio
    @pytest.mark.parametrize("test_case", [
        {
            "id": "INT-001",
            "input": "Notiz zu Max: heute wiederholt gestoert",
            "expected_intent": "student_observation",
            "min_score": 3.5,
        },
        {
            "id": "INT-007",
            "input": "Nimm Vokabeln Lektion 4, mach 3 Lueckentexte",
            "expected_intent": "worksheet_generate",
            "min_score": 3.5,
        },
        {
            "id": "INT-013",
            "input": "Neutraler Elternbrief wegen wiederholter Stoerungen",
            "expected_intent": "parent_letter",
            "min_score": 3.5,
        },
    ], ids=lambda t: t["id"])
    async def test_sample_golden_cases(
        self,
        llm_judge: LLMJudge,
        voice_service_client,
        test_case: Dict[str, Any],
    ):
        """Run a sample golden case through the intent endpoint and the judge.

        The case text is posted to ``/api/v1/intent``. When the endpoint
        answers with a non-200 status, or the call raises, the test falls
        back to the expected intent and a canned reply so that the judge
        itself is still exercised. The judge's composite score must reach
        the case's ``min_score`` (3.5 when the case does not define one).
        """
        await self._skip_unless_judge_available(llm_judge)

        expected_intent = test_case["expected_intent"]
        # Resolve the threshold once so the comparison and the failure
        # message always agree, even for a case without "min_score".
        min_score = test_case.get("min_score", 3.5)

        # Call voice service intent endpoint
        try:
            response = await voice_service_client.post(
                "/api/v1/intent",
                json={"text": test_case["input"]},
            )
            if response.status_code != 200:
                # Service might not have this endpoint - use mock
                detected_intent = expected_intent
                response_text = "Verstanden."
            else:
                result = response.json()
                detected_intent = result.get("intent", "unknown")
                response_text = result.get("response", "Verstanden.")
        except Exception:
            # Deliberate best effort: any failure talking to the service
            # falls back to the expected values so the judge is still tested.
            detected_intent = expected_intent
            response_text = "Verstanden."

        # Evaluate with judge
        judge_result = await llm_judge.evaluate(
            user_input=test_case["input"],
            detected_intent=detected_intent,
            response=response_text,
            expected_intent=expected_intent,
        )

        assert judge_result.composite_score >= min_score, (
            f"Score {judge_result.composite_score} < {min_score}: "
            f"{judge_result.reasoning}"
        )
class TestIntentAccuracy:
    """Tests for intent detection accuracy."""

    @staticmethod
    async def _judge_utterances(llm_judge, utterances, intent, reply):
        """Judge each utterance against *intent* and require accuracy >= 70.

        Skips the calling test when the judge health check fails. The same
        *intent* is passed as both the detected and the expected intent.
        """
        if not await llm_judge.health_check():
            pytest.skip("LLM judge not available")

        for utterance in utterances:
            verdict = await llm_judge.evaluate(
                user_input=utterance,
                detected_intent=intent,
                response=reply,
                expected_intent=intent,
            )
            assert verdict.intent_accuracy >= 70, f"Failed for: {utterance}"

    @pytest.mark.asyncio
    async def test_student_observation_patterns(self, llm_judge: LLMJudge):
        """Check several phrasings judged as the student_observation intent."""
        await self._judge_utterances(
            llm_judge,
            (
                "Notiz zu Lisa: sehr aufmerksam heute",
                "Beobachtung Tim: braucht Hilfe bei Bruchrechnung",
                "Anna hat heute wiederholt gestört",
            ),
            "student_observation",
            "Notiz gespeichert.",
        )

    @pytest.mark.asyncio
    async def test_worksheet_generation_patterns(self, llm_judge: LLMJudge):
        """Check several phrasings judged as the worksheet_generate intent."""
        await self._judge_utterances(
            llm_judge,
            (
                "Erstelle Arbeitsblatt zu Bruchrechnung",
                "Mach mir 5 Aufgaben zu Vokabeln",
                "Ich brauche ein Uebungsblatt fuer Prozentrechnung",
            ),
            "worksheet_generate",
            "Ich erstelle das Arbeitsblatt.",
        )
class TestMetrics:
    """Tests for metrics calculation."""

    def test_metrics_from_results(self, sample_test_result: TestResult):
        """Aggregate a single result and check the counts and average score."""
        aggregated = BQASMetrics.from_results([sample_test_result])

        assert aggregated.total_tests == 1
        assert aggregated.passed_tests == 1
        assert aggregated.failed_tests == 0
        assert aggregated.avg_composite_score == sample_test_result.composite_score

    def test_metrics_empty_results(self):
        """Aggregating an empty list gives zero counts and a 0.0 average."""
        aggregated = BQASMetrics.from_results([])

        assert aggregated.total_tests == 0
        assert aggregated.passed_tests == 0
        assert aggregated.avg_composite_score == 0.0

    def test_metrics_summary(self, sample_test_result: TestResult):
        """The summary text contains the heading and the single-result counts."""
        report = BQASMetrics.from_results([sample_test_result]).summary()

        # Checked in the same order as the individual assertions they replace.
        for fragment in ("BQAS Test Run Summary", "Total Tests: 1", "Passed: 1"):
            assert fragment in report