Files
breakpilot-lehrer/voice-service/tests/bqas/test_golden.py
Benjamin Admin 9912997187
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m55s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 18s
refactor: Jitsi/Matrix/Voice von Core übernommen, Camunda/BPMN gelöscht, Kommunikation-Nav
- Voice-Service von Core nach Lehrer verschoben (bp-lehrer-voice-service)
- 4 Jitsi-Services + 2 Synapse-Services in docker-compose.yml aufgenommen
- Camunda komplett gelöscht: workflow pages, workflow-config.ts, bpmn-js deps
- CAMUNDA_URL aus backend-lehrer environment entfernt
- Sidebar: Kategorie "Compliance SDK" + "Katalogverwaltung" entfernt
- Sidebar: Neue Kategorie "Kommunikation" mit Video & Chat, Voice Service, Alerts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-05 17:01:47 +01:00

188 lines
6.5 KiB
Python

"""
Golden Suite Tests
Tests against validated reference test cases
"""
import pytest
from typing import Dict, Any, List
from bqas.judge import LLMJudge
from bqas.metrics import TestResult, BQASMetrics
class TestGoldenSuite:
    """Tests using the golden test suite.

    Every test first verifies the LLM judge backend is reachable and
    skips gracefully otherwise, so runs without Ollama stay green.
    """

    @staticmethod
    async def _require_judge(
        llm_judge: LLMJudge,
        reason: str = "LLM judge not available",
    ) -> None:
        """Skip the calling test when the LLM judge backend is unreachable."""
        if not await llm_judge.health_check():
            pytest.skip(reason)

    @pytest.mark.asyncio
    async def test_judge_available(self, llm_judge: LLMJudge):
        """Verify LLM judge is available."""
        await self._require_judge(
            llm_judge,
            "LLM judge not available (Ollama not running or model not loaded)",
        )

    @pytest.mark.asyncio
    async def test_single_intent_evaluation(self, llm_judge: LLMJudge):
        """Test single intent evaluation against a known-good example."""
        await self._require_judge(llm_judge)
        result = await llm_judge.evaluate(
            user_input="Notiz zu Max: heute wiederholt gestoert",
            detected_intent="student_observation",
            response="Verstanden, ich habe mir das notiert.",
            expected_intent="student_observation",
        )
        # Thresholds mirror the golden-suite acceptance criteria.
        assert result.intent_accuracy >= 80
        assert result.faithfulness >= 3
        assert result.relevance >= 3
        assert result.coherence >= 3
        assert result.safety == "pass"
        assert result.composite_score >= 3.5

    @pytest.mark.asyncio
    @pytest.mark.parametrize("test_case", [
        {
            "id": "INT-001",
            "input": "Notiz zu Max: heute wiederholt gestoert",
            "expected_intent": "student_observation",
            "min_score": 3.5,
        },
        {
            "id": "INT-007",
            "input": "Nimm Vokabeln Lektion 4, mach 3 Lueckentexte",
            "expected_intent": "worksheet_generate",
            "min_score": 3.5,
        },
        {
            "id": "INT-013",
            "input": "Neutraler Elternbrief wegen wiederholter Stoerungen",
            "expected_intent": "parent_letter",
            "min_score": 3.5,
        },
    ], ids=lambda t: t["id"])
    async def test_sample_golden_cases(
        self,
        llm_judge: LLMJudge,
        voice_service_client,
        test_case: Dict[str, Any],
    ):
        """Run sample golden cases through the voice service and judge.

        If the intent endpoint is missing or unreachable, the expected
        values are substituted so the judge itself is still exercised.
        """
        await self._require_judge(llm_judge)
        # Call voice service intent endpoint
        try:
            response = await voice_service_client.post(
                "/api/v1/intent",
                json={"text": test_case["input"]},
            )
            if response.status_code != 200:
                # Service might not have this endpoint - use mock
                detected_intent = test_case["expected_intent"]
                response_text = "Verstanden."
            else:
                result = response.json()
                detected_intent = result.get("intent", "unknown")
                response_text = result.get("response", "Verstanden.")
        except Exception:
            # Use expected values for testing judge itself
            detected_intent = test_case["expected_intent"]
            response_text = "Verstanden."
        # Evaluate with judge
        judge_result = await llm_judge.evaluate(
            user_input=test_case["input"],
            detected_intent=detected_intent,
            response=response_text,
            expected_intent=test_case["expected_intent"],
        )
        # Bind the threshold once so the assertion and its failure message
        # agree even when a case omits "min_score" (original indexed the
        # dict directly in the message, risking KeyError on failure).
        min_score = test_case.get("min_score", 3.5)
        assert judge_result.composite_score >= min_score, \
            f"Score {judge_result.composite_score} < {min_score}: {judge_result.reasoning}"
class TestIntentAccuracy:
    """Tests for intent detection accuracy."""

    @pytest.mark.asyncio
    async def test_student_observation_patterns(self, llm_judge: LLMJudge):
        """Test student observation intent patterns."""
        if not await llm_judge.health_check():
            pytest.skip("LLM judge not available")
        # Representative teacher utterances that should map to
        # the student_observation intent.
        observation_inputs = (
            "Notiz zu Lisa: sehr aufmerksam heute",
            "Beobachtung Tim: braucht Hilfe bei Bruchrechnung",
            "Anna hat heute wiederholt gestört",
        )
        for utterance in observation_inputs:
            evaluation = await llm_judge.evaluate(
                user_input=utterance,
                detected_intent="student_observation",
                response="Notiz gespeichert.",
                expected_intent="student_observation",
            )
            assert evaluation.intent_accuracy >= 70, f"Failed for: {utterance}"

    @pytest.mark.asyncio
    async def test_worksheet_generation_patterns(self, llm_judge: LLMJudge):
        """Test worksheet generation intent patterns."""
        if not await llm_judge.health_check():
            pytest.skip("LLM judge not available")
        # Representative requests that should map to worksheet_generate.
        worksheet_inputs = (
            "Erstelle Arbeitsblatt zu Bruchrechnung",
            "Mach mir 5 Aufgaben zu Vokabeln",
            "Ich brauche ein Uebungsblatt fuer Prozentrechnung",
        )
        for utterance in worksheet_inputs:
            evaluation = await llm_judge.evaluate(
                user_input=utterance,
                detected_intent="worksheet_generate",
                response="Ich erstelle das Arbeitsblatt.",
                expected_intent="worksheet_generate",
            )
            assert evaluation.intent_accuracy >= 70, f"Failed for: {utterance}"
class TestMetrics:
    """Tests for metrics calculation."""

    def test_metrics_from_results(self, sample_test_result: TestResult):
        """Test metrics calculation from results."""
        aggregated = BQASMetrics.from_results([sample_test_result])
        # A single passing result: counts reflect it, average equals its score.
        assert aggregated.total_tests == 1
        assert aggregated.passed_tests == 1
        assert aggregated.failed_tests == 0
        assert aggregated.avg_composite_score == sample_test_result.composite_score

    def test_metrics_empty_results(self):
        """Test metrics with empty results."""
        # No results at all must yield zeroed counters, not an error.
        empty = BQASMetrics.from_results([])
        assert empty.total_tests == 0
        assert empty.passed_tests == 0
        assert empty.avg_composite_score == 0.0

    def test_metrics_summary(self, sample_test_result: TestResult):
        """Test metrics summary generation."""
        report = BQASMetrics.from_results([sample_test_result]).summary()
        assert "BQAS Test Run Summary" in report
        assert "Total Tests: 1" in report
        assert "Passed: 1" in report