refactor: Jitsi/Matrix/Voice von Core übernommen, Camunda/BPMN gelöscht, Kommunikation-Nav
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m55s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 18s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m55s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 18s
- Voice-Service von Core nach Lehrer verschoben (bp-lehrer-voice-service) - 4 Jitsi-Services + 2 Synapse-Services in docker-compose.yml aufgenommen - Camunda komplett gelöscht: workflow pages, workflow-config.ts, bpmn-js deps - CAMUNDA_URL aus backend-lehrer environment entfernt - Sidebar: Kategorie "Compliance SDK" + "Katalogverwaltung" entfernt - Sidebar: Neue Kategorie "Kommunikation" mit Video & Chat, Voice Service, Alerts Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
187
voice-service/tests/bqas/test_golden.py
Normal file
187
voice-service/tests/bqas/test_golden.py
Normal file
@@ -0,0 +1,187 @@
|
||||
"""
|
||||
Golden Suite Tests
|
||||
Tests against validated reference test cases
|
||||
"""
|
||||
import pytest
|
||||
from typing import Dict, Any, List
|
||||
|
||||
from bqas.judge import LLMJudge
|
||||
from bqas.metrics import TestResult, BQASMetrics
|
||||
|
||||
|
||||
class TestGoldenSuite:
    """Tests that exercise the golden (validated reference) test suite.

    Every test first checks whether the LLM judge backend is reachable and
    skips — rather than fails — when it is not, so the suite can run in CI
    environments without Ollama or the judge model loaded.
    """

    @pytest.mark.asyncio
    async def test_judge_available(self, llm_judge: LLMJudge):
        """Verify LLM judge is available."""
        is_available = await llm_judge.health_check()
        if not is_available:
            pytest.skip("LLM judge not available (Ollama not running or model not loaded)")

    @pytest.mark.asyncio
    async def test_single_intent_evaluation(self, llm_judge: LLMJudge):
        """Evaluate one known-good exchange and assert quality thresholds."""
        is_available = await llm_judge.health_check()
        if not is_available:
            pytest.skip("LLM judge not available")

        result = await llm_judge.evaluate(
            user_input="Notiz zu Max: heute wiederholt gestoert",
            detected_intent="student_observation",
            response="Verstanden, ich habe mir das notiert.",
            expected_intent="student_observation",
        )

        # Thresholds mirror the golden-suite acceptance criteria:
        # intent accuracy is a percentage, the quality dimensions are 1-5
        # scales, and the composite must clear the 3.5 pass bar.
        assert result.intent_accuracy >= 80
        assert result.faithfulness >= 3
        assert result.relevance >= 3
        assert result.coherence >= 3
        assert result.safety == "pass"
        assert result.composite_score >= 3.5

    @pytest.mark.asyncio
    @pytest.mark.parametrize("test_case", [
        {
            "id": "INT-001",
            "input": "Notiz zu Max: heute wiederholt gestoert",
            "expected_intent": "student_observation",
            "min_score": 3.5,
        },
        {
            "id": "INT-007",
            "input": "Nimm Vokabeln Lektion 4, mach 3 Lueckentexte",
            "expected_intent": "worksheet_generate",
            "min_score": 3.5,
        },
        {
            "id": "INT-013",
            "input": "Neutraler Elternbrief wegen wiederholter Stoerungen",
            "expected_intent": "parent_letter",
            "min_score": 3.5,
        },
    ], ids=lambda t: t["id"])
    async def test_sample_golden_cases(
        self,
        llm_judge: LLMJudge,
        voice_service_client,
        test_case: Dict[str, Any],
    ):
        """Run a sample golden case end-to-end through the voice service.

        Falls back to the expected intent (judging the judge itself) when the
        voice service is unreachable or does not expose the intent endpoint.
        """
        is_available = await llm_judge.health_check()
        if not is_available:
            pytest.skip("LLM judge not available")

        # Call voice service intent endpoint
        try:
            response = await voice_service_client.post(
                "/api/v1/intent",
                json={"text": test_case["input"]},
            )

            if response.status_code != 200:
                # Service might not have this endpoint - use mock
                detected_intent = test_case["expected_intent"]
                response_text = "Verstanden."
            else:
                result = response.json()
                detected_intent = result.get("intent", "unknown")
                response_text = result.get("response", "Verstanden.")

        except Exception:
            # Use expected values for testing judge itself
            detected_intent = test_case["expected_intent"]
            response_text = "Verstanden."

        # Evaluate with judge
        judge_result = await llm_judge.evaluate(
            user_input=test_case["input"],
            detected_intent=detected_intent,
            response=response_text,
            expected_intent=test_case["expected_intent"],
        )

        # BUG FIX: the assertion used .get("min_score", 3.5) while the failure
        # message indexed test_case["min_score"] directly, raising KeyError for
        # any case without an explicit min_score. Bind the threshold once so
        # both the check and the message agree.
        min_score = test_case.get("min_score", 3.5)
        assert judge_result.composite_score >= min_score, \
            f"Score {judge_result.composite_score} < {min_score}: {judge_result.reasoning}"
|
||||
class TestIntentAccuracy:
    """Intent-detection accuracy checks against representative phrasings.

    Each test feeds several German utterances for one intent through the LLM
    judge and requires at least 70% intent accuracy per utterance. Skips when
    the judge backend is unreachable.
    """

    @pytest.mark.asyncio
    async def test_student_observation_patterns(self, llm_judge: LLMJudge):
        """Several phrasings of a student observation must all be recognized."""
        if not await llm_judge.health_check():
            pytest.skip("LLM judge not available")

        utterances = (
            "Notiz zu Lisa: sehr aufmerksam heute",
            "Beobachtung Tim: braucht Hilfe bei Bruchrechnung",
            "Anna hat heute wiederholt gestört",
        )

        for utterance in utterances:
            verdict = await llm_judge.evaluate(
                user_input=utterance,
                detected_intent="student_observation",
                response="Notiz gespeichert.",
                expected_intent="student_observation",
            )
            assert verdict.intent_accuracy >= 70, f"Failed for: {utterance}"

    @pytest.mark.asyncio
    async def test_worksheet_generation_patterns(self, llm_judge: LLMJudge):
        """Several phrasings of a worksheet request must all be recognized."""
        if not await llm_judge.health_check():
            pytest.skip("LLM judge not available")

        utterances = (
            "Erstelle Arbeitsblatt zu Bruchrechnung",
            "Mach mir 5 Aufgaben zu Vokabeln",
            "Ich brauche ein Uebungsblatt fuer Prozentrechnung",
        )

        for utterance in utterances:
            verdict = await llm_judge.evaluate(
                user_input=utterance,
                detected_intent="worksheet_generate",
                response="Ich erstelle das Arbeitsblatt.",
                expected_intent="worksheet_generate",
            )
            assert verdict.intent_accuracy >= 70, f"Failed for: {utterance}"
|
||||
class TestMetrics:
    """Unit tests for BQASMetrics aggregation and summary rendering.

    These are synchronous and need no judge backend — they only exercise the
    pure metrics-calculation layer.
    """

    def test_metrics_from_results(self, sample_test_result: TestResult):
        """Aggregating a single passing result yields matching counters."""
        aggregated = BQASMetrics.from_results([sample_test_result])

        assert aggregated.total_tests == 1
        assert aggregated.passed_tests == 1
        assert aggregated.failed_tests == 0
        # With one result, the average equals that result's composite score.
        assert aggregated.avg_composite_score == sample_test_result.composite_score

    def test_metrics_empty_results(self):
        """An empty result list produces zeroed metrics (no division error)."""
        aggregated = BQASMetrics.from_results([])

        assert aggregated.total_tests == 0
        assert aggregated.passed_tests == 0
        assert aggregated.avg_composite_score == 0.0

    def test_metrics_summary(self, sample_test_result: TestResult):
        """The rendered summary text carries the headline counts."""
        aggregated = BQASMetrics.from_results([sample_test_result])
        rendered = aggregated.summary()

        for expected_fragment in (
            "BQAS Test Run Summary",
            "Total Tests: 1",
            "Passed: 1",
        ):
            assert expected_fragment in rendered
||||
Reference in New Issue
Block a user