Initial commit: breakpilot-lehrer - Lehrer KI Platform

Services: Admin-Lehrer, Backend-Lehrer, Studio v2, Website,
Klausur-Service, School-Service, Voice-Service, Geo-Service,
BreakPilot Drive, Agent-Core

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Boenisch
2026-02-11 23:47:26 +01:00
commit 5a31f52310
1224 changed files with 425430 additions and 0 deletions

View File

@@ -0,0 +1,187 @@
"""
Golden Suite Tests
Tests against validated reference test cases
"""
import pytest
from typing import Dict, Any, List
from bqas.judge import LLMJudge
from bqas.metrics import TestResult, BQASMetrics
class TestGoldenSuite:
    """Tests using the golden test suite.

    Every test first verifies the LLM judge backend is reachable and skips
    (rather than fails) when it is not, so the suite degrades gracefully in
    environments without Ollama / the judge model loaded.
    """

    @pytest.mark.asyncio
    async def test_judge_available(self, llm_judge: LLMJudge):
        """Verify LLM judge is available."""
        is_available = await llm_judge.health_check()
        if not is_available:
            pytest.skip("LLM judge not available (Ollama not running or model not loaded)")

    @pytest.mark.asyncio
    async def test_single_intent_evaluation(self, llm_judge: LLMJudge):
        """Test single intent evaluation against the golden score thresholds."""
        is_available = await llm_judge.health_check()
        if not is_available:
            pytest.skip("LLM judge not available")
        result = await llm_judge.evaluate(
            user_input="Notiz zu Max: heute wiederholt gestoert",
            detected_intent="student_observation",
            response="Verstanden, ich habe mir das notiert.",
            expected_intent="student_observation",
        )
        # Thresholds mirror the golden-suite acceptance criteria.
        assert result.intent_accuracy >= 80
        assert result.faithfulness >= 3
        assert result.relevance >= 3
        assert result.coherence >= 3
        assert result.safety == "pass"
        assert result.composite_score >= 3.5

    @pytest.mark.asyncio
    @pytest.mark.parametrize("test_case", [
        {
            "id": "INT-001",
            "input": "Notiz zu Max: heute wiederholt gestoert",
            "expected_intent": "student_observation",
            "min_score": 3.5,
        },
        {
            "id": "INT-007",
            "input": "Nimm Vokabeln Lektion 4, mach 3 Lueckentexte",
            "expected_intent": "worksheet_generate",
            "min_score": 3.5,
        },
        {
            "id": "INT-013",
            "input": "Neutraler Elternbrief wegen wiederholter Stoerungen",
            "expected_intent": "parent_letter",
            "min_score": 3.5,
        },
    ], ids=lambda t: t["id"])
    async def test_sample_golden_cases(
        self,
        llm_judge: LLMJudge,
        voice_service_client,
        test_case: Dict[str, Any],
    ):
        """Test sample golden cases.

        Calls the voice-service intent endpoint for the case input, falls back
        to the expected intent when the endpoint is missing or unreachable
        (which still exercises the judge itself), then asserts the judge's
        composite score meets the case's minimum.
        """
        is_available = await llm_judge.health_check()
        if not is_available:
            pytest.skip("LLM judge not available")
        # Call voice service intent endpoint
        try:
            response = await voice_service_client.post(
                "/api/v1/intent",
                json={"text": test_case["input"]},
            )
            if response.status_code != 200:
                # Service might not have this endpoint - use mock
                detected_intent = test_case["expected_intent"]
                response_text = "Verstanden."
            else:
                result = response.json()
                detected_intent = result.get("intent", "unknown")
                response_text = result.get("response", "Verstanden.")
        except Exception:
            # Use expected values for testing judge itself
            detected_intent = test_case["expected_intent"]
            response_text = "Verstanden."
        # Evaluate with judge
        judge_result = await llm_judge.evaluate(
            user_input=test_case["input"],
            detected_intent=detected_intent,
            response=response_text,
            expected_intent=test_case["expected_intent"],
        )
        # Hoist the threshold once: previously the comparison used
        # .get("min_score", 3.5) while the failure message indexed
        # test_case['min_score'] directly, so a case without "min_score"
        # would raise KeyError while formatting the message instead of
        # reporting the score miss.
        min_score = test_case.get("min_score", 3.5)
        assert judge_result.composite_score >= min_score, \
            f"Score {judge_result.composite_score} < {min_score}: {judge_result.reasoning}"
class TestIntentAccuracy:
    """Tests for intent detection accuracy.

    Each test feeds several phrasing variants of the same intent to the LLM
    judge and requires an intent-accuracy score of at least 70 per variant.
    """

    @pytest.mark.asyncio
    async def test_student_observation_patterns(self, llm_judge: LLMJudge):
        """Test student observation intent patterns."""
        if not await llm_judge.health_check():
            pytest.skip("LLM judge not available")
        utterances = [
            "Notiz zu Lisa: sehr aufmerksam heute",
            "Beobachtung Tim: braucht Hilfe bei Bruchrechnung",
            "Anna hat heute wiederholt gestört",
        ]
        for utterance in utterances:
            verdict = await llm_judge.evaluate(
                user_input=utterance,
                detected_intent="student_observation",
                response="Notiz gespeichert.",
                expected_intent="student_observation",
            )
            assert verdict.intent_accuracy >= 70, f"Failed for: {utterance}"

    @pytest.mark.asyncio
    async def test_worksheet_generation_patterns(self, llm_judge: LLMJudge):
        """Test worksheet generation intent patterns."""
        if not await llm_judge.health_check():
            pytest.skip("LLM judge not available")
        utterances = [
            "Erstelle Arbeitsblatt zu Bruchrechnung",
            "Mach mir 5 Aufgaben zu Vokabeln",
            "Ich brauche ein Uebungsblatt fuer Prozentrechnung",
        ]
        for utterance in utterances:
            verdict = await llm_judge.evaluate(
                user_input=utterance,
                detected_intent="worksheet_generate",
                response="Ich erstelle das Arbeitsblatt.",
                expected_intent="worksheet_generate",
            )
            assert verdict.intent_accuracy >= 70, f"Failed for: {utterance}"
class TestMetrics:
    """Tests for metrics calculation.

    Pure-computation tests over BQASMetrics aggregation; no judge or
    network access required.
    """

    def test_metrics_from_results(self, sample_test_result: TestResult):
        """Test metrics calculation from results."""
        metrics = BQASMetrics.from_results([sample_test_result])
        # A single passing result: counters and average follow directly.
        assert metrics.total_tests == 1
        assert metrics.failed_tests == 0
        assert metrics.passed_tests == 1
        assert metrics.avg_composite_score == sample_test_result.composite_score

    def test_metrics_empty_results(self):
        """Test metrics with empty results."""
        empty = BQASMetrics.from_results([])
        # Empty input must not divide by zero; average defaults to 0.0.
        assert empty.total_tests == 0
        assert empty.passed_tests == 0
        assert empty.avg_composite_score == 0.0

    def test_metrics_summary(self, sample_test_result: TestResult):
        """Test metrics summary generation."""
        report = BQASMetrics.from_results([sample_test_result]).summary()
        expected_fragments = (
            "BQAS Test Run Summary",
            "Total Tests: 1",
            "Passed: 1",
        )
        for fragment in expected_fragments:
            assert fragment in report