This repository has been archived on 2026-02-15. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
BreakPilot Dev 19855efacc
Some checks failed
Tests / Go Tests (push) Has been cancelled
Tests / Python Tests (push) Has been cancelled
Tests / Integration Tests (push) Has been cancelled
Tests / Go Lint (push) Has been cancelled
Tests / Python Lint (push) Has been cancelled
Tests / Security Scan (push) Has been cancelled
Tests / All Checks Passed (push) Has been cancelled
Security Scanning / Secret Scanning (push) Has been cancelled
Security Scanning / Dependency Vulnerability Scan (push) Has been cancelled
Security Scanning / Go Security Scan (push) Has been cancelled
Security Scanning / Python Security Scan (push) Has been cancelled
Security Scanning / Node.js Security Scan (push) Has been cancelled
Security Scanning / Docker Image Security (push) Has been cancelled
Security Scanning / Security Summary (push) Has been cancelled
CI/CD Pipeline / Go Tests (push) Has been cancelled
CI/CD Pipeline / Python Tests (push) Has been cancelled
CI/CD Pipeline / Website Tests (push) Has been cancelled
CI/CD Pipeline / Linting (push) Has been cancelled
CI/CD Pipeline / Security Scan (push) Has been cancelled
CI/CD Pipeline / Docker Build & Push (push) Has been cancelled
CI/CD Pipeline / Integration Tests (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / CI Summary (push) Has been cancelled
ci/woodpecker/manual/build-ci-image Pipeline was successful
ci/woodpecker/manual/main Pipeline failed
feat: BreakPilot PWA - Full codebase (clean push without large binaries)
All services: admin-v2, studio-v2, website, ai-compliance-sdk,
consent-service, klausur-service, voice-service, and infrastructure.
Large PDFs and compiled binaries excluded via .gitignore.
2026-02-11 13:25:58 +01:00

188 lines
6.5 KiB
Python

"""
Golden Suite Tests
Tests against validated reference test cases
"""
import pytest
from typing import Dict, Any, List
from bqas.judge import LLMJudge
from bqas.metrics import TestResult, BQASMetrics
class TestGoldenSuite:
    """Tests using the golden test suite.

    Each test first checks that the LLM judge backend is reachable and
    skips (rather than fails) when it is not, so the suite degrades
    gracefully on machines without Ollama running.
    """

    @pytest.mark.asyncio
    async def test_judge_available(self, llm_judge: LLMJudge):
        """Verify LLM judge is available."""
        is_available = await llm_judge.health_check()
        if not is_available:
            pytest.skip("LLM judge not available (Ollama not running or model not loaded)")

    @pytest.mark.asyncio
    async def test_single_intent_evaluation(self, llm_judge: LLMJudge):
        """Test single intent evaluation.

        Sends one known-good observation utterance through the judge and
        asserts every rubric dimension clears its minimum threshold.
        """
        is_available = await llm_judge.health_check()
        if not is_available:
            pytest.skip("LLM judge not available")
        result = await llm_judge.evaluate(
            user_input="Notiz zu Max: heute wiederholt gestoert",
            detected_intent="student_observation",
            response="Verstanden, ich habe mir das notiert.",
            expected_intent="student_observation",
        )
        # Rubric thresholds: accuracy is on a 0-100 scale, the quality
        # dimensions on 1-5, safety is a pass/fail verdict.
        assert result.intent_accuracy >= 80
        assert result.faithfulness >= 3
        assert result.relevance >= 3
        assert result.coherence >= 3
        assert result.safety == "pass"
        assert result.composite_score >= 3.5

    @pytest.mark.asyncio
    @pytest.mark.parametrize("test_case", [
        {
            "id": "INT-001",
            "input": "Notiz zu Max: heute wiederholt gestoert",
            "expected_intent": "student_observation",
            "min_score": 3.5,
        },
        {
            "id": "INT-007",
            "input": "Nimm Vokabeln Lektion 4, mach 3 Lueckentexte",
            "expected_intent": "worksheet_generate",
            "min_score": 3.5,
        },
        {
            "id": "INT-013",
            "input": "Neutraler Elternbrief wegen wiederholter Stoerungen",
            "expected_intent": "parent_letter",
            "min_score": 3.5,
        },
    ], ids=lambda t: t["id"])
    async def test_sample_golden_cases(
        self,
        llm_judge: LLMJudge,
        voice_service_client,
        test_case: Dict[str, Any],
    ):
        """Test sample golden cases.

        Tries the live voice-service intent endpoint first; when the
        endpoint is missing or the call fails, falls back to the expected
        values so the judge itself can still be exercised.
        """
        is_available = await llm_judge.health_check()
        if not is_available:
            pytest.skip("LLM judge not available")
        # Call voice service intent endpoint
        try:
            response = await voice_service_client.post(
                "/api/v1/intent",
                json={"text": test_case["input"]},
            )
            if response.status_code != 200:
                # Service might not have this endpoint - use mock
                detected_intent = test_case["expected_intent"]
                response_text = "Verstanden."
            else:
                result = response.json()
                detected_intent = result.get("intent", "unknown")
                response_text = result.get("response", "Verstanden.")
        except Exception:
            # Use expected values for testing judge itself
            detected_intent = test_case["expected_intent"]
            response_text = "Verstanden."
        # Evaluate with judge
        judge_result = await llm_judge.evaluate(
            user_input=test_case["input"],
            detected_intent=detected_intent,
            response=response_text,
            expected_intent=test_case["expected_intent"],
        )
        # Bind the threshold once: previously the assert used
        # .get("min_score", 3.5) but the failure message indexed
        # test_case["min_score"] directly, which would raise KeyError
        # (masking the real failure) for a case without "min_score".
        min_score = test_case.get("min_score", 3.5)
        assert judge_result.composite_score >= min_score, \
            f"Score {judge_result.composite_score} < {min_score}: {judge_result.reasoning}"
class TestIntentAccuracy:
    """Tests for intent detection accuracy."""

    @pytest.mark.asyncio
    async def test_student_observation_patterns(self, llm_judge: LLMJudge):
        """Test student observation intent patterns.

        Runs several phrasings of a student-observation utterance through
        the judge; each must reach at least 70/100 intent accuracy.
        """
        if not await llm_judge.health_check():
            pytest.skip("LLM judge not available")
        utterances = [
            "Notiz zu Lisa: sehr aufmerksam heute",
            "Beobachtung Tim: braucht Hilfe bei Bruchrechnung",
            "Anna hat heute wiederholt gestört",
        ]
        for utterance in utterances:
            verdict = await llm_judge.evaluate(
                user_input=utterance,
                detected_intent="student_observation",
                response="Notiz gespeichert.",
                expected_intent="student_observation",
            )
            assert verdict.intent_accuracy >= 70, f"Failed for: {utterance}"

    @pytest.mark.asyncio
    async def test_worksheet_generation_patterns(self, llm_judge: LLMJudge):
        """Test worksheet generation intent patterns.

        Same structure as the observation test, but for worksheet
        generation phrasings.
        """
        if not await llm_judge.health_check():
            pytest.skip("LLM judge not available")
        utterances = [
            "Erstelle Arbeitsblatt zu Bruchrechnung",
            "Mach mir 5 Aufgaben zu Vokabeln",
            "Ich brauche ein Uebungsblatt fuer Prozentrechnung",
        ]
        for utterance in utterances:
            verdict = await llm_judge.evaluate(
                user_input=utterance,
                detected_intent="worksheet_generate",
                response="Ich erstelle das Arbeitsblatt.",
                expected_intent="worksheet_generate",
            )
            assert verdict.intent_accuracy >= 70, f"Failed for: {utterance}"
class TestMetrics:
    """Tests for metrics calculation."""

    def test_metrics_from_results(self, sample_test_result: TestResult):
        """Test metrics calculation from results.

        A single passing result should yield counts of 1/1/0 and an
        average composite score equal to that result's score.
        """
        metrics = BQASMetrics.from_results([sample_test_result])
        assert metrics.total_tests == 1
        assert metrics.passed_tests == 1
        assert metrics.failed_tests == 0
        assert metrics.avg_composite_score == sample_test_result.composite_score

    def test_metrics_empty_results(self):
        """Test metrics with empty results.

        An empty input must produce all-zero counts rather than raising
        (e.g. on a division by zero when averaging).
        """
        empty_metrics = BQASMetrics.from_results([])
        assert empty_metrics.total_tests == 0
        assert empty_metrics.passed_tests == 0
        assert empty_metrics.avg_composite_score == 0.0

    def test_metrics_summary(self, sample_test_result: TestResult):
        """Test metrics summary generation.

        The human-readable summary must contain the header line and the
        pass/total counts.
        """
        metrics = BQASMetrics.from_results([sample_test_result])
        report = metrics.summary()
        for expected_fragment in (
            "BQAS Test Run Summary",
            "Total Tests: 1",
            "Passed: 1",
        ):
            assert expected_fragment in report