refactor: Jitsi/Matrix/Voice von Core übernommen, Camunda/BPMN gelöscht, Kommunikation-Nav
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m55s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 18s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m55s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 18s
- Voice-Service von Core nach Lehrer verschoben (bp-lehrer-voice-service) - 4 Jitsi-Services + 2 Synapse-Services in docker-compose.yml aufgenommen - Camunda komplett gelöscht: workflow pages, workflow-config.ts, bpmn-js deps - CAMUNDA_URL aus backend-lehrer environment entfernt - Sidebar: Kategorie "Compliance SDK" + "Katalogverwaltung" entfernt - Sidebar: Neue Kategorie "Kommunikation" mit Video & Chat, Voice Service, Alerts Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
412
voice-service/tests/bqas/test_rag.py
Normal file
412
voice-service/tests/bqas/test_rag.py
Normal file
@@ -0,0 +1,412 @@
|
||||
"""
|
||||
RAG/Correction Tests
|
||||
Tests for RAG retrieval quality, operator alignment, and correction workflows
|
||||
"""
|
||||
import pytest
|
||||
import yaml
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any, List
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from bqas.rag_judge import RAGJudge
|
||||
from bqas.metrics import BQASMetrics, TestResult
|
||||
from bqas.config import BQASConfig
|
||||
|
||||
|
||||
def load_rag_tests() -> List[Dict[str, Any]]:
    """Load RAG test cases (regular tests and edge cases) from the golden YAML file.

    Returns:
        A flat list of test-case dicts collected from the ``tests`` and
        ``edge_cases`` keys of every YAML document in the file, or an empty
        list when the file does not exist (so pytest collection does not
        fail on a partial checkout).
    """
    yaml_path = Path(__file__).parent / "golden_tests" / "golden_rag_correction_v1.yaml"

    if not yaml_path.exists():
        return []

    # Fixed: open with an explicit encoding (decoding was locale-dependent,
    # pylint W1514) and stream the handle straight into safe_load_all instead
    # of materializing the whole file as a string first.
    # The file may contain multiple YAML documents separated by '---'.
    with open(yaml_path, encoding="utf-8") as f:
        documents = list(yaml.safe_load_all(f))

    tests: List[Dict[str, Any]] = []
    for doc in documents:
        # A document may be None (empty doc) or lack either key.
        if doc and "tests" in doc:
            tests.extend(doc["tests"])
        if doc and "edge_cases" in doc:
            tests.extend(doc["edge_cases"])

    return tests
|
||||
|
||||
|
||||
RAG_TESTS = load_rag_tests()
# Loaded once at import time so the pytest.mark.parametrize decorators
# below can filter the cases by category / id.
RAG_TESTS = load_rag_tests()
class TestRAGJudge:
    """Tests for RAG Judge functionality."""

    @pytest.fixture
    def rag_judge(self) -> RAGJudge:
        """Create RAG judge instance."""
        return RAGJudge(config=BQASConfig.from_env())

    @pytest.mark.asyncio
    async def test_judge_available(self, rag_judge: RAGJudge):
        """Verify RAG judge is available."""
        if not await rag_judge.health_check():
            pytest.skip("RAG judge not available (Ollama not running or model not loaded)")

    @pytest.mark.asyncio
    async def test_retrieval_evaluation(self, rag_judge: RAGJudge):
        """Test retrieval evaluation."""
        if not await rag_judge.health_check():
            pytest.skip("RAG judge not available")

        result = await rag_judge.evaluate_retrieval(
            query="Welche Kriterien gelten fuer die Sachtextanalyse?",
            aufgabentyp="textanalyse_pragmatisch",
            subject="Deutsch",
            level="Abitur",
            retrieved_passage=(
                "Bei der Sachtextanalyse sind Textsorte, Intention, "
                "Adressaten und sprachliche Mittel zu beachten."
            ),
            expected_concepts=["Textsorte", "Intention", "Adressaten", "sprachliche Mittel"],
        )

        # Precision is percent-scaled, faithfulness is a 1-5 Likert score.
        assert 0 <= result.retrieval_precision <= 100
        assert 1 <= result.faithfulness <= 5
        assert result.composite_score >= 0

    @pytest.mark.asyncio
    async def test_operator_evaluation(self, rag_judge: RAGJudge):
        """Test operator alignment evaluation."""
        if not await rag_judge.health_check():
            pytest.skip("RAG judge not available")

        result = await rag_judge.evaluate_operator(
            operator="analysieren",
            generated_definition=(
                "Unter bestimmten Aspekten Materialien untersuchen "
                "und systematisch auswerten."
            ),
            expected_afb="II",
            expected_actions=["untersuchen", "zerlegen", "Zusammenhaenge herstellen"],
        )

        assert 0 <= result.operator_alignment <= 100
        # Empty string means the judge could not detect an AFB level.
        assert result.detected_afb in ("I", "II", "III", "")
        assert result.composite_score >= 0

    @pytest.mark.asyncio
    async def test_hallucination_evaluation(self, rag_judge: RAGJudge):
        """Test hallucination control evaluation."""
        if not await rag_judge.health_check():
            pytest.skip("RAG judge not available")

        known_facts = [
            "EH-Passage 3.2: Analyse der Argumentationsstruktur erwartet",
            "EH-Passage 3.3: Beurteilung der Ueberzeugungskraft",
        ]
        result = await rag_judge.evaluate_hallucination(
            query="Was sagt der Erwartungshorizont zu Aufgabe 1?",
            response="Laut EH-Passage 3.2 sollen Schueler die Argumentation analysieren.",
            available_facts=known_facts,
        )

        assert 0 <= result.grounding_score <= 100
        assert result.invention_detection in ("pass", "fail")
        assert result.composite_score >= 0

    @pytest.mark.asyncio
    async def test_privacy_evaluation(self, rag_judge: RAGJudge):
        """Test privacy/DSGVO evaluation."""
        if not await rag_judge.health_check():
            pytest.skip("RAG judge not available")

        pii_context = {
            "student_name": "Max Mueller",
            "student_ref": "STUD_A3F2",
        }
        result = await rag_judge.evaluate_privacy(
            query="Bewerte diese Arbeit",
            context=pii_context,
            response="Die Arbeit von STUD_A3F2 zeigt gute Analysefaehigkeiten.",
        )

        assert result.privacy_compliance in ("pass", "fail")
        assert 1 <= result.anonymization <= 5
        assert result.dsgvo_compliance in ("pass", "fail")
        assert result.composite_score >= 0

    @pytest.mark.asyncio
    async def test_namespace_evaluation(self, rag_judge: RAGJudge):
        """Test namespace isolation evaluation."""
        if not await rag_judge.health_check():
            pytest.skip("RAG judge not available")

        result = await rag_judge.evaluate_namespace(
            teacher_id="teacher_001",
            namespace="ns_teacher_001",
            school_id="school_xyz",
            requested_data="Zeig mir alle Klausuren",
            response="Hier sind 3 Klausuren aus Ihrem Namespace.",
        )

        assert result.namespace_compliance in ("pass", "fail")
        assert result.cross_tenant_leak in ("pass", "fail")
        assert 1 <= result.school_sharing_compliance <= 5
        assert result.composite_score >= 0
|
||||
|
||||
|
||||
class TestRAGRetrievalSuite:
    """Tests for EH retrieval quality."""

    @pytest.fixture
    def rag_judge(self) -> RAGJudge:
        """Create RAG judge instance."""
        config = BQASConfig.from_env()
        return RAGJudge(config=config)

    @pytest.mark.asyncio
    @pytest.mark.parametrize(
        "test_case",
        [t for t in RAG_TESTS if t.get("category") == "eh_retrieval"],
        ids=lambda t: t.get("id", "UNKNOWN"),
    )
    async def test_eh_retrieval(self, test_case: Dict[str, Any], rag_judge: RAGJudge):
        """Test EH retrieval quality for one golden test case.

        The service response is mocked, so this exercises the judge
        mechanics rather than real retrieval quality; the per-case
        ``min_score`` threshold is therefore deliberately not enforced.
        """
        is_available = await rag_judge.health_check()
        if not is_available:
            pytest.skip("RAG judge not available")

        # Mock service response (in real tests, this would call the actual service)
        mock_response = {
            "passage": "Mocked passage with relevant content.",
            "source": "EH_Test.pdf",
        }

        result = await rag_judge.evaluate_rag_test_case(test_case, mock_response)

        # Fixed: removed the unused `min_score` local (dead code, lint F841) —
        # the threshold is meaningless against a mocked response and was
        # never applied.
        assert result.composite_score >= 0, f"Score should be non-negative: {result.reasoning}"
|
||||
|
||||
|
||||
class TestRAGOperatorSuite:
    """Tests for operator alignment."""

    @pytest.fixture
    def rag_judge(self) -> RAGJudge:
        """Create RAG judge instance."""
        return RAGJudge(config=BQASConfig.from_env())

    @pytest.mark.asyncio
    @pytest.mark.parametrize(
        "test_case",
        [t for t in RAG_TESTS if t.get("category") == "operator_alignment"],
        ids=lambda t: t.get("id", "UNKNOWN"),
    )
    async def test_operator_alignment(self, test_case: Dict[str, Any], rag_judge: RAGJudge):
        """Test operator alignment."""
        if not await rag_judge.health_check():
            pytest.skip("RAG judge not available")

        # Stubbed service payload — exercises judge mechanics, not a live call.
        stub_payload = {
            "definition": "Unter bestimmten Aspekten untersuchen.",
            "afb": "II",
        }

        verdict = await rag_judge.evaluate_rag_test_case(test_case, stub_payload)

        assert verdict.composite_score >= 0, f"Score should be non-negative: {verdict.reasoning}"
|
||||
|
||||
|
||||
class TestRAGHallucinationControl:
    """Tests for hallucination control."""

    @pytest.fixture
    def rag_judge(self) -> RAGJudge:
        """Create RAG judge instance."""
        return RAGJudge(config=BQASConfig.from_env())

    @pytest.mark.asyncio
    @pytest.mark.parametrize(
        "test_case",
        [t for t in RAG_TESTS if t.get("category") == "hallucination_control"],
        ids=lambda t: t.get("id", "UNKNOWN"),
    )
    async def test_hallucination_control(self, test_case: Dict[str, Any], rag_judge: RAGJudge):
        """Test hallucination control."""
        if not await rag_judge.health_check():
            pytest.skip("RAG judge not available")

        # Stubbed service payload — exercises judge mechanics, not a live call.
        stub_payload = {"response": "Basierend auf den verfuegbaren Daten..."}

        verdict = await rag_judge.evaluate_rag_test_case(test_case, stub_payload)

        assert verdict.composite_score >= 0, f"Score should be non-negative: {verdict.reasoning}"
|
||||
|
||||
|
||||
class TestRAGPrivacyCompliance:
    """Tests for privacy/DSGVO compliance."""

    @pytest.fixture
    def rag_judge(self) -> RAGJudge:
        """Create RAG judge instance."""
        return RAGJudge(config=BQASConfig.from_env())

    @pytest.mark.asyncio
    @pytest.mark.parametrize(
        "test_case",
        [t for t in RAG_TESTS if t.get("category") == "privacy_compliance"],
        ids=lambda t: t.get("id", "UNKNOWN"),
    )
    async def test_privacy_compliance(self, test_case: Dict[str, Any], rag_judge: RAGJudge):
        """Test privacy compliance."""
        if not await rag_judge.health_check():
            pytest.skip("RAG judge not available")

        # Stubbed service payload — exercises judge mechanics, not a live call.
        stub_payload = {"response": "Anonymisierte Bewertung fuer Schueler-Referenz."}

        verdict = await rag_judge.evaluate_rag_test_case(test_case, stub_payload)

        assert verdict.composite_score >= 0, f"Score should be non-negative: {verdict.reasoning}"
|
||||
|
||||
|
||||
class TestRAGNamespaceIsolation:
    """Tests for namespace isolation."""

    @pytest.fixture
    def rag_judge(self) -> RAGJudge:
        """Create RAG judge instance."""
        return RAGJudge(config=BQASConfig.from_env())

    @pytest.mark.asyncio
    @pytest.mark.parametrize(
        "test_case",
        [t for t in RAG_TESTS if t.get("category") == "namespace_isolation"],
        ids=lambda t: t.get("id", "UNKNOWN"),
    )
    async def test_namespace_isolation(self, test_case: Dict[str, Any], rag_judge: RAGJudge):
        """Test namespace isolation."""
        if not await rag_judge.health_check():
            pytest.skip("RAG judge not available")

        # Stubbed service payload — exercises judge mechanics, not a live call.
        stub_payload = {"response": "Daten aus Ihrem Namespace."}

        verdict = await rag_judge.evaluate_rag_test_case(test_case, stub_payload)

        assert verdict.composite_score >= 0, f"Score should be non-negative: {verdict.reasoning}"
|
||||
|
||||
|
||||
class TestRAGMetrics:
    """Tests for RAG metrics calculation."""

    @staticmethod
    def _make_result(**overrides) -> TestResult:
        """Build a TestResult with baseline fields, applying per-test overrides."""
        fields: Dict[str, Any] = {
            "test_id": "RAG-001",
            "test_name": "Test 1",
            "user_input": "query",
            "expected_intent": "eh_retrieval",
            "detected_intent": "eh_retrieval",
            "response": "passage",
            "intent_accuracy": 80,
            "faithfulness": 4,
            "relevance": 4,
            "coherence": 4,
            "safety": "pass",
            "composite_score": 4.2,
            "passed": True,
            "reasoning": "Good retrieval",
            "timestamp": datetime.now(timezone.utc),
            "duration_ms": 100,
        }
        fields.update(overrides)
        return TestResult(**fields)

    def test_metrics_from_rag_results(self):
        """Test metrics calculation from RAG results."""
        results = [
            self._make_result(),
            self._make_result(
                test_id="RAG-002",
                test_name="Test 2",
                expected_intent="operator_alignment",
                detected_intent="operator_alignment",
                response="definition",
                intent_accuracy=70,
                faithfulness=3,
                composite_score=3.5,
                reasoning="Acceptable",
            ),
        ]

        metrics = BQASMetrics.from_results(results)

        assert metrics.total_tests == 2
        assert metrics.passed_tests == 2
        assert metrics.failed_tests == 0
        assert metrics.avg_composite_score > 0

    def test_metrics_with_failures(self):
        """Test metrics with failed tests."""
        failing = self._make_result(
            expected_intent="privacy_compliance",
            detected_intent="privacy_compliance",
            response="response with PII",
            intent_accuracy=30,
            faithfulness=2,
            relevance=2,
            coherence=2,
            safety="fail",
            composite_score=2.0,
            passed=False,
            reasoning="PII leak detected",
        )

        metrics = BQASMetrics.from_results([failing])

        assert metrics.total_tests == 1
        assert metrics.passed_tests == 0
        assert metrics.failed_tests == 1
        assert "RAG-001" in metrics.failed_test_ids
|
||||
|
||||
|
||||
class TestRAGEdgeCases:
    """Tests for RAG edge cases."""

    @pytest.fixture
    def rag_judge(self) -> RAGJudge:
        """Create RAG judge instance."""
        config = BQASConfig.from_env()
        return RAGJudge(config=config)

    @pytest.mark.asyncio
    @pytest.mark.parametrize(
        "test_case",
        [t for t in RAG_TESTS if "EDGE" in t.get("id", "")],
        ids=lambda t: t.get("id", "UNKNOWN"),
    )
    async def test_edge_cases(self, test_case: Dict[str, Any], rag_judge: RAGJudge):
        """Test RAG edge cases against a mocked service response.

        Edge cases may define a lower ``min_score`` (default would be 3.0),
        but with a mocked response only judge mechanics are exercised, so
        the threshold is deliberately not applied.
        """
        is_available = await rag_judge.health_check()
        if not is_available:
            pytest.skip("RAG judge not available")

        # Mock service response for edge cases
        mock_response = {
            "response": "Handling edge case...",
            "passage": "",
        }

        result = await rag_judge.evaluate_rag_test_case(test_case, mock_response)

        # Fixed: removed the unused `min_score` local (dead code, lint F841) —
        # it was computed but never compared against the result.
        assert result.composite_score >= 0, f"Score should be non-negative: {result.reasoning}"
|
||||
Reference in New Issue
Block a user