"""
RAG/Correction Tests

Tests for RAG retrieval quality, operator alignment, and correction workflows.
"""
|
|
import pytest
|
|
import yaml
|
|
from pathlib import Path
|
|
from typing import Dict, Any, List
|
|
from datetime import datetime, timezone
|
|
|
|
from bqas.rag_judge import RAGJudge
|
|
from bqas.metrics import BQASMetrics, TestResult
|
|
from bqas.config import BQASConfig
|
|
|
|
|
|
def load_rag_tests() -> List[Dict[str, Any]]:
    """Load RAG test cases from the golden YAML file.

    Reads ``golden_tests/golden_rag_correction_v1.yaml`` located next to this
    test module and collects entries from both the ``tests`` and
    ``edge_cases`` sections of every YAML document in the file.

    Returns:
        A list of test-case dicts; empty if the YAML file does not exist.
    """
    yaml_path = Path(__file__).parent / "golden_tests" / "golden_rag_correction_v1.yaml"

    if not yaml_path.exists():
        return []

    # Explicit UTF-8: the golden file contains German-language content.
    content = yaml_path.read_text(encoding="utf-8")

    # The file may contain multiple YAML documents (separated by `---`).
    tests: List[Dict[str, Any]] = []
    for doc in yaml.safe_load_all(content):
        if not doc:
            continue
        tests.extend(doc.get("tests", []))
        tests.extend(doc.get("edge_cases", []))

    return tests
|
|
|
|
|
|
# Loaded once at import time; the parametrized suites below filter these by category.
RAG_TESTS = load_rag_tests()
|
|
|
|
|
|
class TestRAGJudge:
    """Tests for RAG Judge functionality."""

    @pytest.fixture
    def rag_judge(self) -> RAGJudge:
        """Build a judge from environment-derived configuration."""
        return RAGJudge(config=BQASConfig.from_env())

    @pytest.mark.asyncio
    async def test_judge_available(self, rag_judge: RAGJudge):
        """Verify RAG judge is available."""
        if not await rag_judge.health_check():
            pytest.skip("RAG judge not available (Ollama not running or model not loaded)")

    @pytest.mark.asyncio
    async def test_retrieval_evaluation(self, rag_judge: RAGJudge):
        """Test retrieval evaluation."""
        if not await rag_judge.health_check():
            pytest.skip("RAG judge not available")

        evaluation = await rag_judge.evaluate_retrieval(
            query="Welche Kriterien gelten fuer die Sachtextanalyse?",
            aufgabentyp="textanalyse_pragmatisch",
            subject="Deutsch",
            level="Abitur",
            retrieved_passage="Bei der Sachtextanalyse sind Textsorte, Intention, Adressaten und sprachliche Mittel zu beachten.",
            expected_concepts=["Textsorte", "Intention", "Adressaten", "sprachliche Mittel"],
        )

        # Precision is a 0-100 percentage; faithfulness a 1-5 rating.
        assert 0 <= evaluation.retrieval_precision <= 100
        assert 1 <= evaluation.faithfulness <= 5
        assert evaluation.composite_score >= 0

    @pytest.mark.asyncio
    async def test_operator_evaluation(self, rag_judge: RAGJudge):
        """Test operator alignment evaluation."""
        if not await rag_judge.health_check():
            pytest.skip("RAG judge not available")

        evaluation = await rag_judge.evaluate_operator(
            operator="analysieren",
            generated_definition="Unter bestimmten Aspekten Materialien untersuchen und systematisch auswerten.",
            expected_afb="II",
            expected_actions=["untersuchen", "zerlegen", "Zusammenhaenge herstellen"],
        )

        # Alignment is a 0-100 percentage; AFB levels are I-III or empty.
        assert 0 <= evaluation.operator_alignment <= 100
        assert evaluation.detected_afb in ["I", "II", "III", ""]
        assert evaluation.composite_score >= 0

    @pytest.mark.asyncio
    async def test_hallucination_evaluation(self, rag_judge: RAGJudge):
        """Test hallucination control evaluation."""
        if not await rag_judge.health_check():
            pytest.skip("RAG judge not available")

        evaluation = await rag_judge.evaluate_hallucination(
            query="Was sagt der Erwartungshorizont zu Aufgabe 1?",
            response="Laut EH-Passage 3.2 sollen Schueler die Argumentation analysieren.",
            available_facts=[
                "EH-Passage 3.2: Analyse der Argumentationsstruktur erwartet",
                "EH-Passage 3.3: Beurteilung der Ueberzeugungskraft",
            ],
        )

        assert 0 <= evaluation.grounding_score <= 100
        assert evaluation.invention_detection in ["pass", "fail"]
        assert evaluation.composite_score >= 0

    @pytest.mark.asyncio
    async def test_privacy_evaluation(self, rag_judge: RAGJudge):
        """Test privacy/DSGVO evaluation."""
        if not await rag_judge.health_check():
            pytest.skip("RAG judge not available")

        evaluation = await rag_judge.evaluate_privacy(
            query="Bewerte diese Arbeit",
            context={
                "student_name": "Max Mueller",
                "student_ref": "STUD_A3F2",
            },
            response="Die Arbeit von STUD_A3F2 zeigt gute Analysefaehigkeiten.",
        )

        assert evaluation.privacy_compliance in ["pass", "fail"]
        assert 1 <= evaluation.anonymization <= 5
        assert evaluation.dsgvo_compliance in ["pass", "fail"]
        assert evaluation.composite_score >= 0

    @pytest.mark.asyncio
    async def test_namespace_evaluation(self, rag_judge: RAGJudge):
        """Test namespace isolation evaluation."""
        if not await rag_judge.health_check():
            pytest.skip("RAG judge not available")

        evaluation = await rag_judge.evaluate_namespace(
            teacher_id="teacher_001",
            namespace="ns_teacher_001",
            school_id="school_xyz",
            requested_data="Zeig mir alle Klausuren",
            response="Hier sind 3 Klausuren aus Ihrem Namespace.",
        )

        assert evaluation.namespace_compliance in ["pass", "fail"]
        assert evaluation.cross_tenant_leak in ["pass", "fail"]
        assert 1 <= evaluation.school_sharing_compliance <= 5
        assert evaluation.composite_score >= 0
|
|
|
|
|
|
class TestRAGRetrievalSuite:
    """Tests for EH retrieval quality."""

    @pytest.fixture
    def rag_judge(self) -> RAGJudge:
        """Create RAG judge instance."""
        config = BQASConfig.from_env()
        return RAGJudge(config=config)

    @pytest.mark.asyncio
    @pytest.mark.parametrize(
        "test_case",
        [t for t in RAG_TESTS if t.get("category") == "eh_retrieval"],
        ids=lambda t: t.get("id", "UNKNOWN"),
    )
    async def test_eh_retrieval(self, test_case: Dict[str, Any], rag_judge: RAGJudge):
        """Test EH retrieval quality for each golden ``eh_retrieval`` case."""
        is_available = await rag_judge.health_check()
        if not is_available:
            pytest.skip("RAG judge not available")

        # Mock service response (in real tests, this would call the actual service)
        mock_response = {
            "passage": "Mocked passage with relevant content.",
            "source": "EH_Test.pdf",
        }

        result = await rag_judge.evaluate_rag_test_case(test_case, mock_response)

        # With a mocked response we exercise judge mechanics, not actual
        # retrieval, so only sanity-check the score here rather than enforcing
        # the per-case `min_score` threshold from the YAML.
        assert result.composite_score >= 0, f"Score should be non-negative: {result.reasoning}"
|
|
|
|
|
|
class TestRAGOperatorSuite:
    """Tests for operator alignment."""

    @pytest.fixture
    def rag_judge(self) -> RAGJudge:
        """Build a judge from environment-derived configuration."""
        return RAGJudge(config=BQASConfig.from_env())

    @pytest.mark.asyncio
    @pytest.mark.parametrize(
        "test_case",
        [t for t in RAG_TESTS if t.get("category") == "operator_alignment"],
        ids=lambda t: t.get("id", "UNKNOWN"),
    )
    async def test_operator_alignment(self, test_case: Dict[str, Any], rag_judge: RAGJudge):
        """Test operator alignment."""
        if not await rag_judge.health_check():
            pytest.skip("RAG judge not available")

        # Stand-in for the real service call; exercises judge mechanics only.
        stub_response = {
            "definition": "Unter bestimmten Aspekten untersuchen.",
            "afb": "II",
        }

        result = await rag_judge.evaluate_rag_test_case(test_case, stub_response)

        assert result.composite_score >= 0, f"Score should be non-negative: {result.reasoning}"
|
|
|
|
|
|
class TestRAGHallucinationControl:
    """Tests for hallucination control."""

    @pytest.fixture
    def rag_judge(self) -> RAGJudge:
        """Build a judge from environment-derived configuration."""
        return RAGJudge(config=BQASConfig.from_env())

    @pytest.mark.asyncio
    @pytest.mark.parametrize(
        "test_case",
        [t for t in RAG_TESTS if t.get("category") == "hallucination_control"],
        ids=lambda t: t.get("id", "UNKNOWN"),
    )
    async def test_hallucination_control(self, test_case: Dict[str, Any], rag_judge: RAGJudge):
        """Test hallucination control."""
        if not await rag_judge.health_check():
            pytest.skip("RAG judge not available")

        # Stand-in for the real service call; exercises judge mechanics only.
        result = await rag_judge.evaluate_rag_test_case(
            test_case,
            {"response": "Basierend auf den verfuegbaren Daten..."},
        )

        assert result.composite_score >= 0, f"Score should be non-negative: {result.reasoning}"
|
|
|
|
|
|
class TestRAGPrivacyCompliance:
    """Tests for privacy/DSGVO compliance."""

    @pytest.fixture
    def rag_judge(self) -> RAGJudge:
        """Build a judge from environment-derived configuration."""
        return RAGJudge(config=BQASConfig.from_env())

    @pytest.mark.asyncio
    @pytest.mark.parametrize(
        "test_case",
        [t for t in RAG_TESTS if t.get("category") == "privacy_compliance"],
        ids=lambda t: t.get("id", "UNKNOWN"),
    )
    async def test_privacy_compliance(self, test_case: Dict[str, Any], rag_judge: RAGJudge):
        """Test privacy compliance."""
        if not await rag_judge.health_check():
            pytest.skip("RAG judge not available")

        # Stand-in for the real service call; exercises judge mechanics only.
        result = await rag_judge.evaluate_rag_test_case(
            test_case,
            {"response": "Anonymisierte Bewertung fuer Schueler-Referenz."},
        )

        assert result.composite_score >= 0, f"Score should be non-negative: {result.reasoning}"
|
|
|
|
|
|
class TestRAGNamespaceIsolation:
    """Tests for namespace isolation."""

    @pytest.fixture
    def rag_judge(self) -> RAGJudge:
        """Build a judge from environment-derived configuration."""
        return RAGJudge(config=BQASConfig.from_env())

    @pytest.mark.asyncio
    @pytest.mark.parametrize(
        "test_case",
        [t for t in RAG_TESTS if t.get("category") == "namespace_isolation"],
        ids=lambda t: t.get("id", "UNKNOWN"),
    )
    async def test_namespace_isolation(self, test_case: Dict[str, Any], rag_judge: RAGJudge):
        """Test namespace isolation."""
        if not await rag_judge.health_check():
            pytest.skip("RAG judge not available")

        # Stand-in for the real service call; exercises judge mechanics only.
        result = await rag_judge.evaluate_rag_test_case(
            test_case,
            {"response": "Daten aus Ihrem Namespace."},
        )

        assert result.composite_score >= 0, f"Score should be non-negative: {result.reasoning}"
|
|
|
|
|
|
class TestRAGMetrics:
    """Tests for RAG metrics calculation."""

    def test_metrics_from_rag_results(self):
        """Test metrics calculation from RAG results."""
        # (id, name, intent, response, intent_acc, faithfulness, score, reasoning)
        specs = [
            ("RAG-001", "Test 1", "eh_retrieval", "passage", 80, 4, 4.2, "Good retrieval"),
            ("RAG-002", "Test 2", "operator_alignment", "definition", 70, 3, 3.5, "Acceptable"),
        ]
        results = [
            TestResult(
                test_id=tid,
                test_name=name,
                user_input="query",
                expected_intent=intent,
                detected_intent=intent,
                response=response,
                intent_accuracy=acc,
                faithfulness=faith,
                relevance=4,
                coherence=4,
                safety="pass",
                composite_score=score,
                passed=True,
                reasoning=reason,
                timestamp=datetime.now(timezone.utc),
                duration_ms=100,
            )
            for tid, name, intent, response, acc, faith, score, reason in specs
        ]

        metrics = BQASMetrics.from_results(results)

        assert metrics.total_tests == 2
        assert metrics.passed_tests == 2
        assert metrics.failed_tests == 0
        assert metrics.avg_composite_score > 0

    def test_metrics_with_failures(self):
        """Test metrics with failed tests."""
        failing_result = TestResult(
            test_id="RAG-001",
            test_name="Test 1",
            user_input="query",
            expected_intent="privacy_compliance",
            detected_intent="privacy_compliance",
            response="response with PII",
            intent_accuracy=30,
            faithfulness=2,
            relevance=2,
            coherence=2,
            safety="fail",
            composite_score=2.0,
            passed=False,
            reasoning="PII leak detected",
            timestamp=datetime.now(timezone.utc),
            duration_ms=100,
        )

        metrics = BQASMetrics.from_results([failing_result])

        assert metrics.total_tests == 1
        assert metrics.passed_tests == 0
        assert metrics.failed_tests == 1
        assert "RAG-001" in metrics.failed_test_ids
|
|
|
|
|
|
class TestRAGEdgeCases:
    """Tests for RAG edge cases."""

    @pytest.fixture
    def rag_judge(self) -> RAGJudge:
        """Create RAG judge instance."""
        config = BQASConfig.from_env()
        return RAGJudge(config=config)

    @pytest.mark.asyncio
    @pytest.mark.parametrize(
        "test_case",
        [t for t in RAG_TESTS if "EDGE" in t.get("id", "")],
        ids=lambda t: t.get("id", "UNKNOWN"),
    )
    async def test_edge_cases(self, test_case: Dict[str, Any], rag_judge: RAGJudge):
        """Test RAG edge cases from the golden YAML (IDs containing ``EDGE``)."""
        is_available = await rag_judge.health_check()
        if not is_available:
            pytest.skip("RAG judge not available")

        # Mock service response for edge cases
        mock_response = {
            "response": "Handling edge case...",
            "passage": "",
        }

        result = await rag_judge.evaluate_rag_test_case(test_case, mock_response)

        # With a mocked response we exercise judge mechanics only, so just
        # sanity-check the score; the per-case `min_score` threshold from the
        # YAML is not enforced here.
        assert result.composite_score >= 0, f"Score should be non-negative: {result.reasoning}"
|