"""
RAG/Correction Tests

Tests for RAG retrieval quality, operator alignment, and correction workflows.
"""
|
|
import pytest
|
|
import yaml
|
|
from pathlib import Path
|
|
from typing import Dict, Any, List
|
|
from datetime import datetime, timezone
|
|
|
|
from bqas.rag_judge import RAGJudge
|
|
from bqas.metrics import BQASMetrics, TestResult
|
|
from bqas.config import BQASConfig
|
|
|
|
|
|
def load_rag_tests() -> List[Dict[str, Any]]:
    """Load RAG test cases from the golden YAML file.

    Reads ``golden_tests/golden_rag_correction_v1.yaml`` located next to this
    test module and collects entries from both the ``tests`` and
    ``edge_cases`` sections of every YAML document in the file.

    Returns:
        A list of test-case dicts; empty if the YAML file does not exist.
    """
    yaml_path = Path(__file__).parent / "golden_tests" / "golden_rag_correction_v1.yaml"

    if not yaml_path.exists():
        return []

    # Explicit UTF-8: the golden file contains German-language content.
    content = yaml_path.read_text(encoding="utf-8")

    # The file may contain multiple YAML documents (separated by `---`).
    tests: List[Dict[str, Any]] = []
    for doc in yaml.safe_load_all(content):
        if not doc:
            continue
        tests.extend(doc.get("tests", []))
        tests.extend(doc.get("edge_cases", []))

    return tests
|
|
|
|
|
|
# Loaded once at import time; the parametrized suites below filter these by category.
RAG_TESTS = load_rag_tests()
|
|
|
|
|
|
class TestRAGJudge:
    """Tests for RAG Judge functionality."""

    @pytest.fixture
    def rag_judge(self) -> RAGJudge:
        """Build a judge from environment-derived configuration."""
        return RAGJudge(config=BQASConfig.from_env())

    @pytest.mark.asyncio
    async def test_judge_available(self, rag_judge: RAGJudge):
        """Verify RAG judge is available."""
        if not await rag_judge.health_check():
            pytest.skip("RAG judge not available (Ollama not running or model not loaded)")

    @pytest.mark.asyncio
    async def test_retrieval_evaluation(self, rag_judge: RAGJudge):
        """Test retrieval evaluation."""
        if not await rag_judge.health_check():
            pytest.skip("RAG judge not available")

        evaluation = await rag_judge.evaluate_retrieval(
            query="Welche Kriterien gelten fuer die Sachtextanalyse?",
            aufgabentyp="textanalyse_pragmatisch",
            subject="Deutsch",
            level="Abitur",
            retrieved_passage="Bei der Sachtextanalyse sind Textsorte, Intention, Adressaten und sprachliche Mittel zu beachten.",
            expected_concepts=["Textsorte", "Intention", "Adressaten", "sprachliche Mittel"],
        )

        # Precision is a 0-100 percentage; faithfulness a 1-5 rating.
        assert 0 <= evaluation.retrieval_precision <= 100
        assert 1 <= evaluation.faithfulness <= 5
        assert evaluation.composite_score >= 0

    @pytest.mark.asyncio
    async def test_operator_evaluation(self, rag_judge: RAGJudge):
        """Test operator alignment evaluation."""
        if not await rag_judge.health_check():
            pytest.skip("RAG judge not available")

        evaluation = await rag_judge.evaluate_operator(
            operator="analysieren",
            generated_definition="Unter bestimmten Aspekten Materialien untersuchen und systematisch auswerten.",
            expected_afb="II",
            expected_actions=["untersuchen", "zerlegen", "Zusammenhaenge herstellen"],
        )

        # Alignment is a 0-100 percentage; AFB levels are I-III or empty.
        assert 0 <= evaluation.operator_alignment <= 100
        assert evaluation.detected_afb in ["I", "II", "III", ""]
        assert evaluation.composite_score >= 0

    @pytest.mark.asyncio
    async def test_hallucination_evaluation(self, rag_judge: RAGJudge):
        """Test hallucination control evaluation."""
        if not await rag_judge.health_check():
            pytest.skip("RAG judge not available")

        evaluation = await rag_judge.evaluate_hallucination(
            query="Was sagt der Erwartungshorizont zu Aufgabe 1?",
            response="Laut EH-Passage 3.2 sollen Schueler die Argumentation analysieren.",
            available_facts=[
                "EH-Passage 3.2: Analyse der Argumentationsstruktur erwartet",
                "EH-Passage 3.3: Beurteilung der Ueberzeugungskraft",
            ],
        )

        assert 0 <= evaluation.grounding_score <= 100
        assert evaluation.invention_detection in ["pass", "fail"]
        assert evaluation.composite_score >= 0

    @pytest.mark.asyncio
    async def test_privacy_evaluation(self, rag_judge: RAGJudge):
        """Test privacy/DSGVO evaluation."""
        if not await rag_judge.health_check():
            pytest.skip("RAG judge not available")

        evaluation = await rag_judge.evaluate_privacy(
            query="Bewerte diese Arbeit",
            context={
                "student_name": "Max Mueller",
                "student_ref": "STUD_A3F2",
            },
            response="Die Arbeit von STUD_A3F2 zeigt gute Analysefaehigkeiten.",
        )

        assert evaluation.privacy_compliance in ["pass", "fail"]
        assert 1 <= evaluation.anonymization <= 5
        assert evaluation.dsgvo_compliance in ["pass", "fail"]
        assert evaluation.composite_score >= 0

    @pytest.mark.asyncio
    async def test_namespace_evaluation(self, rag_judge: RAGJudge):
        """Test namespace isolation evaluation."""
        if not await rag_judge.health_check():
            pytest.skip("RAG judge not available")

        evaluation = await rag_judge.evaluate_namespace(
            teacher_id="teacher_001",
            namespace="ns_teacher_001",
            school_id="school_xyz",
            requested_data="Zeig mir alle Klausuren",
            response="Hier sind 3 Klausuren aus Ihrem Namespace.",
        )

        assert evaluation.namespace_compliance in ["pass", "fail"]
        assert evaluation.cross_tenant_leak in ["pass", "fail"]
        assert 1 <= evaluation.school_sharing_compliance <= 5
        assert evaluation.composite_score >= 0
|
|
|
|
|
|
class TestRAGRetrievalSuite:
    """Tests for EH retrieval quality."""

    @pytest.fixture
    def rag_judge(self) -> RAGJudge:
        """Create RAG judge instance."""
        config = BQASConfig.from_env()
        return RAGJudge(config=config)

    @pytest.mark.asyncio
    @pytest.mark.parametrize(
        "test_case",
        [t for t in RAG_TESTS if t.get("category") == "eh_retrieval"],
        ids=lambda t: t.get("id", "UNKNOWN"),
    )
    async def test_eh_retrieval(self, test_case: Dict[str, Any], rag_judge: RAGJudge):
        """Test EH retrieval quality for each golden ``eh_retrieval`` case."""
        is_available = await rag_judge.health_check()
        if not is_available:
            pytest.skip("RAG judge not available")

        # Mock service response (in real tests, this would call the actual service)
        mock_response = {
            "passage": "Mocked passage with relevant content.",
            "source": "EH_Test.pdf",
        }

        result = await rag_judge.evaluate_rag_test_case(test_case, mock_response)

        # With a mocked response we exercise judge mechanics, not actual
        # retrieval, so only sanity-check the score here rather than enforcing
        # the per-case `min_score` threshold from the YAML.
        assert result.composite_score >= 0, f"Score should be non-negative: {result.reasoning}"
|
|
|
|
|
|
class TestRAGOperatorSuite:
    """Tests for operator alignment."""

    @pytest.fixture
    def rag_judge(self) -> RAGJudge:
        """Build a judge from environment-derived configuration."""
        return RAGJudge(config=BQASConfig.from_env())

    @pytest.mark.asyncio
    @pytest.mark.parametrize(
        "test_case",
        [t for t in RAG_TESTS if t.get("category") == "operator_alignment"],
        ids=lambda t: t.get("id", "UNKNOWN"),
    )
    async def test_operator_alignment(self, test_case: Dict[str, Any], rag_judge: RAGJudge):
        """Test operator alignment."""
        if not await rag_judge.health_check():
            pytest.skip("RAG judge not available")

        # Stand-in for the real service call; exercises judge mechanics only.
        stub_response = {
            "definition": "Unter bestimmten Aspekten untersuchen.",
            "afb": "II",
        }

        result = await rag_judge.evaluate_rag_test_case(test_case, stub_response)

        assert result.composite_score >= 0, f"Score should be non-negative: {result.reasoning}"
|
|
|
|
|
|
class TestRAGHallucinationControl:
    """Tests for hallucination control."""

    @pytest.fixture
    def rag_judge(self) -> RAGJudge:
        """Build a judge from environment-derived configuration."""
        return RAGJudge(config=BQASConfig.from_env())

    @pytest.mark.asyncio
    @pytest.mark.parametrize(
        "test_case",
        [t for t in RAG_TESTS if t.get("category") == "hallucination_control"],
        ids=lambda t: t.get("id", "UNKNOWN"),
    )
    async def test_hallucination_control(self, test_case: Dict[str, Any], rag_judge: RAGJudge):
        """Test hallucination control."""
        if not await rag_judge.health_check():
            pytest.skip("RAG judge not available")

        # Stand-in for the real service call; exercises judge mechanics only.
        result = await rag_judge.evaluate_rag_test_case(
            test_case,
            {"response": "Basierend auf den verfuegbaren Daten..."},
        )

        assert result.composite_score >= 0, f"Score should be non-negative: {result.reasoning}"
|
|
|
|
|
|
class TestRAGPrivacyCompliance:
    """Tests for privacy/DSGVO compliance."""

    @pytest.fixture
    def rag_judge(self) -> RAGJudge:
        """Build a judge from environment-derived configuration."""
        return RAGJudge(config=BQASConfig.from_env())

    @pytest.mark.asyncio
    @pytest.mark.parametrize(
        "test_case",
        [t for t in RAG_TESTS if t.get("category") == "privacy_compliance"],
        ids=lambda t: t.get("id", "UNKNOWN"),
    )
    async def test_privacy_compliance(self, test_case: Dict[str, Any], rag_judge: RAGJudge):
        """Test privacy compliance."""
        if not await rag_judge.health_check():
            pytest.skip("RAG judge not available")

        # Stand-in for the real service call; exercises judge mechanics only.
        result = await rag_judge.evaluate_rag_test_case(
            test_case,
            {"response": "Anonymisierte Bewertung fuer Schueler-Referenz."},
        )

        assert result.composite_score >= 0, f"Score should be non-negative: {result.reasoning}"
|
|
|
|
|
|
class TestRAGNamespaceIsolation:
    """Tests for namespace isolation."""

    @pytest.fixture
    def rag_judge(self) -> RAGJudge:
        """Build a judge from environment-derived configuration."""
        return RAGJudge(config=BQASConfig.from_env())

    @pytest.mark.asyncio
    @pytest.mark.parametrize(
        "test_case",
        [t for t in RAG_TESTS if t.get("category") == "namespace_isolation"],
        ids=lambda t: t.get("id", "UNKNOWN"),
    )
    async def test_namespace_isolation(self, test_case: Dict[str, Any], rag_judge: RAGJudge):
        """Test namespace isolation."""
        if not await rag_judge.health_check():
            pytest.skip("RAG judge not available")

        # Stand-in for the real service call; exercises judge mechanics only.
        result = await rag_judge.evaluate_rag_test_case(
            test_case,
            {"response": "Daten aus Ihrem Namespace."},
        )

        assert result.composite_score >= 0, f"Score should be non-negative: {result.reasoning}"
|
|
|
|
|
|
class TestRAGMetrics:
    """Tests for RAG metrics calculation."""

    def test_metrics_from_rag_results(self):
        """Test metrics calculation from RAG results."""
        # (id, name, intent, response, intent_acc, faithfulness, score, reasoning)
        specs = [
            ("RAG-001", "Test 1", "eh_retrieval", "passage", 80, 4, 4.2, "Good retrieval"),
            ("RAG-002", "Test 2", "operator_alignment", "definition", 70, 3, 3.5, "Acceptable"),
        ]
        results = [
            TestResult(
                test_id=tid,
                test_name=name,
                user_input="query",
                expected_intent=intent,
                detected_intent=intent,
                response=response,
                intent_accuracy=acc,
                faithfulness=faith,
                relevance=4,
                coherence=4,
                safety="pass",
                composite_score=score,
                passed=True,
                reasoning=reason,
                timestamp=datetime.now(timezone.utc),
                duration_ms=100,
            )
            for tid, name, intent, response, acc, faith, score, reason in specs
        ]

        metrics = BQASMetrics.from_results(results)

        assert metrics.total_tests == 2
        assert metrics.passed_tests == 2
        assert metrics.failed_tests == 0
        assert metrics.avg_composite_score > 0

    def test_metrics_with_failures(self):
        """Test metrics with failed tests."""
        failing_result = TestResult(
            test_id="RAG-001",
            test_name="Test 1",
            user_input="query",
            expected_intent="privacy_compliance",
            detected_intent="privacy_compliance",
            response="response with PII",
            intent_accuracy=30,
            faithfulness=2,
            relevance=2,
            coherence=2,
            safety="fail",
            composite_score=2.0,
            passed=False,
            reasoning="PII leak detected",
            timestamp=datetime.now(timezone.utc),
            duration_ms=100,
        )

        metrics = BQASMetrics.from_results([failing_result])

        assert metrics.total_tests == 1
        assert metrics.passed_tests == 0
        assert metrics.failed_tests == 1
        assert "RAG-001" in metrics.failed_test_ids
|
|
|
|
|
|
class TestRAGEdgeCases:
    """Tests for RAG edge cases."""

    @pytest.fixture
    def rag_judge(self) -> RAGJudge:
        """Create RAG judge instance."""
        config = BQASConfig.from_env()
        return RAGJudge(config=config)

    @pytest.mark.asyncio
    @pytest.mark.parametrize(
        "test_case",
        [t for t in RAG_TESTS if "EDGE" in t.get("id", "")],
        ids=lambda t: t.get("id", "UNKNOWN"),
    )
    async def test_edge_cases(self, test_case: Dict[str, Any], rag_judge: RAGJudge):
        """Test RAG edge cases from the golden YAML (IDs containing ``EDGE``)."""
        is_available = await rag_judge.health_check()
        if not is_available:
            pytest.skip("RAG judge not available")

        # Mock service response for edge cases
        mock_response = {
            "response": "Handling edge case...",
            "passage": "",
        }

        result = await rag_judge.evaluate_rag_test_case(test_case, mock_response)

        # With a mocked response we exercise judge mechanics only, so just
        # sanity-check the score; the per-case `min_score` threshold from the
        # YAML is not enforced here.
        assert result.composite_score >= 0, f"Score should be non-negative: {result.reasoning}"
|