refactor: Jitsi/Matrix/Voice von Core übernommen, Camunda/BPMN gelöscht, Kommunikation-Nav
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m55s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 18s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m55s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 18s
- Voice-Service von Core nach Lehrer verschoben (bp-lehrer-voice-service) - 4 Jitsi-Services + 2 Synapse-Services in docker-compose.yml aufgenommen - Camunda komplett gelöscht: workflow pages, workflow-config.ts, bpmn-js deps - CAMUNDA_URL aus backend-lehrer environment entfernt - Sidebar: Kategorie "Compliance SDK" + "Katalogverwaltung" entfernt - Sidebar: Neue Kategorie "Kommunikation" mit Video & Chat, Voice Service, Alerts Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
412
voice-service/tests/bqas/test_rag.py
Normal file
412
voice-service/tests/bqas/test_rag.py
Normal file
@@ -0,0 +1,412 @@
|
||||
"""
|
||||
RAG/Correction Tests
|
||||
Tests for RAG retrieval quality, operator alignment, and correction workflows
|
||||
"""
|
||||
import pytest
|
||||
import yaml
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any, List
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from bqas.rag_judge import RAGJudge
|
||||
from bqas.metrics import BQASMetrics, TestResult
|
||||
from bqas.config import BQASConfig
|
||||
|
||||
|
||||
def load_rag_tests() -> List[Dict[str, Any]]:
    """Load RAG test cases (regular tests and edge cases) from the golden YAML file.

    Returns:
        A flat list of test-case dicts collected from the ``tests`` and
        ``edge_cases`` keys of every YAML document in the file, or an empty
        list when the file does not exist (so pytest collection does not
        fail on a partial checkout).
    """
    yaml_path = Path(__file__).parent / "golden_tests" / "golden_rag_correction_v1.yaml"

    if not yaml_path.exists():
        return []

    # Fixed: open with an explicit encoding (decoding was locale-dependent,
    # pylint W1514) and stream the handle straight into safe_load_all instead
    # of materializing the whole file as a string first.
    # The file may contain multiple YAML documents separated by '---'.
    with open(yaml_path, encoding="utf-8") as f:
        documents = list(yaml.safe_load_all(f))

    tests: List[Dict[str, Any]] = []
    for doc in documents:
        # A document may be None (empty doc) or lack either key.
        if doc and "tests" in doc:
            tests.extend(doc["tests"])
        if doc and "edge_cases" in doc:
            tests.extend(doc["edge_cases"])

    return tests
|
||||
|
||||
|
||||
RAG_TESTS = load_rag_tests()
# Loaded once at import time so the pytest.mark.parametrize decorators
# below can filter the cases by category / id.
RAG_TESTS = load_rag_tests()
class TestRAGJudge:
    """Tests for RAG Judge functionality."""

    @pytest.fixture
    def rag_judge(self) -> RAGJudge:
        """Create RAG judge instance."""
        return RAGJudge(config=BQASConfig.from_env())

    @pytest.mark.asyncio
    async def test_judge_available(self, rag_judge: RAGJudge):
        """Verify RAG judge is available."""
        if not await rag_judge.health_check():
            pytest.skip("RAG judge not available (Ollama not running or model not loaded)")

    @pytest.mark.asyncio
    async def test_retrieval_evaluation(self, rag_judge: RAGJudge):
        """Test retrieval evaluation."""
        if not await rag_judge.health_check():
            pytest.skip("RAG judge not available")

        result = await rag_judge.evaluate_retrieval(
            query="Welche Kriterien gelten fuer die Sachtextanalyse?",
            aufgabentyp="textanalyse_pragmatisch",
            subject="Deutsch",
            level="Abitur",
            retrieved_passage=(
                "Bei der Sachtextanalyse sind Textsorte, Intention, "
                "Adressaten und sprachliche Mittel zu beachten."
            ),
            expected_concepts=["Textsorte", "Intention", "Adressaten", "sprachliche Mittel"],
        )

        # Precision is percent-scaled, faithfulness is a 1-5 Likert score.
        assert 0 <= result.retrieval_precision <= 100
        assert 1 <= result.faithfulness <= 5
        assert result.composite_score >= 0

    @pytest.mark.asyncio
    async def test_operator_evaluation(self, rag_judge: RAGJudge):
        """Test operator alignment evaluation."""
        if not await rag_judge.health_check():
            pytest.skip("RAG judge not available")

        result = await rag_judge.evaluate_operator(
            operator="analysieren",
            generated_definition=(
                "Unter bestimmten Aspekten Materialien untersuchen "
                "und systematisch auswerten."
            ),
            expected_afb="II",
            expected_actions=["untersuchen", "zerlegen", "Zusammenhaenge herstellen"],
        )

        assert 0 <= result.operator_alignment <= 100
        # Empty string means the judge could not detect an AFB level.
        assert result.detected_afb in ("I", "II", "III", "")
        assert result.composite_score >= 0

    @pytest.mark.asyncio
    async def test_hallucination_evaluation(self, rag_judge: RAGJudge):
        """Test hallucination control evaluation."""
        if not await rag_judge.health_check():
            pytest.skip("RAG judge not available")

        known_facts = [
            "EH-Passage 3.2: Analyse der Argumentationsstruktur erwartet",
            "EH-Passage 3.3: Beurteilung der Ueberzeugungskraft",
        ]
        result = await rag_judge.evaluate_hallucination(
            query="Was sagt der Erwartungshorizont zu Aufgabe 1?",
            response="Laut EH-Passage 3.2 sollen Schueler die Argumentation analysieren.",
            available_facts=known_facts,
        )

        assert 0 <= result.grounding_score <= 100
        assert result.invention_detection in ("pass", "fail")
        assert result.composite_score >= 0

    @pytest.mark.asyncio
    async def test_privacy_evaluation(self, rag_judge: RAGJudge):
        """Test privacy/DSGVO evaluation."""
        if not await rag_judge.health_check():
            pytest.skip("RAG judge not available")

        pii_context = {
            "student_name": "Max Mueller",
            "student_ref": "STUD_A3F2",
        }
        result = await rag_judge.evaluate_privacy(
            query="Bewerte diese Arbeit",
            context=pii_context,
            response="Die Arbeit von STUD_A3F2 zeigt gute Analysefaehigkeiten.",
        )

        assert result.privacy_compliance in ("pass", "fail")
        assert 1 <= result.anonymization <= 5
        assert result.dsgvo_compliance in ("pass", "fail")
        assert result.composite_score >= 0

    @pytest.mark.asyncio
    async def test_namespace_evaluation(self, rag_judge: RAGJudge):
        """Test namespace isolation evaluation."""
        if not await rag_judge.health_check():
            pytest.skip("RAG judge not available")

        result = await rag_judge.evaluate_namespace(
            teacher_id="teacher_001",
            namespace="ns_teacher_001",
            school_id="school_xyz",
            requested_data="Zeig mir alle Klausuren",
            response="Hier sind 3 Klausuren aus Ihrem Namespace.",
        )

        assert result.namespace_compliance in ("pass", "fail")
        assert result.cross_tenant_leak in ("pass", "fail")
        assert 1 <= result.school_sharing_compliance <= 5
        assert result.composite_score >= 0
|
||||
|
||||
|
||||
class TestRAGRetrievalSuite:
    """Tests for EH retrieval quality."""

    @pytest.fixture
    def rag_judge(self) -> RAGJudge:
        """Create RAG judge instance."""
        config = BQASConfig.from_env()
        return RAGJudge(config=config)

    @pytest.mark.asyncio
    @pytest.mark.parametrize(
        "test_case",
        [t for t in RAG_TESTS if t.get("category") == "eh_retrieval"],
        ids=lambda t: t.get("id", "UNKNOWN"),
    )
    async def test_eh_retrieval(self, test_case: Dict[str, Any], rag_judge: RAGJudge):
        """Test EH retrieval quality for one golden test case.

        The service response is mocked, so this exercises the judge
        mechanics rather than real retrieval quality; the per-case
        ``min_score`` threshold is therefore deliberately not enforced.
        """
        is_available = await rag_judge.health_check()
        if not is_available:
            pytest.skip("RAG judge not available")

        # Mock service response (in real tests, this would call the actual service)
        mock_response = {
            "passage": "Mocked passage with relevant content.",
            "source": "EH_Test.pdf",
        }

        result = await rag_judge.evaluate_rag_test_case(test_case, mock_response)

        # Fixed: removed the unused `min_score` local (dead code, lint F841) —
        # the threshold is meaningless against a mocked response and was
        # never applied.
        assert result.composite_score >= 0, f"Score should be non-negative: {result.reasoning}"
|
||||
|
||||
|
||||
class TestRAGOperatorSuite:
    """Tests for operator alignment."""

    @pytest.fixture
    def rag_judge(self) -> RAGJudge:
        """Create RAG judge instance."""
        return RAGJudge(config=BQASConfig.from_env())

    @pytest.mark.asyncio
    @pytest.mark.parametrize(
        "test_case",
        [t for t in RAG_TESTS if t.get("category") == "operator_alignment"],
        ids=lambda t: t.get("id", "UNKNOWN"),
    )
    async def test_operator_alignment(self, test_case: Dict[str, Any], rag_judge: RAGJudge):
        """Test operator alignment."""
        if not await rag_judge.health_check():
            pytest.skip("RAG judge not available")

        # Stubbed service payload — exercises judge mechanics, not a live call.
        stub_payload = {
            "definition": "Unter bestimmten Aspekten untersuchen.",
            "afb": "II",
        }

        verdict = await rag_judge.evaluate_rag_test_case(test_case, stub_payload)

        assert verdict.composite_score >= 0, f"Score should be non-negative: {verdict.reasoning}"
|
||||
|
||||
|
||||
class TestRAGHallucinationControl:
    """Tests for hallucination control."""

    @pytest.fixture
    def rag_judge(self) -> RAGJudge:
        """Create RAG judge instance."""
        return RAGJudge(config=BQASConfig.from_env())

    @pytest.mark.asyncio
    @pytest.mark.parametrize(
        "test_case",
        [t for t in RAG_TESTS if t.get("category") == "hallucination_control"],
        ids=lambda t: t.get("id", "UNKNOWN"),
    )
    async def test_hallucination_control(self, test_case: Dict[str, Any], rag_judge: RAGJudge):
        """Test hallucination control."""
        if not await rag_judge.health_check():
            pytest.skip("RAG judge not available")

        # Stubbed service payload — exercises judge mechanics, not a live call.
        stub_payload = {"response": "Basierend auf den verfuegbaren Daten..."}

        verdict = await rag_judge.evaluate_rag_test_case(test_case, stub_payload)

        assert verdict.composite_score >= 0, f"Score should be non-negative: {verdict.reasoning}"
|
||||
|
||||
|
||||
class TestRAGPrivacyCompliance:
    """Tests for privacy/DSGVO compliance."""

    @pytest.fixture
    def rag_judge(self) -> RAGJudge:
        """Create RAG judge instance."""
        return RAGJudge(config=BQASConfig.from_env())

    @pytest.mark.asyncio
    @pytest.mark.parametrize(
        "test_case",
        [t for t in RAG_TESTS if t.get("category") == "privacy_compliance"],
        ids=lambda t: t.get("id", "UNKNOWN"),
    )
    async def test_privacy_compliance(self, test_case: Dict[str, Any], rag_judge: RAGJudge):
        """Test privacy compliance."""
        if not await rag_judge.health_check():
            pytest.skip("RAG judge not available")

        # Stubbed service payload — exercises judge mechanics, not a live call.
        stub_payload = {"response": "Anonymisierte Bewertung fuer Schueler-Referenz."}

        verdict = await rag_judge.evaluate_rag_test_case(test_case, stub_payload)

        assert verdict.composite_score >= 0, f"Score should be non-negative: {verdict.reasoning}"
|
||||
|
||||
|
||||
class TestRAGNamespaceIsolation:
    """Tests for namespace isolation."""

    @pytest.fixture
    def rag_judge(self) -> RAGJudge:
        """Create RAG judge instance."""
        return RAGJudge(config=BQASConfig.from_env())

    @pytest.mark.asyncio
    @pytest.mark.parametrize(
        "test_case",
        [t for t in RAG_TESTS if t.get("category") == "namespace_isolation"],
        ids=lambda t: t.get("id", "UNKNOWN"),
    )
    async def test_namespace_isolation(self, test_case: Dict[str, Any], rag_judge: RAGJudge):
        """Test namespace isolation."""
        if not await rag_judge.health_check():
            pytest.skip("RAG judge not available")

        # Stubbed service payload — exercises judge mechanics, not a live call.
        stub_payload = {"response": "Daten aus Ihrem Namespace."}

        verdict = await rag_judge.evaluate_rag_test_case(test_case, stub_payload)

        assert verdict.composite_score >= 0, f"Score should be non-negative: {verdict.reasoning}"
|
||||
|
||||
|
||||
class TestRAGMetrics:
    """Tests for RAG metrics calculation."""

    @staticmethod
    def _make_result(**overrides) -> TestResult:
        """Build a TestResult with baseline fields, applying per-test overrides."""
        fields: Dict[str, Any] = {
            "test_id": "RAG-001",
            "test_name": "Test 1",
            "user_input": "query",
            "expected_intent": "eh_retrieval",
            "detected_intent": "eh_retrieval",
            "response": "passage",
            "intent_accuracy": 80,
            "faithfulness": 4,
            "relevance": 4,
            "coherence": 4,
            "safety": "pass",
            "composite_score": 4.2,
            "passed": True,
            "reasoning": "Good retrieval",
            "timestamp": datetime.now(timezone.utc),
            "duration_ms": 100,
        }
        fields.update(overrides)
        return TestResult(**fields)

    def test_metrics_from_rag_results(self):
        """Test metrics calculation from RAG results."""
        results = [
            self._make_result(),
            self._make_result(
                test_id="RAG-002",
                test_name="Test 2",
                expected_intent="operator_alignment",
                detected_intent="operator_alignment",
                response="definition",
                intent_accuracy=70,
                faithfulness=3,
                composite_score=3.5,
                reasoning="Acceptable",
            ),
        ]

        metrics = BQASMetrics.from_results(results)

        assert metrics.total_tests == 2
        assert metrics.passed_tests == 2
        assert metrics.failed_tests == 0
        assert metrics.avg_composite_score > 0

    def test_metrics_with_failures(self):
        """Test metrics with failed tests."""
        failing = self._make_result(
            expected_intent="privacy_compliance",
            detected_intent="privacy_compliance",
            response="response with PII",
            intent_accuracy=30,
            faithfulness=2,
            relevance=2,
            coherence=2,
            safety="fail",
            composite_score=2.0,
            passed=False,
            reasoning="PII leak detected",
        )

        metrics = BQASMetrics.from_results([failing])

        assert metrics.total_tests == 1
        assert metrics.passed_tests == 0
        assert metrics.failed_tests == 1
        assert "RAG-001" in metrics.failed_test_ids
|
||||
|
||||
|
||||
class TestRAGEdgeCases:
    """Tests for RAG edge cases."""

    @pytest.fixture
    def rag_judge(self) -> RAGJudge:
        """Create RAG judge instance."""
        config = BQASConfig.from_env()
        return RAGJudge(config=config)

    @pytest.mark.asyncio
    @pytest.mark.parametrize(
        "test_case",
        [t for t in RAG_TESTS if "EDGE" in t.get("id", "")],
        ids=lambda t: t.get("id", "UNKNOWN"),
    )
    async def test_edge_cases(self, test_case: Dict[str, Any], rag_judge: RAGJudge):
        """Test RAG edge cases against a mocked service response.

        Edge cases may define a lower ``min_score`` (default would be 3.0),
        but with a mocked response only judge mechanics are exercised, so
        the threshold is deliberately not applied.
        """
        is_available = await rag_judge.health_check()
        if not is_available:
            pytest.skip("RAG judge not available")

        # Mock service response for edge cases
        mock_response = {
            "response": "Handling edge case...",
            "passage": "",
        }

        result = await rag_judge.evaluate_rag_test_case(test_case, mock_response)

        # Fixed: removed the unused `min_score` local (dead code, lint F841) —
        # it was computed but never compared against the result.
        assert result.composite_score >= 0, f"Score should be non-negative: {result.reasoning}"
|
||||
Reference in New Issue
Block a user