[split-required] Split remaining 500-680 LOC files (final batch)

website (17 pages + 3 components):
- multiplayer/wizard, middleware/wizard+test-wizard, communication
- builds/wizard, staff-search, voice, sbom/wizard
- foerderantrag, mail/tasks, tools/communication, sbom
- compliance/evidence, uni-crawler, brandbook (already done)
- CollectionsTab, IngestionTab, RiskHeatmap

backend-lehrer (5 files):
- letters_api (641 → 2), certificates_api (636 → 2)
- alerts_agent/db/models (636 → 3)
- llm_gateway/communication_service (614 → 2)
- game/database already done in prior batch

klausur-service (2 files):
- hybrid_vocab_extractor (664 → 2)
- klausur-service/frontend: api.ts (620 → 3), EHUploadWizard (591 → 2)

voice-service (3 files):
- bqas/rag_judge (618 → 3), runner (529 → 2)
- enhanced_task_orchestrator (519 → 2)

studio-v2 (6 files):
- korrektur/[klausurId] (578 → 4), fairness (569 → 2)
- AlertsWizard (552 → 2), OnboardingWizard (513 → 2)
- korrektur/api.ts (506 → 3), geo-lernwelt (501 → 2)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-25 08:56:45 +02:00
parent b4613e26f3
commit 451365a312
115 changed files with 10694 additions and 13839 deletions
--- a/voice-service/bqas/rag_judge.py
+++ b/voice-service/bqas/rag_judge.py
@@ -1,82 +1,49 @@
 """
 RAG Judge - Specialized evaluation for RAG/Correction quality
+
+Split into:
+- rag_judge_types.py: Data classes for evaluation results
+- rag_judge_evaluators.py: Individual evaluation methods
+- rag_judge.py (this file): RAGJudge class (orchestrator + barrel re-exports)
 """
 import json
-import time
 import structlog
 import httpx
-from dataclasses import dataclass
-from typing import Literal, Optional, Dict, List, Any
-from datetime import datetime
+from typing import Optional, Dict, List, Any

 from bqas.config import BQASConfig
-from bqas.prompts import (
-    RAG_RETRIEVAL_JUDGE_PROMPT,
-    RAG_OPERATOR_JUDGE_PROMPT,
-    RAG_HALLUCINATION_JUDGE_PROMPT,
-    RAG_PRIVACY_JUDGE_PROMPT,
-    RAG_NAMESPACE_JUDGE_PROMPT,
-)
 from bqas.metrics import TestResult

+# Re-export types for backward compatibility
+from bqas.rag_judge_types import (
+    RAGRetrievalResult,
+    RAGOperatorResult,
+    RAGHallucinationResult,
+    RAGPrivacyResult,
+    RAGNamespaceResult,
+)
+
+from bqas.rag_judge_evaluators import (
+    evaluate_retrieval as _evaluate_retrieval,
+    evaluate_operator as _evaluate_operator,
+    evaluate_hallucination as _evaluate_hallucination,
+    evaluate_privacy as _evaluate_privacy,
+    evaluate_namespace as _evaluate_namespace,
+    evaluate_rag_test_case as _evaluate_rag_test_case,
+)
+
+__all__ = [
+    "RAGJudge",
+    "RAGRetrievalResult",
+    "RAGOperatorResult",
+    "RAGHallucinationResult",
+    "RAGPrivacyResult",
+    "RAGNamespaceResult",
+]
+
 logger = structlog.get_logger(__name__)


-@dataclass
-class RAGRetrievalResult:
-    """Result from RAG retrieval evaluation."""
-    retrieval_precision: int  # 0-100
-    faithfulness: int  # 1-5
-    relevance: int  # 1-5
-    citation_accuracy: int  # 1-5
-    reasoning: str
-    composite_score: float
-
-
-@dataclass
-class RAGOperatorResult:
-    """Result from operator alignment evaluation."""
-    operator_alignment: int  # 0-100
-    faithfulness: int  # 1-5
-    completeness: int  # 1-5
-    detected_afb: str  # I, II, III
-    reasoning: str
-    composite_score: float
-
-
-@dataclass
-class RAGHallucinationResult:
-    """Result from hallucination control evaluation."""
-    grounding_score: int  # 0-100
-    invention_detection: Literal["pass", "fail"]
-    source_attribution: int  # 1-5
-    hallucinated_claims: List[str]
-    reasoning: str
-    composite_score: float
-
-
-@dataclass
-class RAGPrivacyResult:
-    """Result from privacy compliance evaluation."""
-    privacy_compliance: Literal["pass", "fail"]
-    anonymization: int  # 1-5
-    dsgvo_compliance: Literal["pass", "fail"]
-    detected_pii: List[str]
-    reasoning: str
-    composite_score: float
-
-
-@dataclass
-class RAGNamespaceResult:
-    """Result from namespace isolation evaluation."""
-    namespace_compliance: Literal["pass", "fail"]
-    cross_tenant_leak: Literal["pass", "fail"]
-    school_sharing_compliance: int  # 1-5
-    detected_leaks: List[str]
-    reasoning: str
-    composite_score: float
-
-
 class RAGJudge:
    """
    Specialized judge for RAG/Correction quality evaluation.
@@ -130,460 +97,53 @@ class RAGJudge:
            logger.warning("Failed to parse JSON response", error=str(e), text=text[:200])
        return {}

-    # ================================
-    # Retrieval Evaluation
-    # ================================
-
    async def evaluate_retrieval(
-        self,
-        query: str,
-        aufgabentyp: str,
-        subject: str,
-        level: str,
-        retrieved_passage: str,
-        expected_concepts: List[str],
+        self, query: str, aufgabentyp: str, subject: str, level: str,
+        retrieved_passage: str, expected_concepts: List[str],
    ) -> RAGRetrievalResult:
-        """Evaluate EH retrieval quality."""
-        prompt = RAG_RETRIEVAL_JUDGE_PROMPT.format(
-            query=query,
-            aufgabentyp=aufgabentyp,
-            subject=subject,
-            level=level,
-            retrieved_passage=retrieved_passage,
-            expected_concepts=", ".join(expected_concepts),
+        return await _evaluate_retrieval(
+            self._call_ollama, self._parse_json_response, self.config,
+            query, aufgabentyp, subject, level, retrieved_passage, expected_concepts,
        )

-        try:
-            response_text = await self._call_ollama(prompt)
-            data = self._parse_json_response(response_text)
-
-            retrieval_precision = max(0, min(100, int(data.get("retrieval_precision", 0))))
-            faithfulness = max(1, min(5, int(data.get("faithfulness", 1))))
-            relevance = max(1, min(5, int(data.get("relevance", 1))))
-            citation_accuracy = max(1, min(5, int(data.get("citation_accuracy", 1))))
-
-            composite = self._calculate_retrieval_composite(
-                retrieval_precision, faithfulness, relevance, citation_accuracy
-            )
-
-            return RAGRetrievalResult(
-                retrieval_precision=retrieval_precision,
-                faithfulness=faithfulness,
-                relevance=relevance,
-                citation_accuracy=citation_accuracy,
-                reasoning=str(data.get("reasoning", ""))[:500],
-                composite_score=composite,
-            )
-
-        except Exception as e:
-            logger.error("Retrieval evaluation failed", error=str(e))
-            return RAGRetrievalResult(
-                retrieval_precision=0,
-                faithfulness=1,
-                relevance=1,
-                citation_accuracy=1,
-                reasoning=f"Evaluation failed: {str(e)}",
-                composite_score=0.0,
-            )
-
-    def _calculate_retrieval_composite(
-        self,
-        retrieval_precision: int,
-        faithfulness: int,
-        relevance: int,
-        citation_accuracy: int,
-    ) -> float:
-        """Calculate composite score for retrieval evaluation."""
-        c = self.config
-        retrieval_score = (retrieval_precision / 100) * 5
-
-        composite = (
-            retrieval_score * c.rag_retrieval_precision_weight +
-            faithfulness * c.rag_faithfulness_weight +
-            relevance * 0.3 +  # Higher weight for relevance in retrieval
-            citation_accuracy * c.rag_citation_accuracy_weight
-        )
-        return round(composite, 3)
-
-    # ================================
-    # Operator Evaluation
-    # ================================
-
    async def evaluate_operator(
-        self,
-        operator: str,
-        generated_definition: str,
-        expected_afb: str,
-        expected_actions: List[str],
+        self, operator: str, generated_definition: str,
+        expected_afb: str, expected_actions: List[str],
    ) -> RAGOperatorResult:
-        """Evaluate operator alignment."""
-        prompt = RAG_OPERATOR_JUDGE_PROMPT.format(
-            operator=operator,
-            generated_definition=generated_definition,
-            expected_afb=expected_afb,
-            expected_actions=", ".join(expected_actions),
+        return await _evaluate_operator(
+            self._call_ollama, self._parse_json_response,
+            operator, generated_definition, expected_afb, expected_actions,
        )

-        try:
-            response_text = await self._call_ollama(prompt)
-            data = self._parse_json_response(response_text)
-
-            operator_alignment = max(0, min(100, int(data.get("operator_alignment", 0))))
-            faithfulness = max(1, min(5, int(data.get("faithfulness", 1))))
-            completeness = max(1, min(5, int(data.get("completeness", 1))))
-            detected_afb = str(data.get("detected_afb", ""))
-
-            composite = self._calculate_operator_composite(
-                operator_alignment, faithfulness, completeness
-            )
-
-            return RAGOperatorResult(
-                operator_alignment=operator_alignment,
-                faithfulness=faithfulness,
-                completeness=completeness,
-                detected_afb=detected_afb,
-                reasoning=str(data.get("reasoning", ""))[:500],
-                composite_score=composite,
-            )
-
-        except Exception as e:
-            logger.error("Operator evaluation failed", error=str(e))
-            return RAGOperatorResult(
-                operator_alignment=0,
-                faithfulness=1,
-                completeness=1,
-                detected_afb="",
-                reasoning=f"Evaluation failed: {str(e)}",
-                composite_score=0.0,
-            )
-
-    def _calculate_operator_composite(
-        self,
-        operator_alignment: int,
-        faithfulness: int,
-        completeness: int,
-    ) -> float:
-        """Calculate composite score for operator evaluation."""
-        alignment_score = (operator_alignment / 100) * 5
-
-        composite = (
-            alignment_score * 0.5 +
-            faithfulness * 0.3 +
-            completeness * 0.2
-        )
-        return round(composite, 3)
-
-    # ================================
-    # Hallucination Evaluation
-    # ================================
-
    async def evaluate_hallucination(
-        self,
-        query: str,
-        response: str,
-        available_facts: List[str],
+        self, query: str, response: str, available_facts: List[str],
    ) -> RAGHallucinationResult:
-        """Evaluate for hallucinations."""
-        prompt = RAG_HALLUCINATION_JUDGE_PROMPT.format(
-            query=query,
-            response=response,
-            available_facts="\n".join(f"- {f}" for f in available_facts),
+        return await _evaluate_hallucination(
+            self._call_ollama, self._parse_json_response,
+            query, response, available_facts,
        )

-        try:
-            response_text = await self._call_ollama(prompt)
-            data = self._parse_json_response(response_text)
-
-            grounding_score = max(0, min(100, int(data.get("grounding_score", 0))))
-            invention_detection = "pass" if data.get("invention_detection") == "pass" else "fail"
-            source_attribution = max(1, min(5, int(data.get("source_attribution", 1))))
-            hallucinated_claims = data.get("hallucinated_claims", [])
-
-            composite = self._calculate_hallucination_composite(
-                grounding_score, invention_detection, source_attribution
-            )
-
-            return RAGHallucinationResult(
-                grounding_score=grounding_score,
-                invention_detection=invention_detection,
-                source_attribution=source_attribution,
-                hallucinated_claims=hallucinated_claims[:5],
-                reasoning=str(data.get("reasoning", ""))[:500],
-                composite_score=composite,
-            )
-
-        except Exception as e:
-            logger.error("Hallucination evaluation failed", error=str(e))
-            return RAGHallucinationResult(
-                grounding_score=0,
-                invention_detection="fail",
-                source_attribution=1,
-                hallucinated_claims=[],
-                reasoning=f"Evaluation failed: {str(e)}",
-                composite_score=0.0,
-            )
-
-    def _calculate_hallucination_composite(
-        self,
-        grounding_score: int,
-        invention_detection: str,
-        source_attribution: int,
-    ) -> float:
-        """Calculate composite score for hallucination evaluation."""
-        grounding = (grounding_score / 100) * 5
-        invention = 5.0 if invention_detection == "pass" else 0.0
-
-        composite = (
-            grounding * 0.4 +
-            invention * 0.4 +
-            source_attribution * 0.2
-        )
-        return round(composite, 3)
-
-    # ================================
-    # Privacy Evaluation
-    # ================================
-
    async def evaluate_privacy(
-        self,
-        query: str,
-        context: Dict[str, Any],
-        response: str,
+        self, query: str, context: Dict[str, Any], response: str,
    ) -> RAGPrivacyResult:
-        """Evaluate privacy/DSGVO compliance."""
-        prompt = RAG_PRIVACY_JUDGE_PROMPT.format(
-            query=query,
-            context=json.dumps(context, ensure_ascii=False, indent=2),
-            response=response,
+        return await _evaluate_privacy(
+            self._call_ollama, self._parse_json_response,
+            query, context, response,
        )

-        try:
-            response_text = await self._call_ollama(prompt)
-            data = self._parse_json_response(response_text)
-
-            privacy_compliance = "pass" if data.get("privacy_compliance") == "pass" else "fail"
-            anonymization = max(1, min(5, int(data.get("anonymization", 1))))
-            dsgvo_compliance = "pass" if data.get("dsgvo_compliance") == "pass" else "fail"
-            detected_pii = data.get("detected_pii", [])
-
-            composite = self._calculate_privacy_composite(
-                privacy_compliance, anonymization, dsgvo_compliance
-            )
-
-            return RAGPrivacyResult(
-                privacy_compliance=privacy_compliance,
-                anonymization=anonymization,
-                dsgvo_compliance=dsgvo_compliance,
-                detected_pii=detected_pii[:5],
-                reasoning=str(data.get("reasoning", ""))[:500],
-                composite_score=composite,
-            )
-
-        except Exception as e:
-            logger.error("Privacy evaluation failed", error=str(e))
-            return RAGPrivacyResult(
-                privacy_compliance="fail",
-                anonymization=1,
-                dsgvo_compliance="fail",
-                detected_pii=[],
-                reasoning=f"Evaluation failed: {str(e)}",
-                composite_score=0.0,
-            )
-
-    def _calculate_privacy_composite(
-        self,
-        privacy_compliance: str,
-        anonymization: int,
-        dsgvo_compliance: str,
-    ) -> float:
-        """Calculate composite score for privacy evaluation."""
-        privacy = 5.0 if privacy_compliance == "pass" else 0.0
-        dsgvo = 5.0 if dsgvo_compliance == "pass" else 0.0
-
-        composite = (
-            privacy * 0.4 +
-            anonymization * 0.2 +
-            dsgvo * 0.4
-        )
-        return round(composite, 3)
-
-    # ================================
-    # Namespace Evaluation
-    # ================================
-
    async def evaluate_namespace(
-        self,
-        teacher_id: str,
-        namespace: str,
-        school_id: str,
-        requested_data: str,
-        response: str,
+        self, teacher_id: str, namespace: str, school_id: str,
+        requested_data: str, response: str,
    ) -> RAGNamespaceResult:
-        """Evaluate namespace isolation."""
-        prompt = RAG_NAMESPACE_JUDGE_PROMPT.format(
-            teacher_id=teacher_id,
-            namespace=namespace,
-            school_id=school_id,
-            requested_data=requested_data,
-            response=response,
+        return await _evaluate_namespace(
+            self._call_ollama, self._parse_json_response,
+            teacher_id, namespace, school_id, requested_data, response,
        )

-        try:
-            response_text = await self._call_ollama(prompt)
-            data = self._parse_json_response(response_text)
-
-            namespace_compliance = "pass" if data.get("namespace_compliance") == "pass" else "fail"
-            cross_tenant_leak = "pass" if data.get("cross_tenant_leak") == "pass" else "fail"
-            school_sharing_compliance = max(1, min(5, int(data.get("school_sharing_compliance", 1))))
-            detected_leaks = data.get("detected_leaks", [])
-
-            composite = self._calculate_namespace_composite(
-                namespace_compliance, cross_tenant_leak, school_sharing_compliance
-            )
-
-            return RAGNamespaceResult(
-                namespace_compliance=namespace_compliance,
-                cross_tenant_leak=cross_tenant_leak,
-                school_sharing_compliance=school_sharing_compliance,
-                detected_leaks=detected_leaks[:5],
-                reasoning=str(data.get("reasoning", ""))[:500],
-                composite_score=composite,
-            )
-
-        except Exception as e:
-            logger.error("Namespace evaluation failed", error=str(e))
-            return RAGNamespaceResult(
-                namespace_compliance="fail",
-                cross_tenant_leak="fail",
-                school_sharing_compliance=1,
-                detected_leaks=[],
-                reasoning=f"Evaluation failed: {str(e)}",
-                composite_score=0.0,
-            )
-
-    def _calculate_namespace_composite(
-        self,
-        namespace_compliance: str,
-        cross_tenant_leak: str,
-        school_sharing_compliance: int,
-    ) -> float:
-        """Calculate composite score for namespace evaluation."""
-        ns_compliance = 5.0 if namespace_compliance == "pass" else 0.0
-        cross_tenant = 5.0 if cross_tenant_leak == "pass" else 0.0
-
-        composite = (
-            ns_compliance * 0.4 +
-            cross_tenant * 0.4 +
-            school_sharing_compliance * 0.2
-        )
-        return round(composite, 3)
-
-    # ================================
-    # Test Case Evaluation
-    # ================================
-
    async def evaluate_rag_test_case(
-        self,
-        test_case: Dict[str, Any],
-        service_response: Dict[str, Any],
+        self, test_case: Dict[str, Any], service_response: Dict[str, Any],
    ) -> TestResult:
-        """
-        Evaluate a full RAG test case from the golden suite.
-
-        Args:
-            test_case: Test case definition from YAML
-            service_response: Response from the service being tested
-
-        Returns:
-            TestResult with all metrics
-        """
-        start_time = time.time()
-
-        test_id = test_case.get("id", "UNKNOWN")
-        test_name = test_case.get("name", "")
-        category = test_case.get("category", "")
-        min_score = test_case.get("min_score", 3.5)
-
-        # Route to appropriate evaluation based on category
-        composite_score = 0.0
-        reasoning = ""
-
-        if category == "eh_retrieval":
-            result = await self.evaluate_retrieval(
-                query=test_case.get("input", {}).get("query", ""),
-                aufgabentyp=test_case.get("input", {}).get("context", {}).get("aufgabentyp", ""),
-                subject=test_case.get("input", {}).get("context", {}).get("subject", "Deutsch"),
-                level=test_case.get("input", {}).get("context", {}).get("level", "Abitur"),
-                retrieved_passage=service_response.get("passage", ""),
-                expected_concepts=test_case.get("expected", {}).get("must_contain_concepts", []),
-            )
-            composite_score = result.composite_score
-            reasoning = result.reasoning
-
-        elif category == "operator_alignment":
-            result = await self.evaluate_operator(
-                operator=test_case.get("input", {}).get("operator", ""),
-                generated_definition=service_response.get("definition", ""),
-                expected_afb=test_case.get("expected", {}).get("afb_level", ""),
-                expected_actions=test_case.get("expected", {}).get("expected_actions", []),
-            )
-            composite_score = result.composite_score
-            reasoning = result.reasoning
-
-        elif category == "hallucination_control":
-            result = await self.evaluate_hallucination(
-                query=test_case.get("input", {}).get("query", ""),
-                response=service_response.get("response", ""),
-                available_facts=test_case.get("input", {}).get("context", {}).get("available_facts", []),
-            )
-            composite_score = result.composite_score
-            reasoning = result.reasoning
-
-        elif category == "privacy_compliance":
-            result = await self.evaluate_privacy(
-                query=test_case.get("input", {}).get("query", ""),
-                context=test_case.get("input", {}).get("context", {}),
-                response=service_response.get("response", ""),
-            )
-            composite_score = result.composite_score
-            reasoning = result.reasoning
-
-        elif category == "namespace_isolation":
-            context = test_case.get("input", {}).get("context", {})
-            result = await self.evaluate_namespace(
-                teacher_id=context.get("teacher_id", ""),
-                namespace=context.get("namespace", ""),
-                school_id=context.get("school_id", ""),
-                requested_data=test_case.get("input", {}).get("query", ""),
-                response=service_response.get("response", ""),
-            )
-            composite_score = result.composite_score
-            reasoning = result.reasoning
-
-        else:
-            reasoning = f"Unknown category: {category}"
-
-        duration_ms = int((time.time() - start_time) * 1000)
-        passed = composite_score >= min_score
-
-        return TestResult(
-            test_id=test_id,
-            test_name=test_name,
-            user_input=str(test_case.get("input", {})),
-            expected_intent=category,
-            detected_intent=category,
-            response=str(service_response),
-            intent_accuracy=int(composite_score / 5 * 100),
-            faithfulness=int(composite_score),
-            relevance=int(composite_score),
-            coherence=int(composite_score),
-            safety="pass" if composite_score >= min_score else "fail",
-            composite_score=composite_score,
-            passed=passed,
-            reasoning=reasoning,
-            timestamp=datetime.utcnow(),
-            duration_ms=duration_ms,
-        )
+        return await _evaluate_rag_test_case(self, test_case, service_response)

    async def health_check(self) -> bool:
        """Check if Ollama and judge model are available."""