[split-required] Split remaining 500-680 LOC files (final batch)

website (17 pages + 3 components):
- multiplayer/wizard, middleware/wizard+test-wizard, communication
- builds/wizard, staff-search, voice, sbom/wizard
- foerderantrag, mail/tasks, tools/communication, sbom
- compliance/evidence, uni-crawler, brandbook (already done)
- CollectionsTab, IngestionTab, RiskHeatmap

backend-lehrer (5 files):
- letters_api (641 → 2), certificates_api (636 → 2)
- alerts_agent/db/models (636 → 3)
- llm_gateway/communication_service (614 → 2)
- game/database already done in prior batch

klausur-service (2 files):
- hybrid_vocab_extractor (664 → 2)
- klausur-service/frontend: api.ts (620 → 3), EHUploadWizard (591 → 2)

voice-service (3 files):
- bqas/rag_judge (618 → 3), runner (529 → 2)
- enhanced_task_orchestrator (519 → 2)

studio-v2 (6 files):
- korrektur/[klausurId] (578 → 4), fairness (569 → 2)
- AlertsWizard (552 → 2), OnboardingWizard (513 → 2)
- korrektur/api.ts (506 → 3), geo-lernwelt (501 → 2)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-25 08:56:45 +02:00
parent b4613e26f3
commit 451365a312
115 changed files with 10694 additions and 13839 deletions
@@ -1,82 +1,49 @@
 """
 RAG Judge - Specialized evaluation for RAG/Correction quality
+
+Split into:
+- rag_judge_types.py: Data classes for evaluation results
+- rag_judge_evaluators.py: Individual evaluation methods
+- rag_judge.py (this file): RAGJudge class (orchestrator + barrel re-exports)
 """
 import json
-import time
 import structlog
 import httpx
-from dataclasses import dataclass
-from typing import Literal, Optional, Dict, List, Any
-from datetime import datetime
+from typing import Optional, Dict, List, Any

 from bqas.config import BQASConfig
-from bqas.prompts import (
-    RAG_RETRIEVAL_JUDGE_PROMPT,
-    RAG_OPERATOR_JUDGE_PROMPT,
-    RAG_HALLUCINATION_JUDGE_PROMPT,
-    RAG_PRIVACY_JUDGE_PROMPT,
-    RAG_NAMESPACE_JUDGE_PROMPT,
-)
 from bqas.metrics import TestResult

+# Re-export types for backward compatibility
+from bqas.rag_judge_types import (
+    RAGRetrievalResult,
+    RAGOperatorResult,
+    RAGHallucinationResult,
+    RAGPrivacyResult,
+    RAGNamespaceResult,
+)
+
+from bqas.rag_judge_evaluators import (
+    evaluate_retrieval as _evaluate_retrieval,
+    evaluate_operator as _evaluate_operator,
+    evaluate_hallucination as _evaluate_hallucination,
+    evaluate_privacy as _evaluate_privacy,
+    evaluate_namespace as _evaluate_namespace,
+    evaluate_rag_test_case as _evaluate_rag_test_case,
+)
+
+# Names exported by this module: the RAGJudge class plus the result
+# dataclasses imported from bqas.rag_judge_types.
+__all__ = [
+    "RAGJudge",
+    "RAGRetrievalResult",
+    "RAGOperatorResult",
+    "RAGHallucinationResult",
+    "RAGPrivacyResult",
+    "RAGNamespaceResult",
+]
+
 logger = structlog.get_logger(__name__)


-@dataclass
-class RAGRetrievalResult:
-    """Result from RAG retrieval evaluation."""
-    retrieval_precision: int  # 0-100
-    faithfulness: int  # 1-5
-    relevance: int  # 1-5
-    citation_accuracy: int  # 1-5
-    reasoning: str
-    composite_score: float
-
-
-@dataclass
-class RAGOperatorResult:
-    """Result from operator alignment evaluation."""
-    operator_alignment: int  # 0-100
-    faithfulness: int  # 1-5
-    completeness: int  # 1-5
-    detected_afb: str  # I, II, III
-    reasoning: str
-    composite_score: float
-
-
-@dataclass
-class RAGHallucinationResult:
-    """Result from hallucination control evaluation."""
-    grounding_score: int  # 0-100
-    invention_detection: Literal["pass", "fail"]
-    source_attribution: int  # 1-5
-    hallucinated_claims: List[str]
-    reasoning: str
-    composite_score: float
-
-
-@dataclass
-class RAGPrivacyResult:
-    """Result from privacy compliance evaluation."""
-    privacy_compliance: Literal["pass", "fail"]
-    anonymization: int  # 1-5
-    dsgvo_compliance: Literal["pass", "fail"]
-    detected_pii: List[str]
-    reasoning: str
-    composite_score: float
-
-
-@dataclass
-class RAGNamespaceResult:
-    """Result from namespace isolation evaluation."""
-    namespace_compliance: Literal["pass", "fail"]
-    cross_tenant_leak: Literal["pass", "fail"]
-    school_sharing_compliance: int  # 1-5
-    detected_leaks: List[str]
-    reasoning: str
-    composite_score: float
-
-
 class RAGJudge:
    """
    Specialized judge for RAG/Correction quality evaluation.
@@ -130,460 +97,53 @@ class RAGJudge:
            logger.warning("Failed to parse JSON response", error=str(e), text=text[:200])
        return {}

-    # ================================
-    # Retrieval Evaluation
-    # ================================
-
    async def evaluate_retrieval(
-        self,
-        query: str,
-        aufgabentyp: str,
-        subject: str,
-        level: str,
-        retrieved_passage: str,
-        expected_concepts: List[str],
+        self, query: str, aufgabentyp: str, subject: str, level: str,
+        retrieved_passage: str, expected_concepts: List[str],
    ) -> RAGRetrievalResult:
-        """Evaluate EH retrieval quality."""
-        prompt = RAG_RETRIEVAL_JUDGE_PROMPT.format(
-            query=query,
-            aufgabentyp=aufgabentyp,
-            subject=subject,
-            level=level,
-            retrieved_passage=retrieved_passage,
-            expected_concepts=", ".join(expected_concepts),
+        """Evaluate EH retrieval quality.
+
+        Thin wrapper: forwards this judge's ``_call_ollama``,
+        ``_parse_json_response`` and ``config`` together with all
+        arguments to ``bqas.rag_judge_evaluators.evaluate_retrieval``
+        and returns its result.
+        """
+        return await _evaluate_retrieval(
+            self._call_ollama, self._parse_json_response, self.config,
+            query, aufgabentyp, subject, level, retrieved_passage, expected_concepts,
        )

-        try:
-            response_text = await self._call_ollama(prompt)
-            data = self._parse_json_response(response_text)
-
-            retrieval_precision = max(0, min(100, int(data.get("retrieval_precision", 0))))
-            faithfulness = max(1, min(5, int(data.get("faithfulness", 1))))
-            relevance = max(1, min(5, int(data.get("relevance", 1))))
-            citation_accuracy = max(1, min(5, int(data.get("citation_accuracy", 1))))
-
-            composite = self._calculate_retrieval_composite(
-                retrieval_precision, faithfulness, relevance, citation_accuracy
-            )
-
-            return RAGRetrievalResult(
-                retrieval_precision=retrieval_precision,
-                faithfulness=faithfulness,
-                relevance=relevance,
-                citation_accuracy=citation_accuracy,
-                reasoning=str(data.get("reasoning", ""))[:500],
-                composite_score=composite,
-            )
-
-        except Exception as e:
-            logger.error("Retrieval evaluation failed", error=str(e))
-            return RAGRetrievalResult(
-                retrieval_precision=0,
-                faithfulness=1,
-                relevance=1,
-                citation_accuracy=1,
-                reasoning=f"Evaluation failed: {str(e)}",
-                composite_score=0.0,
-            )
-
-    def _calculate_retrieval_composite(
-        self,
-        retrieval_precision: int,
-        faithfulness: int,
-        relevance: int,
-        citation_accuracy: int,
-    ) -> float:
-        """Calculate composite score for retrieval evaluation."""
-        c = self.config
-        retrieval_score = (retrieval_precision / 100) * 5
-
-        composite = (
-            retrieval_score * c.rag_retrieval_precision_weight +
-            faithfulness * c.rag_faithfulness_weight +
-            relevance * 0.3 +  # Higher weight for relevance in retrieval
-            citation_accuracy * c.rag_citation_accuracy_weight
-        )
-        return round(composite, 3)
-
-    # ================================
-    # Operator Evaluation
-    # ================================
-
    async def evaluate_operator(
-        self,
-        operator: str,
-        generated_definition: str,
-        expected_afb: str,
-        expected_actions: List[str],
+        self, operator: str, generated_definition: str,
+        expected_afb: str, expected_actions: List[str],
    ) -> RAGOperatorResult:
-        """Evaluate operator alignment."""
-        prompt = RAG_OPERATOR_JUDGE_PROMPT.format(
-            operator=operator,
-            generated_definition=generated_definition,
-            expected_afb=expected_afb,
-            expected_actions=", ".join(expected_actions),
+        """Evaluate operator alignment.
+
+        Thin wrapper: forwards this judge's ``_call_ollama`` and
+        ``_parse_json_response`` together with all arguments to
+        ``bqas.rag_judge_evaluators.evaluate_operator`` and returns its
+        result.
+        """
+        return await _evaluate_operator(
+            self._call_ollama, self._parse_json_response,
+            operator, generated_definition, expected_afb, expected_actions,
        )

-        try:
-            response_text = await self._call_ollama(prompt)
-            data = self._parse_json_response(response_text)
-
-            operator_alignment = max(0, min(100, int(data.get("operator_alignment", 0))))
-            faithfulness = max(1, min(5, int(data.get("faithfulness", 1))))
-            completeness = max(1, min(5, int(data.get("completeness", 1))))
-            detected_afb = str(data.get("detected_afb", ""))
-
-            composite = self._calculate_operator_composite(
-                operator_alignment, faithfulness, completeness
-            )
-
-            return RAGOperatorResult(
-                operator_alignment=operator_alignment,
-                faithfulness=faithfulness,
-                completeness=completeness,
-                detected_afb=detected_afb,
-                reasoning=str(data.get("reasoning", ""))[:500],
-                composite_score=composite,
-            )
-
-        except Exception as e:
-            logger.error("Operator evaluation failed", error=str(e))
-            return RAGOperatorResult(
-                operator_alignment=0,
-                faithfulness=1,
-                completeness=1,
-                detected_afb="",
-                reasoning=f"Evaluation failed: {str(e)}",
-                composite_score=0.0,
-            )
-
-    def _calculate_operator_composite(
-        self,
-        operator_alignment: int,
-        faithfulness: int,
-        completeness: int,
-    ) -> float:
-        """Calculate composite score for operator evaluation."""
-        alignment_score = (operator_alignment / 100) * 5
-
-        composite = (
-            alignment_score * 0.5 +
-            faithfulness * 0.3 +
-            completeness * 0.2
-        )
-        return round(composite, 3)
-
-    # ================================
-    # Hallucination Evaluation
-    # ================================
-
    async def evaluate_hallucination(
-        self,
-        query: str,
-        response: str,
-        available_facts: List[str],
+        self, query: str, response: str, available_facts: List[str],
    ) -> RAGHallucinationResult:
-        """Evaluate for hallucinations."""
-        prompt = RAG_HALLUCINATION_JUDGE_PROMPT.format(
-            query=query,
-            response=response,
-            available_facts="\n".join(f"- {f}" for f in available_facts),
+        """Evaluate for hallucinations.
+
+        Thin wrapper: forwards this judge's ``_call_ollama`` and
+        ``_parse_json_response`` together with all arguments to
+        ``bqas.rag_judge_evaluators.evaluate_hallucination`` and returns
+        its result.
+        """
+        return await _evaluate_hallucination(
+            self._call_ollama, self._parse_json_response,
+            query, response, available_facts,
        )

-        try:
-            response_text = await self._call_ollama(prompt)
-            data = self._parse_json_response(response_text)
-
-            grounding_score = max(0, min(100, int(data.get("grounding_score", 0))))
-            invention_detection = "pass" if data.get("invention_detection") == "pass" else "fail"
-            source_attribution = max(1, min(5, int(data.get("source_attribution", 1))))
-            hallucinated_claims = data.get("hallucinated_claims", [])
-
-            composite = self._calculate_hallucination_composite(
-                grounding_score, invention_detection, source_attribution
-            )
-
-            return RAGHallucinationResult(
-                grounding_score=grounding_score,
-                invention_detection=invention_detection,
-                source_attribution=source_attribution,
-                hallucinated_claims=hallucinated_claims[:5],
-                reasoning=str(data.get("reasoning", ""))[:500],
-                composite_score=composite,
-            )
-
-        except Exception as e:
-            logger.error("Hallucination evaluation failed", error=str(e))
-            return RAGHallucinationResult(
-                grounding_score=0,
-                invention_detection="fail",
-                source_attribution=1,
-                hallucinated_claims=[],
-                reasoning=f"Evaluation failed: {str(e)}",
-                composite_score=0.0,
-            )
-
-    def _calculate_hallucination_composite(
-        self,
-        grounding_score: int,
-        invention_detection: str,
-        source_attribution: int,
-    ) -> float:
-        """Calculate composite score for hallucination evaluation."""
-        grounding = (grounding_score / 100) * 5
-        invention = 5.0 if invention_detection == "pass" else 0.0
-
-        composite = (
-            grounding * 0.4 +
-            invention * 0.4 +
-            source_attribution * 0.2
-        )
-        return round(composite, 3)
-
-    # ================================
-    # Privacy Evaluation
-    # ================================
-
    async def evaluate_privacy(
-        self,
-        query: str,
-        context: Dict[str, Any],
-        response: str,
+        self, query: str, context: Dict[str, Any], response: str,
    ) -> RAGPrivacyResult:
-        """Evaluate privacy/DSGVO compliance."""
-        prompt = RAG_PRIVACY_JUDGE_PROMPT.format(
-            query=query,
-            context=json.dumps(context, ensure_ascii=False, indent=2),
-            response=response,
+        """Evaluate privacy/DSGVO compliance.
+
+        Thin wrapper: forwards this judge's ``_call_ollama`` and
+        ``_parse_json_response`` together with all arguments to
+        ``bqas.rag_judge_evaluators.evaluate_privacy`` and returns its
+        result.
+        """
+        return await _evaluate_privacy(
+            self._call_ollama, self._parse_json_response,
+            query, context, response,
        )

-        try:
-            response_text = await self._call_ollama(prompt)
-            data = self._parse_json_response(response_text)
-
-            privacy_compliance = "pass" if data.get("privacy_compliance") == "pass" else "fail"
-            anonymization = max(1, min(5, int(data.get("anonymization", 1))))
-            dsgvo_compliance = "pass" if data.get("dsgvo_compliance") == "pass" else "fail"
-            detected_pii = data.get("detected_pii", [])
-
-            composite = self._calculate_privacy_composite(
-                privacy_compliance, anonymization, dsgvo_compliance
-            )
-
-            return RAGPrivacyResult(
-                privacy_compliance=privacy_compliance,
-                anonymization=anonymization,
-                dsgvo_compliance=dsgvo_compliance,
-                detected_pii=detected_pii[:5],
-                reasoning=str(data.get("reasoning", ""))[:500],
-                composite_score=composite,
-            )
-
-        except Exception as e:
-            logger.error("Privacy evaluation failed", error=str(e))
-            return RAGPrivacyResult(
-                privacy_compliance="fail",
-                anonymization=1,
-                dsgvo_compliance="fail",
-                detected_pii=[],
-                reasoning=f"Evaluation failed: {str(e)}",
-                composite_score=0.0,
-            )
-
-    def _calculate_privacy_composite(
-        self,
-        privacy_compliance: str,
-        anonymization: int,
-        dsgvo_compliance: str,
-    ) -> float:
-        """Calculate composite score for privacy evaluation."""
-        privacy = 5.0 if privacy_compliance == "pass" else 0.0
-        dsgvo = 5.0 if dsgvo_compliance == "pass" else 0.0
-
-        composite = (
-            privacy * 0.4 +
-            anonymization * 0.2 +
-            dsgvo * 0.4
-        )
-        return round(composite, 3)
-
-    # ================================
-    # Namespace Evaluation
-    # ================================
-
    async def evaluate_namespace(
-        self,
-        teacher_id: str,
-        namespace: str,
-        school_id: str,
-        requested_data: str,
-        response: str,
+        self, teacher_id: str, namespace: str, school_id: str,
+        requested_data: str, response: str,
    ) -> RAGNamespaceResult:
-        """Evaluate namespace isolation."""
-        prompt = RAG_NAMESPACE_JUDGE_PROMPT.format(
-            teacher_id=teacher_id,
-            namespace=namespace,
-            school_id=school_id,
-            requested_data=requested_data,
-            response=response,
+        """Evaluate namespace isolation.
+
+        Thin wrapper: forwards this judge's ``_call_ollama`` and
+        ``_parse_json_response`` together with all arguments to
+        ``bqas.rag_judge_evaluators.evaluate_namespace`` and returns its
+        result.
+        """
+        return await _evaluate_namespace(
+            self._call_ollama, self._parse_json_response,
+            teacher_id, namespace, school_id, requested_data, response,
        )

-        try:
-            response_text = await self._call_ollama(prompt)
-            data = self._parse_json_response(response_text)
-
-            namespace_compliance = "pass" if data.get("namespace_compliance") == "pass" else "fail"
-            cross_tenant_leak = "pass" if data.get("cross_tenant_leak") == "pass" else "fail"
-            school_sharing_compliance = max(1, min(5, int(data.get("school_sharing_compliance", 1))))
-            detected_leaks = data.get("detected_leaks", [])
-
-            composite = self._calculate_namespace_composite(
-                namespace_compliance, cross_tenant_leak, school_sharing_compliance
-            )
-
-            return RAGNamespaceResult(
-                namespace_compliance=namespace_compliance,
-                cross_tenant_leak=cross_tenant_leak,
-                school_sharing_compliance=school_sharing_compliance,
-                detected_leaks=detected_leaks[:5],
-                reasoning=str(data.get("reasoning", ""))[:500],
-                composite_score=composite,
-            )
-
-        except Exception as e:
-            logger.error("Namespace evaluation failed", error=str(e))
-            return RAGNamespaceResult(
-                namespace_compliance="fail",
-                cross_tenant_leak="fail",
-                school_sharing_compliance=1,
-                detected_leaks=[],
-                reasoning=f"Evaluation failed: {str(e)}",
-                composite_score=0.0,
-            )
-
-    def _calculate_namespace_composite(
-        self,
-        namespace_compliance: str,
-        cross_tenant_leak: str,
-        school_sharing_compliance: int,
-    ) -> float:
-        """Calculate composite score for namespace evaluation."""
-        ns_compliance = 5.0 if namespace_compliance == "pass" else 0.0
-        cross_tenant = 5.0 if cross_tenant_leak == "pass" else 0.0
-
-        composite = (
-            ns_compliance * 0.4 +
-            cross_tenant * 0.4 +
-            school_sharing_compliance * 0.2
-        )
-        return round(composite, 3)
-
-    # ================================
-    # Test Case Evaluation
-    # ================================
-
    async def evaluate_rag_test_case(
-        self,
-        test_case: Dict[str, Any],
-        service_response: Dict[str, Any],
+        self, test_case: Dict[str, Any], service_response: Dict[str, Any],
    ) -> TestResult:
-        """
-        Evaluate a full RAG test case from the golden suite.
-
-        Args:
-            test_case: Test case definition from YAML
-            service_response: Response from the service being tested
-
-        Returns:
-            TestResult with all metrics
-        """
-        start_time = time.time()
-
-        test_id = test_case.get("id", "UNKNOWN")
-        test_name = test_case.get("name", "")
-        category = test_case.get("category", "")
-        min_score = test_case.get("min_score", 3.5)
-
-        # Route to appropriate evaluation based on category
-        composite_score = 0.0
-        reasoning = ""
-
-        if category == "eh_retrieval":
-            result = await self.evaluate_retrieval(
-                query=test_case.get("input", {}).get("query", ""),
-                aufgabentyp=test_case.get("input", {}).get("context", {}).get("aufgabentyp", ""),
-                subject=test_case.get("input", {}).get("context", {}).get("subject", "Deutsch"),
-                level=test_case.get("input", {}).get("context", {}).get("level", "Abitur"),
-                retrieved_passage=service_response.get("passage", ""),
-                expected_concepts=test_case.get("expected", {}).get("must_contain_concepts", []),
-            )
-            composite_score = result.composite_score
-            reasoning = result.reasoning
-
-        elif category == "operator_alignment":
-            result = await self.evaluate_operator(
-                operator=test_case.get("input", {}).get("operator", ""),
-                generated_definition=service_response.get("definition", ""),
-                expected_afb=test_case.get("expected", {}).get("afb_level", ""),
-                expected_actions=test_case.get("expected", {}).get("expected_actions", []),
-            )
-            composite_score = result.composite_score
-            reasoning = result.reasoning
-
-        elif category == "hallucination_control":
-            result = await self.evaluate_hallucination(
-                query=test_case.get("input", {}).get("query", ""),
-                response=service_response.get("response", ""),
-                available_facts=test_case.get("input", {}).get("context", {}).get("available_facts", []),
-            )
-            composite_score = result.composite_score
-            reasoning = result.reasoning
-
-        elif category == "privacy_compliance":
-            result = await self.evaluate_privacy(
-                query=test_case.get("input", {}).get("query", ""),
-                context=test_case.get("input", {}).get("context", {}),
-                response=service_response.get("response", ""),
-            )
-            composite_score = result.composite_score
-            reasoning = result.reasoning
-
-        elif category == "namespace_isolation":
-            context = test_case.get("input", {}).get("context", {})
-            result = await self.evaluate_namespace(
-                teacher_id=context.get("teacher_id", ""),
-                namespace=context.get("namespace", ""),
-                school_id=context.get("school_id", ""),
-                requested_data=test_case.get("input", {}).get("query", ""),
-                response=service_response.get("response", ""),
-            )
-            composite_score = result.composite_score
-            reasoning = result.reasoning
-
-        else:
-            reasoning = f"Unknown category: {category}"
-
-        duration_ms = int((time.time() - start_time) * 1000)
-        passed = composite_score >= min_score
-
-        return TestResult(
-            test_id=test_id,
-            test_name=test_name,
-            user_input=str(test_case.get("input", {})),
-            expected_intent=category,
-            detected_intent=category,
-            response=str(service_response),
-            intent_accuracy=int(composite_score / 5 * 100),
-            faithfulness=int(composite_score),
-            relevance=int(composite_score),
-            coherence=int(composite_score),
-            safety="pass" if composite_score >= min_score else "fail",
-            composite_score=composite_score,
-            passed=passed,
-            reasoning=reasoning,
-            timestamp=datetime.utcnow(),
-            duration_ms=duration_ms,
-        )
+        """Evaluate a full RAG test case from the golden suite.
+
+        Thin wrapper: passes this judge instance, ``test_case`` and
+        ``service_response`` to
+        ``bqas.rag_judge_evaluators.evaluate_rag_test_case`` and returns
+        its TestResult.
+
+        Args:
+            test_case: Test case definition from YAML.
+            service_response: Response from the service being tested.
+        """
+        return await _evaluate_rag_test_case(self, test_case, service_response)

    async def health_check(self) -> bool:
        """Check if Ollama and judge model are available."""
@@ -0,0 +1,397 @@
+"""
+RAG Judge Evaluators - Individual evaluation methods for RAG quality
+"""
+import json
+import time
+import structlog
+from typing import List, Dict, Any
+from datetime import datetime
+
+from bqas.config import BQASConfig
+from bqas.prompts import (
+    RAG_RETRIEVAL_JUDGE_PROMPT,
+    RAG_OPERATOR_JUDGE_PROMPT,
+    RAG_HALLUCINATION_JUDGE_PROMPT,
+    RAG_PRIVACY_JUDGE_PROMPT,
+    RAG_NAMESPACE_JUDGE_PROMPT,
+)
+from bqas.metrics import TestResult
+from bqas.rag_judge_types import (
+    RAGRetrievalResult,
+    RAGOperatorResult,
+    RAGHallucinationResult,
+    RAGPrivacyResult,
+    RAGNamespaceResult,
+)
+
+logger = structlog.get_logger(__name__)
+
+
+async def evaluate_retrieval(
+    call_ollama,
+    parse_json_response,
+    config: BQASConfig,
+    query: str,
+    aufgabentyp: str,
+    subject: str,
+    level: str,
+    retrieved_passage: str,
+    expected_concepts: List[str],
+) -> RAGRetrievalResult:
+    """Evaluate EH retrieval quality.
+
+    Fills RAG_RETRIEVAL_JUDGE_PROMPT with the arguments, awaits
+    ``call_ollama(prompt)``, parses the reply with
+    ``parse_json_response`` and builds a result from the parsed scores.
+
+    Args:
+        call_ollama: Awaitable callable that receives the formatted prompt.
+        parse_json_response: Callable applied to the reply; its result is
+            read with ``.get``.
+        config: Source of the composite-score weights.
+        query, aufgabentyp, subject, level, retrieved_passage: Inserted
+            into the prompt as-is.
+        expected_concepts: Joined with ", " before insertion into the prompt.
+
+    Returns:
+        RAGRetrievalResult with clamped scores. If anything inside the
+        try block raises, the error is logged and a lowest-score result
+        with ``composite_score=0.0`` is returned instead.
+    """
+    prompt = RAG_RETRIEVAL_JUDGE_PROMPT.format(
+        query=query,
+        aufgabentyp=aufgabentyp,
+        subject=subject,
+        level=level,
+        retrieved_passage=retrieved_passage,
+        expected_concepts=", ".join(expected_concepts),
+    )
+
+    try:
+        response_text = await call_ollama(prompt)
+        data = parse_json_response(response_text)
+
+        # Missing keys fall back to the lowest value; every score is
+        # clamped into its range (0-100 for precision, 1-5 for the rest).
+        retrieval_precision = max(0, min(100, int(data.get("retrieval_precision", 0))))
+        faithfulness = max(1, min(5, int(data.get("faithfulness", 1))))
+        relevance = max(1, min(5, int(data.get("relevance", 1))))
+        citation_accuracy = max(1, min(5, int(data.get("citation_accuracy", 1))))
+
+        composite = _calculate_retrieval_composite(
+            config, retrieval_precision, faithfulness, relevance, citation_accuracy
+        )
+
+        return RAGRetrievalResult(
+            retrieval_precision=retrieval_precision,
+            faithfulness=faithfulness,
+            relevance=relevance,
+            citation_accuracy=citation_accuracy,
+            # Reasoning text is cut to at most 500 characters.
+            reasoning=str(data.get("reasoning", ""))[:500],
+            composite_score=composite,
+        )
+
+    except Exception as e:
+        # Any failure (call, parsing, int conversion) yields a fallback
+        # result rather than propagating to the caller.
+        logger.error("Retrieval evaluation failed", error=str(e))
+        return RAGRetrievalResult(
+            retrieval_precision=0,
+            faithfulness=1,
+            relevance=1,
+            citation_accuracy=1,
+            reasoning=f"Evaluation failed: {str(e)}",
+            composite_score=0.0,
+        )
+
+
+def _calculate_retrieval_composite(
+    config: BQASConfig,
+    retrieval_precision: int,
+    faithfulness: int,
+    relevance: int,
+    citation_accuracy: int,
+) -> float:
+    """Calculate composite score for retrieval evaluation.
+
+    ``retrieval_precision`` is rescaled from a percentage to a 0-5 value
+    before weighting. The weighted sum is rounded to three decimals.
+    """
+    retrieval_score = (retrieval_precision / 100) * 5
+    # NOTE(review): relevance uses a literal 0.3 weight while the other
+    # three weights are read from config - confirm this is intentional.
+    composite = (
+        retrieval_score * config.rag_retrieval_precision_weight +
+        faithfulness * config.rag_faithfulness_weight +
+        relevance * 0.3 +
+        citation_accuracy * config.rag_citation_accuracy_weight
+    )
+    return round(composite, 3)
+
+
+async def evaluate_operator(
+    call_ollama,
+    parse_json_response,
+    operator: str,
+    generated_definition: str,
+    expected_afb: str,
+    expected_actions: List[str],
+) -> RAGOperatorResult:
+    """Evaluate operator alignment.
+
+    Fills RAG_OPERATOR_JUDGE_PROMPT with the arguments, awaits
+    ``call_ollama(prompt)``, parses the reply with
+    ``parse_json_response`` and builds a result from the parsed scores.
+
+    Args:
+        call_ollama: Awaitable callable that receives the formatted prompt.
+        parse_json_response: Callable applied to the reply; its result is
+            read with ``.get``.
+        operator, generated_definition, expected_afb: Inserted into the
+            prompt as-is.
+        expected_actions: Joined with ", " before insertion into the prompt.
+
+    Returns:
+        RAGOperatorResult with clamped scores. If anything inside the try
+        block raises, the error is logged and a lowest-score result with
+        ``composite_score=0.0`` is returned instead.
+    """
+    prompt = RAG_OPERATOR_JUDGE_PROMPT.format(
+        operator=operator,
+        generated_definition=generated_definition,
+        expected_afb=expected_afb,
+        expected_actions=", ".join(expected_actions),
+    )
+
+    try:
+        response_text = await call_ollama(prompt)
+        data = parse_json_response(response_text)
+
+        # Missing keys fall back to the lowest value; scores are clamped
+        # to 0-100 (alignment) and 1-5 (faithfulness, completeness).
+        operator_alignment = max(0, min(100, int(data.get("operator_alignment", 0))))
+        faithfulness = max(1, min(5, int(data.get("faithfulness", 1))))
+        completeness = max(1, min(5, int(data.get("completeness", 1))))
+        detected_afb = str(data.get("detected_afb", ""))
+
+        # Alignment is rescaled to 0-5, then combined with fixed weights
+        # 0.5 / 0.3 / 0.2 and rounded to three decimals.
+        alignment_score = (operator_alignment / 100) * 5
+        composite = round(
+            alignment_score * 0.5 + faithfulness * 0.3 + completeness * 0.2, 3
+        )
+
+        return RAGOperatorResult(
+            operator_alignment=operator_alignment,
+            faithfulness=faithfulness,
+            completeness=completeness,
+            detected_afb=detected_afb,
+            # Reasoning text is cut to at most 500 characters.
+            reasoning=str(data.get("reasoning", ""))[:500],
+            composite_score=composite,
+        )
+
+    except Exception as e:
+        # Any failure yields a fallback result rather than propagating.
+        logger.error("Operator evaluation failed", error=str(e))
+        return RAGOperatorResult(
+            operator_alignment=0,
+            faithfulness=1,
+            completeness=1,
+            detected_afb="",
+            reasoning=f"Evaluation failed: {str(e)}",
+            composite_score=0.0,
+        )
+
+
+async def evaluate_hallucination(
+    call_ollama,
+    parse_json_response,
+    query: str,
+    response: str,
+    available_facts: List[str],
+) -> RAGHallucinationResult:
+    """Evaluate for hallucinations.
+
+    Fills RAG_HALLUCINATION_JUDGE_PROMPT with the arguments, awaits
+    ``call_ollama(prompt)``, parses the reply with
+    ``parse_json_response`` and builds a result from the parsed values.
+
+    Args:
+        call_ollama: Awaitable callable that receives the formatted prompt.
+        parse_json_response: Callable applied to the reply; its result is
+            read with ``.get``.
+        query, response: Inserted into the prompt as-is.
+        available_facts: Rendered as one "- fact" line per entry.
+
+    Returns:
+        RAGHallucinationResult. If anything inside the try block raises,
+        the error is logged and a failing result with
+        ``composite_score=0.0`` is returned instead.
+    """
+    prompt = RAG_HALLUCINATION_JUDGE_PROMPT.format(
+        query=query,
+        response=response,
+        available_facts="\n".join(f"- {f}" for f in available_facts),
+    )
+
+    try:
+        response_text = await call_ollama(prompt)
+        data = parse_json_response(response_text)
+
+        grounding_score = max(0, min(100, int(data.get("grounding_score", 0))))
+        # Anything other than the exact string "pass" counts as "fail".
+        invention_detection = "pass" if data.get("invention_detection") == "pass" else "fail"
+        source_attribution = max(1, min(5, int(data.get("source_attribution", 1))))
+        # NOTE(review): a non-sliceable value here (e.g. JSON null) makes
+        # the [:5] slice below raise, which discards the scores parsed
+        # above and returns the fallback result - confirm this is intended.
+        hallucinated_claims = data.get("hallucinated_claims", [])
+
+        # Grounding is rescaled to 0-5; pass/fail maps to 5.0/0.0.
+        # Fixed weights 0.4 / 0.4 / 0.2, rounded to three decimals.
+        grounding = (grounding_score / 100) * 5
+        invention = 5.0 if invention_detection == "pass" else 0.0
+        composite = round(grounding * 0.4 + invention * 0.4 + source_attribution * 0.2, 3)
+
+        return RAGHallucinationResult(
+            grounding_score=grounding_score,
+            invention_detection=invention_detection,
+            source_attribution=source_attribution,
+            # Only the first five entries are kept.
+            hallucinated_claims=hallucinated_claims[:5],
+            reasoning=str(data.get("reasoning", ""))[:500],
+            composite_score=composite,
+        )
+
+    except Exception as e:
+        # Any failure yields a fallback result rather than propagating.
+        logger.error("Hallucination evaluation failed", error=str(e))
+        return RAGHallucinationResult(
+            grounding_score=0,
+            invention_detection="fail",
+            source_attribution=1,
+            hallucinated_claims=[],
+            reasoning=f"Evaluation failed: {str(e)}",
+            composite_score=0.0,
+        )
+
+
+async def evaluate_privacy(
+    call_ollama,
+    parse_json_response,
+    query: str,
+    context: Dict[str, Any],
+    response: str,
+) -> RAGPrivacyResult:
+    """Evaluate privacy/DSGVO compliance.
+
+    Fills RAG_PRIVACY_JUDGE_PROMPT with the arguments, awaits
+    ``call_ollama(prompt)``, parses the reply with
+    ``parse_json_response`` and builds a result from the parsed values.
+
+    Args:
+        call_ollama: Awaitable callable that receives the formatted prompt.
+        parse_json_response: Callable applied to the reply; its result is
+            read with ``.get``.
+        query, response: Inserted into the prompt as-is.
+        context: Serialized with ``json.dumps`` (non-ASCII kept, indent 2)
+            before insertion into the prompt.
+
+    Returns:
+        RAGPrivacyResult. If anything inside the try block raises, the
+        error is logged and a failing result with ``composite_score=0.0``
+        is returned instead.
+    """
+    # NOTE(review): json.dumps runs outside the try block, so a context
+    # that is not JSON-serializable raises to the caller.
+    prompt = RAG_PRIVACY_JUDGE_PROMPT.format(
+        query=query,
+        context=json.dumps(context, ensure_ascii=False, indent=2),
+        response=response,
+    )
+
+    try:
+        response_text = await call_ollama(prompt)
+        data = parse_json_response(response_text)
+
+        # Anything other than the exact string "pass" counts as "fail".
+        privacy_compliance = "pass" if data.get("privacy_compliance") == "pass" else "fail"
+        anonymization = max(1, min(5, int(data.get("anonymization", 1))))
+        dsgvo_compliance = "pass" if data.get("dsgvo_compliance") == "pass" else "fail"
+        # NOTE(review): a non-sliceable value here (e.g. JSON null) makes
+        # the [:5] slice below raise and returns the fallback result -
+        # confirm this is intended.
+        detected_pii = data.get("detected_pii", [])
+
+        # pass/fail maps to 5.0/0.0; fixed weights 0.4 / 0.2 / 0.4,
+        # rounded to three decimals.
+        privacy = 5.0 if privacy_compliance == "pass" else 0.0
+        dsgvo = 5.0 if dsgvo_compliance == "pass" else 0.0
+        composite = round(privacy * 0.4 + anonymization * 0.2 + dsgvo * 0.4, 3)
+
+        return RAGPrivacyResult(
+            privacy_compliance=privacy_compliance,
+            anonymization=anonymization,
+            dsgvo_compliance=dsgvo_compliance,
+            # Only the first five entries are kept.
+            detected_pii=detected_pii[:5],
+            reasoning=str(data.get("reasoning", ""))[:500],
+            composite_score=composite,
+        )
+
+    except Exception as e:
+        # Any failure yields a fallback result rather than propagating.
+        logger.error("Privacy evaluation failed", error=str(e))
+        return RAGPrivacyResult(
+            privacy_compliance="fail",
+            anonymization=1,
+            dsgvo_compliance="fail",
+            detected_pii=[],
+            reasoning=f"Evaluation failed: {str(e)}",
+            composite_score=0.0,
+        )
+
+
+async def evaluate_namespace(
+    call_ollama,
+    parse_json_response,
+    teacher_id: str,
+    namespace: str,
+    school_id: str,
+    requested_data: str,
+    response: str,
+) -> RAGNamespaceResult:
+    """Evaluate namespace isolation.
+
+    Fills RAG_NAMESPACE_JUDGE_PROMPT with the arguments, awaits
+    ``call_ollama(prompt)``, parses the reply with
+    ``parse_json_response`` and builds a result from the parsed values.
+
+    Args:
+        call_ollama: Awaitable callable that receives the formatted prompt.
+        parse_json_response: Callable applied to the reply; its result is
+            read with ``.get``.
+        teacher_id, namespace, school_id, requested_data, response:
+            Inserted into the prompt as-is.
+
+    Returns:
+        RAGNamespaceResult. If anything inside the try block raises, the
+        error is logged and a failing result with ``composite_score=0.0``
+        is returned instead.
+    """
+    prompt = RAG_NAMESPACE_JUDGE_PROMPT.format(
+        teacher_id=teacher_id,
+        namespace=namespace,
+        school_id=school_id,
+        requested_data=requested_data,
+        response=response,
+    )
+
+    try:
+        response_text = await call_ollama(prompt)
+        data = parse_json_response(response_text)
+
+        # Anything other than the exact string "pass" counts as "fail".
+        namespace_compliance = "pass" if data.get("namespace_compliance") == "pass" else "fail"
+        cross_tenant_leak = "pass" if data.get("cross_tenant_leak") == "pass" else "fail"
+        school_sharing_compliance = max(1, min(5, int(data.get("school_sharing_compliance", 1))))
+        # NOTE(review): a non-sliceable value here (e.g. JSON null) makes
+        # the [:5] slice below raise and returns the fallback result -
+        # confirm this is intended.
+        detected_leaks = data.get("detected_leaks", [])
+
+        # pass/fail maps to 5.0/0.0; fixed weights 0.4 / 0.4 / 0.2,
+        # rounded to three decimals.
+        ns_compliance = 5.0 if namespace_compliance == "pass" else 0.0
+        cross_tenant = 5.0 if cross_tenant_leak == "pass" else 0.0
+        composite = round(
+            ns_compliance * 0.4 + cross_tenant * 0.4 + school_sharing_compliance * 0.2, 3
+        )
+
+        return RAGNamespaceResult(
+            namespace_compliance=namespace_compliance,
+            cross_tenant_leak=cross_tenant_leak,
+            school_sharing_compliance=school_sharing_compliance,
+            # Only the first five entries are kept.
+            detected_leaks=detected_leaks[:5],
+            reasoning=str(data.get("reasoning", ""))[:500],
+            composite_score=composite,
+        )
+
+    except Exception as e:
+        # Any failure yields a fallback result rather than propagating.
+        logger.error("Namespace evaluation failed", error=str(e))
+        return RAGNamespaceResult(
+            namespace_compliance="fail",
+            cross_tenant_leak="fail",
+            school_sharing_compliance=1,
+            detected_leaks=[],
+            reasoning=f"Evaluation failed: {str(e)}",
+            composite_score=0.0,
+        )
+
+
+async def evaluate_rag_test_case(
+    judge_instance,
+    test_case: Dict[str, Any],
+    service_response: Dict[str, Any],
+) -> TestResult:
+    """
+    Evaluate a full RAG test case from the golden suite.
+
+    Dispatches on ``test_case["category"]`` to the matching ``evaluate_*``
+    coroutine of ``judge_instance`` and converts its result into a
+    TestResult.
+
+    Args:
+        judge_instance: Object providing the awaitable methods
+            ``evaluate_retrieval``, ``evaluate_operator``,
+            ``evaluate_hallucination``, ``evaluate_privacy`` and
+            ``evaluate_namespace``.
+        test_case: Test definition; reads ``id``, ``name``, ``category``,
+            ``min_score`` (default 3.5), ``input`` and ``expected``.
+        service_response: Response of the service under test; reads
+            ``passage``, ``definition`` or ``response`` depending on the
+            category.
+
+    Returns:
+        TestResult whose ``passed`` flag is ``composite_score >= min_score``.
+        An unknown category yields a composite score of 0.0 and a reasoning
+        text naming the category.
+    """
+    # Monotonic clock: the measured duration cannot be skewed by wall-clock
+    # adjustments while the judge call is in flight.
+    start_time = time.perf_counter()
+
+    test_id = test_case.get("id", "UNKNOWN")
+    test_name = test_case.get("name", "")
+    category = test_case.get("category", "")
+    min_score = test_case.get("min_score", 3.5)
+
+    composite_score = 0.0
+    reasoning = ""
+    result = None
+
+    known_categories = (
+        "eh_retrieval",
+        "operator_alignment",
+        "hallucination_control",
+        "privacy_compliance",
+        "namespace_isolation",
+    )
+
+    if category in known_categories:
+        # Keys that are present but empty in the YAML load as None; treat
+        # them like missing keys instead of failing on ``None.get``.
+        input_data = test_case.get("input") or {}
+        context = input_data.get("context") or {}
+        expected = test_case.get("expected") or {}
+
+        if category == "eh_retrieval":
+            result = await judge_instance.evaluate_retrieval(
+                query=input_data.get("query", ""),
+                aufgabentyp=context.get("aufgabentyp", ""),
+                subject=context.get("subject", "Deutsch"),
+                level=context.get("level", "Abitur"),
+                retrieved_passage=service_response.get("passage", ""),
+                expected_concepts=expected.get("must_contain_concepts", []),
+            )
+        elif category == "operator_alignment":
+            result = await judge_instance.evaluate_operator(
+                operator=input_data.get("operator", ""),
+                generated_definition=service_response.get("definition", ""),
+                expected_afb=expected.get("afb_level", ""),
+                expected_actions=expected.get("expected_actions", []),
+            )
+        elif category == "hallucination_control":
+            result = await judge_instance.evaluate_hallucination(
+                query=input_data.get("query", ""),
+                response=service_response.get("response", ""),
+                available_facts=context.get("available_facts", []),
+            )
+        elif category == "privacy_compliance":
+            result = await judge_instance.evaluate_privacy(
+                query=input_data.get("query", ""),
+                context=context,
+                response=service_response.get("response", ""),
+            )
+        else:  # namespace_isolation
+            result = await judge_instance.evaluate_namespace(
+                teacher_id=context.get("teacher_id", ""),
+                namespace=context.get("namespace", ""),
+                school_id=context.get("school_id", ""),
+                requested_data=input_data.get("query", ""),
+                response=service_response.get("response", ""),
+            )
+    else:
+        reasoning = f"Unknown category: {category}"
+
+    if result is not None:
+        composite_score = result.composite_score
+        reasoning = result.reasoning
+
+    duration_ms = int((time.perf_counter() - start_time) * 1000)
+    passed = composite_score >= min_score
+
+    # The RAG judges produce a single composite score (compared against
+    # min_score on what the evaluators use as a 0-5 scale); it is projected
+    # onto the per-dimension fields of the generic TestResult.
+    return TestResult(
+        test_id=test_id,
+        test_name=test_name,
+        user_input=str(test_case.get("input", {})),
+        expected_intent=category,
+        detected_intent=category,
+        response=str(service_response),
+        intent_accuracy=int(composite_score / 5 * 100),
+        faithfulness=int(composite_score),
+        relevance=int(composite_score),
+        coherence=int(composite_score),
+        safety="pass" if passed else "fail",
+        composite_score=composite_score,
+        passed=passed,
+        reasoning=reasoning,
+        timestamp=datetime.utcnow(),
+        duration_ms=duration_ms,
+    )
@@ -0,0 +1,60 @@
+"""
+RAG Judge Types - Data classes for RAG evaluation results
+"""
+from dataclasses import dataclass
+from typing import Literal, List
+
+
+@dataclass
+class RAGRetrievalResult:
+    """
+    Result from RAG retrieval evaluation.
+
+    Plain value container: the ranges noted on the fields are conventions
+    and are not validated by the dataclass itself.
+    """
+    retrieval_precision: int  # 0-100
+    faithfulness: int  # 1-5
+    relevance: int  # 1-5
+    citation_accuracy: int  # 1-5
+    reasoning: str  # explanation accompanying the scores
+    composite_score: float  # aggregate of the scores above; weighting is set by the evaluator
+
+
+@dataclass
+class RAGOperatorResult:
+    """
+    Result from operator alignment evaluation.
+
+    Plain value container: the ranges noted on the fields are conventions
+    and are not validated by the dataclass itself.
+    """
+    operator_alignment: int  # 0-100
+    faithfulness: int  # 1-5
+    completeness: int  # 1-5
+    detected_afb: str  # I, II, III
+    reasoning: str  # explanation accompanying the scores
+    composite_score: float  # aggregate of the scores above; weighting is set by the evaluator
+
+
+@dataclass
+class RAGHallucinationResult:
+    """
+    Result from hallucination control evaluation.
+
+    Plain value container: the ranges noted on the fields are conventions
+    and are not validated by the dataclass itself.
+    """
+    grounding_score: int  # 0-100
+    invention_detection: Literal["pass", "fail"]
+    source_attribution: int  # 1-5
+    hallucinated_claims: List[str]  # claims flagged as hallucinated
+    reasoning: str  # explanation accompanying the scores
+    composite_score: float  # aggregate of the scores above; weighting is set by the evaluator
+
+
+@dataclass
+class RAGPrivacyResult:
+    """
+    Result from privacy compliance evaluation.
+
+    Built by evaluate_privacy, which also returns an all-"fail" instance
+    with composite_score 0.0 when the judge call or parsing raises.
+    """
+    privacy_compliance: Literal["pass", "fail"]  # "pass" only on an explicit judge "pass"
+    anonymization: int  # 1-5
+    dsgvo_compliance: Literal["pass", "fail"]  # "pass" only on an explicit judge "pass"
+    detected_pii: List[str]  # evaluate_privacy keeps at most the first 5 entries
+    reasoning: str  # evaluate_privacy truncates this to 500 characters
+    # evaluate_privacy computes: privacy * 0.4 + anonymization * 0.2 + dsgvo * 0.4,
+    # where a "pass" verdict counts as 5.0 and a "fail" verdict as 0.0.
+    composite_score: float
+
+
+@dataclass
+class RAGNamespaceResult:
+    """
+    Result from namespace isolation evaluation.
+
+    Built by evaluate_namespace, which also returns an all-"fail" instance
+    with composite_score 0.0 when the judge call or parsing raises.
+    """
+    namespace_compliance: Literal["pass", "fail"]  # "pass" only on an explicit judge "pass"
+    cross_tenant_leak: Literal["pass", "fail"]  # "pass" only on an explicit judge "pass"
+    school_sharing_compliance: int  # 1-5
+    detected_leaks: List[str]  # evaluate_namespace keeps at most the first 5 entries
+    reasoning: str  # evaluate_namespace truncates this to 500 characters
+    # evaluate_namespace computes:
+    # namespace * 0.4 + cross_tenant * 0.4 + school_sharing_compliance * 0.2,
+    # where a "pass" verdict counts as 5.0 and a "fail" verdict as 0.0.
+    composite_score: float
@@ -1,11 +1,12 @@
 """
 BQAS Test Runner - Executes Golden, RAG, and Synthetic test suites
+
+Split into:
+- runner_golden.py: Test loading, simulation helpers, error result creation
+- runner.py (this file): BQASRunner class, singleton
 """
-import yaml
-import asyncio
 import structlog
 import httpx
-from pathlib import Path
 from typing import List, Dict, Any, Optional
 from datetime import datetime
 from dataclasses import dataclass, field
@@ -15,6 +16,13 @@ from bqas.judge import LLMJudge
 from bqas.rag_judge import RAGJudge
 from bqas.metrics import TestResult, BQASMetrics
 from bqas.synthetic_generator import SyntheticGenerator
+from bqas.runner_golden import (
+    load_golden_tests,
+    load_rag_tests,
+    simulate_response,
+    create_error_result,
+    simulate_rag_response,
+)

 logger = structlog.get_logger(__name__)

@@ -61,87 +69,42 @@ class BQASRunner:
    # ================================

    async def run_golden_suite(self, git_commit: Optional[str] = None) -> TestRun:
-        """
-        Run the golden test suite.
-
-        Loads test cases from YAML files and evaluates each one.
-        """
+        """Run the golden test suite."""
        logger.info("Starting Golden Suite run")
        start_time = datetime.utcnow()

-        # Load all golden test cases
-        test_cases = await self._load_golden_tests()
+        test_cases = await load_golden_tests()
        logger.info(f"Loaded {len(test_cases)} golden test cases")

-        # Run all tests
        results = []
        for i, test_case in enumerate(test_cases):
            try:
                result = await self._run_golden_test(test_case)
                results.append(result)
-
                if (i + 1) % 10 == 0:
                    logger.info(f"Progress: {i + 1}/{len(test_cases)} tests completed")
-
            except Exception as e:
                logger.error(f"Test {test_case.get('id')} failed with error", error=str(e))
-                # Create a failed result
-                results.append(self._create_error_result(test_case, str(e)))
+                results.append(create_error_result(test_case, str(e)))

-        # Calculate metrics
        metrics = BQASMetrics.from_results(results)
        duration = (datetime.utcnow() - start_time).total_seconds()

-        # Record run
        self._run_counter += 1
        run = TestRun(
-            id=self._run_counter,
-            suite="golden",
-            timestamp=start_time,
-            git_commit=git_commit,
-            metrics=metrics,
-            results=results,
+            id=self._run_counter, suite="golden", timestamp=start_time,
+            git_commit=git_commit, metrics=metrics, results=results,
            duration_seconds=duration,
        )
        self._test_runs.insert(0, run)

        logger.info(
-            "Golden Suite completed",
-            total=metrics.total_tests,
-            passed=metrics.passed_tests,
-            failed=metrics.failed_tests,
-            score=metrics.avg_composite_score,
-            duration=f"{duration:.1f}s",
+            "Golden Suite completed", total=metrics.total_tests,
+            passed=metrics.passed_tests, failed=metrics.failed_tests,
+            score=metrics.avg_composite_score, duration=f"{duration:.1f}s",
        )
-
        return run

-    async def _load_golden_tests(self) -> List[Dict[str, Any]]:
-        """Load all golden test cases from YAML files."""
-        tests = []
-        golden_dir = Path(__file__).parent.parent / "tests" / "bqas" / "golden_tests"
-
-        yaml_files = [
-            "intent_tests.yaml",
-            "edge_cases.yaml",
-            "workflow_tests.yaml",
-        ]
-
-        for filename in yaml_files:
-            filepath = golden_dir / filename
-            if filepath.exists():
-                try:
-                    with open(filepath, 'r', encoding='utf-8') as f:
-                        data = yaml.safe_load(f)
-                        if data and 'tests' in data:
-                            for test in data['tests']:
-                                test['source_file'] = filename
-                            tests.extend(data['tests'])
-                except Exception as e:
-                    logger.warning(f"Failed to load {filename}", error=str(e))
-
-        return tests
-
    async def _run_golden_test(self, test_case: Dict[str, Any]) -> TestResult:
        """Run a single golden test case."""
        test_id = test_case.get('id', 'UNKNOWN')
@@ -150,38 +113,19 @@ class BQASRunner:
        expected_intent = test_case.get('expected_intent', '')
        min_score = test_case.get('min_score', self.config.min_golden_score)

-        # Get response from voice service (or simulate)
        detected_intent, response = await self._get_voice_response(user_input, expected_intent)

-        # Evaluate with judge
        result = await self.judge.evaluate_test_case(
-            test_id=test_id,
-            test_name=test_name,
-            user_input=user_input,
-            expected_intent=expected_intent,
-            detected_intent=detected_intent,
-            response=response,
-            min_score=min_score,
+            test_id=test_id, test_name=test_name, user_input=user_input,
+            expected_intent=expected_intent, detected_intent=detected_intent,
+            response=response, min_score=min_score,
        )
-
        return result

-    async def _get_voice_response(
-        self,
-        user_input: str,
-        expected_intent: str
-    ) -> tuple[str, str]:
-        """
-        Get response from voice service.
-
-        For now, simulates responses since the full voice pipeline
-        might not be available. In production, this would call the
-        actual voice service endpoints.
-        """
+    async def _get_voice_response(self, user_input: str, expected_intent: str) -> tuple:
+        """Get response from voice service."""
        try:
            client = await self._get_client()
-
-            # Try to call the voice service intent detection
            response = await client.post(
                f"{self.config.voice_service_url}/api/v1/tasks",
                json={
@@ -191,231 +135,71 @@ class BQASRunner:
                },
                timeout=10.0,
            )
-
            if response.status_code == 200:
                data = response.json()
                return data.get('detected_intent', expected_intent), data.get('response', f"Verarbeite: {user_input}")
-
        except Exception as e:
            logger.debug(f"Voice service call failed, using simulation", error=str(e))

-        # Simulate response based on expected intent
-        return self._simulate_response(user_input, expected_intent)
-
-    def _simulate_response(self, user_input: str, expected_intent: str) -> tuple[str, str]:
-        """Simulate voice service response for testing without live service."""
-        # Simulate realistic detected intent (90% correct for golden tests)
-        import random
-        if random.random() < 0.90:
-            detected_intent = expected_intent
-        else:
-            # Simulate occasional misclassification
-            intents = ["student_observation", "reminder", "worksheet_generate", "parent_letter", "smalltalk"]
-            detected_intent = random.choice([i for i in intents if i != expected_intent])
-
-        # Generate simulated response
-        responses = {
-            "student_observation": f"Notiz wurde gespeichert: {user_input}",
-            "reminder": f"Erinnerung erstellt: {user_input}",
-            "worksheet_generate": f"Arbeitsblatt wird generiert basierend auf: {user_input}",
-            "homework_check": f"Hausaufgabenkontrolle eingetragen: {user_input}",
-            "parent_letter": f"Elternbrief-Entwurf erstellt: {user_input}",
-            "class_message": f"Nachricht an Klasse vorbereitet: {user_input}",
-            "quiz_generate": f"Quiz wird erstellt: {user_input}",
-            "quick_activity": f"Einstiegsaktivitaet geplant: {user_input}",
-            "canvas_edit": f"Aenderung am Canvas wird ausgefuehrt: {user_input}",
-            "canvas_layout": f"Layout wird angepasst: {user_input}",
-            "operator_checklist": f"Operatoren-Checkliste geladen: {user_input}",
-            "eh_passage": f"EH-Passage gefunden: {user_input}",
-            "feedback_suggest": f"Feedback-Vorschlag: {user_input}",
-            "reminder_schedule": f"Erinnerung geplant: {user_input}",
-            "task_summary": f"Aufgabenuebersicht: {user_input}",
-            "conference_topic": f"Konferenzthema notiert: {user_input}",
-            "correction_note": f"Korrekturnotiz gespeichert: {user_input}",
-            "worksheet_differentiate": f"Differenzierung wird erstellt: {user_input}",
-        }
-
-        response = responses.get(detected_intent, f"Verstanden: {user_input}")
-        return detected_intent, response
-
-    def _create_error_result(self, test_case: Dict[str, Any], error: str) -> TestResult:
-        """Create a failed test result due to error."""
-        return TestResult(
-            test_id=test_case.get('id', 'UNKNOWN'),
-            test_name=test_case.get('name', 'Error'),
-            user_input=test_case.get('input', ''),
-            expected_intent=test_case.get('expected_intent', ''),
-            detected_intent='error',
-            response='',
-            intent_accuracy=0,
-            faithfulness=1,
-            relevance=1,
-            coherence=1,
-            safety='fail',
-            composite_score=0.0,
-            passed=False,
-            reasoning=f"Test execution error: {error}",
-            timestamp=datetime.utcnow(),
-            duration_ms=0,
-        )
+        return simulate_response(user_input, expected_intent)

    # ================================
    # RAG Suite Runner
    # ================================

    async def run_rag_suite(self, git_commit: Optional[str] = None) -> TestRun:
-        """
-        Run the RAG/Correction test suite.
-
-        Tests EH retrieval, operator alignment, hallucination control, etc.
-        """
+        """Run the RAG/Correction test suite."""
        logger.info("Starting RAG Suite run")
        start_time = datetime.utcnow()

-        # Load RAG test cases
-        test_cases = await self._load_rag_tests()
+        test_cases = await load_rag_tests()
        logger.info(f"Loaded {len(test_cases)} RAG test cases")

-        # Run all tests
        results = []
        for i, test_case in enumerate(test_cases):
            try:
-                result = await self._run_rag_test(test_case)
+                service_response = await simulate_rag_response(test_case)
+                result = await self.rag_judge.evaluate_rag_test_case(
+                    test_case=test_case, service_response=service_response,
+                )
                results.append(result)
-
                if (i + 1) % 5 == 0:
                    logger.info(f"Progress: {i + 1}/{len(test_cases)} RAG tests completed")
-
            except Exception as e:
                logger.error(f"RAG test {test_case.get('id')} failed", error=str(e))
-                results.append(self._create_error_result(test_case, str(e)))
+                results.append(create_error_result(test_case, str(e)))

-        # Calculate metrics
        metrics = BQASMetrics.from_results(results)
        duration = (datetime.utcnow() - start_time).total_seconds()

-        # Record run
        self._run_counter += 1
        run = TestRun(
-            id=self._run_counter,
-            suite="rag",
-            timestamp=start_time,
-            git_commit=git_commit,
-            metrics=metrics,
-            results=results,
+            id=self._run_counter, suite="rag", timestamp=start_time,
+            git_commit=git_commit, metrics=metrics, results=results,
            duration_seconds=duration,
        )
        self._test_runs.insert(0, run)

        logger.info(
-            "RAG Suite completed",
-            total=metrics.total_tests,
-            passed=metrics.passed_tests,
-            score=metrics.avg_composite_score,
+            "RAG Suite completed", total=metrics.total_tests,
+            passed=metrics.passed_tests, score=metrics.avg_composite_score,
            duration=f"{duration:.1f}s",
        )
-
        return run

-    async def _load_rag_tests(self) -> List[Dict[str, Any]]:
-        """Load RAG test cases from YAML."""
-        tests = []
-        rag_file = Path(__file__).parent.parent / "tests" / "bqas" / "golden_tests" / "golden_rag_correction_v1.yaml"
-
-        if rag_file.exists():
-            try:
-                with open(rag_file, 'r', encoding='utf-8') as f:
-                    # Handle YAML documents separated by ---
-                    documents = list(yaml.safe_load_all(f))
-                    for doc in documents:
-                        if doc and 'tests' in doc:
-                            tests.extend(doc['tests'])
-                        if doc and 'edge_cases' in doc:
-                            tests.extend(doc['edge_cases'])
-            except Exception as e:
-                logger.warning(f"Failed to load RAG tests", error=str(e))
-
-        return tests
-
-    async def _run_rag_test(self, test_case: Dict[str, Any]) -> TestResult:
-        """Run a single RAG test case."""
-        # Simulate service response for RAG tests
-        service_response = await self._simulate_rag_response(test_case)
-
-        # Evaluate with RAG judge
-        result = await self.rag_judge.evaluate_rag_test_case(
-            test_case=test_case,
-            service_response=service_response,
-        )
-
-        return result
-
-    async def _simulate_rag_response(self, test_case: Dict[str, Any]) -> Dict[str, Any]:
-        """Simulate RAG service response."""
-        category = test_case.get('category', '')
-        input_data = test_case.get('input', {})
-        expected = test_case.get('expected', {})
-
-        # Simulate responses based on category
-        if category == 'eh_retrieval':
-            concepts = expected.get('must_contain_concepts', [])
-            passage = f"Der Erwartungshorizont sieht folgende Aspekte vor: {', '.join(concepts[:3])}. "
-            passage += "Diese muessen im Rahmen der Aufgabenbearbeitung beruecksichtigt werden."
-            return {
-                "passage": passage,
-                "source": "EH_Deutsch_Abitur_2024_NI.pdf",
-                "relevance_score": 0.85,
-            }
-
-        elif category == 'operator_alignment':
-            operator = input_data.get('operator', '')
-            afb = expected.get('afb_level', 'II')
-            actions = expected.get('expected_actions', [])
-            return {
-                "operator": operator,
-                "definition": f"'{operator}' gehoert zu Anforderungsbereich {afb}. Erwartete Handlungen: {', '.join(actions[:2])}.",
-                "afb_level": afb,
-            }
-
-        elif category == 'hallucination_control':
-            return {
-                "response": "Basierend auf den verfuegbaren Informationen kann ich folgendes feststellen...",
-                "grounded": True,
-            }
-
-        elif category == 'privacy_compliance':
-            return {
-                "response": "Die Arbeit zeigt folgende Merkmale... [anonymisiert]",
-                "contains_pii": False,
-            }
-
-        elif category == 'namespace_isolation':
-            return {
-                "response": "Zugriff nur auf Daten im eigenen Namespace.",
-                "namespace_violation": False,
-            }
-
-        return {"response": "Simulated response", "success": True}
-
    # ================================
    # Synthetic Suite Runner
    # ================================

    async def run_synthetic_suite(self, git_commit: Optional[str] = None) -> TestRun:
-        """
-        Run the synthetic test suite.
-
-        Generates test variations using LLM and evaluates them.
-        """
+        """Run the synthetic test suite."""
        logger.info("Starting Synthetic Suite run")
        start_time = datetime.utcnow()

-        # Generate synthetic tests
        all_variations = await self.synthetic_generator.generate_all_intents(
            count_per_intent=self.config.synthetic_count_per_intent
        )

-        # Flatten variations
        test_cases = []
        for intent, variations in all_variations.items():
            for i, v in enumerate(variations):
@@ -431,45 +215,33 @@ class BQASRunner:

        logger.info(f"Generated {len(test_cases)} synthetic test cases")

-        # Run all tests
        results = []
        for i, test_case in enumerate(test_cases):
            try:
-                result = await self._run_golden_test(test_case)  # Same logic as golden
+                result = await self._run_golden_test(test_case)
                results.append(result)
-
                if (i + 1) % 20 == 0:
                    logger.info(f"Progress: {i + 1}/{len(test_cases)} synthetic tests completed")
-
            except Exception as e:
                logger.error(f"Synthetic test {test_case.get('id')} failed", error=str(e))
-                results.append(self._create_error_result(test_case, str(e)))
+                results.append(create_error_result(test_case, str(e)))

-        # Calculate metrics
        metrics = BQASMetrics.from_results(results)
        duration = (datetime.utcnow() - start_time).total_seconds()

-        # Record run
        self._run_counter += 1
        run = TestRun(
-            id=self._run_counter,
-            suite="synthetic",
-            timestamp=start_time,
-            git_commit=git_commit,
-            metrics=metrics,
-            results=results,
+            id=self._run_counter, suite="synthetic", timestamp=start_time,
+            git_commit=git_commit, metrics=metrics, results=results,
            duration_seconds=duration,
        )
        self._test_runs.insert(0, run)

        logger.info(
-            "Synthetic Suite completed",
-            total=metrics.total_tests,
-            passed=metrics.passed_tests,
-            score=metrics.avg_composite_score,
+            "Synthetic Suite completed", total=metrics.total_tests,
+            passed=metrics.passed_tests, score=metrics.avg_composite_score,
            duration=f"{duration:.1f}s",
        )
-
        return run

    # ================================
@@ -483,20 +255,17 @@ class BQASRunner:
    def get_latest_metrics(self) -> Dict[str, Optional[BQASMetrics]]:
        """Get latest metrics for each suite."""
        result = {"golden": None, "rag": None, "synthetic": None}
-
        for run in self._test_runs:
            if result[run.suite] is None:
                result[run.suite] = run.metrics
            if all(v is not None for v in result.values()):
                break
-
        return result

    async def health_check(self) -> Dict[str, Any]:
        """Check health of BQAS components."""
        judge_ok = await self.judge.health_check()
        rag_judge_ok = await self.rag_judge.health_check()
-
        return {
            "judge_available": judge_ok,
            "rag_judge_available": rag_judge_ok,
@@ -0,0 +1,162 @@
+"""
+BQAS Golden Suite Runner helpers - load golden/RAG test cases and simulate service responses
+"""
+import yaml
+import structlog
+from pathlib import Path
+from typing import List, Dict, Any, Optional
+from datetime import datetime
+
+from bqas.metrics import TestResult
+
+logger = structlog.get_logger(__name__)
+
+
+async def load_golden_tests() -> List[Dict[str, Any]]:
+    """Collect the golden test cases from the suite's YAML files.
+
+    Every case is tagged with its origin under ``source_file``. Files that
+    do not exist are skipped silently; a file that raises while being read
+    or processed is skipped with a warning.
+    """
+    base_dir = Path(__file__).parent.parent / "tests" / "bqas" / "golden_tests"
+    collected = []
+
+    for filename in ("intent_tests.yaml", "edge_cases.yaml", "workflow_tests.yaml"):
+        candidate = base_dir / filename
+        if not candidate.exists():
+            continue
+        try:
+            with open(candidate, 'r', encoding='utf-8') as handle:
+                parsed = yaml.safe_load(handle)
+            # Empty documents and documents without a "tests" key add nothing.
+            if not parsed or 'tests' not in parsed:
+                continue
+            cases = parsed['tests']
+            for case in cases:
+                case['source_file'] = filename
+            collected.extend(cases)
+        except Exception as e:
+            logger.warning(f"Failed to load {filename}", error=str(e))
+
+    return collected
+
+
+async def load_rag_tests() -> List[Dict[str, Any]]:
+    """Load RAG test cases from the multi-document golden YAML file.
+
+    The entries under ``tests`` and ``edge_cases`` of every YAML document are
+    merged into one list, document by document. A missing file yields an
+    empty list; an exception while reading or processing the file is logged
+    as a warning instead of being raised.
+    """
+    tests = []
+    rag_file = Path(__file__).parent.parent / "tests" / "bqas" / "golden_tests" / "golden_rag_correction_v1.yaml"
+
+    if rag_file.exists():
+        try:
+            with open(rag_file, 'r', encoding='utf-8') as f:
+                # Parse every document before touching ``tests`` so that a
+                # syntax error in a later document is raised before anything
+                # has been appended.
+                documents = list(yaml.safe_load_all(f))
+            for doc in documents:
+                if not doc:
+                    continue
+                for section in ('tests', 'edge_cases'):
+                    if section in doc:
+                        # A key that is present but has no entries parses as
+                        # None; treat it as an empty section rather than
+                        # aborting the remaining documents.
+                        tests.extend(doc[section] or [])
+        except Exception as e:
+            logger.warning("Failed to load RAG tests", error=str(e))
+
+    return tests
+
+
+def simulate_response(user_input: str, expected_intent: str, *,
+                      accuracy: float = 0.90, rng: Optional[Any] = None) -> tuple:
+    """Simulate the voice service's answer when no live service is available.
+
+    Args:
+        user_input: Utterance that is echoed back inside the canned response.
+        expected_intent: Intent the test case expects to be detected.
+        accuracy: Probability with which ``expected_intent`` is "detected".
+            Otherwise a different intent is drawn from a small fixed pool.
+        rng: Object providing ``random()`` and ``choice()``, e.g. a seeded
+            ``random.Random``, for reproducible runs. Defaults to the
+            ``random`` module itself.
+
+    Returns:
+        A ``(detected_intent, response)`` tuple. Intents without a canned
+        response get the generic ``"Verstanden: ..."`` reply.
+    """
+    import random
+    source = random if rng is None else rng
+
+    if source.random() < accuracy:
+        detected_intent = expected_intent
+    else:
+        # Simulated misclassification: pick any pool intent except the expected one.
+        intents = ["student_observation", "reminder", "worksheet_generate", "parent_letter", "smalltalk"]
+        detected_intent = source.choice([i for i in intents if i != expected_intent])
+
+    responses = {
+        "student_observation": f"Notiz wurde gespeichert: {user_input}",
+        "reminder": f"Erinnerung erstellt: {user_input}",
+        "worksheet_generate": f"Arbeitsblatt wird generiert basierend auf: {user_input}",
+        "homework_check": f"Hausaufgabenkontrolle eingetragen: {user_input}",
+        "parent_letter": f"Elternbrief-Entwurf erstellt: {user_input}",
+        "class_message": f"Nachricht an Klasse vorbereitet: {user_input}",
+        "quiz_generate": f"Quiz wird erstellt: {user_input}",
+        "quick_activity": f"Einstiegsaktivitaet geplant: {user_input}",
+        "canvas_edit": f"Aenderung am Canvas wird ausgefuehrt: {user_input}",
+        "canvas_layout": f"Layout wird angepasst: {user_input}",
+        "operator_checklist": f"Operatoren-Checkliste geladen: {user_input}",
+        "eh_passage": f"EH-Passage gefunden: {user_input}",
+        "feedback_suggest": f"Feedback-Vorschlag: {user_input}",
+        "reminder_schedule": f"Erinnerung geplant: {user_input}",
+        "task_summary": f"Aufgabenuebersicht: {user_input}",
+        "conference_topic": f"Konferenzthema notiert: {user_input}",
+        "correction_note": f"Korrekturnotiz gespeichert: {user_input}",
+        "worksheet_differentiate": f"Differenzierung wird erstellt: {user_input}",
+    }
+
+    response = responses.get(detected_intent, f"Verstanden: {user_input}")
+    return detected_intent, response
+
+
+def create_error_result(test_case: Dict[str, Any], error: str) -> TestResult:
+    """Build the failing ``TestResult`` recorded when running a test case raised.
+
+    The identifying fields are copied from ``test_case`` (with placeholder
+    defaults when a key is absent), the outcome fields are fixed failure
+    values, and ``error`` is embedded in ``reasoning``.
+    """
+    lookup = test_case.get
+    case_id = lookup('id', 'UNKNOWN')
+    case_name = lookup('name', 'Error')
+    utterance = lookup('input', '')
+    wanted_intent = lookup('expected_intent', '')
+    explanation = f"Test execution error: {error}"
+
+    # NOTE(review): the quality scores are 1 while intent_accuracy is 0 -
+    # presumably the minimum of each scale; confirm against TestResult.
+    return TestResult(
+        test_id=case_id, test_name=case_name,
+        user_input=utterance, expected_intent=wanted_intent,
+        detected_intent='error', response='',
+        intent_accuracy=0, faithfulness=1, relevance=1, coherence=1,
+        safety='fail', composite_score=0.0, passed=False,
+        reasoning=explanation,
+        timestamp=datetime.utcnow(),  # naive UTC timestamp
+        duration_ms=0,
+    )
+
+
+async def simulate_rag_response(test_case: Dict[str, Any]) -> Dict[str, Any]:
+    """Return a canned RAG service response for ``test_case``.
+
+    The payload shape depends on ``test_case['category']``; an unknown or
+    missing category gets a generic success payload. ``input`` and
+    ``expected`` may be absent or ``None`` (e.g. an empty YAML key) and are
+    then treated as empty mappings; the same holds for the concept and
+    action lists read from ``expected``.
+
+    Nothing is awaited in the body; the function is a coroutine only so that
+    callers can ``await`` it.
+    """
+    category = test_case.get('category', '')
+    # ``.get(key, {})`` only covers a missing key; ``or {}`` also covers an
+    # explicit None value.
+    input_data = test_case.get('input') or {}
+    expected = test_case.get('expected') or {}
+
+    if category == 'eh_retrieval':
+        concepts = expected.get('must_contain_concepts') or []
+        # Only the first three concepts are quoted in the simulated passage.
+        passage = f"Der Erwartungshorizont sieht folgende Aspekte vor: {', '.join(concepts[:3])}. "
+        passage += "Diese muessen im Rahmen der Aufgabenbearbeitung beruecksichtigt werden."
+        return {
+            "passage": passage,
+            "source": "EH_Deutsch_Abitur_2024_NI.pdf",
+            "relevance_score": 0.85,
+        }
+
+    if category == 'operator_alignment':
+        operator = input_data.get('operator', '')
+        afb = expected.get('afb_level', 'II')
+        actions = expected.get('expected_actions') or []
+        return {
+            "operator": operator,
+            "definition": f"'{operator}' gehoert zu Anforderungsbereich {afb}. Erwartete Handlungen: {', '.join(actions[:2])}.",
+            "afb_level": afb,
+        }
+
+    if category == 'hallucination_control':
+        return {
+            "response": "Basierend auf den verfuegbaren Informationen kann ich folgendes feststellen...",
+            "grounded": True,
+        }
+
+    if category == 'privacy_compliance':
+        return {
+            "response": "Die Arbeit zeigt folgende Merkmale... [anonymisiert]",
+            "contains_pii": False,
+        }
+
+    if category == 'namespace_isolation':
+        return {
+            "response": "Zugriff nur auf Daten im eigenen Namespace.",
+            "namespace_violation": False,
+        }
+
+    return {"response": "Simulated response", "success": True}