[split-required] Split remaining 500-680 LOC files (final batch)

website (17 pages + 3 components): - multiplayer/wizard, middleware/wizard+test-wizard, communication - builds/wizard, staff-search, voice, sbom/wizard - foerderantrag, mail/tasks, tools/communication, sbom - compliance/evidence, uni-crawler, brandbook (already done) - CollectionsTab, IngestionTab, RiskHeatmap backend-lehrer (5 files): - letters_api (641 → 2), certificates_api (636 → 2) - alerts_agent/db/models (636 → 3) - llm_gateway/communication_service (614 → 2) - game/database already done in prior batch klausur-service (2 files): - hybrid_vocab_extractor (664 → 2) - klausur-service/frontend: api.ts (620 → 3), EHUploadWizard (591 → 2) voice-service (3 files): - bqas/rag_judge (618 → 3), runner (529 → 2) - enhanced_task_orchestrator (519 → 2) studio-v2 (6 files): - korrektur/[klausurId] (578 → 4), fairness (569 → 2) - AlertsWizard (552 → 2), OnboardingWizard (513 → 2) - korrektur/api.ts (506 → 3), geo-lernwelt (501 → 2) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-25 08:56:45 +02:00
parent b4613e26f3
commit 451365a312
115 changed files with 10694 additions and 13839 deletions
@@ -1,82 +1,49 @@
 """
 RAG Judge - Specialized evaluation for RAG/Correction quality
+
+Split into:
+- rag_judge_types.py: Data classes for evaluation results
+- rag_judge_evaluators.py: Individual evaluation methods
+- rag_judge.py (this file): RAGJudge class (orchestrator + barrel re-exports)
 """
 import json
-import time
 import structlog
 import httpx
-from dataclasses import dataclass
-from typing import Literal, Optional, Dict, List, Any
-from datetime import datetime
+from typing import Optional, Dict, List, Any

 from bqas.config import BQASConfig
-from bqas.prompts import (
-    RAG_RETRIEVAL_JUDGE_PROMPT,
-    RAG_OPERATOR_JUDGE_PROMPT,
-    RAG_HALLUCINATION_JUDGE_PROMPT,
-    RAG_PRIVACY_JUDGE_PROMPT,
-    RAG_NAMESPACE_JUDGE_PROMPT,
-)
 from bqas.metrics import TestResult

+# Re-export types for backward compatibility
+from bqas.rag_judge_types import (
+    RAGRetrievalResult,
+    RAGOperatorResult,
+    RAGHallucinationResult,
+    RAGPrivacyResult,
+    RAGNamespaceResult,
+)
+
+from bqas.rag_judge_evaluators import (
+    evaluate_retrieval as _evaluate_retrieval,
+    evaluate_operator as _evaluate_operator,
+    evaluate_hallucination as _evaluate_hallucination,
+    evaluate_privacy as _evaluate_privacy,
+    evaluate_namespace as _evaluate_namespace,
+    evaluate_rag_test_case as _evaluate_rag_test_case,
+)
+
+__all__ = [
+    "RAGJudge",
+    "RAGRetrievalResult",
+    "RAGOperatorResult",
+    "RAGHallucinationResult",
+    "RAGPrivacyResult",
+    "RAGNamespaceResult",
+]
+
 logger = structlog.get_logger(__name__)


-@dataclass
-class RAGRetrievalResult:
-    """Result from RAG retrieval evaluation."""
-    retrieval_precision: int  # 0-100
-    faithfulness: int  # 1-5
-    relevance: int  # 1-5
-    citation_accuracy: int  # 1-5
-    reasoning: str
-    composite_score: float
-
-
-@dataclass
-class RAGOperatorResult:
-    """Result from operator alignment evaluation."""
-    operator_alignment: int  # 0-100
-    faithfulness: int  # 1-5
-    completeness: int  # 1-5
-    detected_afb: str  # I, II, III
-    reasoning: str
-    composite_score: float
-
-
-@dataclass
-class RAGHallucinationResult:
-    """Result from hallucination control evaluation."""
-    grounding_score: int  # 0-100
-    invention_detection: Literal["pass", "fail"]
-    source_attribution: int  # 1-5
-    hallucinated_claims: List[str]
-    reasoning: str
-    composite_score: float
-
-
-@dataclass
-class RAGPrivacyResult:
-    """Result from privacy compliance evaluation."""
-    privacy_compliance: Literal["pass", "fail"]
-    anonymization: int  # 1-5
-    dsgvo_compliance: Literal["pass", "fail"]
-    detected_pii: List[str]
-    reasoning: str
-    composite_score: float
-
-
-@dataclass
-class RAGNamespaceResult:
-    """Result from namespace isolation evaluation."""
-    namespace_compliance: Literal["pass", "fail"]
-    cross_tenant_leak: Literal["pass", "fail"]
-    school_sharing_compliance: int  # 1-5
-    detected_leaks: List[str]
-    reasoning: str
-    composite_score: float
-
-
 class RAGJudge:
    """
    Specialized judge for RAG/Correction quality evaluation.
@@ -130,460 +97,53 @@ class RAGJudge:
            logger.warning("Failed to parse JSON response", error=str(e), text=text[:200])
        return {}

-    # ================================
-    # Retrieval Evaluation
-    # ================================
-
    async def evaluate_retrieval(
-        self,
-        query: str,
-        aufgabentyp: str,
-        subject: str,
-        level: str,
-        retrieved_passage: str,
-        expected_concepts: List[str],
+        self, query: str, aufgabentyp: str, subject: str, level: str,
+        retrieved_passage: str, expected_concepts: List[str],
    ) -> RAGRetrievalResult:
-        """Evaluate EH retrieval quality."""
-        prompt = RAG_RETRIEVAL_JUDGE_PROMPT.format(
-            query=query,
-            aufgabentyp=aufgabentyp,
-            subject=subject,
-            level=level,
-            retrieved_passage=retrieved_passage,
-            expected_concepts=", ".join(expected_concepts),
+        return await _evaluate_retrieval(
+            self._call_ollama, self._parse_json_response, self.config,
+            query, aufgabentyp, subject, level, retrieved_passage, expected_concepts,
        )

-        try:
-            response_text = await self._call_ollama(prompt)
-            data = self._parse_json_response(response_text)
-
-            retrieval_precision = max(0, min(100, int(data.get("retrieval_precision", 0))))
-            faithfulness = max(1, min(5, int(data.get("faithfulness", 1))))
-            relevance = max(1, min(5, int(data.get("relevance", 1))))
-            citation_accuracy = max(1, min(5, int(data.get("citation_accuracy", 1))))
-
-            composite = self._calculate_retrieval_composite(
-                retrieval_precision, faithfulness, relevance, citation_accuracy
-            )
-
-            return RAGRetrievalResult(
-                retrieval_precision=retrieval_precision,
-                faithfulness=faithfulness,
-                relevance=relevance,
-                citation_accuracy=citation_accuracy,
-                reasoning=str(data.get("reasoning", ""))[:500],
-                composite_score=composite,
-            )
-
-        except Exception as e:
-            logger.error("Retrieval evaluation failed", error=str(e))
-            return RAGRetrievalResult(
-                retrieval_precision=0,
-                faithfulness=1,
-                relevance=1,
-                citation_accuracy=1,
-                reasoning=f"Evaluation failed: {str(e)}",
-                composite_score=0.0,
-            )
-
-    def _calculate_retrieval_composite(
-        self,
-        retrieval_precision: int,
-        faithfulness: int,
-        relevance: int,
-        citation_accuracy: int,
-    ) -> float:
-        """Calculate composite score for retrieval evaluation."""
-        c = self.config
-        retrieval_score = (retrieval_precision / 100) * 5
-
-        composite = (
-            retrieval_score * c.rag_retrieval_precision_weight +
-            faithfulness * c.rag_faithfulness_weight +
-            relevance * 0.3 +  # Higher weight for relevance in retrieval
-            citation_accuracy * c.rag_citation_accuracy_weight
-        )
-        return round(composite, 3)
-
-    # ================================
-    # Operator Evaluation
-    # ================================
-
    async def evaluate_operator(
-        self,
-        operator: str,
-        generated_definition: str,
-        expected_afb: str,
-        expected_actions: List[str],
+        self, operator: str, generated_definition: str,
+        expected_afb: str, expected_actions: List[str],
    ) -> RAGOperatorResult:
-        """Evaluate operator alignment."""
-        prompt = RAG_OPERATOR_JUDGE_PROMPT.format(
-            operator=operator,
-            generated_definition=generated_definition,
-            expected_afb=expected_afb,
-            expected_actions=", ".join(expected_actions),
+        return await _evaluate_operator(
+            self._call_ollama, self._parse_json_response,
+            operator, generated_definition, expected_afb, expected_actions,
        )

-        try:
-            response_text = await self._call_ollama(prompt)
-            data = self._parse_json_response(response_text)
-
-            operator_alignment = max(0, min(100, int(data.get("operator_alignment", 0))))
-            faithfulness = max(1, min(5, int(data.get("faithfulness", 1))))
-            completeness = max(1, min(5, int(data.get("completeness", 1))))
-            detected_afb = str(data.get("detected_afb", ""))
-
-            composite = self._calculate_operator_composite(
-                operator_alignment, faithfulness, completeness
-            )
-
-            return RAGOperatorResult(
-                operator_alignment=operator_alignment,
-                faithfulness=faithfulness,
-                completeness=completeness,
-                detected_afb=detected_afb,
-                reasoning=str(data.get("reasoning", ""))[:500],
-                composite_score=composite,
-            )
-
-        except Exception as e:
-            logger.error("Operator evaluation failed", error=str(e))
-            return RAGOperatorResult(
-                operator_alignment=0,
-                faithfulness=1,
-                completeness=1,
-                detected_afb="",
-                reasoning=f"Evaluation failed: {str(e)}",
-                composite_score=0.0,
-            )
-
-    def _calculate_operator_composite(
-        self,
-        operator_alignment: int,
-        faithfulness: int,
-        completeness: int,
-    ) -> float:
-        """Calculate composite score for operator evaluation."""
-        alignment_score = (operator_alignment / 100) * 5
-
-        composite = (
-            alignment_score * 0.5 +
-            faithfulness * 0.3 +
-            completeness * 0.2
-        )
-        return round(composite, 3)
-
-    # ================================
-    # Hallucination Evaluation
-    # ================================
-
    async def evaluate_hallucination(
-        self,
-        query: str,
-        response: str,
-        available_facts: List[str],
+        self, query: str, response: str, available_facts: List[str],
    ) -> RAGHallucinationResult:
-        """Evaluate for hallucinations."""
-        prompt = RAG_HALLUCINATION_JUDGE_PROMPT.format(
-            query=query,
-            response=response,
-            available_facts="\n".join(f"- {f}" for f in available_facts),
+        return await _evaluate_hallucination(
+            self._call_ollama, self._parse_json_response,
+            query, response, available_facts,
        )

-        try:
-            response_text = await self._call_ollama(prompt)
-            data = self._parse_json_response(response_text)
-
-            grounding_score = max(0, min(100, int(data.get("grounding_score", 0))))
-            invention_detection = "pass" if data.get("invention_detection") == "pass" else "fail"
-            source_attribution = max(1, min(5, int(data.get("source_attribution", 1))))
-            hallucinated_claims = data.get("hallucinated_claims", [])
-
-            composite = self._calculate_hallucination_composite(
-                grounding_score, invention_detection, source_attribution
-            )
-
-            return RAGHallucinationResult(
-                grounding_score=grounding_score,
-                invention_detection=invention_detection,
-                source_attribution=source_attribution,
-                hallucinated_claims=hallucinated_claims[:5],
-                reasoning=str(data.get("reasoning", ""))[:500],
-                composite_score=composite,
-            )
-
-        except Exception as e:
-            logger.error("Hallucination evaluation failed", error=str(e))
-            return RAGHallucinationResult(
-                grounding_score=0,
-                invention_detection="fail",
-                source_attribution=1,
-                hallucinated_claims=[],
-                reasoning=f"Evaluation failed: {str(e)}",
-                composite_score=0.0,
-            )
-
-    def _calculate_hallucination_composite(
-        self,
-        grounding_score: int,
-        invention_detection: str,
-        source_attribution: int,
-    ) -> float:
-        """Calculate composite score for hallucination evaluation."""
-        grounding = (grounding_score / 100) * 5
-        invention = 5.0 if invention_detection == "pass" else 0.0
-
-        composite = (
-            grounding * 0.4 +
-            invention * 0.4 +
-            source_attribution * 0.2
-        )
-        return round(composite, 3)
-
-    # ================================
-    # Privacy Evaluation
-    # ================================
-
    async def evaluate_privacy(
-        self,
-        query: str,
-        context: Dict[str, Any],
-        response: str,
+        self, query: str, context: Dict[str, Any], response: str,
    ) -> RAGPrivacyResult:
-        """Evaluate privacy/DSGVO compliance."""
-        prompt = RAG_PRIVACY_JUDGE_PROMPT.format(
-            query=query,
-            context=json.dumps(context, ensure_ascii=False, indent=2),
-            response=response,
+        return await _evaluate_privacy(
+            self._call_ollama, self._parse_json_response,
+            query, context, response,
        )

-        try:
-            response_text = await self._call_ollama(prompt)
-            data = self._parse_json_response(response_text)
-
-            privacy_compliance = "pass" if data.get("privacy_compliance") == "pass" else "fail"
-            anonymization = max(1, min(5, int(data.get("anonymization", 1))))
-            dsgvo_compliance = "pass" if data.get("dsgvo_compliance") == "pass" else "fail"
-            detected_pii = data.get("detected_pii", [])
-
-            composite = self._calculate_privacy_composite(
-                privacy_compliance, anonymization, dsgvo_compliance
-            )
-
-            return RAGPrivacyResult(
-                privacy_compliance=privacy_compliance,
-                anonymization=anonymization,
-                dsgvo_compliance=dsgvo_compliance,
-                detected_pii=detected_pii[:5],
-                reasoning=str(data.get("reasoning", ""))[:500],
-                composite_score=composite,
-            )
-
-        except Exception as e:
-            logger.error("Privacy evaluation failed", error=str(e))
-            return RAGPrivacyResult(
-                privacy_compliance="fail",
-                anonymization=1,
-                dsgvo_compliance="fail",
-                detected_pii=[],
-                reasoning=f"Evaluation failed: {str(e)}",
-                composite_score=0.0,
-            )
-
-    def _calculate_privacy_composite(
-        self,
-        privacy_compliance: str,
-        anonymization: int,
-        dsgvo_compliance: str,
-    ) -> float:
-        """Calculate composite score for privacy evaluation."""
-        privacy = 5.0 if privacy_compliance == "pass" else 0.0
-        dsgvo = 5.0 if dsgvo_compliance == "pass" else 0.0
-
-        composite = (
-            privacy * 0.4 +
-            anonymization * 0.2 +
-            dsgvo * 0.4
-        )
-        return round(composite, 3)
-
-    # ================================
-    # Namespace Evaluation
-    # ================================
-
    async def evaluate_namespace(
-        self,
-        teacher_id: str,
-        namespace: str,
-        school_id: str,
-        requested_data: str,
-        response: str,
+        self, teacher_id: str, namespace: str, school_id: str,
+        requested_data: str, response: str,
    ) -> RAGNamespaceResult:
-        """Evaluate namespace isolation."""
-        prompt = RAG_NAMESPACE_JUDGE_PROMPT.format(
-            teacher_id=teacher_id,
-            namespace=namespace,
-            school_id=school_id,
-            requested_data=requested_data,
-            response=response,
+        return await _evaluate_namespace(
+            self._call_ollama, self._parse_json_response,
+            teacher_id, namespace, school_id, requested_data, response,
        )

-        try:
-            response_text = await self._call_ollama(prompt)
-            data = self._parse_json_response(response_text)
-
-            namespace_compliance = "pass" if data.get("namespace_compliance") == "pass" else "fail"
-            cross_tenant_leak = "pass" if data.get("cross_tenant_leak") == "pass" else "fail"
-            school_sharing_compliance = max(1, min(5, int(data.get("school_sharing_compliance", 1))))
-            detected_leaks = data.get("detected_leaks", [])
-
-            composite = self._calculate_namespace_composite(
-                namespace_compliance, cross_tenant_leak, school_sharing_compliance
-            )
-
-            return RAGNamespaceResult(
-                namespace_compliance=namespace_compliance,
-                cross_tenant_leak=cross_tenant_leak,
-                school_sharing_compliance=school_sharing_compliance,
-                detected_leaks=detected_leaks[:5],
-                reasoning=str(data.get("reasoning", ""))[:500],
-                composite_score=composite,
-            )
-
-        except Exception as e:
-            logger.error("Namespace evaluation failed", error=str(e))
-            return RAGNamespaceResult(
-                namespace_compliance="fail",
-                cross_tenant_leak="fail",
-                school_sharing_compliance=1,
-                detected_leaks=[],
-                reasoning=f"Evaluation failed: {str(e)}",
-                composite_score=0.0,
-            )
-
-    def _calculate_namespace_composite(
-        self,
-        namespace_compliance: str,
-        cross_tenant_leak: str,
-        school_sharing_compliance: int,
-    ) -> float:
-        """Calculate composite score for namespace evaluation."""
-        ns_compliance = 5.0 if namespace_compliance == "pass" else 0.0
-        cross_tenant = 5.0 if cross_tenant_leak == "pass" else 0.0
-
-        composite = (
-            ns_compliance * 0.4 +
-            cross_tenant * 0.4 +
-            school_sharing_compliance * 0.2
-        )
-        return round(composite, 3)
-
-    # ================================
-    # Test Case Evaluation
-    # ================================
-
    async def evaluate_rag_test_case(
-        self,
-        test_case: Dict[str, Any],
-        service_response: Dict[str, Any],
+        self, test_case: Dict[str, Any], service_response: Dict[str, Any],
    ) -> TestResult:
-        """
-        Evaluate a full RAG test case from the golden suite.
-
-        Args:
-            test_case: Test case definition from YAML
-            service_response: Response from the service being tested
-
-        Returns:
-            TestResult with all metrics
-        """
-        start_time = time.time()
-
-        test_id = test_case.get("id", "UNKNOWN")
-        test_name = test_case.get("name", "")
-        category = test_case.get("category", "")
-        min_score = test_case.get("min_score", 3.5)
-
-        # Route to appropriate evaluation based on category
-        composite_score = 0.0
-        reasoning = ""
-
-        if category == "eh_retrieval":
-            result = await self.evaluate_retrieval(
-                query=test_case.get("input", {}).get("query", ""),
-                aufgabentyp=test_case.get("input", {}).get("context", {}).get("aufgabentyp", ""),
-                subject=test_case.get("input", {}).get("context", {}).get("subject", "Deutsch"),
-                level=test_case.get("input", {}).get("context", {}).get("level", "Abitur"),
-                retrieved_passage=service_response.get("passage", ""),
-                expected_concepts=test_case.get("expected", {}).get("must_contain_concepts", []),
-            )
-            composite_score = result.composite_score
-            reasoning = result.reasoning
-
-        elif category == "operator_alignment":
-            result = await self.evaluate_operator(
-                operator=test_case.get("input", {}).get("operator", ""),
-                generated_definition=service_response.get("definition", ""),
-                expected_afb=test_case.get("expected", {}).get("afb_level", ""),
-                expected_actions=test_case.get("expected", {}).get("expected_actions", []),
-            )
-            composite_score = result.composite_score
-            reasoning = result.reasoning
-
-        elif category == "hallucination_control":
-            result = await self.evaluate_hallucination(
-                query=test_case.get("input", {}).get("query", ""),
-                response=service_response.get("response", ""),
-                available_facts=test_case.get("input", {}).get("context", {}).get("available_facts", []),
-            )
-            composite_score = result.composite_score
-            reasoning = result.reasoning
-
-        elif category == "privacy_compliance":
-            result = await self.evaluate_privacy(
-                query=test_case.get("input", {}).get("query", ""),
-                context=test_case.get("input", {}).get("context", {}),
-                response=service_response.get("response", ""),
-            )
-            composite_score = result.composite_score
-            reasoning = result.reasoning
-
-        elif category == "namespace_isolation":
-            context = test_case.get("input", {}).get("context", {})
-            result = await self.evaluate_namespace(
-                teacher_id=context.get("teacher_id", ""),
-                namespace=context.get("namespace", ""),
-                school_id=context.get("school_id", ""),
-                requested_data=test_case.get("input", {}).get("query", ""),
-                response=service_response.get("response", ""),
-            )
-            composite_score = result.composite_score
-            reasoning = result.reasoning
-
-        else:
-            reasoning = f"Unknown category: {category}"
-
-        duration_ms = int((time.time() - start_time) * 1000)
-        passed = composite_score >= min_score
-
-        return TestResult(
-            test_id=test_id,
-            test_name=test_name,
-            user_input=str(test_case.get("input", {})),
-            expected_intent=category,
-            detected_intent=category,
-            response=str(service_response),
-            intent_accuracy=int(composite_score / 5 * 100),
-            faithfulness=int(composite_score),
-            relevance=int(composite_score),
-            coherence=int(composite_score),
-            safety="pass" if composite_score >= min_score else "fail",
-            composite_score=composite_score,
-            passed=passed,
-            reasoning=reasoning,
-            timestamp=datetime.utcnow(),
-            duration_ms=duration_ms,
-        )
+        return await _evaluate_rag_test_case(self, test_case, service_response)

    async def health_check(self) -> bool:
        """Check if Ollama and judge model are available."""
@@ -0,0 +1,397 @@
+"""
+RAG Judge Evaluators - Individual evaluation methods for RAG quality
+"""
+import json
+import time
+import structlog
+from typing import List, Dict, Any
+from datetime import datetime
+
+from bqas.config import BQASConfig
+from bqas.prompts import (
+    RAG_RETRIEVAL_JUDGE_PROMPT,
+    RAG_OPERATOR_JUDGE_PROMPT,
+    RAG_HALLUCINATION_JUDGE_PROMPT,
+    RAG_PRIVACY_JUDGE_PROMPT,
+    RAG_NAMESPACE_JUDGE_PROMPT,
+)
+from bqas.metrics import TestResult
+from bqas.rag_judge_types import (
+    RAGRetrievalResult,
+    RAGOperatorResult,
+    RAGHallucinationResult,
+    RAGPrivacyResult,
+    RAGNamespaceResult,
+)
+
+logger = structlog.get_logger(__name__)
+
+
+async def evaluate_retrieval(
+    call_ollama,
+    parse_json_response,
+    config: BQASConfig,
+    query: str,
+    aufgabentyp: str,
+    subject: str,
+    level: str,
+    retrieved_passage: str,
+    expected_concepts: List[str],
+) -> RAGRetrievalResult:
+    """Evaluate EH retrieval quality."""
+    prompt = RAG_RETRIEVAL_JUDGE_PROMPT.format(
+        query=query,
+        aufgabentyp=aufgabentyp,
+        subject=subject,
+        level=level,
+        retrieved_passage=retrieved_passage,
+        expected_concepts=", ".join(expected_concepts),
+    )
+
+    try:
+        response_text = await call_ollama(prompt)
+        data = parse_json_response(response_text)
+
+        retrieval_precision = max(0, min(100, int(data.get("retrieval_precision", 0))))
+        faithfulness = max(1, min(5, int(data.get("faithfulness", 1))))
+        relevance = max(1, min(5, int(data.get("relevance", 1))))
+        citation_accuracy = max(1, min(5, int(data.get("citation_accuracy", 1))))
+
+        composite = _calculate_retrieval_composite(
+            config, retrieval_precision, faithfulness, relevance, citation_accuracy
+        )
+
+        return RAGRetrievalResult(
+            retrieval_precision=retrieval_precision,
+            faithfulness=faithfulness,
+            relevance=relevance,
+            citation_accuracy=citation_accuracy,
+            reasoning=str(data.get("reasoning", ""))[:500],
+            composite_score=composite,
+        )
+
+    except Exception as e:
+        logger.error("Retrieval evaluation failed", error=str(e))
+        return RAGRetrievalResult(
+            retrieval_precision=0,
+            faithfulness=1,
+            relevance=1,
+            citation_accuracy=1,
+            reasoning=f"Evaluation failed: {str(e)}",
+            composite_score=0.0,
+        )
+
+
+def _calculate_retrieval_composite(
+    config: BQASConfig,
+    retrieval_precision: int,
+    faithfulness: int,
+    relevance: int,
+    citation_accuracy: int,
+) -> float:
+    """Calculate composite score for retrieval evaluation."""
+    retrieval_score = (retrieval_precision / 100) * 5
+    composite = (
+        retrieval_score * config.rag_retrieval_precision_weight +
+        faithfulness * config.rag_faithfulness_weight +
+        relevance * 0.3 +
+        citation_accuracy * config.rag_citation_accuracy_weight
+    )
+    return round(composite, 3)
+
+
+async def evaluate_operator(
+    call_ollama,
+    parse_json_response,
+    operator: str,
+    generated_definition: str,
+    expected_afb: str,
+    expected_actions: List[str],
+) -> RAGOperatorResult:
+    """Evaluate operator alignment."""
+    prompt = RAG_OPERATOR_JUDGE_PROMPT.format(
+        operator=operator,
+        generated_definition=generated_definition,
+        expected_afb=expected_afb,
+        expected_actions=", ".join(expected_actions),
+    )
+
+    try:
+        response_text = await call_ollama(prompt)
+        data = parse_json_response(response_text)
+
+        operator_alignment = max(0, min(100, int(data.get("operator_alignment", 0))))
+        faithfulness = max(1, min(5, int(data.get("faithfulness", 1))))
+        completeness = max(1, min(5, int(data.get("completeness", 1))))
+        detected_afb = str(data.get("detected_afb", ""))
+
+        alignment_score = (operator_alignment / 100) * 5
+        composite = round(
+            alignment_score * 0.5 + faithfulness * 0.3 + completeness * 0.2, 3
+        )
+
+        return RAGOperatorResult(
+            operator_alignment=operator_alignment,
+            faithfulness=faithfulness,
+            completeness=completeness,
+            detected_afb=detected_afb,
+            reasoning=str(data.get("reasoning", ""))[:500],
+            composite_score=composite,
+        )
+
+    except Exception as e:
+        logger.error("Operator evaluation failed", error=str(e))
+        return RAGOperatorResult(
+            operator_alignment=0,
+            faithfulness=1,
+            completeness=1,
+            detected_afb="",
+            reasoning=f"Evaluation failed: {str(e)}",
+            composite_score=0.0,
+        )
+
+
+async def evaluate_hallucination(
+    call_ollama,
+    parse_json_response,
+    query: str,
+    response: str,
+    available_facts: List[str],
+) -> RAGHallucinationResult:
+    """Evaluate for hallucinations."""
+    prompt = RAG_HALLUCINATION_JUDGE_PROMPT.format(
+        query=query,
+        response=response,
+        available_facts="\n".join(f"- {f}" for f in available_facts),
+    )
+
+    try:
+        response_text = await call_ollama(prompt)
+        data = parse_json_response(response_text)
+
+        grounding_score = max(0, min(100, int(data.get("grounding_score", 0))))
+        invention_detection = "pass" if data.get("invention_detection") == "pass" else "fail"
+        source_attribution = max(1, min(5, int(data.get("source_attribution", 1))))
+        hallucinated_claims = data.get("hallucinated_claims", [])
+
+        grounding = (grounding_score / 100) * 5
+        invention = 5.0 if invention_detection == "pass" else 0.0
+        composite = round(grounding * 0.4 + invention * 0.4 + source_attribution * 0.2, 3)
+
+        return RAGHallucinationResult(
+            grounding_score=grounding_score,
+            invention_detection=invention_detection,
+            source_attribution=source_attribution,
+            hallucinated_claims=hallucinated_claims[:5],
+            reasoning=str(data.get("reasoning", ""))[:500],
+            composite_score=composite,
+        )
+
+    except Exception as e:
+        logger.error("Hallucination evaluation failed", error=str(e))
+        return RAGHallucinationResult(
+            grounding_score=0,
+            invention_detection="fail",
+            source_attribution=1,
+            hallucinated_claims=[],
+            reasoning=f"Evaluation failed: {str(e)}",
+            composite_score=0.0,
+        )
+
+
+async def evaluate_privacy(
+    call_ollama,
+    parse_json_response,
+    query: str,
+    context: Dict[str, Any],
+    response: str,
+) -> RAGPrivacyResult:
+    """Evaluate privacy/DSGVO compliance."""
+    prompt = RAG_PRIVACY_JUDGE_PROMPT.format(
+        query=query,
+        context=json.dumps(context, ensure_ascii=False, indent=2),
+        response=response,
+    )
+
+    try:
+        response_text = await call_ollama(prompt)
+        data = parse_json_response(response_text)
+
+        privacy_compliance = "pass" if data.get("privacy_compliance") == "pass" else "fail"
+        anonymization = max(1, min(5, int(data.get("anonymization", 1))))
+        dsgvo_compliance = "pass" if data.get("dsgvo_compliance") == "pass" else "fail"
+        detected_pii = data.get("detected_pii", [])
+
+        privacy = 5.0 if privacy_compliance == "pass" else 0.0
+        dsgvo = 5.0 if dsgvo_compliance == "pass" else 0.0
+        composite = round(privacy * 0.4 + anonymization * 0.2 + dsgvo * 0.4, 3)
+
+        return RAGPrivacyResult(
+            privacy_compliance=privacy_compliance,
+            anonymization=anonymization,
+            dsgvo_compliance=dsgvo_compliance,
+            detected_pii=detected_pii[:5],
+            reasoning=str(data.get("reasoning", ""))[:500],
+            composite_score=composite,
+        )
+
+    except Exception as e:
+        logger.error("Privacy evaluation failed", error=str(e))
+        return RAGPrivacyResult(
+            privacy_compliance="fail",
+            anonymization=1,
+            dsgvo_compliance="fail",
+            detected_pii=[],
+            reasoning=f"Evaluation failed: {str(e)}",
+            composite_score=0.0,
+        )
+
+
+async def evaluate_namespace(
+    call_ollama,
+    parse_json_response,
+    teacher_id: str,
+    namespace: str,
+    school_id: str,
+    requested_data: str,
+    response: str,
+) -> RAGNamespaceResult:
+    """Evaluate namespace isolation."""
+    prompt = RAG_NAMESPACE_JUDGE_PROMPT.format(
+        teacher_id=teacher_id,
+        namespace=namespace,
+        school_id=school_id,
+        requested_data=requested_data,
+        response=response,
+    )
+
+    try:
+        response_text = await call_ollama(prompt)
+        data = parse_json_response(response_text)
+
+        namespace_compliance = "pass" if data.get("namespace_compliance") == "pass" else "fail"
+        cross_tenant_leak = "pass" if data.get("cross_tenant_leak") == "pass" else "fail"
+        school_sharing_compliance = max(1, min(5, int(data.get("school_sharing_compliance", 1))))
+        detected_leaks = data.get("detected_leaks", [])
+
+        ns_compliance = 5.0 if namespace_compliance == "pass" else 0.0
+        cross_tenant = 5.0 if cross_tenant_leak == "pass" else 0.0
+        composite = round(
+            ns_compliance * 0.4 + cross_tenant * 0.4 + school_sharing_compliance * 0.2, 3
+        )
+
+        return RAGNamespaceResult(
+            namespace_compliance=namespace_compliance,
+            cross_tenant_leak=cross_tenant_leak,
+            school_sharing_compliance=school_sharing_compliance,
+            detected_leaks=detected_leaks[:5],
+            reasoning=str(data.get("reasoning", ""))[:500],
+            composite_score=composite,
+        )
+
+    except Exception as e:
+        logger.error("Namespace evaluation failed", error=str(e))
+        return RAGNamespaceResult(
+            namespace_compliance="fail",
+            cross_tenant_leak="fail",
+            school_sharing_compliance=1,
+            detected_leaks=[],
+            reasoning=f"Evaluation failed: {str(e)}",
+            composite_score=0.0,
+        )
+
+
+async def evaluate_rag_test_case(
+    judge_instance,
+    test_case: Dict[str, Any],
+    service_response: Dict[str, Any],
+) -> TestResult:
+    """
+    Evaluate a full RAG test case from the golden suite.
+    """
+    start_time = time.time()
+
+    test_id = test_case.get("id", "UNKNOWN")
+    test_name = test_case.get("name", "")
+    category = test_case.get("category", "")
+    min_score = test_case.get("min_score", 3.5)
+
+    composite_score = 0.0
+    reasoning = ""
+
+    if category == "eh_retrieval":
+        result = await judge_instance.evaluate_retrieval(
+            query=test_case.get("input", {}).get("query", ""),
+            aufgabentyp=test_case.get("input", {}).get("context", {}).get("aufgabentyp", ""),
+            subject=test_case.get("input", {}).get("context", {}).get("subject", "Deutsch"),
+            level=test_case.get("input", {}).get("context", {}).get("level", "Abitur"),
+            retrieved_passage=service_response.get("passage", ""),
+            expected_concepts=test_case.get("expected", {}).get("must_contain_concepts", []),
+        )
+        composite_score = result.composite_score
+        reasoning = result.reasoning
+
+    elif category == "operator_alignment":
+        result = await judge_instance.evaluate_operator(
+            operator=test_case.get("input", {}).get("operator", ""),
+            generated_definition=service_response.get("definition", ""),
+            expected_afb=test_case.get("expected", {}).get("afb_level", ""),
+            expected_actions=test_case.get("expected", {}).get("expected_actions", []),
+        )
+        composite_score = result.composite_score
+        reasoning = result.reasoning
+
+    elif category == "hallucination_control":
+        result = await judge_instance.evaluate_hallucination(
+            query=test_case.get("input", {}).get("query", ""),
+            response=service_response.get("response", ""),
+            available_facts=test_case.get("input", {}).get("context", {}).get("available_facts", []),
+        )
+        composite_score = result.composite_score
+        reasoning = result.reasoning
+
+    elif category == "privacy_compliance":
+        result = await judge_instance.evaluate_privacy(
+            query=test_case.get("input", {}).get("query", ""),
+            context=test_case.get("input", {}).get("context", {}),
+            response=service_response.get("response", ""),
+        )
+        composite_score = result.composite_score
+        reasoning = result.reasoning
+
+    elif category == "namespace_isolation":
+        context = test_case.get("input", {}).get("context", {})
+        result = await judge_instance.evaluate_namespace(
+            teacher_id=context.get("teacher_id", ""),
+            namespace=context.get("namespace", ""),
+            school_id=context.get("school_id", ""),
+            requested_data=test_case.get("input", {}).get("query", ""),
+            response=service_response.get("response", ""),
+        )
+        composite_score = result.composite_score
+        reasoning = result.reasoning
+
+    else:
+        reasoning = f"Unknown category: {category}"
+
+    duration_ms = int((time.time() - start_time) * 1000)
+    passed = composite_score >= min_score
+
+    return TestResult(
+        test_id=test_id,
+        test_name=test_name,
+        user_input=str(test_case.get("input", {})),
+        expected_intent=category,
+        detected_intent=category,
+        response=str(service_response),
+        intent_accuracy=int(composite_score / 5 * 100),
+        faithfulness=int(composite_score),
+        relevance=int(composite_score),
+        coherence=int(composite_score),
+        safety="pass" if composite_score >= min_score else "fail",
+        composite_score=composite_score,
+        passed=passed,
+        reasoning=reasoning,
+        timestamp=datetime.utcnow(),
+        duration_ms=duration_ms,
+    )
@@ -0,0 +1,60 @@
+"""
+RAG Judge Types - Data classes for RAG evaluation results
+"""
+from dataclasses import dataclass
+from typing import Literal, List
+
+
+@dataclass
+class RAGRetrievalResult:
+    """Result from RAG retrieval evaluation."""
+    retrieval_precision: int  # 0-100
+    faithfulness: int  # 1-5
+    relevance: int  # 1-5
+    citation_accuracy: int  # 1-5
+    reasoning: str
+    composite_score: float
+
+
+@dataclass
+class RAGOperatorResult:
+    """Result from operator alignment evaluation."""
+    operator_alignment: int  # 0-100
+    faithfulness: int  # 1-5
+    completeness: int  # 1-5
+    detected_afb: str  # I, II, III
+    reasoning: str
+    composite_score: float
+
+
+@dataclass
+class RAGHallucinationResult:
+    """Result from hallucination control evaluation."""
+    grounding_score: int  # 0-100
+    invention_detection: Literal["pass", "fail"]
+    source_attribution: int  # 1-5
+    hallucinated_claims: List[str]
+    reasoning: str
+    composite_score: float
+
+
+@dataclass
+class RAGPrivacyResult:
+    """Result from privacy compliance evaluation."""
+    privacy_compliance: Literal["pass", "fail"]
+    anonymization: int  # 1-5
+    dsgvo_compliance: Literal["pass", "fail"]
+    detected_pii: List[str]
+    reasoning: str
+    composite_score: float
+
+
+@dataclass
+class RAGNamespaceResult:
+    """Result from namespace isolation evaluation."""
+    namespace_compliance: Literal["pass", "fail"]
+    cross_tenant_leak: Literal["pass", "fail"]
+    school_sharing_compliance: int  # 1-5
+    detected_leaks: List[str]
+    reasoning: str
+    composite_score: float
@@ -1,11 +1,12 @@
 """
 BQAS Test Runner - Executes Golden, RAG, and Synthetic test suites
+
+Split into:
+- runner_golden.py: Test loading, simulation helpers, error result creation
+- runner.py (this file): BQASRunner class, singleton
 """
-import yaml
-import asyncio
 import structlog
 import httpx
-from pathlib import Path
 from typing import List, Dict, Any, Optional
 from datetime import datetime
 from dataclasses import dataclass, field
@@ -15,6 +16,13 @@ from bqas.judge import LLMJudge
 from bqas.rag_judge import RAGJudge
 from bqas.metrics import TestResult, BQASMetrics
 from bqas.synthetic_generator import SyntheticGenerator
+from bqas.runner_golden import (
+    load_golden_tests,
+    load_rag_tests,
+    simulate_response,
+    create_error_result,
+    simulate_rag_response,
+)

 logger = structlog.get_logger(__name__)

@@ -61,87 +69,42 @@ class BQASRunner:
    # ================================

    async def run_golden_suite(self, git_commit: Optional[str] = None) -> TestRun:
-        """
-        Run the golden test suite.
-
-        Loads test cases from YAML files and evaluates each one.
-        """
+        """Run the golden test suite."""
        logger.info("Starting Golden Suite run")
        start_time = datetime.utcnow()

-        # Load all golden test cases
-        test_cases = await self._load_golden_tests()
+        test_cases = await load_golden_tests()
        logger.info(f"Loaded {len(test_cases)} golden test cases")

-        # Run all tests
        results = []
        for i, test_case in enumerate(test_cases):
            try:
                result = await self._run_golden_test(test_case)
                results.append(result)
-
                if (i + 1) % 10 == 0:
                    logger.info(f"Progress: {i + 1}/{len(test_cases)} tests completed")
-
            except Exception as e:
                logger.error(f"Test {test_case.get('id')} failed with error", error=str(e))
-                # Create a failed result
-                results.append(self._create_error_result(test_case, str(e)))
+                results.append(create_error_result(test_case, str(e)))

-        # Calculate metrics
        metrics = BQASMetrics.from_results(results)
        duration = (datetime.utcnow() - start_time).total_seconds()

-        # Record run
        self._run_counter += 1
        run = TestRun(
-            id=self._run_counter,
-            suite="golden",
-            timestamp=start_time,
-            git_commit=git_commit,
-            metrics=metrics,
-            results=results,
+            id=self._run_counter, suite="golden", timestamp=start_time,
+            git_commit=git_commit, metrics=metrics, results=results,
            duration_seconds=duration,
        )
        self._test_runs.insert(0, run)

        logger.info(
-            "Golden Suite completed",
-            total=metrics.total_tests,
-            passed=metrics.passed_tests,
-            failed=metrics.failed_tests,
-            score=metrics.avg_composite_score,
-            duration=f"{duration:.1f}s",
+            "Golden Suite completed", total=metrics.total_tests,
+            passed=metrics.passed_tests, failed=metrics.failed_tests,
+            score=metrics.avg_composite_score, duration=f"{duration:.1f}s",
        )
-
        return run

-    async def _load_golden_tests(self) -> List[Dict[str, Any]]:
-        """Load all golden test cases from YAML files."""
-        tests = []
-        golden_dir = Path(__file__).parent.parent / "tests" / "bqas" / "golden_tests"
-
-        yaml_files = [
-            "intent_tests.yaml",
-            "edge_cases.yaml",
-            "workflow_tests.yaml",
-        ]
-
-        for filename in yaml_files:
-            filepath = golden_dir / filename
-            if filepath.exists():
-                try:
-                    with open(filepath, 'r', encoding='utf-8') as f:
-                        data = yaml.safe_load(f)
-                        if data and 'tests' in data:
-                            for test in data['tests']:
-                                test['source_file'] = filename
-                            tests.extend(data['tests'])
-                except Exception as e:
-                    logger.warning(f"Failed to load {filename}", error=str(e))
-
-        return tests
-
    async def _run_golden_test(self, test_case: Dict[str, Any]) -> TestResult:
        """Run a single golden test case."""
        test_id = test_case.get('id', 'UNKNOWN')
@@ -150,38 +113,19 @@ class BQASRunner:
        expected_intent = test_case.get('expected_intent', '')
        min_score = test_case.get('min_score', self.config.min_golden_score)

-        # Get response from voice service (or simulate)
        detected_intent, response = await self._get_voice_response(user_input, expected_intent)

-        # Evaluate with judge
        result = await self.judge.evaluate_test_case(
-            test_id=test_id,
-            test_name=test_name,
-            user_input=user_input,
-            expected_intent=expected_intent,
-            detected_intent=detected_intent,
-            response=response,
-            min_score=min_score,
+            test_id=test_id, test_name=test_name, user_input=user_input,
+            expected_intent=expected_intent, detected_intent=detected_intent,
+            response=response, min_score=min_score,
        )
-
        return result

-    async def _get_voice_response(
-        self,
-        user_input: str,
-        expected_intent: str
-    ) -> tuple[str, str]:
-        """
-        Get response from voice service.
-
-        For now, simulates responses since the full voice pipeline
-        might not be available. In production, this would call the
-        actual voice service endpoints.
-        """
+    async def _get_voice_response(self, user_input: str, expected_intent: str) -> tuple:
+        """Get response from voice service."""
        try:
            client = await self._get_client()
-
-            # Try to call the voice service intent detection
            response = await client.post(
                f"{self.config.voice_service_url}/api/v1/tasks",
                json={
@@ -191,231 +135,71 @@ class BQASRunner:
                },
                timeout=10.0,
            )
-
            if response.status_code == 200:
                data = response.json()
                return data.get('detected_intent', expected_intent), data.get('response', f"Verarbeite: {user_input}")
-
        except Exception as e:
            logger.debug(f"Voice service call failed, using simulation", error=str(e))

-        # Simulate response based on expected intent
-        return self._simulate_response(user_input, expected_intent)
-
-    def _simulate_response(self, user_input: str, expected_intent: str) -> tuple[str, str]:
-        """Simulate voice service response for testing without live service."""
-        # Simulate realistic detected intent (90% correct for golden tests)
-        import random
-        if random.random() < 0.90:
-            detected_intent = expected_intent
-        else:
-            # Simulate occasional misclassification
-            intents = ["student_observation", "reminder", "worksheet_generate", "parent_letter", "smalltalk"]
-            detected_intent = random.choice([i for i in intents if i != expected_intent])
-
-        # Generate simulated response
-        responses = {
-            "student_observation": f"Notiz wurde gespeichert: {user_input}",
-            "reminder": f"Erinnerung erstellt: {user_input}",
-            "worksheet_generate": f"Arbeitsblatt wird generiert basierend auf: {user_input}",
-            "homework_check": f"Hausaufgabenkontrolle eingetragen: {user_input}",
-            "parent_letter": f"Elternbrief-Entwurf erstellt: {user_input}",
-            "class_message": f"Nachricht an Klasse vorbereitet: {user_input}",
-            "quiz_generate": f"Quiz wird erstellt: {user_input}",
-            "quick_activity": f"Einstiegsaktivitaet geplant: {user_input}",
-            "canvas_edit": f"Aenderung am Canvas wird ausgefuehrt: {user_input}",
-            "canvas_layout": f"Layout wird angepasst: {user_input}",
-            "operator_checklist": f"Operatoren-Checkliste geladen: {user_input}",
-            "eh_passage": f"EH-Passage gefunden: {user_input}",
-            "feedback_suggest": f"Feedback-Vorschlag: {user_input}",
-            "reminder_schedule": f"Erinnerung geplant: {user_input}",
-            "task_summary": f"Aufgabenuebersicht: {user_input}",
-            "conference_topic": f"Konferenzthema notiert: {user_input}",
-            "correction_note": f"Korrekturnotiz gespeichert: {user_input}",
-            "worksheet_differentiate": f"Differenzierung wird erstellt: {user_input}",
-        }
-
-        response = responses.get(detected_intent, f"Verstanden: {user_input}")
-        return detected_intent, response
-
-    def _create_error_result(self, test_case: Dict[str, Any], error: str) -> TestResult:
-        """Create a failed test result due to error."""
-        return TestResult(
-            test_id=test_case.get('id', 'UNKNOWN'),
-            test_name=test_case.get('name', 'Error'),
-            user_input=test_case.get('input', ''),
-            expected_intent=test_case.get('expected_intent', ''),
-            detected_intent='error',
-            response='',
-            intent_accuracy=0,
-            faithfulness=1,
-            relevance=1,
-            coherence=1,
-            safety='fail',
-            composite_score=0.0,
-            passed=False,
-            reasoning=f"Test execution error: {error}",
-            timestamp=datetime.utcnow(),
-            duration_ms=0,
-        )
+        return simulate_response(user_input, expected_intent)

    # ================================
    # RAG Suite Runner
    # ================================

    async def run_rag_suite(self, git_commit: Optional[str] = None) -> TestRun:
-        """
-        Run the RAG/Correction test suite.
-
-        Tests EH retrieval, operator alignment, hallucination control, etc.
-        """
+        """Run the RAG/Correction test suite."""
        logger.info("Starting RAG Suite run")
        start_time = datetime.utcnow()

-        # Load RAG test cases
-        test_cases = await self._load_rag_tests()
+        test_cases = await load_rag_tests()
        logger.info(f"Loaded {len(test_cases)} RAG test cases")

-        # Run all tests
        results = []
        for i, test_case in enumerate(test_cases):
            try:
-                result = await self._run_rag_test(test_case)
+                service_response = await simulate_rag_response(test_case)
+                result = await self.rag_judge.evaluate_rag_test_case(
+                    test_case=test_case, service_response=service_response,
+                )
                results.append(result)
-
                if (i + 1) % 5 == 0:
                    logger.info(f"Progress: {i + 1}/{len(test_cases)} RAG tests completed")
-
            except Exception as e:
                logger.error(f"RAG test {test_case.get('id')} failed", error=str(e))
-                results.append(self._create_error_result(test_case, str(e)))
+                results.append(create_error_result(test_case, str(e)))

-        # Calculate metrics
        metrics = BQASMetrics.from_results(results)
        duration = (datetime.utcnow() - start_time).total_seconds()

-        # Record run
        self._run_counter += 1
        run = TestRun(
-            id=self._run_counter,
-            suite="rag",
-            timestamp=start_time,
-            git_commit=git_commit,
-            metrics=metrics,
-            results=results,
+            id=self._run_counter, suite="rag", timestamp=start_time,
+            git_commit=git_commit, metrics=metrics, results=results,
            duration_seconds=duration,
        )
        self._test_runs.insert(0, run)

        logger.info(
-            "RAG Suite completed",
-            total=metrics.total_tests,
-            passed=metrics.passed_tests,
-            score=metrics.avg_composite_score,
+            "RAG Suite completed", total=metrics.total_tests,
+            passed=metrics.passed_tests, score=metrics.avg_composite_score,
            duration=f"{duration:.1f}s",
        )
-
        return run

-    async def _load_rag_tests(self) -> List[Dict[str, Any]]:
-        """Load RAG test cases from YAML."""
-        tests = []
-        rag_file = Path(__file__).parent.parent / "tests" / "bqas" / "golden_tests" / "golden_rag_correction_v1.yaml"
-
-        if rag_file.exists():
-            try:
-                with open(rag_file, 'r', encoding='utf-8') as f:
-                    # Handle YAML documents separated by ---
-                    documents = list(yaml.safe_load_all(f))
-                    for doc in documents:
-                        if doc and 'tests' in doc:
-                            tests.extend(doc['tests'])
-                        if doc and 'edge_cases' in doc:
-                            tests.extend(doc['edge_cases'])
-            except Exception as e:
-                logger.warning(f"Failed to load RAG tests", error=str(e))
-
-        return tests
-
-    async def _run_rag_test(self, test_case: Dict[str, Any]) -> TestResult:
-        """Run a single RAG test case."""
-        # Simulate service response for RAG tests
-        service_response = await self._simulate_rag_response(test_case)
-
-        # Evaluate with RAG judge
-        result = await self.rag_judge.evaluate_rag_test_case(
-            test_case=test_case,
-            service_response=service_response,
-        )
-
-        return result
-
-    async def _simulate_rag_response(self, test_case: Dict[str, Any]) -> Dict[str, Any]:
-        """Simulate RAG service response."""
-        category = test_case.get('category', '')
-        input_data = test_case.get('input', {})
-        expected = test_case.get('expected', {})
-
-        # Simulate responses based on category
-        if category == 'eh_retrieval':
-            concepts = expected.get('must_contain_concepts', [])
-            passage = f"Der Erwartungshorizont sieht folgende Aspekte vor: {', '.join(concepts[:3])}. "
-            passage += "Diese muessen im Rahmen der Aufgabenbearbeitung beruecksichtigt werden."
-            return {
-                "passage": passage,
-                "source": "EH_Deutsch_Abitur_2024_NI.pdf",
-                "relevance_score": 0.85,
-            }
-
-        elif category == 'operator_alignment':
-            operator = input_data.get('operator', '')
-            afb = expected.get('afb_level', 'II')
-            actions = expected.get('expected_actions', [])
-            return {
-                "operator": operator,
-                "definition": f"'{operator}' gehoert zu Anforderungsbereich {afb}. Erwartete Handlungen: {', '.join(actions[:2])}.",
-                "afb_level": afb,
-            }
-
-        elif category == 'hallucination_control':
-            return {
-                "response": "Basierend auf den verfuegbaren Informationen kann ich folgendes feststellen...",
-                "grounded": True,
-            }
-
-        elif category == 'privacy_compliance':
-            return {
-                "response": "Die Arbeit zeigt folgende Merkmale... [anonymisiert]",
-                "contains_pii": False,
-            }
-
-        elif category == 'namespace_isolation':
-            return {
-                "response": "Zugriff nur auf Daten im eigenen Namespace.",
-                "namespace_violation": False,
-            }
-
-        return {"response": "Simulated response", "success": True}
-
    # ================================
    # Synthetic Suite Runner
    # ================================

    async def run_synthetic_suite(self, git_commit: Optional[str] = None) -> TestRun:
-        """
-        Run the synthetic test suite.
-
-        Generates test variations using LLM and evaluates them.
-        """
+        """Run the synthetic test suite."""
        logger.info("Starting Synthetic Suite run")
        start_time = datetime.utcnow()

-        # Generate synthetic tests
        all_variations = await self.synthetic_generator.generate_all_intents(
            count_per_intent=self.config.synthetic_count_per_intent
        )

-        # Flatten variations
        test_cases = []
        for intent, variations in all_variations.items():
            for i, v in enumerate(variations):
@@ -431,45 +215,33 @@ class BQASRunner:

        logger.info(f"Generated {len(test_cases)} synthetic test cases")

-        # Run all tests
        results = []
        for i, test_case in enumerate(test_cases):
            try:
-                result = await self._run_golden_test(test_case)  # Same logic as golden
+                result = await self._run_golden_test(test_case)
                results.append(result)
-
                if (i + 1) % 20 == 0:
                    logger.info(f"Progress: {i + 1}/{len(test_cases)} synthetic tests completed")
-
            except Exception as e:
                logger.error(f"Synthetic test {test_case.get('id')} failed", error=str(e))
-                results.append(self._create_error_result(test_case, str(e)))
+                results.append(create_error_result(test_case, str(e)))

-        # Calculate metrics
        metrics = BQASMetrics.from_results(results)
        duration = (datetime.utcnow() - start_time).total_seconds()

-        # Record run
        self._run_counter += 1
        run = TestRun(
-            id=self._run_counter,
-            suite="synthetic",
-            timestamp=start_time,
-            git_commit=git_commit,
-            metrics=metrics,
-            results=results,
+            id=self._run_counter, suite="synthetic", timestamp=start_time,
+            git_commit=git_commit, metrics=metrics, results=results,
            duration_seconds=duration,
        )
        self._test_runs.insert(0, run)

        logger.info(
-            "Synthetic Suite completed",
-            total=metrics.total_tests,
-            passed=metrics.passed_tests,
-            score=metrics.avg_composite_score,
+            "Synthetic Suite completed", total=metrics.total_tests,
+            passed=metrics.passed_tests, score=metrics.avg_composite_score,
            duration=f"{duration:.1f}s",
        )
-
        return run

    # ================================
@@ -483,20 +255,17 @@ class BQASRunner:
    def get_latest_metrics(self) -> Dict[str, Optional[BQASMetrics]]:
        """Get latest metrics for each suite."""
        result = {"golden": None, "rag": None, "synthetic": None}
-
        for run in self._test_runs:
            if result[run.suite] is None:
                result[run.suite] = run.metrics
            if all(v is not None for v in result.values()):
                break
-
        return result

    async def health_check(self) -> Dict[str, Any]:
        """Check health of BQAS components."""
        judge_ok = await self.judge.health_check()
        rag_judge_ok = await self.rag_judge.health_check()
-
        return {
            "judge_available": judge_ok,
            "rag_judge_available": rag_judge_ok,
@@ -0,0 +1,162 @@
+"""
+BQAS Golden Suite Runner - Loads and executes golden test cases
+"""
+import yaml
+import structlog
+from pathlib import Path
+from typing import List, Dict, Any, Optional
+from datetime import datetime
+
+from bqas.metrics import TestResult
+
+logger = structlog.get_logger(__name__)
+
+
+async def load_golden_tests() -> List[Dict[str, Any]]:
+    """Load all golden test cases from YAML files."""
+    tests = []
+    golden_dir = Path(__file__).parent.parent / "tests" / "bqas" / "golden_tests"
+
+    yaml_files = [
+        "intent_tests.yaml",
+        "edge_cases.yaml",
+        "workflow_tests.yaml",
+    ]
+
+    for filename in yaml_files:
+        filepath = golden_dir / filename
+        if filepath.exists():
+            try:
+                with open(filepath, 'r', encoding='utf-8') as f:
+                    data = yaml.safe_load(f)
+                    if data and 'tests' in data:
+                        for test in data['tests']:
+                            test['source_file'] = filename
+                        tests.extend(data['tests'])
+            except Exception as e:
+                logger.warning(f"Failed to load {filename}", error=str(e))
+
+    return tests
+
+
+async def load_rag_tests() -> List[Dict[str, Any]]:
+    """Load RAG test cases from YAML."""
+    tests = []
+    rag_file = Path(__file__).parent.parent / "tests" / "bqas" / "golden_tests" / "golden_rag_correction_v1.yaml"
+
+    if rag_file.exists():
+        try:
+            with open(rag_file, 'r', encoding='utf-8') as f:
+                documents = list(yaml.safe_load_all(f))
+                for doc in documents:
+                    if doc and 'tests' in doc:
+                        tests.extend(doc['tests'])
+                    if doc and 'edge_cases' in doc:
+                        tests.extend(doc['edge_cases'])
+        except Exception as e:
+            logger.warning(f"Failed to load RAG tests", error=str(e))
+
+    return tests
+
+
+def simulate_response(user_input: str, expected_intent: str) -> tuple:
+    """Simulate voice service response for testing without live service."""
+    import random
+    if random.random() < 0.90:
+        detected_intent = expected_intent
+    else:
+        intents = ["student_observation", "reminder", "worksheet_generate", "parent_letter", "smalltalk"]
+        detected_intent = random.choice([i for i in intents if i != expected_intent])
+
+    responses = {
+        "student_observation": f"Notiz wurde gespeichert: {user_input}",
+        "reminder": f"Erinnerung erstellt: {user_input}",
+        "worksheet_generate": f"Arbeitsblatt wird generiert basierend auf: {user_input}",
+        "homework_check": f"Hausaufgabenkontrolle eingetragen: {user_input}",
+        "parent_letter": f"Elternbrief-Entwurf erstellt: {user_input}",
+        "class_message": f"Nachricht an Klasse vorbereitet: {user_input}",
+        "quiz_generate": f"Quiz wird erstellt: {user_input}",
+        "quick_activity": f"Einstiegsaktivitaet geplant: {user_input}",
+        "canvas_edit": f"Aenderung am Canvas wird ausgefuehrt: {user_input}",
+        "canvas_layout": f"Layout wird angepasst: {user_input}",
+        "operator_checklist": f"Operatoren-Checkliste geladen: {user_input}",
+        "eh_passage": f"EH-Passage gefunden: {user_input}",
+        "feedback_suggest": f"Feedback-Vorschlag: {user_input}",
+        "reminder_schedule": f"Erinnerung geplant: {user_input}",
+        "task_summary": f"Aufgabenuebersicht: {user_input}",
+        "conference_topic": f"Konferenzthema notiert: {user_input}",
+        "correction_note": f"Korrekturnotiz gespeichert: {user_input}",
+        "worksheet_differentiate": f"Differenzierung wird erstellt: {user_input}",
+    }
+
+    response = responses.get(detected_intent, f"Verstanden: {user_input}")
+    return detected_intent, response
+
+
+def create_error_result(test_case: Dict[str, Any], error: str) -> TestResult:
+    """Create a failed test result due to error."""
+    return TestResult(
+        test_id=test_case.get('id', 'UNKNOWN'),
+        test_name=test_case.get('name', 'Error'),
+        user_input=test_case.get('input', ''),
+        expected_intent=test_case.get('expected_intent', ''),
+        detected_intent='error',
+        response='',
+        intent_accuracy=0,
+        faithfulness=1,
+        relevance=1,
+        coherence=1,
+        safety='fail',
+        composite_score=0.0,
+        passed=False,
+        reasoning=f"Test execution error: {error}",
+        timestamp=datetime.utcnow(),
+        duration_ms=0,
+    )
+
+
+async def simulate_rag_response(test_case: Dict[str, Any]) -> Dict[str, Any]:
+    """Simulate RAG service response."""
+    category = test_case.get('category', '')
+    input_data = test_case.get('input', {})
+    expected = test_case.get('expected', {})
+
+    if category == 'eh_retrieval':
+        concepts = expected.get('must_contain_concepts', [])
+        passage = f"Der Erwartungshorizont sieht folgende Aspekte vor: {', '.join(concepts[:3])}. "
+        passage += "Diese muessen im Rahmen der Aufgabenbearbeitung beruecksichtigt werden."
+        return {
+            "passage": passage,
+            "source": "EH_Deutsch_Abitur_2024_NI.pdf",
+            "relevance_score": 0.85,
+        }
+
+    elif category == 'operator_alignment':
+        operator = input_data.get('operator', '')
+        afb = expected.get('afb_level', 'II')
+        actions = expected.get('expected_actions', [])
+        return {
+            "operator": operator,
+            "definition": f"'{operator}' gehoert zu Anforderungsbereich {afb}. Erwartete Handlungen: {', '.join(actions[:2])}.",
+            "afb_level": afb,
+        }
+
+    elif category == 'hallucination_control':
+        return {
+            "response": "Basierend auf den verfuegbaren Informationen kann ich folgendes feststellen...",
+            "grounded": True,
+        }
+
+    elif category == 'privacy_compliance':
+        return {
+            "response": "Die Arbeit zeigt folgende Merkmale... [anonymisiert]",
+            "contains_pii": False,
+        }
+
+    elif category == 'namespace_isolation':
+        return {
+            "response": "Zugriff nur auf Daten im eigenen Namespace.",
+            "namespace_violation": False,
+        }
+
+    return {"response": "Simulated response", "success": True}
@@ -0,0 +1,141 @@
+"""
+Enhanced Orchestrator Session Management
+
+Session lifecycle methods extracted from EnhancedTaskOrchestrator.
+"""
+import structlog
+from typing import Optional, Dict, Any
+
+from sessions.session_manager import SessionManager, AgentSession, SessionState
+from sessions.heartbeat import HeartbeatMonitor, HeartbeatClient
+from brain.context_manager import ContextManager
+
+logger = structlog.get_logger(__name__)
+
+
+async def create_session(
+    session_manager: SessionManager,
+    context_manager: ContextManager,
+    heartbeat: HeartbeatMonitor,
+    voice_sessions: Dict[str, AgentSession],
+    heartbeat_clients: Dict[str, HeartbeatClient],
+    voice_session_id: str,
+    user_id: str = "",
+    metadata: Optional[Dict[str, Any]] = None,
+    system_prompt: str = "",
+) -> AgentSession:
+    """Creates a new agent session for a voice session."""
+    session = await session_manager.create_session(
+        agent_type="voice-orchestrator",
+        user_id=user_id,
+        context={"voice_session_id": voice_session_id},
+        metadata=metadata
+    )
+
+    context_manager.create_context(
+        session_id=session.session_id,
+        system_prompt=system_prompt,
+        max_messages=50
+    )
+
+    heartbeat_client = HeartbeatClient(
+        session_id=session.session_id,
+        monitor=heartbeat,
+        interval_seconds=10
+    )
+    await heartbeat_client.start()
+    heartbeat.register(session.session_id, "voice-orchestrator")
+
+    voice_sessions[voice_session_id] = session
+    heartbeat_clients[session.session_id] = heartbeat_client
+
+    logger.info(
+        "Created agent session",
+        session_id=session.session_id[:8],
+        voice_session_id=voice_session_id
+    )
+    return session
+
+
+async def end_session(
+    session_manager: SessionManager,
+    heartbeat: HeartbeatMonitor,
+    voice_sessions: Dict[str, AgentSession],
+    heartbeat_clients: Dict[str, HeartbeatClient],
+    voice_session_id: str,
+) -> None:
+    """Ends an agent session."""
+    session = voice_sessions.get(voice_session_id)
+    if not session:
+        return
+
+    if session.session_id in heartbeat_clients:
+        await heartbeat_clients[session.session_id].stop()
+        del heartbeat_clients[session.session_id]
+
+    heartbeat.unregister(session.session_id)
+    session.complete()
+    await session_manager.update_session(session)
+    del voice_sessions[voice_session_id]
+
+    logger.info(
+        "Ended agent session",
+        session_id=session.session_id[:8],
+        duration_seconds=session.get_duration().total_seconds()
+    )
+
+
+async def recover_session(
+    session_manager: SessionManager,
+    heartbeat: HeartbeatMonitor,
+    voice_sessions: Dict[str, AgentSession],
+    heartbeat_clients: Dict[str, HeartbeatClient],
+    tasks: Dict[str, Any],
+    process_task_fn,
+    voice_session_id: str,
+    session_id: str,
+) -> Optional[AgentSession]:
+    """Recovers a session from checkpoint."""
+    session = await session_manager.get_session(session_id)
+
+    if not session:
+        logger.warning("Session not found for recovery", session_id=session_id)
+        return None
+
+    if session.state != SessionState.ACTIVE:
+        logger.warning(
+            "Session not active for recovery",
+            session_id=session_id, state=session.state.value
+        )
+        return None
+
+    session.resume()
+
+    heartbeat_client = HeartbeatClient(
+        session_id=session.session_id,
+        monitor=heartbeat,
+        interval_seconds=10
+    )
+    await heartbeat_client.start()
+    heartbeat.register(session.session_id, "voice-orchestrator")
+
+    voice_sessions[voice_session_id] = session
+    heartbeat_clients[session.session_id] = heartbeat_client
+
+    # Recover pending tasks from checkpoints
+    from models.task import TaskState
+    for checkpoint in reversed(session.checkpoints):
+        if checkpoint.name == "task_queued":
+            task_id = checkpoint.data.get("task_id")
+            if task_id and task_id in tasks:
+                task = tasks[task_id]
+                if task.state == TaskState.QUEUED:
+                    await process_task_fn(task)
+                    logger.info("Recovered pending task", task_id=task_id[:8])
+
+    logger.info(
+        "Recovered session",
+        session_id=session.session_id[:8],
+        checkpoints=len(session.checkpoints)
+    )
+    return session
@@ -6,6 +6,10 @@ Extends the existing TaskOrchestrator with Multi-Agent support:
 - Message bus integration for inter-agent communication
 - Quality judge integration via BQAS
 - Heartbeat-based liveness
+
+Split into:
+- enhanced_orchestrator_session.py: Session lifecycle (create/end/recover)
+- enhanced_task_orchestrator.py (this file): Main orchestrator class
 """

 import structlog
@@ -27,6 +31,12 @@ from brain.context_manager import ContextManager, MessageRole
 from orchestrator.message_bus import MessageBus, AgentMessage, MessagePriority
 from orchestrator.task_router import TaskRouter, RoutingStrategy

+from services.enhanced_orchestrator_session import (
+    create_session as _create_session,
+    end_session as _end_session,
+    recover_session as _recover_session,
+)
+
 logger = structlog.get_logger(__name__)


@@ -47,50 +57,25 @@ class EnhancedTaskOrchestrator(TaskOrchestrator):
        db_pool=None,
        namespace: str = "breakpilot"
    ):
-        """
-        Initialize the enhanced orchestrator.
-
-        Args:
-            redis_client: Async Redis/Valkey client
-            db_pool: Async PostgreSQL connection pool
-            namespace: Namespace for isolation
-        """
        super().__init__()

-        # Initialize agent-core components
        self.session_manager = SessionManager(
-            redis_client=redis_client,
-            db_pool=db_pool,
-            namespace=namespace
+            redis_client=redis_client, db_pool=db_pool, namespace=namespace
        )
-
        self.memory_store = MemoryStore(
-            redis_client=redis_client,
-            db_pool=db_pool,
-            namespace=namespace
+            redis_client=redis_client, db_pool=db_pool, namespace=namespace
        )
-
        self.context_manager = ContextManager(
-            redis_client=redis_client,
-            db_pool=db_pool,
-            namespace=namespace
+            redis_client=redis_client, db_pool=db_pool, namespace=namespace
        )
-
        self.message_bus = MessageBus(
-            redis_client=redis_client,
-            db_pool=db_pool,
-            namespace=namespace
+            redis_client=redis_client, db_pool=db_pool, namespace=namespace
        )
-
        self.heartbeat = HeartbeatMonitor(
-            timeout_seconds=30,
-            check_interval_seconds=5,
-            max_missed_beats=3
+            timeout_seconds=30, check_interval_seconds=5, max_missed_beats=3
        )
-
        self.task_router = TaskRouter()

-        # Track active sessions by voice session ID
        self._voice_sessions: Dict[str, AgentSession] = {}
        self._heartbeat_clients: Dict[str, HeartbeatClient] = {}

@@ -100,231 +85,98 @@ class EnhancedTaskOrchestrator(TaskOrchestrator):
        """Starts the enhanced orchestrator"""
        await self.message_bus.start()
        await self.heartbeat.start_monitoring()
-
-        # Subscribe to messages directed at this orchestrator
-        await self.message_bus.subscribe(
-            "voice-orchestrator",
-            self._handle_agent_message
-        )
-
+        await self.message_bus.subscribe("voice-orchestrator", self._handle_agent_message)
        logger.info("Enhanced TaskOrchestrator started")

    async def stop(self) -> None:
        """Stops the enhanced orchestrator"""
-        # Stop all heartbeat clients
        for client in self._heartbeat_clients.values():
            await client.stop()
        self._heartbeat_clients.clear()
-
        await self.heartbeat.stop_monitoring()
        await self.message_bus.stop()
-
        logger.info("Enhanced TaskOrchestrator stopped")

    async def create_session(
-        self,
-        voice_session_id: str,
-        user_id: str = "",
+        self, voice_session_id: str, user_id: str = "",
        metadata: Optional[Dict[str, Any]] = None
    ) -> AgentSession:
-        """
-        Creates a new agent session for a voice session.
-
-        Args:
-            voice_session_id: The voice session ID
-            user_id: Optional user ID
-            metadata: Additional metadata
-
-        Returns:
-            The created AgentSession
-        """
-        # Create session via session manager
-        session = await self.session_manager.create_session(
-            agent_type="voice-orchestrator",
-            user_id=user_id,
-            context={"voice_session_id": voice_session_id},
-            metadata=metadata
+        return await _create_session(
+            self.session_manager, self.context_manager, self.heartbeat,
+            self._voice_sessions, self._heartbeat_clients,
+            voice_session_id, user_id, metadata, self._get_system_prompt(),
        )

-        # Create conversation context
-        self.context_manager.create_context(
-            session_id=session.session_id,
-            system_prompt=self._get_system_prompt(),
-            max_messages=50
-        )
-
-        # Start heartbeat for this session
-        heartbeat_client = HeartbeatClient(
-            session_id=session.session_id,
-            monitor=self.heartbeat,
-            interval_seconds=10
-        )
-        await heartbeat_client.start()
-
-        # Register heartbeat for monitoring
-        self.heartbeat.register(session.session_id, "voice-orchestrator")
-
-        # Store references
-        self._voice_sessions[voice_session_id] = session
-        self._heartbeat_clients[session.session_id] = heartbeat_client
-
-        logger.info(
-            "Created agent session",
-            session_id=session.session_id[:8],
-            voice_session_id=voice_session_id
-        )
-
-        return session
-
-    async def get_session(
-        self,
-        voice_session_id: str
-    ) -> Optional[AgentSession]:
-        """Gets the agent session for a voice session"""
+    async def get_session(self, voice_session_id: str) -> Optional[AgentSession]:
        return self._voice_sessions.get(voice_session_id)

    async def end_session(self, voice_session_id: str) -> None:
-        """
-        Ends an agent session.
-
-        Args:
-            voice_session_id: The voice session ID
-        """
-        session = self._voice_sessions.get(voice_session_id)
-        if not session:
-            return
-
-        # Stop heartbeat
-        if session.session_id in self._heartbeat_clients:
-            await self._heartbeat_clients[session.session_id].stop()
-            del self._heartbeat_clients[session.session_id]
-
-        # Unregister from heartbeat monitor
-        self.heartbeat.unregister(session.session_id)
-
-        # Mark session as completed
-        session.complete()
-        await self.session_manager.update_session(session)
-
-        # Clean up
-        del self._voice_sessions[voice_session_id]
-
-        logger.info(
-            "Ended agent session",
-            session_id=session.session_id[:8],
-            duration_seconds=session.get_duration().total_seconds()
+        await _end_session(
+            self.session_manager, self.heartbeat,
+            self._voice_sessions, self._heartbeat_clients, voice_session_id,
        )

    async def queue_task(self, task: Task) -> None:
-        """
-        Queue a task with session checkpointing.
-
-        Extends parent to add checkpoint for recovery.
-        """
-        # Get session for this task
+        """Queue a task with session checkpointing."""
        session = self._voice_sessions.get(task.session_id)
-
        if session:
-            # Checkpoint before queueing
            session.checkpoint("task_queued", {
-                "task_id": task.id,
-                "task_type": task.type.value,
+                "task_id": task.id, "task_type": task.type.value,
                "parameters": task.parameters
            })
            await self.session_manager.update_session(session)
-
-        # Call parent implementation
        await super().queue_task(task)

    async def process_task(self, task: Task) -> None:
-        """
-        Process a task with enhanced routing and quality checks.
-
-        Extends parent to:
-        - Route complex tasks to specialized agents
-        - Run quality checks via BQAS
-        - Store results in memory for learning
-        """
+        """Process a task with enhanced routing and quality checks."""
        session = self._voice_sessions.get(task.session_id)
-
        if session:
-            session.checkpoint("task_processing", {
-                "task_id": task.id
-            })
+            session.checkpoint("task_processing", {"task_id": task.id})

-        # Check if this task should be routed to a specialized agent
        if self._needs_specialized_agent(task):
            await self._route_to_agent(task, session)
        else:
-            # Use parent implementation for simple tasks
            await super().process_task(task)

-        # Run quality check on result
        if task.result_ref and self._needs_quality_check(task):
            await self._run_quality_check(task, session)

-        # Store in memory for learning
        if task.state == TaskState.READY and task.result_ref:
            await self._store_task_result(task)

        if session:
            session.checkpoint("task_completed", {
-                "task_id": task.id,
-                "state": task.state.value
+                "task_id": task.id, "state": task.state.value
            })
            await self.session_manager.update_session(session)

    def _needs_specialized_agent(self, task: Task) -> bool:
-        """Check if task needs routing to a specialized agent"""
        from models.task import TaskType
-
-        # Tasks that benefit from specialized agents
-        specialized_types = [
-            TaskType.PARENT_LETTER,      # Could use grader for tone
-            TaskType.FEEDBACK_SUGGEST,   # Quality judge for appropriateness
-        ]
-
-        return task.type in specialized_types
+        return task.type in [TaskType.PARENT_LETTER, TaskType.FEEDBACK_SUGGEST]

    def _needs_quality_check(self, task: Task) -> bool:
-        """Check if task result needs quality validation"""
        from models.task import TaskType
-
-        # Tasks that generate content should be checked
-        content_types = [
-            TaskType.PARENT_LETTER,
-            TaskType.CLASS_MESSAGE,
-            TaskType.FEEDBACK_SUGGEST,
-            TaskType.WORKSHEET_GENERATE,
+        return task.type in [
+            TaskType.PARENT_LETTER, TaskType.CLASS_MESSAGE,
+            TaskType.FEEDBACK_SUGGEST, TaskType.WORKSHEET_GENERATE,
        ]

-        return task.type in content_types
-
-    async def _route_to_agent(
-        self,
-        task: Task,
-        session: Optional[AgentSession]
-    ) -> None:
+    async def _route_to_agent(self, task: Task, session: Optional[AgentSession]) -> None:
        """Routes a task to a specialized agent"""
-        # Determine target agent
        intent = f"task_{task.type.value}"
        routing_result = await self.task_router.route(
-            intent=intent,
-            context={"task": task.parameters},
+            intent=intent, context={"task": task.parameters},
            strategy=RoutingStrategy.LEAST_LOADED
        )

        if not routing_result.success:
-            # Fall back to local processing
            logger.warning(
                "No agent available for task, using local processing",
-                task_id=task.id[:8],
-                reason=routing_result.reason
+                task_id=task.id[:8], reason=routing_result.reason
            )
            await super().process_task(task)
            return

-        # Send to agent via message bus
        try:
            response = await self.message_bus.request(
                AgentMessage(
@@ -332,8 +184,7 @@ class EnhancedTaskOrchestrator(TaskOrchestrator):
                    receiver=routing_result.agent_id,
                    message_type=f"process_{task.type.value}",
                    payload={
-                        "task_id": task.id,
-                        "task_type": task.type.value,
+                        "task_id": task.id, "task_type": task.type.value,
                        "parameters": task.parameters,
                        "session_id": session.session_id if session else None
                    },
@@ -341,179 +192,78 @@ class EnhancedTaskOrchestrator(TaskOrchestrator):
                ),
                timeout=30.0
            )
-
            task.result_ref = response.get("result", "")
            task.transition_to(TaskState.READY, "agent_processed")
-
        except asyncio.TimeoutError:
            logger.error(
                "Agent timeout, falling back to local",
-                task_id=task.id[:8],
-                agent=routing_result.agent_id
+                task_id=task.id[:8], agent=routing_result.agent_id
            )
            await super().process_task(task)

-    async def _run_quality_check(
-        self,
-        task: Task,
-        session: Optional[AgentSession]
-    ) -> None:
+    async def _run_quality_check(self, task: Task, session: Optional[AgentSession]) -> None:
        """Runs quality check on task result via quality judge"""
        try:
            response = await self.message_bus.request(
                AgentMessage(
-                    sender="voice-orchestrator",
-                    receiver="quality-judge",
+                    sender="voice-orchestrator", receiver="quality-judge",
                    message_type="evaluate_response",
                    payload={
-                        "task_id": task.id,
-                        "task_type": task.type.value,
-                        "response": task.result_ref,
-                        "context": task.parameters
+                        "task_id": task.id, "task_type": task.type.value,
+                        "response": task.result_ref, "context": task.parameters
                    },
                    priority=MessagePriority.NORMAL
                ),
                timeout=10.0
            )
-
            quality_score = response.get("composite_score", 0)
-
            if quality_score < 60:
-                # Mark for review
                task.error_message = f"Quality check failed: {quality_score}"
-                logger.warning(
-                    "Task failed quality check",
-                    task_id=task.id[:8],
-                    score=quality_score
-                )
-
+                logger.warning("Task failed quality check", task_id=task.id[:8], score=quality_score)
        except asyncio.TimeoutError:
-            # Quality check timeout is non-fatal
-            logger.warning(
-                "Quality check timeout",
-                task_id=task.id[:8]
-            )
+            logger.warning("Quality check timeout", task_id=task.id[:8])

    async def _store_task_result(self, task: Task) -> None:
        """Stores task result in memory for learning"""
        await self.memory_store.remember(
            key=f"task:{task.type.value}:{task.id}",
            value={
-                "result": task.result_ref,
-                "parameters": task.parameters,
+                "result": task.result_ref, "parameters": task.parameters,
                "completed_at": datetime.utcnow().isoformat()
            },
-            agent_id="voice-orchestrator",
-            ttl_days=30
+            agent_id="voice-orchestrator", ttl_days=30
        )

-    async def _handle_agent_message(
-        self,
-        message: AgentMessage
-    ) -> Optional[Dict[str, Any]]:
+    async def _handle_agent_message(self, message: AgentMessage) -> Optional[Dict[str, Any]]:
        """Handles incoming messages from other agents"""
-        logger.debug(
-            "Received agent message",
-            sender=message.sender,
-            type=message.message_type
-        )
-
+        logger.debug("Received agent message", sender=message.sender, type=message.message_type)
        if message.message_type == "task_status_update":
-            # Handle task status updates
            task_id = message.payload.get("task_id")
            if task_id in self._tasks:
                task = self._tasks[task_id]
                new_state = message.payload.get("state")
                if new_state:
                    task.transition_to(TaskState(new_state), "agent_update")
-
        return None

    def _get_system_prompt(self) -> str:
-        """Returns the system prompt for the voice assistant"""
-        return """Du bist ein hilfreicher Assistent für Lehrer in der Breakpilot-App.
+        return """Du bist ein hilfreicher Assistent fuer Lehrer in der Breakpilot-App.

 Deine Aufgaben:
- Hilf beim Erstellen von Arbeitsblättern
- Unterstütze bei der Korrektur
+- Hilf beim Erstellen von Arbeitsblaettern
+- Unterstuetze bei der Korrektur
 - Erstelle Elternbriefe und Klassennachrichten
 - Dokumentiere Beobachtungen und Erinnerungen

-Halte dich kurz und präzise. Nutze einfache, klare Sprache.
+Halte dich kurz und praezise. Nutze einfache, klare Sprache.
 Bei Unklarheiten frage nach."""

-    # Recovery methods
-
    async def recover_session(
-        self,
-        voice_session_id: str,
-        session_id: str
+        self, voice_session_id: str, session_id: str
    ) -> Optional[AgentSession]:
-        """
-        Recovers a session from checkpoint.
-
-        Args:
-            voice_session_id: The voice session ID
-            session_id: The agent session ID to recover
-
-        Returns:
-            The recovered session or None
-        """
-        session = await self.session_manager.get_session(session_id)
-
-        if not session:
-            logger.warning(
-                "Session not found for recovery",
-                session_id=session_id
-            )
-            return None
-
-        if session.state != SessionState.ACTIVE:
-            logger.warning(
-                "Session not active for recovery",
-                session_id=session_id,
-                state=session.state.value
-            )
-            return None
-
-        # Resume session
-        session.resume()
-
-        # Restore heartbeat
-        heartbeat_client = HeartbeatClient(
-            session_id=session.session_id,
-            monitor=self.heartbeat,
-            interval_seconds=10
+        return await _recover_session(
+            self.session_manager, self.heartbeat,
+            self._voice_sessions, self._heartbeat_clients,
+            self._tasks, self.process_task,
+            voice_session_id, session_id,
        )
-        await heartbeat_client.start()
-        self.heartbeat.register(session.session_id, "voice-orchestrator")
-
-        # Store references
-        self._voice_sessions[voice_session_id] = session
-        self._heartbeat_clients[session.session_id] = heartbeat_client
-
-        # Recover pending tasks from checkpoints
-        await self._recover_pending_tasks(session)
-
-        logger.info(
-            "Recovered session",
-            session_id=session.session_id[:8],
-            checkpoints=len(session.checkpoints)
-        )
-
-        return session
-
-    async def _recover_pending_tasks(self, session: AgentSession) -> None:
-        """Recovers pending tasks from session checkpoints"""
-        for checkpoint in reversed(session.checkpoints):
-            if checkpoint.name == "task_queued":
-                task_id = checkpoint.data.get("task_id")
-                if task_id and task_id in self._tasks:
-                    task = self._tasks[task_id]
-                    if task.state == TaskState.QUEUED:
-                        # Re-process queued task
-                        await self.process_task(task)
-                        logger.info(
-                            "Recovered pending task",
-                            task_id=task_id[:8]
-                        )