refactor: Jitsi/Matrix/Voice von Core übernommen, Camunda/BPMN gelöscht, Kommunikation-Nav

- Voice-Service von Core nach Lehrer verschoben (bp-lehrer-voice-service) - 4 Jitsi-Services + 2 Synapse-Services in docker-compose.yml aufgenommen - Camunda komplett gelöscht: workflow pages, workflow-config.ts, bpmn-js deps - CAMUNDA_URL aus backend-lehrer environment entfernt - Sidebar: Kategorie "Compliance SDK" + "Katalogverwaltung" entfernt - Sidebar: Neue Kategorie "Kommunikation" mit Video & Chat, Voice Service, Alerts Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-05 17:01:47 +01:00
parent 2ec4d8aabd
commit 9912997187
68 changed files with 12992 additions and 1432 deletions
@@ -0,0 +1,49 @@
+"""
+BQAS - Breakpilot Quality Assurance System
+
+LLM-based quality assurance framework for voice service with:
+- LLM Judge (Qwen2.5-32B based evaluation)
+- RAG Judge (Specialized RAG/Correction evaluation)
+- Synthetic Test Generation
+- Golden Test Suite
+- Regression Tracking
+- Automated Backlog Generation
+- Local Scheduler (Alternative zu GitHub Actions)
+"""
+
+from bqas.judge import LLMJudge, JudgeResult
+from bqas.rag_judge import (
+    RAGJudge,
+    RAGRetrievalResult,
+    RAGOperatorResult,
+    RAGHallucinationResult,
+    RAGPrivacyResult,
+    RAGNamespaceResult,
+)
+from bqas.metrics import BQASMetrics, TestResult
+from bqas.config import BQASConfig
+from bqas.runner import BQASRunner, get_runner, TestRun
+
+# Notifier wird separat importiert (keine externen Abhaengigkeiten)
+# Nutzung: from bqas.notifier import BQASNotifier, Notification, NotificationConfig
+
+__all__ = [
+    # Intent Judge
+    "LLMJudge",
+    "JudgeResult",
+    # RAG Judge
+    "RAGJudge",
+    "RAGRetrievalResult",
+    "RAGOperatorResult",
+    "RAGHallucinationResult",
+    "RAGPrivacyResult",
+    "RAGNamespaceResult",
+    # Metrics & Config
+    "BQASMetrics",
+    "TestResult",
+    "BQASConfig",
+    # Runner
+    "BQASRunner",
+    "get_runner",
+    "TestRun",
+]
@@ -0,0 +1,324 @@
+"""
+Backlog Generator
+Automatically creates GitHub issues for test failures and regressions
+"""
+import subprocess
+import json
+import structlog
+from typing import Optional, List
+from datetime import datetime
+
+from bqas.config import BQASConfig
+from bqas.regression_tracker import TestRun
+from bqas.metrics import TestResult, BQASMetrics
+
+logger = structlog.get_logger(__name__)
+
+
+ISSUE_TEMPLATE = """## BQAS Test Failure Report
+
+**Test Run:** {timestamp}
+**Git Commit:** {commit}
+**Git Branch:** {branch}
+
+### Summary
+
+- **Total Tests:** {total_tests}
+- **Passed:** {passed_tests}
+- **Failed:** {failed_tests}
+- **Pass Rate:** {pass_rate:.1f}%
+- **Average Score:** {avg_score:.3f}/5
+
+### Failed Tests
+
+{failed_tests_table}
+
+### Regression Alert
+
+{regression_info}
+
+### Suggested Actions
+
+{suggestions}
+
+### By Intent
+
+{intent_breakdown}
+
+---
+_Automatisch generiert von BQAS (Breakpilot Quality Assurance System)_
+"""
+
+FAILED_TEST_ROW = """| {test_id} | {test_name} | {expected} | {detected} | {score} | {reasoning} |"""
+
+
+class BacklogGenerator:
+    """
+    Generates GitHub issues for test failures.
+
+    Uses gh CLI for GitHub integration.
+    """
+
+    def __init__(self, config: Optional[BQASConfig] = None):
+        self.config = config or BQASConfig.from_env()
+
+    def _check_gh_available(self) -> bool:
+        """Check if gh CLI is available and authenticated."""
+        try:
+            result = subprocess.run(
+                ["gh", "auth", "status"],
+                capture_output=True,
+                text=True,
+            )
+            return result.returncode == 0
+        except FileNotFoundError:
+            return False
+
+    def _format_failed_tests(self, results: List[TestResult]) -> str:
+        """Format failed tests as markdown table."""
+        if not results:
+            return "_Keine fehlgeschlagenen Tests_"
+
+        lines = [
+            "| Test ID | Name | Expected | Detected | Score | Reason |",
+            "|---------|------|----------|----------|-------|--------|",
+        ]
+
+        for r in results[:20]:  # Limit to 20
+            lines.append(FAILED_TEST_ROW.format(
+                test_id=r.test_id,
+                test_name=r.test_name[:30],
+                expected=r.expected_intent,
+                detected=r.detected_intent,
+                score=f"{r.composite_score:.2f}",
+                reasoning=r.reasoning[:50] + "..." if len(r.reasoning) > 50 else r.reasoning,
+            ))
+
+        if len(results) > 20:
+            lines.append(f"| ... | _und {len(results) - 20} weitere_ | | | | |")
+
+        return "\n".join(lines)
+
+    def _generate_suggestions(self, results: List[TestResult]) -> str:
+        """Generate improvement suggestions based on failures."""
+        suggestions = []
+
+        # Analyze failure patterns
+        intent_failures = {}
+        for r in results:
+            if r.expected_intent not in intent_failures:
+                intent_failures[r.expected_intent] = 0
+            intent_failures[r.expected_intent] += 1
+
+        # Most problematic intents
+        sorted_intents = sorted(intent_failures.items(), key=lambda x: x[1], reverse=True)
+
+        if sorted_intents:
+            worst = sorted_intents[0]
+            suggestions.append(f"- [ ] **Intent '{worst[0]}'** hat {worst[1]} Fehler - Muster ueberpruefen")
+
+        # Low accuracy
+        low_accuracy = [r for r in results if r.intent_accuracy < 50]
+        if low_accuracy:
+            suggestions.append(f"- [ ] {len(low_accuracy)} Tests mit niedriger Intent-Genauigkeit (<50%) - Patterns erweitern")
+
+        # Safety failures
+        safety_fails = [r for r in results if r.safety == "fail"]
+        if safety_fails:
+            suggestions.append(f"- [ ] **{len(safety_fails)} Safety-Failures** - PII-Filter pruefen")
+
+        # Low coherence
+        low_coherence = [r for r in results if r.coherence < 3]
+        if low_coherence:
+            suggestions.append(f"- [ ] {len(low_coherence)} Tests mit niedriger Kohaerenz - Response-Generierung pruefen")
+
+        if not suggestions:
+            suggestions.append("- [ ] Detaillierte Analyse der Fehler durchfuehren")
+
+        return "\n".join(suggestions)
+
+    def _format_intent_breakdown(self, metrics: BQASMetrics) -> str:
+        """Format scores by intent."""
+        if not metrics.scores_by_intent:
+            return "_Keine Intent-Aufschluesselung verfuegbar_"
+
+        lines = ["| Intent | Score |", "|--------|-------|"]
+
+        for intent, score in sorted(metrics.scores_by_intent.items(), key=lambda x: x[1]):
+            emoji = "🔴" if score < 3.0 else "🟡" if score < 4.0 else "🟢"
+            lines.append(f"| {emoji} {intent} | {score:.3f} |")
+
+        return "\n".join(lines)
+
+    async def create_issue(
+        self,
+        run: TestRun,
+        metrics: BQASMetrics,
+        failed_results: List[TestResult],
+        regression_delta: float = 0.0,
+    ) -> Optional[str]:
+        """
+        Create a GitHub issue for test failures.
+
+        Args:
+            run: Test run record
+            metrics: Aggregated metrics
+            failed_results: List of failed test results
+            regression_delta: Score regression amount
+
+        Returns:
+            Issue URL if created, None otherwise
+        """
+        if not self.config.github_repo:
+            logger.warning("GitHub repo not configured, skipping issue creation")
+            return None
+
+        if not self._check_gh_available():
+            logger.warning("gh CLI not available or not authenticated")
+            return None
+
+        # Format regression info
+        if regression_delta > 0:
+            regression_info = f"**Regression erkannt!** Score um **{regression_delta:.3f}** gefallen."
+        else:
+            regression_info = "Keine signifikante Regression."
+
+        # Build issue body
+        body = ISSUE_TEMPLATE.format(
+            timestamp=run.timestamp.isoformat(),
+            commit=run.git_commit,
+            branch=run.git_branch,
+            total_tests=metrics.total_tests,
+            passed_tests=metrics.passed_tests,
+            failed_tests=metrics.failed_tests,
+            pass_rate=(metrics.passed_tests / metrics.total_tests * 100) if metrics.total_tests > 0 else 0,
+            avg_score=metrics.avg_composite_score,
+            failed_tests_table=self._format_failed_tests(failed_results),
+            regression_info=regression_info,
+            suggestions=self._generate_suggestions(failed_results),
+            intent_breakdown=self._format_intent_breakdown(metrics),
+        )
+
+        # Create title
+        title = f"BQAS: {metrics.failed_tests} Test-Failures ({run.git_commit})"
+
+        try:
+            # Use gh CLI to create issue
+            result = subprocess.run(
+                [
+                    "gh", "issue", "create",
+                    "--repo", self.config.github_repo,
+                    "--title", title,
+                    "--body", body,
+                    "--label", "bqas,automated,quality",
+                ],
+                capture_output=True,
+                text=True,
+            )
+
+            if result.returncode == 0:
+                issue_url = result.stdout.strip()
+                logger.info("GitHub issue created", url=issue_url)
+                return issue_url
+            else:
+                logger.error("Failed to create issue", error=result.stderr)
+                return None
+
+        except Exception as e:
+            logger.error("Issue creation failed", error=str(e))
+            return None
+
+    async def create_regression_alert(
+        self,
+        current_score: float,
+        previous_avg: float,
+        delta: float,
+        run: TestRun,
+    ) -> Optional[str]:
+        """
+        Create a specific regression alert issue.
+
+        Args:
+            current_score: Current test score
+            previous_avg: Average of previous runs
+            delta: Score difference
+            run: Current test run
+
+        Returns:
+            Issue URL if created
+        """
+        if not self.config.github_repo:
+            return None
+
+        body = f"""## Regression Alert
+
+**Current Score:** {current_score:.3f}
+**Previous Average:** {previous_avg:.3f}
+**Delta:** -{delta:.3f}
+
+### Context
+
+- **Commit:** {run.git_commit}
+- **Branch:** {run.git_branch}
+- **Timestamp:** {run.timestamp.isoformat()}
+
+### Action Required
+
+Die Testqualitaet ist signifikant gefallen. Bitte pruefen:
+
+1. Letzte Commits auf moegliche Regressionen
+2. Intent-Router Patterns
+3. LLM Responses
+4. Edge Cases
+
+---
+_Automatisch generiert von BQAS_
+"""
+
+        title = f"🔴 BQAS Regression: Score -{delta:.3f}"
+
+        try:
+            result = subprocess.run(
+                [
+                    "gh", "issue", "create",
+                    "--repo", self.config.github_repo,
+                    "--title", title,
+                    "--body", body,
+                    "--label", "bqas,regression,urgent",
+                ],
+                capture_output=True,
+                text=True,
+            )
+
+            if result.returncode == 0:
+                return result.stdout.strip()
+
+        except Exception as e:
+            logger.error("Regression alert creation failed", error=str(e))
+
+        return None
+
+    def list_bqas_issues(self) -> List[dict]:
+        """List existing BQAS issues."""
+        if not self.config.github_repo:
+            return []
+
+        try:
+            result = subprocess.run(
+                [
+                    "gh", "issue", "list",
+                    "--repo", self.config.github_repo,
+                    "--label", "bqas",
+                    "--json", "number,title,state,createdAt",
+                ],
+                capture_output=True,
+                text=True,
+            )
+
+            if result.returncode == 0:
+                return json.loads(result.stdout)
+
+        except Exception as e:
+            logger.error("Failed to list issues", error=str(e))
+
+        return []
@@ -0,0 +1,77 @@
+"""
+BQAS Configuration
+"""
+import os
+from dataclasses import dataclass, field
+from typing import Optional
+
+
+@dataclass
+class BQASConfig:
+    """Configuration for BQAS framework."""
+
+    # Ollama settings
+    ollama_base_url: str = field(
+        default_factory=lambda: os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")
+    )
+    judge_model: str = field(
+        default_factory=lambda: os.getenv("BQAS_JUDGE_MODEL", "qwen2.5:32b")
+    )
+    judge_timeout: float = 120.0
+
+    # Voice service settings
+    voice_service_url: str = field(
+        default_factory=lambda: os.getenv("VOICE_SERVICE_URL", "http://localhost:8091")
+    )
+
+    # Klausur service settings (for RAG tests)
+    klausur_service_url: str = field(
+        default_factory=lambda: os.getenv("KLAUSUR_SERVICE_URL", "http://localhost:8086")
+    )
+
+    # Database settings
+    db_path: str = field(
+        default_factory=lambda: os.getenv("BQAS_DB_PATH", "bqas_history.db")
+    )
+
+    # Thresholds
+    regression_threshold: float = 0.1  # Score drop threshold
+    min_golden_score: float = 3.5  # Minimum acceptable score
+    min_synthetic_score: float = 3.0
+    min_rag_score: float = 3.5  # Minimum acceptable RAG score
+
+    # Weights for composite score (Intent tests)
+    intent_accuracy_weight: float = 0.4
+    faithfulness_weight: float = 0.2
+    relevance_weight: float = 0.2
+    coherence_weight: float = 0.1
+    safety_weight: float = 0.1
+
+    # Weights for RAG composite score
+    rag_retrieval_precision_weight: float = 0.25
+    rag_operator_alignment_weight: float = 0.20
+    rag_faithfulness_weight: float = 0.20
+    rag_citation_accuracy_weight: float = 0.15
+    rag_privacy_compliance_weight: float = 0.10
+    rag_coherence_weight: float = 0.10
+
+    # GitHub integration
+    github_repo: Optional[str] = field(
+        default_factory=lambda: os.getenv("BQAS_GITHUB_REPO")
+    )
+    github_token: Optional[str] = field(
+        default_factory=lambda: os.getenv("GITHUB_TOKEN")
+    )
+
+    # Test generation
+    synthetic_count_per_intent: int = 10
+    include_typos: bool = True
+    include_dialect: bool = True
+
+    # RAG test settings
+    rag_test_suite_path: str = "tests/bqas/golden_tests/golden_rag_correction_v1.yaml"
+
+    @classmethod
+    def from_env(cls) -> "BQASConfig":
+        """Create config from environment variables."""
+        return cls()
@@ -0,0 +1,271 @@
+"""
+LLM Judge - Qwen2.5-32B based evaluation
+"""
+import json
+import time
+import structlog
+import httpx
+from dataclasses import dataclass
+from typing import Literal, Optional
+from datetime import datetime
+
+from bqas.config import BQASConfig
+from bqas.prompts import JUDGE_PROMPT
+from bqas.metrics import TestResult
+
+logger = structlog.get_logger(__name__)
+
+
+@dataclass
+class JudgeResult:
+    """Result from LLM Judge evaluation."""
+    intent_accuracy: int  # 0-100
+    faithfulness: int  # 1-5
+    relevance: int  # 1-5
+    coherence: int  # 1-5
+    safety: Literal["pass", "fail"]
+    reasoning: str
+    composite_score: float  # Weighted average
+
+
+class LLMJudge:
+    """
+    LLM-based evaluation of voice service responses.
+
+    Uses Qwen2.5-32B via Ollama to evaluate:
+    - Intent accuracy
+    - Faithfulness (factual correctness)
+    - Relevance (addresses the question)
+    - Coherence (logical consistency)
+    - Safety (no PII/DSGVO violations)
+    """
+
+    def __init__(self, config: Optional[BQASConfig] = None):
+        self.config = config or BQASConfig.from_env()
+        self._client: Optional[httpx.AsyncClient] = None
+
+    async def _get_client(self) -> httpx.AsyncClient:
+        """Get or create HTTP client."""
+        if self._client is None:
+            self._client = httpx.AsyncClient(timeout=self.config.judge_timeout)
+        return self._client
+
+    async def evaluate(
+        self,
+        user_input: str,
+        detected_intent: str,
+        response: str,
+        expected_intent: str,
+    ) -> JudgeResult:
+        """
+        Evaluate a voice service response.
+
+        Args:
+            user_input: Original user voice command
+            detected_intent: Intent detected by the service
+            response: Generated response text
+            expected_intent: Expected (ground truth) intent
+
+        Returns:
+            JudgeResult with all metrics
+        """
+        prompt = JUDGE_PROMPT.format(
+            user_input=user_input,
+            detected_intent=detected_intent,
+            response=response,
+            expected_intent=expected_intent,
+        )
+
+        client = await self._get_client()
+
+        try:
+            resp = await client.post(
+                f"{self.config.ollama_base_url}/api/generate",
+                json={
+                    "model": self.config.judge_model,
+                    "prompt": prompt,
+                    "stream": False,
+                    "options": {
+                        "temperature": 0.1,
+                        "num_predict": 500,
+                    },
+                },
+            )
+            resp.raise_for_status()
+
+            result_text = resp.json().get("response", "")
+
+            # Parse JSON from response
+            parsed = self._parse_judge_response(result_text)
+
+            # Calculate composite score
+            composite = self._calculate_composite(parsed)
+            parsed["composite_score"] = composite
+
+            return JudgeResult(**parsed)
+
+        except httpx.HTTPError as e:
+            logger.error("Judge request failed", error=str(e))
+            # Return a failed result
+            return JudgeResult(
+                intent_accuracy=0,
+                faithfulness=1,
+                relevance=1,
+                coherence=1,
+                safety="fail",
+                reasoning=f"Evaluation failed: {str(e)}",
+                composite_score=0.0,
+            )
+        except Exception as e:
+            logger.error("Unexpected error during evaluation", error=str(e))
+            return JudgeResult(
+                intent_accuracy=0,
+                faithfulness=1,
+                relevance=1,
+                coherence=1,
+                safety="fail",
+                reasoning=f"Unexpected error: {str(e)}",
+                composite_score=0.0,
+            )
+
+    def _parse_judge_response(self, text: str) -> dict:
+        """Parse JSON from judge response."""
+        try:
+            # Find JSON in response
+            start = text.find("{")
+            end = text.rfind("}") + 1
+            if start >= 0 and end > start:
+                json_str = text[start:end]
+                data = json.loads(json_str)
+
+                # Validate and clamp values
+                return {
+                    "intent_accuracy": max(0, min(100, int(data.get("intent_accuracy", 0)))),
+                    "faithfulness": max(1, min(5, int(data.get("faithfulness", 1)))),
+                    "relevance": max(1, min(5, int(data.get("relevance", 1)))),
+                    "coherence": max(1, min(5, int(data.get("coherence", 1)))),
+                    "safety": "pass" if data.get("safety", "fail") == "pass" else "fail",
+                    "reasoning": str(data.get("reasoning", ""))[:500],
+                }
+        except (json.JSONDecodeError, ValueError, TypeError) as e:
+            logger.warning("Failed to parse judge response", error=str(e), text=text[:200])
+
+        # Default values on parse failure
+        return {
+            "intent_accuracy": 0,
+            "faithfulness": 1,
+            "relevance": 1,
+            "coherence": 1,
+            "safety": "fail",
+            "reasoning": "Parse error",
+        }
+
+    def _calculate_composite(self, result: dict) -> float:
+        """Calculate weighted composite score (0-5 scale)."""
+        c = self.config
+
+        # Normalize intent accuracy to 0-5 scale
+        intent_score = (result["intent_accuracy"] / 100) * 5
+
+        # Safety score: 5 if pass, 0 if fail
+        safety_score = 5.0 if result["safety"] == "pass" else 0.0
+
+        composite = (
+            intent_score * c.intent_accuracy_weight +
+            result["faithfulness"] * c.faithfulness_weight +
+            result["relevance"] * c.relevance_weight +
+            result["coherence"] * c.coherence_weight +
+            safety_score * c.safety_weight
+        )
+
+        return round(composite, 3)
+
+    async def evaluate_test_case(
+        self,
+        test_id: str,
+        test_name: str,
+        user_input: str,
+        expected_intent: str,
+        detected_intent: str,
+        response: str,
+        min_score: float = 3.5,
+    ) -> TestResult:
+        """
+        Evaluate a full test case and return TestResult.
+
+        Args:
+            test_id: Unique test identifier
+            test_name: Human-readable test name
+            user_input: Original voice command
+            expected_intent: Ground truth intent
+            detected_intent: Detected intent from service
+            response: Generated response
+            min_score: Minimum score to pass
+
+        Returns:
+            TestResult with all metrics and pass/fail status
+        """
+        start_time = time.time()
+
+        judge_result = await self.evaluate(
+            user_input=user_input,
+            detected_intent=detected_intent,
+            response=response,
+            expected_intent=expected_intent,
+        )
+
+        duration_ms = int((time.time() - start_time) * 1000)
+        passed = judge_result.composite_score >= min_score
+
+        return TestResult(
+            test_id=test_id,
+            test_name=test_name,
+            user_input=user_input,
+            expected_intent=expected_intent,
+            detected_intent=detected_intent,
+            response=response,
+            intent_accuracy=judge_result.intent_accuracy,
+            faithfulness=judge_result.faithfulness,
+            relevance=judge_result.relevance,
+            coherence=judge_result.coherence,
+            safety=judge_result.safety,
+            composite_score=judge_result.composite_score,
+            passed=passed,
+            reasoning=judge_result.reasoning,
+            timestamp=datetime.utcnow(),
+            duration_ms=duration_ms,
+        )
+
+    async def health_check(self) -> bool:
+        """Check if Ollama and judge model are available."""
+        try:
+            client = await self._get_client()
+            response = await client.get(f"{self.config.ollama_base_url}/api/tags")
+            if response.status_code != 200:
+                return False
+
+            # Check if model is available
+            models = response.json().get("models", [])
+            model_names = [m.get("name", "") for m in models]
+
+            # Check for exact match or partial match
+            for name in model_names:
+                if self.config.judge_model in name:
+                    return True
+
+            logger.warning(
+                "Judge model not found",
+                model=self.config.judge_model,
+                available=model_names[:5],
+            )
+            return False
+
+        except Exception as e:
+            logger.error("Health check failed", error=str(e))
+            return False
+
+    async def close(self):
+        """Close HTTP client."""
+        if self._client:
+            await self._client.aclose()
+            self._client = None
@@ -0,0 +1,208 @@
+"""
+BQAS Metrics - RAGAS-inspired evaluation metrics
+"""
+from dataclasses import dataclass
+from typing import List, Dict, Any
+from datetime import datetime
+
+
+@dataclass
+class TestResult:
+    """Result of a single test case."""
+    test_id: str
+    test_name: str
+    user_input: str
+    expected_intent: str
+    detected_intent: str
+    response: str
+
+    # Scores
+    intent_accuracy: int  # 0-100
+    faithfulness: int  # 1-5
+    relevance: int  # 1-5
+    coherence: int  # 1-5
+    safety: str  # "pass" or "fail"
+
+    # Computed
+    composite_score: float
+    passed: bool
+    reasoning: str
+
+    # Metadata
+    timestamp: datetime
+    duration_ms: int
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert to dictionary for serialization."""
+        return {
+            "test_id": self.test_id,
+            "test_name": self.test_name,
+            "user_input": self.user_input,
+            "expected_intent": self.expected_intent,
+            "detected_intent": self.detected_intent,
+            "response": self.response,
+            "intent_accuracy": self.intent_accuracy,
+            "faithfulness": self.faithfulness,
+            "relevance": self.relevance,
+            "coherence": self.coherence,
+            "safety": self.safety,
+            "composite_score": self.composite_score,
+            "passed": self.passed,
+            "reasoning": self.reasoning,
+            "timestamp": self.timestamp.isoformat(),
+            "duration_ms": self.duration_ms,
+        }
+
+
+@dataclass
+class BQASMetrics:
+    """Aggregated metrics for a test run."""
+    total_tests: int
+    passed_tests: int
+    failed_tests: int
+
+    # Average scores
+    avg_intent_accuracy: float
+    avg_faithfulness: float
+    avg_relevance: float
+    avg_coherence: float
+    safety_pass_rate: float
+
+    # Composite
+    avg_composite_score: float
+
+    # By category
+    scores_by_intent: Dict[str, float]
+
+    # Failures
+    failed_test_ids: List[str]
+
+    # Timing
+    total_duration_ms: int
+    timestamp: datetime
+
+    @classmethod
+    def from_results(cls, results: List[TestResult]) -> "BQASMetrics":
+        """Calculate metrics from test results."""
+        if not results:
+            return cls(
+                total_tests=0,
+                passed_tests=0,
+                failed_tests=0,
+                avg_intent_accuracy=0.0,
+                avg_faithfulness=0.0,
+                avg_relevance=0.0,
+                avg_coherence=0.0,
+                safety_pass_rate=0.0,
+                avg_composite_score=0.0,
+                scores_by_intent={},
+                failed_test_ids=[],
+                total_duration_ms=0,
+                timestamp=datetime.utcnow(),
+            )
+
+        total = len(results)
+        passed = sum(1 for r in results if r.passed)
+
+        # Calculate averages
+        avg_intent = sum(r.intent_accuracy for r in results) / total
+        avg_faith = sum(r.faithfulness for r in results) / total
+        avg_rel = sum(r.relevance for r in results) / total
+        avg_coh = sum(r.coherence for r in results) / total
+        safety_rate = sum(1 for r in results if r.safety == "pass") / total
+        avg_composite = sum(r.composite_score for r in results) / total
+
+        # Group by intent
+        intent_scores: Dict[str, List[float]] = {}
+        for r in results:
+            if r.expected_intent not in intent_scores:
+                intent_scores[r.expected_intent] = []
+            intent_scores[r.expected_intent].append(r.composite_score)
+
+        scores_by_intent = {
+            intent: sum(scores) / len(scores)
+            for intent, scores in intent_scores.items()
+        }
+
+        # Failed tests
+        failed_ids = [r.test_id for r in results if not r.passed]
+
+        # Total duration
+        total_duration = sum(r.duration_ms for r in results)
+
+        return cls(
+            total_tests=total,
+            passed_tests=passed,
+            failed_tests=total - passed,
+            avg_intent_accuracy=avg_intent,
+            avg_faithfulness=avg_faith,
+            avg_relevance=avg_rel,
+            avg_coherence=avg_coh,
+            safety_pass_rate=safety_rate,
+            avg_composite_score=avg_composite,
+            scores_by_intent=scores_by_intent,
+            failed_test_ids=failed_ids,
+            total_duration_ms=total_duration,
+            timestamp=datetime.utcnow(),
+        )
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert to dictionary for serialization."""
+        return {
+            "total_tests": self.total_tests,
+            "passed_tests": self.passed_tests,
+            "failed_tests": self.failed_tests,
+            "pass_rate": self.passed_tests / self.total_tests if self.total_tests > 0 else 0,
+            "avg_intent_accuracy": round(self.avg_intent_accuracy, 2),
+            "avg_faithfulness": round(self.avg_faithfulness, 2),
+            "avg_relevance": round(self.avg_relevance, 2),
+            "avg_coherence": round(self.avg_coherence, 2),
+            "safety_pass_rate": round(self.safety_pass_rate, 3),
+            "avg_composite_score": round(self.avg_composite_score, 3),
+            "scores_by_intent": {k: round(v, 3) for k, v in self.scores_by_intent.items()},
+            "failed_test_ids": self.failed_test_ids,
+            "total_duration_ms": self.total_duration_ms,
+            "timestamp": self.timestamp.isoformat(),
+        }
+
+    def summary(self) -> str:
+        """Generate a human-readable summary."""
+        lines = [
+            "=" * 60,
+            "BQAS Test Run Summary",
+            "=" * 60,
+            f"Total Tests: {self.total_tests}",
+            f"Passed: {self.passed_tests} ({self.passed_tests/self.total_tests*100:.1f}%)" if self.total_tests > 0 else "Passed: 0",
+            f"Failed: {self.failed_tests}",
+            "",
+            "Scores:",
+            f"  Intent Accuracy: {self.avg_intent_accuracy:.1f}%",
+            f"  Faithfulness: {self.avg_faithfulness:.2f}/5",
+            f"  Relevance: {self.avg_relevance:.2f}/5",
+            f"  Coherence: {self.avg_coherence:.2f}/5",
+            f"  Safety Pass Rate: {self.safety_pass_rate*100:.1f}%",
+            f"  Composite Score: {self.avg_composite_score:.3f}/5",
+            "",
+            "By Intent:",
+        ]
+
+        for intent, score in sorted(self.scores_by_intent.items(), key=lambda x: x[1], reverse=True):
+            lines.append(f"  {intent}: {score:.3f}")
+
+        if self.failed_test_ids:
+            lines.extend([
+                "",
+                f"Failed Tests ({len(self.failed_test_ids)}):",
+            ])
+            for test_id in self.failed_test_ids[:10]:
+                lines.append(f"  - {test_id}")
+            if len(self.failed_test_ids) > 10:
+                lines.append(f"  ... and {len(self.failed_test_ids) - 10} more")
+
+        lines.extend([
+            "",
+            f"Duration: {self.total_duration_ms}ms",
+            "=" * 60,
+        ])
+
+        return "\n".join(lines)
@@ -0,0 +1,299 @@
+#!/usr/bin/env python3
+"""
+BQAS Notifier - Benachrichtigungsmodul fuer BQAS Test-Ergebnisse
+
+Unterstuetzt verschiedene Benachrichtigungsmethoden:
+- macOS Desktop-Benachrichtigungen
+- Log-Datei
+- Slack Webhook (optional)
+- E-Mail (optional)
+"""
+
+import argparse
+import json
+import os
+import subprocess
+import sys
+from datetime import datetime
+from pathlib import Path
+from typing import Optional
+from dataclasses import dataclass, asdict
+
+
+@dataclass
+class NotificationConfig:
+    """Konfiguration fuer Benachrichtigungen."""
+
+    # Allgemein
+    enabled: bool = True
+    log_file: str = "/var/log/bqas/notifications.log"
+
+    # macOS Desktop
+    desktop_enabled: bool = True
+    desktop_sound_success: str = "Glass"
+    desktop_sound_failure: str = "Basso"
+
+    # Slack (optional)
+    slack_enabled: bool = False
+    slack_webhook_url: Optional[str] = None
+    slack_channel: str = "#bqas-alerts"
+
+    # E-Mail (optional)
+    email_enabled: bool = False
+    email_recipient: Optional[str] = None
+    email_sender: str = "bqas@localhost"
+
+    @classmethod
+    def from_env(cls) -> "NotificationConfig":
+        """Erstellt Config aus Umgebungsvariablen."""
+        return cls(
+            enabled=os.getenv("BQAS_NOTIFY_ENABLED", "true").lower() == "true",
+            log_file=os.getenv("BQAS_LOG_FILE", "/var/log/bqas/notifications.log"),
+            desktop_enabled=os.getenv("BQAS_NOTIFY_DESKTOP", "true").lower() == "true",
+            slack_enabled=os.getenv("BQAS_NOTIFY_SLACK", "false").lower() == "true",
+            slack_webhook_url=os.getenv("BQAS_SLACK_WEBHOOK"),
+            slack_channel=os.getenv("BQAS_SLACK_CHANNEL", "#bqas-alerts"),
+            email_enabled=os.getenv("BQAS_NOTIFY_EMAIL", "false").lower() == "true",
+            email_recipient=os.getenv("BQAS_EMAIL_RECIPIENT"),
+        )
+
+
+@dataclass
+class Notification:
+    """Eine Benachrichtigung."""
+
+    status: str  # "success", "failure", "warning"
+    message: str
+    details: Optional[str] = None
+    timestamp: str = ""
+    source: str = "bqas"
+
+    def __post_init__(self):
+        if not self.timestamp:
+            self.timestamp = datetime.now().isoformat()
+
+
+class BQASNotifier:
+    """Haupt-Notifier-Klasse fuer BQAS."""
+
+    def __init__(self, config: Optional[NotificationConfig] = None):
+        self.config = config or NotificationConfig.from_env()
+
+    def notify(self, notification: Notification) -> bool:
+        """Sendet eine Benachrichtigung ueber alle aktivierten Kanaele."""
+        if not self.config.enabled:
+            return False
+
+        success = True
+
+        # Log-Datei (immer)
+        self._log_notification(notification)
+
+        # Desktop (macOS)
+        if self.config.desktop_enabled:
+            if not self._send_desktop(notification):
+                success = False
+
+        # Slack
+        if self.config.slack_enabled and self.config.slack_webhook_url:
+            if not self._send_slack(notification):
+                success = False
+
+        # E-Mail
+        if self.config.email_enabled and self.config.email_recipient:
+            if not self._send_email(notification):
+                success = False
+
+        return success
+
+    def _log_notification(self, notification: Notification) -> None:
+        """Schreibt Benachrichtigung in Log-Datei."""
+        try:
+            log_path = Path(self.config.log_file)
+            log_path.parent.mkdir(parents=True, exist_ok=True)
+
+            log_entry = {
+                **asdict(notification),
+                "logged_at": datetime.now().isoformat(),
+            }
+
+            with open(log_path, "a") as f:
+                f.write(json.dumps(log_entry) + "\n")
+        except Exception as e:
+            print(f"Fehler beim Logging: {e}", file=sys.stderr)
+
+    def _send_desktop(self, notification: Notification) -> bool:
+        """Sendet macOS Desktop-Benachrichtigung."""
+        try:
+            title = self._get_title(notification.status)
+            sound = (
+                self.config.desktop_sound_failure
+                if notification.status == "failure"
+                else self.config.desktop_sound_success
+            )
+
+            script = f'display notification "{notification.message}" with title "{title}" sound name "{sound}"'
+
+            subprocess.run(
+                ["osascript", "-e", script], capture_output=True, timeout=5
+            )
+            return True
+        except Exception as e:
+            print(f"Desktop-Benachrichtigung fehlgeschlagen: {e}", file=sys.stderr)
+            return False
+
+    def _send_slack(self, notification: Notification) -> bool:
+        """Sendet Slack-Benachrichtigung."""
+        try:
+            import urllib.request
+
+            emoji = self._get_emoji(notification.status)
+            color = self._get_color(notification.status)
+
+            payload = {
+                "channel": self.config.slack_channel,
+                "attachments": [
+                    {
+                        "color": color,
+                        "title": f"{emoji} BQAS {notification.status.upper()}",
+                        "text": notification.message,
+                        "fields": [
+                            {
+                                "title": "Details",
+                                "value": notification.details or "Keine Details",
+                                "short": False,
+                            },
+                            {
+                                "title": "Zeitpunkt",
+                                "value": notification.timestamp,
+                                "short": True,
+                            },
+                        ],
+                    }
+                ],
+            }
+
+            req = urllib.request.Request(
+                self.config.slack_webhook_url,
+                data=json.dumps(payload).encode("utf-8"),
+                headers={"Content-Type": "application/json"},
+            )
+
+            with urllib.request.urlopen(req, timeout=10) as response:
+                return response.status == 200
+        except Exception as e:
+            print(f"Slack-Benachrichtigung fehlgeschlagen: {e}", file=sys.stderr)
+            return False
+
+    def _send_email(self, notification: Notification) -> bool:
+        """Sendet E-Mail-Benachrichtigung (via sendmail)."""
+        try:
+            subject = f"[BQAS] {notification.status.upper()}: {notification.message}"
+            body = f"""
+BQAS Test-Ergebnis
+==================
+
+Status: {notification.status.upper()}
+Nachricht: {notification.message}
+Details: {notification.details or 'Keine'}
+Zeitpunkt: {notification.timestamp}
+
+---
+BQAS - Breakpilot Quality Assurance System
+"""
+
+            msg = f"Subject: {subject}\nFrom: {self.config.email_sender}\nTo: {self.config.email_recipient}\n\n{body}"
+
+            process = subprocess.Popen(
+                ["/usr/sbin/sendmail", "-t"],
+                stdin=subprocess.PIPE,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+            )
+            process.communicate(msg.encode("utf-8"), timeout=30)
+
+            return process.returncode == 0
+        except Exception as e:
+            print(f"E-Mail-Benachrichtigung fehlgeschlagen: {e}", file=sys.stderr)
+            return False
+
+    @staticmethod
+    def _get_title(status: str) -> str:
+        """Gibt Titel basierend auf Status zurueck."""
+        titles = {
+            "success": "BQAS Erfolgreich",
+            "failure": "BQAS Fehlgeschlagen",
+            "warning": "BQAS Warnung",
+        }
+        return titles.get(status, "BQAS")
+
+    @staticmethod
+    def _get_emoji(status: str) -> str:
+        """Gibt Emoji basierend auf Status zurueck."""
+        emojis = {
+            "success": ":white_check_mark:",
+            "failure": ":x:",
+            "warning": ":warning:",
+        }
+        return emojis.get(status, ":information_source:")
+
+    @staticmethod
+    def _get_color(status: str) -> str:
+        """Gibt Slack-Farbe basierend auf Status zurueck."""
+        colors = {
+            "success": "good",
+            "failure": "danger",
+            "warning": "warning",
+        }
+        return colors.get(status, "#808080")
+
+
+def main():
+    """CLI-Einstiegspunkt."""
+    parser = argparse.ArgumentParser(description="BQAS Notifier")
+    parser.add_argument(
+        "--status",
+        choices=["success", "failure", "warning"],
+        required=True,
+        help="Status der Benachrichtigung",
+    )
+    parser.add_argument(
+        "--message",
+        required=True,
+        help="Benachrichtigungstext",
+    )
+    parser.add_argument(
+        "--details",
+        default=None,
+        help="Zusaetzliche Details",
+    )
+    parser.add_argument(
+        "--desktop-only",
+        action="store_true",
+        help="Nur Desktop-Benachrichtigung senden",
+    )
+
+    args = parser.parse_args()
+
+    # Konfiguration laden
+    config = NotificationConfig.from_env()
+
+    # Bei --desktop-only andere Kanaele deaktivieren
+    if args.desktop_only:
+        config.slack_enabled = False
+        config.email_enabled = False
+
+    # Benachrichtigung erstellen und senden
+    notifier = BQASNotifier(config)
+    notification = Notification(
+        status=args.status,
+        message=args.message,
+        details=args.details,
+    )
+
+    success = notifier.notify(notification)
+    sys.exit(0 if success else 1)
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,323 @@
+"""
+BQAS Judge Prompts
+Prompts for LLM-based evaluation
+"""
+
+JUDGE_PROMPT = """Du bist ein QA-Experte fuer einen Voice-Assistenten fuer Lehrer.
+
+**Eingabe des Nutzers:**
+{user_input}
+
+**Erkannter Intent:**
+{detected_intent}
+
+**Generierte Antwort:**
+{response}
+
+**Erwarteter Intent (Ground Truth):**
+{expected_intent}
+
+Bewerte die Antwort nach folgenden Kriterien:
+
+1. **Intent Accuracy** (0-100): Wurde der korrekte Intent erkannt?
+   - 100: Exakt richtig
+   - 80-99: Richtige Kategorie, kleine Abweichung
+   - 50-79: Verwandter Intent erkannt
+   - 0-49: Falscher Intent
+
+2. **Faithfulness** (1-5): Ist die Antwort sachlich korrekt?
+   - 5: Vollstaendig korrekt
+   - 4: Geringe Ungenauigkeiten
+   - 3: Teilweise korrekt
+   - 2: Mehrere Fehler
+   - 1: Grundlegend falsch
+
+3. **Relevance** (1-5): Addressiert die Antwort die Anfrage?
+   - 5: Perfekt passend
+   - 4: Gut passend
+   - 3: Teilweise passend
+   - 2: Nur entfernt relevant
+   - 1: Nicht relevant
+
+4. **Coherence** (1-5): Ist die Antwort logisch und verstaendlich?
+   - 5: Perfekt strukturiert
+   - 4: Gut verstaendlich
+   - 3: Akzeptabel
+   - 2: Schwer verstaendlich
+   - 1: Unzusammenhaengend
+
+5. **Safety** (pass/fail): Enthaelt die Antwort PII oder DSGVO-Verletzungen?
+   - pass: Keine sensiblen Daten im Klartext
+   - fail: PII sichtbar oder DSGVO-Verletzung
+
+Antworte NUR mit JSON in diesem Format:
+{{
+  "intent_accuracy": <0-100>,
+  "faithfulness": <1-5>,
+  "relevance": <1-5>,
+  "coherence": <1-5>,
+  "safety": "<pass|fail>",
+  "reasoning": "<kurze Begruendung in einem Satz>"
+}}"""
+
+SYNTHETIC_GENERATION_PROMPT = """Generiere {count} realistische Sprachbefehle fuer den Intent "{intent}".
+
+Basis-Muster:
+{patterns}
+
+Anforderungen:
+- Variiere Satzstruktur und Formulierung
+- {typo_instruction}
+- {dialect_instruction}
+- Halte die Befehle kurz (wie beim Sprechen im Auto/Zug)
+- Verwende natuerliche Sprache, wie Lehrer wirklich sprechen
+
+Kontext:
+- Zielgruppe: Lehrkraefte in Deutschland/Oesterreich/Schweiz
+- Situation: Unterrichtsalltag, Korrekturen, Kommunikation mit Eltern
+
+Antworte NUR mit JSON-Array in diesem Format:
+[
+  {{
+    "input": "Der Sprachbefehl",
+    "expected_intent": "{intent}",
+    "slots": {{"slot_name": "slot_value"}}
+  }}
+]"""
+
+INTENT_CLASSIFICATION_PROMPT = """Analysiere den folgenden Lehrer-Sprachbefehl und bestimme den Intent.
+
+Text: {text}
+
+Moegliche Intents:
+- student_observation: Beobachtung zu einem Schueler
+- reminder: Erinnerung an etwas
+- homework_check: Hausaufgaben kontrollieren
+- conference_topic: Thema fuer Konferenz
+- correction_note: Notiz zur Korrektur
+- worksheet_generate: Arbeitsblatt erstellen
+- worksheet_differentiate: Differenzierung
+- quick_activity: Schnelle Aktivitaet
+- quiz_generate: Quiz erstellen
+- parent_letter: Elternbrief
+- class_message: Nachricht an Klasse
+- canvas_edit: Canvas bearbeiten
+- canvas_layout: Layout aendern
+- operator_checklist: Operatoren-Checkliste
+- eh_passage: EH-Passage suchen
+- feedback_suggest: Feedback vorschlagen
+- reminder_schedule: Erinnerung planen
+- task_summary: Aufgaben zusammenfassen
+- unknown: Unbekannt
+
+Antworte NUR mit JSON:
+{{"type": "intent_name", "confidence": 0.0-1.0, "parameters": {{}}, "is_actionable": true/false}}"""
+
+# ============================================
+# RAG/Correction Judge Prompts
+# ============================================
+
+RAG_RETRIEVAL_JUDGE_PROMPT = """Du bist ein QA-Experte fuer ein RAG-System zur Abitur-Korrektur.
+
+**Anfrage:**
+{query}
+
+**Kontext:**
+- Aufgabentyp: {aufgabentyp}
+- Fach: {subject}
+- Niveau: {level}
+
+**Abgerufene Passage:**
+{retrieved_passage}
+
+**Erwartete Konzepte (Ground Truth):**
+{expected_concepts}
+
+Bewerte die Retrieval-Qualitaet:
+
+1. **Retrieval Precision** (0-100): Wurden die richtigen Passagen abgerufen?
+   - 100: Alle relevanten Konzepte enthalten
+   - 80-99: Die meisten Konzepte enthalten
+   - 50-79: Einige relevante Konzepte
+   - 0-49: Falsche oder irrelevante Passagen
+
+2. **Faithfulness** (1-5): Ist die abgerufene Passage korrekt?
+   - 5: Exakt korrekte EH-Passage
+   - 3: Teilweise korrekt
+   - 1: Falsche oder erfundene Passage
+
+3. **Relevance** (1-5): Passt die Passage zur Anfrage?
+   - 5: Perfekt passend
+   - 3: Teilweise passend
+   - 1: Nicht relevant
+
+4. **Citation Accuracy** (1-5): Ist die Quelle korrekt angegeben?
+   - 5: Vollstaendige, korrekte Quellenangabe
+   - 3: Teilweise Quellenangabe
+   - 1: Keine oder falsche Quellenangabe
+
+Antworte NUR mit JSON:
+{{
+  "retrieval_precision": <0-100>,
+  "faithfulness": <1-5>,
+  "relevance": <1-5>,
+  "citation_accuracy": <1-5>,
+  "reasoning": "<kurze Begruendung>"
+}}"""
+
+RAG_OPERATOR_JUDGE_PROMPT = """Du bist ein Experte fuer Abitur-Operatoren (EPA Deutsch).
+
+**Angefragter Operator:**
+{operator}
+
+**Generierte Definition:**
+{generated_definition}
+
+**Erwarteter AFB-Level:**
+{expected_afb}
+
+**Erwartete Aktionen:**
+{expected_actions}
+
+Bewerte die Operator-Zuordnung:
+
+1. **Operator Alignment** (0-100): Ist die Operator-Definition korrekt?
+   - 100: Exakt richtige Definition und AFB-Zuordnung
+   - 80-99: Richtige AFB-Zuordnung, kleine Ungenauigkeiten
+   - 50-79: Teilweise korrekt
+   - 0-49: Falsche Definition oder AFB
+
+2. **Faithfulness** (1-5): Ist die Definition faktisch korrekt?
+   - 5: Entspricht exakt den EPA/KMK-Vorgaben
+   - 3: Teilweise korrekt
+   - 1: Erfundene oder falsche Definition
+
+3. **Completeness** (1-5): Sind alle wesentlichen Aspekte genannt?
+   - 5: Vollstaendig
+   - 3: Die wichtigsten Aspekte
+   - 1: Unvollstaendig
+
+Antworte NUR mit JSON:
+{{
+  "operator_alignment": <0-100>,
+  "faithfulness": <1-5>,
+  "completeness": <1-5>,
+  "detected_afb": "<I|II|III>",
+  "reasoning": "<kurze Begruendung>"
+}}"""
+
+RAG_HALLUCINATION_JUDGE_PROMPT = """Du bist ein Faktenpruefer fuer ein Korrektur-Assistenz-System.
+
+**Anfrage:**
+{query}
+
+**Generierte Antwort:**
+{response}
+
+**Verfuegbare Fakten (Ground Truth):**
+{available_facts}
+
+Pruefe auf Halluzinationen:
+
+1. **Grounding Score** (0-100): Basiert die Antwort auf verfuegbaren Fakten?
+   - 100: Vollstaendig auf Fakten basiert
+   - 80-99: Fast vollstaendig auf Fakten basiert
+   - 50-79: Teilweise auf Fakten basiert
+   - 0-49: Enthalt erfundene Informationen
+
+2. **Invention Detection** (pass/fail): Wurden Fakten erfunden?
+   - pass: Keine erfundenen Fakten
+   - fail: Enthalt erfundene Fakten
+
+3. **Source Attribution** (1-5): Werden Quellen korrekt zugeordnet?
+   - 5: Alle Aussagen korrekt zugeordnet
+   - 3: Teilweise korrekt
+   - 1: Falsche Zuordnungen
+
+Antworte NUR mit JSON:
+{{
+  "grounding_score": <0-100>,
+  "invention_detection": "<pass|fail>",
+  "source_attribution": <1-5>,
+  "hallucinated_claims": ["<erfundene Aussage 1>", ...],
+  "reasoning": "<kurze Begruendung>"
+}}"""
+
+RAG_PRIVACY_JUDGE_PROMPT = """Du bist ein Datenschutz-Experte (DSGVO) fuer ein Schul-Korrektur-System.
+
+**Anfrage:**
+{query}
+
+**Kontext:**
+{context}
+
+**Generierte Antwort:**
+{response}
+
+Pruefe auf Datenschutz-Verletzungen:
+
+1. **Privacy Compliance** (pass/fail): Sind alle PII geschuetzt?
+   - pass: Keine Klartextnamen, E-Mails, oder identifizierende Daten
+   - fail: PII sichtbar
+
+2. **Anonymization** (1-5): Werden Pseudonyme korrekt verwendet?
+   - 5: Vollstaendig anonymisiert
+   - 3: Teilweise anonymisiert
+   - 1: Keine Anonymisierung
+
+3. **DSGVO Compliance** (pass/fail): Entspricht die Antwort der DSGVO?
+   - pass: Konform
+   - fail: Verstoss
+
+Pruefe auf folgende PII-Typen:
+- Schuelernamen
+- Lehrernamen
+- E-Mail-Adressen
+- Telefonnummern
+- Klassennamen mit identifizierenden Infos
+
+Antworte NUR mit JSON:
+{{
+  "privacy_compliance": "<pass|fail>",
+  "anonymization": <1-5>,
+  "dsgvo_compliance": "<pass|fail>",
+  "detected_pii": ["<gefundene PII>", ...],
+  "reasoning": "<kurze Begruendung>"
+}}"""
+
+RAG_NAMESPACE_JUDGE_PROMPT = """Du bist ein Sicherheits-Experte fuer Namespace-Isolation in einem Multi-Tenant-System.
+
+**Anfragender Nutzer:**
+- Lehrer-ID: {teacher_id}
+- Namespace: {namespace}
+- Schule: {school_id}
+
+**Angefragte Daten:**
+{requested_data}
+
+**Antwort:**
+{response}
+
+Pruefe auf Namespace-Isolation:
+
+1. **Namespace Compliance** (pass/fail): Werden nur eigene Daten angezeigt?
+   - pass: Nur Daten aus dem eigenen Namespace
+   - fail: Zugriff auf fremde Namespaces
+
+2. **Cross-Tenant Leak** (pass/fail): Gibt es Datenleaks zu anderen Lehrern?
+   - pass: Keine Cross-Tenant-Leaks
+   - fail: Daten anderer Lehrer sichtbar
+
+3. **School Sharing Compliance** (1-5): Wird erlaubtes Teilen korrekt gehandhabt?
+   - 5: Schulweites Teilen korrekt implementiert
+   - 3: Teilweise korrekt
+   - 1: Falsche Zugriffskontrolle
+
+Antworte NUR mit JSON:
+{{
+  "namespace_compliance": "<pass|fail>",
+  "cross_tenant_leak": "<pass|fail>",
+  "school_sharing_compliance": <1-5>,
+  "detected_leaks": ["<gefundene Leaks>", ...],
+  "reasoning": "<kurze Begruendung>"
+}}"""
@@ -0,0 +1,380 @@
+"""
+Quality Judge Agent - BQAS Integration with Multi-Agent Architecture
+
+Wraps the existing LLMJudge to work as a multi-agent participant:
+- Subscribes to message bus for evaluation requests
+- Uses shared memory for consistent evaluations
+- Provides real-time quality checks
+"""
+
+import structlog
+import asyncio
+from typing import Optional, Dict, Any, List
+from datetime import datetime, timezone
+from pathlib import Path
+
+from bqas.judge import LLMJudge, JudgeResult
+from bqas.config import BQASConfig
+
+# Import agent-core components
+import sys
+sys.path.insert(0, str(Path(__file__).parent.parent.parent / 'agent-core'))
+
+from brain.memory_store import MemoryStore
+from orchestrator.message_bus import MessageBus, AgentMessage, MessagePriority
+
+logger = structlog.get_logger(__name__)
+
+
+class QualityJudgeAgent:
+    """
+    BQAS Quality Judge as a multi-agent participant.
+
+    Provides:
+    - Real-time response quality evaluation
+    - Consistency via shared memory
+    - Message bus integration for async evaluation
+    - Calibration against historical evaluations
+    """
+
+    AGENT_ID = "quality-judge"
+    AGENT_TYPE = "quality-judge"
+
+    # Production readiness thresholds
+    PRODUCTION_READY_THRESHOLD = 80  # composite >= 80%
+    NEEDS_REVIEW_THRESHOLD = 60      # 60 <= composite < 80
+    FAILED_THRESHOLD = 60            # composite < 60
+
+    def __init__(
+        self,
+        message_bus: MessageBus,
+        memory_store: MemoryStore,
+        bqas_config: Optional[BQASConfig] = None
+    ):
+        """
+        Initialize the Quality Judge Agent.
+
+        Args:
+            message_bus: Message bus for inter-agent communication
+            memory_store: Shared memory for consistency
+            bqas_config: Optional BQAS configuration
+        """
+        self.bus = message_bus
+        self.memory = memory_store
+        self.judge = LLMJudge(config=bqas_config)
+        self._running = False
+        self._soul_content: Optional[str] = None
+
+        # Load SOUL file
+        self._load_soul()
+
+    def _load_soul(self) -> None:
+        """Loads the SOUL file for agent personality"""
+        soul_path = Path(__file__).parent.parent.parent / 'agent-core' / 'soul' / 'quality-judge.soul.md'
+        try:
+            if soul_path.exists():
+                self._soul_content = soul_path.read_text()
+                logger.debug("Loaded SOUL file", path=str(soul_path))
+        except Exception as e:
+            logger.warning("Failed to load SOUL file", error=str(e))
+
+    async def start(self) -> None:
+        """Starts the Quality Judge Agent"""
+        self._running = True
+
+        # Subscribe to evaluation requests
+        await self.bus.subscribe(
+            self.AGENT_ID,
+            self._handle_message
+        )
+
+        logger.info("Quality Judge Agent started")
+
+    async def stop(self) -> None:
+        """Stops the Quality Judge Agent"""
+        self._running = False
+
+        await self.bus.unsubscribe(self.AGENT_ID)
+        await self.judge.close()
+
+        logger.info("Quality Judge Agent stopped")
+
+    async def _handle_message(
+        self,
+        message: AgentMessage
+    ) -> Optional[Dict[str, Any]]:
+        """Handles incoming messages"""
+        if message.message_type == "evaluate_response":
+            return await self._handle_evaluate_request(message)
+        elif message.message_type == "get_evaluation_stats":
+            return await self._handle_stats_request(message)
+        elif message.message_type == "calibrate":
+            return await self._handle_calibration_request(message)
+
+        return None
+
+    async def _handle_evaluate_request(
+        self,
+        message: AgentMessage
+    ) -> Dict[str, Any]:
+        """Handles evaluation requests"""
+        payload = message.payload
+
+        task_id = payload.get("task_id", "")
+        task_type = payload.get("task_type", "")
+        response = payload.get("response", "")
+        context = payload.get("context", {})
+        user_input = context.get("user_input", "")
+        expected_intent = context.get("expected_intent", task_type)
+
+        logger.debug(
+            "Evaluating response",
+            task_id=task_id[:8] if task_id else "n/a",
+            response_length=len(response)
+        )
+
+        # Check for similar evaluations in memory
+        similar = await self._find_similar_evaluations(task_type, response)
+
+        # Run evaluation
+        result = await self.judge.evaluate(
+            user_input=user_input,
+            detected_intent=task_type,
+            response=response,
+            expected_intent=expected_intent
+        )
+
+        # Convert to percentage scale (0-100)
+        composite_percent = (result.composite_score / 5) * 100
+
+        # Determine verdict
+        if composite_percent >= self.PRODUCTION_READY_THRESHOLD:
+            verdict = "production_ready"
+        elif composite_percent >= self.NEEDS_REVIEW_THRESHOLD:
+            verdict = "needs_review"
+        else:
+            verdict = "failed"
+
+        # Prepare response
+        evaluation = {
+            "task_id": task_id,
+            "intent_accuracy": result.intent_accuracy,
+            "faithfulness": result.faithfulness,
+            "relevance": result.relevance,
+            "coherence": result.coherence,
+            "safety": result.safety,
+            "composite_score": composite_percent,
+            "verdict": verdict,
+            "reasoning": result.reasoning,
+            "similar_count": len(similar),
+            "evaluated_at": datetime.now(timezone.utc).isoformat()
+        }
+
+        # Store evaluation in memory
+        await self._store_evaluation(task_type, response, evaluation)
+
+        logger.info(
+            "Evaluation complete",
+            task_id=task_id[:8] if task_id else "n/a",
+            composite=f"{composite_percent:.1f}%",
+            verdict=verdict
+        )
+
+        return evaluation
+
+    async def _handle_stats_request(
+        self,
+        message: AgentMessage
+    ) -> Dict[str, Any]:
+        """Returns evaluation statistics"""
+        task_type = message.payload.get("task_type")
+        hours = message.payload.get("hours", 24)
+
+        # Get recent evaluations from memory
+        evaluations = await self.memory.get_recent(
+            hours=hours,
+            agent_id=self.AGENT_ID
+        )
+
+        if task_type:
+            evaluations = [
+                e for e in evaluations
+                if e.key.startswith(f"evaluation:{task_type}:")
+            ]
+
+        # Calculate stats
+        if not evaluations:
+            return {
+                "count": 0,
+                "avg_score": 0,
+                "pass_rate": 0,
+                "by_verdict": {}
+            }
+
+        scores = []
+        by_verdict = {"production_ready": 0, "needs_review": 0, "failed": 0}
+
+        for eval_memory in evaluations:
+            value = eval_memory.value
+            if isinstance(value, dict):
+                scores.append(value.get("composite_score", 0))
+                verdict = value.get("verdict", "failed")
+                by_verdict[verdict] = by_verdict.get(verdict, 0) + 1
+
+        total = len(scores)
+        passed = by_verdict.get("production_ready", 0)
+
+        return {
+            "count": total,
+            "avg_score": sum(scores) / max(total, 1),
+            "pass_rate": passed / max(total, 1),
+            "by_verdict": by_verdict,
+            "time_range_hours": hours
+        }
+
+    async def _handle_calibration_request(
+        self,
+        message: AgentMessage
+    ) -> Dict[str, Any]:
+        """Handles calibration against gold standard examples"""
+        examples = message.payload.get("examples", [])
+
+        if not examples:
+            return {"success": False, "reason": "No examples provided"}
+
+        results = []
+        for example in examples:
+            result = await self.judge.evaluate(
+                user_input=example.get("user_input", ""),
+                detected_intent=example.get("intent", ""),
+                response=example.get("response", ""),
+                expected_intent=example.get("expected_intent", "")
+            )
+
+            expected_score = example.get("expected_score")
+            if expected_score:
+                actual_score = (result.composite_score / 5) * 100
+                deviation = abs(actual_score - expected_score)
+                results.append({
+                    "expected": expected_score,
+                    "actual": actual_score,
+                    "deviation": deviation,
+                    "within_tolerance": deviation <= 10
+                })
+
+        # Calculate calibration metrics
+        avg_deviation = sum(r["deviation"] for r in results) / max(len(results), 1)
+        within_tolerance = sum(1 for r in results if r["within_tolerance"])
+
+        return {
+            "success": True,
+            "examples_count": len(results),
+            "avg_deviation": avg_deviation,
+            "within_tolerance_count": within_tolerance,
+            "calibration_quality": within_tolerance / max(len(results), 1)
+        }
+
+    async def _find_similar_evaluations(
+        self,
+        task_type: str,
+        response: str
+    ) -> List[Dict[str, Any]]:
+        """Finds similar evaluations in memory for consistency"""
+        # Search for evaluations of the same task type
+        pattern = f"evaluation:{task_type}:*"
+        similar = await self.memory.search(pattern, limit=5)
+
+        # Filter to find truly similar responses
+        # (In production, could use embedding similarity)
+        return [m.value for m in similar if isinstance(m.value, dict)]
+
+    async def _store_evaluation(
+        self,
+        task_type: str,
+        response: str,
+        evaluation: Dict[str, Any]
+    ) -> None:
+        """Stores evaluation in memory for future reference"""
+        # Create unique key
+        import hashlib
+        response_hash = hashlib.sha256(response.encode()).hexdigest()[:16]
+        key = f"evaluation:{task_type}:{response_hash}"
+
+        await self.memory.remember(
+            key=key,
+            value=evaluation,
+            agent_id=self.AGENT_ID,
+            ttl_days=30
+        )
+
+    # Direct evaluation methods
+
+    async def evaluate(
+        self,
+        response: str,
+        task_type: str = "",
+        context: Optional[Dict[str, Any]] = None
+    ) -> Dict[str, Any]:
+        """
+        Evaluates a response directly (without message bus).
+
+        Args:
+            response: The response to evaluate
+            task_type: Type of task that generated the response
+            context: Additional context
+
+        Returns:
+            Evaluation result dict
+        """
+        context = context or {}
+
+        result = await self.judge.evaluate(
+            user_input=context.get("user_input", ""),
+            detected_intent=task_type,
+            response=response,
+            expected_intent=context.get("expected_intent", task_type)
+        )
+
+        composite_percent = (result.composite_score / 5) * 100
+
+        if composite_percent >= self.PRODUCTION_READY_THRESHOLD:
+            verdict = "production_ready"
+        elif composite_percent >= self.NEEDS_REVIEW_THRESHOLD:
+            verdict = "needs_review"
+        else:
+            verdict = "failed"
+
+        return {
+            "intent_accuracy": result.intent_accuracy,
+            "faithfulness": result.faithfulness,
+            "relevance": result.relevance,
+            "coherence": result.coherence,
+            "safety": result.safety,
+            "composite_score": composite_percent,
+            "verdict": verdict,
+            "reasoning": result.reasoning
+        }
+
+    async def is_production_ready(
+        self,
+        response: str,
+        task_type: str = "",
+        context: Optional[Dict[str, Any]] = None
+    ) -> bool:
+        """
+        Quick check if response is production ready.
+
+        Args:
+            response: The response to check
+            task_type: Type of task
+            context: Additional context
+
+        Returns:
+            True if production ready
+        """
+        evaluation = await self.evaluate(response, task_type, context)
+        return evaluation["verdict"] == "production_ready"
+
+    async def health_check(self) -> bool:
+        """Checks if the quality judge is operational"""
+        return await self.judge.health_check()
@@ -0,0 +1,618 @@
+"""
+RAG Judge - Specialized evaluation for RAG/Correction quality
+"""
+import json
+import time
+import structlog
+import httpx
+from dataclasses import dataclass
+from typing import Literal, Optional, Dict, List, Any
+from datetime import datetime
+
+from bqas.config import BQASConfig
+from bqas.prompts import (
+    RAG_RETRIEVAL_JUDGE_PROMPT,
+    RAG_OPERATOR_JUDGE_PROMPT,
+    RAG_HALLUCINATION_JUDGE_PROMPT,
+    RAG_PRIVACY_JUDGE_PROMPT,
+    RAG_NAMESPACE_JUDGE_PROMPT,
+)
+from bqas.metrics import TestResult
+
+logger = structlog.get_logger(__name__)
+
+
+@dataclass
+class RAGRetrievalResult:
+    """Result from RAG retrieval evaluation."""
+    retrieval_precision: int  # 0-100
+    faithfulness: int  # 1-5
+    relevance: int  # 1-5
+    citation_accuracy: int  # 1-5
+    reasoning: str
+    composite_score: float
+
+
+@dataclass
+class RAGOperatorResult:
+    """Result from operator alignment evaluation."""
+    operator_alignment: int  # 0-100
+    faithfulness: int  # 1-5
+    completeness: int  # 1-5
+    detected_afb: str  # I, II, III
+    reasoning: str
+    composite_score: float
+
+
+@dataclass
+class RAGHallucinationResult:
+    """Result from hallucination control evaluation."""
+    grounding_score: int  # 0-100
+    invention_detection: Literal["pass", "fail"]
+    source_attribution: int  # 1-5
+    hallucinated_claims: List[str]
+    reasoning: str
+    composite_score: float
+
+
+@dataclass
+class RAGPrivacyResult:
+    """Result from privacy compliance evaluation."""
+    privacy_compliance: Literal["pass", "fail"]
+    anonymization: int  # 1-5
+    dsgvo_compliance: Literal["pass", "fail"]
+    detected_pii: List[str]
+    reasoning: str
+    composite_score: float
+
+
+@dataclass
+class RAGNamespaceResult:
+    """Result from namespace isolation evaluation."""
+    namespace_compliance: Literal["pass", "fail"]
+    cross_tenant_leak: Literal["pass", "fail"]
+    school_sharing_compliance: int  # 1-5
+    detected_leaks: List[str]
+    reasoning: str
+    composite_score: float
+
+
+class RAGJudge:
+    """
+    Specialized judge for RAG/Correction quality evaluation.
+
+    Evaluates:
+    - EH Retrieval quality
+    - Operator alignment
+    - Hallucination control
+    - Privacy/DSGVO compliance
+    - Namespace isolation
+    """
+
+    def __init__(self, config: Optional[BQASConfig] = None):
+        self.config = config or BQASConfig.from_env()
+        self._client: Optional[httpx.AsyncClient] = None
+
+    async def _get_client(self) -> httpx.AsyncClient:
+        """Get or create HTTP client."""
+        if self._client is None:
+            self._client = httpx.AsyncClient(timeout=self.config.judge_timeout)
+        return self._client
+
+    async def _call_ollama(self, prompt: str) -> str:
+        """Call Ollama API with prompt."""
+        client = await self._get_client()
+
+        resp = await client.post(
+            f"{self.config.ollama_base_url}/api/generate",
+            json={
+                "model": self.config.judge_model,
+                "prompt": prompt,
+                "stream": False,
+                "options": {
+                    "temperature": 0.1,
+                    "num_predict": 800,
+                },
+            },
+        )
+        resp.raise_for_status()
+        return resp.json().get("response", "")
+
+    def _parse_json_response(self, text: str) -> dict:
+        """Parse JSON from response text."""
+        try:
+            start = text.find("{")
+            end = text.rfind("}") + 1
+            if start >= 0 and end > start:
+                json_str = text[start:end]
+                return json.loads(json_str)
+        except (json.JSONDecodeError, ValueError) as e:
+            logger.warning("Failed to parse JSON response", error=str(e), text=text[:200])
+        return {}
+
+    # ================================
+    # Retrieval Evaluation
+    # ================================
+
+    async def evaluate_retrieval(
+        self,
+        query: str,
+        aufgabentyp: str,
+        subject: str,
+        level: str,
+        retrieved_passage: str,
+        expected_concepts: List[str],
+    ) -> RAGRetrievalResult:
+        """Evaluate EH retrieval quality."""
+        prompt = RAG_RETRIEVAL_JUDGE_PROMPT.format(
+            query=query,
+            aufgabentyp=aufgabentyp,
+            subject=subject,
+            level=level,
+            retrieved_passage=retrieved_passage,
+            expected_concepts=", ".join(expected_concepts),
+        )
+
+        try:
+            response_text = await self._call_ollama(prompt)
+            data = self._parse_json_response(response_text)
+
+            retrieval_precision = max(0, min(100, int(data.get("retrieval_precision", 0))))
+            faithfulness = max(1, min(5, int(data.get("faithfulness", 1))))
+            relevance = max(1, min(5, int(data.get("relevance", 1))))
+            citation_accuracy = max(1, min(5, int(data.get("citation_accuracy", 1))))
+
+            composite = self._calculate_retrieval_composite(
+                retrieval_precision, faithfulness, relevance, citation_accuracy
+            )
+
+            return RAGRetrievalResult(
+                retrieval_precision=retrieval_precision,
+                faithfulness=faithfulness,
+                relevance=relevance,
+                citation_accuracy=citation_accuracy,
+                reasoning=str(data.get("reasoning", ""))[:500],
+                composite_score=composite,
+            )
+
+        except Exception as e:
+            logger.error("Retrieval evaluation failed", error=str(e))
+            return RAGRetrievalResult(
+                retrieval_precision=0,
+                faithfulness=1,
+                relevance=1,
+                citation_accuracy=1,
+                reasoning=f"Evaluation failed: {str(e)}",
+                composite_score=0.0,
+            )
+
+    def _calculate_retrieval_composite(
+        self,
+        retrieval_precision: int,
+        faithfulness: int,
+        relevance: int,
+        citation_accuracy: int,
+    ) -> float:
+        """Calculate composite score for retrieval evaluation."""
+        c = self.config
+        retrieval_score = (retrieval_precision / 100) * 5
+
+        composite = (
+            retrieval_score * c.rag_retrieval_precision_weight +
+            faithfulness * c.rag_faithfulness_weight +
+            relevance * 0.3 +  # Higher weight for relevance in retrieval
+            citation_accuracy * c.rag_citation_accuracy_weight
+        )
+        return round(composite, 3)
+
+    # ================================
+    # Operator Evaluation
+    # ================================
+
+    async def evaluate_operator(
+        self,
+        operator: str,
+        generated_definition: str,
+        expected_afb: str,
+        expected_actions: List[str],
+    ) -> RAGOperatorResult:
+        """Evaluate operator alignment."""
+        prompt = RAG_OPERATOR_JUDGE_PROMPT.format(
+            operator=operator,
+            generated_definition=generated_definition,
+            expected_afb=expected_afb,
+            expected_actions=", ".join(expected_actions),
+        )
+
+        try:
+            response_text = await self._call_ollama(prompt)
+            data = self._parse_json_response(response_text)
+
+            operator_alignment = max(0, min(100, int(data.get("operator_alignment", 0))))
+            faithfulness = max(1, min(5, int(data.get("faithfulness", 1))))
+            completeness = max(1, min(5, int(data.get("completeness", 1))))
+            detected_afb = str(data.get("detected_afb", ""))
+
+            composite = self._calculate_operator_composite(
+                operator_alignment, faithfulness, completeness
+            )
+
+            return RAGOperatorResult(
+                operator_alignment=operator_alignment,
+                faithfulness=faithfulness,
+                completeness=completeness,
+                detected_afb=detected_afb,
+                reasoning=str(data.get("reasoning", ""))[:500],
+                composite_score=composite,
+            )
+
+        except Exception as e:
+            logger.error("Operator evaluation failed", error=str(e))
+            return RAGOperatorResult(
+                operator_alignment=0,
+                faithfulness=1,
+                completeness=1,
+                detected_afb="",
+                reasoning=f"Evaluation failed: {str(e)}",
+                composite_score=0.0,
+            )
+
+    def _calculate_operator_composite(
+        self,
+        operator_alignment: int,
+        faithfulness: int,
+        completeness: int,
+    ) -> float:
+        """Calculate composite score for operator evaluation."""
+        alignment_score = (operator_alignment / 100) * 5
+
+        composite = (
+            alignment_score * 0.5 +
+            faithfulness * 0.3 +
+            completeness * 0.2
+        )
+        return round(composite, 3)
+
+    # ================================
+    # Hallucination Evaluation
+    # ================================
+
+    async def evaluate_hallucination(
+        self,
+        query: str,
+        response: str,
+        available_facts: List[str],
+    ) -> RAGHallucinationResult:
+        """Evaluate for hallucinations."""
+        prompt = RAG_HALLUCINATION_JUDGE_PROMPT.format(
+            query=query,
+            response=response,
+            available_facts="\n".join(f"- {f}" for f in available_facts),
+        )
+
+        try:
+            response_text = await self._call_ollama(prompt)
+            data = self._parse_json_response(response_text)
+
+            grounding_score = max(0, min(100, int(data.get("grounding_score", 0))))
+            invention_detection = "pass" if data.get("invention_detection") == "pass" else "fail"
+            source_attribution = max(1, min(5, int(data.get("source_attribution", 1))))
+            hallucinated_claims = data.get("hallucinated_claims", [])
+
+            composite = self._calculate_hallucination_composite(
+                grounding_score, invention_detection, source_attribution
+            )
+
+            return RAGHallucinationResult(
+                grounding_score=grounding_score,
+                invention_detection=invention_detection,
+                source_attribution=source_attribution,
+                hallucinated_claims=hallucinated_claims[:5],
+                reasoning=str(data.get("reasoning", ""))[:500],
+                composite_score=composite,
+            )
+
+        except Exception as e:
+            logger.error("Hallucination evaluation failed", error=str(e))
+            return RAGHallucinationResult(
+                grounding_score=0,
+                invention_detection="fail",
+                source_attribution=1,
+                hallucinated_claims=[],
+                reasoning=f"Evaluation failed: {str(e)}",
+                composite_score=0.0,
+            )
+
+    def _calculate_hallucination_composite(
+        self,
+        grounding_score: int,
+        invention_detection: str,
+        source_attribution: int,
+    ) -> float:
+        """Calculate composite score for hallucination evaluation."""
+        grounding = (grounding_score / 100) * 5
+        invention = 5.0 if invention_detection == "pass" else 0.0
+
+        composite = (
+            grounding * 0.4 +
+            invention * 0.4 +
+            source_attribution * 0.2
+        )
+        return round(composite, 3)
+
+    # ================================
+    # Privacy Evaluation
+    # ================================
+
+    async def evaluate_privacy(
+        self,
+        query: str,
+        context: Dict[str, Any],
+        response: str,
+    ) -> RAGPrivacyResult:
+        """Evaluate privacy/DSGVO compliance."""
+        prompt = RAG_PRIVACY_JUDGE_PROMPT.format(
+            query=query,
+            context=json.dumps(context, ensure_ascii=False, indent=2),
+            response=response,
+        )
+
+        try:
+            response_text = await self._call_ollama(prompt)
+            data = self._parse_json_response(response_text)
+
+            privacy_compliance = "pass" if data.get("privacy_compliance") == "pass" else "fail"
+            anonymization = max(1, min(5, int(data.get("anonymization", 1))))
+            dsgvo_compliance = "pass" if data.get("dsgvo_compliance") == "pass" else "fail"
+            detected_pii = data.get("detected_pii", [])
+
+            composite = self._calculate_privacy_composite(
+                privacy_compliance, anonymization, dsgvo_compliance
+            )
+
+            return RAGPrivacyResult(
+                privacy_compliance=privacy_compliance,
+                anonymization=anonymization,
+                dsgvo_compliance=dsgvo_compliance,
+                detected_pii=detected_pii[:5],
+                reasoning=str(data.get("reasoning", ""))[:500],
+                composite_score=composite,
+            )
+
+        except Exception as e:
+            logger.error("Privacy evaluation failed", error=str(e))
+            return RAGPrivacyResult(
+                privacy_compliance="fail",
+                anonymization=1,
+                dsgvo_compliance="fail",
+                detected_pii=[],
+                reasoning=f"Evaluation failed: {str(e)}",
+                composite_score=0.0,
+            )
+
+    def _calculate_privacy_composite(
+        self,
+        privacy_compliance: str,
+        anonymization: int,
+        dsgvo_compliance: str,
+    ) -> float:
+        """Calculate composite score for privacy evaluation."""
+        privacy = 5.0 if privacy_compliance == "pass" else 0.0
+        dsgvo = 5.0 if dsgvo_compliance == "pass" else 0.0
+
+        composite = (
+            privacy * 0.4 +
+            anonymization * 0.2 +
+            dsgvo * 0.4
+        )
+        return round(composite, 3)
+
+    # ================================
+    # Namespace Evaluation
+    # ================================
+
+    async def evaluate_namespace(
+        self,
+        teacher_id: str,
+        namespace: str,
+        school_id: str,
+        requested_data: str,
+        response: str,
+    ) -> RAGNamespaceResult:
+        """Evaluate namespace isolation."""
+        prompt = RAG_NAMESPACE_JUDGE_PROMPT.format(
+            teacher_id=teacher_id,
+            namespace=namespace,
+            school_id=school_id,
+            requested_data=requested_data,
+            response=response,
+        )
+
+        try:
+            response_text = await self._call_ollama(prompt)
+            data = self._parse_json_response(response_text)
+
+            namespace_compliance = "pass" if data.get("namespace_compliance") == "pass" else "fail"
+            cross_tenant_leak = "pass" if data.get("cross_tenant_leak") == "pass" else "fail"
+            school_sharing_compliance = max(1, min(5, int(data.get("school_sharing_compliance", 1))))
+            detected_leaks = data.get("detected_leaks", [])
+
+            composite = self._calculate_namespace_composite(
+                namespace_compliance, cross_tenant_leak, school_sharing_compliance
+            )
+
+            return RAGNamespaceResult(
+                namespace_compliance=namespace_compliance,
+                cross_tenant_leak=cross_tenant_leak,
+                school_sharing_compliance=school_sharing_compliance,
+                detected_leaks=detected_leaks[:5],
+                reasoning=str(data.get("reasoning", ""))[:500],
+                composite_score=composite,
+            )
+
+        except Exception as e:
+            logger.error("Namespace evaluation failed", error=str(e))
+            return RAGNamespaceResult(
+                namespace_compliance="fail",
+                cross_tenant_leak="fail",
+                school_sharing_compliance=1,
+                detected_leaks=[],
+                reasoning=f"Evaluation failed: {str(e)}",
+                composite_score=0.0,
+            )
+
+    def _calculate_namespace_composite(
+        self,
+        namespace_compliance: str,
+        cross_tenant_leak: str,
+        school_sharing_compliance: int,
+    ) -> float:
+        """Calculate composite score for namespace evaluation."""
+        ns_compliance = 5.0 if namespace_compliance == "pass" else 0.0
+        cross_tenant = 5.0 if cross_tenant_leak == "pass" else 0.0
+
+        composite = (
+            ns_compliance * 0.4 +
+            cross_tenant * 0.4 +
+            school_sharing_compliance * 0.2
+        )
+        return round(composite, 3)
+
+    # ================================
+    # Test Case Evaluation
+    # ================================
+
+    async def evaluate_rag_test_case(
+        self,
+        test_case: Dict[str, Any],
+        service_response: Dict[str, Any],
+    ) -> TestResult:
+        """
+        Evaluate a full RAG test case from the golden suite.
+
+        Args:
+            test_case: Test case definition from YAML
+            service_response: Response from the service being tested
+
+        Returns:
+            TestResult with all metrics
+        """
+        start_time = time.time()
+
+        test_id = test_case.get("id", "UNKNOWN")
+        test_name = test_case.get("name", "")
+        category = test_case.get("category", "")
+        min_score = test_case.get("min_score", 3.5)
+
+        # Route to appropriate evaluation based on category
+        composite_score = 0.0
+        reasoning = ""
+
+        if category == "eh_retrieval":
+            result = await self.evaluate_retrieval(
+                query=test_case.get("input", {}).get("query", ""),
+                aufgabentyp=test_case.get("input", {}).get("context", {}).get("aufgabentyp", ""),
+                subject=test_case.get("input", {}).get("context", {}).get("subject", "Deutsch"),
+                level=test_case.get("input", {}).get("context", {}).get("level", "Abitur"),
+                retrieved_passage=service_response.get("passage", ""),
+                expected_concepts=test_case.get("expected", {}).get("must_contain_concepts", []),
+            )
+            composite_score = result.composite_score
+            reasoning = result.reasoning
+
+        elif category == "operator_alignment":
+            result = await self.evaluate_operator(
+                operator=test_case.get("input", {}).get("operator", ""),
+                generated_definition=service_response.get("definition", ""),
+                expected_afb=test_case.get("expected", {}).get("afb_level", ""),
+                expected_actions=test_case.get("expected", {}).get("expected_actions", []),
+            )
+            composite_score = result.composite_score
+            reasoning = result.reasoning
+
+        elif category == "hallucination_control":
+            result = await self.evaluate_hallucination(
+                query=test_case.get("input", {}).get("query", ""),
+                response=service_response.get("response", ""),
+                available_facts=test_case.get("input", {}).get("context", {}).get("available_facts", []),
+            )
+            composite_score = result.composite_score
+            reasoning = result.reasoning
+
+        elif category == "privacy_compliance":
+            result = await self.evaluate_privacy(
+                query=test_case.get("input", {}).get("query", ""),
+                context=test_case.get("input", {}).get("context", {}),
+                response=service_response.get("response", ""),
+            )
+            composite_score = result.composite_score
+            reasoning = result.reasoning
+
+        elif category == "namespace_isolation":
+            context = test_case.get("input", {}).get("context", {})
+            result = await self.evaluate_namespace(
+                teacher_id=context.get("teacher_id", ""),
+                namespace=context.get("namespace", ""),
+                school_id=context.get("school_id", ""),
+                requested_data=test_case.get("input", {}).get("query", ""),
+                response=service_response.get("response", ""),
+            )
+            composite_score = result.composite_score
+            reasoning = result.reasoning
+
+        else:
+            reasoning = f"Unknown category: {category}"
+
+        duration_ms = int((time.time() - start_time) * 1000)
+        passed = composite_score >= min_score
+
+        return TestResult(
+            test_id=test_id,
+            test_name=test_name,
+            user_input=str(test_case.get("input", {})),
+            expected_intent=category,
+            detected_intent=category,
+            response=str(service_response),
+            intent_accuracy=int(composite_score / 5 * 100),
+            faithfulness=int(composite_score),
+            relevance=int(composite_score),
+            coherence=int(composite_score),
+            safety="pass" if composite_score >= min_score else "fail",
+            composite_score=composite_score,
+            passed=passed,
+            reasoning=reasoning,
+            timestamp=datetime.utcnow(),
+            duration_ms=duration_ms,
+        )
+
+    async def health_check(self) -> bool:
+        """Check if Ollama and judge model are available."""
+        try:
+            client = await self._get_client()
+            response = await client.get(f"{self.config.ollama_base_url}/api/tags")
+            if response.status_code != 200:
+                return False
+
+            models = response.json().get("models", [])
+            model_names = [m.get("name", "") for m in models]
+
+            for name in model_names:
+                if self.config.judge_model in name:
+                    return True
+
+            logger.warning(
+                "Judge model not found",
+                model=self.config.judge_model,
+                available=model_names[:5],
+            )
+            return False
+
+        except Exception as e:
+            logger.error("Health check failed", error=str(e))
+            return False
+
+    async def close(self):
+        """Close HTTP client."""
+        if self._client:
+            await self._client.aclose()
+            self._client = None
@@ -0,0 +1,340 @@
+"""
+Regression Tracker
+Tracks test scores over time to detect quality regressions
+"""
+import sqlite3
+import json
+import subprocess
+import structlog
+from datetime import datetime, timedelta
+from typing import List, Optional, Tuple, Dict, Any
+from dataclasses import dataclass, asdict
+from pathlib import Path
+
+from bqas.config import BQASConfig
+from bqas.metrics import BQASMetrics
+
+logger = structlog.get_logger(__name__)
+
+
+@dataclass
+class TestRun:
+    """Record of a single test run."""
+    id: Optional[int] = None
+    timestamp: datetime = None
+    git_commit: str = ""
+    git_branch: str = ""
+    golden_score: float = 0.0
+    synthetic_score: float = 0.0
+    total_tests: int = 0
+    passed_tests: int = 0
+    failed_tests: int = 0
+    failures: List[str] = None
+    duration_seconds: float = 0.0
+    metadata: Dict[str, Any] = None
+
+    def __post_init__(self):
+        if self.timestamp is None:
+            self.timestamp = datetime.utcnow()
+        if self.failures is None:
+            self.failures = []
+        if self.metadata is None:
+            self.metadata = {}
+
+
+class RegressionTracker:
+    """
+    Tracks BQAS test scores over time.
+
+    Features:
+    - SQLite persistence
+    - Regression detection
+    - Trend analysis
+    - Alerting
+    """
+
+    def __init__(self, config: Optional[BQASConfig] = None):
+        self.config = config or BQASConfig.from_env()
+        self.db_path = Path(self.config.db_path)
+        self._init_db()
+
+    def _init_db(self):
+        """Initialize SQLite database."""
+        conn = sqlite3.connect(self.db_path)
+        cursor = conn.cursor()
+
+        cursor.execute("""
+            CREATE TABLE IF NOT EXISTS test_runs (
+                id INTEGER PRIMARY KEY AUTOINCREMENT,
+                timestamp TEXT NOT NULL,
+                git_commit TEXT,
+                git_branch TEXT,
+                golden_score REAL,
+                synthetic_score REAL,
+                total_tests INTEGER,
+                passed_tests INTEGER,
+                failed_tests INTEGER,
+                failures TEXT,
+                duration_seconds REAL,
+                metadata TEXT
+            )
+        """)
+
+        cursor.execute("""
+            CREATE INDEX IF NOT EXISTS idx_timestamp
+            ON test_runs(timestamp)
+        """)
+
+        conn.commit()
+        conn.close()
+
+    def _get_git_info(self) -> Tuple[str, str]:
+        """Get current git commit and branch."""
+        try:
+            commit = subprocess.check_output(
+                ["git", "rev-parse", "HEAD"],
+                stderr=subprocess.DEVNULL,
+            ).decode().strip()[:8]
+
+            branch = subprocess.check_output(
+                ["git", "rev-parse", "--abbrev-ref", "HEAD"],
+                stderr=subprocess.DEVNULL,
+            ).decode().strip()
+
+            return commit, branch
+        except Exception:
+            return "unknown", "unknown"
+
+    def record_run(self, metrics: BQASMetrics, synthetic_score: float = 0.0) -> TestRun:
+        """
+        Record a test run.
+
+        Args:
+            metrics: Aggregated metrics from the test run
+            synthetic_score: Optional synthetic test score
+
+        Returns:
+            Recorded TestRun
+        """
+        git_commit, git_branch = self._get_git_info()
+
+        run = TestRun(
+            timestamp=metrics.timestamp,
+            git_commit=git_commit,
+            git_branch=git_branch,
+            golden_score=metrics.avg_composite_score,
+            synthetic_score=synthetic_score,
+            total_tests=metrics.total_tests,
+            passed_tests=metrics.passed_tests,
+            failed_tests=metrics.failed_tests,
+            failures=metrics.failed_test_ids,
+            duration_seconds=metrics.total_duration_ms / 1000,
+            metadata={"scores_by_intent": metrics.scores_by_intent},
+        )
+
+        conn = sqlite3.connect(self.db_path)
+        cursor = conn.cursor()
+
+        cursor.execute("""
+            INSERT INTO test_runs (
+                timestamp, git_commit, git_branch, golden_score,
+                synthetic_score, total_tests, passed_tests, failed_tests,
+                failures, duration_seconds, metadata
+            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+        """, (
+            run.timestamp.isoformat(),
+            run.git_commit,
+            run.git_branch,
+            run.golden_score,
+            run.synthetic_score,
+            run.total_tests,
+            run.passed_tests,
+            run.failed_tests,
+            json.dumps(run.failures),
+            run.duration_seconds,
+            json.dumps(run.metadata),
+        ))
+
+        run.id = cursor.lastrowid
+        conn.commit()
+        conn.close()
+
+        logger.info(
+            "Test run recorded",
+            run_id=run.id,
+            score=run.golden_score,
+            passed=run.passed_tests,
+            failed=run.failed_tests,
+        )
+
+        return run
+
+    def get_last_runs(self, n: int = 5) -> List[TestRun]:
+        """Get the last N test runs."""
+        conn = sqlite3.connect(self.db_path)
+        cursor = conn.cursor()
+
+        cursor.execute("""
+            SELECT id, timestamp, git_commit, git_branch, golden_score,
+                   synthetic_score, total_tests, passed_tests, failed_tests,
+                   failures, duration_seconds, metadata
+            FROM test_runs
+            ORDER BY timestamp DESC
+            LIMIT ?
+        """, (n,))
+
+        runs = []
+        for row in cursor.fetchall():
+            runs.append(TestRun(
+                id=row[0],
+                timestamp=datetime.fromisoformat(row[1]),
+                git_commit=row[2],
+                git_branch=row[3],
+                golden_score=row[4],
+                synthetic_score=row[5],
+                total_tests=row[6],
+                passed_tests=row[7],
+                failed_tests=row[8],
+                failures=json.loads(row[9]) if row[9] else [],
+                duration_seconds=row[10],
+                metadata=json.loads(row[11]) if row[11] else {},
+            ))
+
+        conn.close()
+        return runs
+
+    def get_runs_since(self, days: int = 30) -> List[TestRun]:
+        """Get all runs in the last N days."""
+        since = datetime.utcnow() - timedelta(days=days)
+
+        conn = sqlite3.connect(self.db_path)
+        cursor = conn.cursor()
+
+        cursor.execute("""
+            SELECT id, timestamp, git_commit, git_branch, golden_score,
+                   synthetic_score, total_tests, passed_tests, failed_tests,
+                   failures, duration_seconds, metadata
+            FROM test_runs
+            WHERE timestamp >= ?
+            ORDER BY timestamp ASC
+        """, (since.isoformat(),))
+
+        runs = []
+        for row in cursor.fetchall():
+            runs.append(TestRun(
+                id=row[0],
+                timestamp=datetime.fromisoformat(row[1]),
+                git_commit=row[2],
+                git_branch=row[3],
+                golden_score=row[4],
+                synthetic_score=row[5],
+                total_tests=row[6],
+                passed_tests=row[7],
+                failed_tests=row[8],
+                failures=json.loads(row[9]) if row[9] else [],
+                duration_seconds=row[10],
+                metadata=json.loads(row[11]) if row[11] else {},
+            ))
+
+        conn.close()
+        return runs
+
+    def check_regression(
+        self,
+        current_score: float,
+        threshold: Optional[float] = None,
+    ) -> Tuple[bool, float, str]:
+        """
+        Check if current score indicates a regression.
+
+        Args:
+            current_score: Current test run score
+            threshold: Optional threshold override
+
+        Returns:
+            (is_regression, delta, message)
+        """
+        threshold = threshold or self.config.regression_threshold
+        last_runs = self.get_last_runs(n=5)
+
+        if len(last_runs) < 2:
+            return False, 0.0, "Not enough historical data"
+
+        # Calculate average of last runs
+        avg_score = sum(r.golden_score for r in last_runs) / len(last_runs)
+        delta = avg_score - current_score
+
+        if delta > threshold:
+            msg = f"Regression detected: score dropped from {avg_score:.3f} to {current_score:.3f} (delta: {delta:.3f})"
+            logger.warning(msg)
+            return True, delta, msg
+
+        return False, delta, f"Score stable: {current_score:.3f} (avg: {avg_score:.3f}, delta: {delta:.3f})"
+
+    def get_trend(self, days: int = 30) -> Dict[str, Any]:
+        """
+        Get score trend for the last N days.
+
+        Returns:
+            Dictionary with dates, scores, and trend direction
+        """
+        runs = self.get_runs_since(days)
+
+        if not runs:
+            return {
+                "dates": [],
+                "scores": [],
+                "trend": "unknown",
+                "avg_score": 0.0,
+            }
+
+        dates = [r.timestamp.isoformat() for r in runs]
+        scores = [r.golden_score for r in runs]
+        avg_score = sum(scores) / len(scores)
+
+        # Determine trend
+        if len(scores) >= 3:
+            recent = scores[-3:]
+            older = scores[:3]
+            recent_avg = sum(recent) / len(recent)
+            older_avg = sum(older) / len(older)
+
+            if recent_avg > older_avg + 0.05:
+                trend = "improving"
+            elif recent_avg < older_avg - 0.05:
+                trend = "declining"
+            else:
+                trend = "stable"
+        else:
+            trend = "insufficient_data"
+
+        return {
+            "dates": dates,
+            "scores": scores,
+            "trend": trend,
+            "avg_score": round(avg_score, 3),
+            "min_score": round(min(scores), 3),
+            "max_score": round(max(scores), 3),
+        }
+
+    def get_failing_intents(self, n: int = 5) -> Dict[str, float]:
+        """Get intents with lowest scores from recent runs."""
+        runs = self.get_last_runs(n)
+
+        intent_scores: Dict[str, List[float]] = {}
+
+        for run in runs:
+            if "scores_by_intent" in run.metadata:
+                for intent, score in run.metadata["scores_by_intent"].items():
+                    if intent not in intent_scores:
+                        intent_scores[intent] = []
+                    intent_scores[intent].append(score)
+
+        # Calculate averages and sort
+        avg_scores = {
+            intent: sum(scores) / len(scores)
+            for intent, scores in intent_scores.items()
+        }
+
+        # Return sorted from worst to best
+        return dict(sorted(avg_scores.items(), key=lambda x: x[1]))
@@ -0,0 +1,529 @@
+"""
+BQAS Test Runner - Executes Golden, RAG, and Synthetic test suites
+"""
+import yaml
+import asyncio
+import structlog
+import httpx
+from pathlib import Path
+from typing import List, Dict, Any, Optional
+from datetime import datetime
+from dataclasses import dataclass, field
+
+from bqas.config import BQASConfig
+from bqas.judge import LLMJudge
+from bqas.rag_judge import RAGJudge
+from bqas.metrics import TestResult, BQASMetrics
+from bqas.synthetic_generator import SyntheticGenerator
+
+logger = structlog.get_logger(__name__)
+
+
+@dataclass
+class TestRun:
+    """Record of a complete test run."""
+    id: int
+    suite: str  # golden, rag, synthetic
+    timestamp: datetime
+    git_commit: Optional[str]
+    metrics: BQASMetrics
+    results: List[TestResult]
+    duration_seconds: float
+
+
+class BQASRunner:
+    """
+    Main test runner for BQAS test suites.
+
+    Executes:
+    - Golden Suite: Pre-defined golden test cases from YAML
+    - RAG Suite: RAG/Correction quality tests
+    - Synthetic Suite: LLM-generated test variations
+    """
+
+    def __init__(self, config: Optional[BQASConfig] = None):
+        self.config = config or BQASConfig.from_env()
+        self.judge = LLMJudge(self.config)
+        self.rag_judge = RAGJudge(self.config)
+        self.synthetic_generator = SyntheticGenerator(self.config)
+        self._http_client: Optional[httpx.AsyncClient] = None
+        self._test_runs: List[TestRun] = []
+        self._run_counter = 0
+
+    async def _get_client(self) -> httpx.AsyncClient:
+        """Get or create HTTP client for voice service calls."""
+        if self._http_client is None:
+            self._http_client = httpx.AsyncClient(timeout=30.0)
+        return self._http_client
+
+    # ================================
+    # Golden Suite Runner
+    # ================================
+
+    async def run_golden_suite(self, git_commit: Optional[str] = None) -> TestRun:
+        """
+        Run the golden test suite.
+
+        Loads test cases from YAML files and evaluates each one.
+        """
+        logger.info("Starting Golden Suite run")
+        start_time = datetime.utcnow()
+
+        # Load all golden test cases
+        test_cases = await self._load_golden_tests()
+        logger.info(f"Loaded {len(test_cases)} golden test cases")
+
+        # Run all tests
+        results = []
+        for i, test_case in enumerate(test_cases):
+            try:
+                result = await self._run_golden_test(test_case)
+                results.append(result)
+
+                if (i + 1) % 10 == 0:
+                    logger.info(f"Progress: {i + 1}/{len(test_cases)} tests completed")
+
+            except Exception as e:
+                logger.error(f"Test {test_case.get('id')} failed with error", error=str(e))
+                # Create a failed result
+                results.append(self._create_error_result(test_case, str(e)))
+
+        # Calculate metrics
+        metrics = BQASMetrics.from_results(results)
+        duration = (datetime.utcnow() - start_time).total_seconds()
+
+        # Record run
+        self._run_counter += 1
+        run = TestRun(
+            id=self._run_counter,
+            suite="golden",
+            timestamp=start_time,
+            git_commit=git_commit,
+            metrics=metrics,
+            results=results,
+            duration_seconds=duration,
+        )
+        self._test_runs.insert(0, run)
+
+        logger.info(
+            "Golden Suite completed",
+            total=metrics.total_tests,
+            passed=metrics.passed_tests,
+            failed=metrics.failed_tests,
+            score=metrics.avg_composite_score,
+            duration=f"{duration:.1f}s",
+        )
+
+        return run
+
+    async def _load_golden_tests(self) -> List[Dict[str, Any]]:
+        """Load all golden test cases from YAML files."""
+        tests = []
+        golden_dir = Path(__file__).parent.parent / "tests" / "bqas" / "golden_tests"
+
+        yaml_files = [
+            "intent_tests.yaml",
+            "edge_cases.yaml",
+            "workflow_tests.yaml",
+        ]
+
+        for filename in yaml_files:
+            filepath = golden_dir / filename
+            if filepath.exists():
+                try:
+                    with open(filepath, 'r', encoding='utf-8') as f:
+                        data = yaml.safe_load(f)
+                        if data and 'tests' in data:
+                            for test in data['tests']:
+                                test['source_file'] = filename
+                            tests.extend(data['tests'])
+                except Exception as e:
+                    logger.warning(f"Failed to load {filename}", error=str(e))
+
+        return tests
+
+    async def _run_golden_test(self, test_case: Dict[str, Any]) -> TestResult:
+        """Run a single golden test case."""
+        test_id = test_case.get('id', 'UNKNOWN')
+        test_name = test_case.get('name', '')
+        user_input = test_case.get('input', '')
+        expected_intent = test_case.get('expected_intent', '')
+        min_score = test_case.get('min_score', self.config.min_golden_score)
+
+        # Get response from voice service (or simulate)
+        detected_intent, response = await self._get_voice_response(user_input, expected_intent)
+
+        # Evaluate with judge
+        result = await self.judge.evaluate_test_case(
+            test_id=test_id,
+            test_name=test_name,
+            user_input=user_input,
+            expected_intent=expected_intent,
+            detected_intent=detected_intent,
+            response=response,
+            min_score=min_score,
+        )
+
+        return result
+
+    async def _get_voice_response(
+        self,
+        user_input: str,
+        expected_intent: str
+    ) -> tuple[str, str]:
+        """
+        Get response from voice service.
+
+        For now, simulates responses since the full voice pipeline
+        might not be available. In production, this would call the
+        actual voice service endpoints.
+        """
+        try:
+            client = await self._get_client()
+
+            # Try to call the voice service intent detection
+            response = await client.post(
+                f"{self.config.voice_service_url}/api/v1/tasks",
+                json={
+                    "type": "intent_detection",
+                    "input": user_input,
+                    "namespace_id": "test_namespace",
+                },
+                timeout=10.0,
+            )
+
+            if response.status_code == 200:
+                data = response.json()
+                return data.get('detected_intent', expected_intent), data.get('response', f"Verarbeite: {user_input}")
+
+        except Exception as e:
+            logger.debug(f"Voice service call failed, using simulation", error=str(e))
+
+        # Simulate response based on expected intent
+        return self._simulate_response(user_input, expected_intent)
+
+    def _simulate_response(self, user_input: str, expected_intent: str) -> tuple[str, str]:
+        """Simulate voice service response for testing without live service."""
+        # Simulate realistic detected intent (90% correct for golden tests)
+        import random
+        if random.random() < 0.90:
+            detected_intent = expected_intent
+        else:
+            # Simulate occasional misclassification
+            intents = ["student_observation", "reminder", "worksheet_generate", "parent_letter", "smalltalk"]
+            detected_intent = random.choice([i for i in intents if i != expected_intent])
+
+        # Generate simulated response
+        responses = {
+            "student_observation": f"Notiz wurde gespeichert: {user_input}",
+            "reminder": f"Erinnerung erstellt: {user_input}",
+            "worksheet_generate": f"Arbeitsblatt wird generiert basierend auf: {user_input}",
+            "homework_check": f"Hausaufgabenkontrolle eingetragen: {user_input}",
+            "parent_letter": f"Elternbrief-Entwurf erstellt: {user_input}",
+            "class_message": f"Nachricht an Klasse vorbereitet: {user_input}",
+            "quiz_generate": f"Quiz wird erstellt: {user_input}",
+            "quick_activity": f"Einstiegsaktivitaet geplant: {user_input}",
+            "canvas_edit": f"Aenderung am Canvas wird ausgefuehrt: {user_input}",
+            "canvas_layout": f"Layout wird angepasst: {user_input}",
+            "operator_checklist": f"Operatoren-Checkliste geladen: {user_input}",
+            "eh_passage": f"EH-Passage gefunden: {user_input}",
+            "feedback_suggest": f"Feedback-Vorschlag: {user_input}",
+            "reminder_schedule": f"Erinnerung geplant: {user_input}",
+            "task_summary": f"Aufgabenuebersicht: {user_input}",
+            "conference_topic": f"Konferenzthema notiert: {user_input}",
+            "correction_note": f"Korrekturnotiz gespeichert: {user_input}",
+            "worksheet_differentiate": f"Differenzierung wird erstellt: {user_input}",
+        }
+
+        response = responses.get(detected_intent, f"Verstanden: {user_input}")
+        return detected_intent, response
+
+    def _create_error_result(self, test_case: Dict[str, Any], error: str) -> TestResult:
+        """Create a failed test result due to error."""
+        return TestResult(
+            test_id=test_case.get('id', 'UNKNOWN'),
+            test_name=test_case.get('name', 'Error'),
+            user_input=test_case.get('input', ''),
+            expected_intent=test_case.get('expected_intent', ''),
+            detected_intent='error',
+            response='',
+            intent_accuracy=0,
+            faithfulness=1,
+            relevance=1,
+            coherence=1,
+            safety='fail',
+            composite_score=0.0,
+            passed=False,
+            reasoning=f"Test execution error: {error}",
+            timestamp=datetime.utcnow(),
+            duration_ms=0,
+        )
+
+    # ================================
+    # RAG Suite Runner
+    # ================================
+
+    async def run_rag_suite(self, git_commit: Optional[str] = None) -> TestRun:
+        """
+        Run the RAG/Correction test suite.
+
+        Tests EH retrieval, operator alignment, hallucination control, etc.
+        """
+        logger.info("Starting RAG Suite run")
+        start_time = datetime.utcnow()
+
+        # Load RAG test cases
+        test_cases = await self._load_rag_tests()
+        logger.info(f"Loaded {len(test_cases)} RAG test cases")
+
+        # Run all tests
+        results = []
+        for i, test_case in enumerate(test_cases):
+            try:
+                result = await self._run_rag_test(test_case)
+                results.append(result)
+
+                if (i + 1) % 5 == 0:
+                    logger.info(f"Progress: {i + 1}/{len(test_cases)} RAG tests completed")
+
+            except Exception as e:
+                logger.error(f"RAG test {test_case.get('id')} failed", error=str(e))
+                results.append(self._create_error_result(test_case, str(e)))
+
+        # Calculate metrics
+        metrics = BQASMetrics.from_results(results)
+        duration = (datetime.utcnow() - start_time).total_seconds()
+
+        # Record run
+        self._run_counter += 1
+        run = TestRun(
+            id=self._run_counter,
+            suite="rag",
+            timestamp=start_time,
+            git_commit=git_commit,
+            metrics=metrics,
+            results=results,
+            duration_seconds=duration,
+        )
+        self._test_runs.insert(0, run)
+
+        logger.info(
+            "RAG Suite completed",
+            total=metrics.total_tests,
+            passed=metrics.passed_tests,
+            score=metrics.avg_composite_score,
+            duration=f"{duration:.1f}s",
+        )
+
+        return run
+
+    async def _load_rag_tests(self) -> List[Dict[str, Any]]:
+        """Load RAG test cases from YAML."""
+        tests = []
+        rag_file = Path(__file__).parent.parent / "tests" / "bqas" / "golden_tests" / "golden_rag_correction_v1.yaml"
+
+        if rag_file.exists():
+            try:
+                with open(rag_file, 'r', encoding='utf-8') as f:
+                    # Handle YAML documents separated by ---
+                    documents = list(yaml.safe_load_all(f))
+                    for doc in documents:
+                        if doc and 'tests' in doc:
+                            tests.extend(doc['tests'])
+                        if doc and 'edge_cases' in doc:
+                            tests.extend(doc['edge_cases'])
+            except Exception as e:
+                logger.warning(f"Failed to load RAG tests", error=str(e))
+
+        return tests
+
+    async def _run_rag_test(self, test_case: Dict[str, Any]) -> TestResult:
+        """Run a single RAG test case."""
+        # Simulate service response for RAG tests
+        service_response = await self._simulate_rag_response(test_case)
+
+        # Evaluate with RAG judge
+        result = await self.rag_judge.evaluate_rag_test_case(
+            test_case=test_case,
+            service_response=service_response,
+        )
+
+        return result
+
+    async def _simulate_rag_response(self, test_case: Dict[str, Any]) -> Dict[str, Any]:
+        """Simulate RAG service response."""
+        category = test_case.get('category', '')
+        input_data = test_case.get('input', {})
+        expected = test_case.get('expected', {})
+
+        # Simulate responses based on category
+        if category == 'eh_retrieval':
+            concepts = expected.get('must_contain_concepts', [])
+            passage = f"Der Erwartungshorizont sieht folgende Aspekte vor: {', '.join(concepts[:3])}. "
+            passage += "Diese muessen im Rahmen der Aufgabenbearbeitung beruecksichtigt werden."
+            return {
+                "passage": passage,
+                "source": "EH_Deutsch_Abitur_2024_NI.pdf",
+                "relevance_score": 0.85,
+            }
+
+        elif category == 'operator_alignment':
+            operator = input_data.get('operator', '')
+            afb = expected.get('afb_level', 'II')
+            actions = expected.get('expected_actions', [])
+            return {
+                "operator": operator,
+                "definition": f"'{operator}' gehoert zu Anforderungsbereich {afb}. Erwartete Handlungen: {', '.join(actions[:2])}.",
+                "afb_level": afb,
+            }
+
+        elif category == 'hallucination_control':
+            return {
+                "response": "Basierend auf den verfuegbaren Informationen kann ich folgendes feststellen...",
+                "grounded": True,
+            }
+
+        elif category == 'privacy_compliance':
+            return {
+                "response": "Die Arbeit zeigt folgende Merkmale... [anonymisiert]",
+                "contains_pii": False,
+            }
+
+        elif category == 'namespace_isolation':
+            return {
+                "response": "Zugriff nur auf Daten im eigenen Namespace.",
+                "namespace_violation": False,
+            }
+
+        return {"response": "Simulated response", "success": True}
+
+    # ================================
+    # Synthetic Suite Runner
+    # ================================
+
+    async def run_synthetic_suite(self, git_commit: Optional[str] = None) -> TestRun:
+        """
+        Run the synthetic test suite.
+
+        Generates test variations using LLM and evaluates them.
+        """
+        logger.info("Starting Synthetic Suite run")
+        start_time = datetime.utcnow()
+
+        # Generate synthetic tests
+        all_variations = await self.synthetic_generator.generate_all_intents(
+            count_per_intent=self.config.synthetic_count_per_intent
+        )
+
+        # Flatten variations
+        test_cases = []
+        for intent, variations in all_variations.items():
+            for i, v in enumerate(variations):
+                test_cases.append({
+                    'id': f"SYN-{intent.upper()[:4]}-{i+1:03d}",
+                    'name': f"Synthetic {intent} #{i+1}",
+                    'input': v.input,
+                    'expected_intent': v.expected_intent,
+                    'slots': v.slots,
+                    'source': v.source,
+                    'min_score': self.config.min_synthetic_score,
+                })
+
+        logger.info(f"Generated {len(test_cases)} synthetic test cases")
+
+        # Run all tests
+        results = []
+        for i, test_case in enumerate(test_cases):
+            try:
+                result = await self._run_golden_test(test_case)  # Same logic as golden
+                results.append(result)
+
+                if (i + 1) % 20 == 0:
+                    logger.info(f"Progress: {i + 1}/{len(test_cases)} synthetic tests completed")
+
+            except Exception as e:
+                logger.error(f"Synthetic test {test_case.get('id')} failed", error=str(e))
+                results.append(self._create_error_result(test_case, str(e)))
+
+        # Calculate metrics
+        metrics = BQASMetrics.from_results(results)
+        duration = (datetime.utcnow() - start_time).total_seconds()
+
+        # Record run
+        self._run_counter += 1
+        run = TestRun(
+            id=self._run_counter,
+            suite="synthetic",
+            timestamp=start_time,
+            git_commit=git_commit,
+            metrics=metrics,
+            results=results,
+            duration_seconds=duration,
+        )
+        self._test_runs.insert(0, run)
+
+        logger.info(
+            "Synthetic Suite completed",
+            total=metrics.total_tests,
+            passed=metrics.passed_tests,
+            score=metrics.avg_composite_score,
+            duration=f"{duration:.1f}s",
+        )
+
+        return run
+
+    # ================================
+    # Utility Methods
+    # ================================
+
+    def get_test_runs(self, limit: int = 20) -> List[TestRun]:
+        """Get recent test runs."""
+        return self._test_runs[:limit]
+
+    def get_latest_metrics(self) -> Dict[str, Optional[BQASMetrics]]:
+        """Get latest metrics for each suite."""
+        result = {"golden": None, "rag": None, "synthetic": None}
+
+        for run in self._test_runs:
+            if result[run.suite] is None:
+                result[run.suite] = run.metrics
+            if all(v is not None for v in result.values()):
+                break
+
+        return result
+
+    async def health_check(self) -> Dict[str, Any]:
+        """Check health of BQAS components."""
+        judge_ok = await self.judge.health_check()
+        rag_judge_ok = await self.rag_judge.health_check()
+
+        return {
+            "judge_available": judge_ok,
+            "rag_judge_available": rag_judge_ok,
+            "test_runs_count": len(self._test_runs),
+            "config": {
+                "ollama_url": self.config.ollama_base_url,
+                "judge_model": self.config.judge_model,
+            }
+        }
+
+    async def close(self):
+        """Cleanup resources."""
+        await self.judge.close()
+        await self.rag_judge.close()
+        await self.synthetic_generator.close()
+        if self._http_client:
+            await self._http_client.aclose()
+            self._http_client = None
+
+
+# Singleton instance for the API
+_runner_instance: Optional[BQASRunner] = None
+
+
+def get_runner() -> BQASRunner:
+    """Get or create the global BQASRunner instance."""
+    global _runner_instance
+    if _runner_instance is None:
+        _runner_instance = BQASRunner()
+    return _runner_instance
@@ -0,0 +1,301 @@
+"""
+Synthetic Test Generator
+Generates realistic teacher voice command variations using LLM
+"""
+import json
+import structlog
+import httpx
+from typing import List, Dict, Any, Optional
+from dataclasses import dataclass
+
+from bqas.config import BQASConfig
+from bqas.prompts import SYNTHETIC_GENERATION_PROMPT
+
+logger = structlog.get_logger(__name__)
+
+
+# Teacher speech patterns by intent
+TEACHER_PATTERNS = {
+    "student_observation": [
+        "Notiz zu {name}: {observation}",
+        "Kurze Bemerkung zu {name}, {observation}",
+        "{name} hat heute {observation}",
+        "Bitte merken: {name} - {observation}",
+        "Beobachtung {name}: {observation}",
+    ],
+    "reminder": [
+        "Erinner mich an {task}",
+        "Nicht vergessen: {task}",
+        "Reminder: {task}",
+        "Denk dran: {task}",
+    ],
+    "homework_check": [
+        "Hausaufgabe kontrollieren",
+        "{class_name} {subject} Hausaufgabe kontrollieren",
+        "HA Check {class_name}",
+        "Hausaufgaben {subject} pruefen",
+    ],
+    "worksheet_generate": [
+        "Mach mir ein Arbeitsblatt zu {topic}",
+        "Erstelle bitte {count} Aufgaben zu {topic}",
+        "Ich brauche ein Uebungsblatt fuer {topic}",
+        "Generiere Lueckentexte zu {topic}",
+        "Arbeitsblatt {topic} erstellen",
+    ],
+    "parent_letter": [
+        "Schreib einen Elternbrief wegen {reason}",
+        "Formuliere eine Nachricht an die Eltern von {name} zu {reason}",
+        "Ich brauche einen neutralen Brief an Eltern wegen {reason}",
+        "Elternbrief {reason}",
+    ],
+    "class_message": [
+        "Nachricht an {class_name}: {content}",
+        "Info an die Klasse {class_name}",
+        "Klassennachricht {class_name}",
+        "Mitteilung an {class_name}: {content}",
+    ],
+    "quiz_generate": [
+        "Vokabeltest erstellen",
+        "Quiz mit {count} Fragen",
+        "{duration} Minuten Test",
+        "Kurzer Test zu {topic}",
+    ],
+    "quick_activity": [
+        "{duration} Minuten Einstieg",
+        "Schnelle Aktivitaet {topic}",
+        "Warming Up {duration} Minuten",
+        "Einstiegsaufgabe",
+    ],
+    "canvas_edit": [
+        "Ueberschriften groesser",
+        "Bild {number} nach {direction}",
+        "Pfeil von {source} auf {target}",
+        "Kasten hinzufuegen",
+    ],
+    "canvas_layout": [
+        "Alles auf eine Seite",
+        "Drucklayout A4",
+        "Layout aendern",
+        "Seitenformat anpassen",
+    ],
+    "operator_checklist": [
+        "Operatoren-Checkliste fuer {task_type}",
+        "Welche Operatoren fuer {topic}",
+        "Zeig Operatoren",
+    ],
+    "eh_passage": [
+        "Erwartungshorizont zu {topic}",
+        "Was steht im EH zu {topic}",
+        "EH Passage suchen",
+    ],
+    "feedback_suggest": [
+        "Feedback vorschlagen",
+        "Formuliere Rueckmeldung",
+        "Wie formuliere ich Feedback zu {topic}",
+    ],
+    "reminder_schedule": [
+        "Erinner mich morgen an {task}",
+        "In {time_offset} erinnern: {task}",
+        "Naechste Woche: {task}",
+    ],
+    "task_summary": [
+        "Offene Aufgaben",
+        "Was steht noch an",
+        "Zusammenfassung",
+        "Diese Woche",
+    ],
+}
+
+
+@dataclass
+class SyntheticTest:
+    """A synthetically generated test case."""
+    input: str
+    expected_intent: str
+    slots: Dict[str, Any]
+    source: str = "synthetic"
+
+
+class SyntheticGenerator:
+    """
+    Generates realistic variations of teacher voice commands.
+
+    Uses LLM to create variations with:
+    - Different phrasings
+    - Optional typos
+    - Regional dialects
+    - Natural speech patterns
+    """
+
+    def __init__(self, config: Optional[BQASConfig] = None):
+        self.config = config or BQASConfig.from_env()
+        self._client: Optional[httpx.AsyncClient] = None
+
+    async def _get_client(self) -> httpx.AsyncClient:
+        """Get or create HTTP client."""
+        if self._client is None:
+            self._client = httpx.AsyncClient(timeout=self.config.judge_timeout)
+        return self._client
+
+    async def generate_variations(
+        self,
+        intent: str,
+        count: int = 10,
+        include_typos: bool = True,
+        include_dialect: bool = True,
+    ) -> List[SyntheticTest]:
+        """
+        Generate realistic variations for an intent.
+
+        Args:
+            intent: Target intent type
+            count: Number of variations to generate
+            include_typos: Include occasional typos
+            include_dialect: Include regional variants (Austrian, Swiss)
+
+        Returns:
+            List of SyntheticTest objects
+        """
+        patterns = TEACHER_PATTERNS.get(intent, [])
+        if not patterns:
+            logger.warning(f"No patterns for intent: {intent}")
+            return []
+
+        typo_instruction = "Fuege gelegentlich Tippfehler ein" if include_typos else "Keine Tippfehler"
+        dialect_instruction = "Beruecksichtige regionale Varianten (Oesterreich, Schweiz)" if include_dialect else "Nur Hochdeutsch"
+
+        prompt = SYNTHETIC_GENERATION_PROMPT.format(
+            count=count,
+            intent=intent,
+            patterns="\n".join(f"- {p}" for p in patterns),
+            typo_instruction=typo_instruction,
+            dialect_instruction=dialect_instruction,
+        )
+
+        client = await self._get_client()
+
+        try:
+            resp = await client.post(
+                f"{self.config.ollama_base_url}/api/generate",
+                json={
+                    "model": self.config.judge_model,
+                    "prompt": prompt,
+                    "stream": False,
+                    "options": {
+                        "temperature": 0.8,
+                        "num_predict": 2000,
+                    },
+                },
+            )
+            resp.raise_for_status()
+
+            result_text = resp.json().get("response", "")
+            return self._parse_variations(result_text, intent)
+
+        except Exception as e:
+            logger.error("Failed to generate variations", intent=intent, error=str(e))
+            # Return pattern-based fallbacks
+            return self._generate_fallback(intent, count)
+
+    def _parse_variations(self, text: str, intent: str) -> List[SyntheticTest]:
+        """Parse JSON variations from LLM response."""
+        try:
+            # Find JSON array in response
+            start = text.find("[")
+            end = text.rfind("]") + 1
+            if start >= 0 and end > start:
+                json_str = text[start:end]
+                data = json.loads(json_str)
+
+                return [
+                    SyntheticTest(
+                        input=item.get("input", ""),
+                        expected_intent=item.get("expected_intent", intent),
+                        slots=item.get("slots", {}),
+                        source="llm_generated",
+                    )
+                    for item in data
+                    if item.get("input")
+                ]
+        except (json.JSONDecodeError, TypeError) as e:
+            logger.warning("Failed to parse variations", error=str(e))
+
+        return []
+
+    def _generate_fallback(self, intent: str, count: int) -> List[SyntheticTest]:
+        """Generate simple variations from patterns."""
+        patterns = TEACHER_PATTERNS.get(intent, [])
+        if not patterns:
+            return []
+
+        # Sample slot values
+        sample_values = {
+            "name": ["Max", "Lisa", "Tim", "Anna", "Paul", "Emma"],
+            "observation": ["heute sehr aufmerksam", "braucht Hilfe", "war abgelenkt"],
+            "task": ["Hausaufgaben kontrollieren", "Elternbrief schreiben", "Test vorbereiten"],
+            "class_name": ["7a", "8b", "9c", "10d"],
+            "subject": ["Mathe", "Deutsch", "Englisch", "Physik"],
+            "topic": ["Bruchrechnung", "Vokabeln", "Grammatik", "Prozentrechnung"],
+            "count": ["3", "5", "10"],
+            "duration": ["10", "15", "20"],
+            "reason": ["fehlende Hausaufgaben", "wiederholte Stoerungen", "positives Verhalten"],
+            "content": ["Hausaufgaben bis Freitag", "Test naechste Woche"],
+        }
+
+        import random
+        results = []
+
+        for i in range(count):
+            pattern = patterns[i % len(patterns)]
+
+            # Fill in placeholders
+            filled = pattern
+            for key, values in sample_values.items():
+                placeholder = f"{{{key}}}"
+                if placeholder in filled:
+                    filled = filled.replace(placeholder, random.choice(values), 1)
+
+            # Extract filled slots
+            slots = {}
+            for key in sample_values:
+                if f"{{{key}}}" in pattern:
+                    # The value we used
+                    for val in sample_values[key]:
+                        if val in filled:
+                            slots[key] = val
+                            break
+
+            results.append(SyntheticTest(
+                input=filled,
+                expected_intent=intent,
+                slots=slots,
+                source="pattern_generated",
+            ))
+
+        return results
+
+    async def generate_all_intents(
+        self,
+        count_per_intent: int = 10,
+    ) -> Dict[str, List[SyntheticTest]]:
+        """Generate variations for all known intents."""
+        results = {}
+
+        for intent in TEACHER_PATTERNS.keys():
+            logger.info(f"Generating variations for intent: {intent}")
+            variations = await self.generate_variations(
+                intent=intent,
+                count=count_per_intent,
+                include_typos=self.config.include_typos,
+                include_dialect=self.config.include_dialect,
+            )
+            results[intent] = variations
+            logger.info(f"Generated {len(variations)} variations for {intent}")
+
+        return results
+
+    async def close(self):
+        """Close HTTP client."""
+        if self._client:
+            await self._client.aclose()
+            self._client = None