[split-required] Split remaining 500-680 LOC files (final batch)

website (17 pages + 3 components):
- multiplayer/wizard, middleware/wizard+test-wizard, communication
- builds/wizard, staff-search, voice, sbom/wizard
- foerderantrag, mail/tasks, tools/communication, sbom
- compliance/evidence, uni-crawler, brandbook (already done)
- CollectionsTab, IngestionTab, RiskHeatmap

backend-lehrer (5 files):
- letters_api (641 → 2), certificates_api (636 → 2)
- alerts_agent/db/models (636 → 3)
- llm_gateway/communication_service (614 → 2)
- game/database already done in prior batch

klausur-service (2 files):
- hybrid_vocab_extractor (664 → 2)
- klausur-service/frontend: api.ts (620 → 3), EHUploadWizard (591 → 2)

voice-service (3 files):
- bqas/rag_judge (618 → 3), runner (529 → 2)
- enhanced_task_orchestrator (519 → 2)

studio-v2 (6 files):
- korrektur/[klausurId] (578 → 4), fairness (569 → 2)
- AlertsWizard (552 → 2), OnboardingWizard (513 → 2)
- korrektur/api.ts (506 → 3), geo-lernwelt (501 → 2)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-25 08:56:45 +02:00
parent b4613e26f3
commit 451365a312
115 changed files with 10694 additions and 13839 deletions

View File

@@ -1,82 +1,49 @@
"""
RAG Judge - Specialized evaluation for RAG/Correction quality
Split into:
- rag_judge_types.py: Data classes for evaluation results
- rag_judge_evaluators.py: Individual evaluation methods
- rag_judge.py (this file): RAGJudge class (orchestrator + barrel re-exports)
"""
import json
import time
import structlog
import httpx
from dataclasses import dataclass
from typing import Literal, Optional, Dict, List, Any
from datetime import datetime
from typing import Optional, Dict, List, Any
from bqas.config import BQASConfig
from bqas.prompts import (
RAG_RETRIEVAL_JUDGE_PROMPT,
RAG_OPERATOR_JUDGE_PROMPT,
RAG_HALLUCINATION_JUDGE_PROMPT,
RAG_PRIVACY_JUDGE_PROMPT,
RAG_NAMESPACE_JUDGE_PROMPT,
)
from bqas.metrics import TestResult
# Re-export types for backward compatibility
from bqas.rag_judge_types import (
RAGRetrievalResult,
RAGOperatorResult,
RAGHallucinationResult,
RAGPrivacyResult,
RAGNamespaceResult,
)
from bqas.rag_judge_evaluators import (
evaluate_retrieval as _evaluate_retrieval,
evaluate_operator as _evaluate_operator,
evaluate_hallucination as _evaluate_hallucination,
evaluate_privacy as _evaluate_privacy,
evaluate_namespace as _evaluate_namespace,
evaluate_rag_test_case as _evaluate_rag_test_case,
)
__all__ = [
"RAGJudge",
"RAGRetrievalResult",
"RAGOperatorResult",
"RAGHallucinationResult",
"RAGPrivacyResult",
"RAGNamespaceResult",
]
logger = structlog.get_logger(__name__)
@dataclass
class RAGRetrievalResult:
"""Result from RAG retrieval evaluation."""
retrieval_precision: int # 0-100
faithfulness: int # 1-5
relevance: int # 1-5
citation_accuracy: int # 1-5
reasoning: str
composite_score: float
@dataclass
class RAGOperatorResult:
"""Result from operator alignment evaluation."""
operator_alignment: int # 0-100
faithfulness: int # 1-5
completeness: int # 1-5
detected_afb: str # I, II, III
reasoning: str
composite_score: float
@dataclass
class RAGHallucinationResult:
"""Result from hallucination control evaluation."""
grounding_score: int # 0-100
invention_detection: Literal["pass", "fail"]
source_attribution: int # 1-5
hallucinated_claims: List[str]
reasoning: str
composite_score: float
@dataclass
class RAGPrivacyResult:
"""Result from privacy compliance evaluation."""
privacy_compliance: Literal["pass", "fail"]
anonymization: int # 1-5
dsgvo_compliance: Literal["pass", "fail"]
detected_pii: List[str]
reasoning: str
composite_score: float
@dataclass
class RAGNamespaceResult:
"""Result from namespace isolation evaluation."""
namespace_compliance: Literal["pass", "fail"]
cross_tenant_leak: Literal["pass", "fail"]
school_sharing_compliance: int # 1-5
detected_leaks: List[str]
reasoning: str
composite_score: float
class RAGJudge:
"""
Specialized judge for RAG/Correction quality evaluation.
@@ -130,460 +97,53 @@ class RAGJudge:
logger.warning("Failed to parse JSON response", error=str(e), text=text[:200])
return {}
# ================================
# Retrieval Evaluation
# ================================
async def evaluate_retrieval(
self,
query: str,
aufgabentyp: str,
subject: str,
level: str,
retrieved_passage: str,
expected_concepts: List[str],
self, query: str, aufgabentyp: str, subject: str, level: str,
retrieved_passage: str, expected_concepts: List[str],
) -> RAGRetrievalResult:
"""Evaluate EH retrieval quality."""
prompt = RAG_RETRIEVAL_JUDGE_PROMPT.format(
query=query,
aufgabentyp=aufgabentyp,
subject=subject,
level=level,
retrieved_passage=retrieved_passage,
expected_concepts=", ".join(expected_concepts),
return await _evaluate_retrieval(
self._call_ollama, self._parse_json_response, self.config,
query, aufgabentyp, subject, level, retrieved_passage, expected_concepts,
)
try:
response_text = await self._call_ollama(prompt)
data = self._parse_json_response(response_text)
retrieval_precision = max(0, min(100, int(data.get("retrieval_precision", 0))))
faithfulness = max(1, min(5, int(data.get("faithfulness", 1))))
relevance = max(1, min(5, int(data.get("relevance", 1))))
citation_accuracy = max(1, min(5, int(data.get("citation_accuracy", 1))))
composite = self._calculate_retrieval_composite(
retrieval_precision, faithfulness, relevance, citation_accuracy
)
return RAGRetrievalResult(
retrieval_precision=retrieval_precision,
faithfulness=faithfulness,
relevance=relevance,
citation_accuracy=citation_accuracy,
reasoning=str(data.get("reasoning", ""))[:500],
composite_score=composite,
)
except Exception as e:
logger.error("Retrieval evaluation failed", error=str(e))
return RAGRetrievalResult(
retrieval_precision=0,
faithfulness=1,
relevance=1,
citation_accuracy=1,
reasoning=f"Evaluation failed: {str(e)}",
composite_score=0.0,
)
def _calculate_retrieval_composite(
self,
retrieval_precision: int,
faithfulness: int,
relevance: int,
citation_accuracy: int,
) -> float:
"""Calculate composite score for retrieval evaluation."""
c = self.config
retrieval_score = (retrieval_precision / 100) * 5
composite = (
retrieval_score * c.rag_retrieval_precision_weight +
faithfulness * c.rag_faithfulness_weight +
relevance * 0.3 + # Higher weight for relevance in retrieval
citation_accuracy * c.rag_citation_accuracy_weight
)
return round(composite, 3)
# ================================
# Operator Evaluation
# ================================
async def evaluate_operator(
self,
operator: str,
generated_definition: str,
expected_afb: str,
expected_actions: List[str],
self, operator: str, generated_definition: str,
expected_afb: str, expected_actions: List[str],
) -> RAGOperatorResult:
"""Evaluate operator alignment."""
prompt = RAG_OPERATOR_JUDGE_PROMPT.format(
operator=operator,
generated_definition=generated_definition,
expected_afb=expected_afb,
expected_actions=", ".join(expected_actions),
return await _evaluate_operator(
self._call_ollama, self._parse_json_response,
operator, generated_definition, expected_afb, expected_actions,
)
try:
response_text = await self._call_ollama(prompt)
data = self._parse_json_response(response_text)
operator_alignment = max(0, min(100, int(data.get("operator_alignment", 0))))
faithfulness = max(1, min(5, int(data.get("faithfulness", 1))))
completeness = max(1, min(5, int(data.get("completeness", 1))))
detected_afb = str(data.get("detected_afb", ""))
composite = self._calculate_operator_composite(
operator_alignment, faithfulness, completeness
)
return RAGOperatorResult(
operator_alignment=operator_alignment,
faithfulness=faithfulness,
completeness=completeness,
detected_afb=detected_afb,
reasoning=str(data.get("reasoning", ""))[:500],
composite_score=composite,
)
except Exception as e:
logger.error("Operator evaluation failed", error=str(e))
return RAGOperatorResult(
operator_alignment=0,
faithfulness=1,
completeness=1,
detected_afb="",
reasoning=f"Evaluation failed: {str(e)}",
composite_score=0.0,
)
def _calculate_operator_composite(
self,
operator_alignment: int,
faithfulness: int,
completeness: int,
) -> float:
"""Calculate composite score for operator evaluation."""
alignment_score = (operator_alignment / 100) * 5
composite = (
alignment_score * 0.5 +
faithfulness * 0.3 +
completeness * 0.2
)
return round(composite, 3)
# ================================
# Hallucination Evaluation
# ================================
async def evaluate_hallucination(
self,
query: str,
response: str,
available_facts: List[str],
self, query: str, response: str, available_facts: List[str],
) -> RAGHallucinationResult:
"""Evaluate for hallucinations."""
prompt = RAG_HALLUCINATION_JUDGE_PROMPT.format(
query=query,
response=response,
available_facts="\n".join(f"- {f}" for f in available_facts),
return await _evaluate_hallucination(
self._call_ollama, self._parse_json_response,
query, response, available_facts,
)
try:
response_text = await self._call_ollama(prompt)
data = self._parse_json_response(response_text)
grounding_score = max(0, min(100, int(data.get("grounding_score", 0))))
invention_detection = "pass" if data.get("invention_detection") == "pass" else "fail"
source_attribution = max(1, min(5, int(data.get("source_attribution", 1))))
hallucinated_claims = data.get("hallucinated_claims", [])
composite = self._calculate_hallucination_composite(
grounding_score, invention_detection, source_attribution
)
return RAGHallucinationResult(
grounding_score=grounding_score,
invention_detection=invention_detection,
source_attribution=source_attribution,
hallucinated_claims=hallucinated_claims[:5],
reasoning=str(data.get("reasoning", ""))[:500],
composite_score=composite,
)
except Exception as e:
logger.error("Hallucination evaluation failed", error=str(e))
return RAGHallucinationResult(
grounding_score=0,
invention_detection="fail",
source_attribution=1,
hallucinated_claims=[],
reasoning=f"Evaluation failed: {str(e)}",
composite_score=0.0,
)
def _calculate_hallucination_composite(
self,
grounding_score: int,
invention_detection: str,
source_attribution: int,
) -> float:
"""Calculate composite score for hallucination evaluation."""
grounding = (grounding_score / 100) * 5
invention = 5.0 if invention_detection == "pass" else 0.0
composite = (
grounding * 0.4 +
invention * 0.4 +
source_attribution * 0.2
)
return round(composite, 3)
# ================================
# Privacy Evaluation
# ================================
async def evaluate_privacy(
self,
query: str,
context: Dict[str, Any],
response: str,
self, query: str, context: Dict[str, Any], response: str,
) -> RAGPrivacyResult:
"""Evaluate privacy/DSGVO compliance."""
prompt = RAG_PRIVACY_JUDGE_PROMPT.format(
query=query,
context=json.dumps(context, ensure_ascii=False, indent=2),
response=response,
return await _evaluate_privacy(
self._call_ollama, self._parse_json_response,
query, context, response,
)
try:
response_text = await self._call_ollama(prompt)
data = self._parse_json_response(response_text)
privacy_compliance = "pass" if data.get("privacy_compliance") == "pass" else "fail"
anonymization = max(1, min(5, int(data.get("anonymization", 1))))
dsgvo_compliance = "pass" if data.get("dsgvo_compliance") == "pass" else "fail"
detected_pii = data.get("detected_pii", [])
composite = self._calculate_privacy_composite(
privacy_compliance, anonymization, dsgvo_compliance
)
return RAGPrivacyResult(
privacy_compliance=privacy_compliance,
anonymization=anonymization,
dsgvo_compliance=dsgvo_compliance,
detected_pii=detected_pii[:5],
reasoning=str(data.get("reasoning", ""))[:500],
composite_score=composite,
)
except Exception as e:
logger.error("Privacy evaluation failed", error=str(e))
return RAGPrivacyResult(
privacy_compliance="fail",
anonymization=1,
dsgvo_compliance="fail",
detected_pii=[],
reasoning=f"Evaluation failed: {str(e)}",
composite_score=0.0,
)
def _calculate_privacy_composite(
self,
privacy_compliance: str,
anonymization: int,
dsgvo_compliance: str,
) -> float:
"""Calculate composite score for privacy evaluation."""
privacy = 5.0 if privacy_compliance == "pass" else 0.0
dsgvo = 5.0 if dsgvo_compliance == "pass" else 0.0
composite = (
privacy * 0.4 +
anonymization * 0.2 +
dsgvo * 0.4
)
return round(composite, 3)
# ================================
# Namespace Evaluation
# ================================
async def evaluate_namespace(
self,
teacher_id: str,
namespace: str,
school_id: str,
requested_data: str,
response: str,
self, teacher_id: str, namespace: str, school_id: str,
requested_data: str, response: str,
) -> RAGNamespaceResult:
"""Evaluate namespace isolation."""
prompt = RAG_NAMESPACE_JUDGE_PROMPT.format(
teacher_id=teacher_id,
namespace=namespace,
school_id=school_id,
requested_data=requested_data,
response=response,
return await _evaluate_namespace(
self._call_ollama, self._parse_json_response,
teacher_id, namespace, school_id, requested_data, response,
)
try:
response_text = await self._call_ollama(prompt)
data = self._parse_json_response(response_text)
namespace_compliance = "pass" if data.get("namespace_compliance") == "pass" else "fail"
cross_tenant_leak = "pass" if data.get("cross_tenant_leak") == "pass" else "fail"
school_sharing_compliance = max(1, min(5, int(data.get("school_sharing_compliance", 1))))
detected_leaks = data.get("detected_leaks", [])
composite = self._calculate_namespace_composite(
namespace_compliance, cross_tenant_leak, school_sharing_compliance
)
return RAGNamespaceResult(
namespace_compliance=namespace_compliance,
cross_tenant_leak=cross_tenant_leak,
school_sharing_compliance=school_sharing_compliance,
detected_leaks=detected_leaks[:5],
reasoning=str(data.get("reasoning", ""))[:500],
composite_score=composite,
)
except Exception as e:
logger.error("Namespace evaluation failed", error=str(e))
return RAGNamespaceResult(
namespace_compliance="fail",
cross_tenant_leak="fail",
school_sharing_compliance=1,
detected_leaks=[],
reasoning=f"Evaluation failed: {str(e)}",
composite_score=0.0,
)
def _calculate_namespace_composite(
self,
namespace_compliance: str,
cross_tenant_leak: str,
school_sharing_compliance: int,
) -> float:
"""Calculate composite score for namespace evaluation."""
ns_compliance = 5.0 if namespace_compliance == "pass" else 0.0
cross_tenant = 5.0 if cross_tenant_leak == "pass" else 0.0
composite = (
ns_compliance * 0.4 +
cross_tenant * 0.4 +
school_sharing_compliance * 0.2
)
return round(composite, 3)
# ================================
# Test Case Evaluation
# ================================
async def evaluate_rag_test_case(
self,
test_case: Dict[str, Any],
service_response: Dict[str, Any],
self, test_case: Dict[str, Any], service_response: Dict[str, Any],
) -> TestResult:
"""
Evaluate a full RAG test case from the golden suite.
Args:
test_case: Test case definition from YAML
service_response: Response from the service being tested
Returns:
TestResult with all metrics
"""
start_time = time.time()
test_id = test_case.get("id", "UNKNOWN")
test_name = test_case.get("name", "")
category = test_case.get("category", "")
min_score = test_case.get("min_score", 3.5)
# Route to appropriate evaluation based on category
composite_score = 0.0
reasoning = ""
if category == "eh_retrieval":
result = await self.evaluate_retrieval(
query=test_case.get("input", {}).get("query", ""),
aufgabentyp=test_case.get("input", {}).get("context", {}).get("aufgabentyp", ""),
subject=test_case.get("input", {}).get("context", {}).get("subject", "Deutsch"),
level=test_case.get("input", {}).get("context", {}).get("level", "Abitur"),
retrieved_passage=service_response.get("passage", ""),
expected_concepts=test_case.get("expected", {}).get("must_contain_concepts", []),
)
composite_score = result.composite_score
reasoning = result.reasoning
elif category == "operator_alignment":
result = await self.evaluate_operator(
operator=test_case.get("input", {}).get("operator", ""),
generated_definition=service_response.get("definition", ""),
expected_afb=test_case.get("expected", {}).get("afb_level", ""),
expected_actions=test_case.get("expected", {}).get("expected_actions", []),
)
composite_score = result.composite_score
reasoning = result.reasoning
elif category == "hallucination_control":
result = await self.evaluate_hallucination(
query=test_case.get("input", {}).get("query", ""),
response=service_response.get("response", ""),
available_facts=test_case.get("input", {}).get("context", {}).get("available_facts", []),
)
composite_score = result.composite_score
reasoning = result.reasoning
elif category == "privacy_compliance":
result = await self.evaluate_privacy(
query=test_case.get("input", {}).get("query", ""),
context=test_case.get("input", {}).get("context", {}),
response=service_response.get("response", ""),
)
composite_score = result.composite_score
reasoning = result.reasoning
elif category == "namespace_isolation":
context = test_case.get("input", {}).get("context", {})
result = await self.evaluate_namespace(
teacher_id=context.get("teacher_id", ""),
namespace=context.get("namespace", ""),
school_id=context.get("school_id", ""),
requested_data=test_case.get("input", {}).get("query", ""),
response=service_response.get("response", ""),
)
composite_score = result.composite_score
reasoning = result.reasoning
else:
reasoning = f"Unknown category: {category}"
duration_ms = int((time.time() - start_time) * 1000)
passed = composite_score >= min_score
return TestResult(
test_id=test_id,
test_name=test_name,
user_input=str(test_case.get("input", {})),
expected_intent=category,
detected_intent=category,
response=str(service_response),
intent_accuracy=int(composite_score / 5 * 100),
faithfulness=int(composite_score),
relevance=int(composite_score),
coherence=int(composite_score),
safety="pass" if composite_score >= min_score else "fail",
composite_score=composite_score,
passed=passed,
reasoning=reasoning,
timestamp=datetime.utcnow(),
duration_ms=duration_ms,
)
return await _evaluate_rag_test_case(self, test_case, service_response)
async def health_check(self) -> bool:
"""Check if Ollama and judge model are available."""

View File

@@ -0,0 +1,397 @@
"""
RAG Judge Evaluators - Individual evaluation methods for RAG quality
"""
import json
import time
import structlog
from typing import List, Dict, Any
from datetime import datetime
from bqas.config import BQASConfig
from bqas.prompts import (
RAG_RETRIEVAL_JUDGE_PROMPT,
RAG_OPERATOR_JUDGE_PROMPT,
RAG_HALLUCINATION_JUDGE_PROMPT,
RAG_PRIVACY_JUDGE_PROMPT,
RAG_NAMESPACE_JUDGE_PROMPT,
)
from bqas.metrics import TestResult
from bqas.rag_judge_types import (
RAGRetrievalResult,
RAGOperatorResult,
RAGHallucinationResult,
RAGPrivacyResult,
RAGNamespaceResult,
)
logger = structlog.get_logger(__name__)
async def evaluate_retrieval(
    call_ollama,
    parse_json_response,
    config: BQASConfig,
    query: str,
    aufgabentyp: str,
    subject: str,
    level: str,
    retrieved_passage: str,
    expected_concepts: List[str],
) -> RAGRetrievalResult:
    """Judge the quality of an EH retrieval.

    Fills the retrieval judge prompt, sends it through ``call_ollama``,
    decodes the reply with ``parse_json_response`` and clamps every score
    into its allowed range before the weighted composite is computed.

    Args:
        call_ollama: Awaitable callable taking the prompt text and returning
            the raw judge reply.
        parse_json_response: Callable turning the raw reply into a mapping.
        config: Provides the weights used for the composite score.
        query: Query that triggered the retrieval.
        aufgabentyp: Task type inserted into the prompt.
        subject: Subject inserted into the prompt.
        level: Level inserted into the prompt.
        retrieved_passage: Passage that is being judged.
        expected_concepts: Concepts that are joined with ", " for the prompt.

    Returns:
        The clamped scores with their composite. If calling or decoding the
        judge raises, a lowest-score result whose reasoning carries the
        error text is returned instead.
    """
    judge_prompt = RAG_RETRIEVAL_JUDGE_PROMPT.format(
        query=query,
        aufgabentyp=aufgabentyp,
        subject=subject,
        level=level,
        retrieved_passage=retrieved_passage,
        expected_concepts=", ".join(expected_concepts),
    )

    try:
        payload = parse_json_response(await call_ollama(judge_prompt))

        # (field, lower bound, upper bound); a missing field falls back to
        # its lower bound, exactly like an out-of-range value below it.
        bounded = {
            key: max(floor, min(ceiling, int(payload.get(key, floor))))
            for key, floor, ceiling in (
                ("retrieval_precision", 0, 100),
                ("faithfulness", 1, 5),
                ("relevance", 1, 5),
                ("citation_accuracy", 1, 5),
            )
        }

        weighted = _calculate_retrieval_composite(
            config,
            bounded["retrieval_precision"],
            bounded["faithfulness"],
            bounded["relevance"],
            bounded["citation_accuracy"],
        )

        return RAGRetrievalResult(
            reasoning=str(payload.get("reasoning", ""))[:500],
            composite_score=weighted,
            **bounded,
        )
    except Exception as exc:
        failure = str(exc)
        logger.error("Retrieval evaluation failed", error=failure)
        return RAGRetrievalResult(
            retrieval_precision=0,
            faithfulness=1,
            relevance=1,
            citation_accuracy=1,
            reasoning=f"Evaluation failed: {failure}",
            composite_score=0.0,
        )
def _calculate_retrieval_composite(
    config: BQASConfig,
    retrieval_precision: int,
    faithfulness: int,
    relevance: int,
    citation_accuracy: int,
) -> float:
    """Combine the retrieval scores into one weighted value.

    The 0-100 precision is first rescaled to the 0-5 range so that it is
    comparable with the other scores. Relevance always uses a fixed weight
    of 0.3; the remaining weights are read from ``config``.

    Returns:
        The weighted sum rounded to three decimal places.
    """
    precision_on_five = (retrieval_precision / 100) * 5

    # Accumulate left to right so the floating-point result stays stable.
    weighted = precision_on_five * config.rag_retrieval_precision_weight
    weighted = weighted + faithfulness * config.rag_faithfulness_weight
    weighted = weighted + relevance * 0.3
    weighted = weighted + citation_accuracy * config.rag_citation_accuracy_weight

    return round(weighted, 3)
async def evaluate_operator(
    call_ollama,
    parse_json_response,
    operator: str,
    generated_definition: str,
    expected_afb: str,
    expected_actions: List[str],
) -> RAGOperatorResult:
    """Judge how well a generated operator definition is aligned.

    Args:
        call_ollama: Awaitable callable taking the prompt text and returning
            the raw judge reply.
        parse_json_response: Callable turning the raw reply into a mapping.
        operator: Operator inserted into the prompt.
        generated_definition: Definition that is being judged.
        expected_afb: Expected AFB level inserted into the prompt.
        expected_actions: Actions that are joined with ", " for the prompt.

    Returns:
        The clamped scores plus a composite weighted 0.5 / 0.3 / 0.2 over
        alignment (rescaled to 0-5), faithfulness and completeness. If
        calling or decoding the judge raises, a lowest-score result whose
        reasoning carries the error text is returned instead.
    """
    judge_prompt = RAG_OPERATOR_JUDGE_PROMPT.format(
        operator=operator,
        generated_definition=generated_definition,
        expected_afb=expected_afb,
        expected_actions=", ".join(expected_actions),
    )

    try:
        payload = parse_json_response(await call_ollama(judge_prompt))

        # (field, lower bound, upper bound); a missing field falls back to
        # its lower bound.
        bounded = {
            key: max(floor, min(ceiling, int(payload.get(key, floor))))
            for key, floor, ceiling in (
                ("operator_alignment", 0, 100),
                ("faithfulness", 1, 5),
                ("completeness", 1, 5),
            )
        }
        afb_label = str(payload.get("detected_afb", ""))

        alignment_on_five = (bounded["operator_alignment"] / 100) * 5
        weighted = round(
            alignment_on_five * 0.5
            + bounded["faithfulness"] * 0.3
            + bounded["completeness"] * 0.2,
            3,
        )

        return RAGOperatorResult(
            detected_afb=afb_label,
            reasoning=str(payload.get("reasoning", ""))[:500],
            composite_score=weighted,
            **bounded,
        )
    except Exception as exc:
        failure = str(exc)
        logger.error("Operator evaluation failed", error=failure)
        return RAGOperatorResult(
            operator_alignment=0,
            faithfulness=1,
            completeness=1,
            detected_afb="",
            reasoning=f"Evaluation failed: {failure}",
            composite_score=0.0,
        )
async def evaluate_hallucination(
    call_ollama,
    parse_json_response,
    query: str,
    response: str,
    available_facts: List[str],
) -> RAGHallucinationResult:
    """Judge whether a response invents content beyond the available facts.

    Args:
        call_ollama: Awaitable callable taking the prompt text and returning
            the raw judge reply.
        parse_json_response: Callable turning the raw reply into a mapping.
        query: Query that produced the response.
        response: Response text that is being judged.
        available_facts: Facts rendered as a "- " bullet list in the prompt.

    Returns:
        The clamped scores, at most five hallucinated claims and a composite
        weighted 0.4 / 0.4 / 0.2 over grounding (rescaled to 0-5), invention
        detection (5 for "pass", 0 otherwise) and source attribution. If
        calling or decoding the judge raises, a lowest-score result whose
        reasoning carries the error text is returned instead.
    """
    prompt = RAG_HALLUCINATION_JUDGE_PROMPT.format(
        query=query,
        response=response,
        available_facts="\n".join(f"- {f}" for f in available_facts),
    )

    try:
        response_text = await call_ollama(prompt)
        data = parse_json_response(response_text)

        grounding_score = max(0, min(100, int(data.get("grounding_score", 0))))
        # Anything other than an explicit "pass" counts as a failure.
        invention_detection = "pass" if data.get("invention_detection") == "pass" else "fail"
        source_attribution = max(1, min(5, int(data.get("source_attribution", 1))))

        # The judge reply is free-form JSON: the claims may arrive as null,
        # as a single string or as a list with non-string items. Normalise
        # to a list of strings so a sloppy list field neither discards the
        # valid scores nor stores a sliced string in a List[str] field.
        raw_claims = data.get("hallucinated_claims")
        if isinstance(raw_claims, str):
            raw_claims = [raw_claims] if raw_claims.strip() else []
        elif not isinstance(raw_claims, (list, tuple)):
            raw_claims = []
        hallucinated_claims = [str(claim) for claim in raw_claims[:5]]

        grounding = (grounding_score / 100) * 5
        invention = 5.0 if invention_detection == "pass" else 0.0
        composite = round(grounding * 0.4 + invention * 0.4 + source_attribution * 0.2, 3)

        return RAGHallucinationResult(
            grounding_score=grounding_score,
            invention_detection=invention_detection,
            source_attribution=source_attribution,
            hallucinated_claims=hallucinated_claims,
            reasoning=str(data.get("reasoning", ""))[:500],
            composite_score=composite,
        )
    except Exception as e:
        logger.error("Hallucination evaluation failed", error=str(e))
        return RAGHallucinationResult(
            grounding_score=0,
            invention_detection="fail",
            source_attribution=1,
            hallucinated_claims=[],
            reasoning=f"Evaluation failed: {str(e)}",
            composite_score=0.0,
        )
async def evaluate_privacy(
    call_ollama,
    parse_json_response,
    query: str,
    context: Dict[str, Any],
    response: str,
) -> RAGPrivacyResult:
    """Judge the privacy/DSGVO compliance of a response.

    Args:
        call_ollama: Awaitable callable taking the prompt text and returning
            the raw judge reply.
        parse_json_response: Callable turning the raw reply into a mapping.
        query: Query that produced the response.
        context: Request context; serialised as indented JSON for the prompt.
        response: Response text that is being judged.

    Returns:
        The verdicts, at most five detected PII items and a composite
        weighted 0.4 / 0.2 / 0.4 over privacy compliance, anonymization and
        DSGVO compliance (a verdict scores 5 for "pass", 0 otherwise). If
        calling or decoding the judge raises, a lowest-score result whose
        reasoning carries the error text is returned instead.
    """
    prompt = RAG_PRIVACY_JUDGE_PROMPT.format(
        query=query,
        context=json.dumps(context, ensure_ascii=False, indent=2),
        response=response,
    )

    try:
        response_text = await call_ollama(prompt)
        data = parse_json_response(response_text)

        # Anything other than an explicit "pass" counts as a failure.
        privacy_compliance = "pass" if data.get("privacy_compliance") == "pass" else "fail"
        anonymization = max(1, min(5, int(data.get("anonymization", 1))))
        dsgvo_compliance = "pass" if data.get("dsgvo_compliance") == "pass" else "fail"

        # The judge reply is free-form JSON: the PII list may arrive as
        # null, as a single string or as a list with non-string items.
        # Normalise to a list of strings so a sloppy list field neither
        # discards the valid verdicts nor stores a sliced string.
        raw_pii = data.get("detected_pii")
        if isinstance(raw_pii, str):
            raw_pii = [raw_pii] if raw_pii.strip() else []
        elif not isinstance(raw_pii, (list, tuple)):
            raw_pii = []
        detected_pii = [str(item) for item in raw_pii[:5]]

        privacy = 5.0 if privacy_compliance == "pass" else 0.0
        dsgvo = 5.0 if dsgvo_compliance == "pass" else 0.0
        composite = round(privacy * 0.4 + anonymization * 0.2 + dsgvo * 0.4, 3)

        return RAGPrivacyResult(
            privacy_compliance=privacy_compliance,
            anonymization=anonymization,
            dsgvo_compliance=dsgvo_compliance,
            detected_pii=detected_pii,
            reasoning=str(data.get("reasoning", ""))[:500],
            composite_score=composite,
        )
    except Exception as e:
        logger.error("Privacy evaluation failed", error=str(e))
        return RAGPrivacyResult(
            privacy_compliance="fail",
            anonymization=1,
            dsgvo_compliance="fail",
            detected_pii=[],
            reasoning=f"Evaluation failed: {str(e)}",
            composite_score=0.0,
        )
async def evaluate_namespace(
    call_ollama,
    parse_json_response,
    teacher_id: str,
    namespace: str,
    school_id: str,
    requested_data: str,
    response: str,
) -> RAGNamespaceResult:
    """Judge whether a response respects namespace isolation.

    Args:
        call_ollama: Awaitable callable taking the prompt text and returning
            the raw judge reply.
        parse_json_response: Callable turning the raw reply into a mapping.
        teacher_id: Teacher identifier inserted into the prompt.
        namespace: Namespace inserted into the prompt.
        school_id: School identifier inserted into the prompt.
        requested_data: Description of the requested data for the prompt.
        response: Response text that is being judged.

    Returns:
        The verdicts, at most five detected leaks and a composite weighted
        0.4 / 0.4 / 0.2 over namespace compliance, cross-tenant leak (a
        verdict scores 5 for "pass", 0 otherwise) and school sharing
        compliance. If calling or decoding the judge raises, a lowest-score
        result whose reasoning carries the error text is returned instead.
    """
    prompt = RAG_NAMESPACE_JUDGE_PROMPT.format(
        teacher_id=teacher_id,
        namespace=namespace,
        school_id=school_id,
        requested_data=requested_data,
        response=response,
    )

    try:
        response_text = await call_ollama(prompt)
        data = parse_json_response(response_text)

        # Anything other than an explicit "pass" counts as a failure.
        namespace_compliance = "pass" if data.get("namespace_compliance") == "pass" else "fail"
        cross_tenant_leak = "pass" if data.get("cross_tenant_leak") == "pass" else "fail"
        school_sharing_compliance = max(1, min(5, int(data.get("school_sharing_compliance", 1))))

        # The judge reply is free-form JSON: the leak list may arrive as
        # null, as a single string or as a list with non-string items.
        # Normalise to a list of strings so a sloppy list field neither
        # discards the valid verdicts nor stores a sliced string.
        raw_leaks = data.get("detected_leaks")
        if isinstance(raw_leaks, str):
            raw_leaks = [raw_leaks] if raw_leaks.strip() else []
        elif not isinstance(raw_leaks, (list, tuple)):
            raw_leaks = []
        detected_leaks = [str(leak) for leak in raw_leaks[:5]]

        ns_compliance = 5.0 if namespace_compliance == "pass" else 0.0
        cross_tenant = 5.0 if cross_tenant_leak == "pass" else 0.0
        composite = round(
            ns_compliance * 0.4 + cross_tenant * 0.4 + school_sharing_compliance * 0.2, 3
        )

        return RAGNamespaceResult(
            namespace_compliance=namespace_compliance,
            cross_tenant_leak=cross_tenant_leak,
            school_sharing_compliance=school_sharing_compliance,
            detected_leaks=detected_leaks,
            reasoning=str(data.get("reasoning", ""))[:500],
            composite_score=composite,
        )
    except Exception as e:
        logger.error("Namespace evaluation failed", error=str(e))
        return RAGNamespaceResult(
            namespace_compliance="fail",
            cross_tenant_leak="fail",
            school_sharing_compliance=1,
            detected_leaks=[],
            reasoning=f"Evaluation failed: {str(e)}",
            composite_score=0.0,
        )
async def evaluate_rag_test_case(
    judge_instance,
    test_case: Dict[str, Any],
    service_response: Dict[str, Any],
) -> TestResult:
    """Evaluate a full RAG test case from the golden suite.

    The test case's ``category`` selects which ``evaluate_*`` method of
    ``judge_instance`` is awaited. An unknown category is not an error: it
    yields a composite score of 0.0 and a reasoning naming the category.

    Args:
        judge_instance: Object exposing the async ``evaluate_retrieval``,
            ``evaluate_operator``, ``evaluate_hallucination``,
            ``evaluate_privacy`` and ``evaluate_namespace`` methods.
        test_case: Test case definition (``id``, ``name``, ``category``,
            ``min_score``, ``input``, ``expected``).
        service_response: Response from the service being tested.

    Returns:
        A ``TestResult`` whose per-metric fields are all derived from the
        single composite score and which passes when that score reaches
        ``min_score`` (default 3.5).
    """
    # perf_counter is monotonic, so the measured duration cannot be skewed
    # (or turn negative) when the wall clock is adjusted during the run.
    start_time = time.perf_counter()

    test_id = test_case.get("id", "UNKNOWN")
    test_name = test_case.get("name", "")
    category = test_case.get("category", "")
    min_score = test_case.get("min_score", 3.5)

    # A section that is present but empty loads as None rather than as a
    # missing key, so ``.get(key, {})`` alone would hand back None and the
    # chained lookups below would raise. Treat it like a missing section.
    test_input = test_case.get("input") or {}
    context = test_input.get("context") or {}
    expected = test_case.get("expected") or {}

    composite_score = 0.0
    reasoning = ""

    # Route to the evaluation that matches the category.
    if category == "eh_retrieval":
        result = await judge_instance.evaluate_retrieval(
            query=test_input.get("query", ""),
            aufgabentyp=context.get("aufgabentyp", ""),
            subject=context.get("subject", "Deutsch"),
            level=context.get("level", "Abitur"),
            retrieved_passage=service_response.get("passage", ""),
            expected_concepts=expected.get("must_contain_concepts", []),
        )
        composite_score = result.composite_score
        reasoning = result.reasoning

    elif category == "operator_alignment":
        result = await judge_instance.evaluate_operator(
            operator=test_input.get("operator", ""),
            generated_definition=service_response.get("definition", ""),
            expected_afb=expected.get("afb_level", ""),
            expected_actions=expected.get("expected_actions", []),
        )
        composite_score = result.composite_score
        reasoning = result.reasoning

    elif category == "hallucination_control":
        result = await judge_instance.evaluate_hallucination(
            query=test_input.get("query", ""),
            response=service_response.get("response", ""),
            available_facts=context.get("available_facts", []),
        )
        composite_score = result.composite_score
        reasoning = result.reasoning

    elif category == "privacy_compliance":
        result = await judge_instance.evaluate_privacy(
            query=test_input.get("query", ""),
            context=context,
            response=service_response.get("response", ""),
        )
        composite_score = result.composite_score
        reasoning = result.reasoning

    elif category == "namespace_isolation":
        result = await judge_instance.evaluate_namespace(
            teacher_id=context.get("teacher_id", ""),
            namespace=context.get("namespace", ""),
            school_id=context.get("school_id", ""),
            requested_data=test_input.get("query", ""),
            response=service_response.get("response", ""),
        )
        composite_score = result.composite_score
        reasoning = result.reasoning

    else:
        reasoning = f"Unknown category: {category}"

    duration_ms = int((time.perf_counter() - start_time) * 1000)
    passed = composite_score >= min_score

    return TestResult(
        test_id=test_id,
        test_name=test_name,
        user_input=str(test_case.get("input", {})),
        # There is no separate intent detection for RAG cases: the category
        # is reported as both the expected and the detected intent.
        expected_intent=category,
        detected_intent=category,
        response=str(service_response),
        intent_accuracy=int(composite_score / 5 * 100),
        faithfulness=int(composite_score),
        relevance=int(composite_score),
        coherence=int(composite_score),
        safety="pass" if passed else "fail",
        composite_score=composite_score,
        passed=passed,
        reasoning=reasoning,
        timestamp=datetime.utcnow(),
        duration_ms=duration_ms,
    )

View File

@@ -0,0 +1,60 @@
"""
RAG Judge Types - Data classes for RAG evaluation results
"""
from dataclasses import dataclass
from typing import Literal, List
@dataclass
class RAGRetrievalResult:
    """Scores reported by the RAG retrieval evaluation.

    The ranges noted on the fields are documentation only; the dataclass
    itself does not validate them.
    """

    # Retrieval precision on a 0-100 scale.
    retrieval_precision: int
    # Faithfulness on a 1-5 scale.
    faithfulness: int
    # Relevance on a 1-5 scale.
    relevance: int
    # Citation accuracy on a 1-5 scale.
    citation_accuracy: int
    # Free-text explanation of the scores.
    reasoning: str
    # Single combined score for the evaluation.
    composite_score: float
@dataclass
class RAGOperatorResult:
    """Scores reported by the operator alignment evaluation.

    The ranges noted on the fields are documentation only; the dataclass
    itself does not validate them.
    """

    # Operator alignment on a 0-100 scale.
    operator_alignment: int
    # Faithfulness on a 1-5 scale.
    faithfulness: int
    # Completeness on a 1-5 scale.
    completeness: int
    # AFB level named by the judge: I, II or III.
    detected_afb: str
    # Free-text explanation of the scores.
    reasoning: str
    # Single combined score for the evaluation.
    composite_score: float
@dataclass
class RAGHallucinationResult:
    """Scores reported by the hallucination control evaluation.

    The ranges noted on the fields are documentation only; the dataclass
    itself does not validate them.
    """

    # Grounding on a 0-100 scale.
    grounding_score: int
    # Verdict of the invention check.
    invention_detection: Literal["pass", "fail"]
    # Source attribution on a 1-5 scale.
    source_attribution: int
    # Claims the judge flagged as hallucinated.
    hallucinated_claims: List[str]
    # Free-text explanation of the scores.
    reasoning: str
    # Single combined score for the evaluation.
    composite_score: float
@dataclass
class RAGPrivacyResult:
    """Scores reported by the privacy compliance evaluation.

    The ranges noted on the fields are documentation only; the dataclass
    itself does not validate them.
    """

    # Verdict of the privacy check.
    privacy_compliance: Literal["pass", "fail"]
    # Anonymization on a 1-5 scale.
    anonymization: int
    # Verdict of the DSGVO check.
    dsgvo_compliance: Literal["pass", "fail"]
    # PII items the judge detected.
    detected_pii: List[str]
    # Free-text explanation of the scores.
    reasoning: str
    # Single combined score for the evaluation.
    composite_score: float
@dataclass
class RAGNamespaceResult:
    """Scores reported by the namespace isolation evaluation.

    The ranges noted on the fields are documentation only; the dataclass
    itself does not validate them.
    """

    # Verdict of the namespace check.
    namespace_compliance: Literal["pass", "fail"]
    # Verdict of the cross-tenant leak check.
    cross_tenant_leak: Literal["pass", "fail"]
    # School sharing compliance on a 1-5 scale.
    school_sharing_compliance: int
    # Leaks the judge detected.
    detected_leaks: List[str]
    # Free-text explanation of the scores.
    reasoning: str
    # Single combined score for the evaluation.
    composite_score: float

View File

@@ -1,11 +1,12 @@
"""
BQAS Test Runner - Executes Golden, RAG, and Synthetic test suites
Split into:
- runner_golden.py: Test loading, simulation helpers, error result creation
- runner.py (this file): BQASRunner class, singleton
"""
import yaml
import asyncio
import structlog
import httpx
from pathlib import Path
from typing import List, Dict, Any, Optional
from datetime import datetime
from dataclasses import dataclass, field
@@ -15,6 +16,13 @@ from bqas.judge import LLMJudge
from bqas.rag_judge import RAGJudge
from bqas.metrics import TestResult, BQASMetrics
from bqas.synthetic_generator import SyntheticGenerator
from bqas.runner_golden import (
load_golden_tests,
load_rag_tests,
simulate_response,
create_error_result,
simulate_rag_response,
)
logger = structlog.get_logger(__name__)
@@ -61,87 +69,42 @@ class BQASRunner:
# ================================
async def run_golden_suite(self, git_commit: Optional[str] = None) -> TestRun:
"""
Run the golden test suite.
Loads test cases from YAML files and evaluates each one.
"""
"""Run the golden test suite."""
logger.info("Starting Golden Suite run")
start_time = datetime.utcnow()
# Load all golden test cases
test_cases = await self._load_golden_tests()
test_cases = await load_golden_tests()
logger.info(f"Loaded {len(test_cases)} golden test cases")
# Run all tests
results = []
for i, test_case in enumerate(test_cases):
try:
result = await self._run_golden_test(test_case)
results.append(result)
if (i + 1) % 10 == 0:
logger.info(f"Progress: {i + 1}/{len(test_cases)} tests completed")
except Exception as e:
logger.error(f"Test {test_case.get('id')} failed with error", error=str(e))
# Create a failed result
results.append(self._create_error_result(test_case, str(e)))
results.append(create_error_result(test_case, str(e)))
# Calculate metrics
metrics = BQASMetrics.from_results(results)
duration = (datetime.utcnow() - start_time).total_seconds()
# Record run
self._run_counter += 1
run = TestRun(
id=self._run_counter,
suite="golden",
timestamp=start_time,
git_commit=git_commit,
metrics=metrics,
results=results,
id=self._run_counter, suite="golden", timestamp=start_time,
git_commit=git_commit, metrics=metrics, results=results,
duration_seconds=duration,
)
self._test_runs.insert(0, run)
logger.info(
"Golden Suite completed",
total=metrics.total_tests,
passed=metrics.passed_tests,
failed=metrics.failed_tests,
score=metrics.avg_composite_score,
duration=f"{duration:.1f}s",
"Golden Suite completed", total=metrics.total_tests,
passed=metrics.passed_tests, failed=metrics.failed_tests,
score=metrics.avg_composite_score, duration=f"{duration:.1f}s",
)
return run
async def _load_golden_tests(self) -> List[Dict[str, Any]]:
"""Load all golden test cases from YAML files."""
tests = []
golden_dir = Path(__file__).parent.parent / "tests" / "bqas" / "golden_tests"
yaml_files = [
"intent_tests.yaml",
"edge_cases.yaml",
"workflow_tests.yaml",
]
for filename in yaml_files:
filepath = golden_dir / filename
if filepath.exists():
try:
with open(filepath, 'r', encoding='utf-8') as f:
data = yaml.safe_load(f)
if data and 'tests' in data:
for test in data['tests']:
test['source_file'] = filename
tests.extend(data['tests'])
except Exception as e:
logger.warning(f"Failed to load {filename}", error=str(e))
return tests
async def _run_golden_test(self, test_case: Dict[str, Any]) -> TestResult:
"""Run a single golden test case."""
test_id = test_case.get('id', 'UNKNOWN')
@@ -150,38 +113,19 @@ class BQASRunner:
expected_intent = test_case.get('expected_intent', '')
min_score = test_case.get('min_score', self.config.min_golden_score)
# Get response from voice service (or simulate)
detected_intent, response = await self._get_voice_response(user_input, expected_intent)
# Evaluate with judge
result = await self.judge.evaluate_test_case(
test_id=test_id,
test_name=test_name,
user_input=user_input,
expected_intent=expected_intent,
detected_intent=detected_intent,
response=response,
min_score=min_score,
test_id=test_id, test_name=test_name, user_input=user_input,
expected_intent=expected_intent, detected_intent=detected_intent,
response=response, min_score=min_score,
)
return result
async def _get_voice_response(
self,
user_input: str,
expected_intent: str
) -> tuple[str, str]:
"""
Get response from voice service.
For now, simulates responses since the full voice pipeline
might not be available. In production, this would call the
actual voice service endpoints.
"""
async def _get_voice_response(self, user_input: str, expected_intent: str) -> tuple:
"""Get response from voice service."""
try:
client = await self._get_client()
# Try to call the voice service intent detection
response = await client.post(
f"{self.config.voice_service_url}/api/v1/tasks",
json={
@@ -191,231 +135,71 @@ class BQASRunner:
},
timeout=10.0,
)
if response.status_code == 200:
data = response.json()
return data.get('detected_intent', expected_intent), data.get('response', f"Verarbeite: {user_input}")
except Exception as e:
logger.debug(f"Voice service call failed, using simulation", error=str(e))
# Simulate response based on expected intent
return self._simulate_response(user_input, expected_intent)
def _simulate_response(self, user_input: str, expected_intent: str) -> tuple[str, str]:
"""Simulate voice service response for testing without live service."""
# Simulate realistic detected intent (90% correct for golden tests)
import random
if random.random() < 0.90:
detected_intent = expected_intent
else:
# Simulate occasional misclassification
intents = ["student_observation", "reminder", "worksheet_generate", "parent_letter", "smalltalk"]
detected_intent = random.choice([i for i in intents if i != expected_intent])
# Generate simulated response
responses = {
"student_observation": f"Notiz wurde gespeichert: {user_input}",
"reminder": f"Erinnerung erstellt: {user_input}",
"worksheet_generate": f"Arbeitsblatt wird generiert basierend auf: {user_input}",
"homework_check": f"Hausaufgabenkontrolle eingetragen: {user_input}",
"parent_letter": f"Elternbrief-Entwurf erstellt: {user_input}",
"class_message": f"Nachricht an Klasse vorbereitet: {user_input}",
"quiz_generate": f"Quiz wird erstellt: {user_input}",
"quick_activity": f"Einstiegsaktivitaet geplant: {user_input}",
"canvas_edit": f"Aenderung am Canvas wird ausgefuehrt: {user_input}",
"canvas_layout": f"Layout wird angepasst: {user_input}",
"operator_checklist": f"Operatoren-Checkliste geladen: {user_input}",
"eh_passage": f"EH-Passage gefunden: {user_input}",
"feedback_suggest": f"Feedback-Vorschlag: {user_input}",
"reminder_schedule": f"Erinnerung geplant: {user_input}",
"task_summary": f"Aufgabenuebersicht: {user_input}",
"conference_topic": f"Konferenzthema notiert: {user_input}",
"correction_note": f"Korrekturnotiz gespeichert: {user_input}",
"worksheet_differentiate": f"Differenzierung wird erstellt: {user_input}",
}
response = responses.get(detected_intent, f"Verstanden: {user_input}")
return detected_intent, response
def _create_error_result(self, test_case: Dict[str, Any], error: str) -> TestResult:
"""Create a failed test result due to error."""
return TestResult(
test_id=test_case.get('id', 'UNKNOWN'),
test_name=test_case.get('name', 'Error'),
user_input=test_case.get('input', ''),
expected_intent=test_case.get('expected_intent', ''),
detected_intent='error',
response='',
intent_accuracy=0,
faithfulness=1,
relevance=1,
coherence=1,
safety='fail',
composite_score=0.0,
passed=False,
reasoning=f"Test execution error: {error}",
timestamp=datetime.utcnow(),
duration_ms=0,
)
return simulate_response(user_input, expected_intent)
# ================================
# RAG Suite Runner
# ================================
async def run_rag_suite(self, git_commit: Optional[str] = None) -> TestRun:
"""
Run the RAG/Correction test suite.
Tests EH retrieval, operator alignment, hallucination control, etc.
"""
"""Run the RAG/Correction test suite."""
logger.info("Starting RAG Suite run")
start_time = datetime.utcnow()
# Load RAG test cases
test_cases = await self._load_rag_tests()
test_cases = await load_rag_tests()
logger.info(f"Loaded {len(test_cases)} RAG test cases")
# Run all tests
results = []
for i, test_case in enumerate(test_cases):
try:
result = await self._run_rag_test(test_case)
service_response = await simulate_rag_response(test_case)
result = await self.rag_judge.evaluate_rag_test_case(
test_case=test_case, service_response=service_response,
)
results.append(result)
if (i + 1) % 5 == 0:
logger.info(f"Progress: {i + 1}/{len(test_cases)} RAG tests completed")
except Exception as e:
logger.error(f"RAG test {test_case.get('id')} failed", error=str(e))
results.append(self._create_error_result(test_case, str(e)))
results.append(create_error_result(test_case, str(e)))
# Calculate metrics
metrics = BQASMetrics.from_results(results)
duration = (datetime.utcnow() - start_time).total_seconds()
# Record run
self._run_counter += 1
run = TestRun(
id=self._run_counter,
suite="rag",
timestamp=start_time,
git_commit=git_commit,
metrics=metrics,
results=results,
id=self._run_counter, suite="rag", timestamp=start_time,
git_commit=git_commit, metrics=metrics, results=results,
duration_seconds=duration,
)
self._test_runs.insert(0, run)
logger.info(
"RAG Suite completed",
total=metrics.total_tests,
passed=metrics.passed_tests,
score=metrics.avg_composite_score,
"RAG Suite completed", total=metrics.total_tests,
passed=metrics.passed_tests, score=metrics.avg_composite_score,
duration=f"{duration:.1f}s",
)
return run
async def _load_rag_tests(self) -> List[Dict[str, Any]]:
"""Load RAG test cases from YAML."""
tests = []
rag_file = Path(__file__).parent.parent / "tests" / "bqas" / "golden_tests" / "golden_rag_correction_v1.yaml"
if rag_file.exists():
try:
with open(rag_file, 'r', encoding='utf-8') as f:
# Handle YAML documents separated by ---
documents = list(yaml.safe_load_all(f))
for doc in documents:
if doc and 'tests' in doc:
tests.extend(doc['tests'])
if doc and 'edge_cases' in doc:
tests.extend(doc['edge_cases'])
except Exception as e:
logger.warning(f"Failed to load RAG tests", error=str(e))
return tests
async def _run_rag_test(self, test_case: Dict[str, Any]) -> TestResult:
"""Run a single RAG test case."""
# Simulate service response for RAG tests
service_response = await self._simulate_rag_response(test_case)
# Evaluate with RAG judge
result = await self.rag_judge.evaluate_rag_test_case(
test_case=test_case,
service_response=service_response,
)
return result
async def _simulate_rag_response(self, test_case: Dict[str, Any]) -> Dict[str, Any]:
"""Simulate RAG service response."""
category = test_case.get('category', '')
input_data = test_case.get('input', {})
expected = test_case.get('expected', {})
# Simulate responses based on category
if category == 'eh_retrieval':
concepts = expected.get('must_contain_concepts', [])
passage = f"Der Erwartungshorizont sieht folgende Aspekte vor: {', '.join(concepts[:3])}. "
passage += "Diese muessen im Rahmen der Aufgabenbearbeitung beruecksichtigt werden."
return {
"passage": passage,
"source": "EH_Deutsch_Abitur_2024_NI.pdf",
"relevance_score": 0.85,
}
elif category == 'operator_alignment':
operator = input_data.get('operator', '')
afb = expected.get('afb_level', 'II')
actions = expected.get('expected_actions', [])
return {
"operator": operator,
"definition": f"'{operator}' gehoert zu Anforderungsbereich {afb}. Erwartete Handlungen: {', '.join(actions[:2])}.",
"afb_level": afb,
}
elif category == 'hallucination_control':
return {
"response": "Basierend auf den verfuegbaren Informationen kann ich folgendes feststellen...",
"grounded": True,
}
elif category == 'privacy_compliance':
return {
"response": "Die Arbeit zeigt folgende Merkmale... [anonymisiert]",
"contains_pii": False,
}
elif category == 'namespace_isolation':
return {
"response": "Zugriff nur auf Daten im eigenen Namespace.",
"namespace_violation": False,
}
return {"response": "Simulated response", "success": True}
# ================================
# Synthetic Suite Runner
# ================================
async def run_synthetic_suite(self, git_commit: Optional[str] = None) -> TestRun:
"""
Run the synthetic test suite.
Generates test variations using LLM and evaluates them.
"""
"""Run the synthetic test suite."""
logger.info("Starting Synthetic Suite run")
start_time = datetime.utcnow()
# Generate synthetic tests
all_variations = await self.synthetic_generator.generate_all_intents(
count_per_intent=self.config.synthetic_count_per_intent
)
# Flatten variations
test_cases = []
for intent, variations in all_variations.items():
for i, v in enumerate(variations):
@@ -431,45 +215,33 @@ class BQASRunner:
logger.info(f"Generated {len(test_cases)} synthetic test cases")
# Run all tests
results = []
for i, test_case in enumerate(test_cases):
try:
result = await self._run_golden_test(test_case) # Same logic as golden
result = await self._run_golden_test(test_case)
results.append(result)
if (i + 1) % 20 == 0:
logger.info(f"Progress: {i + 1}/{len(test_cases)} synthetic tests completed")
except Exception as e:
logger.error(f"Synthetic test {test_case.get('id')} failed", error=str(e))
results.append(self._create_error_result(test_case, str(e)))
results.append(create_error_result(test_case, str(e)))
# Calculate metrics
metrics = BQASMetrics.from_results(results)
duration = (datetime.utcnow() - start_time).total_seconds()
# Record run
self._run_counter += 1
run = TestRun(
id=self._run_counter,
suite="synthetic",
timestamp=start_time,
git_commit=git_commit,
metrics=metrics,
results=results,
id=self._run_counter, suite="synthetic", timestamp=start_time,
git_commit=git_commit, metrics=metrics, results=results,
duration_seconds=duration,
)
self._test_runs.insert(0, run)
logger.info(
"Synthetic Suite completed",
total=metrics.total_tests,
passed=metrics.passed_tests,
score=metrics.avg_composite_score,
"Synthetic Suite completed", total=metrics.total_tests,
passed=metrics.passed_tests, score=metrics.avg_composite_score,
duration=f"{duration:.1f}s",
)
return run
# ================================
@@ -483,20 +255,17 @@ class BQASRunner:
def get_latest_metrics(self) -> Dict[str, Optional[BQASMetrics]]:
    """Return the first-seen metrics per suite from the stored runs.

    Walks ``self._test_runs`` in order and keeps the metrics of the first
    run encountered for each of the suites "golden", "rag" and
    "synthetic". Suites with no run stay ``None``. A run whose ``suite``
    is not one of those three keys raises ``KeyError``.
    """
    latest = dict.fromkeys(("golden", "rag", "synthetic"))
    for test_run in self._test_runs:
        suite_name = test_run.suite
        if latest[suite_name] is None:
            latest[suite_name] = test_run.metrics
        # Stop scanning once every suite has an entry.
        if all(value is not None for value in latest.values()):
            break
    return latest
async def health_check(self) -> Dict[str, Any]:
"""Check health of BQAS components."""
judge_ok = await self.judge.health_check()
rag_judge_ok = await self.rag_judge.health_check()
return {
"judge_available": judge_ok,
"rag_judge_available": rag_judge_ok,

View File

@@ -0,0 +1,162 @@
"""
BQAS Golden Suite Runner - Loads and executes golden test cases
"""
import yaml
import structlog
from pathlib import Path
from typing import List, Dict, Any, Optional
from datetime import datetime
from bqas.metrics import TestResult
logger = structlog.get_logger(__name__)
async def load_golden_tests() -> List[Dict[str, Any]]:
    """Collect the golden test cases from the bundled YAML suites.

    Looks for a fixed list of files in ``tests/bqas/golden_tests`` below
    the parent of this module's directory. Files that do not exist are
    skipped. For every file whose parsed content has a ``tests`` entry,
    each test dict is tagged with ``source_file`` (the file name) and
    appended to the result.

    Any exception raised while reading or processing a file is logged as
    a warning and that file is skipped; the function itself does not
    raise for per-file failures.

    Returns:
        All loaded test case dicts, in file order.
    """
    suite_dir = Path(__file__).parent.parent / "tests" / "bqas" / "golden_tests"
    collected = []

    for filename in ("intent_tests.yaml", "edge_cases.yaml", "workflow_tests.yaml"):
        suite_path = suite_dir / filename
        if not suite_path.exists():
            continue
        try:
            with suite_path.open('r', encoding='utf-8') as handle:
                data = yaml.safe_load(handle)
            if not data or 'tests' not in data:
                continue
            cases = data['tests']
            # Tag every case first, then add the whole batch.
            for case in cases:
                case['source_file'] = filename
            collected.extend(cases)
        except Exception as e:
            logger.warning(f"Failed to load {filename}", error=str(e))

    return collected
async def load_rag_tests() -> List[Dict[str, Any]]:
    """Collect the RAG test cases from ``golden_rag_correction_v1.yaml``.

    The file is looked up in ``tests/bqas/golden_tests`` below the parent
    of this module's directory and may hold several YAML documents. From
    each non-empty document the ``tests`` entries and then the
    ``edge_cases`` entries are appended to the result.

    A missing file yields an empty list. Any exception while reading or
    processing the file is logged as a warning; whatever was collected up
    to that point is returned.

    Returns:
        The loaded test case dicts.
    """
    rag_path = (
        Path(__file__).parent.parent
        / "tests" / "bqas" / "golden_tests" / "golden_rag_correction_v1.yaml"
    )
    collected = []
    if not rag_path.exists():
        return collected

    try:
        with rag_path.open('r', encoding='utf-8') as handle:
            documents = list(yaml.safe_load_all(handle))
        for doc in documents:
            if not doc:
                continue
            for section in ('tests', 'edge_cases'):
                if section in doc:
                    collected.extend(doc[section])
    except Exception as e:
        logger.warning("Failed to load RAG tests", error=str(e))

    return collected
def simulate_response(user_input: str, expected_intent: str) -> tuple:
    """Produce a simulated (detected_intent, response) pair.

    Used when no live voice service answers. With probability 0.90 the
    detected intent equals ``expected_intent``; otherwise a different
    intent is picked at random from a small fixed list. The response is a
    canned German sentence for the detected intent that embeds
    ``user_input``, or a generic acknowledgement for unknown intents.

    Args:
        user_input: The utterance being tested.
        expected_intent: The intent the test case expects.

    Returns:
        ``(detected_intent, response)``.
    """
    import random

    if random.random() >= 0.90:
        # Simulated misclassification: any listed intent except the expected one.
        pool = ["student_observation", "reminder", "worksheet_generate", "parent_letter", "smalltalk"]
        alternatives = [name for name in pool if name != expected_intent]
        detected_intent = random.choice(alternatives)
    else:
        detected_intent = expected_intent

    canned = {
        "student_observation": f"Notiz wurde gespeichert: {user_input}",
        "reminder": f"Erinnerung erstellt: {user_input}",
        "worksheet_generate": f"Arbeitsblatt wird generiert basierend auf: {user_input}",
        "homework_check": f"Hausaufgabenkontrolle eingetragen: {user_input}",
        "parent_letter": f"Elternbrief-Entwurf erstellt: {user_input}",
        "class_message": f"Nachricht an Klasse vorbereitet: {user_input}",
        "quiz_generate": f"Quiz wird erstellt: {user_input}",
        "quick_activity": f"Einstiegsaktivitaet geplant: {user_input}",
        "canvas_edit": f"Aenderung am Canvas wird ausgefuehrt: {user_input}",
        "canvas_layout": f"Layout wird angepasst: {user_input}",
        "operator_checklist": f"Operatoren-Checkliste geladen: {user_input}",
        "eh_passage": f"EH-Passage gefunden: {user_input}",
        "feedback_suggest": f"Feedback-Vorschlag: {user_input}",
        "reminder_schedule": f"Erinnerung geplant: {user_input}",
        "task_summary": f"Aufgabenuebersicht: {user_input}",
        "conference_topic": f"Konferenzthema notiert: {user_input}",
        "correction_note": f"Korrekturnotiz gespeichert: {user_input}",
        "worksheet_differentiate": f"Differenzierung wird erstellt: {user_input}",
    }
    if detected_intent in canned:
        reply = canned[detected_intent]
    else:
        reply = f"Verstanden: {user_input}"
    return detected_intent, reply
def create_error_result(test_case: Dict[str, Any], error: str) -> TestResult:
    """Build a failed ``TestResult`` for a test case that raised.

    Identifying fields are copied from ``test_case`` (with fallbacks when
    a key is absent); the detected intent is set to ``'error'``, the
    score fields are set to fixed failing values, and ``error`` is
    embedded in the reasoning text.

    Args:
        test_case: The test case dict that was being executed.
        error: Description of the failure.

    Returns:
        A ``TestResult`` with ``passed=False``.
    """
    lookup = test_case.get
    failure_fields = {
        # Identity of the test case.
        "test_id": lookup('id', 'UNKNOWN'),
        "test_name": lookup('name', 'Error'),
        "user_input": lookup('input', ''),
        "expected_intent": lookup('expected_intent', ''),
        # Nothing was produced by the system under test.
        "detected_intent": 'error',
        "response": '',
        # Fixed failing scores.
        "intent_accuracy": 0,
        "faithfulness": 1,
        "relevance": 1,
        "coherence": 1,
        "safety": 'fail',
        "composite_score": 0.0,
        "passed": False,
        "reasoning": f"Test execution error: {error}",
        "timestamp": datetime.utcnow(),
        "duration_ms": 0,
    }
    return TestResult(**failure_fields)
async def simulate_rag_response(test_case: Dict[str, Any]) -> Dict[str, Any]:
    """Return a canned RAG service response for a test case.

    The shape of the response depends on ``test_case['category']``:
    ``eh_retrieval`` and ``operator_alignment`` echo parts of the test
    case's ``expected`` / ``input`` data; ``hallucination_control``,
    ``privacy_compliance`` and ``namespace_isolation`` return fixed
    payloads. Any other category yields a generic success payload.

    Args:
        test_case: Test case dict; ``category``, ``input`` and
            ``expected`` are read, all optional.

    Returns:
        The simulated response dict.
    """
    category = test_case.get('category', '')
    input_data = test_case.get('input', {})
    expected = test_case.get('expected', {})

    if category == 'eh_retrieval':
        # Only the first three expected concepts are quoted in the passage.
        concepts = expected.get('must_contain_concepts', [])
        passage = (
            f"Der Erwartungshorizont sieht folgende Aspekte vor: {', '.join(concepts[:3])}. "
            "Diese muessen im Rahmen der Aufgabenbearbeitung beruecksichtigt werden."
        )
        return {
            "passage": passage,
            "source": "EH_Deutsch_Abitur_2024_NI.pdf",
            "relevance_score": 0.85,
        }

    if category == 'operator_alignment':
        operator = input_data.get('operator', '')
        afb = expected.get('afb_level', 'II')
        # Only the first two expected actions are quoted in the definition.
        actions = expected.get('expected_actions', [])
        definition = f"'{operator}' gehoert zu Anforderungsbereich {afb}. Erwartete Handlungen: {', '.join(actions[:2])}."
        return {"operator": operator, "definition": definition, "afb_level": afb}

    if category == 'hallucination_control':
        return {
            "response": "Basierend auf den verfuegbaren Informationen kann ich folgendes feststellen...",
            "grounded": True,
        }

    if category == 'privacy_compliance':
        return {
            "response": "Die Arbeit zeigt folgende Merkmale... [anonymisiert]",
            "contains_pii": False,
        }

    if category == 'namespace_isolation':
        return {
            "response": "Zugriff nur auf Daten im eigenen Namespace.",
            "namespace_violation": False,
        }

    return {"response": "Simulated response", "success": True}

View File

@@ -0,0 +1,141 @@
"""
Enhanced Orchestrator Session Management
Session lifecycle methods extracted from EnhancedTaskOrchestrator.
"""
import structlog
from typing import Optional, Dict, Any
from sessions.session_manager import SessionManager, AgentSession, SessionState
from sessions.heartbeat import HeartbeatMonitor, HeartbeatClient
from brain.context_manager import ContextManager
logger = structlog.get_logger(__name__)
async def create_session(
    session_manager: SessionManager,
    context_manager: ContextManager,
    heartbeat: HeartbeatMonitor,
    voice_sessions: Dict[str, AgentSession],
    heartbeat_clients: Dict[str, HeartbeatClient],
    voice_session_id: str,
    user_id: str = "",
    metadata: Optional[Dict[str, Any]] = None,
    system_prompt: str = "",
) -> AgentSession:
    """Create an agent session for a voice session and start its heartbeat.

    In order: creates a "voice-orchestrator" session through
    ``session_manager`` (carrying ``voice_session_id`` in its context),
    creates a conversation context for it, starts a ``HeartbeatClient``
    and registers the session with the ``heartbeat`` monitor, then
    records the session and client in the two tracking dicts.

    Args:
        session_manager: Used to create the session.
        context_manager: Used to create the conversation context.
        heartbeat: Monitor the new session is registered with.
        voice_sessions: Mutated: maps ``voice_session_id`` to the session.
        heartbeat_clients: Mutated: maps the agent session id to its client.
        voice_session_id: Id of the voice session.
        user_id: Passed through to the session manager.
        metadata: Passed through to the session manager.
        system_prompt: System prompt for the conversation context.

    Returns:
        The newly created session.
    """
    agent_session = await session_manager.create_session(
        agent_type="voice-orchestrator",
        user_id=user_id,
        context={"voice_session_id": voice_session_id},
        metadata=metadata,
    )
    agent_id = agent_session.session_id

    context_manager.create_context(
        session_id=agent_id, system_prompt=system_prompt, max_messages=50
    )

    # The client is started before the session is registered with the monitor.
    beat_client = HeartbeatClient(
        session_id=agent_id, monitor=heartbeat, interval_seconds=10
    )
    await beat_client.start()
    heartbeat.register(agent_id, "voice-orchestrator")

    voice_sessions[voice_session_id] = agent_session
    heartbeat_clients[agent_id] = beat_client

    logger.info(
        "Created agent session",
        session_id=agent_id[:8],
        voice_session_id=voice_session_id,
    )
    return agent_session
async def end_session(
    session_manager: SessionManager,
    heartbeat: HeartbeatMonitor,
    voice_sessions: Dict[str, AgentSession],
    heartbeat_clients: Dict[str, HeartbeatClient],
    voice_session_id: str,
) -> None:
    """End the agent session attached to a voice session.

    Does nothing when ``voice_session_id`` has no tracked session.
    Otherwise stops and forgets the session's heartbeat client (if any),
    unregisters the session from the monitor, marks it complete,
    persists it through ``session_manager`` and removes it from
    ``voice_sessions``.

    Args:
        session_manager: Used to persist the completed session.
        heartbeat: Monitor the session is unregistered from.
        voice_sessions: Mutated: the entry for ``voice_session_id`` is removed.
        heartbeat_clients: Mutated: the session's client entry is removed.
        voice_session_id: Id of the voice session to end.
    """
    agent_session = voice_sessions.get(voice_session_id)
    if not agent_session:
        return

    agent_id = agent_session.session_id
    if agent_id in heartbeat_clients:
        # Stop first; the entry is only dropped after a successful stop.
        beat_client = heartbeat_clients[agent_id]
        await beat_client.stop()
        del heartbeat_clients[agent_id]

    heartbeat.unregister(agent_id)

    agent_session.complete()
    await session_manager.update_session(agent_session)
    del voice_sessions[voice_session_id]

    logger.info(
        "Ended agent session",
        session_id=agent_id[:8],
        duration_seconds=agent_session.get_duration().total_seconds(),
    )
async def recover_session(
    session_manager: SessionManager,
    heartbeat: HeartbeatMonitor,
    voice_sessions: Dict[str, AgentSession],
    heartbeat_clients: Dict[str, HeartbeatClient],
    tasks: Dict[str, Any],
    process_task_fn,
    voice_session_id: str,
    session_id: str,
) -> Optional[AgentSession]:
    """Recover a stored session and re-run its still-queued tasks.

    Fetches the session by ``session_id``. If it is missing, or its state
    is not ``SessionState.ACTIVE``, a warning is logged and ``None`` is
    returned. Otherwise the session is resumed, a heartbeat client is
    started and registered, and the session is tracked under
    ``voice_session_id``. Finally the session's checkpoints are walked
    newest-first; for each "task_queued" checkpoint whose task id is in
    ``tasks`` and whose task is still ``TaskState.QUEUED``,
    ``process_task_fn`` is awaited with that task.

    Args:
        session_manager: Used to look the session up.
        heartbeat: Monitor the recovered session is registered with.
        voice_sessions: Mutated: maps ``voice_session_id`` to the session.
        heartbeat_clients: Mutated: maps the session id to its new client.
        tasks: Known tasks keyed by task id.
        process_task_fn: Awaitable callable invoked with each task to re-run.
        voice_session_id: Voice session to attach the recovered session to.
        session_id: Id of the stored session to recover.

    Returns:
        The recovered session, or ``None`` if recovery was not possible.
    """
    recovered = await session_manager.get_session(session_id)
    if not recovered:
        logger.warning("Session not found for recovery", session_id=session_id)
        return None
    if recovered.state != SessionState.ACTIVE:
        logger.warning(
            "Session not active for recovery",
            session_id=session_id, state=recovered.state.value
        )
        return None

    recovered.resume()

    beat_client = HeartbeatClient(
        session_id=recovered.session_id, monitor=heartbeat, interval_seconds=10
    )
    await beat_client.start()
    heartbeat.register(recovered.session_id, "voice-orchestrator")

    voice_sessions[voice_session_id] = recovered
    heartbeat_clients[recovered.session_id] = beat_client

    # Re-run tasks that were checkpointed as queued and never left that state.
    from models.task import TaskState
    for checkpoint in reversed(recovered.checkpoints):
        if checkpoint.name != "task_queued":
            continue
        task_id = checkpoint.data.get("task_id")
        if not task_id or task_id not in tasks:
            continue
        pending = tasks[task_id]
        if pending.state == TaskState.QUEUED:
            await process_task_fn(pending)
            logger.info("Recovered pending task", task_id=task_id[:8])

    logger.info(
        "Recovered session",
        session_id=recovered.session_id[:8],
        checkpoints=len(recovered.checkpoints),
    )
    return recovered

View File

@@ -6,6 +6,10 @@ Extends the existing TaskOrchestrator with Multi-Agent support:
- Message bus integration for inter-agent communication
- Quality judge integration via BQAS
- Heartbeat-based liveness
Split into:
- enhanced_orchestrator_session.py: Session lifecycle (create/end/recover)
- enhanced_task_orchestrator.py (this file): Main orchestrator class
"""
import structlog
@@ -27,6 +31,12 @@ from brain.context_manager import ContextManager, MessageRole
from orchestrator.message_bus import MessageBus, AgentMessage, MessagePriority
from orchestrator.task_router import TaskRouter, RoutingStrategy
from services.enhanced_orchestrator_session import (
create_session as _create_session,
end_session as _end_session,
recover_session as _recover_session,
)
logger = structlog.get_logger(__name__)
@@ -47,50 +57,25 @@ class EnhancedTaskOrchestrator(TaskOrchestrator):
db_pool=None,
namespace: str = "breakpilot"
):
"""
Initialize the enhanced orchestrator.
Args:
redis_client: Async Redis/Valkey client
db_pool: Async PostgreSQL connection pool
namespace: Namespace for isolation
"""
super().__init__()
# Initialize agent-core components
self.session_manager = SessionManager(
redis_client=redis_client,
db_pool=db_pool,
namespace=namespace
redis_client=redis_client, db_pool=db_pool, namespace=namespace
)
self.memory_store = MemoryStore(
redis_client=redis_client,
db_pool=db_pool,
namespace=namespace
redis_client=redis_client, db_pool=db_pool, namespace=namespace
)
self.context_manager = ContextManager(
redis_client=redis_client,
db_pool=db_pool,
namespace=namespace
redis_client=redis_client, db_pool=db_pool, namespace=namespace
)
self.message_bus = MessageBus(
redis_client=redis_client,
db_pool=db_pool,
namespace=namespace
redis_client=redis_client, db_pool=db_pool, namespace=namespace
)
self.heartbeat = HeartbeatMonitor(
timeout_seconds=30,
check_interval_seconds=5,
max_missed_beats=3
timeout_seconds=30, check_interval_seconds=5, max_missed_beats=3
)
self.task_router = TaskRouter()
# Track active sessions by voice session ID
self._voice_sessions: Dict[str, AgentSession] = {}
self._heartbeat_clients: Dict[str, HeartbeatClient] = {}
@@ -100,231 +85,98 @@ class EnhancedTaskOrchestrator(TaskOrchestrator):
"""Starts the enhanced orchestrator"""
await self.message_bus.start()
await self.heartbeat.start_monitoring()
# Subscribe to messages directed at this orchestrator
await self.message_bus.subscribe(
"voice-orchestrator",
self._handle_agent_message
)
await self.message_bus.subscribe("voice-orchestrator", self._handle_agent_message)
logger.info("Enhanced TaskOrchestrator started")
async def stop(self) -> None:
    """Shut the enhanced orchestrator down.

    Stops every tracked heartbeat client and clears the client registry,
    then stops heartbeat monitoring and finally the message bus.
    """
    for beat_client in self._heartbeat_clients.values():
        await beat_client.stop()
    self._heartbeat_clients.clear()

    await self.heartbeat.stop_monitoring()
    await self.message_bus.stop()

    logger.info("Enhanced TaskOrchestrator stopped")
async def create_session(
self,
voice_session_id: str,
user_id: str = "",
self, voice_session_id: str, user_id: str = "",
metadata: Optional[Dict[str, Any]] = None
) -> AgentSession:
"""
Creates a new agent session for a voice session.
Args:
voice_session_id: The voice session ID
user_id: Optional user ID
metadata: Additional metadata
Returns:
The created AgentSession
"""
# Create session via session manager
session = await self.session_manager.create_session(
agent_type="voice-orchestrator",
user_id=user_id,
context={"voice_session_id": voice_session_id},
metadata=metadata
return await _create_session(
self.session_manager, self.context_manager, self.heartbeat,
self._voice_sessions, self._heartbeat_clients,
voice_session_id, user_id, metadata, self._get_system_prompt(),
)
# Create conversation context
self.context_manager.create_context(
session_id=session.session_id,
system_prompt=self._get_system_prompt(),
max_messages=50
)
# Start heartbeat for this session
heartbeat_client = HeartbeatClient(
session_id=session.session_id,
monitor=self.heartbeat,
interval_seconds=10
)
await heartbeat_client.start()
# Register heartbeat for monitoring
self.heartbeat.register(session.session_id, "voice-orchestrator")
# Store references
self._voice_sessions[voice_session_id] = session
self._heartbeat_clients[session.session_id] = heartbeat_client
logger.info(
"Created agent session",
session_id=session.session_id[:8],
voice_session_id=voice_session_id
)
return session
async def get_session(
self,
voice_session_id: str
) -> Optional[AgentSession]:
"""Gets the agent session for a voice session"""
async def get_session(self, voice_session_id: str) -> Optional[AgentSession]:
return self._voice_sessions.get(voice_session_id)
async def end_session(self, voice_session_id: str) -> None:
"""
Ends an agent session.
Args:
voice_session_id: The voice session ID
"""
session = self._voice_sessions.get(voice_session_id)
if not session:
return
# Stop heartbeat
if session.session_id in self._heartbeat_clients:
await self._heartbeat_clients[session.session_id].stop()
del self._heartbeat_clients[session.session_id]
# Unregister from heartbeat monitor
self.heartbeat.unregister(session.session_id)
# Mark session as completed
session.complete()
await self.session_manager.update_session(session)
# Clean up
del self._voice_sessions[voice_session_id]
logger.info(
"Ended agent session",
session_id=session.session_id[:8],
duration_seconds=session.get_duration().total_seconds()
await _end_session(
self.session_manager, self.heartbeat,
self._voice_sessions, self._heartbeat_clients, voice_session_id,
)
async def queue_task(self, task: Task) -> None:
"""
Queue a task with session checkpointing.
Extends parent to add checkpoint for recovery.
"""
# Get session for this task
"""Queue a task with session checkpointing."""
session = self._voice_sessions.get(task.session_id)
if session:
# Checkpoint before queueing
session.checkpoint("task_queued", {
"task_id": task.id,
"task_type": task.type.value,
"task_id": task.id, "task_type": task.type.value,
"parameters": task.parameters
})
await self.session_manager.update_session(session)
# Call parent implementation
await super().queue_task(task)
async def process_task(self, task: Task) -> None:
"""
Process a task with enhanced routing and quality checks.
Extends parent to:
- Route complex tasks to specialized agents
- Run quality checks via BQAS
- Store results in memory for learning
"""
"""Process a task with enhanced routing and quality checks."""
session = self._voice_sessions.get(task.session_id)
if session:
session.checkpoint("task_processing", {
"task_id": task.id
})
session.checkpoint("task_processing", {"task_id": task.id})
# Check if this task should be routed to a specialized agent
if self._needs_specialized_agent(task):
await self._route_to_agent(task, session)
else:
# Use parent implementation for simple tasks
await super().process_task(task)
# Run quality check on result
if task.result_ref and self._needs_quality_check(task):
await self._run_quality_check(task, session)
# Store in memory for learning
if task.state == TaskState.READY and task.result_ref:
await self._store_task_result(task)
if session:
session.checkpoint("task_completed", {
"task_id": task.id,
"state": task.state.value
"task_id": task.id, "state": task.state.value
})
await self.session_manager.update_session(session)
def _needs_specialized_agent(self, task: Task) -> bool:
"""Check if task needs routing to a specialized agent"""
from models.task import TaskType
# Tasks that benefit from specialized agents
specialized_types = [
TaskType.PARENT_LETTER, # Could use grader for tone
TaskType.FEEDBACK_SUGGEST, # Quality judge for appropriateness
]
return task.type in specialized_types
return task.type in [TaskType.PARENT_LETTER, TaskType.FEEDBACK_SUGGEST]
def _needs_quality_check(self, task: Task) -> bool:
"""Check if task result needs quality validation"""
from models.task import TaskType
# Tasks that generate content should be checked
content_types = [
TaskType.PARENT_LETTER,
TaskType.CLASS_MESSAGE,
TaskType.FEEDBACK_SUGGEST,
TaskType.WORKSHEET_GENERATE,
return task.type in [
TaskType.PARENT_LETTER, TaskType.CLASS_MESSAGE,
TaskType.FEEDBACK_SUGGEST, TaskType.WORKSHEET_GENERATE,
]
return task.type in content_types
async def _route_to_agent(
self,
task: Task,
session: Optional[AgentSession]
) -> None:
async def _route_to_agent(self, task: Task, session: Optional[AgentSession]) -> None:
"""Route a task to a specialized agent, falling back to local processing.

Asks the task router for a target agent using the intent
``task_<task type value>`` and the LEAST_LOADED routing strategy. When
routing is unsuccessful, or the agent request raises
``asyncio.TimeoutError`` (the request timeout is 30 seconds), the task is
handed to the parent class's ``process_task`` instead. On success the
``"result"`` entry of the response (empty string when absent) is stored in
``task.result_ref`` and the task transitions to ``TaskState.READY``.

Args:
    task: The task to route.
    session: Optional agent session; only its ``session_id`` is forwarded
        in the message payload (``None`` when no session is given).
"""
# Determine target agent
intent = f"task_{task.type.value}"
routing_result = await self.task_router.route(
intent=intent,
context={"task": task.parameters},
intent=intent, context={"task": task.parameters},
strategy=RoutingStrategy.LEAST_LOADED
)
if not routing_result.success:
# Fall back to local processing
logger.warning(
"No agent available for task, using local processing",
task_id=task.id[:8],
reason=routing_result.reason
task_id=task.id[:8], reason=routing_result.reason
)
await super().process_task(task)
# Nothing was sent to an agent; the parent implementation owns the task from here.
return
# Send to agent via message bus
try:
response = await self.message_bus.request(
AgentMessage(
@@ -332,8 +184,7 @@ class EnhancedTaskOrchestrator(TaskOrchestrator):
receiver=routing_result.agent_id,
message_type=f"process_{task.type.value}",
payload={
"task_id": task.id,
"task_type": task.type.value,
"task_id": task.id, "task_type": task.type.value,
"parameters": task.parameters,
"session_id": session.session_id if session else None
},
@@ -341,179 +192,78 @@ class EnhancedTaskOrchestrator(TaskOrchestrator):
),
timeout=30.0
)
# A missing "result" key yields an empty result reference rather than an error.
task.result_ref = response.get("result", "")
task.transition_to(TaskState.READY, "agent_processed")
except asyncio.TimeoutError:
# Only a timeout triggers the local fallback; other exceptions are not caught here.
logger.error(
"Agent timeout, falling back to local",
task_id=task.id[:8],
agent=routing_result.agent_id
task_id=task.id[:8], agent=routing_result.agent_id
)
await super().process_task(task)
async def _run_quality_check(self, task: Task, session: Optional[AgentSession]) -> None:
    """Ask the quality judge to evaluate the task result.

    Sends an ``evaluate_response`` request from ``voice-orchestrator`` to
    ``quality-judge`` over the message bus and waits up to 10 seconds for
    the reply. If the reply's ``composite_score`` (0 when absent) is below
    60, ``task.error_message`` is set and a warning is logged; the task
    state is not changed by this method. A timeout is treated as non-fatal
    and is only logged.

    Args:
        task: The task whose ``result_ref`` is evaluated.
        session: Not used by this method.
    """
    try:
        response = await self.message_bus.request(
            AgentMessage(
                sender="voice-orchestrator", receiver="quality-judge",
                message_type="evaluate_response",
                payload={
                    "task_id": task.id, "task_type": task.type.value,
                    "response": task.result_ref, "context": task.parameters
                },
                priority=MessagePriority.NORMAL
            ),
            timeout=10.0
        )
        quality_score = response.get("composite_score", 0)
        if quality_score < 60:
            # Mark for review
            task.error_message = f"Quality check failed: {quality_score}"
            logger.warning("Task failed quality check", task_id=task.id[:8], score=quality_score)
    except asyncio.TimeoutError:
        # Quality check timeout is non-fatal
        logger.warning("Quality check timeout", task_id=task.id[:8])
async def _store_task_result(self, task: Task) -> None:
    """Store the task result in the memory store for learning.

    The entry is written under the key ``task:<task type value>:<task id>``
    on behalf of ``voice-orchestrator`` with a TTL of 30 days. The stored
    value holds the result reference, the task parameters and the current
    ``datetime.utcnow()`` timestamp in ISO format.

    Args:
        task: The task whose result and parameters are stored.
    """
    await self.memory_store.remember(
        key=f"task:{task.type.value}:{task.id}",
        value={
            "result": task.result_ref, "parameters": task.parameters,
            "completed_at": datetime.utcnow().isoformat()
        },
        agent_id="voice-orchestrator", ttl_days=30
    )
async def _handle_agent_message(self, message: AgentMessage) -> Optional[Dict[str, Any]]:
    """Handle an incoming message from another agent.

    Only ``task_status_update`` messages are acted on: when the payload's
    ``task_id`` refers to a known task and the payload carries a truthy
    ``state``, the task transitions to ``TaskState(state)``. All other
    message types are logged at debug level and otherwise ignored.

    Args:
        message: The received agent message.

    Returns:
        Always None.
    """
    logger.debug("Received agent message", sender=message.sender, type=message.message_type)
    if message.message_type == "task_status_update":
        # Handle task status updates
        task_id = message.payload.get("task_id")
        if task_id in self._tasks:
            task = self._tasks[task_id]
            new_state = message.payload.get("state")
            if new_state:
                task.transition_to(TaskState(new_state), "agent_update")
    return None
def _get_system_prompt(self) -> str:
"""Return the system prompt for the voice assistant.

The prompt is a fixed German text. It lists the assistant's tasks
(creating worksheets, supporting grading, writing parent letters and
class messages, documenting observations and reminders) and asks for
short, clear answers and for a follow-up question when something is
unclear.
"""
# The prompt text below is a runtime string and must be kept verbatim.
return """Du bist ein hilfreicher Assistent für Lehrer in der Breakpilot-App.
return """Du bist ein hilfreicher Assistent fuer Lehrer in der Breakpilot-App.
Deine Aufgaben:
- Hilf beim Erstellen von Arbeitsblättern
- Unterstütze bei der Korrektur
- Hilf beim Erstellen von Arbeitsblaettern
- Unterstuetze bei der Korrektur
- Erstelle Elternbriefe und Klassennachrichten
- Dokumentiere Beobachtungen und Erinnerungen
Halte dich kurz und präzise. Nutze einfache, klare Sprache.
Halte dich kurz und praezise. Nutze einfache, klare Sprache.
Bei Unklarheiten frage nach."""
# Recovery methods
async def recover_session(
self,
voice_session_id: str,
session_id: str
self, voice_session_id: str, session_id: str
) -> Optional[AgentSession]:
"""
Recovers a session from checkpoint.

The inline steps look the session up via the session manager and return
None (after logging a warning) when it does not exist or its state is not
SessionState.ACTIVE. Otherwise the session is resumed, a HeartbeatClient
with a 10 second interval is started and registered for
"voice-orchestrator", the session and client are stored under their IDs,
and pending tasks are recovered from the session's checkpoints.

Args:
voice_session_id: The voice session ID
session_id: The agent session ID to recover
Returns:
The recovered session, or None if it was not found or not active
"""
session = await self.session_manager.get_session(session_id)
if not session:
logger.warning(
"Session not found for recovery",
session_id=session_id
)
return None
# Only sessions whose state is ACTIVE are recovered; any other state is logged and rejected.
if session.state != SessionState.ACTIVE:
logger.warning(
"Session not active for recovery",
session_id=session_id,
state=session.state.value
)
return None
# Resume session
session.resume()
# Restore heartbeat
heartbeat_client = HeartbeatClient(
session_id=session.session_id,
monitor=self.heartbeat,
interval_seconds=10
# NOTE(review): _recover_session is not defined in the visible code; it receives the
# same collaborators the inline steps use, so presumably it performs the same recovery — confirm.
return await _recover_session(
self.session_manager, self.heartbeat,
self._voice_sessions, self._heartbeat_clients,
self._tasks, self.process_task,
voice_session_id, session_id,
)
await heartbeat_client.start()
self.heartbeat.register(session.session_id, "voice-orchestrator")
# Store references
# The session is keyed by voice session ID, the heartbeat client by agent session ID.
self._voice_sessions[voice_session_id] = session
self._heartbeat_clients[session.session_id] = heartbeat_client
# Recover pending tasks from checkpoints
await self._recover_pending_tasks(session)
logger.info(
"Recovered session",
session_id=session.session_id[:8],
checkpoints=len(session.checkpoints)
)
return session
async def _recover_pending_tasks(self, session: AgentSession) -> None:
    """Re-process tasks that are still queued according to the checkpoints.

    Iterates over the session's checkpoints in reverse order. For every
    checkpoint named ``task_queued`` whose ``task_id`` refers to a known
    task that is still in ``TaskState.QUEUED``, the task is processed again
    and the recovery is logged.

    Args:
        session: The session whose checkpoints are scanned.
    """
    for entry in reversed(session.checkpoints):
        # Only "task_queued" checkpoints describe work that may be pending.
        if not entry.name == "task_queued":
            continue

        queued_id = entry.data.get("task_id")
        if not (queued_id and queued_id in self._tasks):
            continue

        pending = self._tasks[queued_id]
        if pending.state == TaskState.QUEUED:
            # Re-process queued task
            await self.process_task(pending)
            logger.info("Recovered pending task", task_id=queued_id[:8])