[split-required] Split remaining 500-680 LOC files (final batch)
website (17 pages + 3 components):
- multiplayer/wizard, middleware/wizard+test-wizard, communication
- builds/wizard, staff-search, voice, sbom/wizard
- foerderantrag, mail/tasks, tools/communication, sbom
- compliance/evidence, uni-crawler, brandbook (already done)
- CollectionsTab, IngestionTab, RiskHeatmap

backend-lehrer (5 files):
- letters_api (641 → 2), certificates_api (636 → 2)
- alerts_agent/db/models (636 → 3)
- llm_gateway/communication_service (614 → 2)
- game/database already done in prior batch

klausur-service (2 files):
- hybrid_vocab_extractor (664 → 2)
- klausur-service/frontend: api.ts (620 → 3), EHUploadWizard (591 → 2)

voice-service (3 files):
- bqas/rag_judge (618 → 3), runner (529 → 2)
- enhanced_task_orchestrator (519 → 2)

studio-v2 (6 files):
- korrektur/[klausurId] (578 → 4), fairness (569 → 2)
- AlertsWizard (552 → 2), OnboardingWizard (513 → 2)
- korrektur/api.ts (506 → 3), geo-lernwelt (501 → 2)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,82 +1,49 @@
|
||||
"""
|
||||
RAG Judge - Specialized evaluation for RAG/Correction quality
|
||||
|
||||
Split into:
|
||||
- rag_judge_types.py: Data classes for evaluation results
|
||||
- rag_judge_evaluators.py: Individual evaluation methods
|
||||
- rag_judge.py (this file): RAGJudge class (orchestrator + barrel re-exports)
|
||||
"""
|
||||
import json
|
||||
import time
|
||||
import structlog
|
||||
import httpx
|
||||
from dataclasses import dataclass
|
||||
from typing import Literal, Optional, Dict, List, Any
|
||||
from datetime import datetime
|
||||
from typing import Optional, Dict, List, Any
|
||||
|
||||
from bqas.config import BQASConfig
|
||||
from bqas.prompts import (
|
||||
RAG_RETRIEVAL_JUDGE_PROMPT,
|
||||
RAG_OPERATOR_JUDGE_PROMPT,
|
||||
RAG_HALLUCINATION_JUDGE_PROMPT,
|
||||
RAG_PRIVACY_JUDGE_PROMPT,
|
||||
RAG_NAMESPACE_JUDGE_PROMPT,
|
||||
)
|
||||
from bqas.metrics import TestResult
|
||||
|
||||
# Re-export types for backward compatibility
|
||||
from bqas.rag_judge_types import (
|
||||
RAGRetrievalResult,
|
||||
RAGOperatorResult,
|
||||
RAGHallucinationResult,
|
||||
RAGPrivacyResult,
|
||||
RAGNamespaceResult,
|
||||
)
|
||||
|
||||
from bqas.rag_judge_evaluators import (
|
||||
evaluate_retrieval as _evaluate_retrieval,
|
||||
evaluate_operator as _evaluate_operator,
|
||||
evaluate_hallucination as _evaluate_hallucination,
|
||||
evaluate_privacy as _evaluate_privacy,
|
||||
evaluate_namespace as _evaluate_namespace,
|
||||
evaluate_rag_test_case as _evaluate_rag_test_case,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"RAGJudge",
|
||||
"RAGRetrievalResult",
|
||||
"RAGOperatorResult",
|
||||
"RAGHallucinationResult",
|
||||
"RAGPrivacyResult",
|
||||
"RAGNamespaceResult",
|
||||
]
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
|
||||
|
||||
@dataclass
class RAGRetrievalResult:
    """Result from RAG retrieval evaluation."""

    # NOTE(review): a class with this name is also imported from
    # bqas.rag_judge_types near the top of this module; this local
    # definition shadows that import -- confirm which one should remain.
    retrieval_precision: int  # 0-100
    faithfulness: int  # 1-5
    relevance: int  # 1-5
    citation_accuracy: int  # 1-5
    reasoning: str  # explanation text taken from the judge response
    composite_score: float  # weighted aggregate computed by the evaluator
|
||||
|
||||
|
||||
@dataclass
class RAGOperatorResult:
    """Result from operator alignment evaluation."""

    # NOTE(review): a class with this name is also imported from
    # bqas.rag_judge_types near the top of this module; this local
    # definition shadows that import -- confirm which one should remain.
    operator_alignment: int  # 0-100
    faithfulness: int  # 1-5
    completeness: int  # 1-5
    detected_afb: str  # I, II, III
    reasoning: str  # explanation text taken from the judge response
    composite_score: float  # weighted aggregate computed by the evaluator
|
||||
|
||||
|
||||
@dataclass
class RAGHallucinationResult:
    """Result from hallucination control evaluation."""

    # NOTE(review): a class with this name is also imported from
    # bqas.rag_judge_types near the top of this module; this local
    # definition shadows that import -- confirm which one should remain.
    grounding_score: int  # 0-100
    invention_detection: Literal["pass", "fail"]
    source_attribution: int  # 1-5
    hallucinated_claims: List[str]  # evaluator keeps at most five entries
    reasoning: str  # explanation text taken from the judge response
    composite_score: float  # weighted aggregate computed by the evaluator
|
||||
|
||||
|
||||
@dataclass
class RAGPrivacyResult:
    """Result from privacy compliance evaluation."""

    # NOTE(review): a class with this name is also imported from
    # bqas.rag_judge_types near the top of this module; this local
    # definition shadows that import -- confirm which one should remain.
    privacy_compliance: Literal["pass", "fail"]
    anonymization: int  # 1-5
    dsgvo_compliance: Literal["pass", "fail"]
    detected_pii: List[str]  # evaluator keeps at most five entries
    reasoning: str  # explanation text taken from the judge response
    composite_score: float  # weighted aggregate computed by the evaluator
|
||||
|
||||
|
||||
@dataclass
class RAGNamespaceResult:
    """Result from namespace isolation evaluation."""

    # NOTE(review): a class with this name is also imported from
    # bqas.rag_judge_types near the top of this module; this local
    # definition shadows that import -- confirm which one should remain.
    namespace_compliance: Literal["pass", "fail"]
    cross_tenant_leak: Literal["pass", "fail"]
    school_sharing_compliance: int  # 1-5
    detected_leaks: List[str]  # evaluator keeps at most five entries
    reasoning: str  # explanation text taken from the judge response
    composite_score: float  # weighted aggregate computed by the evaluator
|
||||
|
||||
|
||||
class RAGJudge:
|
||||
"""
|
||||
Specialized judge for RAG/Correction quality evaluation.
|
||||
@@ -130,460 +97,53 @@ class RAGJudge:
|
||||
logger.warning("Failed to parse JSON response", error=str(e), text=text[:200])
|
||||
return {}
|
||||
|
||||
# ================================
|
||||
# Retrieval Evaluation
|
||||
# ================================
|
||||
|
||||
async def evaluate_retrieval(
    self,
    query: str,
    aufgabentyp: str,
    subject: str,
    level: str,
    retrieved_passage: str,
    expected_concepts: List[str],
) -> RAGRetrievalResult:
    """Evaluate EH retrieval quality.

    Thin wrapper: forwards this judge's ``_call_ollama`` and
    ``_parse_json_response`` callables together with ``self.config`` to
    ``bqas.rag_judge_evaluators.evaluate_retrieval`` (imported in this
    module as ``_evaluate_retrieval``), followed by the arguments unchanged.

    Returns:
        The ``RAGRetrievalResult`` produced by the delegated evaluator.
    """
    return await _evaluate_retrieval(
        self._call_ollama,
        self._parse_json_response,
        self.config,
        query,
        aufgabentyp,
        subject,
        level,
        retrieved_passage,
        expected_concepts,
    )
|
||||
|
||||
def _calculate_retrieval_composite(
    self,
    retrieval_precision: int,
    faithfulness: int,
    relevance: int,
    citation_accuracy: int,
) -> float:
    """Calculate composite score for retrieval evaluation.

    ``retrieval_precision`` (0-100) is first rescaled to the 0-5 range used
    by the other scores. Each score is then multiplied by its weight and the
    sum is rounded to three decimals.

    Weights for precision, faithfulness and citation accuracy are read from
    ``self.config``; the relevance weight is the fixed value 0.3.
    """
    c = self.config
    retrieval_score = (retrieval_precision / 100) * 5

    composite = (
        retrieval_score * c.rag_retrieval_precision_weight
        + faithfulness * c.rag_faithfulness_weight
        + relevance * 0.3  # Higher weight for relevance in retrieval
        + citation_accuracy * c.rag_citation_accuracy_weight
    )
    return round(composite, 3)
|
||||
|
||||
# ================================
|
||||
# Operator Evaluation
|
||||
# ================================
|
||||
|
||||
async def evaluate_operator(
    self,
    operator: str,
    generated_definition: str,
    expected_afb: str,
    expected_actions: List[str],
) -> RAGOperatorResult:
    """Evaluate operator alignment.

    Thin wrapper: forwards this judge's ``_call_ollama`` and
    ``_parse_json_response`` callables to
    ``bqas.rag_judge_evaluators.evaluate_operator`` (imported in this module
    as ``_evaluate_operator``), followed by the arguments unchanged.

    Returns:
        The ``RAGOperatorResult`` produced by the delegated evaluator.
    """
    return await _evaluate_operator(
        self._call_ollama,
        self._parse_json_response,
        operator,
        generated_definition,
        expected_afb,
        expected_actions,
    )
|
||||
|
||||
def _calculate_operator_composite(
    self,
    operator_alignment: int,
    faithfulness: int,
    completeness: int,
) -> float:
    """Calculate composite score for operator evaluation.

    ``operator_alignment`` (0-100) is rescaled to 0-5, then combined with
    the other scores using fixed weights 0.5 (alignment), 0.3 (faithfulness)
    and 0.2 (completeness). The sum is rounded to three decimals.
    """
    alignment_score = (operator_alignment / 100) * 5

    composite = (
        alignment_score * 0.5
        + faithfulness * 0.3
        + completeness * 0.2
    )
    return round(composite, 3)
|
||||
|
||||
# ================================
|
||||
# Hallucination Evaluation
|
||||
# ================================
|
||||
|
||||
async def evaluate_hallucination(
    self,
    query: str,
    response: str,
    available_facts: List[str],
) -> RAGHallucinationResult:
    """Evaluate for hallucinations.

    Thin wrapper: forwards this judge's ``_call_ollama`` and
    ``_parse_json_response`` callables to
    ``bqas.rag_judge_evaluators.evaluate_hallucination`` (imported in this
    module as ``_evaluate_hallucination``), followed by the arguments
    unchanged.

    Returns:
        The ``RAGHallucinationResult`` produced by the delegated evaluator.
    """
    return await _evaluate_hallucination(
        self._call_ollama,
        self._parse_json_response,
        query,
        response,
        available_facts,
    )
|
||||
|
||||
def _calculate_hallucination_composite(
    self,
    grounding_score: int,
    invention_detection: str,
    source_attribution: int,
) -> float:
    """Calculate composite score for hallucination evaluation.

    ``grounding_score`` (0-100) is rescaled to 0-5. ``invention_detection``
    contributes 5.0 when it equals ``"pass"`` and 0.0 for any other value.
    Fixed weights are 0.4 (grounding), 0.4 (invention) and 0.2 (source
    attribution); the sum is rounded to three decimals.
    """
    grounding = (grounding_score / 100) * 5
    invention = 5.0 if invention_detection == "pass" else 0.0

    composite = (
        grounding * 0.4
        + invention * 0.4
        + source_attribution * 0.2
    )
    return round(composite, 3)
|
||||
|
||||
# ================================
|
||||
# Privacy Evaluation
|
||||
# ================================
|
||||
|
||||
async def evaluate_privacy(
    self,
    query: str,
    context: Dict[str, Any],
    response: str,
) -> RAGPrivacyResult:
    """Evaluate privacy/DSGVO compliance.

    Thin wrapper: forwards this judge's ``_call_ollama`` and
    ``_parse_json_response`` callables to
    ``bqas.rag_judge_evaluators.evaluate_privacy`` (imported in this module
    as ``_evaluate_privacy``), followed by the arguments unchanged.

    Returns:
        The ``RAGPrivacyResult`` produced by the delegated evaluator.
    """
    return await _evaluate_privacy(
        self._call_ollama,
        self._parse_json_response,
        query,
        context,
        response,
    )
|
||||
|
||||
def _calculate_privacy_composite(
    self,
    privacy_compliance: str,
    anonymization: int,
    dsgvo_compliance: str,
) -> float:
    """Calculate composite score for privacy evaluation.

    Each of ``privacy_compliance`` and ``dsgvo_compliance`` contributes 5.0
    when it equals ``"pass"`` and 0.0 for any other value. Fixed weights are
    0.4 (privacy), 0.2 (anonymization) and 0.4 (DSGVO); the sum is rounded
    to three decimals.
    """
    privacy = 5.0 if privacy_compliance == "pass" else 0.0
    dsgvo = 5.0 if dsgvo_compliance == "pass" else 0.0

    composite = (
        privacy * 0.4
        + anonymization * 0.2
        + dsgvo * 0.4
    )
    return round(composite, 3)
|
||||
|
||||
# ================================
|
||||
# Namespace Evaluation
|
||||
# ================================
|
||||
|
||||
async def evaluate_namespace(
    self,
    teacher_id: str,
    namespace: str,
    school_id: str,
    requested_data: str,
    response: str,
) -> RAGNamespaceResult:
    """Evaluate namespace isolation.

    Thin wrapper: forwards this judge's ``_call_ollama`` and
    ``_parse_json_response`` callables to
    ``bqas.rag_judge_evaluators.evaluate_namespace`` (imported in this
    module as ``_evaluate_namespace``), followed by the arguments unchanged.

    Returns:
        The ``RAGNamespaceResult`` produced by the delegated evaluator.
    """
    return await _evaluate_namespace(
        self._call_ollama,
        self._parse_json_response,
        teacher_id,
        namespace,
        school_id,
        requested_data,
        response,
    )
|
||||
|
||||
def _calculate_namespace_composite(
    self,
    namespace_compliance: str,
    cross_tenant_leak: str,
    school_sharing_compliance: int,
) -> float:
    """Calculate composite score for namespace evaluation.

    Each of ``namespace_compliance`` and ``cross_tenant_leak`` contributes
    5.0 when it equals ``"pass"`` and 0.0 for any other value. Fixed weights
    are 0.4, 0.4 and 0.2 (school sharing compliance); the sum is rounded to
    three decimals.
    """
    ns_compliance = 5.0 if namespace_compliance == "pass" else 0.0
    cross_tenant = 5.0 if cross_tenant_leak == "pass" else 0.0

    composite = (
        ns_compliance * 0.4
        + cross_tenant * 0.4
        + school_sharing_compliance * 0.2
    )
    return round(composite, 3)
|
||||
|
||||
# ================================
|
||||
# Test Case Evaluation
|
||||
# ================================
|
||||
|
||||
async def evaluate_rag_test_case(
    self,
    test_case: Dict[str, Any],
    service_response: Dict[str, Any],
) -> TestResult:
    """
    Evaluate a full RAG test case from the golden suite.

    Thin wrapper: passes this judge instance plus both arguments to
    ``bqas.rag_judge_evaluators.evaluate_rag_test_case`` (imported in this
    module as ``_evaluate_rag_test_case``).

    Args:
        test_case: Test case definition from YAML
        service_response: Response from the service being tested

    Returns:
        TestResult with all metrics
    """
    return await _evaluate_rag_test_case(self, test_case, service_response)
|
||||
|
||||
async def health_check(self) -> bool:
|
||||
"""Check if Ollama and judge model are available."""
|
||||
|
||||
397
voice-service/bqas/rag_judge_evaluators.py
Normal file
397
voice-service/bqas/rag_judge_evaluators.py
Normal file
@@ -0,0 +1,397 @@
|
||||
"""
|
||||
RAG Judge Evaluators - Individual evaluation methods for RAG quality
|
||||
"""
|
||||
import json
|
||||
import time
|
||||
import structlog
|
||||
from typing import List, Dict, Any
|
||||
from datetime import datetime
|
||||
|
||||
from bqas.config import BQASConfig
|
||||
from bqas.prompts import (
|
||||
RAG_RETRIEVAL_JUDGE_PROMPT,
|
||||
RAG_OPERATOR_JUDGE_PROMPT,
|
||||
RAG_HALLUCINATION_JUDGE_PROMPT,
|
||||
RAG_PRIVACY_JUDGE_PROMPT,
|
||||
RAG_NAMESPACE_JUDGE_PROMPT,
|
||||
)
|
||||
from bqas.metrics import TestResult
|
||||
from bqas.rag_judge_types import (
|
||||
RAGRetrievalResult,
|
||||
RAGOperatorResult,
|
||||
RAGHallucinationResult,
|
||||
RAGPrivacyResult,
|
||||
RAGNamespaceResult,
|
||||
)
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
|
||||
|
||||
async def evaluate_retrieval(
    call_ollama,
    parse_json_response,
    config: BQASConfig,
    query: str,
    aufgabentyp: str,
    subject: str,
    level: str,
    retrieved_passage: str,
    expected_concepts: List[str],
) -> RAGRetrievalResult:
    """Evaluate EH retrieval quality.

    Args:
        call_ollama: Awaitable callable; called with the rendered prompt and
            its result is handed to ``parse_json_response``.
        parse_json_response: Callable whose return value is read with
            ``.get`` for the individual scores.
        config: Supplies the weights used for the composite score.
        query, aufgabentyp, subject, level, retrieved_passage: Inserted into
            ``RAG_RETRIEVAL_JUDGE_PROMPT``.
        expected_concepts: Joined with ``", "`` into the prompt.

    Returns:
        A ``RAGRetrievalResult`` with ``retrieval_precision`` clamped to
        0-100, the other scores clamped to 1-5 and ``reasoning`` truncated
        to 500 characters. If calling, parsing or scoring raises, the error
        is logged and a lowest-score result with ``composite_score`` 0.0 is
        returned instead.
    """
    prompt = RAG_RETRIEVAL_JUDGE_PROMPT.format(
        query=query,
        aufgabentyp=aufgabentyp,
        subject=subject,
        level=level,
        retrieved_passage=retrieved_passage,
        # Stringify each item: this runs outside the try block, so a
        # non-string entry would otherwise raise TypeError to the caller.
        expected_concepts=", ".join(map(str, expected_concepts)),
    )

    try:
        response_text = await call_ollama(prompt)
        data = parse_json_response(response_text)

        retrieval_precision = max(0, min(100, int(data.get("retrieval_precision", 0))))
        faithfulness = max(1, min(5, int(data.get("faithfulness", 1))))
        relevance = max(1, min(5, int(data.get("relevance", 1))))
        citation_accuracy = max(1, min(5, int(data.get("citation_accuracy", 1))))

        composite = _calculate_retrieval_composite(
            config, retrieval_precision, faithfulness, relevance, citation_accuracy
        )

        return RAGRetrievalResult(
            retrieval_precision=retrieval_precision,
            faithfulness=faithfulness,
            relevance=relevance,
            citation_accuracy=citation_accuracy,
            reasoning=str(data.get("reasoning", ""))[:500],
            composite_score=composite,
        )

    except Exception as e:
        # Best-effort: any failure is reported as a lowest-score result.
        logger.error("Retrieval evaluation failed", error=str(e))
        return RAGRetrievalResult(
            retrieval_precision=0,
            faithfulness=1,
            relevance=1,
            citation_accuracy=1,
            reasoning=f"Evaluation failed: {e!s}",
            composite_score=0.0,
        )
|
||||
|
||||
|
||||
def _calculate_retrieval_composite(
    config: "BQASConfig",
    retrieval_precision: int,
    faithfulness: int,
    relevance: int,
    citation_accuracy: int,
    relevance_weight: float = 0.3,
) -> float:
    """Calculate composite score for retrieval evaluation.

    Args:
        config: Object providing ``rag_retrieval_precision_weight``,
            ``rag_faithfulness_weight`` and ``rag_citation_accuracy_weight``.
        retrieval_precision: Precision on a 0-100 scale; rescaled to 0-5
            before weighting.
        faithfulness: Faithfulness score.
        relevance: Relevance score.
        citation_accuracy: Citation accuracy score.
        relevance_weight: Weight applied to ``relevance``. Defaults to 0.3,
            the value that was previously hard-coded.

    Returns:
        The weighted sum, rounded to three decimals.
    """
    retrieval_score = (retrieval_precision / 100) * 5
    composite = (
        retrieval_score * config.rag_retrieval_precision_weight
        + faithfulness * config.rag_faithfulness_weight
        + relevance * relevance_weight
        + citation_accuracy * config.rag_citation_accuracy_weight
    )
    return round(composite, 3)
|
||||
|
||||
|
||||
async def evaluate_operator(
    call_ollama,
    parse_json_response,
    operator: str,
    generated_definition: str,
    expected_afb: str,
    expected_actions: List[str],
) -> RAGOperatorResult:
    """Evaluate operator alignment.

    Args:
        call_ollama: Awaitable callable; called with the rendered prompt and
            its result is handed to ``parse_json_response``.
        parse_json_response: Callable whose return value is read with
            ``.get`` for the individual scores.
        operator, generated_definition, expected_afb: Inserted into
            ``RAG_OPERATOR_JUDGE_PROMPT``.
        expected_actions: Joined with ``", "`` into the prompt.

    Returns:
        A ``RAGOperatorResult`` with ``operator_alignment`` clamped to 0-100,
        ``faithfulness`` and ``completeness`` clamped to 1-5 and
        ``reasoning`` truncated to 500 characters. The composite rescales
        alignment to 0-5 and applies weights 0.5 / 0.3 / 0.2. If calling,
        parsing or scoring raises, the error is logged and a lowest-score
        result with ``composite_score`` 0.0 is returned instead.
    """
    prompt = RAG_OPERATOR_JUDGE_PROMPT.format(
        operator=operator,
        generated_definition=generated_definition,
        expected_afb=expected_afb,
        # Stringify each item: this runs outside the try block, so a
        # non-string entry would otherwise raise TypeError to the caller.
        expected_actions=", ".join(map(str, expected_actions)),
    )

    try:
        response_text = await call_ollama(prompt)
        data = parse_json_response(response_text)

        operator_alignment = max(0, min(100, int(data.get("operator_alignment", 0))))
        faithfulness = max(1, min(5, int(data.get("faithfulness", 1))))
        completeness = max(1, min(5, int(data.get("completeness", 1))))
        detected_afb = str(data.get("detected_afb", ""))

        alignment_score = (operator_alignment / 100) * 5
        composite = round(
            alignment_score * 0.5 + faithfulness * 0.3 + completeness * 0.2, 3
        )

        return RAGOperatorResult(
            operator_alignment=operator_alignment,
            faithfulness=faithfulness,
            completeness=completeness,
            detected_afb=detected_afb,
            reasoning=str(data.get("reasoning", ""))[:500],
            composite_score=composite,
        )

    except Exception as e:
        # Best-effort: any failure is reported as a lowest-score result.
        logger.error("Operator evaluation failed", error=str(e))
        return RAGOperatorResult(
            operator_alignment=0,
            faithfulness=1,
            completeness=1,
            detected_afb="",
            reasoning=f"Evaluation failed: {e!s}",
            composite_score=0.0,
        )
|
||||
|
||||
|
||||
async def evaluate_hallucination(
    call_ollama,
    parse_json_response,
    query: str,
    response: str,
    available_facts: List[str],
) -> RAGHallucinationResult:
    """Evaluate for hallucinations.

    Args:
        call_ollama: Awaitable callable; called with the rendered prompt and
            its result is handed to ``parse_json_response``.
        parse_json_response: Callable whose return value is read with
            ``.get`` for the individual fields.
        query, response: Inserted into ``RAG_HALLUCINATION_JUDGE_PROMPT``.
        available_facts: Rendered into the prompt as one ``- item`` line per
            entry.

    Returns:
        A ``RAGHallucinationResult`` with ``grounding_score`` clamped to
        0-100, ``source_attribution`` clamped to 1-5, ``invention_detection``
        set to ``"pass"`` only when the judge returned exactly ``"pass"``,
        at most five ``hallucinated_claims`` and ``reasoning`` truncated to
        500 characters. If calling, parsing or scoring raises, the error is
        logged and a lowest-score result with ``composite_score`` 0.0 is
        returned instead.
    """
    prompt = RAG_HALLUCINATION_JUDGE_PROMPT.format(
        query=query,
        response=response,
        available_facts="\n".join(f"- {f}" for f in available_facts),
    )

    try:
        response_text = await call_ollama(prompt)
        data = parse_json_response(response_text)

        grounding_score = max(0, min(100, int(data.get("grounding_score", 0))))
        invention_detection = "pass" if data.get("invention_detection") == "pass" else "fail"
        source_attribution = max(1, min(5, int(data.get("source_attribution", 1))))

        # Normalise the claim list: a JSON null or other non-list value must
        # not discard the whole evaluation, and a bare string must not be
        # sliced into a 5-character string for a List[str] field.
        raw_claims = data.get("hallucinated_claims")
        if isinstance(raw_claims, str):
            raw_claims = [raw_claims]
        elif not isinstance(raw_claims, (list, tuple)):
            raw_claims = []
        hallucinated_claims = [str(claim) for claim in raw_claims[:5]]

        grounding = (grounding_score / 100) * 5
        invention = 5.0 if invention_detection == "pass" else 0.0
        composite = round(grounding * 0.4 + invention * 0.4 + source_attribution * 0.2, 3)

        return RAGHallucinationResult(
            grounding_score=grounding_score,
            invention_detection=invention_detection,
            source_attribution=source_attribution,
            hallucinated_claims=hallucinated_claims,
            reasoning=str(data.get("reasoning", ""))[:500],
            composite_score=composite,
        )

    except Exception as e:
        # Best-effort: any failure is reported as a lowest-score result.
        logger.error("Hallucination evaluation failed", error=str(e))
        return RAGHallucinationResult(
            grounding_score=0,
            invention_detection="fail",
            source_attribution=1,
            hallucinated_claims=[],
            reasoning=f"Evaluation failed: {e!s}",
            composite_score=0.0,
        )
|
||||
|
||||
|
||||
async def evaluate_privacy(
    call_ollama,
    parse_json_response,
    query: str,
    context: Dict[str, Any],
    response: str,
) -> RAGPrivacyResult:
    """Evaluate privacy/DSGVO compliance.

    Args:
        call_ollama: Awaitable callable; called with the rendered prompt and
            its result is handed to ``parse_json_response``.
        parse_json_response: Callable whose return value is read with
            ``.get`` for the individual fields.
        query, response: Inserted into ``RAG_PRIVACY_JUDGE_PROMPT``.
        context: Serialised with ``json.dumps`` (non-ASCII kept, indent 2)
            and inserted into the prompt.

    Returns:
        A ``RAGPrivacyResult`` where ``privacy_compliance`` and
        ``dsgvo_compliance`` are ``"pass"`` only when the judge returned
        exactly ``"pass"``, ``anonymization`` is clamped to 1-5, at most
        five ``detected_pii`` entries are kept and ``reasoning`` is
        truncated to 500 characters. If calling, parsing or scoring raises,
        the error is logged and a failing result with ``composite_score``
        0.0 is returned instead.
    """
    prompt = RAG_PRIVACY_JUDGE_PROMPT.format(
        query=query,
        context=json.dumps(context, ensure_ascii=False, indent=2),
        response=response,
    )

    try:
        response_text = await call_ollama(prompt)
        data = parse_json_response(response_text)

        privacy_compliance = "pass" if data.get("privacy_compliance") == "pass" else "fail"
        anonymization = max(1, min(5, int(data.get("anonymization", 1))))
        dsgvo_compliance = "pass" if data.get("dsgvo_compliance") == "pass" else "fail"

        # Normalise the PII list: a JSON null or other non-list value must
        # not discard the whole evaluation, and a bare string must not be
        # sliced into a 5-character string for a List[str] field.
        raw_pii = data.get("detected_pii")
        if isinstance(raw_pii, str):
            raw_pii = [raw_pii]
        elif not isinstance(raw_pii, (list, tuple)):
            raw_pii = []
        detected_pii = [str(item) for item in raw_pii[:5]]

        privacy = 5.0 if privacy_compliance == "pass" else 0.0
        dsgvo = 5.0 if dsgvo_compliance == "pass" else 0.0
        composite = round(privacy * 0.4 + anonymization * 0.2 + dsgvo * 0.4, 3)

        return RAGPrivacyResult(
            privacy_compliance=privacy_compliance,
            anonymization=anonymization,
            dsgvo_compliance=dsgvo_compliance,
            detected_pii=detected_pii,
            reasoning=str(data.get("reasoning", ""))[:500],
            composite_score=composite,
        )

    except Exception as e:
        # Best-effort: any failure is reported as a failing result.
        logger.error("Privacy evaluation failed", error=str(e))
        return RAGPrivacyResult(
            privacy_compliance="fail",
            anonymization=1,
            dsgvo_compliance="fail",
            detected_pii=[],
            reasoning=f"Evaluation failed: {e!s}",
            composite_score=0.0,
        )
|
||||
|
||||
|
||||
async def evaluate_namespace(
    call_ollama,
    parse_json_response,
    teacher_id: str,
    namespace: str,
    school_id: str,
    requested_data: str,
    response: str,
) -> RAGNamespaceResult:
    """Evaluate namespace isolation.

    Args:
        call_ollama: Awaitable callable; called with the rendered prompt and
            its result is handed to ``parse_json_response``.
        parse_json_response: Callable whose return value is read with
            ``.get`` for the individual fields.
        teacher_id, namespace, school_id, requested_data, response: Inserted
            into ``RAG_NAMESPACE_JUDGE_PROMPT``.

    Returns:
        A ``RAGNamespaceResult`` where ``namespace_compliance`` and
        ``cross_tenant_leak`` are ``"pass"`` only when the judge returned
        exactly ``"pass"``, ``school_sharing_compliance`` is clamped to 1-5,
        at most five ``detected_leaks`` entries are kept and ``reasoning``
        is truncated to 500 characters. If calling, parsing or scoring
        raises, the error is logged and a failing result with
        ``composite_score`` 0.0 is returned instead.
    """
    prompt = RAG_NAMESPACE_JUDGE_PROMPT.format(
        teacher_id=teacher_id,
        namespace=namespace,
        school_id=school_id,
        requested_data=requested_data,
        response=response,
    )

    try:
        response_text = await call_ollama(prompt)
        data = parse_json_response(response_text)

        namespace_compliance = "pass" if data.get("namespace_compliance") == "pass" else "fail"
        cross_tenant_leak = "pass" if data.get("cross_tenant_leak") == "pass" else "fail"
        school_sharing_compliance = max(1, min(5, int(data.get("school_sharing_compliance", 1))))

        # Normalise the leak list: a JSON null or other non-list value must
        # not discard the whole evaluation, and a bare string must not be
        # sliced into a 5-character string for a List[str] field.
        raw_leaks = data.get("detected_leaks")
        if isinstance(raw_leaks, str):
            raw_leaks = [raw_leaks]
        elif not isinstance(raw_leaks, (list, tuple)):
            raw_leaks = []
        detected_leaks = [str(leak) for leak in raw_leaks[:5]]

        ns_compliance = 5.0 if namespace_compliance == "pass" else 0.0
        cross_tenant = 5.0 if cross_tenant_leak == "pass" else 0.0
        composite = round(
            ns_compliance * 0.4 + cross_tenant * 0.4 + school_sharing_compliance * 0.2, 3
        )

        return RAGNamespaceResult(
            namespace_compliance=namespace_compliance,
            cross_tenant_leak=cross_tenant_leak,
            school_sharing_compliance=school_sharing_compliance,
            detected_leaks=detected_leaks,
            reasoning=str(data.get("reasoning", ""))[:500],
            composite_score=composite,
        )

    except Exception as e:
        # Best-effort: any failure is reported as a failing result.
        logger.error("Namespace evaluation failed", error=str(e))
        return RAGNamespaceResult(
            namespace_compliance="fail",
            cross_tenant_leak="fail",
            school_sharing_compliance=1,
            detected_leaks=[],
            reasoning=f"Evaluation failed: {e!s}",
            composite_score=0.0,
        )
|
||||
|
||||
|
||||
async def evaluate_rag_test_case(
    judge_instance,
    test_case: Dict[str, Any],
    service_response: Dict[str, Any],
) -> TestResult:
    """Score one golden-suite RAG test case and wrap it in a ``TestResult``.

    The test case's ``category`` selects which ``judge_instance`` evaluator is
    awaited (retrieval, operator, hallucination, privacy or namespace). The
    evaluator's ``composite_score`` and ``reasoning`` are copied into the
    result. An unrecognised category keeps the score at 0.0 and records the
    category name in the reasoning.

    The case passes when the composite score reaches ``min_score`` (3.5 when
    the test case does not specify one). The single composite score is also
    projected onto the generic ``TestResult`` metric fields.
    """
    started = time.time()

    request = test_case.get("input", {})
    expected = test_case.get("expected", {})
    category = test_case.get("category", "")
    test_id = test_case.get("id", "UNKNOWN")
    test_name = test_case.get("name", "")
    min_score = test_case.get("min_score", 3.5)

    composite_score, reasoning = 0.0, ""

    if category == "eh_retrieval":
        result = await judge_instance.evaluate_retrieval(
            query=request.get("query", ""),
            aufgabentyp=request.get("context", {}).get("aufgabentyp", ""),
            subject=request.get("context", {}).get("subject", "Deutsch"),
            level=request.get("context", {}).get("level", "Abitur"),
            retrieved_passage=service_response.get("passage", ""),
            expected_concepts=expected.get("must_contain_concepts", []),
        )
        composite_score, reasoning = result.composite_score, result.reasoning

    elif category == "operator_alignment":
        result = await judge_instance.evaluate_operator(
            operator=request.get("operator", ""),
            generated_definition=service_response.get("definition", ""),
            expected_afb=expected.get("afb_level", ""),
            expected_actions=expected.get("expected_actions", []),
        )
        composite_score, reasoning = result.composite_score, result.reasoning

    elif category == "hallucination_control":
        result = await judge_instance.evaluate_hallucination(
            query=request.get("query", ""),
            response=service_response.get("response", ""),
            available_facts=request.get("context", {}).get("available_facts", []),
        )
        composite_score, reasoning = result.composite_score, result.reasoning

    elif category == "privacy_compliance":
        result = await judge_instance.evaluate_privacy(
            query=request.get("query", ""),
            context=request.get("context", {}),
            response=service_response.get("response", ""),
        )
        composite_score, reasoning = result.composite_score, result.reasoning

    elif category == "namespace_isolation":
        tenant_ctx = request.get("context", {})
        result = await judge_instance.evaluate_namespace(
            teacher_id=tenant_ctx.get("teacher_id", ""),
            namespace=tenant_ctx.get("namespace", ""),
            school_id=tenant_ctx.get("school_id", ""),
            requested_data=request.get("query", ""),
            response=service_response.get("response", ""),
        )
        composite_score, reasoning = result.composite_score, result.reasoning

    else:
        reasoning = f"Unknown category: {category}"

    duration_ms = int((time.time() - started) * 1000)
    passed = composite_score >= min_score

    # The RAG judges yield one composite number; the generic per-metric
    # fields are all derived from it (truncated to int where required).
    whole_score = int(composite_score)

    return TestResult(
        test_id=test_id,
        test_name=test_name,
        user_input=str(request),
        expected_intent=category,
        detected_intent=category,
        response=str(service_response),
        intent_accuracy=int(composite_score / 5 * 100),
        faithfulness=whole_score,
        relevance=whole_score,
        coherence=whole_score,
        safety="pass" if passed else "fail",
        composite_score=composite_score,
        passed=passed,
        reasoning=reasoning,
        timestamp=datetime.utcnow(),
        duration_ms=duration_ms,
    )
|
||||
60
voice-service/bqas/rag_judge_types.py
Normal file
60
voice-service/bqas/rag_judge_types.py
Normal file
@@ -0,0 +1,60 @@
|
||||
"""
|
||||
RAG Judge Types - Data classes for RAG evaluation results
|
||||
"""
|
||||
from dataclasses import dataclass
|
||||
from typing import Literal, List
|
||||
|
||||
|
||||
@dataclass
class RAGRetrievalResult:
    """Scores produced by the RAG retrieval judge.

    Attributes:
        retrieval_precision: Precision of the retrieved passage, 0-100.
        faithfulness: Rating on a 1-5 scale.
        relevance: Rating on a 1-5 scale.
        citation_accuracy: Rating on a 1-5 scale.
        reasoning: Free-text justification for the scores.
        composite_score: Aggregated score for the evaluation.
    """

    retrieval_precision: int
    faithfulness: int
    relevance: int
    citation_accuracy: int
    reasoning: str
    composite_score: float
|
||||
|
||||
|
||||
@dataclass
class RAGOperatorResult:
    """Scores produced by the operator alignment judge.

    Attributes:
        operator_alignment: Alignment score, 0-100.
        faithfulness: Rating on a 1-5 scale.
        completeness: Rating on a 1-5 scale.
        detected_afb: Detected requirement level ("I", "II" or "III").
        reasoning: Free-text justification for the scores.
        composite_score: Aggregated score for the evaluation.
    """

    operator_alignment: int
    faithfulness: int
    completeness: int
    detected_afb: str
    reasoning: str
    composite_score: float
|
||||
|
||||
|
||||
@dataclass
class RAGHallucinationResult:
    """Scores produced by the hallucination control judge.

    Attributes:
        grounding_score: Grounding score, 0-100.
        invention_detection: "pass" or "fail" verdict.
        source_attribution: Rating on a 1-5 scale.
        hallucinated_claims: Claims the judge flagged as hallucinated.
        reasoning: Free-text justification for the scores.
        composite_score: Aggregated score for the evaluation.
    """

    grounding_score: int
    invention_detection: Literal["pass", "fail"]
    source_attribution: int
    hallucinated_claims: List[str]
    reasoning: str
    composite_score: float
|
||||
|
||||
|
||||
@dataclass
class RAGPrivacyResult:
    """Scores produced by the privacy compliance judge.

    Attributes:
        privacy_compliance: "pass" or "fail" verdict.
        anonymization: Rating on a 1-5 scale.
        dsgvo_compliance: "pass" or "fail" verdict.
        detected_pii: PII items the judge detected.
        reasoning: Free-text justification for the scores.
        composite_score: Aggregated score for the evaluation.
    """

    privacy_compliance: Literal["pass", "fail"]
    anonymization: int
    dsgvo_compliance: Literal["pass", "fail"]
    detected_pii: List[str]
    reasoning: str
    composite_score: float
|
||||
|
||||
|
||||
@dataclass
class RAGNamespaceResult:
    """Scores produced by the namespace isolation judge.

    Attributes:
        namespace_compliance: "pass" or "fail" verdict.
        cross_tenant_leak: "pass" or "fail" verdict.
        school_sharing_compliance: Rating on a 1-5 scale.
        detected_leaks: Leaks the judge detected.
        reasoning: Free-text justification for the scores.
        composite_score: Aggregated score for the evaluation.
    """

    namespace_compliance: Literal["pass", "fail"]
    cross_tenant_leak: Literal["pass", "fail"]
    school_sharing_compliance: int
    detected_leaks: List[str]
    reasoning: str
    composite_score: float
|
||||
@@ -1,11 +1,12 @@
|
||||
"""
|
||||
BQAS Test Runner - Executes Golden, RAG, and Synthetic test suites
|
||||
|
||||
Split into:
|
||||
- runner_golden.py: Test loading, simulation helpers, error result creation
|
||||
- runner.py (this file): BQASRunner class, singleton
|
||||
"""
|
||||
import yaml
|
||||
import asyncio
|
||||
import structlog
|
||||
import httpx
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Any, Optional
|
||||
from datetime import datetime
|
||||
from dataclasses import dataclass, field
|
||||
@@ -15,6 +16,13 @@ from bqas.judge import LLMJudge
|
||||
from bqas.rag_judge import RAGJudge
|
||||
from bqas.metrics import TestResult, BQASMetrics
|
||||
from bqas.synthetic_generator import SyntheticGenerator
|
||||
from bqas.runner_golden import (
|
||||
load_golden_tests,
|
||||
load_rag_tests,
|
||||
simulate_response,
|
||||
create_error_result,
|
||||
simulate_rag_response,
|
||||
)
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
|
||||
@@ -61,87 +69,42 @@ class BQASRunner:
|
||||
# ================================
|
||||
|
||||
async def run_golden_suite(self, git_commit: Optional[str] = None) -> TestRun:
|
||||
"""
|
||||
Run the golden test suite.
|
||||
|
||||
Loads test cases from YAML files and evaluates each one.
|
||||
"""
|
||||
"""Run the golden test suite."""
|
||||
logger.info("Starting Golden Suite run")
|
||||
start_time = datetime.utcnow()
|
||||
|
||||
# Load all golden test cases
|
||||
test_cases = await self._load_golden_tests()
|
||||
test_cases = await load_golden_tests()
|
||||
logger.info(f"Loaded {len(test_cases)} golden test cases")
|
||||
|
||||
# Run all tests
|
||||
results = []
|
||||
for i, test_case in enumerate(test_cases):
|
||||
try:
|
||||
result = await self._run_golden_test(test_case)
|
||||
results.append(result)
|
||||
|
||||
if (i + 1) % 10 == 0:
|
||||
logger.info(f"Progress: {i + 1}/{len(test_cases)} tests completed")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Test {test_case.get('id')} failed with error", error=str(e))
|
||||
# Create a failed result
|
||||
results.append(self._create_error_result(test_case, str(e)))
|
||||
results.append(create_error_result(test_case, str(e)))
|
||||
|
||||
# Calculate metrics
|
||||
metrics = BQASMetrics.from_results(results)
|
||||
duration = (datetime.utcnow() - start_time).total_seconds()
|
||||
|
||||
# Record run
|
||||
self._run_counter += 1
|
||||
run = TestRun(
|
||||
id=self._run_counter,
|
||||
suite="golden",
|
||||
timestamp=start_time,
|
||||
git_commit=git_commit,
|
||||
metrics=metrics,
|
||||
results=results,
|
||||
id=self._run_counter, suite="golden", timestamp=start_time,
|
||||
git_commit=git_commit, metrics=metrics, results=results,
|
||||
duration_seconds=duration,
|
||||
)
|
||||
self._test_runs.insert(0, run)
|
||||
|
||||
logger.info(
|
||||
"Golden Suite completed",
|
||||
total=metrics.total_tests,
|
||||
passed=metrics.passed_tests,
|
||||
failed=metrics.failed_tests,
|
||||
score=metrics.avg_composite_score,
|
||||
duration=f"{duration:.1f}s",
|
||||
"Golden Suite completed", total=metrics.total_tests,
|
||||
passed=metrics.passed_tests, failed=metrics.failed_tests,
|
||||
score=metrics.avg_composite_score, duration=f"{duration:.1f}s",
|
||||
)
|
||||
|
||||
return run
|
||||
|
||||
async def _load_golden_tests(self) -> List[Dict[str, Any]]:
|
||||
"""Load all golden test cases from YAML files."""
|
||||
tests = []
|
||||
golden_dir = Path(__file__).parent.parent / "tests" / "bqas" / "golden_tests"
|
||||
|
||||
yaml_files = [
|
||||
"intent_tests.yaml",
|
||||
"edge_cases.yaml",
|
||||
"workflow_tests.yaml",
|
||||
]
|
||||
|
||||
for filename in yaml_files:
|
||||
filepath = golden_dir / filename
|
||||
if filepath.exists():
|
||||
try:
|
||||
with open(filepath, 'r', encoding='utf-8') as f:
|
||||
data = yaml.safe_load(f)
|
||||
if data and 'tests' in data:
|
||||
for test in data['tests']:
|
||||
test['source_file'] = filename
|
||||
tests.extend(data['tests'])
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to load {filename}", error=str(e))
|
||||
|
||||
return tests
|
||||
|
||||
async def _run_golden_test(self, test_case: Dict[str, Any]) -> TestResult:
|
||||
"""Run a single golden test case."""
|
||||
test_id = test_case.get('id', 'UNKNOWN')
|
||||
@@ -150,38 +113,19 @@ class BQASRunner:
|
||||
expected_intent = test_case.get('expected_intent', '')
|
||||
min_score = test_case.get('min_score', self.config.min_golden_score)
|
||||
|
||||
# Get response from voice service (or simulate)
|
||||
detected_intent, response = await self._get_voice_response(user_input, expected_intent)
|
||||
|
||||
# Evaluate with judge
|
||||
result = await self.judge.evaluate_test_case(
|
||||
test_id=test_id,
|
||||
test_name=test_name,
|
||||
user_input=user_input,
|
||||
expected_intent=expected_intent,
|
||||
detected_intent=detected_intent,
|
||||
response=response,
|
||||
min_score=min_score,
|
||||
test_id=test_id, test_name=test_name, user_input=user_input,
|
||||
expected_intent=expected_intent, detected_intent=detected_intent,
|
||||
response=response, min_score=min_score,
|
||||
)
|
||||
|
||||
return result
|
||||
|
||||
async def _get_voice_response(
|
||||
self,
|
||||
user_input: str,
|
||||
expected_intent: str
|
||||
) -> tuple[str, str]:
|
||||
"""
|
||||
Get response from voice service.
|
||||
|
||||
For now, simulates responses since the full voice pipeline
|
||||
might not be available. In production, this would call the
|
||||
actual voice service endpoints.
|
||||
"""
|
||||
async def _get_voice_response(self, user_input: str, expected_intent: str) -> tuple:
|
||||
"""Get response from voice service."""
|
||||
try:
|
||||
client = await self._get_client()
|
||||
|
||||
# Try to call the voice service intent detection
|
||||
response = await client.post(
|
||||
f"{self.config.voice_service_url}/api/v1/tasks",
|
||||
json={
|
||||
@@ -191,231 +135,71 @@ class BQASRunner:
|
||||
},
|
||||
timeout=10.0,
|
||||
)
|
||||
|
||||
if response.status_code == 200:
|
||||
data = response.json()
|
||||
return data.get('detected_intent', expected_intent), data.get('response', f"Verarbeite: {user_input}")
|
||||
|
||||
except Exception as e:
|
||||
logger.debug(f"Voice service call failed, using simulation", error=str(e))
|
||||
|
||||
# Simulate response based on expected intent
|
||||
return self._simulate_response(user_input, expected_intent)
|
||||
|
||||
def _simulate_response(self, user_input: str, expected_intent: str) -> tuple[str, str]:
|
||||
"""Simulate voice service response for testing without live service."""
|
||||
# Simulate realistic detected intent (90% correct for golden tests)
|
||||
import random
|
||||
if random.random() < 0.90:
|
||||
detected_intent = expected_intent
|
||||
else:
|
||||
# Simulate occasional misclassification
|
||||
intents = ["student_observation", "reminder", "worksheet_generate", "parent_letter", "smalltalk"]
|
||||
detected_intent = random.choice([i for i in intents if i != expected_intent])
|
||||
|
||||
# Generate simulated response
|
||||
responses = {
|
||||
"student_observation": f"Notiz wurde gespeichert: {user_input}",
|
||||
"reminder": f"Erinnerung erstellt: {user_input}",
|
||||
"worksheet_generate": f"Arbeitsblatt wird generiert basierend auf: {user_input}",
|
||||
"homework_check": f"Hausaufgabenkontrolle eingetragen: {user_input}",
|
||||
"parent_letter": f"Elternbrief-Entwurf erstellt: {user_input}",
|
||||
"class_message": f"Nachricht an Klasse vorbereitet: {user_input}",
|
||||
"quiz_generate": f"Quiz wird erstellt: {user_input}",
|
||||
"quick_activity": f"Einstiegsaktivitaet geplant: {user_input}",
|
||||
"canvas_edit": f"Aenderung am Canvas wird ausgefuehrt: {user_input}",
|
||||
"canvas_layout": f"Layout wird angepasst: {user_input}",
|
||||
"operator_checklist": f"Operatoren-Checkliste geladen: {user_input}",
|
||||
"eh_passage": f"EH-Passage gefunden: {user_input}",
|
||||
"feedback_suggest": f"Feedback-Vorschlag: {user_input}",
|
||||
"reminder_schedule": f"Erinnerung geplant: {user_input}",
|
||||
"task_summary": f"Aufgabenuebersicht: {user_input}",
|
||||
"conference_topic": f"Konferenzthema notiert: {user_input}",
|
||||
"correction_note": f"Korrekturnotiz gespeichert: {user_input}",
|
||||
"worksheet_differentiate": f"Differenzierung wird erstellt: {user_input}",
|
||||
}
|
||||
|
||||
response = responses.get(detected_intent, f"Verstanden: {user_input}")
|
||||
return detected_intent, response
|
||||
|
||||
def _create_error_result(self, test_case: Dict[str, Any], error: str) -> TestResult:
|
||||
"""Create a failed test result due to error."""
|
||||
return TestResult(
|
||||
test_id=test_case.get('id', 'UNKNOWN'),
|
||||
test_name=test_case.get('name', 'Error'),
|
||||
user_input=test_case.get('input', ''),
|
||||
expected_intent=test_case.get('expected_intent', ''),
|
||||
detected_intent='error',
|
||||
response='',
|
||||
intent_accuracy=0,
|
||||
faithfulness=1,
|
||||
relevance=1,
|
||||
coherence=1,
|
||||
safety='fail',
|
||||
composite_score=0.0,
|
||||
passed=False,
|
||||
reasoning=f"Test execution error: {error}",
|
||||
timestamp=datetime.utcnow(),
|
||||
duration_ms=0,
|
||||
)
|
||||
return simulate_response(user_input, expected_intent)
|
||||
|
||||
# ================================
|
||||
# RAG Suite Runner
|
||||
# ================================
|
||||
|
||||
async def run_rag_suite(self, git_commit: Optional[str] = None) -> TestRun:
|
||||
"""
|
||||
Run the RAG/Correction test suite.
|
||||
|
||||
Tests EH retrieval, operator alignment, hallucination control, etc.
|
||||
"""
|
||||
"""Run the RAG/Correction test suite."""
|
||||
logger.info("Starting RAG Suite run")
|
||||
start_time = datetime.utcnow()
|
||||
|
||||
# Load RAG test cases
|
||||
test_cases = await self._load_rag_tests()
|
||||
test_cases = await load_rag_tests()
|
||||
logger.info(f"Loaded {len(test_cases)} RAG test cases")
|
||||
|
||||
# Run all tests
|
||||
results = []
|
||||
for i, test_case in enumerate(test_cases):
|
||||
try:
|
||||
result = await self._run_rag_test(test_case)
|
||||
service_response = await simulate_rag_response(test_case)
|
||||
result = await self.rag_judge.evaluate_rag_test_case(
|
||||
test_case=test_case, service_response=service_response,
|
||||
)
|
||||
results.append(result)
|
||||
|
||||
if (i + 1) % 5 == 0:
|
||||
logger.info(f"Progress: {i + 1}/{len(test_cases)} RAG tests completed")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"RAG test {test_case.get('id')} failed", error=str(e))
|
||||
results.append(self._create_error_result(test_case, str(e)))
|
||||
results.append(create_error_result(test_case, str(e)))
|
||||
|
||||
# Calculate metrics
|
||||
metrics = BQASMetrics.from_results(results)
|
||||
duration = (datetime.utcnow() - start_time).total_seconds()
|
||||
|
||||
# Record run
|
||||
self._run_counter += 1
|
||||
run = TestRun(
|
||||
id=self._run_counter,
|
||||
suite="rag",
|
||||
timestamp=start_time,
|
||||
git_commit=git_commit,
|
||||
metrics=metrics,
|
||||
results=results,
|
||||
id=self._run_counter, suite="rag", timestamp=start_time,
|
||||
git_commit=git_commit, metrics=metrics, results=results,
|
||||
duration_seconds=duration,
|
||||
)
|
||||
self._test_runs.insert(0, run)
|
||||
|
||||
logger.info(
|
||||
"RAG Suite completed",
|
||||
total=metrics.total_tests,
|
||||
passed=metrics.passed_tests,
|
||||
score=metrics.avg_composite_score,
|
||||
"RAG Suite completed", total=metrics.total_tests,
|
||||
passed=metrics.passed_tests, score=metrics.avg_composite_score,
|
||||
duration=f"{duration:.1f}s",
|
||||
)
|
||||
|
||||
return run
|
||||
|
||||
async def _load_rag_tests(self) -> List[Dict[str, Any]]:
|
||||
"""Load RAG test cases from YAML."""
|
||||
tests = []
|
||||
rag_file = Path(__file__).parent.parent / "tests" / "bqas" / "golden_tests" / "golden_rag_correction_v1.yaml"
|
||||
|
||||
if rag_file.exists():
|
||||
try:
|
||||
with open(rag_file, 'r', encoding='utf-8') as f:
|
||||
# Handle YAML documents separated by ---
|
||||
documents = list(yaml.safe_load_all(f))
|
||||
for doc in documents:
|
||||
if doc and 'tests' in doc:
|
||||
tests.extend(doc['tests'])
|
||||
if doc and 'edge_cases' in doc:
|
||||
tests.extend(doc['edge_cases'])
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to load RAG tests", error=str(e))
|
||||
|
||||
return tests
|
||||
|
||||
async def _run_rag_test(self, test_case: Dict[str, Any]) -> TestResult:
|
||||
"""Run a single RAG test case."""
|
||||
# Simulate service response for RAG tests
|
||||
service_response = await self._simulate_rag_response(test_case)
|
||||
|
||||
# Evaluate with RAG judge
|
||||
result = await self.rag_judge.evaluate_rag_test_case(
|
||||
test_case=test_case,
|
||||
service_response=service_response,
|
||||
)
|
||||
|
||||
return result
|
||||
|
||||
async def _simulate_rag_response(self, test_case: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Simulate RAG service response."""
|
||||
category = test_case.get('category', '')
|
||||
input_data = test_case.get('input', {})
|
||||
expected = test_case.get('expected', {})
|
||||
|
||||
# Simulate responses based on category
|
||||
if category == 'eh_retrieval':
|
||||
concepts = expected.get('must_contain_concepts', [])
|
||||
passage = f"Der Erwartungshorizont sieht folgende Aspekte vor: {', '.join(concepts[:3])}. "
|
||||
passage += "Diese muessen im Rahmen der Aufgabenbearbeitung beruecksichtigt werden."
|
||||
return {
|
||||
"passage": passage,
|
||||
"source": "EH_Deutsch_Abitur_2024_NI.pdf",
|
||||
"relevance_score": 0.85,
|
||||
}
|
||||
|
||||
elif category == 'operator_alignment':
|
||||
operator = input_data.get('operator', '')
|
||||
afb = expected.get('afb_level', 'II')
|
||||
actions = expected.get('expected_actions', [])
|
||||
return {
|
||||
"operator": operator,
|
||||
"definition": f"'{operator}' gehoert zu Anforderungsbereich {afb}. Erwartete Handlungen: {', '.join(actions[:2])}.",
|
||||
"afb_level": afb,
|
||||
}
|
||||
|
||||
elif category == 'hallucination_control':
|
||||
return {
|
||||
"response": "Basierend auf den verfuegbaren Informationen kann ich folgendes feststellen...",
|
||||
"grounded": True,
|
||||
}
|
||||
|
||||
elif category == 'privacy_compliance':
|
||||
return {
|
||||
"response": "Die Arbeit zeigt folgende Merkmale... [anonymisiert]",
|
||||
"contains_pii": False,
|
||||
}
|
||||
|
||||
elif category == 'namespace_isolation':
|
||||
return {
|
||||
"response": "Zugriff nur auf Daten im eigenen Namespace.",
|
||||
"namespace_violation": False,
|
||||
}
|
||||
|
||||
return {"response": "Simulated response", "success": True}
|
||||
|
||||
# ================================
|
||||
# Synthetic Suite Runner
|
||||
# ================================
|
||||
|
||||
async def run_synthetic_suite(self, git_commit: Optional[str] = None) -> TestRun:
|
||||
"""
|
||||
Run the synthetic test suite.
|
||||
|
||||
Generates test variations using LLM and evaluates them.
|
||||
"""
|
||||
"""Run the synthetic test suite."""
|
||||
logger.info("Starting Synthetic Suite run")
|
||||
start_time = datetime.utcnow()
|
||||
|
||||
# Generate synthetic tests
|
||||
all_variations = await self.synthetic_generator.generate_all_intents(
|
||||
count_per_intent=self.config.synthetic_count_per_intent
|
||||
)
|
||||
|
||||
# Flatten variations
|
||||
test_cases = []
|
||||
for intent, variations in all_variations.items():
|
||||
for i, v in enumerate(variations):
|
||||
@@ -431,45 +215,33 @@ class BQASRunner:
|
||||
|
||||
logger.info(f"Generated {len(test_cases)} synthetic test cases")
|
||||
|
||||
# Run all tests
|
||||
results = []
|
||||
for i, test_case in enumerate(test_cases):
|
||||
try:
|
||||
result = await self._run_golden_test(test_case) # Same logic as golden
|
||||
result = await self._run_golden_test(test_case)
|
||||
results.append(result)
|
||||
|
||||
if (i + 1) % 20 == 0:
|
||||
logger.info(f"Progress: {i + 1}/{len(test_cases)} synthetic tests completed")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Synthetic test {test_case.get('id')} failed", error=str(e))
|
||||
results.append(self._create_error_result(test_case, str(e)))
|
||||
results.append(create_error_result(test_case, str(e)))
|
||||
|
||||
# Calculate metrics
|
||||
metrics = BQASMetrics.from_results(results)
|
||||
duration = (datetime.utcnow() - start_time).total_seconds()
|
||||
|
||||
# Record run
|
||||
self._run_counter += 1
|
||||
run = TestRun(
|
||||
id=self._run_counter,
|
||||
suite="synthetic",
|
||||
timestamp=start_time,
|
||||
git_commit=git_commit,
|
||||
metrics=metrics,
|
||||
results=results,
|
||||
id=self._run_counter, suite="synthetic", timestamp=start_time,
|
||||
git_commit=git_commit, metrics=metrics, results=results,
|
||||
duration_seconds=duration,
|
||||
)
|
||||
self._test_runs.insert(0, run)
|
||||
|
||||
logger.info(
|
||||
"Synthetic Suite completed",
|
||||
total=metrics.total_tests,
|
||||
passed=metrics.passed_tests,
|
||||
score=metrics.avg_composite_score,
|
||||
"Synthetic Suite completed", total=metrics.total_tests,
|
||||
passed=metrics.passed_tests, score=metrics.avg_composite_score,
|
||||
duration=f"{duration:.1f}s",
|
||||
)
|
||||
|
||||
return run
|
||||
|
||||
# ================================
|
||||
@@ -483,20 +255,17 @@ class BQASRunner:
|
||||
def get_latest_metrics(self) -> Dict[str, Optional[BQASMetrics]]:
|
||||
"""Get latest metrics for each suite."""
|
||||
result = {"golden": None, "rag": None, "synthetic": None}
|
||||
|
||||
for run in self._test_runs:
|
||||
if result[run.suite] is None:
|
||||
result[run.suite] = run.metrics
|
||||
if all(v is not None for v in result.values()):
|
||||
break
|
||||
|
||||
return result
|
||||
|
||||
async def health_check(self) -> Dict[str, Any]:
|
||||
"""Check health of BQAS components."""
|
||||
judge_ok = await self.judge.health_check()
|
||||
rag_judge_ok = await self.rag_judge.health_check()
|
||||
|
||||
return {
|
||||
"judge_available": judge_ok,
|
||||
"rag_judge_available": rag_judge_ok,
|
||||
|
||||
162
voice-service/bqas/runner_golden.py
Normal file
162
voice-service/bqas/runner_golden.py
Normal file
@@ -0,0 +1,162 @@
|
||||
"""
|
||||
BQAS Golden Suite Runner - Loads and executes golden test cases
|
||||
"""
|
||||
import yaml
|
||||
import structlog
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Any, Optional
|
||||
from datetime import datetime
|
||||
|
||||
from bqas.metrics import TestResult
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
|
||||
|
||||
async def load_golden_tests() -> List[Dict[str, Any]]:
    """Collect the golden test cases from the known YAML suite files.

    Looks for ``intent_tests.yaml``, ``edge_cases.yaml`` and
    ``workflow_tests.yaml`` under ``tests/bqas/golden_tests`` (resolved
    relative to this module's parent package directory). Every entry of a
    file's ``tests`` list is tagged with a ``source_file`` key naming the file
    it came from.

    Missing files are skipped silently. A file that fails to load or process
    is skipped with a warning, so one bad file does not abort the run.

    Returns:
        All loaded test case dicts, in file order.
    """
    suite_dir = Path(__file__).parent.parent / "tests" / "bqas" / "golden_tests"
    collected = []

    for filename in ("intent_tests.yaml", "edge_cases.yaml", "workflow_tests.yaml"):
        filepath = suite_dir / filename
        if not filepath.exists():
            continue

        try:
            with open(filepath, 'r', encoding='utf-8') as handle:
                data = yaml.safe_load(handle)
                if not data or 'tests' not in data:
                    continue
                cases = data['tests']
                # Tag every case first; the file's cases are only added once
                # all of them were processed without error.
                for case in cases:
                    case['source_file'] = filename
                collected.extend(cases)
        except Exception as e:
            logger.warning(f"Failed to load {filename}", error=str(e))

    return collected
|
||||
|
||||
|
||||
async def load_rag_tests() -> List[Dict[str, Any]]:
    """Collect the RAG test cases from ``golden_rag_correction_v1.yaml``.

    The file may hold several YAML documents. From each non-empty document
    the ``tests`` entries are taken first, then the ``edge_cases`` entries.

    A missing file yields an empty list. A load or processing error is logged
    as a warning and whatever was collected up to that point is returned.
    """
    rag_file = (
        Path(__file__).parent.parent
        / "tests" / "bqas" / "golden_tests" / "golden_rag_correction_v1.yaml"
    )
    collected = []

    if not rag_file.exists():
        return collected

    try:
        with open(rag_file, 'r', encoding='utf-8') as handle:
            # safe_load_all is lazy, so materialise it while the file is open.
            documents = list(yaml.safe_load_all(handle))
            for doc in documents:
                if not doc:
                    continue
                for section in ('tests', 'edge_cases'):
                    if section in doc:
                        collected.extend(doc[section])
    except Exception as e:
        logger.warning("Failed to load RAG tests", error=str(e))

    return collected
|
||||
|
||||
|
||||
# Intents a simulated misclassification may land on.
_SIMULATED_MISCLASSIFICATIONS = (
    "student_observation",
    "reminder",
    "worksheet_generate",
    "parent_letter",
    "smalltalk",
)

# Canned response prefix per intent; the user input is appended after ": ".
_SIMULATED_RESPONSE_PREFIXES = {
    "student_observation": "Notiz wurde gespeichert",
    "reminder": "Erinnerung erstellt",
    "worksheet_generate": "Arbeitsblatt wird generiert basierend auf",
    "homework_check": "Hausaufgabenkontrolle eingetragen",
    "parent_letter": "Elternbrief-Entwurf erstellt",
    "class_message": "Nachricht an Klasse vorbereitet",
    "quiz_generate": "Quiz wird erstellt",
    "quick_activity": "Einstiegsaktivitaet geplant",
    "canvas_edit": "Aenderung am Canvas wird ausgefuehrt",
    "canvas_layout": "Layout wird angepasst",
    "operator_checklist": "Operatoren-Checkliste geladen",
    "eh_passage": "EH-Passage gefunden",
    "feedback_suggest": "Feedback-Vorschlag",
    "reminder_schedule": "Erinnerung geplant",
    "task_summary": "Aufgabenuebersicht",
    "conference_topic": "Konferenzthema notiert",
    "correction_note": "Korrekturnotiz gespeichert",
    "worksheet_differentiate": "Differenzierung wird erstellt",
}


def simulate_response(user_input: str, expected_intent: str) -> tuple[str, str]:
    """Simulate a voice service answer for runs without a live service.

    In roughly 90% of calls the expected intent is echoed back as the
    detected intent. Otherwise a different intent is drawn at random from a
    small fixed pool to imitate a misclassification. The random draws use the
    module-level ``random`` generator, so seeding it makes the result
    reproducible.

    Args:
        user_input: The utterance under test; echoed into the response text.
        expected_intent: The intent the test case expects.

    Returns:
        A ``(detected_intent, response_text)`` pair. Intents without a canned
        response fall back to ``"Verstanden: <user_input>"``.
    """
    import random

    if random.random() < 0.90:
        detected_intent = expected_intent
    else:
        candidates = [
            intent
            for intent in _SIMULATED_MISCLASSIFICATIONS
            if intent != expected_intent
        ]
        detected_intent = random.choice(candidates)

    # Only the selected template is formatted, instead of building every
    # possible response string on each call.
    prefix = _SIMULATED_RESPONSE_PREFIXES.get(detected_intent, "Verstanden")
    return detected_intent, f"{prefix}: {user_input}"
|
||||
|
||||
|
||||
def create_error_result(test_case: Dict[str, Any], error: str) -> TestResult:
    """Build the failed ``TestResult`` recorded when a test case raised.

    The result carries the lowest metric values, a composite score of 0.0,
    ``passed=False`` and the error message in ``reasoning``.

    Args:
        test_case: The test case definition that could not be executed.
        error: Text of the error that aborted the test.

    Returns:
        A ``TestResult`` with ``detected_intent`` set to ``'error'``.
    """
    raw_input = test_case.get('input', '')
    # RAG test cases carry a dict under 'input'; stringify it the same way
    # evaluate_rag_test_case does so user_input is text for every suite.
    # NOTE(review): assumes TestResult.user_input is meant to be a str — confirm.
    user_input = raw_input if isinstance(raw_input, str) else str(raw_input)

    return TestResult(
        test_id=test_case.get('id', 'UNKNOWN'),
        test_name=test_case.get('name', 'Error'),
        user_input=user_input,
        expected_intent=test_case.get('expected_intent', ''),
        detected_intent='error',
        response='',
        intent_accuracy=0,
        faithfulness=1,
        relevance=1,
        coherence=1,
        safety='fail',
        composite_score=0.0,
        passed=False,
        reasoning=f"Test execution error: {error}",
        timestamp=datetime.utcnow(),
        duration_ms=0,
    )
|
||||
|
||||
|
||||
async def simulate_rag_response(test_case: Dict[str, Any]) -> Dict[str, Any]:
    """Simulate the RAG service answer for one test case.

    The canned payload depends on the test case's ``category``:

    * ``eh_retrieval``: a passage naming up to three expected concepts.
    * ``operator_alignment``: a definition naming the operator, its expected
      AFB level (default ``'II'``) and up to two expected actions.
    * ``hallucination_control``, ``privacy_compliance``,
      ``namespace_isolation``: a fixed response plus a category flag.
    * anything else: a generic success payload.

    Null ``input`` / ``expected`` sections and null or non-string list
    entries (all of which YAML can produce) are tolerated instead of raising.

    Args:
        test_case: Test case definition as loaded from the suite file.

    Returns:
        The simulated service response dict.
    """
    category = test_case.get('category', '')
    # `or {}` also covers keys that are present but null in the YAML.
    input_data = test_case.get('input') or {}
    expected = test_case.get('expected') or {}

    if category == 'eh_retrieval':
        concepts = expected.get('must_contain_concepts') or []
        # str() keeps the join from failing on numeric or other scalar entries.
        listed = ', '.join(str(concept) for concept in concepts[:3])
        passage = f"Der Erwartungshorizont sieht folgende Aspekte vor: {listed}. "
        passage += "Diese muessen im Rahmen der Aufgabenbearbeitung beruecksichtigt werden."
        return {
            "passage": passage,
            "source": "EH_Deutsch_Abitur_2024_NI.pdf",
            "relevance_score": 0.85,
        }

    if category == 'operator_alignment':
        operator = input_data.get('operator', '')
        afb = expected.get('afb_level', 'II')
        actions = expected.get('expected_actions') or []
        listed = ', '.join(str(action) for action in actions[:2])
        return {
            "operator": operator,
            "definition": f"'{operator}' gehoert zu Anforderungsbereich {afb}. Erwartete Handlungen: {listed}.",
            "afb_level": afb,
        }

    if category == 'hallucination_control':
        return {
            "response": "Basierend auf den verfuegbaren Informationen kann ich folgendes feststellen...",
            "grounded": True,
        }

    if category == 'privacy_compliance':
        return {
            "response": "Die Arbeit zeigt folgende Merkmale... [anonymisiert]",
            "contains_pii": False,
        }

    if category == 'namespace_isolation':
        return {
            "response": "Zugriff nur auf Daten im eigenen Namespace.",
            "namespace_violation": False,
        }

    return {"response": "Simulated response", "success": True}
|
||||
Reference in New Issue
Block a user