Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m55s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 18s
- Voice-Service von Core nach Lehrer verschoben (bp-lehrer-voice-service) - 4 Jitsi-Services + 2 Synapse-Services in docker-compose.yml aufgenommen - Camunda komplett gelöscht: workflow pages, workflow-config.ts, bpmn-js deps - CAMUNDA_URL aus backend-lehrer environment entfernt - Sidebar: Kategorie "Compliance SDK" + "Katalogverwaltung" entfernt - Sidebar: Neue Kategorie "Kommunikation" mit Video & Chat, Voice Service, Alerts Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
619 lines
21 KiB
Python
619 lines
21 KiB
Python
"""
|
|
RAG Judge - Specialized evaluation for RAG/Correction quality
|
|
"""
|
|
import json
import time
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Any, Dict, List, Literal, Optional

import httpx
import structlog

from bqas.config import BQASConfig
from bqas.metrics import TestResult
from bqas.prompts import (
    RAG_HALLUCINATION_JUDGE_PROMPT,
    RAG_NAMESPACE_JUDGE_PROMPT,
    RAG_OPERATOR_JUDGE_PROMPT,
    RAG_PRIVACY_JUDGE_PROMPT,
    RAG_RETRIEVAL_JUDGE_PROMPT,
)
|
|
|
|
logger = structlog.get_logger(__name__)
|
|
|
|
|
|
@dataclass
class RAGRetrievalResult:
    """Result from RAG retrieval evaluation (see RAGJudge.evaluate_retrieval)."""

    retrieval_precision: int  # 0-100 percentage from the judge
    faithfulness: int  # 1-5 Likert score
    relevance: int  # 1-5 Likert score
    citation_accuracy: int  # 1-5 Likert score
    reasoning: str  # judge's free-text justification (truncated to 500 chars upstream)
    composite_score: float  # weighted 0-5 aggregate of the scores above
|
|
|
|
|
|
@dataclass
class RAGOperatorResult:
    """Result from operator alignment evaluation (see RAGJudge.evaluate_operator)."""

    operator_alignment: int  # 0-100 percentage from the judge
    faithfulness: int  # 1-5 Likert score
    completeness: int  # 1-5 Likert score
    detected_afb: str  # AFB level the judge detected; expected values: "I", "II", "III"
    reasoning: str  # judge's free-text justification (truncated to 500 chars upstream)
    composite_score: float  # weighted 0-5 aggregate of the scores above
|
|
|
|
|
|
@dataclass
class RAGHallucinationResult:
    """Result from hallucination control evaluation (see RAGJudge.evaluate_hallucination)."""

    grounding_score: int  # 0-100 percentage from the judge
    invention_detection: Literal["pass", "fail"]  # "pass" scores full marks in the composite
    source_attribution: int  # 1-5 Likert score
    hallucinated_claims: List[str]  # claims flagged by the judge (capped at 5 upstream)
    reasoning: str  # judge's free-text justification (truncated to 500 chars upstream)
    composite_score: float  # weighted 0-5 aggregate of the scores above
|
|
|
|
|
|
@dataclass
class RAGPrivacyResult:
    """Result from privacy compliance evaluation (see RAGJudge.evaluate_privacy)."""

    privacy_compliance: Literal["pass", "fail"]  # "pass" scores full marks in the composite
    anonymization: int  # 1-5 Likert score
    dsgvo_compliance: Literal["pass", "fail"]  # "pass" scores full marks in the composite
    detected_pii: List[str]  # PII findings flagged by the judge (capped at 5 upstream)
    reasoning: str  # judge's free-text justification (truncated to 500 chars upstream)
    composite_score: float  # weighted 0-5 aggregate of the scores above
|
|
|
|
|
|
@dataclass
class RAGNamespaceResult:
    """Result from namespace isolation evaluation (see RAGJudge.evaluate_namespace)."""

    namespace_compliance: Literal["pass", "fail"]  # "pass" scores full marks in the composite
    cross_tenant_leak: Literal["pass", "fail"]  # "pass" scores full marks (presumably: no leak) — confirm with prompt
    school_sharing_compliance: int  # 1-5 Likert score
    detected_leaks: List[str]  # leak findings flagged by the judge (capped at 5 upstream)
    reasoning: str  # judge's free-text justification (truncated to 500 chars upstream)
    composite_score: float  # weighted 0-5 aggregate of the scores above
|
|
|
|
|
|
class RAGJudge:
    """
    Specialized judge for RAG/Correction quality evaluation.

    Uses an Ollama-hosted LLM as the judge (see _call_ollama). Evaluates:

    - EH Retrieval quality
    - Operator alignment
    - Hallucination control
    - Privacy/DSGVO compliance
    - Namespace isolation
    """

    def __init__(self, config: Optional[BQASConfig] = None):
        """Create a judge; falls back to environment-derived config when none is given."""
        self.config = config or BQASConfig.from_env()
        # HTTP client is created lazily in _get_client() and torn down in close().
        self._client: Optional[httpx.AsyncClient] = None
|
|
|
|
async def _get_client(self) -> httpx.AsyncClient:
|
|
"""Get or create HTTP client."""
|
|
if self._client is None:
|
|
self._client = httpx.AsyncClient(timeout=self.config.judge_timeout)
|
|
return self._client
|
|
|
|
async def _call_ollama(self, prompt: str) -> str:
|
|
"""Call Ollama API with prompt."""
|
|
client = await self._get_client()
|
|
|
|
resp = await client.post(
|
|
f"{self.config.ollama_base_url}/api/generate",
|
|
json={
|
|
"model": self.config.judge_model,
|
|
"prompt": prompt,
|
|
"stream": False,
|
|
"options": {
|
|
"temperature": 0.1,
|
|
"num_predict": 800,
|
|
},
|
|
},
|
|
)
|
|
resp.raise_for_status()
|
|
return resp.json().get("response", "")
|
|
|
|
def _parse_json_response(self, text: str) -> dict:
|
|
"""Parse JSON from response text."""
|
|
try:
|
|
start = text.find("{")
|
|
end = text.rfind("}") + 1
|
|
if start >= 0 and end > start:
|
|
json_str = text[start:end]
|
|
return json.loads(json_str)
|
|
except (json.JSONDecodeError, ValueError) as e:
|
|
logger.warning("Failed to parse JSON response", error=str(e), text=text[:200])
|
|
return {}
|
|
|
|
# ================================
|
|
# Retrieval Evaluation
|
|
# ================================
|
|
|
|
async def evaluate_retrieval(
|
|
self,
|
|
query: str,
|
|
aufgabentyp: str,
|
|
subject: str,
|
|
level: str,
|
|
retrieved_passage: str,
|
|
expected_concepts: List[str],
|
|
) -> RAGRetrievalResult:
|
|
"""Evaluate EH retrieval quality."""
|
|
prompt = RAG_RETRIEVAL_JUDGE_PROMPT.format(
|
|
query=query,
|
|
aufgabentyp=aufgabentyp,
|
|
subject=subject,
|
|
level=level,
|
|
retrieved_passage=retrieved_passage,
|
|
expected_concepts=", ".join(expected_concepts),
|
|
)
|
|
|
|
try:
|
|
response_text = await self._call_ollama(prompt)
|
|
data = self._parse_json_response(response_text)
|
|
|
|
retrieval_precision = max(0, min(100, int(data.get("retrieval_precision", 0))))
|
|
faithfulness = max(1, min(5, int(data.get("faithfulness", 1))))
|
|
relevance = max(1, min(5, int(data.get("relevance", 1))))
|
|
citation_accuracy = max(1, min(5, int(data.get("citation_accuracy", 1))))
|
|
|
|
composite = self._calculate_retrieval_composite(
|
|
retrieval_precision, faithfulness, relevance, citation_accuracy
|
|
)
|
|
|
|
return RAGRetrievalResult(
|
|
retrieval_precision=retrieval_precision,
|
|
faithfulness=faithfulness,
|
|
relevance=relevance,
|
|
citation_accuracy=citation_accuracy,
|
|
reasoning=str(data.get("reasoning", ""))[:500],
|
|
composite_score=composite,
|
|
)
|
|
|
|
except Exception as e:
|
|
logger.error("Retrieval evaluation failed", error=str(e))
|
|
return RAGRetrievalResult(
|
|
retrieval_precision=0,
|
|
faithfulness=1,
|
|
relevance=1,
|
|
citation_accuracy=1,
|
|
reasoning=f"Evaluation failed: {str(e)}",
|
|
composite_score=0.0,
|
|
)
|
|
|
|
def _calculate_retrieval_composite(
|
|
self,
|
|
retrieval_precision: int,
|
|
faithfulness: int,
|
|
relevance: int,
|
|
citation_accuracy: int,
|
|
) -> float:
|
|
"""Calculate composite score for retrieval evaluation."""
|
|
c = self.config
|
|
retrieval_score = (retrieval_precision / 100) * 5
|
|
|
|
composite = (
|
|
retrieval_score * c.rag_retrieval_precision_weight +
|
|
faithfulness * c.rag_faithfulness_weight +
|
|
relevance * 0.3 + # Higher weight for relevance in retrieval
|
|
citation_accuracy * c.rag_citation_accuracy_weight
|
|
)
|
|
return round(composite, 3)
|
|
|
|
# ================================
|
|
# Operator Evaluation
|
|
# ================================
|
|
|
|
async def evaluate_operator(
|
|
self,
|
|
operator: str,
|
|
generated_definition: str,
|
|
expected_afb: str,
|
|
expected_actions: List[str],
|
|
) -> RAGOperatorResult:
|
|
"""Evaluate operator alignment."""
|
|
prompt = RAG_OPERATOR_JUDGE_PROMPT.format(
|
|
operator=operator,
|
|
generated_definition=generated_definition,
|
|
expected_afb=expected_afb,
|
|
expected_actions=", ".join(expected_actions),
|
|
)
|
|
|
|
try:
|
|
response_text = await self._call_ollama(prompt)
|
|
data = self._parse_json_response(response_text)
|
|
|
|
operator_alignment = max(0, min(100, int(data.get("operator_alignment", 0))))
|
|
faithfulness = max(1, min(5, int(data.get("faithfulness", 1))))
|
|
completeness = max(1, min(5, int(data.get("completeness", 1))))
|
|
detected_afb = str(data.get("detected_afb", ""))
|
|
|
|
composite = self._calculate_operator_composite(
|
|
operator_alignment, faithfulness, completeness
|
|
)
|
|
|
|
return RAGOperatorResult(
|
|
operator_alignment=operator_alignment,
|
|
faithfulness=faithfulness,
|
|
completeness=completeness,
|
|
detected_afb=detected_afb,
|
|
reasoning=str(data.get("reasoning", ""))[:500],
|
|
composite_score=composite,
|
|
)
|
|
|
|
except Exception as e:
|
|
logger.error("Operator evaluation failed", error=str(e))
|
|
return RAGOperatorResult(
|
|
operator_alignment=0,
|
|
faithfulness=1,
|
|
completeness=1,
|
|
detected_afb="",
|
|
reasoning=f"Evaluation failed: {str(e)}",
|
|
composite_score=0.0,
|
|
)
|
|
|
|
def _calculate_operator_composite(
|
|
self,
|
|
operator_alignment: int,
|
|
faithfulness: int,
|
|
completeness: int,
|
|
) -> float:
|
|
"""Calculate composite score for operator evaluation."""
|
|
alignment_score = (operator_alignment / 100) * 5
|
|
|
|
composite = (
|
|
alignment_score * 0.5 +
|
|
faithfulness * 0.3 +
|
|
completeness * 0.2
|
|
)
|
|
return round(composite, 3)
|
|
|
|
# ================================
|
|
# Hallucination Evaluation
|
|
# ================================
|
|
|
|
async def evaluate_hallucination(
|
|
self,
|
|
query: str,
|
|
response: str,
|
|
available_facts: List[str],
|
|
) -> RAGHallucinationResult:
|
|
"""Evaluate for hallucinations."""
|
|
prompt = RAG_HALLUCINATION_JUDGE_PROMPT.format(
|
|
query=query,
|
|
response=response,
|
|
available_facts="\n".join(f"- {f}" for f in available_facts),
|
|
)
|
|
|
|
try:
|
|
response_text = await self._call_ollama(prompt)
|
|
data = self._parse_json_response(response_text)
|
|
|
|
grounding_score = max(0, min(100, int(data.get("grounding_score", 0))))
|
|
invention_detection = "pass" if data.get("invention_detection") == "pass" else "fail"
|
|
source_attribution = max(1, min(5, int(data.get("source_attribution", 1))))
|
|
hallucinated_claims = data.get("hallucinated_claims", [])
|
|
|
|
composite = self._calculate_hallucination_composite(
|
|
grounding_score, invention_detection, source_attribution
|
|
)
|
|
|
|
return RAGHallucinationResult(
|
|
grounding_score=grounding_score,
|
|
invention_detection=invention_detection,
|
|
source_attribution=source_attribution,
|
|
hallucinated_claims=hallucinated_claims[:5],
|
|
reasoning=str(data.get("reasoning", ""))[:500],
|
|
composite_score=composite,
|
|
)
|
|
|
|
except Exception as e:
|
|
logger.error("Hallucination evaluation failed", error=str(e))
|
|
return RAGHallucinationResult(
|
|
grounding_score=0,
|
|
invention_detection="fail",
|
|
source_attribution=1,
|
|
hallucinated_claims=[],
|
|
reasoning=f"Evaluation failed: {str(e)}",
|
|
composite_score=0.0,
|
|
)
|
|
|
|
def _calculate_hallucination_composite(
|
|
self,
|
|
grounding_score: int,
|
|
invention_detection: str,
|
|
source_attribution: int,
|
|
) -> float:
|
|
"""Calculate composite score for hallucination evaluation."""
|
|
grounding = (grounding_score / 100) * 5
|
|
invention = 5.0 if invention_detection == "pass" else 0.0
|
|
|
|
composite = (
|
|
grounding * 0.4 +
|
|
invention * 0.4 +
|
|
source_attribution * 0.2
|
|
)
|
|
return round(composite, 3)
|
|
|
|
# ================================
|
|
# Privacy Evaluation
|
|
# ================================
|
|
|
|
async def evaluate_privacy(
|
|
self,
|
|
query: str,
|
|
context: Dict[str, Any],
|
|
response: str,
|
|
) -> RAGPrivacyResult:
|
|
"""Evaluate privacy/DSGVO compliance."""
|
|
prompt = RAG_PRIVACY_JUDGE_PROMPT.format(
|
|
query=query,
|
|
context=json.dumps(context, ensure_ascii=False, indent=2),
|
|
response=response,
|
|
)
|
|
|
|
try:
|
|
response_text = await self._call_ollama(prompt)
|
|
data = self._parse_json_response(response_text)
|
|
|
|
privacy_compliance = "pass" if data.get("privacy_compliance") == "pass" else "fail"
|
|
anonymization = max(1, min(5, int(data.get("anonymization", 1))))
|
|
dsgvo_compliance = "pass" if data.get("dsgvo_compliance") == "pass" else "fail"
|
|
detected_pii = data.get("detected_pii", [])
|
|
|
|
composite = self._calculate_privacy_composite(
|
|
privacy_compliance, anonymization, dsgvo_compliance
|
|
)
|
|
|
|
return RAGPrivacyResult(
|
|
privacy_compliance=privacy_compliance,
|
|
anonymization=anonymization,
|
|
dsgvo_compliance=dsgvo_compliance,
|
|
detected_pii=detected_pii[:5],
|
|
reasoning=str(data.get("reasoning", ""))[:500],
|
|
composite_score=composite,
|
|
)
|
|
|
|
except Exception as e:
|
|
logger.error("Privacy evaluation failed", error=str(e))
|
|
return RAGPrivacyResult(
|
|
privacy_compliance="fail",
|
|
anonymization=1,
|
|
dsgvo_compliance="fail",
|
|
detected_pii=[],
|
|
reasoning=f"Evaluation failed: {str(e)}",
|
|
composite_score=0.0,
|
|
)
|
|
|
|
def _calculate_privacy_composite(
|
|
self,
|
|
privacy_compliance: str,
|
|
anonymization: int,
|
|
dsgvo_compliance: str,
|
|
) -> float:
|
|
"""Calculate composite score for privacy evaluation."""
|
|
privacy = 5.0 if privacy_compliance == "pass" else 0.0
|
|
dsgvo = 5.0 if dsgvo_compliance == "pass" else 0.0
|
|
|
|
composite = (
|
|
privacy * 0.4 +
|
|
anonymization * 0.2 +
|
|
dsgvo * 0.4
|
|
)
|
|
return round(composite, 3)
|
|
|
|
# ================================
|
|
# Namespace Evaluation
|
|
# ================================
|
|
|
|
async def evaluate_namespace(
|
|
self,
|
|
teacher_id: str,
|
|
namespace: str,
|
|
school_id: str,
|
|
requested_data: str,
|
|
response: str,
|
|
) -> RAGNamespaceResult:
|
|
"""Evaluate namespace isolation."""
|
|
prompt = RAG_NAMESPACE_JUDGE_PROMPT.format(
|
|
teacher_id=teacher_id,
|
|
namespace=namespace,
|
|
school_id=school_id,
|
|
requested_data=requested_data,
|
|
response=response,
|
|
)
|
|
|
|
try:
|
|
response_text = await self._call_ollama(prompt)
|
|
data = self._parse_json_response(response_text)
|
|
|
|
namespace_compliance = "pass" if data.get("namespace_compliance") == "pass" else "fail"
|
|
cross_tenant_leak = "pass" if data.get("cross_tenant_leak") == "pass" else "fail"
|
|
school_sharing_compliance = max(1, min(5, int(data.get("school_sharing_compliance", 1))))
|
|
detected_leaks = data.get("detected_leaks", [])
|
|
|
|
composite = self._calculate_namespace_composite(
|
|
namespace_compliance, cross_tenant_leak, school_sharing_compliance
|
|
)
|
|
|
|
return RAGNamespaceResult(
|
|
namespace_compliance=namespace_compliance,
|
|
cross_tenant_leak=cross_tenant_leak,
|
|
school_sharing_compliance=school_sharing_compliance,
|
|
detected_leaks=detected_leaks[:5],
|
|
reasoning=str(data.get("reasoning", ""))[:500],
|
|
composite_score=composite,
|
|
)
|
|
|
|
except Exception as e:
|
|
logger.error("Namespace evaluation failed", error=str(e))
|
|
return RAGNamespaceResult(
|
|
namespace_compliance="fail",
|
|
cross_tenant_leak="fail",
|
|
school_sharing_compliance=1,
|
|
detected_leaks=[],
|
|
reasoning=f"Evaluation failed: {str(e)}",
|
|
composite_score=0.0,
|
|
)
|
|
|
|
def _calculate_namespace_composite(
|
|
self,
|
|
namespace_compliance: str,
|
|
cross_tenant_leak: str,
|
|
school_sharing_compliance: int,
|
|
) -> float:
|
|
"""Calculate composite score for namespace evaluation."""
|
|
ns_compliance = 5.0 if namespace_compliance == "pass" else 0.0
|
|
cross_tenant = 5.0 if cross_tenant_leak == "pass" else 0.0
|
|
|
|
composite = (
|
|
ns_compliance * 0.4 +
|
|
cross_tenant * 0.4 +
|
|
school_sharing_compliance * 0.2
|
|
)
|
|
return round(composite, 3)
|
|
|
|
# ================================
|
|
# Test Case Evaluation
|
|
# ================================
|
|
|
|
async def evaluate_rag_test_case(
|
|
self,
|
|
test_case: Dict[str, Any],
|
|
service_response: Dict[str, Any],
|
|
) -> TestResult:
|
|
"""
|
|
Evaluate a full RAG test case from the golden suite.
|
|
|
|
Args:
|
|
test_case: Test case definition from YAML
|
|
service_response: Response from the service being tested
|
|
|
|
Returns:
|
|
TestResult with all metrics
|
|
"""
|
|
start_time = time.time()
|
|
|
|
test_id = test_case.get("id", "UNKNOWN")
|
|
test_name = test_case.get("name", "")
|
|
category = test_case.get("category", "")
|
|
min_score = test_case.get("min_score", 3.5)
|
|
|
|
# Route to appropriate evaluation based on category
|
|
composite_score = 0.0
|
|
reasoning = ""
|
|
|
|
if category == "eh_retrieval":
|
|
result = await self.evaluate_retrieval(
|
|
query=test_case.get("input", {}).get("query", ""),
|
|
aufgabentyp=test_case.get("input", {}).get("context", {}).get("aufgabentyp", ""),
|
|
subject=test_case.get("input", {}).get("context", {}).get("subject", "Deutsch"),
|
|
level=test_case.get("input", {}).get("context", {}).get("level", "Abitur"),
|
|
retrieved_passage=service_response.get("passage", ""),
|
|
expected_concepts=test_case.get("expected", {}).get("must_contain_concepts", []),
|
|
)
|
|
composite_score = result.composite_score
|
|
reasoning = result.reasoning
|
|
|
|
elif category == "operator_alignment":
|
|
result = await self.evaluate_operator(
|
|
operator=test_case.get("input", {}).get("operator", ""),
|
|
generated_definition=service_response.get("definition", ""),
|
|
expected_afb=test_case.get("expected", {}).get("afb_level", ""),
|
|
expected_actions=test_case.get("expected", {}).get("expected_actions", []),
|
|
)
|
|
composite_score = result.composite_score
|
|
reasoning = result.reasoning
|
|
|
|
elif category == "hallucination_control":
|
|
result = await self.evaluate_hallucination(
|
|
query=test_case.get("input", {}).get("query", ""),
|
|
response=service_response.get("response", ""),
|
|
available_facts=test_case.get("input", {}).get("context", {}).get("available_facts", []),
|
|
)
|
|
composite_score = result.composite_score
|
|
reasoning = result.reasoning
|
|
|
|
elif category == "privacy_compliance":
|
|
result = await self.evaluate_privacy(
|
|
query=test_case.get("input", {}).get("query", ""),
|
|
context=test_case.get("input", {}).get("context", {}),
|
|
response=service_response.get("response", ""),
|
|
)
|
|
composite_score = result.composite_score
|
|
reasoning = result.reasoning
|
|
|
|
elif category == "namespace_isolation":
|
|
context = test_case.get("input", {}).get("context", {})
|
|
result = await self.evaluate_namespace(
|
|
teacher_id=context.get("teacher_id", ""),
|
|
namespace=context.get("namespace", ""),
|
|
school_id=context.get("school_id", ""),
|
|
requested_data=test_case.get("input", {}).get("query", ""),
|
|
response=service_response.get("response", ""),
|
|
)
|
|
composite_score = result.composite_score
|
|
reasoning = result.reasoning
|
|
|
|
else:
|
|
reasoning = f"Unknown category: {category}"
|
|
|
|
duration_ms = int((time.time() - start_time) * 1000)
|
|
passed = composite_score >= min_score
|
|
|
|
return TestResult(
|
|
test_id=test_id,
|
|
test_name=test_name,
|
|
user_input=str(test_case.get("input", {})),
|
|
expected_intent=category,
|
|
detected_intent=category,
|
|
response=str(service_response),
|
|
intent_accuracy=int(composite_score / 5 * 100),
|
|
faithfulness=int(composite_score),
|
|
relevance=int(composite_score),
|
|
coherence=int(composite_score),
|
|
safety="pass" if composite_score >= min_score else "fail",
|
|
composite_score=composite_score,
|
|
passed=passed,
|
|
reasoning=reasoning,
|
|
timestamp=datetime.utcnow(),
|
|
duration_ms=duration_ms,
|
|
)
|
|
|
|
async def health_check(self) -> bool:
|
|
"""Check if Ollama and judge model are available."""
|
|
try:
|
|
client = await self._get_client()
|
|
response = await client.get(f"{self.config.ollama_base_url}/api/tags")
|
|
if response.status_code != 200:
|
|
return False
|
|
|
|
models = response.json().get("models", [])
|
|
model_names = [m.get("name", "") for m in models]
|
|
|
|
for name in model_names:
|
|
if self.config.judge_model in name:
|
|
return True
|
|
|
|
logger.warning(
|
|
"Judge model not found",
|
|
model=self.config.judge_model,
|
|
available=model_names[:5],
|
|
)
|
|
return False
|
|
|
|
except Exception as e:
|
|
logger.error("Health check failed", error=str(e))
|
|
return False
|
|
|
|
async def close(self):
|
|
"""Close HTTP client."""
|
|
if self._client:
|
|
await self._client.aclose()
|
|
self._client = None
|