website (17 pages + 3 components): - multiplayer/wizard, middleware/wizard+test-wizard, communication - builds/wizard, staff-search, voice, sbom/wizard - foerderantrag, mail/tasks, tools/communication, sbom - compliance/evidence, uni-crawler, brandbook (already done) - CollectionsTab, IngestionTab, RiskHeatmap backend-lehrer (5 files): - letters_api (641 → 2), certificates_api (636 → 2) - alerts_agent/db/models (636 → 3) - llm_gateway/communication_service (614 → 2) - game/database already done in prior batch klausur-service (2 files): - hybrid_vocab_extractor (664 → 2) - klausur-service/frontend: api.ts (620 → 3), EHUploadWizard (591 → 2) voice-service (3 files): - bqas/rag_judge (618 → 3), runner (529 → 2) - enhanced_task_orchestrator (519 → 2) studio-v2 (6 files): - korrektur/[klausurId] (578 → 4), fairness (569 → 2) - AlertsWizard (552 → 2), OnboardingWizard (513 → 2) - korrektur/api.ts (506 → 3), geo-lernwelt (501 → 2) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
398 lines
14 KiB
Python
398 lines
14 KiB
Python
"""
|
|
RAG Judge Evaluators - Individual evaluation methods for RAG quality
|
|
"""
|
|
import json
|
|
import time
|
|
import structlog
|
|
from typing import List, Dict, Any
|
|
from datetime import datetime
|
|
|
|
from bqas.config import BQASConfig
|
|
from bqas.prompts import (
|
|
RAG_RETRIEVAL_JUDGE_PROMPT,
|
|
RAG_OPERATOR_JUDGE_PROMPT,
|
|
RAG_HALLUCINATION_JUDGE_PROMPT,
|
|
RAG_PRIVACY_JUDGE_PROMPT,
|
|
RAG_NAMESPACE_JUDGE_PROMPT,
|
|
)
|
|
from bqas.metrics import TestResult
|
|
from bqas.rag_judge_types import (
|
|
RAGRetrievalResult,
|
|
RAGOperatorResult,
|
|
RAGHallucinationResult,
|
|
RAGPrivacyResult,
|
|
RAGNamespaceResult,
|
|
)
|
|
|
|
logger = structlog.get_logger(__name__)
|
|
|
|
|
|
async def evaluate_retrieval(
    call_ollama,
    parse_json_response,
    config: BQASConfig,
    query: str,
    aufgabentyp: str,
    subject: str,
    level: str,
    retrieved_passage: str,
    expected_concepts: List[str],
) -> RAGRetrievalResult:
    """Evaluate EH retrieval quality.

    Args:
        call_ollama: Async callable that sends a prompt to the judge LLM
            and returns the raw response text.
        parse_json_response: Callable that extracts a dict from the raw
            judge response.
        config: BQAS configuration providing the composite-score weights.
        query: The original retrieval query.
        aufgabentyp: Task type the query belongs to.
        subject: School subject (e.g. "Deutsch").
        level: Education level (e.g. "Abitur").
        retrieved_passage: Passage returned by the retriever.
        expected_concepts: Concepts the passage is expected to cover.

    Returns:
        RAGRetrievalResult with clamped sub-scores and a weighted
        composite; on any failure, a minimum-score result whose
        reasoning carries the error message.
    """
    judge_prompt = RAG_RETRIEVAL_JUDGE_PROMPT.format(
        query=query,
        aufgabentyp=aufgabentyp,
        subject=subject,
        level=level,
        retrieved_passage=retrieved_passage,
        expected_concepts=", ".join(expected_concepts),
    )

    try:
        raw_reply = await call_ollama(judge_prompt)
        parsed = parse_json_response(raw_reply)

        # Clamp judge outputs into their valid ranges: precision is a
        # percentage, the remaining sub-scores are 1-5 Likert values.
        precision = max(0, min(100, int(parsed.get("retrieval_precision", 0))))
        faith = max(1, min(5, int(parsed.get("faithfulness", 1))))
        rel = max(1, min(5, int(parsed.get("relevance", 1))))
        citation = max(1, min(5, int(parsed.get("citation_accuracy", 1))))

        weighted = _calculate_retrieval_composite(
            config, precision, faith, rel, citation
        )

        return RAGRetrievalResult(
            retrieval_precision=precision,
            faithfulness=faith,
            relevance=rel,
            citation_accuracy=citation,
            reasoning=str(parsed.get("reasoning", ""))[:500],
            composite_score=weighted,
        )

    except Exception as e:
        # Best-effort: one failed judge call degrades to a zeroed result
        # instead of aborting the whole evaluation run.
        logger.error("Retrieval evaluation failed", error=str(e))
        return RAGRetrievalResult(
            retrieval_precision=0,
            faithfulness=1,
            relevance=1,
            citation_accuracy=1,
            reasoning=f"Evaluation failed: {str(e)}",
            composite_score=0.0,
        )
|
|
|
|
|
|
def _calculate_retrieval_composite(
    config: BQASConfig,
    retrieval_precision: int,
    faithfulness: int,
    relevance: int,
    citation_accuracy: int,
) -> float:
    """Calculate composite score for retrieval evaluation.

    The precision percentage (0-100) is rescaled onto the 0-5 range used
    by the Likert sub-scores; all four components are then combined with
    the configured weights.

    Args:
        config: BQAS configuration providing the component weights.
        retrieval_precision: Precision as a percentage (0-100).
        faithfulness: 1-5 faithfulness score.
        relevance: 1-5 relevance score.
        citation_accuracy: 1-5 citation-accuracy score.

    Returns:
        Weighted composite score, rounded to 3 decimal places.
    """
    retrieval_score = (retrieval_precision / 100) * 5
    # Consistency fix: the other three weights come from config, but the
    # relevance weight was hard-coded to 0.3.  Read it from config when
    # the attribute exists and fall back to 0.3 so existing configs keep
    # the exact previous behavior.
    relevance_weight = getattr(config, "rag_relevance_weight", 0.3)
    composite = (
        retrieval_score * config.rag_retrieval_precision_weight +
        faithfulness * config.rag_faithfulness_weight +
        relevance * relevance_weight +
        citation_accuracy * config.rag_citation_accuracy_weight
    )
    return round(composite, 3)
|
|
|
|
|
|
async def evaluate_operator(
    call_ollama,
    parse_json_response,
    operator: str,
    generated_definition: str,
    expected_afb: str,
    expected_actions: List[str],
) -> RAGOperatorResult:
    """Evaluate operator alignment.

    Asks the judge LLM whether a generated operator definition matches
    the expected AFB level and actions, then combines the judge's
    sub-scores into a weighted composite.

    Args:
        call_ollama: Async callable that sends a prompt to the judge LLM.
        parse_json_response: Callable turning the raw reply into a dict.
        operator: The operator (task verb) under evaluation.
        generated_definition: Definition text produced by the service.
        expected_afb: Expected AFB (requirement) level.
        expected_actions: Actions the definition should mention.

    Returns:
        RAGOperatorResult; on failure, a minimum-score result whose
        reasoning carries the error message.
    """
    judge_prompt = RAG_OPERATOR_JUDGE_PROMPT.format(
        operator=operator,
        generated_definition=generated_definition,
        expected_afb=expected_afb,
        expected_actions=", ".join(expected_actions),
    )

    try:
        raw_reply = await call_ollama(judge_prompt)
        parsed = parse_json_response(raw_reply)

        # Clamp to valid ranges: alignment is a percentage, the rest 1-5.
        alignment_pct = max(0, min(100, int(parsed.get("operator_alignment", 0))))
        faith = max(1, min(5, int(parsed.get("faithfulness", 1))))
        complete = max(1, min(5, int(parsed.get("completeness", 1))))
        afb = str(parsed.get("detected_afb", ""))

        # Rescale the percentage onto the 0-5 range before weighting.
        scaled_alignment = (alignment_pct / 100) * 5
        weighted = round(
            scaled_alignment * 0.5 + faith * 0.3 + complete * 0.2, 3
        )

        return RAGOperatorResult(
            operator_alignment=alignment_pct,
            faithfulness=faith,
            completeness=complete,
            detected_afb=afb,
            reasoning=str(parsed.get("reasoning", ""))[:500],
            composite_score=weighted,
        )

    except Exception as e:
        # Degrade gracefully so one judge failure doesn't abort the run.
        logger.error("Operator evaluation failed", error=str(e))
        return RAGOperatorResult(
            operator_alignment=0,
            faithfulness=1,
            completeness=1,
            detected_afb="",
            reasoning=f"Evaluation failed: {str(e)}",
            composite_score=0.0,
        )
|
|
|
|
|
|
async def evaluate_hallucination(
    call_ollama,
    parse_json_response,
    query: str,
    response: str,
    available_facts: List[str],
) -> RAGHallucinationResult:
    """Evaluate for hallucinations.

    Checks whether the service response stays grounded in the supplied
    facts, combining a grounding percentage, a pass/fail invention check,
    and a 1-5 source-attribution score into a weighted composite.

    Args:
        call_ollama: Async callable that sends a prompt to the judge LLM.
        parse_json_response: Callable turning the raw reply into a dict.
        query: The user query that produced the response.
        response: The service response under evaluation.
        available_facts: Facts the response is allowed to rely on.

    Returns:
        RAGHallucinationResult; on failure, a minimum-score result whose
        reasoning carries the error message.
    """
    prompt = RAG_HALLUCINATION_JUDGE_PROMPT.format(
        query=query,
        response=response,
        available_facts="\n".join(f"- {f}" for f in available_facts),
    )

    try:
        response_text = await call_ollama(prompt)
        data = parse_json_response(response_text)

        grounding_score = max(0, min(100, int(data.get("grounding_score", 0))))
        # Anything other than an explicit "pass" counts as a failure.
        invention_detection = "pass" if data.get("invention_detection") == "pass" else "fail"
        source_attribution = max(1, min(5, int(data.get("source_attribution", 1))))
        hallucinated_claims = data.get("hallucinated_claims", [])
        # Robustness: the judge LLM occasionally emits a string or object
        # here; only a list of claims is meaningful downstream (the value
        # is sliced and stored as-is), so coerce anything else to empty.
        if not isinstance(hallucinated_claims, list):
            hallucinated_claims = []

        # Percentage rescaled to 0-5; invention is all-or-nothing.
        grounding = (grounding_score / 100) * 5
        invention = 5.0 if invention_detection == "pass" else 0.0
        composite = round(grounding * 0.4 + invention * 0.4 + source_attribution * 0.2, 3)

        return RAGHallucinationResult(
            grounding_score=grounding_score,
            invention_detection=invention_detection,
            source_attribution=source_attribution,
            hallucinated_claims=hallucinated_claims[:5],
            reasoning=str(data.get("reasoning", ""))[:500],
            composite_score=composite,
        )

    except Exception as e:
        # Best-effort: degrade to a zeroed result instead of raising.
        logger.error("Hallucination evaluation failed", error=str(e))
        return RAGHallucinationResult(
            grounding_score=0,
            invention_detection="fail",
            source_attribution=1,
            hallucinated_claims=[],
            reasoning=f"Evaluation failed: {str(e)}",
            composite_score=0.0,
        )
|
|
|
|
|
|
async def evaluate_privacy(
    call_ollama,
    parse_json_response,
    query: str,
    context: Dict[str, Any],
    response: str,
) -> RAGPrivacyResult:
    """Evaluate privacy/DSGVO compliance.

    Combines two pass/fail judgements (privacy, DSGVO) with a 1-5
    anonymization score into a weighted composite.

    Args:
        call_ollama: Async callable that sends a prompt to the judge LLM.
        parse_json_response: Callable turning the raw reply into a dict.
        query: The user query that produced the response.
        context: Request context, serialized into the judge prompt as JSON.
        response: The service response under evaluation.

    Returns:
        RAGPrivacyResult; on failure, a fail-everything result whose
        reasoning carries the error message.
    """
    prompt = RAG_PRIVACY_JUDGE_PROMPT.format(
        query=query,
        context=json.dumps(context, ensure_ascii=False, indent=2),
        response=response,
    )

    try:
        response_text = await call_ollama(prompt)
        data = parse_json_response(response_text)

        # Anything other than an explicit "pass" counts as a failure.
        privacy_compliance = "pass" if data.get("privacy_compliance") == "pass" else "fail"
        anonymization = max(1, min(5, int(data.get("anonymization", 1))))
        dsgvo_compliance = "pass" if data.get("dsgvo_compliance") == "pass" else "fail"
        detected_pii = data.get("detected_pii", [])
        # Robustness: the judge LLM occasionally emits a string or object
        # here; only a list of PII items is meaningful downstream (the
        # value is sliced and stored as-is), so coerce anything else.
        if not isinstance(detected_pii, list):
            detected_pii = []

        # Pass/fail judgements are all-or-nothing on the 0-5 scale.
        privacy = 5.0 if privacy_compliance == "pass" else 0.0
        dsgvo = 5.0 if dsgvo_compliance == "pass" else 0.0
        composite = round(privacy * 0.4 + anonymization * 0.2 + dsgvo * 0.4, 3)

        return RAGPrivacyResult(
            privacy_compliance=privacy_compliance,
            anonymization=anonymization,
            dsgvo_compliance=dsgvo_compliance,
            detected_pii=detected_pii[:5],
            reasoning=str(data.get("reasoning", ""))[:500],
            composite_score=composite,
        )

    except Exception as e:
        # Fail closed: a judge error is reported as non-compliant.
        logger.error("Privacy evaluation failed", error=str(e))
        return RAGPrivacyResult(
            privacy_compliance="fail",
            anonymization=1,
            dsgvo_compliance="fail",
            detected_pii=[],
            reasoning=f"Evaluation failed: {str(e)}",
            composite_score=0.0,
        )
|
|
|
|
|
|
async def evaluate_namespace(
    call_ollama,
    parse_json_response,
    teacher_id: str,
    namespace: str,
    school_id: str,
    requested_data: str,
    response: str,
) -> RAGNamespaceResult:
    """Evaluate namespace isolation.

    Combines two pass/fail judgements (namespace compliance, cross-tenant
    leak) with a 1-5 school-sharing score into a weighted composite.

    Args:
        call_ollama: Async callable that sends a prompt to the judge LLM.
        parse_json_response: Callable turning the raw reply into a dict.
        teacher_id: Identifier of the requesting teacher.
        namespace: Namespace the request was scoped to.
        school_id: Identifier of the teacher's school.
        requested_data: The data/query the teacher asked for.
        response: The service response under evaluation.

    Returns:
        RAGNamespaceResult; on failure, a fail-everything result whose
        reasoning carries the error message.
    """
    prompt = RAG_NAMESPACE_JUDGE_PROMPT.format(
        teacher_id=teacher_id,
        namespace=namespace,
        school_id=school_id,
        requested_data=requested_data,
        response=response,
    )

    try:
        response_text = await call_ollama(prompt)
        data = parse_json_response(response_text)

        # Anything other than an explicit "pass" counts as a failure.
        namespace_compliance = "pass" if data.get("namespace_compliance") == "pass" else "fail"
        cross_tenant_leak = "pass" if data.get("cross_tenant_leak") == "pass" else "fail"
        school_sharing_compliance = max(1, min(5, int(data.get("school_sharing_compliance", 1))))
        detected_leaks = data.get("detected_leaks", [])
        # Robustness: the judge LLM occasionally emits a string or object
        # here; only a list of leaks is meaningful downstream (the value
        # is sliced and stored as-is), so coerce anything else to empty.
        if not isinstance(detected_leaks, list):
            detected_leaks = []

        # Pass/fail judgements are all-or-nothing on the 0-5 scale.
        ns_compliance = 5.0 if namespace_compliance == "pass" else 0.0
        cross_tenant = 5.0 if cross_tenant_leak == "pass" else 0.0
        composite = round(
            ns_compliance * 0.4 + cross_tenant * 0.4 + school_sharing_compliance * 0.2, 3
        )

        return RAGNamespaceResult(
            namespace_compliance=namespace_compliance,
            cross_tenant_leak=cross_tenant_leak,
            school_sharing_compliance=school_sharing_compliance,
            detected_leaks=detected_leaks[:5],
            reasoning=str(data.get("reasoning", ""))[:500],
            composite_score=composite,
        )

    except Exception as e:
        # Fail closed: a judge error is reported as an isolation failure.
        logger.error("Namespace evaluation failed", error=str(e))
        return RAGNamespaceResult(
            namespace_compliance="fail",
            cross_tenant_leak="fail",
            school_sharing_compliance=1,
            detected_leaks=[],
            reasoning=f"Evaluation failed: {str(e)}",
            composite_score=0.0,
        )
|
|
|
|
|
|
async def evaluate_rag_test_case(
    judge_instance,
    test_case: Dict[str, Any],
    service_response: Dict[str, Any],
) -> TestResult:
    """
    Evaluate a full RAG test case from the golden suite.

    Dispatches on the test case's category to the matching judge method,
    then projects the category-specific composite score onto the generic
    TestResult schema.
    """
    start_time = time.time()

    test_id = test_case.get("id", "UNKNOWN")
    test_name = test_case.get("name", "")
    category = test_case.get("category", "")
    min_score = test_case.get("min_score", 3.5)

    # Hoist the shared sub-dicts so every branch reads the same objects.
    input_data = test_case.get("input", {})
    expected = test_case.get("expected", {})
    context = input_data.get("context", {})

    composite_score = 0.0
    reasoning = ""

    if category == "eh_retrieval":
        outcome = await judge_instance.evaluate_retrieval(
            query=input_data.get("query", ""),
            aufgabentyp=context.get("aufgabentyp", ""),
            subject=context.get("subject", "Deutsch"),
            level=context.get("level", "Abitur"),
            retrieved_passage=service_response.get("passage", ""),
            expected_concepts=expected.get("must_contain_concepts", []),
        )
        composite_score, reasoning = outcome.composite_score, outcome.reasoning

    elif category == "operator_alignment":
        outcome = await judge_instance.evaluate_operator(
            operator=input_data.get("operator", ""),
            generated_definition=service_response.get("definition", ""),
            expected_afb=expected.get("afb_level", ""),
            expected_actions=expected.get("expected_actions", []),
        )
        composite_score, reasoning = outcome.composite_score, outcome.reasoning

    elif category == "hallucination_control":
        outcome = await judge_instance.evaluate_hallucination(
            query=input_data.get("query", ""),
            response=service_response.get("response", ""),
            available_facts=context.get("available_facts", []),
        )
        composite_score, reasoning = outcome.composite_score, outcome.reasoning

    elif category == "privacy_compliance":
        outcome = await judge_instance.evaluate_privacy(
            query=input_data.get("query", ""),
            context=context,
            response=service_response.get("response", ""),
        )
        composite_score, reasoning = outcome.composite_score, outcome.reasoning

    elif category == "namespace_isolation":
        outcome = await judge_instance.evaluate_namespace(
            teacher_id=context.get("teacher_id", ""),
            namespace=context.get("namespace", ""),
            school_id=context.get("school_id", ""),
            requested_data=input_data.get("query", ""),
            response=service_response.get("response", ""),
        )
        composite_score, reasoning = outcome.composite_score, outcome.reasoning

    else:
        # Unknown categories keep the zero score and therefore fail below.
        reasoning = f"Unknown category: {category}"

    duration_ms = int((time.time() - start_time) * 1000)
    passed = composite_score >= min_score

    # The RAG judge reuses the intent-oriented TestResult schema, so the
    # single composite is projected onto its 0-100 / 1-5 fields.
    return TestResult(
        test_id=test_id,
        test_name=test_name,
        user_input=str(input_data),
        expected_intent=category,
        detected_intent=category,
        response=str(service_response),
        intent_accuracy=int(composite_score / 5 * 100),
        faithfulness=int(composite_score),
        relevance=int(composite_score),
        coherence=int(composite_score),
        safety="pass" if passed else "fail",
        composite_score=composite_score,
        passed=passed,
        reasoning=reasoning,
        # NOTE(review): datetime.utcnow() is deprecated since Python 3.12;
        # datetime.now(timezone.utc) would make this tz-aware — confirm
        # downstream consumers before changing.
        timestamp=datetime.utcnow(),
        duration_ms=duration_ms,
    )
|