[split-required] Split remaining 500-680 LOC files (final batch)

website (17 pages + 3 components):
- multiplayer/wizard, middleware/wizard+test-wizard, communication
- builds/wizard, staff-search, voice, sbom/wizard
- foerderantrag, mail/tasks, tools/communication, sbom
- compliance/evidence, uni-crawler, brandbook (already done)
- CollectionsTab, IngestionTab, RiskHeatmap

backend-lehrer (5 files):
- letters_api (641 → 2), certificates_api (636 → 2)
- alerts_agent/db/models (636 → 3)
- llm_gateway/communication_service (614 → 2)
- game/database already done in prior batch

klausur-service (2 files):
- hybrid_vocab_extractor (664 → 2)
- klausur-service/frontend: api.ts (620 → 3), EHUploadWizard (591 → 2)

voice-service (3 files):
- bqas/rag_judge (618 → 3), runner (529 → 2)
- enhanced_task_orchestrator (519 → 2)

studio-v2 (6 files):
- korrektur/[klausurId] (578 → 4), fairness (569 → 2)
- AlertsWizard (552 → 2), OnboardingWizard (513 → 2)
- korrektur/api.ts (506 → 3), geo-lernwelt (501 → 2)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-25 08:56:45 +02:00
parent b4613e26f3
commit 451365a312
115 changed files with 10694 additions and 13839 deletions

View File

@@ -1,82 +1,49 @@
"""
RAG Judge - Specialized evaluation for RAG/Correction quality
Split into:
- rag_judge_types.py: Data classes for evaluation results
- rag_judge_evaluators.py: Individual evaluation methods
- rag_judge.py (this file): RAGJudge class (orchestrator + barrel re-exports)
"""
import json
import time
import structlog
import httpx
from dataclasses import dataclass
from typing import Literal, Optional, Dict, List, Any
from datetime import datetime
from typing import Optional, Dict, List, Any
from bqas.config import BQASConfig
from bqas.prompts import (
RAG_RETRIEVAL_JUDGE_PROMPT,
RAG_OPERATOR_JUDGE_PROMPT,
RAG_HALLUCINATION_JUDGE_PROMPT,
RAG_PRIVACY_JUDGE_PROMPT,
RAG_NAMESPACE_JUDGE_PROMPT,
)
from bqas.metrics import TestResult
# Re-export types for backward compatibility
from bqas.rag_judge_types import (
RAGRetrievalResult,
RAGOperatorResult,
RAGHallucinationResult,
RAGPrivacyResult,
RAGNamespaceResult,
)
from bqas.rag_judge_evaluators import (
evaluate_retrieval as _evaluate_retrieval,
evaluate_operator as _evaluate_operator,
evaluate_hallucination as _evaluate_hallucination,
evaluate_privacy as _evaluate_privacy,
evaluate_namespace as _evaluate_namespace,
evaluate_rag_test_case as _evaluate_rag_test_case,
)
__all__ = [
"RAGJudge",
"RAGRetrievalResult",
"RAGOperatorResult",
"RAGHallucinationResult",
"RAGPrivacyResult",
"RAGNamespaceResult",
]
logger = structlog.get_logger(__name__)
@dataclass
class RAGRetrievalResult:
    """Scores produced by a single RAG retrieval evaluation.

    Attributes:
        retrieval_precision: Retrieval precision on a 0-100 scale.
        faithfulness: Faithfulness rating on a 1-5 scale.
        relevance: Relevance rating on a 1-5 scale.
        citation_accuracy: Citation accuracy rating on a 1-5 scale.
        reasoning: Reasoning text accompanying the scores.
        composite_score: Weighted combination of the individual scores.
    """

    retrieval_precision: int
    faithfulness: int
    relevance: int
    citation_accuracy: int
    reasoning: str
    composite_score: float
@dataclass
class RAGOperatorResult:
    """Scores produced by a single operator alignment evaluation.

    Attributes:
        operator_alignment: Operator alignment on a 0-100 scale.
        faithfulness: Faithfulness rating on a 1-5 scale.
        completeness: Completeness rating on a 1-5 scale.
        detected_afb: Detected AFB level (I, II or III).
        reasoning: Reasoning text accompanying the scores.
        composite_score: Weighted combination of the individual scores.
    """

    operator_alignment: int
    faithfulness: int
    completeness: int
    detected_afb: str
    reasoning: str
    composite_score: float
@dataclass
class RAGHallucinationResult:
    """Scores produced by a single hallucination control evaluation.

    Attributes:
        grounding_score: Grounding score on a 0-100 scale.
        invention_detection: Either ``"pass"`` or ``"fail"``.
        source_attribution: Source attribution rating on a 1-5 scale.
        hallucinated_claims: Claims reported as hallucinated.
        reasoning: Reasoning text accompanying the scores.
        composite_score: Weighted combination of the individual scores.
    """

    grounding_score: int
    invention_detection: Literal["pass", "fail"]
    source_attribution: int
    hallucinated_claims: List[str]
    reasoning: str
    composite_score: float
@dataclass
class RAGPrivacyResult:
    """Scores produced by a single privacy compliance evaluation.

    Attributes:
        privacy_compliance: Either ``"pass"`` or ``"fail"``.
        anonymization: Anonymization rating on a 1-5 scale.
        dsgvo_compliance: Either ``"pass"`` or ``"fail"``.
        detected_pii: PII items reported as detected.
        reasoning: Reasoning text accompanying the scores.
        composite_score: Weighted combination of the individual scores.
    """

    privacy_compliance: Literal["pass", "fail"]
    anonymization: int
    dsgvo_compliance: Literal["pass", "fail"]
    detected_pii: List[str]
    reasoning: str
    composite_score: float
@dataclass
class RAGNamespaceResult:
    """Scores produced by a single namespace isolation evaluation.

    Attributes:
        namespace_compliance: Either ``"pass"`` or ``"fail"``.
        cross_tenant_leak: Either ``"pass"`` or ``"fail"``.
        school_sharing_compliance: School sharing rating on a 1-5 scale.
        detected_leaks: Leaks reported as detected.
        reasoning: Reasoning text accompanying the scores.
        composite_score: Weighted combination of the individual scores.
    """

    namespace_compliance: Literal["pass", "fail"]
    cross_tenant_leak: Literal["pass", "fail"]
    school_sharing_compliance: int
    detected_leaks: List[str]
    reasoning: str
    composite_score: float
class RAGJudge:
"""
Specialized judge for RAG/Correction quality evaluation.
@@ -130,460 +97,53 @@ class RAGJudge:
logger.warning("Failed to parse JSON response", error=str(e), text=text[:200])
return {}
# ================================
# Retrieval Evaluation
# ================================
async def evaluate_retrieval(
    self,
    query: str,
    aufgabentyp: str,
    subject: str,
    level: str,
    retrieved_passage: str,
    expected_concepts: List[str],
) -> RAGRetrievalResult:
    """Evaluate EH retrieval quality.

    Thin wrapper around ``bqas.rag_judge_evaluators.evaluate_retrieval``:
    it forwards this judge's LLM caller, JSON parser and config, followed
    by every argument unchanged and in the same order.

    Args:
        query: Query that triggered the retrieval.
        aufgabentyp: Task type of the query.
        subject: School subject.
        level: Educational level.
        retrieved_passage: Passage returned by the retrieval step.
        expected_concepts: Concepts the passage is expected to cover.

    Returns:
        The ``RAGRetrievalResult`` produced by the evaluator.
    """
    return await _evaluate_retrieval(
        self._call_ollama,
        self._parse_json_response,
        self.config,
        query,
        aufgabentyp,
        subject,
        level,
        retrieved_passage,
        expected_concepts,
    )
def _calculate_retrieval_composite(
self,
retrieval_precision: int,
faithfulness: int,
relevance: int,
citation_accuracy: int,
) -> float:
"""Calculate composite score for retrieval evaluation."""
c = self.config
retrieval_score = (retrieval_precision / 100) * 5
composite = (
retrieval_score * c.rag_retrieval_precision_weight +
faithfulness * c.rag_faithfulness_weight +
relevance * 0.3 + # Higher weight for relevance in retrieval
citation_accuracy * c.rag_citation_accuracy_weight
)
return round(composite, 3)
# ================================
# Operator Evaluation
# ================================
async def evaluate_operator(
    self,
    operator: str,
    generated_definition: str,
    expected_afb: str,
    expected_actions: List[str],
) -> RAGOperatorResult:
    """Evaluate operator alignment.

    Thin wrapper around ``bqas.rag_judge_evaluators.evaluate_operator``:
    it forwards this judge's LLM caller and JSON parser, followed by every
    argument unchanged and in the same order.

    Args:
        operator: Operator under evaluation.
        generated_definition: Definition generated for the operator.
        expected_afb: Expected AFB level.
        expected_actions: Actions the definition is expected to mention.

    Returns:
        The ``RAGOperatorResult`` produced by the evaluator.
    """
    return await _evaluate_operator(
        self._call_ollama,
        self._parse_json_response,
        operator,
        generated_definition,
        expected_afb,
        expected_actions,
    )
def _calculate_operator_composite(
self,
operator_alignment: int,
faithfulness: int,
completeness: int,
) -> float:
"""Calculate composite score for operator evaluation."""
alignment_score = (operator_alignment / 100) * 5
composite = (
alignment_score * 0.5 +
faithfulness * 0.3 +
completeness * 0.2
)
return round(composite, 3)
# ================================
# Hallucination Evaluation
# ================================
async def evaluate_hallucination(
    self,
    query: str,
    response: str,
    available_facts: List[str],
) -> RAGHallucinationResult:
    """Evaluate for hallucinations.

    Thin wrapper around ``bqas.rag_judge_evaluators.evaluate_hallucination``:
    it forwards this judge's LLM caller and JSON parser, followed by every
    argument unchanged and in the same order.

    Args:
        query: Query that produced the response.
        response: Response to check.
        available_facts: Facts the response may draw on.

    Returns:
        The ``RAGHallucinationResult`` produced by the evaluator.
    """
    return await _evaluate_hallucination(
        self._call_ollama,
        self._parse_json_response,
        query,
        response,
        available_facts,
    )
def _calculate_hallucination_composite(
self,
grounding_score: int,
invention_detection: str,
source_attribution: int,
) -> float:
"""Calculate composite score for hallucination evaluation."""
grounding = (grounding_score / 100) * 5
invention = 5.0 if invention_detection == "pass" else 0.0
composite = (
grounding * 0.4 +
invention * 0.4 +
source_attribution * 0.2
)
return round(composite, 3)
# ================================
# Privacy Evaluation
# ================================
async def evaluate_privacy(
    self,
    query: str,
    context: Dict[str, Any],
    response: str,
) -> RAGPrivacyResult:
    """Evaluate privacy/DSGVO compliance.

    Thin wrapper around ``bqas.rag_judge_evaluators.evaluate_privacy``:
    it forwards this judge's LLM caller and JSON parser, followed by every
    argument unchanged and in the same order.

    Args:
        query: Query that produced the response.
        context: Context dictionary belonging to the query.
        response: Response to check.

    Returns:
        The ``RAGPrivacyResult`` produced by the evaluator.
    """
    return await _evaluate_privacy(
        self._call_ollama,
        self._parse_json_response,
        query,
        context,
        response,
    )
def _calculate_privacy_composite(
self,
privacy_compliance: str,
anonymization: int,
dsgvo_compliance: str,
) -> float:
"""Calculate composite score for privacy evaluation."""
privacy = 5.0 if privacy_compliance == "pass" else 0.0
dsgvo = 5.0 if dsgvo_compliance == "pass" else 0.0
composite = (
privacy * 0.4 +
anonymization * 0.2 +
dsgvo * 0.4
)
return round(composite, 3)
# ================================
# Namespace Evaluation
# ================================
async def evaluate_namespace(
    self,
    teacher_id: str,
    namespace: str,
    school_id: str,
    requested_data: str,
    response: str,
) -> RAGNamespaceResult:
    """Evaluate namespace isolation.

    Thin wrapper around ``bqas.rag_judge_evaluators.evaluate_namespace``:
    it forwards this judge's LLM caller and JSON parser, followed by every
    argument unchanged and in the same order.

    Args:
        teacher_id: Identifier of the requesting teacher.
        namespace: Namespace the request belongs to.
        school_id: Identifier of the school.
        requested_data: Description of the data that was requested.
        response: Response to check.

    Returns:
        The ``RAGNamespaceResult`` produced by the evaluator.
    """
    return await _evaluate_namespace(
        self._call_ollama,
        self._parse_json_response,
        teacher_id,
        namespace,
        school_id,
        requested_data,
        response,
    )
def _calculate_namespace_composite(
self,
namespace_compliance: str,
cross_tenant_leak: str,
school_sharing_compliance: int,
) -> float:
"""Calculate composite score for namespace evaluation."""
ns_compliance = 5.0 if namespace_compliance == "pass" else 0.0
cross_tenant = 5.0 if cross_tenant_leak == "pass" else 0.0
composite = (
ns_compliance * 0.4 +
cross_tenant * 0.4 +
school_sharing_compliance * 0.2
)
return round(composite, 3)
# ================================
# Test Case Evaluation
# ================================
async def evaluate_rag_test_case(
    self,
    test_case: Dict[str, Any],
    service_response: Dict[str, Any],
) -> TestResult:
    """Evaluate a full RAG test case from the golden suite.

    Thin wrapper around
    ``bqas.rag_judge_evaluators.evaluate_rag_test_case``: it passes this
    judge instance together with both arguments unchanged.

    Args:
        test_case: Test case definition from YAML.
        service_response: Response from the service being tested.

    Returns:
        TestResult with all metrics.
    """
    return await _evaluate_rag_test_case(self, test_case, service_response)
async def health_check(self) -> bool:
"""Check if Ollama and judge model are available."""