Files
breakpilot-lehrer/voice-service/bqas/rag_judge.py
Benjamin Admin 451365a312 [split-required] Split remaining 500-680 LOC files (final batch)
website (17 pages + 3 components):
- multiplayer/wizard, middleware/wizard+test-wizard, communication
- builds/wizard, staff-search, voice, sbom/wizard
- foerderantrag, mail/tasks, tools/communication, sbom
- compliance/evidence, uni-crawler, brandbook (already done)
- CollectionsTab, IngestionTab, RiskHeatmap

backend-lehrer (5 files):
- letters_api (641 → 2), certificates_api (636 → 2)
- alerts_agent/db/models (636 → 3)
- llm_gateway/communication_service (614 → 2)
- game/database already done in prior batch

klausur-service (2 files):
- hybrid_vocab_extractor (664 → 2)
- klausur-service/frontend: api.ts (620 → 3), EHUploadWizard (591 → 2)

voice-service (3 files):
- bqas/rag_judge (618 → 3), runner (529 → 2)
- enhanced_task_orchestrator (519 → 2)

studio-v2 (6 files):
- korrektur/[klausurId] (578 → 4), fairness (569 → 2)
- AlertsWizard (552 → 2), OnboardingWizard (513 → 2)
- korrektur/api.ts (506 → 3), geo-lernwelt (501 → 2)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-25 08:56:45 +02:00

179 lines
5.7 KiB
Python

"""
RAG Judge - Specialized evaluation for RAG/Correction quality
Split into:
- rag_judge_types.py: Data classes for evaluation results
- rag_judge_evaluators.py: Individual evaluation methods
- rag_judge.py (this file): RAGJudge class (orchestrator + barrel re-exports)
"""
import json
import structlog
import httpx
from typing import Optional, Dict, List, Any
from bqas.config import BQASConfig
from bqas.metrics import TestResult
# Re-export types for backward compatibility
from bqas.rag_judge_types import (
RAGRetrievalResult,
RAGOperatorResult,
RAGHallucinationResult,
RAGPrivacyResult,
RAGNamespaceResult,
)
from bqas.rag_judge_evaluators import (
evaluate_retrieval as _evaluate_retrieval,
evaluate_operator as _evaluate_operator,
evaluate_hallucination as _evaluate_hallucination,
evaluate_privacy as _evaluate_privacy,
evaluate_namespace as _evaluate_namespace,
evaluate_rag_test_case as _evaluate_rag_test_case,
)
# Public surface of this barrel module: the orchestrator class plus the
# result dataclasses re-exported from bqas.rag_judge_types so existing
# `from bqas.rag_judge import ...` imports keep working after the split.
__all__ = [
    "RAGJudge",
    "RAGRetrievalResult",
    "RAGOperatorResult",
    "RAGHallucinationResult",
    "RAGPrivacyResult",
    "RAGNamespaceResult",
]

# Module-level structured logger (structlog), named after this module.
logger = structlog.get_logger(__name__)
class RAGJudge:
    """
    Specialized judge for RAG/Correction quality evaluation.

    Thin orchestrator after the file split: it owns the HTTP client and the
    Ollama call/parse plumbing, and delegates each evaluation to the
    functions in bqas.rag_judge_evaluators.

    Evaluates:
    - EH Retrieval quality
    - Operator alignment
    - Hallucination control
    - Privacy/DSGVO compliance
    - Namespace isolation
    """

    def __init__(self, config: Optional[BQASConfig] = None):
        """Create a judge; falls back to env-derived config when none is given."""
        self.config = config or BQASConfig.from_env()
        # Created lazily on first request so construction stays cheap and sync.
        self._client: Optional[httpx.AsyncClient] = None

    async def __aenter__(self) -> "RAGJudge":
        """Support `async with RAGJudge() as judge:` usage."""
        return self

    async def __aexit__(self, exc_type, exc, tb) -> None:
        """Close the underlying HTTP client on context exit."""
        await self.close()

    async def _get_client(self) -> httpx.AsyncClient:
        """Get or create the shared async HTTP client."""
        if self._client is None:
            self._client = httpx.AsyncClient(timeout=self.config.judge_timeout)
        return self._client

    async def _call_ollama(self, prompt: str) -> str:
        """Call the Ollama generate API with *prompt* and return the raw text.

        Raises httpx.HTTPStatusError on non-2xx responses.
        """
        client = await self._get_client()
        resp = await client.post(
            f"{self.config.ollama_base_url}/api/generate",
            json={
                "model": self.config.judge_model,
                "prompt": prompt,
                "stream": False,
                "options": {
                    # Low temperature: judging should be near-deterministic.
                    "temperature": 0.1,
                    "num_predict": 800,
                },
            },
        )
        resp.raise_for_status()
        return resp.json().get("response", "")

    def _parse_json_response(self, text: str) -> dict:
        """Extract and parse the first {...} span from *text*.

        Returns {} when no JSON object is found or parsing fails; parse
        failures are logged with a truncated excerpt for debugging.
        """
        try:
            # Tolerate chatty model output: take the outermost brace span.
            start = text.find("{")
            end = text.rfind("}") + 1
            if start >= 0 and end > start:
                json_str = text[start:end]
                return json.loads(json_str)
        except (json.JSONDecodeError, ValueError) as e:
            logger.warning("Failed to parse JSON response", error=str(e), text=text[:200])
        return {}

    async def evaluate_retrieval(
        self, query: str, aufgabentyp: str, subject: str, level: str,
        retrieved_passage: str, expected_concepts: List[str],
    ) -> RAGRetrievalResult:
        """Judge EH retrieval quality; delegates to rag_judge_evaluators."""
        return await _evaluate_retrieval(
            self._call_ollama, self._parse_json_response, self.config,
            query, aufgabentyp, subject, level, retrieved_passage, expected_concepts,
        )

    async def evaluate_operator(
        self, operator: str, generated_definition: str,
        expected_afb: str, expected_actions: List[str],
    ) -> RAGOperatorResult:
        """Judge operator/AFB alignment; delegates to rag_judge_evaluators."""
        return await _evaluate_operator(
            self._call_ollama, self._parse_json_response,
            operator, generated_definition, expected_afb, expected_actions,
        )

    async def evaluate_hallucination(
        self, query: str, response: str, available_facts: List[str],
    ) -> RAGHallucinationResult:
        """Judge hallucination control; delegates to rag_judge_evaluators."""
        return await _evaluate_hallucination(
            self._call_ollama, self._parse_json_response,
            query, response, available_facts,
        )

    async def evaluate_privacy(
        self, query: str, context: Dict[str, Any], response: str,
    ) -> RAGPrivacyResult:
        """Judge privacy/DSGVO compliance; delegates to rag_judge_evaluators."""
        return await _evaluate_privacy(
            self._call_ollama, self._parse_json_response,
            query, context, response,
        )

    async def evaluate_namespace(
        self, teacher_id: str, namespace: str, school_id: str,
        requested_data: str, response: str,
    ) -> RAGNamespaceResult:
        """Judge namespace isolation; delegates to rag_judge_evaluators."""
        return await _evaluate_namespace(
            self._call_ollama, self._parse_json_response,
            teacher_id, namespace, school_id, requested_data, response,
        )

    async def evaluate_rag_test_case(
        self, test_case: Dict[str, Any], service_response: Dict[str, Any],
    ) -> TestResult:
        """Run the combined RAG test-case evaluation; delegates to evaluators."""
        return await _evaluate_rag_test_case(self, test_case, service_response)

    async def health_check(self) -> bool:
        """Check if Ollama is reachable and the judge model is installed.

        Returns False (never raises) on any transport or parsing failure.
        """
        try:
            client = await self._get_client()
            response = await client.get(f"{self.config.ollama_base_url}/api/tags")
            if response.status_code != 200:
                return False
            models = response.json().get("models", [])
            model_names = [m.get("name", "") for m in models]
            # Substring match: Ollama tag names carry size/quant suffixes
            # (e.g. "llama3:8b-instruct"), so an exact compare would miss.
            if any(self.config.judge_model in name for name in model_names):
                return True
            logger.warning(
                "Judge model not found",
                model=self.config.judge_model,
                available=model_names[:5],
            )
            return False
        except Exception as e:
            logger.error("Health check failed", error=str(e))
            return False

    async def close(self):
        """Close the HTTP client (idempotent; safe if never opened)."""
        if self._client:
            await self._client.aclose()
            self._client = None