website (17 pages + 3 components): - multiplayer/wizard, middleware/wizard+test-wizard, communication - builds/wizard, staff-search, voice, sbom/wizard - foerderantrag, mail/tasks, tools/communication, sbom - compliance/evidence, uni-crawler, brandbook (already done) - CollectionsTab, IngestionTab, RiskHeatmap backend-lehrer (5 files): - letters_api (641 → 2), certificates_api (636 → 2) - alerts_agent/db/models (636 → 3) - llm_gateway/communication_service (614 → 2) - game/database already done in prior batch klausur-service (2 files): - hybrid_vocab_extractor (664 → 2) - klausur-service/frontend: api.ts (620 → 3), EHUploadWizard (591 → 2) voice-service (3 files): - bqas/rag_judge (618 → 3), runner (529 → 2) - enhanced_task_orchestrator (519 → 2) studio-v2 (6 files): - korrektur/[klausurId] (578 → 4), fairness (569 → 2) - AlertsWizard (552 → 2), OnboardingWizard (513 → 2) - korrektur/api.ts (506 → 3), geo-lernwelt (501 → 2) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
179 lines
5.7 KiB
Python
179 lines
5.7 KiB
Python
"""
|
|
RAG Judge - Specialized evaluation for RAG/Correction quality
|
|
|
|
Split into:
|
|
- rag_judge_types.py: Data classes for evaluation results
|
|
- rag_judge_evaluators.py: Individual evaluation methods
|
|
- rag_judge.py (this file): RAGJudge class (orchestrator + barrel re-exports)
|
|
"""
|
|
import json
|
|
import structlog
|
|
import httpx
|
|
from typing import Optional, Dict, List, Any
|
|
|
|
from bqas.config import BQASConfig
|
|
from bqas.metrics import TestResult
|
|
|
|
# Re-export types for backward compatibility
|
|
from bqas.rag_judge_types import (
|
|
RAGRetrievalResult,
|
|
RAGOperatorResult,
|
|
RAGHallucinationResult,
|
|
RAGPrivacyResult,
|
|
RAGNamespaceResult,
|
|
)
|
|
|
|
from bqas.rag_judge_evaluators import (
|
|
evaluate_retrieval as _evaluate_retrieval,
|
|
evaluate_operator as _evaluate_operator,
|
|
evaluate_hallucination as _evaluate_hallucination,
|
|
evaluate_privacy as _evaluate_privacy,
|
|
evaluate_namespace as _evaluate_namespace,
|
|
evaluate_rag_test_case as _evaluate_rag_test_case,
|
|
)
|
|
|
|
# Public surface of this module: the orchestrator class plus the result
# types re-exported above for backward compatibility.
__all__ = [
    "RAGJudge",
    "RAGRetrievalResult",
    "RAGOperatorResult",
    "RAGHallucinationResult",
    "RAGPrivacyResult",
    "RAGNamespaceResult",
]

logger = structlog.get_logger(__name__)
|
|
|
|
|
|
class RAGJudge:
    """
    Specialized judge for RAG/Correction quality evaluation.

    Evaluates:
    - EH Retrieval quality
    - Operator alignment
    - Hallucination control
    - Privacy/DSGVO compliance
    - Namespace isolation

    The per-dimension ``evaluate_*`` methods delegate to the free functions
    in ``bqas.rag_judge_evaluators``; this class owns the shared HTTP client
    and the Ollama call/JSON-parse plumbing those evaluators use.
    """

    def __init__(self, config: Optional[BQASConfig] = None):
        """Create a judge; without an explicit config, load it from the environment."""
        self.config = config or BQASConfig.from_env()
        # Lazily created in _get_client() so construction never opens connections.
        self._client: Optional[httpx.AsyncClient] = None

    async def _get_client(self) -> httpx.AsyncClient:
        """Get or lazily create the shared async HTTP client."""
        if self._client is None:
            self._client = httpx.AsyncClient(timeout=self.config.judge_timeout)
        return self._client

    async def _call_ollama(self, prompt: str) -> str:
        """Call the Ollama /api/generate endpoint and return the raw response text.

        Raises:
            httpx.HTTPStatusError: on non-2xx responses (via raise_for_status).
        """
        client = await self._get_client()

        resp = await client.post(
            f"{self.config.ollama_base_url}/api/generate",
            json={
                "model": self.config.judge_model,
                "prompt": prompt,
                "stream": False,
                "options": {
                    # Low temperature for near-deterministic judging;
                    # cap generation length to keep evaluations fast.
                    "temperature": 0.1,
                    "num_predict": 800,
                },
            },
        )
        resp.raise_for_status()
        return resp.json().get("response", "")

    def _parse_json_response(self, text: str) -> dict:
        """Extract and parse the first ``{...}`` span from LLM output.

        Returns {} when no JSON object can be located or decoded; both
        failure modes are logged so silently-empty judgements are visible.
        """
        start = text.find("{")
        end = text.rfind("}") + 1
        if start >= 0 and end > start:
            try:
                return json.loads(text[start:end])
            except (json.JSONDecodeError, ValueError) as e:
                logger.warning("Failed to parse JSON response", error=str(e), text=text[:200])
        else:
            # Fix: this path previously returned {} with no log, hiding
            # responses that contained no JSON object at all.
            logger.warning("No JSON object found in response", text=text[:200])
        return {}

    async def evaluate_retrieval(
        self, query: str, aufgabentyp: str, subject: str, level: str,
        retrieved_passage: str, expected_concepts: List[str],
    ) -> RAGRetrievalResult:
        """Judge EH retrieval quality for a query against the retrieved passage."""
        return await _evaluate_retrieval(
            self._call_ollama, self._parse_json_response, self.config,
            query, aufgabentyp, subject, level, retrieved_passage, expected_concepts,
        )

    async def evaluate_operator(
        self, operator: str, generated_definition: str,
        expected_afb: str, expected_actions: List[str],
    ) -> RAGOperatorResult:
        """Judge whether a generated operator definition matches AFB expectations."""
        return await _evaluate_operator(
            self._call_ollama, self._parse_json_response,
            operator, generated_definition, expected_afb, expected_actions,
        )

    async def evaluate_hallucination(
        self, query: str, response: str, available_facts: List[str],
    ) -> RAGHallucinationResult:
        """Judge whether a response stays grounded in the available facts."""
        return await _evaluate_hallucination(
            self._call_ollama, self._parse_json_response,
            query, response, available_facts,
        )

    async def evaluate_privacy(
        self, query: str, context: Dict[str, Any], response: str,
    ) -> RAGPrivacyResult:
        """Judge a response for privacy/DSGVO compliance."""
        return await _evaluate_privacy(
            self._call_ollama, self._parse_json_response,
            query, context, response,
        )

    async def evaluate_namespace(
        self, teacher_id: str, namespace: str, school_id: str,
        requested_data: str, response: str,
    ) -> RAGNamespaceResult:
        """Judge whether a response respects tenant/namespace isolation."""
        return await _evaluate_namespace(
            self._call_ollama, self._parse_json_response,
            teacher_id, namespace, school_id, requested_data, response,
        )

    async def evaluate_rag_test_case(
        self, test_case: Dict[str, Any], service_response: Dict[str, Any],
    ) -> TestResult:
        """Run the full multi-dimension evaluation for one RAG test case."""
        return await _evaluate_rag_test_case(self, test_case, service_response)

    async def health_check(self) -> bool:
        """Check if Ollama is reachable and the configured judge model is installed.

        Returns False (never raises) on any connectivity or parsing failure,
        since callers use this as a liveness probe.
        """
        try:
            client = await self._get_client()
            response = await client.get(f"{self.config.ollama_base_url}/api/tags")
            if response.status_code != 200:
                return False

            models = response.json().get("models", [])
            model_names = [m.get("name", "") for m in models]

            # Substring match: Ollama tag names carry version suffixes
            # (e.g. "model:latest"), so exact equality would miss them.
            if any(self.config.judge_model in name for name in model_names):
                return True

            logger.warning(
                "Judge model not found",
                model=self.config.judge_model,
                available=model_names[:5],
            )
            return False

        except Exception as e:
            # Boundary handler: report "unhealthy" rather than propagate.
            logger.error("Health check failed", error=str(e))
            return False

    async def close(self):
        """Close the HTTP client and reset it so it can be recreated on demand."""
        if self._client:
            await self._client.aclose()
            self._client = None