""" RAG Judge - Specialized evaluation for RAG/Correction quality Split into: - rag_judge_types.py: Data classes for evaluation results - rag_judge_evaluators.py: Individual evaluation methods - rag_judge.py (this file): RAGJudge class (orchestrator + barrel re-exports) """ import json import structlog import httpx from typing import Optional, Dict, List, Any from bqas.config import BQASConfig from bqas.metrics import TestResult # Re-export types for backward compatibility from bqas.rag_judge_types import ( RAGRetrievalResult, RAGOperatorResult, RAGHallucinationResult, RAGPrivacyResult, RAGNamespaceResult, ) from bqas.rag_judge_evaluators import ( evaluate_retrieval as _evaluate_retrieval, evaluate_operator as _evaluate_operator, evaluate_hallucination as _evaluate_hallucination, evaluate_privacy as _evaluate_privacy, evaluate_namespace as _evaluate_namespace, evaluate_rag_test_case as _evaluate_rag_test_case, ) __all__ = [ "RAGJudge", "RAGRetrievalResult", "RAGOperatorResult", "RAGHallucinationResult", "RAGPrivacyResult", "RAGNamespaceResult", ] logger = structlog.get_logger(__name__) class RAGJudge: """ Specialized judge for RAG/Correction quality evaluation. Evaluates: - EH Retrieval quality - Operator alignment - Hallucination control - Privacy/DSGVO compliance - Namespace isolation """ def __init__(self, config: Optional[BQASConfig] = None): self.config = config or BQASConfig.from_env() self._client: Optional[httpx.AsyncClient] = None async def _get_client(self) -> httpx.AsyncClient: """Get or create HTTP client.""" if self._client is None: self._client = httpx.AsyncClient(timeout=self.config.judge_timeout) return self._client async def _call_ollama(self, prompt: str) -> str: """Call Ollama API with prompt.""" client = await self._get_client() resp = await client.post( f"{self.config.ollama_base_url}/api/generate", json={ "model": self.config.judge_model, "prompt": prompt, "stream": False, "options": { "temperature": 0.1, "num_predict": 800, }, }, ) resp.raise_for_status() return resp.json().get("response", "") def _parse_json_response(self, text: str) -> dict: """Parse JSON from response text.""" try: start = text.find("{") end = text.rfind("}") + 1 if start >= 0 and end > start: json_str = text[start:end] return json.loads(json_str) except (json.JSONDecodeError, ValueError) as e: logger.warning("Failed to parse JSON response", error=str(e), text=text[:200]) return {} async def evaluate_retrieval( self, query: str, aufgabentyp: str, subject: str, level: str, retrieved_passage: str, expected_concepts: List[str], ) -> RAGRetrievalResult: return await _evaluate_retrieval( self._call_ollama, self._parse_json_response, self.config, query, aufgabentyp, subject, level, retrieved_passage, expected_concepts, ) async def evaluate_operator( self, operator: str, generated_definition: str, expected_afb: str, expected_actions: List[str], ) -> RAGOperatorResult: return await _evaluate_operator( self._call_ollama, self._parse_json_response, operator, generated_definition, expected_afb, expected_actions, ) async def evaluate_hallucination( self, query: str, response: str, available_facts: List[str], ) -> RAGHallucinationResult: return await _evaluate_hallucination( self._call_ollama, self._parse_json_response, query, response, available_facts, ) async def evaluate_privacy( self, query: str, context: Dict[str, Any], response: str, ) -> RAGPrivacyResult: return await _evaluate_privacy( self._call_ollama, self._parse_json_response, query, context, response, ) async def evaluate_namespace( self, teacher_id: str, namespace: str, school_id: str, requested_data: str, response: str, ) -> RAGNamespaceResult: return await _evaluate_namespace( self._call_ollama, self._parse_json_response, teacher_id, namespace, school_id, requested_data, response, ) async def evaluate_rag_test_case( self, test_case: Dict[str, Any], service_response: Dict[str, Any], ) -> TestResult: return await _evaluate_rag_test_case(self, test_case, service_response) async def health_check(self) -> bool: """Check if Ollama and judge model are available.""" try: client = await self._get_client() response = await client.get(f"{self.config.ollama_base_url}/api/tags") if response.status_code != 200: return False models = response.json().get("models", []) model_names = [m.get("name", "") for m in models] for name in model_names: if self.config.judge_model in name: return True logger.warning( "Judge model not found", model=self.config.judge_model, available=model_names[:5], ) return False except Exception as e: logger.error("Health check failed", error=str(e)) return False async def close(self): """Close HTTP client.""" if self._client: await self._client.aclose() self._client = None