"""
RAG Judge - Specialized evaluation for RAG/Correction quality.

Sends judge prompts to an Ollama ``/api/generate`` endpoint, extracts the JSON
object embedded in the model's reply and converts it into typed result objects
carrying a weighted composite score.
"""
import json
import time
import structlog
import httpx
from dataclasses import dataclass
from typing import Literal, Optional, Dict, List, Any
from datetime import datetime

from bqas.config import BQASConfig
from bqas.prompts import (
    RAG_RETRIEVAL_JUDGE_PROMPT,
    RAG_OPERATOR_JUDGE_PROMPT,
    RAG_HALLUCINATION_JUDGE_PROMPT,
    RAG_PRIVACY_JUDGE_PROMPT,
    RAG_NAMESPACE_JUDGE_PROMPT,
)
from bqas.metrics import TestResult

logger = structlog.get_logger(__name__)


def _coerce_int(value: Any, low: int, high: int) -> int:
    """Convert a judge-supplied score to an int clamped to ``[low, high]``.

    The judge is an LLM, so a score may arrive as ``null``, a numeric string
    such as ``"4.5"``, or NaN/Infinity. Anything that cannot be interpreted as
    a number yields ``low`` -- the same value used when the key is missing --
    instead of raising and discarding the rest of the evaluation.

    Args:
        value: Raw value taken from the parsed JSON (may be ``None``).
        low: Inclusive lower bound, also the fallback value.
        high: Inclusive upper bound.

    Returns:
        The clamped integer score.
    """
    try:
        number = int(value)
    except (TypeError, ValueError, OverflowError):
        try:
            # Handles numeric strings with a fractional part ("4.5" -> 4).
            number = int(float(value))
        except (TypeError, ValueError, OverflowError):
            return low
    return max(low, min(high, number))


def _coerce_str_list(value: Any, limit: int = 5) -> List[str]:
    """Normalize a judge-supplied list field to at most ``limit`` strings.

    Args:
        value: Raw value taken from the parsed JSON. Expected to be a list,
            but the model may return ``null``, a bare string, or another type.
        limit: Maximum number of entries to keep.

    Returns:
        A list of strings. A non-empty bare string becomes a one-element
        list; ``None`` and unsupported types become an empty list.
    """
    if isinstance(value, str):
        return [value] if value else []
    if isinstance(value, (list, tuple)):
        return [str(item) for item in value[:limit]]
    return []


def _pass_fail(value: Any) -> Literal["pass", "fail"]:
    """Return ``"pass"`` only for the exact string ``"pass"``; else ``"fail"``."""
    return "pass" if value == "pass" else "fail"


@dataclass
class RAGRetrievalResult:
    """Result from RAG retrieval evaluation."""
    retrieval_precision: int  # 0-100
    faithfulness: int  # 1-5
    relevance: int  # 1-5
    citation_accuracy: int  # 1-5
    reasoning: str
    composite_score: float


@dataclass
class RAGOperatorResult:
    """Result from operator alignment evaluation."""
    operator_alignment: int  # 0-100
    faithfulness: int  # 1-5
    completeness: int  # 1-5
    detected_afb: str  # I, II, III
    reasoning: str
    composite_score: float


@dataclass
class RAGHallucinationResult:
    """Result from hallucination control evaluation."""
    grounding_score: int  # 0-100
    invention_detection: Literal["pass", "fail"]
    source_attribution: int  # 1-5
    hallucinated_claims: List[str]
    reasoning: str
    composite_score: float


@dataclass
class RAGPrivacyResult:
    """Result from privacy compliance evaluation."""
    privacy_compliance: Literal["pass", "fail"]
    anonymization: int  # 1-5
    dsgvo_compliance: Literal["pass", "fail"]
    detected_pii: List[str]
    reasoning: str
    composite_score: float


@dataclass
class RAGNamespaceResult:
    """Result from namespace isolation evaluation."""
    namespace_compliance: Literal["pass", "fail"]
    cross_tenant_leak: Literal["pass", "fail"]
    school_sharing_compliance: int  # 1-5
    detected_leaks: List[str]
    reasoning: str
    composite_score: float


class RAGJudge:
    """
    Specialized judge for RAG/Correction quality evaluation.

    Evaluates:
    - EH Retrieval quality
    - Operator alignment
    - Hallucination control
    - Privacy/DSGVO compliance
    - Namespace isolation

    Each ``evaluate_*`` coroutine formats a prompt, asks the judge model,
    parses the JSON reply and never raises for judge/transport errors:
    failures are logged and reported as a worst-case result with
    ``composite_score=0.0``.
    """

    def __init__(self, config: Optional[BQASConfig] = None):
        """Store the configuration; the HTTP client is created lazily.

        Args:
            config: Judge configuration. When omitted, it is loaded via
                ``BQASConfig.from_env()``.
        """
        self.config = config or BQASConfig.from_env()
        self._client: Optional[httpx.AsyncClient] = None

    async def _get_client(self) -> httpx.AsyncClient:
        """Get or create HTTP client."""
        if self._client is None:
            self._client = httpx.AsyncClient(timeout=self.config.judge_timeout)
        return self._client

    async def _call_ollama(self, prompt: str) -> str:
        """Call Ollama API with prompt.

        Posts a non-streaming generate request for ``config.judge_model``.

        Args:
            prompt: Fully formatted judge prompt.

        Returns:
            The ``response`` field of the JSON reply, or ``""`` if absent.

        Raises:
            httpx.HTTPStatusError: If the server answers with an error status
                (via ``raise_for_status``). Transport errors from httpx
                propagate as well.
        """
        client = await self._get_client()
        resp = await client.post(
            f"{self.config.ollama_base_url}/api/generate",
            json={
                "model": self.config.judge_model,
                "prompt": prompt,
                "stream": False,
                "options": {
                    "temperature": 0.1,
                    "num_predict": 800,
                },
            },
        )
        resp.raise_for_status()
        return resp.json().get("response", "")

    def _parse_json_response(self, text: str) -> dict:
        """Parse JSON from response text.

        Takes the span from the first ``{`` to the last ``}`` so that prose
        surrounding the JSON object is ignored.

        Args:
            text: Raw model output.

        Returns:
            The decoded object, or ``{}`` when no braces are found or the
            span is not valid JSON (the latter is logged as a warning).
        """
        start = text.find("{")
        end = text.rfind("}") + 1
        if start < 0 or end <= start:
            return {}
        try:
            return json.loads(text[start:end])
        except (json.JSONDecodeError, ValueError) as e:
            logger.warning("Failed to parse JSON response", error=str(e), text=text[:200])
            return {}

    # ================================
    # Retrieval Evaluation
    # ================================

    async def evaluate_retrieval(
        self,
        query: str,
        aufgabentyp: str,
        subject: str,
        level: str,
        retrieved_passage: str,
        expected_concepts: List[str],
    ) -> RAGRetrievalResult:
        """Evaluate EH retrieval quality.

        Args:
            query: Query that triggered the retrieval.
            aufgabentyp: Task type, inserted verbatim into the prompt.
            subject: Subject, inserted verbatim into the prompt.
            level: Level, inserted verbatim into the prompt.
            retrieved_passage: Passage returned by the service under test.
            expected_concepts: Concepts joined with ``", "`` for the prompt.

        Returns:
            Clamped judge scores plus composite; on any judge/transport error
            a worst-case result with ``composite_score=0.0``.
        """
        prompt = RAG_RETRIEVAL_JUDGE_PROMPT.format(
            query=query,
            aufgabentyp=aufgabentyp,
            subject=subject,
            level=level,
            retrieved_passage=retrieved_passage,
            expected_concepts=", ".join(expected_concepts),
        )

        try:
            response_text = await self._call_ollama(prompt)
            data = self._parse_json_response(response_text)

            retrieval_precision = _coerce_int(data.get("retrieval_precision"), 0, 100)
            faithfulness = _coerce_int(data.get("faithfulness"), 1, 5)
            relevance = _coerce_int(data.get("relevance"), 1, 5)
            citation_accuracy = _coerce_int(data.get("citation_accuracy"), 1, 5)

            composite = self._calculate_retrieval_composite(
                retrieval_precision,
                faithfulness,
                relevance,
                citation_accuracy,
            )

            return RAGRetrievalResult(
                retrieval_precision=retrieval_precision,
                faithfulness=faithfulness,
                relevance=relevance,
                citation_accuracy=citation_accuracy,
                reasoning=str(data.get("reasoning", ""))[:500],
                composite_score=composite,
            )

        except Exception as e:
            # Best-effort boundary: a failing judge must not abort the suite.
            logger.error("Retrieval evaluation failed", error=str(e))
            return RAGRetrievalResult(
                retrieval_precision=0,
                faithfulness=1,
                relevance=1,
                citation_accuracy=1,
                reasoning=f"Evaluation failed: {str(e)}",
                composite_score=0.0,
            )

    def _calculate_retrieval_composite(
        self,
        retrieval_precision: int,
        faithfulness: int,
        relevance: int,
        citation_accuracy: int,
    ) -> float:
        """Calculate composite score for retrieval evaluation.

        ``retrieval_precision`` (0-100) is rescaled to 0-5 so all terms share
        one scale. Weights come from the config, except relevance which uses
        a fixed 0.3.

        NOTE(review): whether the weights sum to 1 depends on the configured
        values -- confirm against ``BQASConfig``.
        """
        c = self.config
        retrieval_score = (retrieval_precision / 100) * 5

        composite = (
            retrieval_score * c.rag_retrieval_precision_weight +
            faithfulness * c.rag_faithfulness_weight +
            relevance * 0.3 +  # Higher weight for relevance in retrieval
            citation_accuracy * c.rag_citation_accuracy_weight
        )
        return round(composite, 3)

    # ================================
    # Operator Evaluation
    # ================================

    async def evaluate_operator(
        self,
        operator: str,
        generated_definition: str,
        expected_afb: str,
        expected_actions: List[str],
    ) -> RAGOperatorResult:
        """Evaluate operator alignment.

        Args:
            operator: Operator under evaluation.
            generated_definition: Definition produced by the service.
            expected_afb: Expected AFB level, inserted into the prompt.
            expected_actions: Actions joined with ``", "`` for the prompt.

        Returns:
            Clamped judge scores plus composite; on any judge/transport error
            a worst-case result with ``composite_score=0.0``.
        """
        prompt = RAG_OPERATOR_JUDGE_PROMPT.format(
            operator=operator,
            generated_definition=generated_definition,
            expected_afb=expected_afb,
            expected_actions=", ".join(expected_actions),
        )

        try:
            response_text = await self._call_ollama(prompt)
            data = self._parse_json_response(response_text)

            operator_alignment = _coerce_int(data.get("operator_alignment"), 0, 100)
            faithfulness = _coerce_int(data.get("faithfulness"), 1, 5)
            completeness = _coerce_int(data.get("completeness"), 1, 5)
            detected_afb = str(data.get("detected_afb", ""))

            composite = self._calculate_operator_composite(
                operator_alignment,
                faithfulness,
                completeness,
            )

            return RAGOperatorResult(
                operator_alignment=operator_alignment,
                faithfulness=faithfulness,
                completeness=completeness,
                detected_afb=detected_afb,
                reasoning=str(data.get("reasoning", ""))[:500],
                composite_score=composite,
            )

        except Exception as e:
            # Best-effort boundary: a failing judge must not abort the suite.
            logger.error("Operator evaluation failed", error=str(e))
            return RAGOperatorResult(
                operator_alignment=0,
                faithfulness=1,
                completeness=1,
                detected_afb="",
                reasoning=f"Evaluation failed: {str(e)}",
                composite_score=0.0,
            )

    def _calculate_operator_composite(
        self,
        operator_alignment: int,
        faithfulness: int,
        completeness: int,
    ) -> float:
        """Calculate composite score for operator evaluation.

        Alignment (0-100) is rescaled to 0-5, then combined with fixed
        weights 0.5 / 0.3 / 0.2.
        """
        alignment_score = (operator_alignment / 100) * 5

        composite = (
            alignment_score * 0.5 +
            faithfulness * 0.3 +
            completeness * 0.2
        )
        return round(composite, 3)

    # ================================
    # Hallucination Evaluation
    # ================================

    async def evaluate_hallucination(
        self,
        query: str,
        response: str,
        available_facts: List[str],
    ) -> RAGHallucinationResult:
        """Evaluate for hallucinations.

        Args:
            query: Original query.
            response: Service response to check.
            available_facts: Facts rendered as a ``- fact`` bullet list in
                the prompt.

        Returns:
            Clamped judge scores, up to five hallucinated claims and the
            composite; on any judge/transport error a worst-case result with
            ``composite_score=0.0``.
        """
        prompt = RAG_HALLUCINATION_JUDGE_PROMPT.format(
            query=query,
            response=response,
            available_facts="\n".join(f"- {f}" for f in available_facts),
        )

        try:
            response_text = await self._call_ollama(prompt)
            data = self._parse_json_response(response_text)

            grounding_score = _coerce_int(data.get("grounding_score"), 0, 100)
            invention_detection = _pass_fail(data.get("invention_detection"))
            source_attribution = _coerce_int(data.get("source_attribution"), 1, 5)
            hallucinated_claims = _coerce_str_list(data.get("hallucinated_claims"))

            composite = self._calculate_hallucination_composite(
                grounding_score,
                invention_detection,
                source_attribution,
            )

            return RAGHallucinationResult(
                grounding_score=grounding_score,
                invention_detection=invention_detection,
                source_attribution=source_attribution,
                hallucinated_claims=hallucinated_claims,
                reasoning=str(data.get("reasoning", ""))[:500],
                composite_score=composite,
            )

        except Exception as e:
            # Best-effort boundary: a failing judge must not abort the suite.
            logger.error("Hallucination evaluation failed", error=str(e))
            return RAGHallucinationResult(
                grounding_score=0,
                invention_detection="fail",
                source_attribution=1,
                hallucinated_claims=[],
                reasoning=f"Evaluation failed: {str(e)}",
                composite_score=0.0,
            )

    def _calculate_hallucination_composite(
        self,
        grounding_score: int,
        invention_detection: str,
        source_attribution: int,
    ) -> float:
        """Calculate composite score for hallucination evaluation.

        Grounding (0-100) is rescaled to 0-5; the pass/fail verdict maps to
        5.0 / 0.0. Fixed weights 0.4 / 0.4 / 0.2.
        """
        grounding = (grounding_score / 100) * 5
        invention = 5.0 if invention_detection == "pass" else 0.0

        composite = (
            grounding * 0.4 +
            invention * 0.4 +
            source_attribution * 0.2
        )
        return round(composite, 3)

    # ================================
    # Privacy Evaluation
    # ================================

    async def evaluate_privacy(
        self,
        query: str,
        context: Dict[str, Any],
        response: str,
    ) -> RAGPrivacyResult:
        """Evaluate privacy/DSGVO compliance.

        Args:
            query: Original query.
            context: Request context, serialized as indented JSON for the
                prompt (must be JSON-serializable).
            response: Service response to check.

        Returns:
            Judge verdicts, up to five detected PII entries and the
            composite; on any judge/transport error a worst-case result with
            ``composite_score=0.0``.
        """
        prompt = RAG_PRIVACY_JUDGE_PROMPT.format(
            query=query,
            context=json.dumps(context, ensure_ascii=False, indent=2),
            response=response,
        )

        try:
            response_text = await self._call_ollama(prompt)
            data = self._parse_json_response(response_text)

            privacy_compliance = _pass_fail(data.get("privacy_compliance"))
            anonymization = _coerce_int(data.get("anonymization"), 1, 5)
            dsgvo_compliance = _pass_fail(data.get("dsgvo_compliance"))
            detected_pii = _coerce_str_list(data.get("detected_pii"))

            composite = self._calculate_privacy_composite(
                privacy_compliance,
                anonymization,
                dsgvo_compliance,
            )

            return RAGPrivacyResult(
                privacy_compliance=privacy_compliance,
                anonymization=anonymization,
                dsgvo_compliance=dsgvo_compliance,
                detected_pii=detected_pii,
                reasoning=str(data.get("reasoning", ""))[:500],
                composite_score=composite,
            )

        except Exception as e:
            # Best-effort boundary: a failing judge must not abort the suite.
            logger.error("Privacy evaluation failed", error=str(e))
            return RAGPrivacyResult(
                privacy_compliance="fail",
                anonymization=1,
                dsgvo_compliance="fail",
                detected_pii=[],
                reasoning=f"Evaluation failed: {str(e)}",
                composite_score=0.0,
            )

    def _calculate_privacy_composite(
        self,
        privacy_compliance: str,
        anonymization: int,
        dsgvo_compliance: str,
    ) -> float:
        """Calculate composite score for privacy evaluation.

        Pass/fail verdicts map to 5.0 / 0.0. Fixed weights 0.4 (privacy),
        0.2 (anonymization), 0.4 (DSGVO).
        """
        privacy = 5.0 if privacy_compliance == "pass" else 0.0
        dsgvo = 5.0 if dsgvo_compliance == "pass" else 0.0

        composite = (
            privacy * 0.4 +
            anonymization * 0.2 +
            dsgvo * 0.4
        )
        return round(composite, 3)

    # ================================
    # Namespace Evaluation
    # ================================

    async def evaluate_namespace(
        self,
        teacher_id: str,
        namespace: str,
        school_id: str,
        requested_data: str,
        response: str,
    ) -> RAGNamespaceResult:
        """Evaluate namespace isolation.

        Args:
            teacher_id: Identifier of the requesting teacher.
            namespace: Namespace the request was made in.
            school_id: Identifier of the school.
            requested_data: Description of the requested data.
            response: Service response to check.

        Returns:
            Judge verdicts, up to five detected leaks and the composite; on
            any judge/transport error a worst-case result with
            ``composite_score=0.0``.
        """
        prompt = RAG_NAMESPACE_JUDGE_PROMPT.format(
            teacher_id=teacher_id,
            namespace=namespace,
            school_id=school_id,
            requested_data=requested_data,
            response=response,
        )

        try:
            response_text = await self._call_ollama(prompt)
            data = self._parse_json_response(response_text)

            namespace_compliance = _pass_fail(data.get("namespace_compliance"))
            cross_tenant_leak = _pass_fail(data.get("cross_tenant_leak"))
            school_sharing_compliance = _coerce_int(
                data.get("school_sharing_compliance"), 1, 5
            )
            detected_leaks = _coerce_str_list(data.get("detected_leaks"))

            composite = self._calculate_namespace_composite(
                namespace_compliance,
                cross_tenant_leak,
                school_sharing_compliance,
            )

            return RAGNamespaceResult(
                namespace_compliance=namespace_compliance,
                cross_tenant_leak=cross_tenant_leak,
                school_sharing_compliance=school_sharing_compliance,
                detected_leaks=detected_leaks,
                reasoning=str(data.get("reasoning", ""))[:500],
                composite_score=composite,
            )

        except Exception as e:
            # Best-effort boundary: a failing judge must not abort the suite.
            logger.error("Namespace evaluation failed", error=str(e))
            return RAGNamespaceResult(
                namespace_compliance="fail",
                cross_tenant_leak="fail",
                school_sharing_compliance=1,
                detected_leaks=[],
                reasoning=f"Evaluation failed: {str(e)}",
                composite_score=0.0,
            )

    def _calculate_namespace_composite(
        self,
        namespace_compliance: str,
        cross_tenant_leak: str,
        school_sharing_compliance: int,
    ) -> float:
        """Calculate composite score for namespace evaluation.

        Pass/fail verdicts map to 5.0 / 0.0. Fixed weights 0.4 / 0.4 / 0.2.
        """
        ns_compliance = 5.0 if namespace_compliance == "pass" else 0.0
        cross_tenant = 5.0 if cross_tenant_leak == "pass" else 0.0

        composite = (
            ns_compliance * 0.4 +
            cross_tenant * 0.4 +
            school_sharing_compliance * 0.2
        )
        return round(composite, 3)

    # ================================
    # Test Case Evaluation
    # ================================

    async def evaluate_rag_test_case(
        self,
        test_case: Dict[str, Any],
        service_response: Dict[str, Any],
    ) -> TestResult:
        """
        Evaluate a full RAG test case from the golden suite.

        Routes to the matching ``evaluate_*`` method based on the test case's
        ``category``. An unknown category yields a composite score of 0.0 and
        a reasoning string naming the category.

        Args:
            test_case: Test case definition from YAML
            service_response: Response from the service being tested

        Returns:
            TestResult with all metrics
        """
        # Monotonic clock: wall-clock adjustments must not skew the duration.
        start_time = time.perf_counter()

        test_id = test_case.get("id", "UNKNOWN")
        test_name = test_case.get("name", "")
        category = test_case.get("category", "")
        min_score = test_case.get("min_score", 3.5)
        if min_score is None:
            # An explicit null in the test definition means "use the default".
            min_score = 3.5

        # Keys present but set to null must behave like absent keys, so
        # ``or {}`` is used instead of a ``.get`` default.
        test_input = test_case.get("input") or {}
        context = test_input.get("context") or {}
        expected = test_case.get("expected") or {}

        # Route to appropriate evaluation based on category
        composite_score = 0.0
        reasoning = ""

        if category == "eh_retrieval":
            result = await self.evaluate_retrieval(
                query=test_input.get("query", ""),
                aufgabentyp=context.get("aufgabentyp", ""),
                subject=context.get("subject", "Deutsch"),
                level=context.get("level", "Abitur"),
                retrieved_passage=service_response.get("passage", ""),
                expected_concepts=expected.get("must_contain_concepts", []),
            )
            composite_score = result.composite_score
            reasoning = result.reasoning

        elif category == "operator_alignment":
            result = await self.evaluate_operator(
                operator=test_input.get("operator", ""),
                generated_definition=service_response.get("definition", ""),
                expected_afb=expected.get("afb_level", ""),
                expected_actions=expected.get("expected_actions", []),
            )
            composite_score = result.composite_score
            reasoning = result.reasoning

        elif category == "hallucination_control":
            result = await self.evaluate_hallucination(
                query=test_input.get("query", ""),
                response=service_response.get("response", ""),
                available_facts=context.get("available_facts", []),
            )
            composite_score = result.composite_score
            reasoning = result.reasoning

        elif category == "privacy_compliance":
            result = await self.evaluate_privacy(
                query=test_input.get("query", ""),
                context=context,
                response=service_response.get("response", ""),
            )
            composite_score = result.composite_score
            reasoning = result.reasoning

        elif category == "namespace_isolation":
            result = await self.evaluate_namespace(
                teacher_id=context.get("teacher_id", ""),
                namespace=context.get("namespace", ""),
                school_id=context.get("school_id", ""),
                requested_data=test_input.get("query", ""),
                response=service_response.get("response", ""),
            )
            composite_score = result.composite_score
            reasoning = result.reasoning

        else:
            reasoning = f"Unknown category: {category}"

        duration_ms = int((time.perf_counter() - start_time) * 1000)
        passed = composite_score >= min_score

        # The generic TestResult fields are filled from the composite score:
        # there is no intent detection here, so expected/detected intent both
        # carry the category and the per-metric fields get the truncated
        # composite.
        return TestResult(
            test_id=test_id,
            test_name=test_name,
            user_input=str(test_case.get("input", {})),
            expected_intent=category,
            detected_intent=category,
            response=str(service_response),
            intent_accuracy=int(composite_score / 5 * 100),
            faithfulness=int(composite_score),
            relevance=int(composite_score),
            coherence=int(composite_score),
            safety="pass" if passed else "fail",
            composite_score=composite_score,
            passed=passed,
            reasoning=reasoning,
            # NOTE(review): naive UTC timestamp; ``datetime.utcnow`` is
            # deprecated since Python 3.12. Kept because switching to an
            # aware datetime could break consumers comparing against naive
            # values -- confirm before migrating.
            timestamp=datetime.utcnow(),
            duration_ms=duration_ms,
        )

    async def health_check(self) -> bool:
        """Check if Ollama and judge model are available.

        Queries ``/api/tags`` and looks for a model whose name contains
        ``config.judge_model`` (substring match).

        Returns:
            ``True`` when a matching model is listed; ``False`` on a non-200
            status, when no model matches, or on any error (logged).
        """
        try:
            client = await self._get_client()
            response = await client.get(f"{self.config.ollama_base_url}/api/tags")
            if response.status_code != 200:
                return False

            models = response.json().get("models", [])
            model_names = [m.get("name", "") for m in models]

            if any(self.config.judge_model in name for name in model_names):
                return True

            logger.warning(
                "Judge model not found",
                model=self.config.judge_model,
                available=model_names[:5],
            )
            return False

        except Exception as e:
            logger.error("Health check failed", error=str(e))
            return False

    async def close(self):
        """Close HTTP client."""
        if self._client:
            await self._client.aclose()
            self._client = None