breakpilot-lehrer/voice-service/bqas/rag_judge.py

"""
RAG Judge - Specialized evaluation for RAG/Correction quality
"""
import json
import time
import structlog
import httpx
from dataclasses import dataclass
from typing import Literal, Optional, Dict, List, Any
from datetime import datetime

from bqas.config import BQASConfig
from bqas.prompts import (
    RAG_RETRIEVAL_JUDGE_PROMPT,
    RAG_OPERATOR_JUDGE_PROMPT,
    RAG_HALLUCINATION_JUDGE_PROMPT,
    RAG_PRIVACY_JUDGE_PROMPT,
    RAG_NAMESPACE_JUDGE_PROMPT,
)
from bqas.metrics import TestResult

logger = structlog.get_logger(__name__)


@dataclass
class RAGRetrievalResult:
    """Scores for one EH retrieval evaluation.

    Built by ``RAGJudge.evaluate_retrieval``. On success the numeric scores
    are clamped to the ranges noted on each field; when the evaluation
    raises, the lowest value of every range and a composite of 0.0 are
    stored instead.
    """
    retrieval_precision: int  # 0-100 (clamped by the evaluator)
    faithfulness: int  # 1-5 (clamped by the evaluator)
    relevance: int  # 1-5 (clamped by the evaluator)
    citation_accuracy: int  # 1-5 (clamped by the evaluator)
    reasoning: str  # judge explanation (cut to 500 chars) or "Evaluation failed: ..." text
    composite_score: float  # weighted combination of the scores above, rounded to 3 decimals


@dataclass
class RAGOperatorResult:
    """Scores for one operator alignment evaluation.

    Built by ``RAGJudge.evaluate_operator``. On success the numeric scores
    are clamped to the ranges noted on each field; when the evaluation
    raises, the lowest value of every range, an empty ``detected_afb`` and a
    composite of 0.0 are stored instead.
    """
    operator_alignment: int  # 0-100 (clamped by the evaluator)
    faithfulness: int  # 1-5 (clamped by the evaluator)
    completeness: int  # 1-5 (clamped by the evaluator)
    detected_afb: str  # I, II, III as reported by the judge; "" when missing
    reasoning: str  # judge explanation (cut to 500 chars) or "Evaluation failed: ..." text
    composite_score: float  # weighted combination of the scores above, rounded to 3 decimals


@dataclass
class RAGHallucinationResult:
    """Scores for one hallucination control evaluation.

    Built by ``RAGJudge.evaluate_hallucination``. Any ``invention_detection``
    value other than "pass" is stored as "fail"; when the evaluation raises,
    the worst value of every field and a composite of 0.0 are stored.
    """
    grounding_score: int  # 0-100 (clamped by the evaluator)
    invention_detection: Literal["pass", "fail"]  # "fail" unless the judge said exactly "pass"
    source_attribution: int  # 1-5 (clamped by the evaluator)
    hallucinated_claims: List[str]  # at most 5 entries are kept by the evaluator
    reasoning: str  # judge explanation (cut to 500 chars) or "Evaluation failed: ..." text
    composite_score: float  # weighted combination of the scores above, rounded to 3 decimals


@dataclass
class RAGPrivacyResult:
    """Scores for one privacy/DSGVO compliance evaluation.

    Built by ``RAGJudge.evaluate_privacy``. Both verdict fields are stored as
    "fail" unless the judge answered exactly "pass"; when the evaluation
    raises, the worst value of every field and a composite of 0.0 are stored.
    """
    privacy_compliance: Literal["pass", "fail"]  # "fail" unless the judge said exactly "pass"
    anonymization: int  # 1-5 (clamped by the evaluator)
    dsgvo_compliance: Literal["pass", "fail"]  # "fail" unless the judge said exactly "pass"
    detected_pii: List[str]  # at most 5 entries are kept by the evaluator
    reasoning: str  # judge explanation (cut to 500 chars) or "Evaluation failed: ..." text
    composite_score: float  # weighted combination of the scores above, rounded to 3 decimals


@dataclass
class RAGNamespaceResult:
    """Scores for one namespace isolation evaluation.

    Built by ``RAGJudge.evaluate_namespace``. Both verdict fields are stored
    as "fail" unless the judge answered exactly "pass"; when the evaluation
    raises, the worst value of every field and a composite of 0.0 are stored.
    """
    namespace_compliance: Literal["pass", "fail"]  # "fail" unless the judge said exactly "pass"
    cross_tenant_leak: Literal["pass", "fail"]  # "pass" scores full points in the composite, "fail" scores zero
    school_sharing_compliance: int  # 1-5 (clamped by the evaluator)
    detected_leaks: List[str]  # at most 5 entries are kept by the evaluator
    reasoning: str  # judge explanation (cut to 500 chars) or "Evaluation failed: ..." text
    composite_score: float  # weighted combination of the scores above, rounded to 3 decimals


class RAGJudge:
    """
    Specialized judge for RAG/Correction quality evaluation.

    Evaluates:
    - EH Retrieval quality
    - Operator alignment
    - Hallucination control
    - Privacy/DSGVO compliance
    - Namespace isolation
    """

    def __init__(self, config: Optional[BQASConfig] = None):
        self.config = config or BQASConfig.from_env()
        self._client: Optional[httpx.AsyncClient] = None

    async def _get_client(self) -> httpx.AsyncClient:
        """Get or create HTTP client."""
        if self._client is None:
            self._client = httpx.AsyncClient(timeout=self.config.judge_timeout)
        return self._client

    async def _call_ollama(self, prompt: str) -> str:
        """Call Ollama API with prompt."""
        client = await self._get_client()

        resp = await client.post(
            f"{self.config.ollama_base_url}/api/generate",
            json={
                "model": self.config.judge_model,
                "prompt": prompt,
                "stream": False,
                "options": {
                    "temperature": 0.1,
                    "num_predict": 800,
                },
            },
        )
        resp.raise_for_status()
        return resp.json().get("response", "")

    def _parse_json_response(self, text: str) -> dict:
        """Parse JSON from response text."""
        try:
            start = text.find("{")
            end = text.rfind("}") + 1
            if start >= 0 and end > start:
                json_str = text[start:end]
                return json.loads(json_str)
        except (json.JSONDecodeError, ValueError) as e:
            logger.warning("Failed to parse JSON response", error=str(e), text=text[:200])
        return {}

    # ================================
    # Retrieval Evaluation
    # ================================

    async def evaluate_retrieval(
        self,
        query: str,
        aufgabentyp: str,
        subject: str,
        level: str,
        retrieved_passage: str,
        expected_concepts: List[str],
    ) -> RAGRetrievalResult:
        """Evaluate EH retrieval quality."""
        prompt = RAG_RETRIEVAL_JUDGE_PROMPT.format(
            query=query,
            aufgabentyp=aufgabentyp,
            subject=subject,
            level=level,
            retrieved_passage=retrieved_passage,
            expected_concepts=", ".join(expected_concepts),
        )

        try:
            response_text = await self._call_ollama(prompt)
            data = self._parse_json_response(response_text)

            retrieval_precision = max(0, min(100, int(data.get("retrieval_precision", 0))))
            faithfulness = max(1, min(5, int(data.get("faithfulness", 1))))
            relevance = max(1, min(5, int(data.get("relevance", 1))))
            citation_accuracy = max(1, min(5, int(data.get("citation_accuracy", 1))))

            composite = self._calculate_retrieval_composite(
                retrieval_precision, faithfulness, relevance, citation_accuracy
            )

            return RAGRetrievalResult(
                retrieval_precision=retrieval_precision,
                faithfulness=faithfulness,
                relevance=relevance,
                citation_accuracy=citation_accuracy,
                reasoning=str(data.get("reasoning", ""))[:500],
                composite_score=composite,
            )

        except Exception as e:
            logger.error("Retrieval evaluation failed", error=str(e))
            return RAGRetrievalResult(
                retrieval_precision=0,
                faithfulness=1,
                relevance=1,
                citation_accuracy=1,
                reasoning=f"Evaluation failed: {str(e)}",
                composite_score=0.0,
            )

    def _calculate_retrieval_composite(
        self,
        retrieval_precision: int,
        faithfulness: int,
        relevance: int,
        citation_accuracy: int,
    ) -> float:
        """Calculate composite score for retrieval evaluation."""
        c = self.config
        retrieval_score = (retrieval_precision / 100) * 5

        composite = (
            retrieval_score * c.rag_retrieval_precision_weight +
            faithfulness * c.rag_faithfulness_weight +
            relevance * 0.3 +  # Higher weight for relevance in retrieval
            citation_accuracy * c.rag_citation_accuracy_weight
        )
        return round(composite, 3)

    # ================================
    # Operator Evaluation
    # ================================

    async def evaluate_operator(
        self,
        operator: str,
        generated_definition: str,
        expected_afb: str,
        expected_actions: List[str],
    ) -> RAGOperatorResult:
        """Evaluate operator alignment."""
        prompt = RAG_OPERATOR_JUDGE_PROMPT.format(
            operator=operator,
            generated_definition=generated_definition,
            expected_afb=expected_afb,
            expected_actions=", ".join(expected_actions),
        )

        try:
            response_text = await self._call_ollama(prompt)
            data = self._parse_json_response(response_text)

            operator_alignment = max(0, min(100, int(data.get("operator_alignment", 0))))
            faithfulness = max(1, min(5, int(data.get("faithfulness", 1))))
            completeness = max(1, min(5, int(data.get("completeness", 1))))
            detected_afb = str(data.get("detected_afb", ""))

            composite = self._calculate_operator_composite(
                operator_alignment, faithfulness, completeness
            )

            return RAGOperatorResult(
                operator_alignment=operator_alignment,
                faithfulness=faithfulness,
                completeness=completeness,
                detected_afb=detected_afb,
                reasoning=str(data.get("reasoning", ""))[:500],
                composite_score=composite,
            )

        except Exception as e:
            logger.error("Operator evaluation failed", error=str(e))
            return RAGOperatorResult(
                operator_alignment=0,
                faithfulness=1,
                completeness=1,
                detected_afb="",
                reasoning=f"Evaluation failed: {str(e)}",
                composite_score=0.0,
            )

    def _calculate_operator_composite(
        self,
        operator_alignment: int,
        faithfulness: int,
        completeness: int,
    ) -> float:
        """Calculate composite score for operator evaluation."""
        alignment_score = (operator_alignment / 100) * 5

        composite = (
            alignment_score * 0.5 +
            faithfulness * 0.3 +
            completeness * 0.2
        )
        return round(composite, 3)

    # ================================
    # Hallucination Evaluation
    # ================================

    async def evaluate_hallucination(
        self,
        query: str,
        response: str,
        available_facts: List[str],
    ) -> RAGHallucinationResult:
        """Evaluate for hallucinations."""
        prompt = RAG_HALLUCINATION_JUDGE_PROMPT.format(
            query=query,
            response=response,
            available_facts="\n".join(f"- {f}" for f in available_facts),
        )

        try:
            response_text = await self._call_ollama(prompt)
            data = self._parse_json_response(response_text)

            grounding_score = max(0, min(100, int(data.get("grounding_score", 0))))
            invention_detection = "pass" if data.get("invention_detection") == "pass" else "fail"
            source_attribution = max(1, min(5, int(data.get("source_attribution", 1))))
            hallucinated_claims = data.get("hallucinated_claims", [])

            composite = self._calculate_hallucination_composite(
                grounding_score, invention_detection, source_attribution
            )

            return RAGHallucinationResult(
                grounding_score=grounding_score,
                invention_detection=invention_detection,
                source_attribution=source_attribution,
                hallucinated_claims=hallucinated_claims[:5],
                reasoning=str(data.get("reasoning", ""))[:500],
                composite_score=composite,
            )

        except Exception as e:
            logger.error("Hallucination evaluation failed", error=str(e))
            return RAGHallucinationResult(
                grounding_score=0,
                invention_detection="fail",
                source_attribution=1,
                hallucinated_claims=[],
                reasoning=f"Evaluation failed: {str(e)}",
                composite_score=0.0,
            )

    def _calculate_hallucination_composite(
        self,
        grounding_score: int,
        invention_detection: str,
        source_attribution: int,
    ) -> float:
        """Calculate composite score for hallucination evaluation."""
        grounding = (grounding_score / 100) * 5
        invention = 5.0 if invention_detection == "pass" else 0.0

        composite = (
            grounding * 0.4 +
            invention * 0.4 +
            source_attribution * 0.2
        )
        return round(composite, 3)

    # ================================
    # Privacy Evaluation
    # ================================

    async def evaluate_privacy(
        self,
        query: str,
        context: Dict[str, Any],
        response: str,
    ) -> RAGPrivacyResult:
        """Evaluate privacy/DSGVO compliance."""
        prompt = RAG_PRIVACY_JUDGE_PROMPT.format(
            query=query,
            context=json.dumps(context, ensure_ascii=False, indent=2),
            response=response,
        )

        try:
            response_text = await self._call_ollama(prompt)
            data = self._parse_json_response(response_text)

            privacy_compliance = "pass" if data.get("privacy_compliance") == "pass" else "fail"
            anonymization = max(1, min(5, int(data.get("anonymization", 1))))
            dsgvo_compliance = "pass" if data.get("dsgvo_compliance") == "pass" else "fail"
            detected_pii = data.get("detected_pii", [])

            composite = self._calculate_privacy_composite(
                privacy_compliance, anonymization, dsgvo_compliance
            )

            return RAGPrivacyResult(
                privacy_compliance=privacy_compliance,
                anonymization=anonymization,
                dsgvo_compliance=dsgvo_compliance,
                detected_pii=detected_pii[:5],
                reasoning=str(data.get("reasoning", ""))[:500],
                composite_score=composite,
            )

        except Exception as e:
            logger.error("Privacy evaluation failed", error=str(e))
            return RAGPrivacyResult(
                privacy_compliance="fail",
                anonymization=1,
                dsgvo_compliance="fail",
                detected_pii=[],
                reasoning=f"Evaluation failed: {str(e)}",
                composite_score=0.0,
            )

    def _calculate_privacy_composite(
        self,
        privacy_compliance: str,
        anonymization: int,
        dsgvo_compliance: str,
    ) -> float:
        """Calculate composite score for privacy evaluation."""
        privacy = 5.0 if privacy_compliance == "pass" else 0.0
        dsgvo = 5.0 if dsgvo_compliance == "pass" else 0.0

        composite = (
            privacy * 0.4 +
            anonymization * 0.2 +
            dsgvo * 0.4
        )
        return round(composite, 3)

    # ================================
    # Namespace Evaluation
    # ================================

    async def evaluate_namespace(
        self,
        teacher_id: str,
        namespace: str,
        school_id: str,
        requested_data: str,
        response: str,
    ) -> RAGNamespaceResult:
        """Evaluate namespace isolation."""
        prompt = RAG_NAMESPACE_JUDGE_PROMPT.format(
            teacher_id=teacher_id,
            namespace=namespace,
            school_id=school_id,
            requested_data=requested_data,
            response=response,
        )

        try:
            response_text = await self._call_ollama(prompt)
            data = self._parse_json_response(response_text)

            namespace_compliance = "pass" if data.get("namespace_compliance") == "pass" else "fail"
            cross_tenant_leak = "pass" if data.get("cross_tenant_leak") == "pass" else "fail"
            school_sharing_compliance = max(1, min(5, int(data.get("school_sharing_compliance", 1))))
            detected_leaks = data.get("detected_leaks", [])

            composite = self._calculate_namespace_composite(
                namespace_compliance, cross_tenant_leak, school_sharing_compliance
            )

            return RAGNamespaceResult(
                namespace_compliance=namespace_compliance,
                cross_tenant_leak=cross_tenant_leak,
                school_sharing_compliance=school_sharing_compliance,
                detected_leaks=detected_leaks[:5],
                reasoning=str(data.get("reasoning", ""))[:500],
                composite_score=composite,
            )

        except Exception as e:
            logger.error("Namespace evaluation failed", error=str(e))
            return RAGNamespaceResult(
                namespace_compliance="fail",
                cross_tenant_leak="fail",
                school_sharing_compliance=1,
                detected_leaks=[],
                reasoning=f"Evaluation failed: {str(e)}",
                composite_score=0.0,
            )

    def _calculate_namespace_composite(
        self,
        namespace_compliance: str,
        cross_tenant_leak: str,
        school_sharing_compliance: int,
    ) -> float:
        """Calculate composite score for namespace evaluation."""
        ns_compliance = 5.0 if namespace_compliance == "pass" else 0.0
        cross_tenant = 5.0 if cross_tenant_leak == "pass" else 0.0

        composite = (
            ns_compliance * 0.4 +
            cross_tenant * 0.4 +
            school_sharing_compliance * 0.2
        )
        return round(composite, 3)

    # ================================
    # Test Case Evaluation
    # ================================

    async def evaluate_rag_test_case(
        self,
        test_case: Dict[str, Any],
        service_response: Dict[str, Any],
    ) -> TestResult:
        """
        Evaluate a full RAG test case from the golden suite.

        Args:
            test_case: Test case definition from YAML
            service_response: Response from the service being tested

        Returns:
            TestResult with all metrics
        """
        start_time = time.time()

        test_id = test_case.get("id", "UNKNOWN")
        test_name = test_case.get("name", "")
        category = test_case.get("category", "")
        min_score = test_case.get("min_score", 3.5)

        # Route to appropriate evaluation based on category
        composite_score = 0.0
        reasoning = ""

        if category == "eh_retrieval":
            result = await self.evaluate_retrieval(
                query=test_case.get("input", {}).get("query", ""),
                aufgabentyp=test_case.get("input", {}).get("context", {}).get("aufgabentyp", ""),
                subject=test_case.get("input", {}).get("context", {}).get("subject", "Deutsch"),
                level=test_case.get("input", {}).get("context", {}).get("level", "Abitur"),
                retrieved_passage=service_response.get("passage", ""),
                expected_concepts=test_case.get("expected", {}).get("must_contain_concepts", []),
            )
            composite_score = result.composite_score
            reasoning = result.reasoning

        elif category == "operator_alignment":
            result = await self.evaluate_operator(
                operator=test_case.get("input", {}).get("operator", ""),
                generated_definition=service_response.get("definition", ""),
                expected_afb=test_case.get("expected", {}).get("afb_level", ""),
                expected_actions=test_case.get("expected", {}).get("expected_actions", []),
            )
            composite_score = result.composite_score
            reasoning = result.reasoning

        elif category == "hallucination_control":
            result = await self.evaluate_hallucination(
                query=test_case.get("input", {}).get("query", ""),
                response=service_response.get("response", ""),
                available_facts=test_case.get("input", {}).get("context", {}).get("available_facts", []),
            )
            composite_score = result.composite_score
            reasoning = result.reasoning

        elif category == "privacy_compliance":
            result = await self.evaluate_privacy(
                query=test_case.get("input", {}).get("query", ""),
                context=test_case.get("input", {}).get("context", {}),
                response=service_response.get("response", ""),
            )
            composite_score = result.composite_score
            reasoning = result.reasoning

        elif category == "namespace_isolation":
            context = test_case.get("input", {}).get("context", {})
            result = await self.evaluate_namespace(
                teacher_id=context.get("teacher_id", ""),
                namespace=context.get("namespace", ""),
                school_id=context.get("school_id", ""),
                requested_data=test_case.get("input", {}).get("query", ""),
                response=service_response.get("response", ""),
            )
            composite_score = result.composite_score
            reasoning = result.reasoning

        else:
            reasoning = f"Unknown category: {category}"

        duration_ms = int((time.time() - start_time) * 1000)
        passed = composite_score >= min_score

        return TestResult(
            test_id=test_id,
            test_name=test_name,
            user_input=str(test_case.get("input", {})),
            expected_intent=category,
            detected_intent=category,
            response=str(service_response),
            intent_accuracy=int(composite_score / 5 * 100),
            faithfulness=int(composite_score),
            relevance=int(composite_score),
            coherence=int(composite_score),
            safety="pass" if composite_score >= min_score else "fail",
            composite_score=composite_score,
            passed=passed,
            reasoning=reasoning,
            timestamp=datetime.utcnow(),
            duration_ms=duration_ms,
        )

    async def health_check(self) -> bool:
        """Check if Ollama and judge model are available."""
        try:
            client = await self._get_client()
            response = await client.get(f"{self.config.ollama_base_url}/api/tags")
            if response.status_code != 200:
                return False

            models = response.json().get("models", [])
            model_names = [m.get("name", "") for m in models]

            for name in model_names:
                if self.config.judge_model in name:
                    return True

            logger.warning(
                "Judge model not found",
                model=self.config.judge_model,
                available=model_names[:5],
            )
            return False

        except Exception as e:
            logger.error("Health check failed", error=str(e))
            return False

    async def close(self):
        """Close HTTP client."""
        if self._client:
            await self._client.aclose()
            self._client = None