Files
breakpilot-lehrer/voice-service/bqas/rag_judge.py
Benjamin Admin 451365a312 [split-required] Split remaining 500-680 LOC files (final batch)
website (17 pages + 3 components):
- multiplayer/wizard, middleware/wizard+test-wizard, communication
- builds/wizard, staff-search, voice, sbom/wizard
- foerderantrag, mail/tasks, tools/communication, sbom
- compliance/evidence, uni-crawler, brandbook (already done)
- CollectionsTab, IngestionTab, RiskHeatmap

backend-lehrer (5 files):
- letters_api (641 → 2), certificates_api (636 → 2)
- alerts_agent/db/models (636 → 3)
- llm_gateway/communication_service (614 → 2)
- game/database already done in prior batch

klausur-service (2 files):
- hybrid_vocab_extractor (664 → 2)
- klausur-service/frontend: api.ts (620 → 3), EHUploadWizard (591 → 2)

voice-service (3 files):
- bqas/rag_judge (618 → 3), runner (529 → 2)
- enhanced_task_orchestrator (519 → 2)

studio-v2 (6 files):
- korrektur/[klausurId] (578 → 4), fairness (569 → 2)
- AlertsWizard (552 → 2), OnboardingWizard (513 → 2)
- korrektur/api.ts (506 → 3), geo-lernwelt (501 → 2)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-25 08:56:45 +02:00

179 lines
5.7 KiB
Python

"""
RAG Judge - Specialized evaluation for RAG/Correction quality
Split into:
- rag_judge_types.py: Data classes for evaluation results
- rag_judge_evaluators.py: Individual evaluation methods
- rag_judge.py (this file): RAGJudge class (orchestrator + barrel re-exports)
"""
import json
import structlog
import httpx
from typing import Optional, Dict, List, Any
from bqas.config import BQASConfig
from bqas.metrics import TestResult
# Re-export types for backward compatibility
from bqas.rag_judge_types import (
RAGRetrievalResult,
RAGOperatorResult,
RAGHallucinationResult,
RAGPrivacyResult,
RAGNamespaceResult,
)
from bqas.rag_judge_evaluators import (
evaluate_retrieval as _evaluate_retrieval,
evaluate_operator as _evaluate_operator,
evaluate_hallucination as _evaluate_hallucination,
evaluate_privacy as _evaluate_privacy,
evaluate_namespace as _evaluate_namespace,
evaluate_rag_test_case as _evaluate_rag_test_case,
)
# Public surface of this barrel module: the orchestrator class plus the
# result dataclasses re-exported from bqas.rag_judge_types so existing
# `from bqas.rag_judge import ...` imports keep working after the split.
__all__ = [
    "RAGJudge",
    "RAGRetrievalResult",
    "RAGOperatorResult",
    "RAGHallucinationResult",
    "RAGPrivacyResult",
    "RAGNamespaceResult",
]

# Module-level structured logger (structlog), named after this module.
logger = structlog.get_logger(__name__)
class RAGJudge:
    """
    Specialized judge for RAG/Correction quality evaluation.

    Thin orchestrator after the file split: it owns the HTTP client and the
    Ollama call/parse plumbing, and delegates each evaluation to the
    functions in bqas.rag_judge_evaluators.

    Evaluates:
    - EH Retrieval quality
    - Operator alignment
    - Hallucination control
    - Privacy/DSGVO compliance
    - Namespace isolation
    """

    def __init__(self, config: Optional[BQASConfig] = None):
        """Create a judge; falls back to env-derived config when none is given."""
        self.config = config or BQASConfig.from_env()
        # Created lazily on first request so construction stays cheap and sync.
        self._client: Optional[httpx.AsyncClient] = None

    async def __aenter__(self) -> "RAGJudge":
        """Support `async with RAGJudge() as judge:` usage."""
        return self

    async def __aexit__(self, exc_type, exc, tb) -> None:
        """Close the underlying HTTP client on context exit."""
        await self.close()

    async def _get_client(self) -> httpx.AsyncClient:
        """Get or create the shared async HTTP client."""
        if self._client is None:
            self._client = httpx.AsyncClient(timeout=self.config.judge_timeout)
        return self._client

    async def _call_ollama(self, prompt: str) -> str:
        """Call the Ollama generate API with *prompt* and return the raw text.

        Raises httpx.HTTPStatusError on non-2xx responses.
        """
        client = await self._get_client()
        resp = await client.post(
            f"{self.config.ollama_base_url}/api/generate",
            json={
                "model": self.config.judge_model,
                "prompt": prompt,
                "stream": False,
                "options": {
                    # Low temperature: judging should be near-deterministic.
                    "temperature": 0.1,
                    "num_predict": 800,
                },
            },
        )
        resp.raise_for_status()
        return resp.json().get("response", "")

    def _parse_json_response(self, text: str) -> dict:
        """Extract and parse the first {...} span from *text*.

        Returns {} when no JSON object is found or parsing fails; parse
        failures are logged with a truncated excerpt for debugging.
        """
        try:
            # Tolerate chatty model output: take the outermost brace span.
            start = text.find("{")
            end = text.rfind("}") + 1
            if start >= 0 and end > start:
                json_str = text[start:end]
                return json.loads(json_str)
        except (json.JSONDecodeError, ValueError) as e:
            logger.warning("Failed to parse JSON response", error=str(e), text=text[:200])
        return {}

    async def evaluate_retrieval(
        self, query: str, aufgabentyp: str, subject: str, level: str,
        retrieved_passage: str, expected_concepts: List[str],
    ) -> RAGRetrievalResult:
        """Judge EH retrieval quality; delegates to rag_judge_evaluators."""
        return await _evaluate_retrieval(
            self._call_ollama, self._parse_json_response, self.config,
            query, aufgabentyp, subject, level, retrieved_passage, expected_concepts,
        )

    async def evaluate_operator(
        self, operator: str, generated_definition: str,
        expected_afb: str, expected_actions: List[str],
    ) -> RAGOperatorResult:
        """Judge operator/AFB alignment; delegates to rag_judge_evaluators."""
        return await _evaluate_operator(
            self._call_ollama, self._parse_json_response,
            operator, generated_definition, expected_afb, expected_actions,
        )

    async def evaluate_hallucination(
        self, query: str, response: str, available_facts: List[str],
    ) -> RAGHallucinationResult:
        """Judge hallucination control; delegates to rag_judge_evaluators."""
        return await _evaluate_hallucination(
            self._call_ollama, self._parse_json_response,
            query, response, available_facts,
        )

    async def evaluate_privacy(
        self, query: str, context: Dict[str, Any], response: str,
    ) -> RAGPrivacyResult:
        """Judge privacy/DSGVO compliance; delegates to rag_judge_evaluators."""
        return await _evaluate_privacy(
            self._call_ollama, self._parse_json_response,
            query, context, response,
        )

    async def evaluate_namespace(
        self, teacher_id: str, namespace: str, school_id: str,
        requested_data: str, response: str,
    ) -> RAGNamespaceResult:
        """Judge namespace isolation; delegates to rag_judge_evaluators."""
        return await _evaluate_namespace(
            self._call_ollama, self._parse_json_response,
            teacher_id, namespace, school_id, requested_data, response,
        )

    async def evaluate_rag_test_case(
        self, test_case: Dict[str, Any], service_response: Dict[str, Any],
    ) -> TestResult:
        """Run the combined RAG test-case evaluation; delegates to evaluators."""
        return await _evaluate_rag_test_case(self, test_case, service_response)

    async def health_check(self) -> bool:
        """Check if Ollama is reachable and the judge model is installed.

        Returns False (never raises) on any transport or parsing failure.
        """
        try:
            client = await self._get_client()
            response = await client.get(f"{self.config.ollama_base_url}/api/tags")
            if response.status_code != 200:
                return False
            models = response.json().get("models", [])
            model_names = [m.get("name", "") for m in models]
            # Substring match: Ollama tag names carry size/quant suffixes
            # (e.g. "llama3:8b-instruct"), so an exact compare would miss.
            if any(self.config.judge_model in name for name in model_names):
                return True
            logger.warning(
                "Judge model not found",
                model=self.config.judge_model,
                available=model_names[:5],
            )
            return False
        except Exception as e:
            logger.error("Health check failed", error=str(e))
            return False

    async def close(self):
        """Close the HTTP client (idempotent; safe if never opened)."""
        if self._client:
            await self._client.aclose()
            self._client = None