Files
breakpilot-lehrer/voice-service/bqas/judge.py
Benjamin Admin 9912997187
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m55s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 18s
refactor: Jitsi/Matrix/Voice von Core übernommen, Camunda/BPMN gelöscht, Kommunikation-Nav
- Voice-Service von Core nach Lehrer verschoben (bp-lehrer-voice-service)
- 4 Jitsi-Services + 2 Synapse-Services in docker-compose.yml aufgenommen
- Camunda komplett gelöscht: workflow pages, workflow-config.ts, bpmn-js deps
- CAMUNDA_URL aus backend-lehrer environment entfernt
- Sidebar: Kategorie "Compliance SDK" + "Katalogverwaltung" entfernt
- Sidebar: Neue Kategorie "Kommunikation" mit Video & Chat, Voice Service, Alerts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-05 17:01:47 +01:00

272 lines
8.6 KiB
Python

"""
LLM Judge - Qwen2.5-32B based evaluation
"""
import json
import time
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Literal, Optional

import httpx
import structlog

from bqas.config import BQASConfig
from bqas.metrics import TestResult
from bqas.prompts import JUDGE_PROMPT
logger = structlog.get_logger(__name__)
@dataclass
class JudgeResult:
    """Scores produced by a single LLM-judge evaluation of one response."""

    intent_accuracy: int  # percentage agreement with the expected intent, 0-100
    faithfulness: int  # factual correctness, Likert 1-5
    relevance: int  # does the response address the question, Likert 1-5
    coherence: int  # logical consistency, Likert 1-5
    safety: Literal["pass", "fail"]  # no PII/DSGVO violations
    reasoning: str  # free-text justification from the judge model
    composite_score: float  # weighted average of the above, 0-5 scale
class LLMJudge:
    """
    LLM-based evaluation of voice service responses.

    Uses the configured judge model (Qwen2.5-32B) via Ollama to evaluate:
    - Intent accuracy (0-100)
    - Faithfulness (factual correctness, 1-5)
    - Relevance (addresses the question, 1-5)
    - Coherence (logical consistency, 1-5)
    - Safety (no PII/DSGVO violations, pass/fail)
    """

    def __init__(self, config: Optional[BQASConfig] = None):
        """
        Args:
            config: BQAS configuration; loaded from the environment when omitted.
        """
        self.config = config or BQASConfig.from_env()
        self._client: Optional[httpx.AsyncClient] = None

    async def _get_client(self) -> httpx.AsyncClient:
        """Lazily create and cache the shared async HTTP client."""
        if self._client is None:
            self._client = httpx.AsyncClient(timeout=self.config.judge_timeout)
        return self._client

    def _failed_result(self, reason: str) -> JudgeResult:
        """Build the worst-case JudgeResult used when evaluation cannot complete."""
        return JudgeResult(
            intent_accuracy=0,
            faithfulness=1,
            relevance=1,
            coherence=1,
            safety="fail",
            reasoning=reason,
            composite_score=0.0,
        )

    async def evaluate(
        self,
        user_input: str,
        detected_intent: str,
        response: str,
        expected_intent: str,
    ) -> JudgeResult:
        """
        Evaluate a voice service response.

        Args:
            user_input: Original user voice command
            detected_intent: Intent detected by the service
            response: Generated response text
            expected_intent: Expected (ground truth) intent

        Returns:
            JudgeResult with all metrics. Never raises: on HTTP or
            unexpected errors a zeroed "fail" result is returned instead.
        """
        prompt = JUDGE_PROMPT.format(
            user_input=user_input,
            detected_intent=detected_intent,
            response=response,
            expected_intent=expected_intent,
        )
        client = await self._get_client()
        try:
            resp = await client.post(
                f"{self.config.ollama_base_url}/api/generate",
                json={
                    "model": self.config.judge_model,
                    "prompt": prompt,
                    "stream": False,
                    "options": {
                        # Low temperature for reproducible judgments
                        "temperature": 0.1,
                        "num_predict": 500,
                    },
                },
            )
            resp.raise_for_status()
            result_text = resp.json().get("response", "")
            # Parse JSON from response, then attach the weighted score
            parsed = self._parse_judge_response(result_text)
            parsed["composite_score"] = self._calculate_composite(parsed)
            return JudgeResult(**parsed)
        except httpx.HTTPError as e:
            logger.error("Judge request failed", error=str(e))
            return self._failed_result(f"Evaluation failed: {str(e)}")
        except Exception as e:
            logger.error("Unexpected error during evaluation", error=str(e))
            return self._failed_result(f"Unexpected error: {str(e)}")

    def _parse_judge_response(self, text: str) -> dict:
        """
        Extract and validate the JSON payload embedded in the judge's reply.

        All numeric values are clamped to their legal ranges. On any parse
        failure (or when no JSON object is present) a worst-case default
        dict is returned instead of raising.
        """
        try:
            # The model may wrap the JSON in prose; take the outermost braces.
            start = text.find("{")
            end = text.rfind("}") + 1
            if start >= 0 and end > start:
                data = json.loads(text[start:end])
                # Validate and clamp values to their documented ranges
                return {
                    "intent_accuracy": max(0, min(100, int(data.get("intent_accuracy", 0)))),
                    "faithfulness": max(1, min(5, int(data.get("faithfulness", 1)))),
                    "relevance": max(1, min(5, int(data.get("relevance", 1)))),
                    "coherence": max(1, min(5, int(data.get("coherence", 1)))),
                    "safety": "pass" if data.get("safety", "fail") == "pass" else "fail",
                    "reasoning": str(data.get("reasoning", ""))[:500],
                }
            # No JSON object at all — previously fell through silently.
            logger.warning("No JSON object in judge response", text=text[:200])
        except (json.JSONDecodeError, ValueError, TypeError) as e:
            logger.warning("Failed to parse judge response", error=str(e), text=text[:200])
        # Default worst-case values on parse failure
        return {
            "intent_accuracy": 0,
            "faithfulness": 1,
            "relevance": 1,
            "coherence": 1,
            "safety": "fail",
            "reasoning": "Parse error",
        }

    def _calculate_composite(self, result: dict) -> float:
        """
        Calculate the weighted composite score (0-5 scale).

        Weights come from the config; intent accuracy is normalized from
        0-100 to 0-5 and safety maps to 5 (pass) or 0 (fail).
        """
        c = self.config
        # Normalize intent accuracy to 0-5 scale
        intent_score = (result["intent_accuracy"] / 100) * 5
        # Safety score: 5 if pass, 0 if fail
        safety_score = 5.0 if result["safety"] == "pass" else 0.0
        composite = (
            intent_score * c.intent_accuracy_weight +
            result["faithfulness"] * c.faithfulness_weight +
            result["relevance"] * c.relevance_weight +
            result["coherence"] * c.coherence_weight +
            safety_score * c.safety_weight
        )
        return round(composite, 3)

    async def evaluate_test_case(
        self,
        test_id: str,
        test_name: str,
        user_input: str,
        expected_intent: str,
        detected_intent: str,
        response: str,
        min_score: float = 3.5,
    ) -> TestResult:
        """
        Evaluate a full test case and return TestResult.

        Args:
            test_id: Unique test identifier
            test_name: Human-readable test name
            user_input: Original voice command
            expected_intent: Ground truth intent
            detected_intent: Detected intent from service
            response: Generated response
            min_score: Minimum composite score to pass

        Returns:
            TestResult with all metrics and pass/fail status
        """
        # perf_counter is monotonic — immune to wall-clock adjustments
        start_time = time.perf_counter()
        judge_result = await self.evaluate(
            user_input=user_input,
            detected_intent=detected_intent,
            response=response,
            expected_intent=expected_intent,
        )
        duration_ms = int((time.perf_counter() - start_time) * 1000)
        passed = judge_result.composite_score >= min_score
        return TestResult(
            test_id=test_id,
            test_name=test_name,
            user_input=user_input,
            expected_intent=expected_intent,
            detected_intent=detected_intent,
            response=response,
            intent_accuracy=judge_result.intent_accuracy,
            faithfulness=judge_result.faithfulness,
            relevance=judge_result.relevance,
            coherence=judge_result.coherence,
            safety=judge_result.safety,
            composite_score=judge_result.composite_score,
            passed=passed,
            reasoning=judge_result.reasoning,
            # utcnow() is deprecated; use an aware UTC timestamp instead
            timestamp=datetime.now(timezone.utc),
            duration_ms=duration_ms,
        )

    async def health_check(self) -> bool:
        """
        Check if Ollama is reachable and the judge model is available.

        Returns:
            True when /api/tags responds 200 and lists a model whose name
            contains the configured judge model; False otherwise (errors
            are logged, never raised).
        """
        try:
            client = await self._get_client()
            response = await client.get(f"{self.config.ollama_base_url}/api/tags")
            if response.status_code != 200:
                return False
            # Check if model is available (substring match covers tags
            # like "qwen2.5:32b-instruct")
            models = response.json().get("models", [])
            model_names = [m.get("name", "") for m in models]
            for name in model_names:
                if self.config.judge_model in name:
                    return True
            logger.warning(
                "Judge model not found",
                model=self.config.judge_model,
                available=model_names[:5],
            )
            return False
        except Exception as e:
            logger.error("Health check failed", error=str(e))
            return False

    async def close(self):
        """Close the HTTP client and drop the cached reference."""
        if self._client:
            await self._client.aclose()
            self._client = None