breakpilot-lehrer/klausur-service/backend/rag_evaluation.py

"""
RAG Evaluation Module
Implements key RAG quality metrics inspired by RAGAS framework:
- Context Precision: Is the retrieved context relevant?
- Context Recall: Did we retrieve all necessary information?
- Faithfulness: Are answers grounded in the context?
- Answer Relevancy: Does the answer address the question?
These metrics help continuously monitor and improve RAG quality.
"""

import os
import re
import json
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Optional, Tuple

import httpx

# Configuration
EVALUATION_ENABLED = os.getenv("RAG_EVALUATION_ENABLED", "true").lower() == "true"
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
EVAL_MODEL = os.getenv("RAG_EVAL_MODEL", "gpt-4o-mini")

# Storage for evaluation results
EVAL_RESULTS_FILE = Path(os.getenv("RAG_EVAL_RESULTS_FILE", "/app/docs/rag_evaluation_results.json"))
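
# Example configuration via environment variables (illustrative values only; the
# defaults above apply when a variable is unset):
#   RAG_EVALUATION_ENABLED=true
#   RAG_EVAL_MODEL=gpt-4o-mini
#   RAG_EVAL_RESULTS_FILE=/app/docs/rag_evaluation_results.json
#   OPENAI_API_KEY=...   # required for the LLM-based faithfulness/relevancy metrics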


class RAGEvaluationError(Exception):
    """Error during RAG evaluation."""
    pass


def _load_eval_results() -> List[Dict]:
    """Load evaluation results from file."""
    if EVAL_RESULTS_FILE.exists():
        try:
            with open(EVAL_RESULTS_FILE, 'r') as f:
                return json.load(f)
        except Exception:
            return []
    return []


def _save_eval_results(results: List[Dict]) -> None:
    """Save evaluation results to file."""
    EVAL_RESULTS_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(EVAL_RESULTS_FILE, 'w') as f:
        json.dump(results[-1000:], f, indent=2)  # Keep last 1000


def calculate_context_precision(
    query: str,
    retrieved_contexts: List[str],
    relevant_contexts: List[str]
) -> float:
    """
    Calculate Context Precision: What fraction of retrieved contexts are relevant?

    Precision = |Relevant ∩ Retrieved| / |Retrieved|

    Args:
        query: The search query
        retrieved_contexts: Contexts returned by RAG
        relevant_contexts: Ground truth relevant contexts

    Returns:
        Precision score between 0 and 1
    """
    if not retrieved_contexts:
        return 0.0

    # Simple text overlap check
    relevant_count = 0
    for ret_ctx in retrieved_contexts:
        for rel_ctx in relevant_contexts:
            # Check if there's significant overlap
            if _text_similarity(ret_ctx, rel_ctx) > 0.5:
                relevant_count += 1
                break

    return relevant_count / len(retrieved_contexts)
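
# Worked example (hypothetical numbers): if 4 contexts are retrieved and 3 of them
# exceed the 0.5 similarity threshold against some ground-truth context, then
# context precision = 3 / 4 = 0.75; retrieving extra irrelevant chunks lowers the score.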


def calculate_context_recall(
    query: str,
    retrieved_contexts: List[str],
    relevant_contexts: List[str]
) -> float:
    """
    Calculate Context Recall: What fraction of relevant contexts were retrieved?

    Recall = |Relevant ∩ Retrieved| / |Relevant|

    Args:
        query: The search query
        retrieved_contexts: Contexts returned by RAG
        relevant_contexts: Ground truth relevant contexts

    Returns:
        Recall score between 0 and 1
    """
    if not relevant_contexts:
        return 1.0  # No relevant contexts to miss

    found_count = 0
    for rel_ctx in relevant_contexts:
        for ret_ctx in retrieved_contexts:
            if _text_similarity(ret_ctx, rel_ctx) > 0.5:
                found_count += 1
                break

    return found_count / len(relevant_contexts)


def _text_similarity(text1: str, text2: str) -> float:
    """
    Simple text similarity using word overlap (Jaccard similarity).
    """
    words1 = set(text1.lower().split())
    words2 = set(text2.lower().split())
    if not words1 or not words2:
        return 0.0
    intersection = len(words1 & words2)
    union = len(words1 | words2)
    return intersection / union if union > 0 else 0.0
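
# Illustration of the similarity measure on made-up strings (doctest-style sketch):
#
#     >>> _text_similarity(
#     ...     "Photosynthese findet in den Chloroplasten statt",
#     ...     "Die Photosynthese läuft in den Chloroplasten ab",
#     ... )
#     0.4444...  # 4 shared words / 9 unique words, below the 0.5 threshold
#
# Word-overlap Jaccard is a deliberately crude proxy: paraphrases with little
# lexical overlap can score below 0.5 even when they are semantically equivalent.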


async def evaluate_faithfulness(
    answer: str,
    contexts: List[str],
) -> Tuple[float, str]:
    """
    Evaluate Faithfulness: Is the answer grounded in the provided contexts?

    Uses LLM to check if claims in the answer are supported by contexts.

    Args:
        answer: The generated answer
        contexts: The retrieved contexts used to generate the answer

    Returns:
        Tuple of (faithfulness_score, explanation)
    """
    if not OPENAI_API_KEY:
        return 0.5, "LLM not configured for faithfulness evaluation"

    context_text = "\n---\n".join(contexts[:5])  # Limit context length

    prompt = f"""Bewerte, ob die folgende Antwort vollständig durch die gegebenen Kontexte gestützt wird.

KONTEXTE:
{context_text}

ANTWORT:
{answer}

Analysiere:
1. Sind alle Aussagen in der Antwort durch die Kontexte belegt?
2. Gibt es Behauptungen ohne Grundlage in den Kontexten?

Antworte im Format:
SCORE: [0.0-1.0]
BEGRÜNDUNG: [Kurze Erklärung]"""

    try:
        async with httpx.AsyncClient() as client:
            response = await client.post(
                "https://api.openai.com/v1/chat/completions",
                headers={
                    "Authorization": f"Bearer {OPENAI_API_KEY}",
                    "Content-Type": "application/json"
                },
                json={
                    "model": EVAL_MODEL,
                    "messages": [{"role": "user", "content": prompt}],
                    "max_tokens": 200,
                    "temperature": 0.0,
                },
                timeout=30.0
            )

            if response.status_code != 200:
                return 0.5, f"API error: {response.status_code}"

            result = response.json()["choices"][0]["message"]["content"]

            # Parse score from response
            score_match = re.search(r'SCORE:\s*([\d.]+)', result)
            score = float(score_match.group(1)) if score_match else 0.5
            reason_match = re.search(r'BEGRÜNDUNG:\s*(.+)', result, re.DOTALL)
            reason = reason_match.group(1).strip() if reason_match else result

            return min(max(score, 0.0), 1.0), reason
    except Exception as e:
        return 0.5, f"Evaluation error: {str(e)}"


async def evaluate_answer_relevancy(
    query: str,
    answer: str,
) -> Tuple[float, str]:
    """
    Evaluate Answer Relevancy: Does the answer address the question?

    Uses LLM to assess if the answer is relevant to the query.

    Args:
        query: The original question
        answer: The generated answer

    Returns:
        Tuple of (relevancy_score, explanation)
    """
    if not OPENAI_API_KEY:
        return 0.5, "LLM not configured for relevancy evaluation"

    prompt = f"""Bewerte, wie relevant die Antwort für die gestellte Frage ist.

FRAGE: {query}

ANTWORT: {answer}

Analysiere:
1. Beantwortet die Antwort die gestellte Frage direkt?
2. Ist die Antwort vollständig oder fehlen wichtige Aspekte?
3. Enthält die Antwort irrelevante Informationen?

Antworte im Format:
SCORE: [0.0-1.0]
BEGRÜNDUNG: [Kurze Erklärung]"""

    try:
        async with httpx.AsyncClient() as client:
            response = await client.post(
                "https://api.openai.com/v1/chat/completions",
                headers={
                    "Authorization": f"Bearer {OPENAI_API_KEY}",
                    "Content-Type": "application/json"
                },
                json={
                    "model": EVAL_MODEL,
                    "messages": [{"role": "user", "content": prompt}],
                    "max_tokens": 200,
                    "temperature": 0.0,
                },
                timeout=30.0
            )

            if response.status_code != 200:
                return 0.5, f"API error: {response.status_code}"

            result = response.json()["choices"][0]["message"]["content"]

            score_match = re.search(r'SCORE:\s*([\d.]+)', result)
            score = float(score_match.group(1)) if score_match else 0.5
            reason_match = re.search(r'BEGRÜNDUNG:\s*(.+)', result, re.DOTALL)
            reason = reason_match.group(1).strip() if reason_match else result

            return min(max(score, 0.0), 1.0), reason
    except Exception as e:
        return 0.5, f"Evaluation error: {str(e)}"


async def evaluate_rag_response(
    query: str,
    answer: str,
    retrieved_contexts: List[str],
    ground_truth_contexts: Optional[List[str]] = None,
    ground_truth_answer: Optional[str] = None,
) -> Dict:
    """
    Comprehensive RAG evaluation combining all metrics.

    Args:
        query: The original question
        answer: The generated answer
        retrieved_contexts: Contexts retrieved by RAG
        ground_truth_contexts: Optional ground truth relevant contexts
        ground_truth_answer: Optional ground truth answer

    Returns:
        Evaluation results with all metrics
    """
    results = {
        "timestamp": datetime.now().isoformat(),
        "query": query,
        "answer_length": len(answer),
        "contexts_count": len(retrieved_contexts),
        "metrics": {},
    }

    # Context metrics (if ground truth available)
    if ground_truth_contexts:
        results["metrics"]["context_precision"] = calculate_context_precision(
            query, retrieved_contexts, ground_truth_contexts
        )
        results["metrics"]["context_recall"] = calculate_context_recall(
            query, retrieved_contexts, ground_truth_contexts
        )

    # Faithfulness (requires LLM)
    if OPENAI_API_KEY and retrieved_contexts:
        faith_score, faith_reason = await evaluate_faithfulness(answer, retrieved_contexts)
        results["metrics"]["faithfulness"] = faith_score
        results["faithfulness_reason"] = faith_reason

    # Answer relevancy (requires LLM)
    if OPENAI_API_KEY:
        rel_score, rel_reason = await evaluate_answer_relevancy(query, answer)
        results["metrics"]["answer_relevancy"] = rel_score
        results["answer_relevancy_reason"] = rel_reason

    # Calculate overall score
    metric_values = list(results["metrics"].values())
    if metric_values:
        results["overall_score"] = sum(metric_values) / len(metric_values)

    # Store results
    all_results = _load_eval_results()
    all_results.append(results)
    _save_eval_results(all_results)

    return results


def get_evaluation_summary(last_n: int = 100) -> Dict:
    """
    Get summary statistics of recent evaluations.

    Args:
        last_n: Number of recent evaluations to include

    Returns:
        Summary with average scores and trends
    """
    all_results = _load_eval_results()
    recent = all_results[-last_n:] if all_results else []

    if not recent:
        return {
            "total_evaluations": 0,
            "message": "No evaluations yet",
        }

    # Calculate averages
    metrics_sums = {}
    metrics_counts = {}
    for result in recent:
        for metric, value in result.get("metrics", {}).items():
            if metric not in metrics_sums:
                metrics_sums[metric] = 0
                metrics_counts[metric] = 0
            metrics_sums[metric] += value
            metrics_counts[metric] += 1

    averages = {
        metric: metrics_sums[metric] / metrics_counts[metric]
        for metric in metrics_sums
    }

    return {
        "total_evaluations": len(all_results),
        "evaluations_in_summary": len(recent),
        "average_metrics": averages,
        "overall_average": sum(averages.values()) / len(averages) if averages else 0,
    }
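
# Shape of a summary, with hypothetical values:
#
#     {
#         "total_evaluations": 240,
#         "evaluations_in_summary": 100,
#         "average_metrics": {"faithfulness": 0.87, "answer_relevancy": 0.91},
#         "overall_average": 0.89,
#     }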


def get_evaluation_info() -> dict:
    """Get information about evaluation configuration."""
    return {
        "enabled": EVALUATION_ENABLED,
        "llm_configured": bool(OPENAI_API_KEY),
        "eval_model": EVAL_MODEL,
        "results_file": str(EVAL_RESULTS_FILE),
        "metrics": [
            "context_precision",
            "context_recall",
            "faithfulness",
            "answer_relevancy",
        ],
    }
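

# Minimal usage sketch (not part of the service wiring): run the module directly to
# evaluate one made-up query/answer pair and print the rolling summary. Assumes
# OPENAI_API_KEY is set and RAG_EVAL_RESULTS_FILE points to a writable location;
# all sample strings below are placeholders.
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        sample = await evaluate_rag_response(
            query="Was ist Photosynthese?",
            answer="Photosynthese ist die Umwandlung von Lichtenergie in chemische Energie.",
            retrieved_contexts=[
                "Photosynthese bezeichnet die Umwandlung von Lichtenergie in chemische "
                "Energie durch Pflanzen, Algen und manche Bakterien.",
            ],
        )
        print(json.dumps(sample, indent=2, ensure_ascii=False))
        print(json.dumps(get_evaluation_summary(last_n=10), indent=2, ensure_ascii=False))

    asyncio.run(_demo())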