Initial commit: breakpilot-lehrer - Lehrer KI Platform
Services: Admin-Lehrer, Backend-Lehrer, Studio v2, Website, Klausur-Service, School-Service, Voice-Service, Geo-Service, BreakPilot Drive, Agent-Core Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
393
klausur-service/backend/rag_evaluation.py
Normal file
393
klausur-service/backend/rag_evaluation.py
Normal file
@@ -0,0 +1,393 @@
|
||||
"""
|
||||
RAG Evaluation Module
|
||||
|
||||
Implements key RAG quality metrics inspired by RAGAS framework:
|
||||
- Context Precision: Is the retrieved context relevant?
|
||||
- Context Recall: Did we retrieve all necessary information?
|
||||
- Faithfulness: Are answers grounded in the context?
|
||||
- Answer Relevancy: Does the answer address the question?
|
||||
|
||||
These metrics help continuously monitor and improve RAG quality.
|
||||
"""
|
||||
|
||||
import os
|
||||
from typing import List, Dict, Optional, Tuple
|
||||
from datetime import datetime
|
||||
import json
|
||||
from pathlib import Path
|
||||
import httpx
|
||||
|
||||
# Configuration (all values overridable via environment variables)
# Master switch for evaluation; any value other than "true" (case-insensitive) disables it.
EVALUATION_ENABLED = os.getenv("RAG_EVALUATION_ENABLED", "true").lower() == "true"
# API key for the evaluation LLM; empty string means LLM-based metrics are skipped.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
# Model used for LLM-judged metrics (faithfulness, answer relevancy).
EVAL_MODEL = os.getenv("RAG_EVAL_MODEL", "gpt-4o-mini")

# Storage for evaluation results (JSON list of result dicts, appended per evaluation).
EVAL_RESULTS_FILE = Path(os.getenv("RAG_EVAL_RESULTS_FILE", "/app/docs/rag_evaluation_results.json"))
|
||||
|
||||
class RAGEvaluationError(Exception):
    """Raised when a RAG evaluation step fails."""
|
||||
|
||||
def _load_eval_results() -> List[Dict]:
    """Load previously stored evaluation results.

    Returns:
        The list of result dicts read from EVAL_RESULTS_FILE, or an empty
        list when the file is missing, unreadable, or contains invalid JSON.
    """
    if EVAL_RESULTS_FILE.exists():
        try:
            with open(EVAL_RESULTS_FILE, 'r') as f:
                return json.load(f)
        # Best-effort read: an unreadable file (OSError) or corrupt JSON
        # (json.JSONDecodeError / UnicodeDecodeError, both ValueError
        # subclasses) yields an empty history instead of crashing.
        # Narrowed from a bare `except Exception` so genuine bugs surface.
        except (OSError, ValueError):
            return []
    return []
|
||||
|
||||
def _save_eval_results(results: List[Dict]) -> None:
    """Persist evaluation results to EVAL_RESULTS_FILE as pretty-printed JSON.

    Only the most recent 1000 entries are written, bounding file growth.
    Parent directories are created on demand.
    """
    EVAL_RESULTS_FILE.parent.mkdir(parents=True, exist_ok=True)
    trimmed = results[-1000:]
    with open(EVAL_RESULTS_FILE, 'w') as f:
        json.dump(trimmed, f, indent=2)
|
||||
|
||||
def calculate_context_precision(
    query: str,
    retrieved_contexts: List[str],
    relevant_contexts: List[str]
) -> float:
    """
    Calculate Context Precision: What fraction of retrieved contexts are relevant?

    Precision = |Relevant ∩ Retrieved| / |Retrieved|

    A retrieved context counts as relevant when its word-overlap similarity
    with at least one ground-truth context exceeds 0.5.

    Args:
        query: The search query (currently unused by the overlap heuristic).
        retrieved_contexts: Contexts returned by RAG.
        relevant_contexts: Ground truth relevant contexts.

    Returns:
        Precision score between 0 and 1 (0.0 when nothing was retrieved).
    """
    if not retrieved_contexts:
        return 0.0

    hits = sum(
        1
        for candidate in retrieved_contexts
        if any(_text_similarity(candidate, truth) > 0.5 for truth in relevant_contexts)
    )
    return hits / len(retrieved_contexts)
|
||||
|
||||
def calculate_context_recall(
    query: str,
    retrieved_contexts: List[str],
    relevant_contexts: List[str]
) -> float:
    """
    Calculate Context Recall: What fraction of relevant contexts were retrieved?

    Recall = |Relevant ∩ Retrieved| / |Relevant|

    A ground-truth context counts as found when its word-overlap similarity
    with at least one retrieved context exceeds 0.5.

    Args:
        query: The search query (currently unused by the overlap heuristic).
        retrieved_contexts: Contexts returned by RAG.
        relevant_contexts: Ground truth relevant contexts.

    Returns:
        Recall score between 0 and 1 (1.0 when there is nothing to miss).
    """
    if not relevant_contexts:
        return 1.0  # No relevant contexts to miss

    found = sum(
        1
        for truth in relevant_contexts
        if any(_text_similarity(candidate, truth) > 0.5 for candidate in retrieved_contexts)
    )
    return found / len(relevant_contexts)
|
||||
|
||||
def _text_similarity(text1: str, text2: str) -> float:
|
||||
"""
|
||||
Simple text similarity using word overlap (Jaccard similarity).
|
||||
"""
|
||||
words1 = set(text1.lower().split())
|
||||
words2 = set(text2.lower().split())
|
||||
|
||||
if not words1 or not words2:
|
||||
return 0.0
|
||||
|
||||
intersection = len(words1 & words2)
|
||||
union = len(words1 | words2)
|
||||
|
||||
return intersection / union if union > 0 else 0.0
|
||||
|
||||
|
||||
async def evaluate_faithfulness(
    answer: str,
    contexts: List[str],
) -> Tuple[float, str]:
    """
    Evaluate Faithfulness: Is the answer grounded in the provided contexts?

    Asks the configured evaluation LLM whether every claim in the answer is
    supported by the contexts, then parses a SCORE/BEGRÜNDUNG formatted reply.

    Args:
        answer: The generated answer.
        contexts: The retrieved contexts used to generate the answer.

    Returns:
        Tuple of (faithfulness_score, explanation). A neutral 0.5 is
        returned when the LLM is not configured or the API call fails.
    """
    import re

    if not OPENAI_API_KEY:
        return 0.5, "LLM not configured for faithfulness evaluation"

    # Cap at five contexts to keep the prompt size bounded.
    context_text = "\n---\n".join(contexts[:5])

    prompt = f"""Bewerte, ob die folgende Antwort vollständig durch die gegebenen Kontexte gestützt wird.

KONTEXTE:
{context_text}

ANTWORT:
{answer}

Analysiere:
1. Sind alle Aussagen in der Antwort durch die Kontexte belegt?
2. Gibt es Behauptungen ohne Grundlage in den Kontexten?

Antworte im Format:
SCORE: [0.0-1.0]
BEGRÜNDUNG: [Kurze Erklärung]"""

    request_headers = {
        "Authorization": f"Bearer {OPENAI_API_KEY}",
        "Content-Type": "application/json",
    }
    request_body = {
        "model": EVAL_MODEL,
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 200,
        "temperature": 0.0,
    }

    try:
        async with httpx.AsyncClient() as client:
            response = await client.post(
                "https://api.openai.com/v1/chat/completions",
                headers=request_headers,
                json=request_body,
                timeout=30.0,
            )

            if response.status_code != 200:
                return 0.5, f"API error: {response.status_code}"

            result = response.json()["choices"][0]["message"]["content"]

            # Extract the numeric score; fall back to neutral 0.5 if absent.
            score_match = re.search(r'SCORE:\s*([\d.]+)', result)
            if score_match:
                score = float(score_match.group(1))
            else:
                score = 0.5

            # Prefer the explanation after BEGRÜNDUNG:, else the raw reply.
            reason_match = re.search(r'BEGRÜNDUNG:\s*(.+)', result, re.DOTALL)
            if reason_match:
                reason = reason_match.group(1).strip()
            else:
                reason = result

            # Clamp to the documented [0, 1] range.
            return min(max(score, 0.0), 1.0), reason

    except Exception as e:
        return 0.5, f"Evaluation error: {str(e)}"
||||
|
||||
|
||||
async def evaluate_answer_relevancy(
    query: str,
    answer: str,
) -> Tuple[float, str]:
    """
    Evaluate Answer Relevancy: Does the answer address the question?

    Asks the configured evaluation LLM to rate how well the answer responds
    to the query, then parses a SCORE/BEGRÜNDUNG formatted reply.

    Args:
        query: The original question.
        answer: The generated answer.

    Returns:
        Tuple of (relevancy_score, explanation). A neutral 0.5 is returned
        when the LLM is not configured or the API call fails.
    """
    import re

    if not OPENAI_API_KEY:
        return 0.5, "LLM not configured for relevancy evaluation"

    prompt = f"""Bewerte, wie relevant die Antwort für die gestellte Frage ist.

FRAGE: {query}

ANTWORT: {answer}

Analysiere:
1. Beantwortet die Antwort die gestellte Frage direkt?
2. Ist die Antwort vollständig oder fehlen wichtige Aspekte?
3. Enthält die Antwort irrelevante Informationen?

Antworte im Format:
SCORE: [0.0-1.0]
BEGRÜNDUNG: [Kurze Erklärung]"""

    request_headers = {
        "Authorization": f"Bearer {OPENAI_API_KEY}",
        "Content-Type": "application/json",
    }
    request_body = {
        "model": EVAL_MODEL,
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 200,
        "temperature": 0.0,
    }

    try:
        async with httpx.AsyncClient() as client:
            response = await client.post(
                "https://api.openai.com/v1/chat/completions",
                headers=request_headers,
                json=request_body,
                timeout=30.0,
            )

            if response.status_code != 200:
                return 0.5, f"API error: {response.status_code}"

            result = response.json()["choices"][0]["message"]["content"]

            # Extract the numeric score; fall back to neutral 0.5 if absent.
            score_match = re.search(r'SCORE:\s*([\d.]+)', result)
            if score_match:
                score = float(score_match.group(1))
            else:
                score = 0.5

            # Prefer the explanation after BEGRÜNDUNG:, else the raw reply.
            reason_match = re.search(r'BEGRÜNDUNG:\s*(.+)', result, re.DOTALL)
            if reason_match:
                reason = reason_match.group(1).strip()
            else:
                reason = result

            # Clamp to the documented [0, 1] range.
            return min(max(score, 0.0), 1.0), reason

    except Exception as e:
        return 0.5, f"Evaluation error: {str(e)}"
||||
|
||||
|
||||
async def evaluate_rag_response(
    query: str,
    answer: str,
    retrieved_contexts: List[str],
    ground_truth_contexts: Optional[List[str]] = None,
    ground_truth_answer: Optional[str] = None,
) -> Dict:
    """
    Comprehensive RAG evaluation combining all metrics.

    Context precision/recall are computed only when ground-truth contexts are
    supplied; faithfulness and answer relevancy only when the LLM is
    configured. The result is appended to the on-disk evaluation history.

    NOTE(review): EVALUATION_ENABLED is never consulted here — presumably the
    caller gates on it; confirm and either document or enforce the flag.

    Args:
        query: The original question.
        answer: The generated answer.
        retrieved_contexts: Contexts retrieved by RAG.
        ground_truth_contexts: Optional ground truth relevant contexts.
        ground_truth_answer: Optional ground truth answer (currently unused).

    Returns:
        Evaluation results with timestamp, input stats, per-metric scores
        under "metrics", LLM explanations, and "overall_score" (mean of the
        computed metrics, present only when at least one metric was computed).
    """
    results = {
        "timestamp": datetime.now().isoformat(),
        "query": query,
        "answer_length": len(answer),
        "contexts_count": len(retrieved_contexts),
        "metrics": {},
    }

    # Context metrics (if ground truth available)
    if ground_truth_contexts:
        results["metrics"]["context_precision"] = calculate_context_precision(
            query, retrieved_contexts, ground_truth_contexts
        )
        results["metrics"]["context_recall"] = calculate_context_recall(
            query, retrieved_contexts, ground_truth_contexts
        )

    # Faithfulness (requires LLM and at least one context to ground against)
    if OPENAI_API_KEY and retrieved_contexts:
        faith_score, faith_reason = await evaluate_faithfulness(answer, retrieved_contexts)
        results["metrics"]["faithfulness"] = faith_score
        results["faithfulness_reason"] = faith_reason

    # Answer relevancy (requires LLM)
    if OPENAI_API_KEY:
        rel_score, rel_reason = await evaluate_answer_relevancy(query, answer)
        results["metrics"]["answer_relevancy"] = rel_score
        results["answer_relevancy_reason"] = rel_reason

    # Overall score: unweighted mean of whichever metrics were computed.
    metric_values = list(results["metrics"].values())
    if metric_values:
        results["overall_score"] = sum(metric_values) / len(metric_values)

    # Persist best-effort: a storage failure (e.g. read-only results path)
    # must not discard the evaluation we just computed for the caller.
    try:
        all_results = _load_eval_results()
        all_results.append(results)
        _save_eval_results(all_results)
    except Exception:
        pass  # deliberate: persistence is auxiliary to returning the result

    return results
|
||||
|
||||
def get_evaluation_summary(last_n: int = 100) -> Dict:
    """
    Get summary statistics of recent evaluations.

    Args:
        last_n: Number of recent evaluations to include.

    Returns:
        Summary with average scores and trends; a placeholder payload when
        no evaluations have been recorded yet.
    """
    all_results = _load_eval_results()
    recent = all_results[-last_n:] if all_results else []

    if not recent:
        return {
            "total_evaluations": 0,
            "message": "No evaluations yet",
        }

    # Accumulate per-metric running sums and sample counts over the window.
    sums: Dict[str, float] = {}
    counts: Dict[str, int] = {}
    for entry in recent:
        for name, value in entry.get("metrics", {}).items():
            sums[name] = sums.get(name, 0) + value
            counts[name] = counts.get(name, 0) + 1

    averages = {name: sums[name] / counts[name] for name in sums}
    overall = sum(averages.values()) / len(averages) if averages else 0

    return {
        "total_evaluations": len(all_results),
        "evaluations_in_summary": len(recent),
        "average_metrics": averages,
        "overall_average": overall,
    }
||||
|
||||
|
||||
def get_evaluation_info() -> dict:
    """Get information about evaluation configuration."""
    supported_metrics = [
        "context_precision",
        "context_recall",
        "faithfulness",
        "answer_relevancy",
    ]
    return {
        "enabled": EVALUATION_ENABLED,
        "llm_configured": bool(OPENAI_API_KEY),
        "eval_model": EVAL_MODEL,
        "results_file": str(EVAL_RESULTS_FILE),
        "metrics": supported_metrics,
    }
|
||||
Reference in New Issue
Block a user