# Restoration note (from commit message):
# A previous `git pull --rebase origin main` dropped 177 local commits,
# losing 3400+ files across admin-v2, backend, studio-v2, website,
# klausur-service, and many other services. The partial restore attempt
# (660295e2) only recovered some files.
# This commit restores all missing files from pre-rebase ref 98933f5e
# while preserving post-rebase additions (night-scheduler, night-mode UI,
# NightModeWidget dashboard integration).
# Restored features include:
# - AI Module Sidebar (FAB), OCR Labeling, OCR Compare
# - GPU Dashboard, RAG Pipeline, Magic Help
# - Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
# - Companion, Zeugnisse-Crawler, Screen Flow
# - Full backend, studio-v2, website, klausur-service
# - All compliance SDKs, agent-core, voice-service
# - CI/CD configs, documentation, scripts
# Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
"""
|
|
RAG Evaluation Module
|
|
|
|
Implements key RAG quality metrics inspired by RAGAS framework:
|
|
- Context Precision: Is the retrieved context relevant?
|
|
- Context Recall: Did we retrieve all necessary information?
|
|
- Faithfulness: Are answers grounded in the context?
|
|
- Answer Relevancy: Does the answer address the question?
|
|
|
|
These metrics help continuously monitor and improve RAG quality.
|
|
"""
|
|
|
|
import json
import os
import re
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Optional, Tuple

import httpx
# --- Configuration (all values overridable via environment variables) ---

# Master switch for the evaluation subsystem.
EVALUATION_ENABLED = os.getenv("RAG_EVALUATION_ENABLED", "true").lower() == "true"
# API key for the LLM judge; empty string disables LLM-based metrics.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
# Model used as the LLM evaluation judge.
EVAL_MODEL = os.getenv("RAG_EVAL_MODEL", "gpt-4o-mini")

# Storage location for persisted evaluation results (JSON list).
EVAL_RESULTS_FILE = Path(os.getenv("RAG_EVAL_RESULTS_FILE", "/app/docs/rag_evaluation_results.json"))
class RAGEvaluationError(Exception):
    """Raised when a RAG evaluation step fails."""
def _load_eval_results() -> List[Dict]:
|
|
"""Load evaluation results from file."""
|
|
if EVAL_RESULTS_FILE.exists():
|
|
try:
|
|
with open(EVAL_RESULTS_FILE, 'r') as f:
|
|
return json.load(f)
|
|
except Exception:
|
|
return []
|
|
return []
|
|
|
|
|
|
def _save_eval_results(results: List[Dict]) -> None:
|
|
"""Save evaluation results to file."""
|
|
EVAL_RESULTS_FILE.parent.mkdir(parents=True, exist_ok=True)
|
|
with open(EVAL_RESULTS_FILE, 'w') as f:
|
|
json.dump(results[-1000:], f, indent=2) # Keep last 1000
|
|
|
|
|
|
def calculate_context_precision(
    query: str,
    retrieved_contexts: List[str],
    relevant_contexts: List[str]
) -> float:
    """
    Calculate Context Precision: what fraction of retrieved contexts are relevant?

    Precision = |Relevant ∩ Retrieved| / |Retrieved|

    Args:
        query: The search query
        retrieved_contexts: Contexts returned by RAG
        relevant_contexts: Ground truth relevant contexts

    Returns:
        Precision score between 0 and 1
    """
    if not retrieved_contexts:
        return 0.0

    # A retrieved context counts as relevant if it overlaps any ground-truth
    # context beyond the 0.5 Jaccard threshold.
    hits = sum(
        1
        for candidate in retrieved_contexts
        if any(_text_similarity(candidate, truth) > 0.5 for truth in relevant_contexts)
    )

    return hits / len(retrieved_contexts)
def calculate_context_recall(
    query: str,
    retrieved_contexts: List[str],
    relevant_contexts: List[str]
) -> float:
    """
    Calculate Context Recall: what fraction of relevant contexts were retrieved?

    Recall = |Relevant ∩ Retrieved| / |Relevant|

    Args:
        query: The search query
        retrieved_contexts: Contexts returned by RAG
        relevant_contexts: Ground truth relevant contexts

    Returns:
        Recall score between 0 and 1
    """
    if not relevant_contexts:
        return 1.0  # No relevant contexts to miss

    # A ground-truth context counts as found if any retrieved context
    # overlaps it beyond the 0.5 Jaccard threshold.
    found = sum(
        1
        for truth in relevant_contexts
        if any(_text_similarity(candidate, truth) > 0.5 for candidate in retrieved_contexts)
    )

    return found / len(relevant_contexts)
def _text_similarity(text1: str, text2: str) -> float:
|
|
"""
|
|
Simple text similarity using word overlap (Jaccard similarity).
|
|
"""
|
|
words1 = set(text1.lower().split())
|
|
words2 = set(text2.lower().split())
|
|
|
|
if not words1 or not words2:
|
|
return 0.0
|
|
|
|
intersection = len(words1 & words2)
|
|
union = len(words1 | words2)
|
|
|
|
return intersection / union if union > 0 else 0.0
|
|
|
|
|
|
async def evaluate_faithfulness(
    answer: str,
    contexts: List[str],
) -> Tuple[float, str]:
    """
    Evaluate Faithfulness: is the answer grounded in the provided contexts?

    Asks an LLM judge (German prompt) whether every claim in the answer is
    supported by the contexts; the judge replies in a
    "SCORE: x / BEGRÜNDUNG: ..." format that is parsed here.

    Args:
        answer: The generated answer
        contexts: The retrieved contexts used to generate the answer

    Returns:
        Tuple of (faithfulness_score, explanation). Falls back to a neutral
        0.5 when no API key is configured or the call/parsing fails.
    """
    if not OPENAI_API_KEY:
        return 0.5, "LLM not configured for faithfulness evaluation"

    context_text = "\n---\n".join(contexts[:5])  # Limit context length

    prompt = f"""Bewerte, ob die folgende Antwort vollständig durch die gegebenen Kontexte gestützt wird.

KONTEXTE:
{context_text}

ANTWORT:
{answer}

Analysiere:
1. Sind alle Aussagen in der Antwort durch die Kontexte belegt?
2. Gibt es Behauptungen ohne Grundlage in den Kontexten?

Antworte im Format:
SCORE: [0.0-1.0]
BEGRÜNDUNG: [Kurze Erklärung]"""

    try:
        async with httpx.AsyncClient() as client:
            response = await client.post(
                "https://api.openai.com/v1/chat/completions",
                headers={
                    "Authorization": f"Bearer {OPENAI_API_KEY}",
                    "Content-Type": "application/json"
                },
                json={
                    "model": EVAL_MODEL,
                    "messages": [{"role": "user", "content": prompt}],
                    "max_tokens": 200,
                    "temperature": 0.0,  # deterministic judging
                },
                timeout=30.0
            )

        if response.status_code != 200:
            return 0.5, f"API error: {response.status_code}"

        result = response.json()["choices"][0]["message"]["content"]

        # Parse score from response (re is imported at module level; it was
        # previously re-imported on every call).
        score_match = re.search(r'SCORE:\s*([\d.]+)', result)
        score = float(score_match.group(1)) if score_match else 0.5

        reason_match = re.search(r'BEGRÜNDUNG:\s*(.+)', result, re.DOTALL)
        reason = reason_match.group(1).strip() if reason_match else result

        # Clamp in case the model returns an out-of-range number.
        return min(max(score, 0.0), 1.0), reason

    except Exception as e:
        # Evaluation is best-effort monitoring; never let it crash the caller.
        return 0.5, f"Evaluation error: {str(e)}"
async def evaluate_answer_relevancy(
    query: str,
    answer: str,
) -> Tuple[float, str]:
    """
    Evaluate Answer Relevancy: does the answer address the question?

    Asks an LLM judge (German prompt) to rate how well the answer addresses
    the query; the judge replies in a "SCORE: x / BEGRÜNDUNG: ..." format
    that is parsed here.

    Args:
        query: The original question
        answer: The generated answer

    Returns:
        Tuple of (relevancy_score, explanation). Falls back to a neutral
        0.5 when no API key is configured or the call/parsing fails.
    """
    if not OPENAI_API_KEY:
        return 0.5, "LLM not configured for relevancy evaluation"

    prompt = f"""Bewerte, wie relevant die Antwort für die gestellte Frage ist.

FRAGE: {query}

ANTWORT: {answer}

Analysiere:
1. Beantwortet die Antwort die gestellte Frage direkt?
2. Ist die Antwort vollständig oder fehlen wichtige Aspekte?
3. Enthält die Antwort irrelevante Informationen?

Antworte im Format:
SCORE: [0.0-1.0]
BEGRÜNDUNG: [Kurze Erklärung]"""

    try:
        async with httpx.AsyncClient() as client:
            response = await client.post(
                "https://api.openai.com/v1/chat/completions",
                headers={
                    "Authorization": f"Bearer {OPENAI_API_KEY}",
                    "Content-Type": "application/json"
                },
                json={
                    "model": EVAL_MODEL,
                    "messages": [{"role": "user", "content": prompt}],
                    "max_tokens": 200,
                    "temperature": 0.0,  # deterministic judging
                },
                timeout=30.0
            )

        if response.status_code != 200:
            return 0.5, f"API error: {response.status_code}"

        result = response.json()["choices"][0]["message"]["content"]

        # Parse score from response (re is imported at module level; it was
        # previously re-imported on every call).
        score_match = re.search(r'SCORE:\s*([\d.]+)', result)
        score = float(score_match.group(1)) if score_match else 0.5

        reason_match = re.search(r'BEGRÜNDUNG:\s*(.+)', result, re.DOTALL)
        reason = reason_match.group(1).strip() if reason_match else result

        # Clamp in case the model returns an out-of-range number.
        return min(max(score, 0.0), 1.0), reason

    except Exception as e:
        # Evaluation is best-effort monitoring; never let it crash the caller.
        return 0.5, f"Evaluation error: {str(e)}"
async def evaluate_rag_response(
    query: str,
    answer: str,
    retrieved_contexts: List[str],
    ground_truth_contexts: Optional[List[str]] = None,
    ground_truth_answer: Optional[str] = None,
) -> Dict:
    """
    Comprehensive RAG evaluation combining all metrics.

    Runs retrieval metrics when ground truth is supplied and LLM-judged
    metrics when an API key is configured, averages whatever metrics were
    produced into an overall score, and appends the record to the persisted
    history.

    Args:
        query: The original question
        answer: The generated answer
        retrieved_contexts: Contexts retrieved by RAG
        ground_truth_contexts: Optional ground truth relevant contexts
        ground_truth_answer: Optional ground truth answer

    Returns:
        Evaluation results with all metrics
    """
    metrics: Dict[str, float] = {}
    record: Dict = {
        "timestamp": datetime.now().isoformat(),
        "query": query,
        "answer_length": len(answer),
        "contexts_count": len(retrieved_contexts),
        "metrics": metrics,
    }

    # Retrieval-quality metrics require labeled ground truth.
    if ground_truth_contexts:
        metrics["context_precision"] = calculate_context_precision(
            query, retrieved_contexts, ground_truth_contexts
        )
        metrics["context_recall"] = calculate_context_recall(
            query, retrieved_contexts, ground_truth_contexts
        )

    # Faithfulness needs both an LLM judge and some contexts to check against.
    if OPENAI_API_KEY and retrieved_contexts:
        faith_score, faith_reason = await evaluate_faithfulness(answer, retrieved_contexts)
        metrics["faithfulness"] = faith_score
        record["faithfulness_reason"] = faith_reason

    # Answer relevancy only needs the LLM judge.
    if OPENAI_API_KEY:
        rel_score, rel_reason = await evaluate_answer_relevancy(query, answer)
        metrics["answer_relevancy"] = rel_score
        record["answer_relevancy_reason"] = rel_reason

    # Overall score: plain mean of whichever metrics were computed.
    if metrics:
        record["overall_score"] = sum(metrics.values()) / len(metrics)

    # Append to the rolling on-disk history.
    history = _load_eval_results()
    history.append(record)
    _save_eval_results(history)

    return record
def get_evaluation_summary(last_n: int = 100) -> Dict:
    """
    Get summary statistics of recent evaluations.

    Args:
        last_n: Number of recent evaluations to include

    Returns:
        Summary with average scores and trends, or a placeholder message
        when no evaluations have been recorded yet.
    """
    history = _load_eval_results()
    recent = history[-last_n:] if history else []

    if not recent:
        return {
            "total_evaluations": 0,
            "message": "No evaluations yet",
        }

    # Bucket every observed value by metric name, then average each bucket.
    buckets: Dict[str, List[float]] = {}
    for record in recent:
        for name, value in record.get("metrics", {}).items():
            buckets.setdefault(name, []).append(value)

    averages = {name: sum(values) / len(values) for name, values in buckets.items()}

    return {
        "total_evaluations": len(history),
        "evaluations_in_summary": len(recent),
        "average_metrics": averages,
        "overall_average": sum(averages.values()) / len(averages) if averages else 0,
    }
def get_evaluation_info() -> dict:
    """Get information about evaluation configuration."""
    supported_metrics = [
        "context_precision",
        "context_recall",
        "faithfulness",
        "answer_relevancy",
    ]
    return {
        "enabled": EVALUATION_ENABLED,
        "llm_configured": bool(OPENAI_API_KEY),
        "eval_model": EVAL_MODEL,
        "results_file": str(EVAL_RESULTS_FILE),
        "metrics": supported_metrics,
    }