""" RAG Evaluation Module Implements key RAG quality metrics inspired by RAGAS framework: - Context Precision: Is the retrieved context relevant? - Context Recall: Did we retrieve all necessary information? - Faithfulness: Are answers grounded in the context? - Answer Relevancy: Does the answer address the question? These metrics help continuously monitor and improve RAG quality. """ import os from typing import List, Dict, Optional, Tuple from datetime import datetime import json from pathlib import Path import httpx # Configuration EVALUATION_ENABLED = os.getenv("RAG_EVALUATION_ENABLED", "true").lower() == "true" OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "") EVAL_MODEL = os.getenv("RAG_EVAL_MODEL", "gpt-4o-mini") # Storage for evaluation results EVAL_RESULTS_FILE = Path(os.getenv("RAG_EVAL_RESULTS_FILE", "/app/docs/rag_evaluation_results.json")) class RAGEvaluationError(Exception): """Error during RAG evaluation.""" pass def _load_eval_results() -> List[Dict]: """Load evaluation results from file.""" if EVAL_RESULTS_FILE.exists(): try: with open(EVAL_RESULTS_FILE, 'r') as f: return json.load(f) except Exception: return [] return [] def _save_eval_results(results: List[Dict]) -> None: """Save evaluation results to file.""" EVAL_RESULTS_FILE.parent.mkdir(parents=True, exist_ok=True) with open(EVAL_RESULTS_FILE, 'w') as f: json.dump(results[-1000:], f, indent=2) # Keep last 1000 def calculate_context_precision( query: str, retrieved_contexts: List[str], relevant_contexts: List[str] ) -> float: """ Calculate Context Precision: What fraction of retrieved contexts are relevant? Precision = |Relevant ∩ Retrieved| / |Retrieved| Args: query: The search query retrieved_contexts: Contexts returned by RAG relevant_contexts: Ground truth relevant contexts Returns: Precision score between 0 and 1 """ if not retrieved_contexts: return 0.0 # Simple text overlap check relevant_count = 0 for ret_ctx in retrieved_contexts: for rel_ctx in relevant_contexts: # Check if there's significant overlap if _text_similarity(ret_ctx, rel_ctx) > 0.5: relevant_count += 1 break return relevant_count / len(retrieved_contexts) def calculate_context_recall( query: str, retrieved_contexts: List[str], relevant_contexts: List[str] ) -> float: """ Calculate Context Recall: What fraction of relevant contexts were retrieved? Recall = |Relevant ∩ Retrieved| / |Relevant| Args: query: The search query retrieved_contexts: Contexts returned by RAG relevant_contexts: Ground truth relevant contexts Returns: Recall score between 0 and 1 """ if not relevant_contexts: return 1.0 # No relevant contexts to miss found_count = 0 for rel_ctx in relevant_contexts: for ret_ctx in retrieved_contexts: if _text_similarity(ret_ctx, rel_ctx) > 0.5: found_count += 1 break return found_count / len(relevant_contexts) def _text_similarity(text1: str, text2: str) -> float: """ Simple text similarity using word overlap (Jaccard similarity). """ words1 = set(text1.lower().split()) words2 = set(text2.lower().split()) if not words1 or not words2: return 0.0 intersection = len(words1 & words2) union = len(words1 | words2) return intersection / union if union > 0 else 0.0 async def evaluate_faithfulness( answer: str, contexts: List[str], ) -> Tuple[float, str]: """ Evaluate Faithfulness: Is the answer grounded in the provided contexts? Uses LLM to check if claims in the answer are supported by contexts. 

    Args:
        answer: The generated answer
        contexts: The retrieved contexts used to generate the answer

    Returns:
        Tuple of (faithfulness_score, explanation)
    """
    if not OPENAI_API_KEY:
        return 0.5, "LLM not configured for faithfulness evaluation"

    context_text = "\n---\n".join(contexts[:5])  # Limit context length

    prompt = f"""Evaluate whether the following answer is fully supported by the given contexts.

CONTEXTS:
{context_text}

ANSWER:
{answer}

Analyze:
1. Are all statements in the answer supported by the contexts?
2. Are there any claims with no basis in the contexts?

Respond in the format:
SCORE: [0.0-1.0]
REASONING: [brief explanation]"""

    try:
        async with httpx.AsyncClient() as client:
            response = await client.post(
                "https://api.openai.com/v1/chat/completions",
                headers={
                    "Authorization": f"Bearer {OPENAI_API_KEY}",
                    "Content-Type": "application/json",
                },
                json={
                    "model": EVAL_MODEL,
                    "messages": [{"role": "user", "content": prompt}],
                    "max_tokens": 200,
                    "temperature": 0.0,
                },
                timeout=30.0,
            )

            if response.status_code != 200:
                return 0.5, f"API error: {response.status_code}"

            result = response.json()["choices"][0]["message"]["content"]

            # Parse score from response
            score_match = re.search(r'SCORE:\s*([\d.]+)', result)
            score = float(score_match.group(1)) if score_match else 0.5

            reason_match = re.search(r'REASONING:\s*(.+)', result, re.DOTALL)
            reason = reason_match.group(1).strip() if reason_match else result

            return min(max(score, 0.0), 1.0), reason

    except Exception as e:
        return 0.5, f"Evaluation error: {str(e)}"


async def evaluate_answer_relevancy(
    query: str,
    answer: str,
) -> Tuple[float, str]:
    """
    Evaluate Answer Relevancy: Does the answer address the question?

    Uses LLM to assess if the answer is relevant to the query.

    Args:
        query: The original question
        answer: The generated answer

    Returns:
        Tuple of (relevancy_score, explanation)
    """
    if not OPENAI_API_KEY:
        return 0.5, "LLM not configured for relevancy evaluation"

    prompt = f"""Evaluate how relevant the answer is to the question asked.

QUESTION:
{query}

ANSWER:
{answer}

Analyze:
1. Does the answer directly address the question asked?
2. Is the answer complete, or are important aspects missing?
3. Does the answer contain irrelevant information?

Respond in the format:
SCORE: [0.0-1.0]
REASONING: [brief explanation]"""

    try:
        async with httpx.AsyncClient() as client:
            response = await client.post(
                "https://api.openai.com/v1/chat/completions",
                headers={
                    "Authorization": f"Bearer {OPENAI_API_KEY}",
                    "Content-Type": "application/json",
                },
                json={
                    "model": EVAL_MODEL,
                    "messages": [{"role": "user", "content": prompt}],
                    "max_tokens": 200,
                    "temperature": 0.0,
                },
                timeout=30.0,
            )

            if response.status_code != 200:
                return 0.5, f"API error: {response.status_code}"

            result = response.json()["choices"][0]["message"]["content"]

            score_match = re.search(r'SCORE:\s*([\d.]+)', result)
            score = float(score_match.group(1)) if score_match else 0.5

            reason_match = re.search(r'REASONING:\s*(.+)', result, re.DOTALL)
            reason = reason_match.group(1).strip() if reason_match else result

            return min(max(score, 0.0), 1.0), reason

    except Exception as e:
        return 0.5, f"Evaluation error: {str(e)}"


async def evaluate_rag_response(
    query: str,
    answer: str,
    retrieved_contexts: List[str],
    ground_truth_contexts: Optional[List[str]] = None,
    ground_truth_answer: Optional[str] = None,
) -> Dict:
    """
    Comprehensive RAG evaluation combining all metrics.

    Args:
        query: The original question
        answer: The generated answer
        retrieved_contexts: Contexts retrieved by RAG
        ground_truth_contexts: Optional ground truth relevant contexts
        ground_truth_answer: Optional ground truth answer

    Returns:
        Evaluation results with all metrics
    """
    results = {
        "timestamp": datetime.now().isoformat(),
        "query": query,
        "answer_length": len(answer),
        "contexts_count": len(retrieved_contexts),
        "metrics": {},
    }

    # Context metrics (if ground truth available)
    if ground_truth_contexts:
        results["metrics"]["context_precision"] = calculate_context_precision(
            query, retrieved_contexts, ground_truth_contexts
        )
        results["metrics"]["context_recall"] = calculate_context_recall(
            query, retrieved_contexts, ground_truth_contexts
        )

    # Faithfulness (requires LLM)
    if OPENAI_API_KEY and retrieved_contexts:
        faith_score, faith_reason = await evaluate_faithfulness(answer, retrieved_contexts)
        results["metrics"]["faithfulness"] = faith_score
        results["faithfulness_reason"] = faith_reason

    # Answer relevancy (requires LLM)
    if OPENAI_API_KEY:
        rel_score, rel_reason = await evaluate_answer_relevancy(query, answer)
        results["metrics"]["answer_relevancy"] = rel_score
        results["answer_relevancy_reason"] = rel_reason

    # Calculate overall score
    metric_values = list(results["metrics"].values())
    if metric_values:
        results["overall_score"] = sum(metric_values) / len(metric_values)

    # Store results
    all_results = _load_eval_results()
    all_results.append(results)
    _save_eval_results(all_results)

    return results


def get_evaluation_summary(last_n: int = 100) -> Dict:
    """
    Get summary statistics of recent evaluations.

    Args:
        last_n: Number of recent evaluations to include

    Returns:
        Summary with average scores and trends
    """
    all_results = _load_eval_results()
    recent = all_results[-last_n:] if all_results else []

    if not recent:
        return {
            "total_evaluations": 0,
            "message": "No evaluations yet",
        }

    # Calculate averages
    metrics_sums = {}
    metrics_counts = {}
    for result in recent:
        for metric, value in result.get("metrics", {}).items():
            if metric not in metrics_sums:
                metrics_sums[metric] = 0
                metrics_counts[metric] = 0
            metrics_sums[metric] += value
            metrics_counts[metric] += 1

    averages = {
        metric: metrics_sums[metric] / metrics_counts[metric]
        for metric in metrics_sums
    }

    return {
        "total_evaluations": len(all_results),
        "evaluations_in_summary": len(recent),
        "average_metrics": averages,
        "overall_average": sum(averages.values()) / len(averages) if averages else 0,
    }


def get_evaluation_info() -> dict:
    """Get information about evaluation configuration."""
    return {
        "enabled": EVALUATION_ENABLED,
        "llm_configured": bool(OPENAI_API_KEY),
        "eval_model": EVAL_MODEL,
        "results_file": str(EVAL_RESULTS_FILE),
        "metrics": [
            "context_precision",
            "context_recall",
            "faithfulness",
            "answer_relevancy",
        ],
    }
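

# Minimal usage sketch (illustrative only, not part of the evaluation API): the query,
# answer, and context strings below are hypothetical sample data. Without OPENAI_API_KEY
# the LLM-based metrics (faithfulness, answer relevancy) are simply skipped, so only the
# ground-truth context metrics are computed. Each call persists a result entry to
# EVAL_RESULTS_FILE; override RAG_EVAL_RESULTS_FILE if /app/docs is not writable locally.
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        # Hypothetical inputs for a single end-to-end evaluation
        result = await evaluate_rag_response(
            query="What is the refund policy?",
            answer="Refunds are available within 30 days of purchase.",
            retrieved_contexts=[
                "Customers may request a refund within 30 days of purchase.",
                "Shipping times vary between 2 and 5 business days.",
            ],
            ground_truth_contexts=[
                "Customers may request a refund within 30 days of purchase.",
            ],
        )
        print(json.dumps(result, indent=2, ensure_ascii=False))
        print(json.dumps(get_evaluation_summary(), indent=2, ensure_ascii=False))

    asyncio.run(_demo())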