Initial commit: breakpilot-lehrer - Lehrer KI Platform
Services: Admin-Lehrer, Backend-Lehrer, Studio v2, Website, Klausur-Service, School-Service, Voice-Service, Geo-Service, BreakPilot Drive, Agent-Core Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
393
klausur-service/backend/rag_evaluation.py
Normal file
393
klausur-service/backend/rag_evaluation.py
Normal file
@@ -0,0 +1,393 @@
|
||||
"""
|
||||
RAG Evaluation Module
|
||||
|
||||
Implements key RAG quality metrics inspired by RAGAS framework:
|
||||
- Context Precision: Is the retrieved context relevant?
|
||||
- Context Recall: Did we retrieve all necessary information?
|
||||
- Faithfulness: Are answers grounded in the context?
|
||||
- Answer Relevancy: Does the answer address the question?
|
||||
|
||||
These metrics help continuously monitor and improve RAG quality.
|
||||
"""
|
||||
|
||||
import os
|
||||
from typing import List, Dict, Optional, Tuple
|
||||
from datetime import datetime
|
||||
import json
|
||||
from pathlib import Path
|
||||
import httpx
|
||||
|
||||
# Configuration (all values overridable via environment variables)
# Master switch for evaluation; any value other than "true" (case-insensitive) disables it.
EVALUATION_ENABLED = os.getenv("RAG_EVALUATION_ENABLED", "true").lower() == "true"
# API key for the evaluation LLM; empty string means LLM-based metrics are skipped.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
# Model used for LLM-judged metrics (faithfulness, answer relevancy).
EVAL_MODEL = os.getenv("RAG_EVAL_MODEL", "gpt-4o-mini")

# Storage for evaluation results (JSON list of result dicts, appended per evaluation).
EVAL_RESULTS_FILE = Path(os.getenv("RAG_EVAL_RESULTS_FILE", "/app/docs/rag_evaluation_results.json"))
|
||||
|
||||
class RAGEvaluationError(Exception):
    """Raised when a RAG evaluation step fails."""
|
||||
|
||||
def _load_eval_results() -> List[Dict]:
    """Load previously stored evaluation results.

    Returns:
        The list of result dicts read from EVAL_RESULTS_FILE, or an empty
        list when the file is missing, unreadable, or contains invalid JSON.
    """
    if EVAL_RESULTS_FILE.exists():
        try:
            with open(EVAL_RESULTS_FILE, 'r') as f:
                return json.load(f)
        # Best-effort read: an unreadable file (OSError) or corrupt JSON
        # (json.JSONDecodeError / UnicodeDecodeError, both ValueError
        # subclasses) yields an empty history instead of crashing.
        # Narrowed from a bare `except Exception` so genuine bugs surface.
        except (OSError, ValueError):
            return []
    return []
|
||||
|
||||
def _save_eval_results(results: List[Dict]) -> None:
    """Persist evaluation results to EVAL_RESULTS_FILE as pretty-printed JSON.

    Only the most recent 1000 entries are written, bounding file growth.
    Parent directories are created on demand.
    """
    EVAL_RESULTS_FILE.parent.mkdir(parents=True, exist_ok=True)
    trimmed = results[-1000:]
    with open(EVAL_RESULTS_FILE, 'w') as f:
        json.dump(trimmed, f, indent=2)
|
||||
|
||||
def calculate_context_precision(
    query: str,
    retrieved_contexts: List[str],
    relevant_contexts: List[str]
) -> float:
    """
    Calculate Context Precision: What fraction of retrieved contexts are relevant?

    Precision = |Relevant ∩ Retrieved| / |Retrieved|

    A retrieved context counts as relevant when its word-overlap similarity
    with at least one ground-truth context exceeds 0.5.

    Args:
        query: The search query (currently unused by the overlap heuristic).
        retrieved_contexts: Contexts returned by RAG.
        relevant_contexts: Ground truth relevant contexts.

    Returns:
        Precision score between 0 and 1 (0.0 when nothing was retrieved).
    """
    if not retrieved_contexts:
        return 0.0

    hits = sum(
        1
        for candidate in retrieved_contexts
        if any(_text_similarity(candidate, truth) > 0.5 for truth in relevant_contexts)
    )
    return hits / len(retrieved_contexts)
|
||||
|
||||
def calculate_context_recall(
    query: str,
    retrieved_contexts: List[str],
    relevant_contexts: List[str]
) -> float:
    """
    Calculate Context Recall: What fraction of relevant contexts were retrieved?

    Recall = |Relevant ∩ Retrieved| / |Relevant|

    A ground-truth context counts as found when its word-overlap similarity
    with at least one retrieved context exceeds 0.5.

    Args:
        query: The search query (currently unused by the overlap heuristic).
        retrieved_contexts: Contexts returned by RAG.
        relevant_contexts: Ground truth relevant contexts.

    Returns:
        Recall score between 0 and 1 (1.0 when there is nothing to miss).
    """
    if not relevant_contexts:
        return 1.0  # No relevant contexts to miss

    found = sum(
        1
        for truth in relevant_contexts
        if any(_text_similarity(candidate, truth) > 0.5 for candidate in retrieved_contexts)
    )
    return found / len(relevant_contexts)
|
||||
|
||||
def _text_similarity(text1: str, text2: str) -> float:
|
||||
"""
|
||||
Simple text similarity using word overlap (Jaccard similarity).
|
||||
"""
|
||||
words1 = set(text1.lower().split())
|
||||
words2 = set(text2.lower().split())
|
||||
|
||||
if not words1 or not words2:
|
||||
return 0.0
|
||||
|
||||
intersection = len(words1 & words2)
|
||||
union = len(words1 | words2)
|
||||
|
||||
return intersection / union if union > 0 else 0.0
|
||||
|
||||
|
||||
async def evaluate_faithfulness(
    answer: str,
    contexts: List[str],
) -> Tuple[float, str]:
    """
    Evaluate Faithfulness: Is the answer grounded in the provided contexts?

    Asks the configured evaluation LLM whether every claim in the answer is
    supported by the contexts, then parses a SCORE/BEGRÜNDUNG formatted reply.

    Args:
        answer: The generated answer.
        contexts: The retrieved contexts used to generate the answer.

    Returns:
        Tuple of (faithfulness_score, explanation). A neutral 0.5 is
        returned when the LLM is not configured or the API call fails.
    """
    import re

    if not OPENAI_API_KEY:
        return 0.5, "LLM not configured for faithfulness evaluation"

    # Cap at five contexts to keep the prompt size bounded.
    context_text = "\n---\n".join(contexts[:5])

    prompt = f"""Bewerte, ob die folgende Antwort vollständig durch die gegebenen Kontexte gestützt wird.

KONTEXTE:
{context_text}

ANTWORT:
{answer}

Analysiere:
1. Sind alle Aussagen in der Antwort durch die Kontexte belegt?
2. Gibt es Behauptungen ohne Grundlage in den Kontexten?

Antworte im Format:
SCORE: [0.0-1.0]
BEGRÜNDUNG: [Kurze Erklärung]"""

    request_headers = {
        "Authorization": f"Bearer {OPENAI_API_KEY}",
        "Content-Type": "application/json",
    }
    request_body = {
        "model": EVAL_MODEL,
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 200,
        "temperature": 0.0,
    }

    try:
        async with httpx.AsyncClient() as client:
            response = await client.post(
                "https://api.openai.com/v1/chat/completions",
                headers=request_headers,
                json=request_body,
                timeout=30.0,
            )

            if response.status_code != 200:
                return 0.5, f"API error: {response.status_code}"

            result = response.json()["choices"][0]["message"]["content"]

            # Extract the numeric score; fall back to neutral 0.5 if absent.
            score_match = re.search(r'SCORE:\s*([\d.]+)', result)
            if score_match:
                score = float(score_match.group(1))
            else:
                score = 0.5

            # Prefer the explanation after BEGRÜNDUNG:, else the raw reply.
            reason_match = re.search(r'BEGRÜNDUNG:\s*(.+)', result, re.DOTALL)
            if reason_match:
                reason = reason_match.group(1).strip()
            else:
                reason = result

            # Clamp to the documented [0, 1] range.
            return min(max(score, 0.0), 1.0), reason

    except Exception as e:
        return 0.5, f"Evaluation error: {str(e)}"
||||
|
||||
|
||||
async def evaluate_answer_relevancy(
    query: str,
    answer: str,
) -> Tuple[float, str]:
    """
    Evaluate Answer Relevancy: Does the answer address the question?

    Asks the configured evaluation LLM to rate how well the answer responds
    to the query, then parses a SCORE/BEGRÜNDUNG formatted reply.

    Args:
        query: The original question.
        answer: The generated answer.

    Returns:
        Tuple of (relevancy_score, explanation). A neutral 0.5 is returned
        when the LLM is not configured or the API call fails.
    """
    import re

    if not OPENAI_API_KEY:
        return 0.5, "LLM not configured for relevancy evaluation"

    prompt = f"""Bewerte, wie relevant die Antwort für die gestellte Frage ist.

FRAGE: {query}

ANTWORT: {answer}

Analysiere:
1. Beantwortet die Antwort die gestellte Frage direkt?
2. Ist die Antwort vollständig oder fehlen wichtige Aspekte?
3. Enthält die Antwort irrelevante Informationen?

Antworte im Format:
SCORE: [0.0-1.0]
BEGRÜNDUNG: [Kurze Erklärung]"""

    request_headers = {
        "Authorization": f"Bearer {OPENAI_API_KEY}",
        "Content-Type": "application/json",
    }
    request_body = {
        "model": EVAL_MODEL,
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 200,
        "temperature": 0.0,
    }

    try:
        async with httpx.AsyncClient() as client:
            response = await client.post(
                "https://api.openai.com/v1/chat/completions",
                headers=request_headers,
                json=request_body,
                timeout=30.0,
            )

            if response.status_code != 200:
                return 0.5, f"API error: {response.status_code}"

            result = response.json()["choices"][0]["message"]["content"]

            # Extract the numeric score; fall back to neutral 0.5 if absent.
            score_match = re.search(r'SCORE:\s*([\d.]+)', result)
            if score_match:
                score = float(score_match.group(1))
            else:
                score = 0.5

            # Prefer the explanation after BEGRÜNDUNG:, else the raw reply.
            reason_match = re.search(r'BEGRÜNDUNG:\s*(.+)', result, re.DOTALL)
            if reason_match:
                reason = reason_match.group(1).strip()
            else:
                reason = result

            # Clamp to the documented [0, 1] range.
            return min(max(score, 0.0), 1.0), reason

    except Exception as e:
        return 0.5, f"Evaluation error: {str(e)}"
||||
|
||||
|
||||
async def evaluate_rag_response(
    query: str,
    answer: str,
    retrieved_contexts: List[str],
    ground_truth_contexts: Optional[List[str]] = None,
    ground_truth_answer: Optional[str] = None,
) -> Dict:
    """
    Comprehensive RAG evaluation combining all metrics.

    Context precision/recall are computed only when ground-truth contexts are
    supplied; faithfulness and answer relevancy only when the LLM is
    configured. The result is appended to the on-disk evaluation history.

    NOTE(review): EVALUATION_ENABLED is never consulted here — presumably the
    caller gates on it; confirm and either document or enforce the flag.

    Args:
        query: The original question.
        answer: The generated answer.
        retrieved_contexts: Contexts retrieved by RAG.
        ground_truth_contexts: Optional ground truth relevant contexts.
        ground_truth_answer: Optional ground truth answer (currently unused).

    Returns:
        Evaluation results with timestamp, input stats, per-metric scores
        under "metrics", LLM explanations, and "overall_score" (mean of the
        computed metrics, present only when at least one metric was computed).
    """
    results = {
        "timestamp": datetime.now().isoformat(),
        "query": query,
        "answer_length": len(answer),
        "contexts_count": len(retrieved_contexts),
        "metrics": {},
    }

    # Context metrics (if ground truth available)
    if ground_truth_contexts:
        results["metrics"]["context_precision"] = calculate_context_precision(
            query, retrieved_contexts, ground_truth_contexts
        )
        results["metrics"]["context_recall"] = calculate_context_recall(
            query, retrieved_contexts, ground_truth_contexts
        )

    # Faithfulness (requires LLM and at least one context to ground against)
    if OPENAI_API_KEY and retrieved_contexts:
        faith_score, faith_reason = await evaluate_faithfulness(answer, retrieved_contexts)
        results["metrics"]["faithfulness"] = faith_score
        results["faithfulness_reason"] = faith_reason

    # Answer relevancy (requires LLM)
    if OPENAI_API_KEY:
        rel_score, rel_reason = await evaluate_answer_relevancy(query, answer)
        results["metrics"]["answer_relevancy"] = rel_score
        results["answer_relevancy_reason"] = rel_reason

    # Overall score: unweighted mean of whichever metrics were computed.
    metric_values = list(results["metrics"].values())
    if metric_values:
        results["overall_score"] = sum(metric_values) / len(metric_values)

    # Persist best-effort: a storage failure (e.g. read-only results path)
    # must not discard the evaluation we just computed for the caller.
    try:
        all_results = _load_eval_results()
        all_results.append(results)
        _save_eval_results(all_results)
    except Exception:
        pass  # deliberate: persistence is auxiliary to returning the result

    return results
|
||||
|
||||
def get_evaluation_summary(last_n: int = 100) -> Dict:
    """
    Get summary statistics of recent evaluations.

    Args:
        last_n: Number of recent evaluations to include.

    Returns:
        Summary with average scores and trends; a placeholder payload when
        no evaluations have been recorded yet.
    """
    all_results = _load_eval_results()
    recent = all_results[-last_n:] if all_results else []

    if not recent:
        return {
            "total_evaluations": 0,
            "message": "No evaluations yet",
        }

    # Accumulate per-metric running sums and sample counts over the window.
    sums: Dict[str, float] = {}
    counts: Dict[str, int] = {}
    for entry in recent:
        for name, value in entry.get("metrics", {}).items():
            sums[name] = sums.get(name, 0) + value
            counts[name] = counts.get(name, 0) + 1

    averages = {name: sums[name] / counts[name] for name in sums}
    overall = sum(averages.values()) / len(averages) if averages else 0

    return {
        "total_evaluations": len(all_results),
        "evaluations_in_summary": len(recent),
        "average_metrics": averages,
        "overall_average": overall,
    }
||||
|
||||
|
||||
def get_evaluation_info() -> dict:
    """Get information about evaluation configuration."""
    supported_metrics = [
        "context_precision",
        "context_recall",
        "faithfulness",
        "answer_relevancy",
    ]
    return {
        "enabled": EVALUATION_ENABLED,
        "llm_configured": bool(OPENAI_API_KEY),
        "eval_model": EVAL_MODEL,
        "results_file": str(EVAL_RESULTS_FILE),
        "metrics": supported_metrics,
    }
|
||||
Reference in New Issue
Block a user