refactor: Jitsi/Matrix/Voice von Core übernommen, Camunda/BPMN gelöscht, Kommunikation-Nav
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m55s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 18s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m55s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 18s
- Voice-Service von Core nach Lehrer verschoben (bp-lehrer-voice-service) - 4 Jitsi-Services + 2 Synapse-Services in docker-compose.yml aufgenommen - Camunda komplett gelöscht: workflow pages, workflow-config.ts, bpmn-js deps - CAMUNDA_URL aus backend-lehrer environment entfernt - Sidebar: Kategorie "Compliance SDK" + "Katalogverwaltung" entfernt - Sidebar: Neue Kategorie "Kommunikation" mit Video & Chat, Voice Service, Alerts Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
208
voice-service/bqas/metrics.py
Normal file
208
voice-service/bqas/metrics.py
Normal file
@@ -0,0 +1,208 @@
|
||||
"""
|
||||
BQAS Metrics - RAGAS-inspired evaluation metrics
|
||||
"""
|
||||
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Any, Dict, List
|
||||
|
||||
|
||||
@dataclass
class TestResult:
    """Result of a single test case."""

    # Test identity and conversation payload
    test_id: str
    test_name: str
    user_input: str
    expected_intent: str
    detected_intent: str
    response: str

    # Scores
    intent_accuracy: int  # 0-100
    faithfulness: int  # 1-5
    relevance: int  # 1-5
    coherence: int  # 1-5
    safety: str  # "pass" or "fail"

    # Computed
    composite_score: float
    passed: bool
    reasoning: str

    # Metadata
    timestamp: datetime
    duration_ms: int

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for serialization."""
        field_order = (
            "test_id",
            "test_name",
            "user_input",
            "expected_intent",
            "detected_intent",
            "response",
            "intent_accuracy",
            "faithfulness",
            "relevance",
            "coherence",
            "safety",
            "composite_score",
            "passed",
            "reasoning",
            "timestamp",
            "duration_ms",
        )
        payload = {name: getattr(self, name) for name in field_order}
        # datetime is not JSON-serializable; emit ISO-8601 text instead.
        payload["timestamp"] = self.timestamp.isoformat()
        return payload
|
||||
|
||||
|
||||
@dataclass
class BQASMetrics:
    """Aggregated metrics for a test run.

    Built from a list of ``TestResult`` objects via :meth:`from_results`;
    all ``avg_*`` fields are arithmetic means over the whole run.
    """

    total_tests: int
    passed_tests: int
    failed_tests: int

    # Average scores
    avg_intent_accuracy: float  # mean of the 0-100 intent scores
    avg_faithfulness: float  # mean of the 1-5 scores
    avg_relevance: float
    avg_coherence: float
    safety_pass_rate: float  # fraction of results whose safety == "pass"

    # Composite
    avg_composite_score: float

    # By category: expected_intent -> mean composite score
    scores_by_intent: Dict[str, float]

    # Failures
    failed_test_ids: List[str]

    # Timing
    total_duration_ms: int
    timestamp: datetime

    @classmethod
    def from_results(cls, results: List["TestResult"]) -> "BQASMetrics":
        """Calculate metrics from test results.

        An empty ``results`` list yields an all-zero metrics object instead
        of raising ``ZeroDivisionError``.
        """
        # datetime.utcnow() is deprecated (Python 3.12+) and naive; use an
        # aware UTC timestamp, computed once for both return paths.
        now = datetime.now(timezone.utc)

        if not results:
            return cls(
                total_tests=0,
                passed_tests=0,
                failed_tests=0,
                avg_intent_accuracy=0.0,
                avg_faithfulness=0.0,
                avg_relevance=0.0,
                avg_coherence=0.0,
                safety_pass_rate=0.0,
                avg_composite_score=0.0,
                scores_by_intent={},
                failed_test_ids=[],
                total_duration_ms=0,
                timestamp=now,
            )

        total = len(results)
        passed = sum(1 for r in results if r.passed)

        # Group composite scores by the intent each test expected.
        intent_scores: Dict[str, List[float]] = {}
        for r in results:
            intent_scores.setdefault(r.expected_intent, []).append(r.composite_score)

        scores_by_intent = {
            intent: sum(scores) / len(scores)
            for intent, scores in intent_scores.items()
        }

        return cls(
            total_tests=total,
            passed_tests=passed,
            failed_tests=total - passed,
            avg_intent_accuracy=sum(r.intent_accuracy for r in results) / total,
            avg_faithfulness=sum(r.faithfulness for r in results) / total,
            avg_relevance=sum(r.relevance for r in results) / total,
            avg_coherence=sum(r.coherence for r in results) / total,
            safety_pass_rate=sum(1 for r in results if r.safety == "pass") / total,
            avg_composite_score=sum(r.composite_score for r in results) / total,
            scores_by_intent=scores_by_intent,
            failed_test_ids=[r.test_id for r in results if not r.passed],
            total_duration_ms=sum(r.duration_ms for r in results),
            timestamp=now,
        )

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for serialization.

        Averages are rounded for readability; ``pass_rate`` is derived here
        (0 when no tests ran) rather than stored on the instance.
        """
        return {
            "total_tests": self.total_tests,
            "passed_tests": self.passed_tests,
            "failed_tests": self.failed_tests,
            "pass_rate": self.passed_tests / self.total_tests if self.total_tests > 0 else 0,
            "avg_intent_accuracy": round(self.avg_intent_accuracy, 2),
            "avg_faithfulness": round(self.avg_faithfulness, 2),
            "avg_relevance": round(self.avg_relevance, 2),
            "avg_coherence": round(self.avg_coherence, 2),
            "safety_pass_rate": round(self.safety_pass_rate, 3),
            "avg_composite_score": round(self.avg_composite_score, 3),
            "scores_by_intent": {k: round(v, 3) for k, v in self.scores_by_intent.items()},
            "failed_test_ids": self.failed_test_ids,
            "total_duration_ms": self.total_duration_ms,
            "timestamp": self.timestamp.isoformat(),
        }

    def summary(self) -> str:
        """Generate a human-readable summary."""
        lines = [
            "=" * 60,
            "BQAS Test Run Summary",
            "=" * 60,
            f"Total Tests: {self.total_tests}",
            # Guard the percentage against a zero-test run.
            f"Passed: {self.passed_tests} ({self.passed_tests/self.total_tests*100:.1f}%)" if self.total_tests > 0 else "Passed: 0",
            f"Failed: {self.failed_tests}",
            "",
            "Scores:",
            f"  Intent Accuracy: {self.avg_intent_accuracy:.1f}%",
            f"  Faithfulness: {self.avg_faithfulness:.2f}/5",
            f"  Relevance: {self.avg_relevance:.2f}/5",
            f"  Coherence: {self.avg_coherence:.2f}/5",
            f"  Safety Pass Rate: {self.safety_pass_rate*100:.1f}%",
            f"  Composite Score: {self.avg_composite_score:.3f}/5",
            "",
            "By Intent:",
        ]

        # Best-scoring intents first.
        for intent, score in sorted(self.scores_by_intent.items(), key=lambda x: x[1], reverse=True):
            lines.append(f"  {intent}: {score:.3f}")

        if self.failed_test_ids:
            lines.extend([
                "",
                f"Failed Tests ({len(self.failed_test_ids)}):",
            ])
            # Cap the listing at 10 IDs to keep the summary short.
            for test_id in self.failed_test_ids[:10]:
                lines.append(f"  - {test_id}")
            if len(self.failed_test_ids) > 10:
                lines.append(f"  ... and {len(self.failed_test_ids) - 10} more")

        lines.extend([
            "",
            f"Duration: {self.total_duration_ms}ms",
            "=" * 60,
        ])

        return "\n".join(lines)
|
||||
Reference in New Issue
Block a user