refactor: Jitsi/Matrix/Voice von Core übernommen, Camunda/BPMN gelöscht, Kommunikation-Nav
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m55s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 18s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m55s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 18s
- Voice-Service von Core nach Lehrer verschoben (bp-lehrer-voice-service) - 4 Jitsi-Services + 2 Synapse-Services in docker-compose.yml aufgenommen - Camunda komplett gelöscht: workflow pages, workflow-config.ts, bpmn-js deps - CAMUNDA_URL aus backend-lehrer environment entfernt - Sidebar: Kategorie "Compliance SDK" + "Katalogverwaltung" entfernt - Sidebar: Neue Kategorie "Kommunikation" mit Video & Chat, Voice Service, Alerts Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
208
voice-service/bqas/metrics.py
Normal file
208
voice-service/bqas/metrics.py
Normal file
@@ -0,0 +1,208 @@
|
||||
"""
|
||||
BQAS Metrics - RAGAS-inspired evaluation metrics
|
||||
"""
|
||||
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Any, Dict, List
|
||||
|
||||
|
||||
@dataclass
class TestResult:
    """Result of a single test case."""

    # Test identity and conversation payload
    test_id: str
    test_name: str
    user_input: str
    expected_intent: str
    detected_intent: str
    response: str

    # Scores
    intent_accuracy: int  # 0-100
    faithfulness: int  # 1-5
    relevance: int  # 1-5
    coherence: int  # 1-5
    safety: str  # "pass" or "fail"

    # Computed
    composite_score: float
    passed: bool
    reasoning: str

    # Metadata
    timestamp: datetime
    duration_ms: int

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for serialization."""
        field_order = (
            "test_id",
            "test_name",
            "user_input",
            "expected_intent",
            "detected_intent",
            "response",
            "intent_accuracy",
            "faithfulness",
            "relevance",
            "coherence",
            "safety",
            "composite_score",
            "passed",
            "reasoning",
            "timestamp",
            "duration_ms",
        )
        payload = {name: getattr(self, name) for name in field_order}
        # datetime is not JSON-serializable; emit ISO-8601 text instead.
        payload["timestamp"] = self.timestamp.isoformat()
        return payload
|
||||
|
||||
|
||||
@dataclass
class BQASMetrics:
    """Aggregated metrics for a test run.

    Built from a list of ``TestResult`` objects via :meth:`from_results`;
    all ``avg_*`` fields are arithmetic means over the whole run.
    """

    total_tests: int
    passed_tests: int
    failed_tests: int

    # Average scores
    avg_intent_accuracy: float  # mean of the 0-100 intent scores
    avg_faithfulness: float  # mean of the 1-5 scores
    avg_relevance: float
    avg_coherence: float
    safety_pass_rate: float  # fraction of results whose safety == "pass"

    # Composite
    avg_composite_score: float

    # By category: expected_intent -> mean composite score
    scores_by_intent: Dict[str, float]

    # Failures
    failed_test_ids: List[str]

    # Timing
    total_duration_ms: int
    timestamp: datetime

    @classmethod
    def from_results(cls, results: List["TestResult"]) -> "BQASMetrics":
        """Calculate metrics from test results.

        An empty ``results`` list yields an all-zero metrics object instead
        of raising ``ZeroDivisionError``.
        """
        # datetime.utcnow() is deprecated (Python 3.12+) and naive; use an
        # aware UTC timestamp, computed once for both return paths.
        now = datetime.now(timezone.utc)

        if not results:
            return cls(
                total_tests=0,
                passed_tests=0,
                failed_tests=0,
                avg_intent_accuracy=0.0,
                avg_faithfulness=0.0,
                avg_relevance=0.0,
                avg_coherence=0.0,
                safety_pass_rate=0.0,
                avg_composite_score=0.0,
                scores_by_intent={},
                failed_test_ids=[],
                total_duration_ms=0,
                timestamp=now,
            )

        total = len(results)
        passed = sum(1 for r in results if r.passed)

        # Group composite scores by the intent each test expected.
        intent_scores: Dict[str, List[float]] = {}
        for r in results:
            intent_scores.setdefault(r.expected_intent, []).append(r.composite_score)

        scores_by_intent = {
            intent: sum(scores) / len(scores)
            for intent, scores in intent_scores.items()
        }

        return cls(
            total_tests=total,
            passed_tests=passed,
            failed_tests=total - passed,
            avg_intent_accuracy=sum(r.intent_accuracy for r in results) / total,
            avg_faithfulness=sum(r.faithfulness for r in results) / total,
            avg_relevance=sum(r.relevance for r in results) / total,
            avg_coherence=sum(r.coherence for r in results) / total,
            safety_pass_rate=sum(1 for r in results if r.safety == "pass") / total,
            avg_composite_score=sum(r.composite_score for r in results) / total,
            scores_by_intent=scores_by_intent,
            failed_test_ids=[r.test_id for r in results if not r.passed],
            total_duration_ms=sum(r.duration_ms for r in results),
            timestamp=now,
        )

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for serialization.

        Averages are rounded for readability; ``pass_rate`` is derived here
        (0 when no tests ran) rather than stored on the instance.
        """
        return {
            "total_tests": self.total_tests,
            "passed_tests": self.passed_tests,
            "failed_tests": self.failed_tests,
            "pass_rate": self.passed_tests / self.total_tests if self.total_tests > 0 else 0,
            "avg_intent_accuracy": round(self.avg_intent_accuracy, 2),
            "avg_faithfulness": round(self.avg_faithfulness, 2),
            "avg_relevance": round(self.avg_relevance, 2),
            "avg_coherence": round(self.avg_coherence, 2),
            "safety_pass_rate": round(self.safety_pass_rate, 3),
            "avg_composite_score": round(self.avg_composite_score, 3),
            "scores_by_intent": {k: round(v, 3) for k, v in self.scores_by_intent.items()},
            "failed_test_ids": self.failed_test_ids,
            "total_duration_ms": self.total_duration_ms,
            "timestamp": self.timestamp.isoformat(),
        }

    def summary(self) -> str:
        """Generate a human-readable summary."""
        lines = [
            "=" * 60,
            "BQAS Test Run Summary",
            "=" * 60,
            f"Total Tests: {self.total_tests}",
            # Guard the percentage against a zero-test run.
            f"Passed: {self.passed_tests} ({self.passed_tests/self.total_tests*100:.1f}%)" if self.total_tests > 0 else "Passed: 0",
            f"Failed: {self.failed_tests}",
            "",
            "Scores:",
            f"  Intent Accuracy: {self.avg_intent_accuracy:.1f}%",
            f"  Faithfulness: {self.avg_faithfulness:.2f}/5",
            f"  Relevance: {self.avg_relevance:.2f}/5",
            f"  Coherence: {self.avg_coherence:.2f}/5",
            f"  Safety Pass Rate: {self.safety_pass_rate*100:.1f}%",
            f"  Composite Score: {self.avg_composite_score:.3f}/5",
            "",
            "By Intent:",
        ]

        # Best-scoring intents first.
        for intent, score in sorted(self.scores_by_intent.items(), key=lambda x: x[1], reverse=True):
            lines.append(f"  {intent}: {score:.3f}")

        if self.failed_test_ids:
            lines.extend([
                "",
                f"Failed Tests ({len(self.failed_test_ids)}):",
            ])
            # Cap the listing at 10 IDs to keep the summary short.
            for test_id in self.failed_test_ids[:10]:
                lines.append(f"  - {test_id}")
            if len(self.failed_test_ids) > 10:
                lines.append(f"  ... and {len(self.failed_test_ids) - 10} more")

        lines.extend([
            "",
            f"Duration: {self.total_duration_ms}ms",
            "=" * 60,
        ])

        return "\n".join(lines)
|
||||
Reference in New Issue
Block a user