Files
breakpilot-lehrer/voice-service/bqas/metrics.py
Benjamin Admin 9912997187
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m55s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 18s
refactor: Jitsi/Matrix/Voice von Core übernommen, Camunda/BPMN gelöscht, Kommunikation-Nav
- Voice-Service von Core nach Lehrer verschoben (bp-lehrer-voice-service)
- 4 Jitsi-Services + 2 Synapse-Services in docker-compose.yml aufgenommen
- Camunda komplett gelöscht: workflow pages, workflow-config.ts, bpmn-js deps
- CAMUNDA_URL aus backend-lehrer environment entfernt
- Sidebar: Kategorie "Compliance SDK" + "Katalogverwaltung" entfernt
- Sidebar: Neue Kategorie "Kommunikation" mit Video & Chat, Voice Service, Alerts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-05 17:01:47 +01:00

209 lines
6.8 KiB
Python

"""
BQAS Metrics - RAGAS-inspired evaluation metrics
"""
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Any, Dict, List
@dataclass
class TestResult:
    """Outcome of a single evaluated test case, with per-metric scores."""

    # Identity and the raw exchange under test
    test_id: str
    test_name: str
    user_input: str
    expected_intent: str
    detected_intent: str
    response: str
    # Individual metric scores
    intent_accuracy: int  # percentage, 0-100
    faithfulness: int     # 1-5 scale
    relevance: int        # 1-5 scale
    coherence: int        # 1-5 scale
    safety: str           # "pass" or "fail"
    # Derived verdict
    composite_score: float
    passed: bool
    reasoning: str
    # Run metadata
    timestamp: datetime
    duration_ms: int

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dict; the timestamp becomes an ISO-8601 string."""
        # All fields are already JSON-friendly except the datetime, so take the
        # instance attributes as-is and override only the timestamp.
        payload = dict(vars(self))
        payload["timestamp"] = self.timestamp.isoformat()
        return payload
@dataclass
class BQASMetrics:
    """Aggregated evaluation metrics for one BQAS test run."""

    total_tests: int
    passed_tests: int
    failed_tests: int
    # Average scores across all results
    avg_intent_accuracy: float  # mean of 0-100 percentages
    avg_faithfulness: float     # mean of 1-5 scores
    avg_relevance: float        # mean of 1-5 scores
    avg_coherence: float        # mean of 1-5 scores
    safety_pass_rate: float     # fraction in [0, 1]
    # Composite
    avg_composite_score: float
    # Mean composite score per expected intent
    scores_by_intent: Dict[str, float]
    # IDs of the tests that did not pass
    failed_test_ids: List[str]
    # Timing
    total_duration_ms: int
    timestamp: datetime  # timezone-aware, UTC

    @classmethod
    def from_results(cls, results: List["TestResult"]) -> "BQASMetrics":
        """Aggregate a list of TestResult objects into run-level metrics.

        An empty list yields an all-zero metrics object instead of raising
        ZeroDivisionError.
        """
        # datetime.utcnow() is deprecated (Python 3.12+) and returns a naive
        # value; use an aware UTC timestamp instead.
        now = datetime.now(timezone.utc)
        if not results:
            return cls(
                total_tests=0,
                passed_tests=0,
                failed_tests=0,
                avg_intent_accuracy=0.0,
                avg_faithfulness=0.0,
                avg_relevance=0.0,
                avg_coherence=0.0,
                safety_pass_rate=0.0,
                avg_composite_score=0.0,
                scores_by_intent={},
                failed_test_ids=[],
                total_duration_ms=0,
                timestamp=now,
            )

        total = len(results)
        passed = sum(1 for r in results if r.passed)

        # Simple arithmetic means over all results
        avg_intent = sum(r.intent_accuracy for r in results) / total
        avg_faith = sum(r.faithfulness for r in results) / total
        avg_rel = sum(r.relevance for r in results) / total
        avg_coh = sum(r.coherence for r in results) / total
        safety_rate = sum(1 for r in results if r.safety == "pass") / total
        avg_composite = sum(r.composite_score for r in results) / total

        # Group composite scores by expected intent, then average each group
        intent_scores: Dict[str, List[float]] = {}
        for r in results:
            intent_scores.setdefault(r.expected_intent, []).append(r.composite_score)
        scores_by_intent = {
            intent: sum(scores) / len(scores)
            for intent, scores in intent_scores.items()
        }

        failed_ids = [r.test_id for r in results if not r.passed]
        total_duration = sum(r.duration_ms for r in results)

        return cls(
            total_tests=total,
            passed_tests=passed,
            failed_tests=total - passed,
            avg_intent_accuracy=avg_intent,
            avg_faithfulness=avg_faith,
            avg_relevance=avg_rel,
            avg_coherence=avg_coh,
            safety_pass_rate=safety_rate,
            avg_composite_score=avg_composite,
            scores_by_intent=scores_by_intent,
            failed_test_ids=failed_ids,
            total_duration_ms=total_duration,
            timestamp=now,
        )

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dict with rounded scores for reporting."""
        return {
            "total_tests": self.total_tests,
            "passed_tests": self.passed_tests,
            "failed_tests": self.failed_tests,
            # Guard against division by zero on an empty run
            "pass_rate": self.passed_tests / self.total_tests if self.total_tests > 0 else 0,
            "avg_intent_accuracy": round(self.avg_intent_accuracy, 2),
            "avg_faithfulness": round(self.avg_faithfulness, 2),
            "avg_relevance": round(self.avg_relevance, 2),
            "avg_coherence": round(self.avg_coherence, 2),
            "safety_pass_rate": round(self.safety_pass_rate, 3),
            "avg_composite_score": round(self.avg_composite_score, 3),
            "scores_by_intent": {k: round(v, 3) for k, v in self.scores_by_intent.items()},
            "failed_test_ids": self.failed_test_ids,
            "total_duration_ms": self.total_duration_ms,
            "timestamp": self.timestamp.isoformat(),
        }

    def summary(self) -> str:
        """Generate a human-readable, multi-line run summary."""
        if self.total_tests > 0:
            passed_line = f"Passed: {self.passed_tests} ({self.passed_tests / self.total_tests * 100:.1f}%)"
        else:
            passed_line = "Passed: 0"

        lines = [
            "=" * 60,
            "BQAS Test Run Summary",
            "=" * 60,
            f"Total Tests: {self.total_tests}",
            passed_line,
            f"Failed: {self.failed_tests}",
            "",
            "Scores:",
            f"  Intent Accuracy: {self.avg_intent_accuracy:.1f}%",
            f"  Faithfulness: {self.avg_faithfulness:.2f}/5",
            f"  Relevance: {self.avg_relevance:.2f}/5",
            f"  Coherence: {self.avg_coherence:.2f}/5",
            f"  Safety Pass Rate: {self.safety_pass_rate*100:.1f}%",
            f"  Composite Score: {self.avg_composite_score:.3f}/5",
            "",
            "By Intent:",
        ]
        # Best-scoring intents first
        for intent, score in sorted(self.scores_by_intent.items(), key=lambda x: x[1], reverse=True):
            lines.append(f"  {intent}: {score:.3f}")
        if self.failed_test_ids:
            lines.extend([
                "",
                f"Failed Tests ({len(self.failed_test_ids)}):",
            ])
            # Cap the listing at 10 IDs to keep the summary short
            for test_id in self.failed_test_ids[:10]:
                lines.append(f"  - {test_id}")
            if len(self.failed_test_ids) > 10:
                lines.append(f"  ... and {len(self.failed_test_ids) - 10} more")
        lines.extend([
            "",
            f"Duration: {self.total_duration_ms}ms",
            "=" * 60,
        ])
        return "\n".join(lines)