Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m55s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 18s
- Voice-Service von Core nach Lehrer verschoben (bp-lehrer-voice-service) - 4 Jitsi-Services + 2 Synapse-Services in docker-compose.yml aufgenommen - Camunda komplett gelöscht: workflow pages, workflow-config.ts, bpmn-js deps - CAMUNDA_URL aus backend-lehrer environment entfernt - Sidebar: Kategorie "Compliance SDK" + "Katalogverwaltung" entfernt - Sidebar: Neue Kategorie "Kommunikation" mit Video & Chat, Voice Service, Alerts Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
209 lines
6.8 KiB
Python
209 lines
6.8 KiB
Python
"""
|
|
BQAS Metrics - RAGAS-inspired evaluation metrics
|
|
"""
|
|
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Any, Dict, List
|
|
|
|
|
|
@dataclass
class TestResult:
    """Result of a single test case."""

    # Identity and raw conversation data
    test_id: str
    test_name: str
    user_input: str
    expected_intent: str
    detected_intent: str
    response: str

    # Scores
    intent_accuracy: int  # 0-100
    faithfulness: int  # 1-5
    relevance: int  # 1-5
    coherence: int  # 1-5
    safety: str  # "pass" or "fail"

    # Computed
    composite_score: float
    passed: bool
    reasoning: str

    # Metadata
    timestamp: datetime
    duration_ms: int

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for serialization."""
        # Fields that serialize as-is, in output order.
        plain_fields = (
            "test_id",
            "test_name",
            "user_input",
            "expected_intent",
            "detected_intent",
            "response",
            "intent_accuracy",
            "faithfulness",
            "relevance",
            "coherence",
            "safety",
            "composite_score",
            "passed",
            "reasoning",
        )
        serialized: Dict[str, Any] = {name: getattr(self, name) for name in plain_fields}
        # datetime is not JSON-serializable; store it as an ISO-8601 string.
        serialized["timestamp"] = self.timestamp.isoformat()
        serialized["duration_ms"] = self.duration_ms
        return serialized
|
|
|
|
|
|
@dataclass
class BQASMetrics:
    """Aggregated metrics for a test run.

    Built from a list of ``TestResult`` objects via :meth:`from_results`;
    all averages are plain arithmetic means over the supplied results.
    """

    # Counts
    total_tests: int
    passed_tests: int
    failed_tests: int

    # Average scores
    avg_intent_accuracy: float  # mean of 0-100 scores
    avg_faithfulness: float     # mean of 1-5 scores
    avg_relevance: float        # mean of 1-5 scores
    avg_coherence: float        # mean of 1-5 scores
    safety_pass_rate: float     # fraction of results with safety == "pass"

    # Composite
    avg_composite_score: float

    # By category: expected intent -> mean composite score
    scores_by_intent: Dict[str, float]

    # Failures
    failed_test_ids: List[str]

    # Timing
    total_duration_ms: int
    timestamp: datetime

    @classmethod
    def from_results(cls, results: "List[TestResult]") -> "BQASMetrics":
        """Calculate metrics from test results.

        Args:
            results: Per-test results. May be empty, in which case an
                all-zero metrics object is returned.

        Returns:
            A populated ``BQASMetrics`` instance.
        """
        # Timezone-aware UTC timestamp; datetime.utcnow() is deprecated
        # (Python 3.12+) and returns a naive datetime.
        now = datetime.now(timezone.utc)

        if not results:
            # Empty run: every aggregate is zero so no caller has to guard
            # against dividing by a zero count.
            return cls(
                total_tests=0,
                passed_tests=0,
                failed_tests=0,
                avg_intent_accuracy=0.0,
                avg_faithfulness=0.0,
                avg_relevance=0.0,
                avg_coherence=0.0,
                safety_pass_rate=0.0,
                avg_composite_score=0.0,
                scores_by_intent={},
                failed_test_ids=[],
                total_duration_ms=0,
                timestamp=now,
            )

        total = len(results)
        passed = sum(1 for r in results if r.passed)

        # Plain arithmetic means over all results.
        avg_intent = sum(r.intent_accuracy for r in results) / total
        avg_faith = sum(r.faithfulness for r in results) / total
        avg_rel = sum(r.relevance for r in results) / total
        avg_coh = sum(r.coherence for r in results) / total
        safety_rate = sum(1 for r in results if r.safety == "pass") / total
        avg_composite = sum(r.composite_score for r in results) / total

        # Group composite scores by the *expected* intent so weak intent
        # categories stand out even when detection failed.
        intent_scores: Dict[str, List[float]] = {}
        for r in results:
            intent_scores.setdefault(r.expected_intent, []).append(r.composite_score)

        scores_by_intent = {
            intent: sum(scores) / len(scores)
            for intent, scores in intent_scores.items()
        }

        failed_ids = [r.test_id for r in results if not r.passed]
        total_duration = sum(r.duration_ms for r in results)

        return cls(
            total_tests=total,
            passed_tests=passed,
            failed_tests=total - passed,
            avg_intent_accuracy=avg_intent,
            avg_faithfulness=avg_faith,
            avg_relevance=avg_rel,
            avg_coherence=avg_coh,
            safety_pass_rate=safety_rate,
            avg_composite_score=avg_composite,
            scores_by_intent=scores_by_intent,
            failed_test_ids=failed_ids,
            total_duration_ms=total_duration,
            timestamp=now,
        )

    def to_dict(self) -> Dict[str, Any]:
        """Convert to a JSON-serializable dictionary (scores rounded)."""
        return {
            "total_tests": self.total_tests,
            "passed_tests": self.passed_tests,
            "failed_tests": self.failed_tests,
            "pass_rate": self.passed_tests / self.total_tests if self.total_tests > 0 else 0,
            "avg_intent_accuracy": round(self.avg_intent_accuracy, 2),
            "avg_faithfulness": round(self.avg_faithfulness, 2),
            "avg_relevance": round(self.avg_relevance, 2),
            "avg_coherence": round(self.avg_coherence, 2),
            "safety_pass_rate": round(self.safety_pass_rate, 3),
            "avg_composite_score": round(self.avg_composite_score, 3),
            "scores_by_intent": {k: round(v, 3) for k, v in self.scores_by_intent.items()},
            "failed_test_ids": self.failed_test_ids,
            "total_duration_ms": self.total_duration_ms,
            "timestamp": self.timestamp.isoformat(),
        }

    def summary(self) -> str:
        """Generate a human-readable, multi-line summary of the run."""
        # Guard the pass-rate percentage against an empty run.
        if self.total_tests > 0:
            passed_line = f"Passed: {self.passed_tests} ({self.passed_tests / self.total_tests * 100:.1f}%)"
        else:
            passed_line = "Passed: 0"

        lines = [
            "=" * 60,
            "BQAS Test Run Summary",
            "=" * 60,
            f"Total Tests: {self.total_tests}",
            passed_line,
            f"Failed: {self.failed_tests}",
            "",
            "Scores:",
            f"  Intent Accuracy: {self.avg_intent_accuracy:.1f}%",
            f"  Faithfulness: {self.avg_faithfulness:.2f}/5",
            f"  Relevance: {self.avg_relevance:.2f}/5",
            f"  Coherence: {self.avg_coherence:.2f}/5",
            f"  Safety Pass Rate: {self.safety_pass_rate * 100:.1f}%",
            f"  Composite Score: {self.avg_composite_score:.3f}/5",
            "",
            "By Intent:",
        ]

        # Best-scoring intents first.
        for intent, score in sorted(self.scores_by_intent.items(), key=lambda x: x[1], reverse=True):
            lines.append(f"  {intent}: {score:.3f}")

        if self.failed_test_ids:
            lines.extend(["", f"Failed Tests ({len(self.failed_test_ids)}):"])
            # Cap the listing at 10 IDs to keep the summary short.
            for test_id in self.failed_test_ids[:10]:
                lines.append(f"  - {test_id}")
            if len(self.failed_test_ids) > 10:
                lines.append(f"  ... and {len(self.failed_test_ids) - 10} more")

        lines.extend([
            "",
            f"Duration: {self.total_duration_ms}ms",
            "=" * 60,
        ])

        return "\n".join(lines)