""" BQAS Metrics - RAGAS-inspired evaluation metrics """ from dataclasses import dataclass from typing import List, Dict, Any from datetime import datetime @dataclass class TestResult: """Result of a single test case.""" test_id: str test_name: str user_input: str expected_intent: str detected_intent: str response: str # Scores intent_accuracy: int # 0-100 faithfulness: int # 1-5 relevance: int # 1-5 coherence: int # 1-5 safety: str # "pass" or "fail" # Computed composite_score: float passed: bool reasoning: str # Metadata timestamp: datetime duration_ms: int def to_dict(self) -> Dict[str, Any]: """Convert to dictionary for serialization.""" return { "test_id": self.test_id, "test_name": self.test_name, "user_input": self.user_input, "expected_intent": self.expected_intent, "detected_intent": self.detected_intent, "response": self.response, "intent_accuracy": self.intent_accuracy, "faithfulness": self.faithfulness, "relevance": self.relevance, "coherence": self.coherence, "safety": self.safety, "composite_score": self.composite_score, "passed": self.passed, "reasoning": self.reasoning, "timestamp": self.timestamp.isoformat(), "duration_ms": self.duration_ms, } @dataclass class BQASMetrics: """Aggregated metrics for a test run.""" total_tests: int passed_tests: int failed_tests: int # Average scores avg_intent_accuracy: float avg_faithfulness: float avg_relevance: float avg_coherence: float safety_pass_rate: float # Composite avg_composite_score: float # By category scores_by_intent: Dict[str, float] # Failures failed_test_ids: List[str] # Timing total_duration_ms: int timestamp: datetime @classmethod def from_results(cls, results: List[TestResult]) -> "BQASMetrics": """Calculate metrics from test results.""" if not results: return cls( total_tests=0, passed_tests=0, failed_tests=0, avg_intent_accuracy=0.0, avg_faithfulness=0.0, avg_relevance=0.0, avg_coherence=0.0, safety_pass_rate=0.0, avg_composite_score=0.0, scores_by_intent={}, failed_test_ids=[], total_duration_ms=0, timestamp=datetime.utcnow(), ) total = len(results) passed = sum(1 for r in results if r.passed) # Calculate averages avg_intent = sum(r.intent_accuracy for r in results) / total avg_faith = sum(r.faithfulness for r in results) / total avg_rel = sum(r.relevance for r in results) / total avg_coh = sum(r.coherence for r in results) / total safety_rate = sum(1 for r in results if r.safety == "pass") / total avg_composite = sum(r.composite_score for r in results) / total # Group by intent intent_scores: Dict[str, List[float]] = {} for r in results: if r.expected_intent not in intent_scores: intent_scores[r.expected_intent] = [] intent_scores[r.expected_intent].append(r.composite_score) scores_by_intent = { intent: sum(scores) / len(scores) for intent, scores in intent_scores.items() } # Failed tests failed_ids = [r.test_id for r in results if not r.passed] # Total duration total_duration = sum(r.duration_ms for r in results) return cls( total_tests=total, passed_tests=passed, failed_tests=total - passed, avg_intent_accuracy=avg_intent, avg_faithfulness=avg_faith, avg_relevance=avg_rel, avg_coherence=avg_coh, safety_pass_rate=safety_rate, avg_composite_score=avg_composite, scores_by_intent=scores_by_intent, failed_test_ids=failed_ids, total_duration_ms=total_duration, timestamp=datetime.utcnow(), ) def to_dict(self) -> Dict[str, Any]: """Convert to dictionary for serialization.""" return { "total_tests": self.total_tests, "passed_tests": self.passed_tests, "failed_tests": self.failed_tests, "pass_rate": self.passed_tests / self.total_tests if self.total_tests > 0 else 0, "avg_intent_accuracy": round(self.avg_intent_accuracy, 2), "avg_faithfulness": round(self.avg_faithfulness, 2), "avg_relevance": round(self.avg_relevance, 2), "avg_coherence": round(self.avg_coherence, 2), "safety_pass_rate": round(self.safety_pass_rate, 3), "avg_composite_score": round(self.avg_composite_score, 3), "scores_by_intent": {k: round(v, 3) for k, v in self.scores_by_intent.items()}, "failed_test_ids": self.failed_test_ids, "total_duration_ms": self.total_duration_ms, "timestamp": self.timestamp.isoformat(), } def summary(self) -> str: """Generate a human-readable summary.""" lines = [ "=" * 60, "BQAS Test Run Summary", "=" * 60, f"Total Tests: {self.total_tests}", f"Passed: {self.passed_tests} ({self.passed_tests/self.total_tests*100:.1f}%)" if self.total_tests > 0 else "Passed: 0", f"Failed: {self.failed_tests}", "", "Scores:", f" Intent Accuracy: {self.avg_intent_accuracy:.1f}%", f" Faithfulness: {self.avg_faithfulness:.2f}/5", f" Relevance: {self.avg_relevance:.2f}/5", f" Coherence: {self.avg_coherence:.2f}/5", f" Safety Pass Rate: {self.safety_pass_rate*100:.1f}%", f" Composite Score: {self.avg_composite_score:.3f}/5", "", "By Intent:", ] for intent, score in sorted(self.scores_by_intent.items(), key=lambda x: x[1], reverse=True): lines.append(f" {intent}: {score:.3f}") if self.failed_test_ids: lines.extend([ "", f"Failed Tests ({len(self.failed_test_ids)}):", ]) for test_id in self.failed_test_ids[:10]: lines.append(f" - {test_id}") if len(self.failed_test_ids) > 10: lines.append(f" ... and {len(self.failed_test_ids) - 10} more") lines.extend([ "", f"Duration: {self.total_duration_ms}ms", "=" * 60, ]) return "\n".join(lines)