"""
BQAS Golden Suite Runner - Loads and executes golden test cases
"""
import random
import yaml
import structlog
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple
from datetime import datetime, timezone

from bqas.metrics import TestResult

logger = structlog.get_logger(__name__)

# YAML files (relative to the golden tests directory) read by load_golden_tests().
_GOLDEN_YAML_FILES = (
    "intent_tests.yaml",
    "edge_cases.yaml",
    "workflow_tests.yaml",
)

# Multi-document YAML file read by load_rag_tests().
_RAG_YAML_FILE = "golden_rag_correction_v1.yaml"

# Probability with which simulate_response() echoes the expected intent.
_SIMULATED_HIT_RATE = 0.90

# Pool from which simulate_response() draws a deliberately wrong intent.
_MISDETECTION_POOL = [
    "student_observation",
    "reminder",
    "worksheet_generate",
    "parent_letter",
    "smalltalk",
]


def _golden_tests_dir() -> Path:
    """Return the directory holding the golden YAML files.

    Resolved relative to this module: ``<parent of this package>/tests/bqas/golden_tests``.
    """
    return Path(__file__).parent.parent / "tests" / "bqas" / "golden_tests"


def _dict_or_empty(value: Any) -> Dict[str, Any]:
    """Return *value* if it is a dict, otherwise an empty dict.

    ``dict.get(key, {})`` still returns ``None`` when the key is present with a
    null value (e.g. an empty ``input:`` entry in YAML), so callers that go on
    to call ``.get`` need this extra normalisation.
    """
    return value if isinstance(value, dict) else {}


async def load_golden_tests() -> List[Dict[str, Any]]:
    """Load all golden test cases from YAML files.

    Each file listed in ``_GOLDEN_YAML_FILES`` that exists is parsed with
    ``yaml.safe_load``; the entries under its top-level ``tests`` key are
    tagged in place with ``source_file`` (the file name) and collected.

    Loading is best-effort: a missing file is skipped silently, and any error
    while reading or processing a file is logged as a warning and that file
    contributes no tests.

    Returns:
        The combined list of test-case dicts, in file order.
    """
    # NOTE: the file I/O below is synchronous even though this is a coroutine.
    tests: List[Dict[str, Any]] = []
    golden_dir = _golden_tests_dir()

    for filename in _GOLDEN_YAML_FILES:
        filepath = golden_dir / filename
        if not filepath.exists():
            continue
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
            if data and 'tests' in data:
                # Tag every case first so a malformed entry rejects the whole
                # file instead of leaving it half-imported.
                for test in data['tests']:
                    test['source_file'] = filename
                tests.extend(data['tests'])
        except Exception as e:  # deliberate best-effort boundary
            logger.warning(f"Failed to load {filename}", error=str(e))

    return tests


async def load_rag_tests() -> List[Dict[str, Any]]:
    """Load RAG test cases from YAML.

    The RAG file is read as a multi-document YAML stream. For every document
    that is a mapping, the list under ``tests`` and then the list under
    ``edge_cases`` are appended to the result. Documents that are not mappings
    and sections that are not lists (for example an empty ``tests:`` key) are
    skipped rather than aborting the whole load.

    Returns:
        The collected test-case entries; empty if the file is missing or
        cannot be read/parsed (a warning is logged in the latter case).
    """
    # NOTE: the file I/O below is synchronous even though this is a coroutine.
    tests: List[Dict[str, Any]] = []
    rag_file = _golden_tests_dir() / _RAG_YAML_FILE

    if not rag_file.exists():
        return tests

    try:
        with open(rag_file, 'r', encoding='utf-8') as f:
            # Materialise inside the ``with``: safe_load_all is lazy and needs
            # the stream to still be open while it parses.
            documents = list(yaml.safe_load_all(f))
        for doc in documents:
            if not isinstance(doc, dict):
                continue
            for section in ('tests', 'edge_cases'):
                entries = doc.get(section)
                if isinstance(entries, list):
                    tests.extend(entries)
    except Exception as e:  # deliberate best-effort boundary
        logger.warning("Failed to load RAG tests", error=str(e))

    return tests


def simulate_response(user_input: str, expected_intent: str) -> Tuple[str, str]:
    """Simulate voice service response for testing without live service.

    With probability ``_SIMULATED_HIT_RATE`` the expected intent is returned as
    the detected intent; otherwise a different intent is drawn at random from
    ``_MISDETECTION_POOL``. The result is therefore non-deterministic unless
    the ``random`` module is seeded by the caller.

    Args:
        user_input: The utterance; echoed inside the canned response text.
        expected_intent: The intent the test case expects.

    Returns:
        ``(detected_intent, response)``. Intents without a canned response get
        the generic ``"Verstanden: ..."`` text.
    """
    if random.random() < _SIMULATED_HIT_RATE:
        detected_intent = expected_intent
    else:
        # The pool has five distinct entries, so at least four candidates
        # remain after excluding the expected intent.
        detected_intent = random.choice(
            [i for i in _MISDETECTION_POOL if i != expected_intent]
        )

    responses = {
        "student_observation": f"Notiz wurde gespeichert: {user_input}",
        "reminder": f"Erinnerung erstellt: {user_input}",
        "worksheet_generate": f"Arbeitsblatt wird generiert basierend auf: {user_input}",
        "homework_check": f"Hausaufgabenkontrolle eingetragen: {user_input}",
        "parent_letter": f"Elternbrief-Entwurf erstellt: {user_input}",
        "class_message": f"Nachricht an Klasse vorbereitet: {user_input}",
        "quiz_generate": f"Quiz wird erstellt: {user_input}",
        "quick_activity": f"Einstiegsaktivitaet geplant: {user_input}",
        "canvas_edit": f"Aenderung am Canvas wird ausgefuehrt: {user_input}",
        "canvas_layout": f"Layout wird angepasst: {user_input}",
        "operator_checklist": f"Operatoren-Checkliste geladen: {user_input}",
        "eh_passage": f"EH-Passage gefunden: {user_input}",
        "feedback_suggest": f"Feedback-Vorschlag: {user_input}",
        "reminder_schedule": f"Erinnerung geplant: {user_input}",
        "task_summary": f"Aufgabenuebersicht: {user_input}",
        "conference_topic": f"Konferenzthema notiert: {user_input}",
        "correction_note": f"Korrekturnotiz gespeichert: {user_input}",
        "worksheet_differentiate": f"Differenzierung wird erstellt: {user_input}",
    }

    response = responses.get(detected_intent, f"Verstanden: {user_input}")
    return detected_intent, response


def create_error_result(test_case: Dict[str, Any], error: str) -> TestResult:
    """Create a failed test result due to error.

    Args:
        test_case: The test case that could not be executed; ``id``, ``name``,
            ``input`` and ``expected_intent`` are copied when present.
        error: Description of the failure, embedded in ``reasoning``.

    Returns:
        A ``TestResult`` with ``passed=False``, ``detected_intent='error'``,
        a composite score of 0.0 and a naive UTC timestamp.
    """
    return TestResult(
        test_id=test_case.get('id', 'UNKNOWN'),
        test_name=test_case.get('name', 'Error'),
        user_input=test_case.get('input', ''),
        expected_intent=test_case.get('expected_intent', ''),
        detected_intent='error',
        response='',
        intent_accuracy=0,
        faithfulness=1,
        relevance=1,
        coherence=1,
        safety='fail',
        composite_score=0.0,
        passed=False,
        reasoning=f"Test execution error: {error}",
        # datetime.utcnow() is deprecated since Python 3.12; this produces the
        # same naive-UTC value so existing consumers see no change.
        timestamp=datetime.now(timezone.utc).replace(tzinfo=None),
        duration_ms=0,
    )


async def simulate_rag_response(test_case: Dict[str, Any]) -> Dict[str, Any]:
    """Simulate RAG service response.

    Builds a canned payload whose shape depends on ``test_case['category']``:

    * ``eh_retrieval``: ``passage`` (mentions up to three of
      ``expected['must_contain_concepts']``), ``source``, ``relevance_score``.
    * ``operator_alignment``: ``operator``, ``definition`` (mentions up to two
      of ``expected['expected_actions']``), ``afb_level`` (default ``'II'``).
    * ``hallucination_control``: ``response``, ``grounded``.
    * ``privacy_compliance``: ``response``, ``contains_pii``.
    * ``namespace_isolation``: ``response``, ``namespace_violation``.
    * anything else: a generic ``response`` / ``success`` payload.

    Missing or null ``input`` / ``expected`` sections and null list fields are
    treated as empty instead of raising.

    Args:
        test_case: A RAG test-case dict.

    Returns:
        The simulated service payload.
    """
    category = test_case.get('category', '')
    input_data = _dict_or_empty(test_case.get('input'))
    expected = _dict_or_empty(test_case.get('expected'))

    if category == 'eh_retrieval':
        concepts = expected.get('must_contain_concepts') or []
        passage = f"Der Erwartungshorizont sieht folgende Aspekte vor: {', '.join(concepts[:3])}. "
        passage += "Diese muessen im Rahmen der Aufgabenbearbeitung beruecksichtigt werden."
        return {
            "passage": passage,
            "source": "EH_Deutsch_Abitur_2024_NI.pdf",
            "relevance_score": 0.85,
        }

    if category == 'operator_alignment':
        operator = input_data.get('operator', '')
        afb = expected.get('afb_level', 'II')
        actions = expected.get('expected_actions') or []
        return {
            "operator": operator,
            "definition": f"'{operator}' gehoert zu Anforderungsbereich {afb}. Erwartete Handlungen: {', '.join(actions[:2])}.",
            "afb_level": afb,
        }

    if category == 'hallucination_control':
        return {
            "response": "Basierend auf den verfuegbaren Informationen kann ich folgendes feststellen...",
            "grounded": True,
        }

    if category == 'privacy_compliance':
        return {
            "response": "Die Arbeit zeigt folgende Merkmale... [anonymisiert]",
            "contains_pii": False,
        }

    if category == 'namespace_isolation':
        return {
            "response": "Zugriff nur auf Daten im eigenen Namespace.",
            "namespace_violation": False,
        }

    return {"response": "Simulated response", "success": True}