[split-required] Split remaining 500-680 LOC files (final batch)
website (17 pages + 3 components):
- multiplayer/wizard, middleware/wizard+test-wizard, communication
- builds/wizard, staff-search, voice, sbom/wizard
- foerderantrag, mail/tasks, tools/communication, sbom
- compliance/evidence, uni-crawler, brandbook (already done)
- CollectionsTab, IngestionTab, RiskHeatmap

backend-lehrer (5 files):
- letters_api (641 → 2), certificates_api (636 → 2)
- alerts_agent/db/models (636 → 3)
- llm_gateway/communication_service (614 → 2)
- game/database already done in prior batch

klausur-service (2 files):
- hybrid_vocab_extractor (664 → 2)
- klausur-service/frontend: api.ts (620 → 3), EHUploadWizard (591 → 2)

voice-service (3 files):
- bqas/rag_judge (618 → 3), runner (529 → 2)
- enhanced_task_orchestrator (519 → 2)

studio-v2 (6 files):
- korrektur/[klausurId] (578 → 4), fairness (569 → 2)
- AlertsWizard (552 → 2), OnboardingWizard (513 → 2)
- korrektur/api.ts (506 → 3), geo-lernwelt (501 → 2)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
162
voice-service/bqas/runner_golden.py
Normal file
162
voice-service/bqas/runner_golden.py
Normal file
@@ -0,0 +1,162 @@
|
||||
"""
|
||||
BQAS Golden Suite Runner - Loads and executes golden test cases
|
||||
"""
|
||||
import yaml
|
||||
import structlog
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Any, Optional
|
||||
from datetime import datetime
|
||||
|
||||
from bqas.metrics import TestResult
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
|
||||
|
||||
async def load_golden_tests() -> List[Dict[str, Any]]:
    """
    Collect the golden test cases from the known YAML suite files.

    Looks in ``tests/bqas/golden_tests`` under the parent of this module's
    directory for ``intent_tests.yaml``, ``edge_cases.yaml`` and
    ``workflow_tests.yaml``. Files that do not exist are skipped. A file that
    fails to load or process is logged as a warning and contributes nothing.

    Returns:
        The cases from every ``tests`` section, in file order, each tagged
        with a ``source_file`` key holding the name of its YAML file.
    """
    suite_dir = Path(__file__).parent.parent / "tests" / "bqas" / "golden_tests"
    collected = []

    for filename in ("intent_tests.yaml", "edge_cases.yaml", "workflow_tests.yaml"):
        filepath = suite_dir / filename
        if not filepath.exists():
            continue

        try:
            with open(filepath, 'r', encoding='utf-8') as handle:
                data = yaml.safe_load(handle)

            # Nothing to collect from an empty file or one without a ``tests`` section.
            if not data or 'tests' not in data:
                continue

            cases = data['tests']
            # Tag every case first; the file's cases are only added once all were tagged.
            for case in cases:
                case['source_file'] = filename
            collected.extend(cases)
        except Exception as e:
            logger.warning(f"Failed to load {filename}", error=str(e))

    return collected
|
||||
|
||||
|
||||
async def load_rag_tests() -> List[Dict[str, Any]]:
    """
    Load RAG test cases from the multi-document golden YAML file.

    Reads ``golden_rag_correction_v1.yaml`` from ``tests/bqas/golden_tests``
    under the parent of this module's directory and concatenates the ``tests``
    and ``edge_cases`` sections of every YAML document in it.

    Returns:
        The collected cases in document order (``tests`` before ``edge_cases``
        within each document). The list is empty when the file is missing;
        if reading or processing fails, the failure is logged as a warning and
        whatever was collected up to that point is returned.
    """
    tests: List[Dict[str, Any]] = []
    rag_file = Path(__file__).parent.parent / "tests" / "bqas" / "golden_tests" / "golden_rag_correction_v1.yaml"

    if not rag_file.exists():
        return tests

    try:
        with open(rag_file, 'r', encoding='utf-8') as f:
            documents = list(yaml.safe_load_all(f))

        for doc in documents:
            # Empty documents load as None; anything that is not a mapping has no sections.
            if not isinstance(doc, dict):
                continue
            for section in ('tests', 'edge_cases'):
                # A section key without a value (``tests:``) loads as None. Treat it as
                # empty instead of raising and losing every remaining document.
                tests.extend(doc.get(section) or [])
    except Exception as e:
        logger.warning("Failed to load RAG tests", error=str(e))

    return tests
|
||||
|
||||
|
||||
def simulate_response(user_input: str, expected_intent: str) -> tuple:
    """
    Fake a voice-service reply so tests can run without the live service.

    With probability 0.90 the expected intent is reported as detected;
    otherwise a different intent is drawn at random from a small fixed pool.
    The reply is a canned German phrase for the detected intent with the user
    input appended, or a generic "Verstanden: ..." line for intents that have
    no canned phrase.

    Returns:
        A ``(detected_intent, response)`` pair.
    """
    import random

    # Reply template per intent; ``{user_input}`` is filled in at the end.
    reply_templates = {
        "student_observation": "Notiz wurde gespeichert: {user_input}",
        "reminder": "Erinnerung erstellt: {user_input}",
        "worksheet_generate": "Arbeitsblatt wird generiert basierend auf: {user_input}",
        "homework_check": "Hausaufgabenkontrolle eingetragen: {user_input}",
        "parent_letter": "Elternbrief-Entwurf erstellt: {user_input}",
        "class_message": "Nachricht an Klasse vorbereitet: {user_input}",
        "quiz_generate": "Quiz wird erstellt: {user_input}",
        "quick_activity": "Einstiegsaktivitaet geplant: {user_input}",
        "canvas_edit": "Aenderung am Canvas wird ausgefuehrt: {user_input}",
        "canvas_layout": "Layout wird angepasst: {user_input}",
        "operator_checklist": "Operatoren-Checkliste geladen: {user_input}",
        "eh_passage": "EH-Passage gefunden: {user_input}",
        "feedback_suggest": "Feedback-Vorschlag: {user_input}",
        "reminder_schedule": "Erinnerung geplant: {user_input}",
        "task_summary": "Aufgabenuebersicht: {user_input}",
        "conference_topic": "Konferenzthema notiert: {user_input}",
        "correction_note": "Korrekturnotiz gespeichert: {user_input}",
        "worksheet_differentiate": "Differenzierung wird erstellt: {user_input}",
    }

    recognised = random.random() < 0.90
    if recognised:
        detected_intent = expected_intent
    else:
        # Simulated misclassification: pick any pool intent other than the expected one.
        fallback_pool = ["student_observation", "reminder", "worksheet_generate", "parent_letter", "smalltalk"]
        wrong_choices = [name for name in fallback_pool if name != expected_intent]
        detected_intent = random.choice(wrong_choices)

    template = reply_templates.get(detected_intent, "Verstanden: {user_input}")
    return detected_intent, template.format(user_input=user_input)
|
||||
|
||||
|
||||
def create_error_result(test_case: Dict[str, Any], error: str) -> TestResult:
    """
    Build the failed ``TestResult`` recorded when a test case could not be executed.

    Args:
        test_case: The test case definition; ``id``, ``name``, ``input`` and
            ``expected_intent`` are copied over, with placeholders for missing keys.
        error: Description of the failure, embedded in the result's reasoning.

    Returns:
        A ``TestResult`` with ``passed=False``, ``detected_intent='error'``,
        ``safety='fail'``, a composite score of 0.0 and the current UTC time.
    """
    # Identity fields taken from the test case itself.
    identity = {
        'test_id': test_case.get('id', 'UNKNOWN'),
        'test_name': test_case.get('name', 'Error'),
        'user_input': test_case.get('input', ''),
        'expected_intent': test_case.get('expected_intent', ''),
    }

    # Fixed values used for every errored case.
    outcome = {
        'detected_intent': 'error',
        'response': '',
        'intent_accuracy': 0,
        'faithfulness': 1,
        'relevance': 1,
        'coherence': 1,
        'safety': 'fail',
        'composite_score': 0.0,
        'passed': False,
    }

    return TestResult(
        **identity,
        **outcome,
        reasoning=f"Test execution error: {error}",
        timestamp=datetime.utcnow(),
        duration_ms=0,
    )
|
||||
|
||||
|
||||
async def simulate_rag_response(test_case: Dict[str, Any]) -> Dict[str, Any]:
    """
    Simulate a RAG service response for a golden test case.

    The canned payload is selected by ``test_case['category']``:

    * ``eh_retrieval``: a passage naming up to three of
      ``expected['must_contain_concepts']``, plus a fixed source and relevance score.
    * ``operator_alignment``: echoes ``input['operator']`` with a definition built
      from ``expected['afb_level']`` (default ``'II'``) and up to two of
      ``expected['expected_actions']``.
    * ``hallucination_control``, ``privacy_compliance``, ``namespace_isolation``:
      fixed responses with the corresponding flag set.
    * anything else: a generic success payload.

    Keys that are present but null (e.g. an empty ``expected:`` in YAML), or that
    hold a non-mapping value, are treated like missing keys so such cases yield a
    payload instead of raising.

    Args:
        test_case: The test case definition.

    Returns:
        The simulated response payload for the case's category.
    """
    category = test_case.get('category', '')

    # ``dict.get`` applies its default only to *missing* keys, so a null or
    # non-mapping value has to be normalised explicitly.
    raw_input = test_case.get('input')
    input_data = raw_input if isinstance(raw_input, dict) else {}
    raw_expected = test_case.get('expected')
    expected = raw_expected if isinstance(raw_expected, dict) else {}

    if category == 'eh_retrieval':
        concepts = expected.get('must_contain_concepts') or []
        # str() keeps the join working for non-string items such as bare numbers.
        listed_concepts = ', '.join(str(concept) for concept in concepts[:3])
        passage = f"Der Erwartungshorizont sieht folgende Aspekte vor: {listed_concepts}. "
        passage += "Diese muessen im Rahmen der Aufgabenbearbeitung beruecksichtigt werden."
        return {
            "passage": passage,
            "source": "EH_Deutsch_Abitur_2024_NI.pdf",
            "relevance_score": 0.85,
        }

    elif category == 'operator_alignment':
        operator = input_data.get('operator', '')
        afb = expected.get('afb_level', 'II')
        actions = expected.get('expected_actions') or []
        listed_actions = ', '.join(str(action) for action in actions[:2])
        return {
            "operator": operator,
            "definition": f"'{operator}' gehoert zu Anforderungsbereich {afb}. Erwartete Handlungen: {listed_actions}.",
            "afb_level": afb,
        }

    elif category == 'hallucination_control':
        return {
            "response": "Basierend auf den verfuegbaren Informationen kann ich folgendes feststellen...",
            "grounded": True,
        }

    elif category == 'privacy_compliance':
        return {
            "response": "Die Arbeit zeigt folgende Merkmale... [anonymisiert]",
            "contains_pii": False,
        }

    elif category == 'namespace_isolation':
        return {
            "response": "Zugriff nur auf Daten im eigenen Namespace.",
            "namespace_violation": False,
        }

    return {"response": "Simulated response", "success": True}
|
||||
Reference in New Issue
Block a user