website (17 pages + 3 components): - multiplayer/wizard, middleware/wizard+test-wizard, communication - builds/wizard, staff-search, voice, sbom/wizard - foerderantrag, mail/tasks, tools/communication, sbom - compliance/evidence, uni-crawler, brandbook (already done) - CollectionsTab, IngestionTab, RiskHeatmap backend-lehrer (5 files): - letters_api (641 → 2), certificates_api (636 → 2) - alerts_agent/db/models (636 → 3) - llm_gateway/communication_service (614 → 2) - game/database already done in prior batch klausur-service (2 files): - hybrid_vocab_extractor (664 → 2) - klausur-service/frontend: api.ts (620 → 3), EHUploadWizard (591 → 2) voice-service (3 files): - bqas/rag_judge (618 → 3), runner (529 → 2) - enhanced_task_orchestrator (519 → 2) studio-v2 (6 files): - korrektur/[klausurId] (578 → 4), fairness (569 → 2) - AlertsWizard (552 → 2), OnboardingWizard (513 → 2) - korrektur/api.ts (506 → 3), geo-lernwelt (501 → 2) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
163 lines
6.1 KiB
Python
163 lines
6.1 KiB
Python
"""
|
|
BQAS Golden Suite Runner - Loads and executes golden test cases
|
|
"""
|
|
import yaml
|
|
import structlog
|
|
from pathlib import Path
|
|
from typing import List, Dict, Any, Optional
|
|
from datetime import datetime
|
|
|
|
from bqas.metrics import TestResult
|
|
|
|
logger = structlog.get_logger(__name__)
|
|
|
|
|
|
async def load_golden_tests() -> List[Dict[str, Any]]:
    """Collect the golden test cases from the known YAML suites.

    The suites ``intent_tests.yaml``, ``edge_cases.yaml`` and
    ``workflow_tests.yaml`` are looked up in ``tests/bqas/golden_tests``
    under the parent of this module's directory. Every case found under a
    file's top-level ``tests`` key is tagged with a ``source_file`` entry
    naming the file it came from.

    Returns:
        All collected test cases, in file order. Missing files are skipped;
        a file that fails to read, parse or process is logged as a warning
        and skipped as well, so this function does not raise for bad input.
    """
    suite_dir = Path(__file__).parent.parent / "tests" / "bqas" / "golden_tests"
    collected = []

    for filename in ("intent_tests.yaml", "edge_cases.yaml", "workflow_tests.yaml"):
        candidate = suite_dir / filename
        if not candidate.exists():
            continue

        # Reading, tagging and collecting all sit inside the try so that any
        # malformed file is dropped as a whole instead of aborting the load.
        try:
            with open(candidate, 'r', encoding='utf-8') as handle:
                payload = yaml.safe_load(handle)
            if not payload or 'tests' not in payload:
                continue
            cases = payload['tests']
            for case in cases:
                case['source_file'] = filename
            collected.extend(cases)
        except Exception as exc:
            logger.warning(f"Failed to load {filename}", error=str(exc))

    return collected
|
|
|
|
|
|
async def load_rag_tests() -> List[Dict[str, Any]]:
    """Load RAG test cases from the multi-document golden RAG YAML file.

    Reads ``golden_rag_correction_v1.yaml`` from ``tests/bqas/golden_tests``
    under the parent of this module's directory and concatenates, for every
    YAML document in the file, the entries listed under its ``tests`` key
    followed by those under its ``edge_cases`` key.

    Returns:
        The collected test cases in file order. An empty list is returned
        when the file is missing or cannot be read or parsed; such failures
        are logged as warnings and never raised.
    """
    tests: List[Dict[str, Any]] = []
    rag_file = Path(__file__).parent.parent / "tests" / "bqas" / "golden_tests" / "golden_rag_correction_v1.yaml"

    if not rag_file.exists():
        return tests

    try:
        with open(rag_file, 'r', encoding='utf-8') as f:
            # Parse every document up front so a syntax error anywhere in
            # the file is detected before any test case is collected.
            documents = list(yaml.safe_load_all(f))
    except Exception as e:
        logger.warning("Failed to load RAG tests", error=str(e))
        return tests

    for doc in documents:
        # Empty documents load as None; scalar/list documents have no sections.
        if not isinstance(doc, dict):
            continue
        for section in ('tests', 'edge_cases'):
            entries = doc.get(section)
            if entries is None:
                # Key absent, or present without a value (YAML null).
                continue
            if not isinstance(entries, list):
                # Extending with a mapping or string would inject keys or
                # single characters as bogus test cases.
                logger.warning("Skipping malformed RAG test section", section=section)
                continue
            tests.extend(entries)

    return tests
|
|
|
|
|
|
def simulate_response(user_input: str, expected_intent: str) -> tuple:
    """Produce a fake voice-service answer so tests can run without the live service.

    When ``random.random()`` is below 0.90 the detected intent equals
    ``expected_intent``; otherwise a different intent is drawn at random
    from a fixed pool of five intents (excluding the expected one).

    Args:
        user_input: The utterance being tested; echoed into the response text.
        expected_intent: The intent the test case expects.

    Returns:
        A ``(detected_intent, response)`` pair. The response is an
        intent-specific prefix followed by ``": "`` and the user input, or
        ``"Verstanden: <input>"`` for intents without a canned reply.
    """
    import random

    # Every canned reply has the shape "<prefix>: <user_input>".
    reply_prefixes = {
        "student_observation": "Notiz wurde gespeichert",
        "reminder": "Erinnerung erstellt",
        "worksheet_generate": "Arbeitsblatt wird generiert basierend auf",
        "homework_check": "Hausaufgabenkontrolle eingetragen",
        "parent_letter": "Elternbrief-Entwurf erstellt",
        "class_message": "Nachricht an Klasse vorbereitet",
        "quiz_generate": "Quiz wird erstellt",
        "quick_activity": "Einstiegsaktivitaet geplant",
        "canvas_edit": "Aenderung am Canvas wird ausgefuehrt",
        "canvas_layout": "Layout wird angepasst",
        "operator_checklist": "Operatoren-Checkliste geladen",
        "eh_passage": "EH-Passage gefunden",
        "feedback_suggest": "Feedback-Vorschlag",
        "reminder_schedule": "Erinnerung geplant",
        "task_summary": "Aufgabenuebersicht",
        "conference_topic": "Konferenzthema notiert",
        "correction_note": "Korrekturnotiz gespeichert",
        "worksheet_differentiate": "Differenzierung wird erstellt",
    }
    misdetection_pool = (
        "student_observation",
        "reminder",
        "worksheet_generate",
        "parent_letter",
        "smalltalk",
    )

    recognised = random.random() < 0.90
    if recognised:
        detected = expected_intent
    else:
        # Simulated misclassification: never pick the expected intent itself.
        alternatives = [name for name in misdetection_pool if name != expected_intent]
        detected = random.choice(alternatives)

    prefix = reply_prefixes.get(detected, "Verstanden")
    return detected, f"{prefix}: {user_input}"
|
|
|
|
|
|
def create_error_result(test_case: Dict[str, Any], error: str) -> TestResult:
    """Build the ``TestResult`` recorded when a test case could not be executed.

    Args:
        test_case: The test case definition; its ``id``, ``name``, ``input``
            and ``expected_intent`` entries are copied into the result, with
            placeholder values used for missing keys.
        error: Description of the failure, embedded in the result's reasoning.

    Returns:
        A result marked as not passed, with ``detected_intent`` set to
        ``'error'``, a composite score of 0.0 and safety ``'fail'``.
    """
    # Fields taken over from the test case definition.
    identity = {
        "test_id": test_case.get('id', 'UNKNOWN'),
        "test_name": test_case.get('name', 'Error'),
        "user_input": test_case.get('input', ''),
        "expected_intent": test_case.get('expected_intent', ''),
    }
    # Fixed values that mark the run as failed.
    verdict = {
        "detected_intent": 'error',
        "response": '',
        "intent_accuracy": 0,
        "faithfulness": 1,
        "relevance": 1,
        "coherence": 1,
        "safety": 'fail',
        "composite_score": 0.0,
        "passed": False,
    }
    return TestResult(
        **identity,
        **verdict,
        reasoning=f"Test execution error: {error}",
        timestamp=datetime.utcnow(),
        duration_ms=0,
    )
|
|
|
|
|
|
async def simulate_rag_response(test_case: Dict[str, Any]) -> Dict[str, Any]:
    """Simulate a RAG service response for one RAG test case.

    The shape of the returned payload depends on the test case's
    ``category``:

    * ``eh_retrieval``: a passage built from up to three of the expected
      ``must_contain_concepts``, plus a fixed source and relevance score.
    * ``operator_alignment``: the operator from the input, a definition
      sentence naming the expected ``afb_level`` (default ``'II'``) and up
      to two ``expected_actions``.
    * ``hallucination_control``, ``privacy_compliance``,
      ``namespace_isolation``: a fixed response with the respective flag.
    * anything else: a generic success payload.

    Args:
        test_case: Test case definition. ``input`` and ``expected`` may be
            missing or null; they are then treated as empty mappings.

    Returns:
        The simulated response payload for the category.
    """
    category = test_case.get('category', '')
    # A YAML key written without a value loads as None, and
    # ``.get(key, default)`` returns that None rather than the default, so
    # fall back to empty containers explicitly.
    input_data = test_case.get('input') or {}
    expected = test_case.get('expected') or {}

    if category == 'eh_retrieval':
        concepts = expected.get('must_contain_concepts') or []
        # str() keeps non-string YAML scalars (numbers, dates) joinable.
        listed = ', '.join(str(concept) for concept in concepts[:3])
        passage = f"Der Erwartungshorizont sieht folgende Aspekte vor: {listed}. "
        passage += "Diese muessen im Rahmen der Aufgabenbearbeitung beruecksichtigt werden."
        return {
            "passage": passage,
            "source": "EH_Deutsch_Abitur_2024_NI.pdf",
            "relevance_score": 0.85,
        }

    if category == 'operator_alignment':
        operator = input_data.get('operator', '')
        afb = expected.get('afb_level', 'II')
        actions = expected.get('expected_actions') or []
        listed = ', '.join(str(action) for action in actions[:2])
        return {
            "operator": operator,
            "definition": f"'{operator}' gehoert zu Anforderungsbereich {afb}. Erwartete Handlungen: {listed}.",
            "afb_level": afb,
        }

    if category == 'hallucination_control':
        return {
            "response": "Basierend auf den verfuegbaren Informationen kann ich folgendes feststellen...",
            "grounded": True,
        }

    if category == 'privacy_compliance':
        return {
            "response": "Die Arbeit zeigt folgende Merkmale... [anonymisiert]",
            "contains_pii": False,
        }

    if category == 'namespace_isolation':
        return {
            "response": "Zugriff nur auf Daten im eigenen Namespace.",
            "namespace_violation": False,
        }

    return {"response": "Simulated response", "success": True}
|