"""
Synthetic Tests

Tests using synthetically generated test cases.
"""
import re
from typing import Dict, List

import pytest

from bqas.synthetic_generator import SyntheticGenerator, TEACHER_PATTERNS
from bqas.judge import LLMJudge


class TestSyntheticGenerator:
    """Tests for synthetic test generation."""

    def test_teacher_patterns_exist(self):
        """Verify teacher patterns are defined for a few core intents."""
        assert len(TEACHER_PATTERNS) > 0
        assert "student_observation" in TEACHER_PATTERNS
        assert "worksheet_generate" in TEACHER_PATTERNS
        assert "parent_letter" in TEACHER_PATTERNS

    @pytest.mark.asyncio
    async def test_fallback_generation(self, synthetic_generator: SyntheticGenerator):
        """Test fallback pattern-based generation.

        Requests five fallback variations for one intent and checks that
        each carries that intent and a non-empty input.
        """
        variations = synthetic_generator._generate_fallback(
            intent="student_observation",
            count=5,
        )

        assert len(variations) == 5
        for v in variations:
            assert v.expected_intent == "student_observation"
            assert len(v.input) > 0

    @pytest.mark.asyncio
    async def test_generate_variations(self, synthetic_generator: SyntheticGenerator):
        """Test LLM-based variation generation.

        The test is skipped when the generation call itself raises.
        """
        # This test may be skipped if Ollama is not available.
        # Only the generation call is guarded: AssertionError is an
        # Exception too, so asserting inside the try block would turn every
        # real failure into a skip.
        try:
            variations = await synthetic_generator.generate_variations(
                intent="student_observation",
                count=3,
            )
        except Exception as e:
            pytest.skip(f"Ollama not available: {e}")

        assert len(variations) >= 1  # At least fallback should work
        for v in variations:
            assert v.expected_intent == "student_observation"


class TestSyntheticEvaluation:
    """Evaluate synthetic tests with LLM Judge."""

    @pytest.mark.asyncio
    @pytest.mark.parametrize("intent", [
        "student_observation",
        "worksheet_generate",
        "reminder",
    ])
    async def test_synthetic_intent_quality(
        self,
        llm_judge: LLMJudge,
        synthetic_generator: SyntheticGenerator,
        intent: str,
    ):
        """Test quality of synthetic test cases.

        Skipped when the judge's health check reports it is unavailable.
        Otherwise every fallback variation is evaluated and the mean
        composite score must reach 3.0.
        """
        is_available = await llm_judge.health_check()
        if not is_available:
            pytest.skip("LLM judge not available")

        # Generate fallback variations (fast, doesn't need LLM)
        variations = synthetic_generator._generate_fallback(intent, count=3)

        # Fail with a readable message instead of a ZeroDivisionError when
        # nothing was generated.
        assert variations, f"No fallback variations generated for {intent}"

        scores = []
        for var in variations:
            result = await llm_judge.evaluate(
                user_input=var.input,
                detected_intent=intent,
                response="Verstanden.",
                expected_intent=intent,
            )
            scores.append(result.composite_score)

        avg_score = sum(scores) / len(scores)
        assert avg_score >= 3.0, f"Average score {avg_score} too low for {intent}"


class TestSyntheticCoverage:
    """Test coverage of synthetic generation."""

    def test_all_intents_have_patterns(self):
        """Verify all main intents have at least two patterns each."""
        required_intents = [
            "student_observation",
            "reminder",
            "homework_check",
            "worksheet_generate",
            "parent_letter",
            "class_message",
            "quiz_generate",
            "quick_activity",
            "canvas_edit",
            "canvas_layout",
            "operator_checklist",
            "eh_passage",
            "feedback_suggest",
            "reminder_schedule",
            "task_summary",
        ]

        for intent in required_intents:
            assert intent in TEACHER_PATTERNS, f"Missing patterns for: {intent}"
            assert len(TEACHER_PATTERNS[intent]) >= 2, f"Too few patterns for: {intent}"

    def test_pattern_placeholders(self):
        """Verify patterns have valid (non-empty) placeholders."""
        for intent, patterns in TEACHER_PATTERNS.items():
            for pattern in patterns:
                # Find all placeholders. `\w*` (not `\w+`) is deliberate: it
                # lets an empty `{}` be captured as '' so the check below
                # can actually fail.
                # NOTE(review): a literal `{}` inside a pattern now fails
                # this test - confirm no pattern relies on one.
                placeholders = re.findall(r'\{(\w*)\}', pattern)

                # Verify no empty placeholders
                for ph in placeholders:
                    assert len(ph) > 0, f"Empty placeholder in {intent}: {pattern}"