This repository has been archived on 2026-02-15. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
breakpilot-pwa/voice-service/tests/bqas/test_synthetic.py
BreakPilot Dev 19855efacc
Some checks failed
Tests / Go Tests (push) Has been cancelled
Tests / Python Tests (push) Has been cancelled
Tests / Integration Tests (push) Has been cancelled
Tests / Go Lint (push) Has been cancelled
Tests / Python Lint (push) Has been cancelled
Tests / Security Scan (push) Has been cancelled
Tests / All Checks Passed (push) Has been cancelled
Security Scanning / Secret Scanning (push) Has been cancelled
Security Scanning / Dependency Vulnerability Scan (push) Has been cancelled
Security Scanning / Go Security Scan (push) Has been cancelled
Security Scanning / Python Security Scan (push) Has been cancelled
Security Scanning / Node.js Security Scan (push) Has been cancelled
Security Scanning / Docker Image Security (push) Has been cancelled
Security Scanning / Security Summary (push) Has been cancelled
CI/CD Pipeline / Go Tests (push) Has been cancelled
CI/CD Pipeline / Python Tests (push) Has been cancelled
CI/CD Pipeline / Website Tests (push) Has been cancelled
CI/CD Pipeline / Linting (push) Has been cancelled
CI/CD Pipeline / Security Scan (push) Has been cancelled
CI/CD Pipeline / Docker Build & Push (push) Has been cancelled
CI/CD Pipeline / Integration Tests (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / CI Summary (push) Has been cancelled
ci/woodpecker/manual/build-ci-image Pipeline was successful
ci/woodpecker/manual/main Pipeline failed
feat: BreakPilot PWA - Full codebase (clean push without large binaries)
All services: admin-v2, studio-v2, website, ai-compliance-sdk,
consent-service, klausur-service, voice-service, and infrastructure.
Large PDFs and compiled binaries excluded via .gitignore.
2026-02-11 13:25:58 +01:00

129 lines
4.2 KiB
Python

"""
Synthetic Tests
Tests using synthetically generated test cases
"""
import re
from typing import Dict, List

import pytest

from bqas.judge import LLMJudge
from bqas.synthetic_generator import SyntheticGenerator, TEACHER_PATTERNS
class TestSyntheticGenerator:
    """Tests for synthetic test generation."""

    def test_teacher_patterns_exist(self):
        """Verify the core teacher intent patterns are defined."""
        assert len(TEACHER_PATTERNS) > 0
        assert "student_observation" in TEACHER_PATTERNS
        assert "worksheet_generate" in TEACHER_PATTERNS
        assert "parent_letter" in TEACHER_PATTERNS

    def test_fallback_generation(self, synthetic_generator: SyntheticGenerator):
        """Test fallback pattern-based generation.

        ``_generate_fallback`` is synchronous, so this test needs neither
        the ``asyncio`` marker nor an ``async`` signature (previously both
        were present, pointlessly routing the test through an event loop).
        """
        variations = synthetic_generator._generate_fallback(
            intent="student_observation",
            count=5,
        )
        assert len(variations) == 5
        for v in variations:
            assert v.expected_intent == "student_observation"
            assert len(v.input) > 0

    @pytest.mark.asyncio
    async def test_generate_variations(self, synthetic_generator: SyntheticGenerator):
        """Test LLM-based variation generation.

        Skipped when the Ollama backend is unreachable. Only the LLM call
        itself is inside the ``try`` — assertions are kept outside so a
        real test failure is reported as a failure, not masked as a skip
        (``AssertionError`` is an ``Exception`` and would otherwise be
        swallowed by the broad handler).
        """
        try:
            variations = await synthetic_generator.generate_variations(
                intent="student_observation",
                count=3,
            )
        except Exception as e:  # broad on purpose: any backend error means "not available"
            pytest.skip(f"Ollama not available: {e}")
        # At least the fallback path should have produced something.
        assert len(variations) >= 1
        for v in variations:
            assert v.expected_intent == "student_observation"
class TestSyntheticEvaluation:
    """Evaluate synthetic tests with LLM Judge."""

    @pytest.mark.asyncio
    @pytest.mark.parametrize("intent", [
        "student_observation",
        "worksheet_generate",
        "reminder",
    ])
    async def test_synthetic_intent_quality(
        self,
        llm_judge: LLMJudge,
        synthetic_generator: SyntheticGenerator,
        intent: str,
    ):
        """Test quality of synthetic test cases."""
        # Guard clause: without a reachable judge there is nothing to score.
        if not await llm_judge.health_check():
            pytest.skip("LLM judge not available")

        # Pattern-based fallback generation is fast and needs no LLM round-trip.
        cases = synthetic_generator._generate_fallback(intent, count=3)

        evaluations = [
            await llm_judge.evaluate(
                user_input=case.input,
                detected_intent=intent,
                response="Verstanden.",
                expected_intent=intent,
            )
            for case in cases
        ]
        composite_scores = [ev.composite_score for ev in evaluations]
        avg_score = sum(composite_scores) / len(composite_scores)
        assert avg_score >= 3.0, f"Average score {avg_score} too low for {intent}"
class TestSyntheticCoverage:
    """Test coverage of synthetic generation."""

    def test_all_intents_have_patterns(self):
        """Verify all main intents have at least two patterns each."""
        required_intents = [
            "student_observation",
            "reminder",
            "homework_check",
            "worksheet_generate",
            "parent_letter",
            "class_message",
            "quiz_generate",
            "quick_activity",
            "canvas_edit",
            "canvas_layout",
            "operator_checklist",
            "eh_passage",
            "feedback_suggest",
            "reminder_schedule",
            "task_summary",
        ]
        for intent in required_intents:
            assert intent in TEACHER_PATTERNS, f"Missing patterns for: {intent}"
            assert len(TEACHER_PATTERNS[intent]) >= 2, f"Too few patterns for: {intent}"

    def test_pattern_placeholders(self):
        """Verify patterns have valid (non-empty) placeholders.

        The previous regex required at least one word character between
        the braces, so a captured placeholder could never be empty and the
        non-empty assertion was vacuous. Matching zero-or-more word chars
        makes a literal empty placeholder pair actually fail the check.
        """
        # Compiled once; hoisted out of the nested loops.
        placeholder_re = re.compile(r'\{(\w*)\}')
        for intent, patterns in TEACHER_PATTERNS.items():
            for pattern in patterns:
                for ph in placeholder_re.findall(pattern):
                    assert len(ph) > 0, f"Empty placeholder in {intent}: {pattern}"