# Source: breakpilot-lehrer/klausur-service/backend/tests/test_advanced_rag.py
# Initial commit: breakpilot-lehrer - Lehrer KI Platform
# Services: Admin-Lehrer, Backend-Lehrer, Studio v2, Website,
# Klausur-Service, School-Service, Voice-Service, Geo-Service,
# BreakPilot Drive, Agent-Core
# Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
# 2026-02-11 23:47:26 +01:00

"""
Tests for Advanced RAG Features
Tests for the newly implemented RAG quality improvements:
- HyDE (Hypothetical Document Embeddings)
- Hybrid Search (Dense + Sparse/BM25)
- RAG Evaluation (RAGAS-inspired metrics)
- PDF Extraction (Unstructured.io, PyMuPDF, PyPDF2)
- Self-RAG / Corrective RAG
Run with: pytest tests/test_advanced_rag.py -v
"""
import pytest
from unittest.mock import AsyncMock, MagicMock, patch
import asyncio
# =============================================================================
# HyDE Tests
# =============================================================================
class TestHyDE:
    """Unit tests for the HyDE (Hypothetical Document Embeddings) module."""

    def test_hyde_config(self):
        """The HyDE info dict exposes its configuration keys."""
        from hyde import HYDE_ENABLED, HYDE_LLM_BACKEND, HYDE_MODEL, get_hyde_info

        cfg = get_hyde_info()
        for key in ("enabled", "llm_backend", "model"):
            assert key in cfg

    def test_hyde_prompt_template(self):
        """The prompt template carries the query placeholder and domain terms."""
        from hyde import HYDE_PROMPT_TEMPLATE

        assert "{query}" in HYDE_PROMPT_TEMPLATE
        assert "Erwartungshorizont" in HYDE_PROMPT_TEMPLATE
        assert "Bildungsstandards" in HYDE_PROMPT_TEMPLATE

    @pytest.mark.asyncio
    async def test_generate_hypothetical_document_disabled(self):
        """With HyDE disabled, the original query is returned unchanged."""
        from hyde import generate_hypothetical_document

        with patch('hyde.HYDE_ENABLED', False):
            assert await generate_hypothetical_document("Test query") == "Test query"

    @pytest.mark.asyncio
    async def test_hyde_search_fallback(self):
        """hyde_search degrades gracefully when HyDE is turned off."""
        from hyde import hyde_search

        async def fake_search(query, **kwargs):
            return [{"id": "1", "text": "Result for: " + query}]

        with patch('hyde.HYDE_ENABLED', False):
            outcome = await hyde_search(
                query="Test query",
                search_func=fake_search,
            )
        assert outcome["hyde_used"] is False
        # With HyDE off, the original query must be passed through untouched.
        assert outcome["original_query"] == "Test query"
# =============================================================================
# Hybrid Search Tests
# =============================================================================
class TestHybridSearch:
    """Unit tests for the Hybrid Search (dense + sparse BM25) module."""

    def test_hybrid_search_config(self):
        """Dense and sparse weights are exposed and sum to one."""
        from hybrid_search import HYBRID_ENABLED, DENSE_WEIGHT, SPARSE_WEIGHT, get_hybrid_search_info

        cfg = get_hybrid_search_info()
        assert "enabled" in cfg
        assert "dense_weight" in cfg
        assert "sparse_weight" in cfg
        assert cfg["dense_weight"] + cfg["sparse_weight"] == pytest.approx(1.0)

    def test_bm25_tokenization(self):
        """German stopwords are stripped; content words are kept lowercased."""
        from hybrid_search import BM25

        tokens = BM25()._tokenize("Der Erwartungshorizont für die Abiturprüfung")
        # Function words must be filtered out by the stopword list.
        for stopword in ("der", "für", "die"):
            assert stopword not in tokens
        # Meaningful content words survive tokenization.
        assert "erwartungshorizont" in tokens
        assert "abiturprüfung" in tokens

    def test_bm25_stopwords(self):
        """The German stopword set contains common function words."""
        from hybrid_search import GERMAN_STOPWORDS

        for word in ("der", "die", "und", "ist"):
            assert word in GERMAN_STOPWORDS
        assert len(GERMAN_STOPWORDS) > 50

    def test_bm25_fit_and_search(self):
        """Fitting indexes the corpus; search ranks the matching doc first."""
        from hybrid_search import BM25

        corpus = [
            "Der Erwartungshorizont für Mathematik enthält Bewertungskriterien.",
            "Die Gedichtanalyse erfordert formale und inhaltliche Aspekte.",
            "Biologie Klausur Bewertung nach Anforderungsbereichen.",
        ]
        index = BM25()
        index.fit(corpus)
        assert index.N == 3
        assert len(index.corpus) == 3

        hits = index.search("Mathematik Bewertungskriterien", top_k=2)
        assert len(hits) == 2
        assert hits[0][0] == 0  # the maths document must rank highest

    def test_normalize_scores(self):
        """Normalization maps scores onto [0, 1] and preserves length."""
        from hybrid_search import normalize_scores

        raw = [0.5, 1.0, 0.0, 0.75]
        scaled = normalize_scores(raw)
        assert min(scaled) == 0.0
        assert max(scaled) == 1.0
        assert len(scaled) == len(raw)

    def test_normalize_scores_empty(self):
        """An empty score list normalizes to an empty list."""
        from hybrid_search import normalize_scores

        assert normalize_scores([]) == []

    def test_normalize_scores_same_value(self):
        """Identical scores all normalize to 1.0 (no division by zero)."""
        from hybrid_search import normalize_scores

        scaled = normalize_scores([0.5, 0.5, 0.5])
        assert all(s == 1.0 for s in scaled)
# =============================================================================
# RAG Evaluation Tests
# =============================================================================
class TestRAGEvaluation:
    """Unit tests for the RAG evaluation (RAGAS-inspired metrics) module."""

    def test_evaluation_config(self):
        """The evaluation info dict lists the supported metrics."""
        from rag_evaluation import EVALUATION_ENABLED, EVAL_MODEL, get_evaluation_info

        cfg = get_evaluation_info()
        assert "enabled" in cfg
        assert "metrics" in cfg
        assert "context_precision" in cfg["metrics"]
        assert "faithfulness" in cfg["metrics"]

    def test_text_similarity(self):
        """Jaccard similarity: 1.0 identical, 0.0 disjoint, else in between."""
        from rag_evaluation import _text_similarity

        # Identical texts are maximally similar.
        assert _text_similarity("Hello world", "Hello world") == 1.0
        # Fully disjoint token sets score zero.
        assert _text_similarity("Hello world", "Goodbye universe") == 0.0
        # Partial overlap lands strictly between the extremes.
        partial = _text_similarity("Hello beautiful world", "Hello cruel world")
        assert 0 < partial < 1

    def test_context_precision(self):
        """Precision is the fraction of retrieved docs that are relevant."""
        from rag_evaluation import calculate_context_precision

        fetched = ["Doc A about topic X", "Doc B about topic Y"]
        truth = ["Doc A about topic X"]
        # 1 of 2 retrieved documents is relevant -> 0.5.
        assert calculate_context_precision("query", fetched, truth) == 0.5

    def test_context_precision_empty_retrieved(self):
        """Retrieving nothing yields zero precision."""
        from rag_evaluation import calculate_context_precision

        assert calculate_context_precision("query", [], ["relevant"]) == 0.0

    def test_context_recall(self):
        """Recall is the fraction of relevant docs that were retrieved."""
        from rag_evaluation import calculate_context_recall

        fetched = ["Doc A about topic X"]
        truth = ["Doc A about topic X", "Doc B about topic Y"]
        # Found 1 of 2 relevant documents -> 0.5.
        assert calculate_context_recall("query", fetched, truth) == 0.5

    def test_context_recall_no_relevant(self):
        """With no relevant documents there is nothing to miss: recall is 1.0."""
        from rag_evaluation import calculate_context_recall

        assert calculate_context_recall("query", ["something"], []) == 1.0

    def test_load_save_eval_results(self):
        """Saved evaluation results round-trip through the results file."""
        from rag_evaluation import _load_eval_results, _save_eval_results
        from pathlib import Path
        import tempfile
        import os

        # Redirect the module's results file to a throwaway temp file.
        with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as handle:
            results_path = Path(handle.name)  # Path, as the module expects
        try:
            with patch('rag_evaluation.EVAL_RESULTS_FILE', results_path):
                payload = [{"test": "data", "score": 0.8}]
                _save_eval_results(payload)
                assert _load_eval_results() == payload
        finally:
            os.unlink(results_path)
# =============================================================================
# PDF Extraction Tests
# =============================================================================
class TestPDFExtraction:
    """Unit tests for the PDF extraction module."""

    def test_pdf_extraction_config(self):
        """The extraction info dict reports the backend configuration."""
        from pdf_extraction import PDF_BACKEND, get_pdf_extraction_info

        cfg = get_pdf_extraction_info()
        assert "configured_backend" in cfg
        assert "available_backends" in cfg
        assert "recommended" in cfg

    def test_detect_available_backends(self):
        """Backend detection returns only known, license-compliant names."""
        from pdf_extraction import _detect_available_backends

        found = _detect_available_backends()
        assert isinstance(found, list)
        # In the Docker container at least pypdf (BSD) or unstructured
        # (Apache 2.0) should be present; a bare local test environment may
        # legitimately report none.
        # NOTE: PyMuPDF (AGPL) is NOT installed by default for license compliance.
        for name in found:
            assert name in ["pypdf", "unstructured", "pymupdf"]

    def test_pdf_extraction_result_class(self):
        """PDFExtractionResult stores its fields and serializes via to_dict()."""
        from pdf_extraction import PDFExtractionResult

        extracted = PDFExtractionResult(
            text="Extracted text",
            backend_used="pypdf",
            pages=5,
            elements=[{"type": "paragraph"}],
            tables=[{"text": "table data"}],
            metadata={"key": "value"},
        )
        assert extracted.text == "Extracted text"
        assert extracted.backend_used == "pypdf"
        assert extracted.pages == 5
        assert len(extracted.elements) == 1
        assert len(extracted.tables) == 1

        # The dict form exposes counts alongside the raw text.
        serialized = extracted.to_dict()
        assert serialized["text"] == "Extracted text"
        assert serialized["element_count"] == 1
        assert serialized["table_count"] == 1

    def test_pdf_extraction_error(self):
        """PDFExtractionError behaves like a normal raisable exception."""
        from pdf_extraction import PDFExtractionError

        with pytest.raises(PDFExtractionError):
            raise PDFExtractionError("Test error")

    @pytest.mark.xfail(reason="_extract_with_pypdf is internal function not exposed in API")
    def test_pypdf_extraction(self):
        """Exercise the pypdf backend code path with a minimal hand-written PDF."""
        from pdf_extraction import _extract_with_pypdf, PDFExtractionError

        # A bare-bones PDF: just enough structure for a parser to chew on.
        simple_pdf = b"""%PDF-1.4
1 0 obj
<< /Type /Catalog /Pages 2 0 R >>
endobj
2 0 obj
<< /Type /Pages /Kids [3 0 R] /Count 1 >>
endobj
3 0 obj
<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Contents 4 0 R >>
endobj
4 0 obj
<< /Length 44 >>
stream
BT
/F1 12 Tf
100 700 Td
(Hello World) Tj
ET
endstream
endobj
xref
0 5
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000115 00000 n
0000000206 00000 n
trailer
<< /Size 5 /Root 1 0 R >>
startxref
300
%%EOF"""
        # The PDF may be too minimal to parse; either outcome covers the path.
        try:
            parsed = _extract_with_pypdf(simple_pdf)
            assert parsed.backend_used == "pypdf"
        except PDFExtractionError:
            # Acceptable: the backend rejected the minimal PDF.
            pass
# =============================================================================
# Self-RAG Tests
# =============================================================================
class TestSelfRAG:
    """Unit tests for the Self-RAG / Corrective RAG module."""

    def test_self_rag_config(self):
        """The Self-RAG info dict exposes thresholds and its feature list."""
        from self_rag import (
            SELF_RAG_ENABLED, RELEVANCE_THRESHOLD, GROUNDING_THRESHOLD,
            MAX_RETRIEVAL_ATTEMPTS, get_self_rag_info
        )

        cfg = get_self_rag_info()
        for key in ("enabled", "relevance_threshold", "grounding_threshold",
                    "max_retrieval_attempts", "features"):
            assert key in cfg

    def test_retrieval_decision_enum(self):
        """RetrievalDecision members map to their expected string values."""
        from self_rag import RetrievalDecision

        expected = {
            RetrievalDecision.SUFFICIENT: "sufficient",
            RetrievalDecision.NEEDS_MORE: "needs_more",
            RetrievalDecision.REFORMULATE: "reformulate",
            RetrievalDecision.FALLBACK: "fallback",
        }
        for member, value in expected.items():
            assert member.value == value

    @pytest.mark.asyncio
    async def test_grade_document_relevance_no_llm(self):
        """Without an LLM key, grading falls back to keyword matching."""
        from self_rag import grade_document_relevance

        with patch('self_rag.OPENAI_API_KEY', ''):
            score, reason = await grade_document_relevance(
                "Mathematik Bewertungskriterien",
                "Der Erwartungshorizont für Mathematik enthält klare Bewertungskriterien."
            )
        assert 0 <= score <= 1
        assert "Keyword" in reason

    @pytest.mark.asyncio
    async def test_decide_retrieval_strategy_empty_docs(self):
        """No documents on the first attempt triggers query reformulation."""
        from self_rag import decide_retrieval_strategy, RetrievalDecision

        decision, meta = await decide_retrieval_strategy("query", [], attempt=1)
        assert decision == RetrievalDecision.REFORMULATE
        assert "No documents" in meta.get("reason", "")

    @pytest.mark.asyncio
    async def test_decide_retrieval_strategy_max_attempts(self):
        """Hitting the attempt limit forces the fallback strategy."""
        from self_rag import decide_retrieval_strategy, RetrievalDecision, MAX_RETRIEVAL_ATTEMPTS

        decision, _ = await decide_retrieval_strategy(
            "query", [], attempt=MAX_RETRIEVAL_ATTEMPTS
        )
        assert decision == RetrievalDecision.FALLBACK

    @pytest.mark.asyncio
    async def test_reformulate_query_no_llm(self):
        """Reformulation without an LLM expands abbreviations (or no-ops)."""
        from self_rag import reformulate_query

        with patch('self_rag.OPENAI_API_KEY', ''):
            rewritten = await reformulate_query("EA Mathematik Anforderungen")
        # Either the abbreviation was expanded or the query passed through.
        assert "erhöhtes Anforderungsniveau" in rewritten or rewritten == "EA Mathematik Anforderungen"

    @pytest.mark.asyncio
    async def test_filter_relevant_documents(self):
        """Filtering annotates every document with a relevance score."""
        from self_rag import filter_relevant_documents

        corpus = [
            {"text": "Mathematik Bewertungskriterien für Abitur"},
            {"text": "Rezept für Schokoladenkuchen"},
            {"text": "Erwartungshorizont Mathematik eA"},
        ]
        with patch('self_rag.OPENAI_API_KEY', ''):
            kept, dropped = await filter_relevant_documents(
                "Mathematik Abitur Bewertung",
                corpus,
                threshold=0.1  # low threshold suits keyword-only matching
            )
        # Every document, kept or dropped, must carry a relevance score.
        for doc in kept + dropped:
            assert "relevance_score" in doc

    @pytest.mark.asyncio
    async def test_self_rag_retrieve_disabled(self):
        """With Self-RAG disabled, results pass through unmodified."""
        from self_rag import self_rag_retrieve

        async def fake_search(query, **kwargs):
            return [{"id": "1", "text": "Result"}]

        with patch('self_rag.SELF_RAG_ENABLED', False):
            outcome = await self_rag_retrieve(
                query="Test query",
                search_func=fake_search,
            )
        assert outcome["self_rag_enabled"] is False
        assert len(outcome["results"]) == 1
# =============================================================================
# Integration Tests - Module Availability
# =============================================================================
class TestModuleAvailability:
    """Smoke tests: every advanced RAG module imports cleanly."""

    def test_hyde_import(self):
        """HyDE public API is importable and callable."""
        from hyde import (
            generate_hypothetical_document,
            hyde_search,
            get_hyde_info,
            HYDE_ENABLED,
        )

        assert callable(generate_hypothetical_document)
        assert callable(hyde_search)

    def test_hybrid_search_import(self):
        """Hybrid search public API is importable and callable."""
        from hybrid_search import (
            BM25,
            hybrid_search,
            get_hybrid_search_info,
            HYBRID_ENABLED,
        )

        assert callable(hybrid_search)
        assert BM25 is not None

    def test_rag_evaluation_import(self):
        """RAG evaluation metric functions are importable and callable."""
        from rag_evaluation import (
            calculate_context_precision,
            calculate_context_recall,
            evaluate_faithfulness,
            evaluate_answer_relevancy,
            evaluate_rag_response,
            get_evaluation_info,
        )

        assert callable(calculate_context_precision)
        assert callable(evaluate_rag_response)

    def test_pdf_extraction_import(self):
        """PDF extraction entry points are importable and callable."""
        from pdf_extraction import (
            extract_text_from_pdf,
            extract_text_from_pdf_enhanced,
            get_pdf_extraction_info,
            PDFExtractionResult,
        )

        assert callable(extract_text_from_pdf)
        assert callable(extract_text_from_pdf_enhanced)

    def test_self_rag_import(self):
        """Self-RAG public API is importable and callable."""
        from self_rag import (
            grade_document_relevance,
            filter_relevant_documents,
            self_rag_retrieve,
            get_self_rag_info,
            RetrievalDecision,
        )

        assert callable(self_rag_retrieve)
        assert RetrievalDecision is not None
# =============================================================================
# End-to-End Feature Verification
# =============================================================================
class TestFeatureVerification:
"""Verify that all features are properly configured and usable."""
def test_all_features_have_info_endpoints(self):
"""Test that all features provide info functions."""
from hyde import get_hyde_info
from hybrid_search import get_hybrid_search_info
from rag_evaluation import get_evaluation_info
from pdf_extraction import get_pdf_extraction_info
from self_rag import get_self_rag_info
infos = [
get_hyde_info(),
get_hybrid_search_info(),
get_evaluation_info(),
get_pdf_extraction_info(),
get_self_rag_info(),
]
for info in infos:
assert isinstance(info, dict)
# Each should have an "enabled" or similar status field
assert any(k in info for k in ["enabled", "configured_backend", "available_backends"])
def test_environment_variables_documented(self):
"""Test that all environment variables are accessible."""
import os
# These env vars should be used by the modules
env_vars = [
"HYDE_ENABLED",
"HYDE_LLM_BACKEND",
"HYBRID_SEARCH_ENABLED",
"HYBRID_DENSE_WEIGHT",
"RAG_EVALUATION_ENABLED",
"PDF_EXTRACTION_BACKEND",
"SELF_RAG_ENABLED",
]
# Just verify they're readable (will use defaults if not set)
for var in env_vars:
os.getenv(var, "default") # Should not raise
# =============================================================================
# Admin API Tests (RAG Documentation with HTML rendering)
# =============================================================================
class TestRAGAdminAPI:
    """Tests for the RAG admin API endpoints (documentation + system info)."""

    @pytest.mark.xfail(reason="get_rag_documentation not yet implemented - Backlog item")
    @pytest.mark.asyncio
    async def test_rag_documentation_markdown_format(self):
        """The documentation endpoint serves markdown content."""
        from admin_api import get_rag_documentation

        payload = await get_rag_documentation(format="markdown")
        assert payload["format"] == "markdown"
        assert "content" in payload
        assert payload["status"] in ["success", "inline"]

    @pytest.mark.xfail(reason="get_rag_documentation not yet implemented - Backlog item")
    @pytest.mark.asyncio
    async def test_rag_documentation_html_format(self):
        """The documentation endpoint serves styled HTML with tables."""
        from admin_api import get_rag_documentation

        payload = await get_rag_documentation(format="html")
        assert payload["format"] == "html"
        assert "content" in payload

        # Rendered HTML must include properly styled table markup.
        markup = payload["content"]
        assert "<table" in markup
        assert "<th>" in markup or "<td>" in markup
        assert "<style>" in markup
        assert "border-collapse" in markup

    @pytest.mark.xfail(reason="get_rag_system_info not yet implemented - Backlog item")
    @pytest.mark.asyncio
    async def test_rag_system_info_has_feature_status(self):
        """System info exposes a feature_status section."""
        from admin_api import get_rag_system_info

        info = await get_rag_system_info()
        assert "feature_status" in info.__dict__ or hasattr(info, 'feature_status')

    @pytest.mark.xfail(reason="get_rag_system_info not yet implemented - Backlog item")
    @pytest.mark.asyncio
    async def test_rag_system_info_has_privacy_notes(self):
        """System info exposes privacy notes."""
        from admin_api import get_rag_system_info

        info = await get_rag_system_info()
        assert hasattr(info, 'privacy_notes') or "privacy_notes" in str(info)
# =============================================================================
# Reranker Tests
# =============================================================================
class TestReranker:
    """Unit tests for the re-ranker module."""

    def test_reranker_config(self):
        """The reranker info dict reports backend, model, and availability."""
        from reranker import RERANKER_BACKEND, LOCAL_RERANKER_MODEL, get_reranker_info

        cfg = get_reranker_info()
        assert "backend" in cfg
        assert "model" in cfg
        # The availability key is "embedding_service_available", not "available".
        assert "embedding_service_available" in cfg

    def test_reranker_model_license(self):
        """The default reranker must be the Apache-2.0 licensed BGE model."""
        from reranker import LOCAL_RERANKER_MODEL

        # BAAI/bge-reranker-v2-m3 is Apache 2.0 licensed.
        assert "bge-reranker" in LOCAL_RERANKER_MODEL
        # MS MARCO cross-encoders carry a non-commercial license - not allowed.
        assert "ms-marco" not in LOCAL_RERANKER_MODEL.lower()

    @pytest.mark.xfail(reason="rerank_results not yet implemented - async wrapper planned")
    @pytest.mark.asyncio
    async def test_rerank_results(self):
        """Reranking trims to top_k and annotates results with a score."""
        from reranker import rerank_results

        candidates = [
            {"text": "Mathematik Bewertungskriterien", "score": 0.5},
            {"text": "Rezept fuer Kuchen", "score": 0.6},
            {"text": "Erwartungshorizont Mathe Abitur", "score": 0.4},
        ]
        ordered = await rerank_results(
            query="Mathematik Abitur Bewertung",
            results=candidates,
            top_k=2,
        )
        # No more than top_k entries may come back.
        assert len(ordered) <= 2
        # Each surviving entry carries some scoring field.
        for entry in ordered:
            assert "rerank_score" in entry or "score" in entry
# =============================================================================
# API Integration Tests (EH RAG Query with rerank param)
# =============================================================================
class TestEHRAGQueryAPI:
    """Structural tests for the EH RAG query API request/response models."""

    def test_eh_rag_query_params(self):
        """The request model accepts the new rerank parameter."""
        # Structural test: mirror the expected request schema and validate it.
        from pydantic import BaseModel
        from typing import Optional

        class RAGQueryRequest(BaseModel):
            query_text: str
            passphrase: str
            subject: Optional[str] = None
            limit: int = 5
            rerank: bool = True  # New param

        # Construction must pass validation without raising.
        request = RAGQueryRequest(
            query_text="Test",
            passphrase="secret",
            rerank=True,
        )
        assert request.rerank is True

    def test_rag_result_has_search_info(self):
        """The response model carries an optional search_info payload."""
        from pydantic import BaseModel
        from typing import Optional, List, Dict, Any

        # Mirror the expected response schema.
        class RAGSource(BaseModel):
            text: str
            score: float
            reranked: Optional[bool] = None

        class RAGResult(BaseModel):
            context: str
            sources: List[RAGSource]
            query: str
            search_info: Optional[Dict[str, Any]] = None

        response = RAGResult(
            context="Test context",
            sources=[RAGSource(text="Test", score=0.8, reranked=True)],
            query="Test query",
            search_info={
                "rerank_applied": True,
                "hybrid_search_applied": True,
                "total_candidates": 20,
                "embedding_model": "BAAI/bge-m3",
            }
        )
        assert response.search_info["rerank_applied"] is True
        assert response.sources[0].reranked is True
# =============================================================================
# Run Tests
# =============================================================================
if __name__ == "__main__":
    # Allow running this file directly instead of invoking pytest from the shell.
    pytest.main([__file__, "-v"])