""" Tests for Advanced RAG Features Tests for the newly implemented RAG quality improvements: - HyDE (Hypothetical Document Embeddings) - Hybrid Search (Dense + Sparse/BM25) - RAG Evaluation (RAGAS-inspired metrics) - PDF Extraction (Unstructured.io, PyMuPDF, PyPDF2) - Self-RAG / Corrective RAG Run with: pytest tests/test_advanced_rag.py -v """ import pytest from unittest.mock import AsyncMock, MagicMock, patch import asyncio # ============================================================================= # HyDE Tests # ============================================================================= class TestHyDE: """Tests for HyDE (Hypothetical Document Embeddings) module.""" def test_hyde_config(self): """Test HyDE configuration loading.""" from hyde import HYDE_ENABLED, HYDE_LLM_BACKEND, HYDE_MODEL, get_hyde_info info = get_hyde_info() assert "enabled" in info assert "llm_backend" in info assert "model" in info def test_hyde_prompt_template(self): """Test that HyDE prompt template is properly formatted.""" from hyde import HYDE_PROMPT_TEMPLATE assert "{query}" in HYDE_PROMPT_TEMPLATE assert "Erwartungshorizont" in HYDE_PROMPT_TEMPLATE assert "Bildungsstandards" in HYDE_PROMPT_TEMPLATE @pytest.mark.asyncio async def test_generate_hypothetical_document_disabled(self): """Test HyDE returns original query when disabled.""" from hyde import generate_hypothetical_document with patch('hyde.HYDE_ENABLED', False): result = await generate_hypothetical_document("Test query") assert result == "Test query" @pytest.mark.asyncio async def test_hyde_search_fallback(self): """Test hyde_search falls back gracefully without LLM.""" from hyde import hyde_search async def mock_search(query, **kwargs): return [{"id": "1", "text": "Result for: " + query}] with patch('hyde.HYDE_ENABLED', False): result = await hyde_search( query="Test query", search_func=mock_search, ) assert result["hyde_used"] is False # When disabled, original_query should match the query assert result["original_query"] == "Test query" # ============================================================================= # Hybrid Search Tests # ============================================================================= class TestHybridSearch: """Tests for Hybrid Search (Dense + Sparse) module.""" def test_hybrid_search_config(self): """Test hybrid search configuration.""" from hybrid_search import HYBRID_ENABLED, DENSE_WEIGHT, SPARSE_WEIGHT, get_hybrid_search_info info = get_hybrid_search_info() assert "enabled" in info assert "dense_weight" in info assert "sparse_weight" in info assert info["dense_weight"] + info["sparse_weight"] == pytest.approx(1.0) def test_bm25_tokenization(self): """Test BM25 German tokenization.""" from hybrid_search import BM25 bm25 = BM25() tokens = bm25._tokenize("Der Erwartungshorizont für die Abiturprüfung") # German stopwords should be removed assert "der" not in tokens assert "für" not in tokens assert "die" not in tokens # Content words should remain assert "erwartungshorizont" in tokens assert "abiturprüfung" in tokens def test_bm25_stopwords(self): """Test that German stopwords are defined.""" from hybrid_search import GERMAN_STOPWORDS assert "der" in GERMAN_STOPWORDS assert "die" in GERMAN_STOPWORDS assert "und" in GERMAN_STOPWORDS assert "ist" in GERMAN_STOPWORDS assert len(GERMAN_STOPWORDS) > 50 def test_bm25_fit_and_search(self): """Test BM25 fitting and searching.""" from hybrid_search import BM25 documents = [ "Der Erwartungshorizont für Mathematik enthält Bewertungskriterien.", "Die Gedichtanalyse erfordert formale und inhaltliche Aspekte.", "Biologie Klausur Bewertung nach Anforderungsbereichen.", ] bm25 = BM25() bm25.fit(documents) assert bm25.N == 3 assert len(bm25.corpus) == 3 results = bm25.search("Mathematik Bewertungskriterien", top_k=2) assert len(results) == 2 assert results[0][0] == 0 # First document should rank highest def test_normalize_scores(self): """Test score normalization.""" from hybrid_search import normalize_scores scores = [0.5, 1.0, 0.0, 0.75] normalized = normalize_scores(scores) assert min(normalized) == 0.0 assert max(normalized) == 1.0 assert len(normalized) == len(scores) def test_normalize_scores_empty(self): """Test normalization with empty list.""" from hybrid_search import normalize_scores assert normalize_scores([]) == [] def test_normalize_scores_same_value(self): """Test normalization when all scores are the same.""" from hybrid_search import normalize_scores scores = [0.5, 0.5, 0.5] normalized = normalize_scores(scores) assert all(s == 1.0 for s in normalized) # ============================================================================= # RAG Evaluation Tests # ============================================================================= class TestRAGEvaluation: """Tests for RAG Evaluation (RAGAS-inspired) module.""" def test_evaluation_config(self): """Test RAG evaluation configuration.""" from rag_evaluation import EVALUATION_ENABLED, EVAL_MODEL, get_evaluation_info info = get_evaluation_info() assert "enabled" in info assert "metrics" in info assert "context_precision" in info["metrics"] assert "faithfulness" in info["metrics"] def test_text_similarity(self): """Test Jaccard text similarity.""" from rag_evaluation import _text_similarity # Same text sim1 = _text_similarity("Hello world", "Hello world") assert sim1 == 1.0 # No overlap sim2 = _text_similarity("Hello world", "Goodbye universe") assert sim2 == 0.0 # Partial overlap sim3 = _text_similarity("Hello beautiful world", "Hello cruel world") assert 0 < sim3 < 1 def test_context_precision(self): """Test context precision calculation.""" from rag_evaluation import calculate_context_precision retrieved = ["Doc A about topic X", "Doc B about topic Y"] relevant = ["Doc A about topic X"] precision = calculate_context_precision("query", retrieved, relevant) assert precision == 0.5 # 1 out of 2 retrieved is relevant def test_context_precision_empty_retrieved(self): """Test precision with no retrieved documents.""" from rag_evaluation import calculate_context_precision precision = calculate_context_precision("query", [], ["relevant"]) assert precision == 0.0 def test_context_recall(self): """Test context recall calculation.""" from rag_evaluation import calculate_context_recall retrieved = ["Doc A about topic X"] relevant = ["Doc A about topic X", "Doc B about topic Y"] recall = calculate_context_recall("query", retrieved, relevant) assert recall == 0.5 # Found 1 out of 2 relevant def test_context_recall_no_relevant(self): """Test recall when there are no relevant documents.""" from rag_evaluation import calculate_context_recall recall = calculate_context_recall("query", ["something"], []) assert recall == 1.0 # Nothing to miss def test_load_save_eval_results(self): """Test evaluation results file operations.""" from rag_evaluation import _load_eval_results, _save_eval_results from pathlib import Path import tempfile import os # Use temp file with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f: temp_path = Path(f.name) # Convert to Path object try: with patch('rag_evaluation.EVAL_RESULTS_FILE', temp_path): # Save some results test_results = [{"test": "data", "score": 0.8}] _save_eval_results(test_results) # Load them back loaded = _load_eval_results() assert loaded == test_results finally: os.unlink(temp_path) # ============================================================================= # PDF Extraction Tests # ============================================================================= class TestPDFExtraction: """Tests for PDF Extraction module.""" def test_pdf_extraction_config(self): """Test PDF extraction configuration.""" from pdf_extraction import PDF_BACKEND, get_pdf_extraction_info info = get_pdf_extraction_info() assert "configured_backend" in info assert "available_backends" in info assert "recommended" in info def test_detect_available_backends(self): """Test backend detection.""" from pdf_extraction import _detect_available_backends backends = _detect_available_backends() assert isinstance(backends, list) # In Docker container, at least pypdf (BSD) or unstructured (Apache 2.0) should be available # In local test environment without dependencies, list may be empty # NOTE: PyMuPDF (AGPL) is NOT installed by default for license compliance if backends: # If any backend is found, verify it's one of the license-compliant options for backend in backends: assert backend in ["pypdf", "unstructured", "pymupdf"] def test_pdf_extraction_result_class(self): """Test PDFExtractionResult data class.""" from pdf_extraction import PDFExtractionResult result = PDFExtractionResult( text="Extracted text", backend_used="pypdf", pages=5, elements=[{"type": "paragraph"}], tables=[{"text": "table data"}], metadata={"key": "value"}, ) assert result.text == "Extracted text" assert result.backend_used == "pypdf" assert result.pages == 5 assert len(result.elements) == 1 assert len(result.tables) == 1 # Test to_dict d = result.to_dict() assert d["text"] == "Extracted text" assert d["element_count"] == 1 assert d["table_count"] == 1 def test_pdf_extraction_error(self): """Test PDF extraction error handling.""" from pdf_extraction import PDFExtractionError with pytest.raises(PDFExtractionError): raise PDFExtractionError("Test error") @pytest.mark.xfail(reason="_extract_with_pypdf is internal function not exposed in API") def test_pypdf_extraction(self): """Test pypdf extraction with a simple PDF (BSD-3-Clause licensed).""" from pdf_extraction import _extract_with_pypdf, PDFExtractionError # Create a minimal valid PDF # This is a very simple PDF that PyPDF2 can parse simple_pdf = b"""%PDF-1.4 1 0 obj << /Type /Catalog /Pages 2 0 R >> endobj 2 0 obj << /Type /Pages /Kids [3 0 R] /Count 1 >> endobj 3 0 obj << /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Contents 4 0 R >> endobj 4 0 obj << /Length 44 >> stream BT /F1 12 Tf 100 700 Td (Hello World) Tj ET endstream endobj xref 0 5 0000000000 65535 f 0000000009 00000 n 0000000058 00000 n 0000000115 00000 n 0000000206 00000 n trailer << /Size 5 /Root 1 0 R >> startxref 300 %%EOF""" # This may fail because the PDF is too minimal, but tests the code path try: result = _extract_with_pypdf(simple_pdf) assert result.backend_used == "pypdf" except PDFExtractionError: # Expected for minimal PDF pass # ============================================================================= # Self-RAG Tests # ============================================================================= class TestSelfRAG: """Tests for Self-RAG / Corrective RAG module.""" def test_self_rag_config(self): """Test Self-RAG configuration.""" from self_rag import ( SELF_RAG_ENABLED, RELEVANCE_THRESHOLD, GROUNDING_THRESHOLD, MAX_RETRIEVAL_ATTEMPTS, get_self_rag_info ) info = get_self_rag_info() assert "enabled" in info assert "relevance_threshold" in info assert "grounding_threshold" in info assert "max_retrieval_attempts" in info assert "features" in info def test_retrieval_decision_enum(self): """Test RetrievalDecision enum.""" from self_rag import RetrievalDecision assert RetrievalDecision.SUFFICIENT.value == "sufficient" assert RetrievalDecision.NEEDS_MORE.value == "needs_more" assert RetrievalDecision.REFORMULATE.value == "reformulate" assert RetrievalDecision.FALLBACK.value == "fallback" @pytest.mark.asyncio async def test_grade_document_relevance_no_llm(self): """Test document grading without LLM (keyword-based).""" from self_rag import grade_document_relevance with patch('self_rag.OPENAI_API_KEY', ''): score, reason = await grade_document_relevance( "Mathematik Bewertungskriterien", "Der Erwartungshorizont für Mathematik enthält klare Bewertungskriterien." ) assert 0 <= score <= 1 assert "Keyword" in reason @pytest.mark.asyncio async def test_decide_retrieval_strategy_empty_docs(self): """Test retrieval decision with no documents.""" from self_rag import decide_retrieval_strategy, RetrievalDecision decision, meta = await decide_retrieval_strategy("query", [], attempt=1) assert decision == RetrievalDecision.REFORMULATE assert "No documents" in meta.get("reason", "") @pytest.mark.asyncio async def test_decide_retrieval_strategy_max_attempts(self): """Test retrieval decision at max attempts.""" from self_rag import decide_retrieval_strategy, RetrievalDecision, MAX_RETRIEVAL_ATTEMPTS decision, meta = await decide_retrieval_strategy( "query", [], attempt=MAX_RETRIEVAL_ATTEMPTS ) assert decision == RetrievalDecision.FALLBACK @pytest.mark.asyncio async def test_reformulate_query_no_llm(self): """Test query reformulation without LLM.""" from self_rag import reformulate_query with patch('self_rag.OPENAI_API_KEY', ''): result = await reformulate_query("EA Mathematik Anforderungen") # Should expand abbreviations assert "erhöhtes Anforderungsniveau" in result or result == "EA Mathematik Anforderungen" @pytest.mark.asyncio async def test_filter_relevant_documents(self): """Test document filtering by relevance.""" from self_rag import filter_relevant_documents docs = [ {"text": "Mathematik Bewertungskriterien für Abitur"}, {"text": "Rezept für Schokoladenkuchen"}, {"text": "Erwartungshorizont Mathematik eA"}, ] with patch('self_rag.OPENAI_API_KEY', ''): relevant, filtered = await filter_relevant_documents( "Mathematik Abitur Bewertung", docs, threshold=0.1 # Low threshold for keyword matching ) # All docs should have relevance_score added for doc in relevant + filtered: assert "relevance_score" in doc @pytest.mark.asyncio async def test_self_rag_retrieve_disabled(self): """Test self_rag_retrieve when disabled.""" from self_rag import self_rag_retrieve async def mock_search(query, **kwargs): return [{"id": "1", "text": "Result"}] with patch('self_rag.SELF_RAG_ENABLED', False): result = await self_rag_retrieve( query="Test query", search_func=mock_search, ) assert result["self_rag_enabled"] is False assert len(result["results"]) == 1 # ============================================================================= # Integration Tests - Module Availability # ============================================================================= class TestModuleAvailability: """Test that all advanced RAG modules are properly importable.""" def test_hyde_import(self): """Test HyDE module import.""" from hyde import ( generate_hypothetical_document, hyde_search, get_hyde_info, HYDE_ENABLED, ) assert callable(generate_hypothetical_document) assert callable(hyde_search) def test_hybrid_search_import(self): """Test Hybrid Search module import.""" from hybrid_search import ( BM25, hybrid_search, get_hybrid_search_info, HYBRID_ENABLED, ) assert callable(hybrid_search) assert BM25 is not None def test_rag_evaluation_import(self): """Test RAG Evaluation module import.""" from rag_evaluation import ( calculate_context_precision, calculate_context_recall, evaluate_faithfulness, evaluate_answer_relevancy, evaluate_rag_response, get_evaluation_info, ) assert callable(calculate_context_precision) assert callable(evaluate_rag_response) def test_pdf_extraction_import(self): """Test PDF Extraction module import.""" from pdf_extraction import ( extract_text_from_pdf, extract_text_from_pdf_enhanced, get_pdf_extraction_info, PDFExtractionResult, ) assert callable(extract_text_from_pdf) assert callable(extract_text_from_pdf_enhanced) def test_self_rag_import(self): """Test Self-RAG module import.""" from self_rag import ( grade_document_relevance, filter_relevant_documents, self_rag_retrieve, get_self_rag_info, RetrievalDecision, ) assert callable(self_rag_retrieve) assert RetrievalDecision is not None # ============================================================================= # End-to-End Feature Verification # ============================================================================= class TestFeatureVerification: """Verify that all features are properly configured and usable.""" def test_all_features_have_info_endpoints(self): """Test that all features provide info functions.""" from hyde import get_hyde_info from hybrid_search import get_hybrid_search_info from rag_evaluation import get_evaluation_info from pdf_extraction import get_pdf_extraction_info from self_rag import get_self_rag_info infos = [ get_hyde_info(), get_hybrid_search_info(), get_evaluation_info(), get_pdf_extraction_info(), get_self_rag_info(), ] for info in infos: assert isinstance(info, dict) # Each should have an "enabled" or similar status field assert any(k in info for k in ["enabled", "configured_backend", "available_backends"]) def test_environment_variables_documented(self): """Test that all environment variables are accessible.""" import os # These env vars should be used by the modules env_vars = [ "HYDE_ENABLED", "HYDE_LLM_BACKEND", "HYBRID_SEARCH_ENABLED", "HYBRID_DENSE_WEIGHT", "RAG_EVALUATION_ENABLED", "PDF_EXTRACTION_BACKEND", "SELF_RAG_ENABLED", ] # Just verify they're readable (will use defaults if not set) for var in env_vars: os.getenv(var, "default") # Should not raise # ============================================================================= # Admin API Tests (RAG Documentation with HTML rendering) # ============================================================================= class TestRAGAdminAPI: """Tests for RAG Admin API endpoints.""" @pytest.mark.xfail(reason="get_rag_documentation not yet implemented - Backlog item") @pytest.mark.asyncio async def test_rag_documentation_markdown_format(self): """Test RAG documentation endpoint returns markdown.""" from admin_api import get_rag_documentation result = await get_rag_documentation(format="markdown") assert result["format"] == "markdown" assert "content" in result assert result["status"] in ["success", "inline"] @pytest.mark.xfail(reason="get_rag_documentation not yet implemented - Backlog item") @pytest.mark.asyncio async def test_rag_documentation_html_format(self): """Test RAG documentation endpoint returns HTML with tables.""" from admin_api import get_rag_documentation result = await get_rag_documentation(format="html") assert result["format"] == "html" assert "content" in result # HTML should contain proper table styling html = result["content"] assert "
| " in html assert " |