# Source: breakpilot-lehrer/klausur-service/backend/tests/test_advanced_rag.py
# Initial commit: breakpilot-lehrer - Lehrer KI Platform
# Services: Admin-Lehrer, Backend-Lehrer, Studio v2, Website,
# Klausur-Service, School-Service, Voice-Service, Geo-Service,
# BreakPilot Drive, Agent-Core
# Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
# 2026-02-11 23:47:26 +01:00

"""
Tests for Advanced RAG Features
Tests for the newly implemented RAG quality improvements:
- HyDE (Hypothetical Document Embeddings)
- Hybrid Search (Dense + Sparse/BM25)
- RAG Evaluation (RAGAS-inspired metrics)
- PDF Extraction (Unstructured.io, PyMuPDF, PyPDF2)
- Self-RAG / Corrective RAG
Run with: pytest tests/test_advanced_rag.py -v
"""
import pytest
from unittest.mock import AsyncMock, MagicMock, patch
import asyncio
# =============================================================================
# HyDE Tests
# =============================================================================
class TestHyDE:
    """Unit tests for the HyDE (Hypothetical Document Embeddings) module."""

    def test_hyde_config(self):
        """The HyDE info dict exposes its configuration keys."""
        from hyde import HYDE_ENABLED, HYDE_LLM_BACKEND, HYDE_MODEL, get_hyde_info

        cfg = get_hyde_info()
        for key in ("enabled", "llm_backend", "model"):
            assert key in cfg

    def test_hyde_prompt_template(self):
        """The prompt template carries the query placeholder and domain terms."""
        from hyde import HYDE_PROMPT_TEMPLATE

        assert "{query}" in HYDE_PROMPT_TEMPLATE
        assert "Erwartungshorizont" in HYDE_PROMPT_TEMPLATE
        assert "Bildungsstandards" in HYDE_PROMPT_TEMPLATE

    @pytest.mark.asyncio
    async def test_generate_hypothetical_document_disabled(self):
        """With HyDE disabled, the original query is returned unchanged."""
        from hyde import generate_hypothetical_document

        with patch('hyde.HYDE_ENABLED', False):
            assert await generate_hypothetical_document("Test query") == "Test query"

    @pytest.mark.asyncio
    async def test_hyde_search_fallback(self):
        """hyde_search degrades gracefully when HyDE is turned off."""
        from hyde import hyde_search

        async def fake_search(query, **kwargs):
            return [{"id": "1", "text": "Result for: " + query}]

        with patch('hyde.HYDE_ENABLED', False):
            outcome = await hyde_search(
                query="Test query",
                search_func=fake_search,
            )
        assert outcome["hyde_used"] is False
        # With HyDE off, the original query must be passed through untouched.
        assert outcome["original_query"] == "Test query"
# =============================================================================
# Hybrid Search Tests
# =============================================================================
class TestHybridSearch:
    """Unit tests for the Hybrid Search (dense + sparse BM25) module."""

    def test_hybrid_search_config(self):
        """Dense and sparse weights are exposed and sum to one."""
        from hybrid_search import HYBRID_ENABLED, DENSE_WEIGHT, SPARSE_WEIGHT, get_hybrid_search_info

        cfg = get_hybrid_search_info()
        assert "enabled" in cfg
        assert "dense_weight" in cfg
        assert "sparse_weight" in cfg
        assert cfg["dense_weight"] + cfg["sparse_weight"] == pytest.approx(1.0)

    def test_bm25_tokenization(self):
        """German stopwords are stripped; content words are kept lowercased."""
        from hybrid_search import BM25

        tokens = BM25()._tokenize("Der Erwartungshorizont für die Abiturprüfung")
        # Function words must be filtered out by the stopword list.
        for stopword in ("der", "für", "die"):
            assert stopword not in tokens
        # Meaningful content words survive tokenization.
        assert "erwartungshorizont" in tokens
        assert "abiturprüfung" in tokens

    def test_bm25_stopwords(self):
        """The German stopword set contains common function words."""
        from hybrid_search import GERMAN_STOPWORDS

        for word in ("der", "die", "und", "ist"):
            assert word in GERMAN_STOPWORDS
        assert len(GERMAN_STOPWORDS) > 50

    def test_bm25_fit_and_search(self):
        """Fitting indexes the corpus; search ranks the matching doc first."""
        from hybrid_search import BM25

        corpus = [
            "Der Erwartungshorizont für Mathematik enthält Bewertungskriterien.",
            "Die Gedichtanalyse erfordert formale und inhaltliche Aspekte.",
            "Biologie Klausur Bewertung nach Anforderungsbereichen.",
        ]
        index = BM25()
        index.fit(corpus)
        assert index.N == 3
        assert len(index.corpus) == 3

        hits = index.search("Mathematik Bewertungskriterien", top_k=2)
        assert len(hits) == 2
        assert hits[0][0] == 0  # the maths document must rank highest

    def test_normalize_scores(self):
        """Normalization maps scores onto [0, 1] and preserves length."""
        from hybrid_search import normalize_scores

        raw = [0.5, 1.0, 0.0, 0.75]
        scaled = normalize_scores(raw)
        assert min(scaled) == 0.0
        assert max(scaled) == 1.0
        assert len(scaled) == len(raw)

    def test_normalize_scores_empty(self):
        """An empty score list normalizes to an empty list."""
        from hybrid_search import normalize_scores

        assert normalize_scores([]) == []

    def test_normalize_scores_same_value(self):
        """Identical scores all normalize to 1.0 (no division by zero)."""
        from hybrid_search import normalize_scores

        scaled = normalize_scores([0.5, 0.5, 0.5])
        assert all(s == 1.0 for s in scaled)
# =============================================================================
# RAG Evaluation Tests
# =============================================================================
class TestRAGEvaluation:
    """Unit tests for the RAG evaluation (RAGAS-inspired metrics) module."""

    def test_evaluation_config(self):
        """The evaluation info dict lists the supported metrics."""
        from rag_evaluation import EVALUATION_ENABLED, EVAL_MODEL, get_evaluation_info

        cfg = get_evaluation_info()
        assert "enabled" in cfg
        assert "metrics" in cfg
        assert "context_precision" in cfg["metrics"]
        assert "faithfulness" in cfg["metrics"]

    def test_text_similarity(self):
        """Jaccard similarity: 1.0 identical, 0.0 disjoint, else in between."""
        from rag_evaluation import _text_similarity

        # Identical texts are maximally similar.
        assert _text_similarity("Hello world", "Hello world") == 1.0
        # Fully disjoint token sets score zero.
        assert _text_similarity("Hello world", "Goodbye universe") == 0.0
        # Partial overlap lands strictly between the extremes.
        partial = _text_similarity("Hello beautiful world", "Hello cruel world")
        assert 0 < partial < 1

    def test_context_precision(self):
        """Precision is the fraction of retrieved docs that are relevant."""
        from rag_evaluation import calculate_context_precision

        fetched = ["Doc A about topic X", "Doc B about topic Y"]
        truth = ["Doc A about topic X"]
        # 1 of 2 retrieved documents is relevant -> 0.5.
        assert calculate_context_precision("query", fetched, truth) == 0.5

    def test_context_precision_empty_retrieved(self):
        """Retrieving nothing yields zero precision."""
        from rag_evaluation import calculate_context_precision

        assert calculate_context_precision("query", [], ["relevant"]) == 0.0

    def test_context_recall(self):
        """Recall is the fraction of relevant docs that were retrieved."""
        from rag_evaluation import calculate_context_recall

        fetched = ["Doc A about topic X"]
        truth = ["Doc A about topic X", "Doc B about topic Y"]
        # Found 1 of 2 relevant documents -> 0.5.
        assert calculate_context_recall("query", fetched, truth) == 0.5

    def test_context_recall_no_relevant(self):
        """With no relevant documents there is nothing to miss: recall is 1.0."""
        from rag_evaluation import calculate_context_recall

        assert calculate_context_recall("query", ["something"], []) == 1.0

    def test_load_save_eval_results(self):
        """Saved evaluation results round-trip through the results file."""
        from rag_evaluation import _load_eval_results, _save_eval_results
        from pathlib import Path
        import tempfile
        import os

        # Redirect the module's results file to a throwaway temp file.
        with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as handle:
            results_path = Path(handle.name)  # Path, as the module expects
        try:
            with patch('rag_evaluation.EVAL_RESULTS_FILE', results_path):
                payload = [{"test": "data", "score": 0.8}]
                _save_eval_results(payload)
                assert _load_eval_results() == payload
        finally:
            os.unlink(results_path)
# =============================================================================
# PDF Extraction Tests
# =============================================================================
class TestPDFExtraction:
    """Unit tests for the PDF extraction module."""

    def test_pdf_extraction_config(self):
        """The extraction info dict reports the backend configuration."""
        from pdf_extraction import PDF_BACKEND, get_pdf_extraction_info

        cfg = get_pdf_extraction_info()
        assert "configured_backend" in cfg
        assert "available_backends" in cfg
        assert "recommended" in cfg

    def test_detect_available_backends(self):
        """Backend detection returns only known, license-compliant names."""
        from pdf_extraction import _detect_available_backends

        found = _detect_available_backends()
        assert isinstance(found, list)
        # In the Docker container at least pypdf (BSD) or unstructured
        # (Apache 2.0) should be present; a bare local test environment may
        # legitimately report none.
        # NOTE: PyMuPDF (AGPL) is NOT installed by default for license compliance.
        for name in found:
            assert name in ["pypdf", "unstructured", "pymupdf"]

    def test_pdf_extraction_result_class(self):
        """PDFExtractionResult stores its fields and serializes via to_dict()."""
        from pdf_extraction import PDFExtractionResult

        extracted = PDFExtractionResult(
            text="Extracted text",
            backend_used="pypdf",
            pages=5,
            elements=[{"type": "paragraph"}],
            tables=[{"text": "table data"}],
            metadata={"key": "value"},
        )
        assert extracted.text == "Extracted text"
        assert extracted.backend_used == "pypdf"
        assert extracted.pages == 5
        assert len(extracted.elements) == 1
        assert len(extracted.tables) == 1

        # The dict form exposes counts alongside the raw text.
        serialized = extracted.to_dict()
        assert serialized["text"] == "Extracted text"
        assert serialized["element_count"] == 1
        assert serialized["table_count"] == 1

    def test_pdf_extraction_error(self):
        """PDFExtractionError behaves like a normal raisable exception."""
        from pdf_extraction import PDFExtractionError

        with pytest.raises(PDFExtractionError):
            raise PDFExtractionError("Test error")

    @pytest.mark.xfail(reason="_extract_with_pypdf is internal function not exposed in API")
    def test_pypdf_extraction(self):
        """Exercise the pypdf backend code path with a minimal hand-written PDF."""
        from pdf_extraction import _extract_with_pypdf, PDFExtractionError

        # A bare-bones PDF: just enough structure for a parser to chew on.
        simple_pdf = b"""%PDF-1.4
1 0 obj
<< /Type /Catalog /Pages 2 0 R >>
endobj
2 0 obj
<< /Type /Pages /Kids [3 0 R] /Count 1 >>
endobj
3 0 obj
<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Contents 4 0 R >>
endobj
4 0 obj
<< /Length 44 >>
stream
BT
/F1 12 Tf
100 700 Td
(Hello World) Tj
ET
endstream
endobj
xref
0 5
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000115 00000 n
0000000206 00000 n
trailer
<< /Size 5 /Root 1 0 R >>
startxref
300
%%EOF"""
        # The PDF may be too minimal to parse; either outcome covers the path.
        try:
            parsed = _extract_with_pypdf(simple_pdf)
            assert parsed.backend_used == "pypdf"
        except PDFExtractionError:
            # Acceptable: the backend rejected the minimal PDF.
            pass
# =============================================================================
# Self-RAG Tests
# =============================================================================
class TestSelfRAG:
    """Unit tests for the Self-RAG / Corrective RAG module."""

    def test_self_rag_config(self):
        """The Self-RAG info dict exposes thresholds and its feature list."""
        from self_rag import (
            SELF_RAG_ENABLED, RELEVANCE_THRESHOLD, GROUNDING_THRESHOLD,
            MAX_RETRIEVAL_ATTEMPTS, get_self_rag_info
        )

        cfg = get_self_rag_info()
        for key in ("enabled", "relevance_threshold", "grounding_threshold",
                    "max_retrieval_attempts", "features"):
            assert key in cfg

    def test_retrieval_decision_enum(self):
        """RetrievalDecision members map to their expected string values."""
        from self_rag import RetrievalDecision

        expected = {
            RetrievalDecision.SUFFICIENT: "sufficient",
            RetrievalDecision.NEEDS_MORE: "needs_more",
            RetrievalDecision.REFORMULATE: "reformulate",
            RetrievalDecision.FALLBACK: "fallback",
        }
        for member, value in expected.items():
            assert member.value == value

    @pytest.mark.asyncio
    async def test_grade_document_relevance_no_llm(self):
        """Without an LLM key, grading falls back to keyword matching."""
        from self_rag import grade_document_relevance

        with patch('self_rag.OPENAI_API_KEY', ''):
            score, reason = await grade_document_relevance(
                "Mathematik Bewertungskriterien",
                "Der Erwartungshorizont für Mathematik enthält klare Bewertungskriterien."
            )
        assert 0 <= score <= 1
        assert "Keyword" in reason

    @pytest.mark.asyncio
    async def test_decide_retrieval_strategy_empty_docs(self):
        """No documents on the first attempt triggers query reformulation."""
        from self_rag import decide_retrieval_strategy, RetrievalDecision

        decision, meta = await decide_retrieval_strategy("query", [], attempt=1)
        assert decision == RetrievalDecision.REFORMULATE
        assert "No documents" in meta.get("reason", "")

    @pytest.mark.asyncio
    async def test_decide_retrieval_strategy_max_attempts(self):
        """Hitting the attempt limit forces the fallback strategy."""
        from self_rag import decide_retrieval_strategy, RetrievalDecision, MAX_RETRIEVAL_ATTEMPTS

        decision, _ = await decide_retrieval_strategy(
            "query", [], attempt=MAX_RETRIEVAL_ATTEMPTS
        )
        assert decision == RetrievalDecision.FALLBACK

    @pytest.mark.asyncio
    async def test_reformulate_query_no_llm(self):
        """Reformulation without an LLM expands abbreviations (or no-ops)."""
        from self_rag import reformulate_query

        with patch('self_rag.OPENAI_API_KEY', ''):
            rewritten = await reformulate_query("EA Mathematik Anforderungen")
        # Either the abbreviation was expanded or the query passed through.
        assert "erhöhtes Anforderungsniveau" in rewritten or rewritten == "EA Mathematik Anforderungen"

    @pytest.mark.asyncio
    async def test_filter_relevant_documents(self):
        """Filtering annotates every document with a relevance score."""
        from self_rag import filter_relevant_documents

        corpus = [
            {"text": "Mathematik Bewertungskriterien für Abitur"},
            {"text": "Rezept für Schokoladenkuchen"},
            {"text": "Erwartungshorizont Mathematik eA"},
        ]
        with patch('self_rag.OPENAI_API_KEY', ''):
            kept, dropped = await filter_relevant_documents(
                "Mathematik Abitur Bewertung",
                corpus,
                threshold=0.1  # low threshold suits keyword-only matching
            )
        # Every document, kept or dropped, must carry a relevance score.
        for doc in kept + dropped:
            assert "relevance_score" in doc

    @pytest.mark.asyncio
    async def test_self_rag_retrieve_disabled(self):
        """With Self-RAG disabled, results pass through unmodified."""
        from self_rag import self_rag_retrieve

        async def fake_search(query, **kwargs):
            return [{"id": "1", "text": "Result"}]

        with patch('self_rag.SELF_RAG_ENABLED', False):
            outcome = await self_rag_retrieve(
                query="Test query",
                search_func=fake_search,
            )
        assert outcome["self_rag_enabled"] is False
        assert len(outcome["results"]) == 1
# =============================================================================
# Integration Tests - Module Availability
# =============================================================================
class TestModuleAvailability:
    """Smoke tests: every advanced RAG module imports cleanly."""

    def test_hyde_import(self):
        """HyDE public API is importable and callable."""
        from hyde import (
            generate_hypothetical_document,
            hyde_search,
            get_hyde_info,
            HYDE_ENABLED,
        )

        assert callable(generate_hypothetical_document)
        assert callable(hyde_search)

    def test_hybrid_search_import(self):
        """Hybrid search public API is importable and callable."""
        from hybrid_search import (
            BM25,
            hybrid_search,
            get_hybrid_search_info,
            HYBRID_ENABLED,
        )

        assert callable(hybrid_search)
        assert BM25 is not None

    def test_rag_evaluation_import(self):
        """RAG evaluation metric functions are importable and callable."""
        from rag_evaluation import (
            calculate_context_precision,
            calculate_context_recall,
            evaluate_faithfulness,
            evaluate_answer_relevancy,
            evaluate_rag_response,
            get_evaluation_info,
        )

        assert callable(calculate_context_precision)
        assert callable(evaluate_rag_response)

    def test_pdf_extraction_import(self):
        """PDF extraction entry points are importable and callable."""
        from pdf_extraction import (
            extract_text_from_pdf,
            extract_text_from_pdf_enhanced,
            get_pdf_extraction_info,
            PDFExtractionResult,
        )

        assert callable(extract_text_from_pdf)
        assert callable(extract_text_from_pdf_enhanced)

    def test_self_rag_import(self):
        """Self-RAG public API is importable and callable."""
        from self_rag import (
            grade_document_relevance,
            filter_relevant_documents,
            self_rag_retrieve,
            get_self_rag_info,
            RetrievalDecision,
        )

        assert callable(self_rag_retrieve)
        assert RetrievalDecision is not None
# =============================================================================
# End-to-End Feature Verification
# =============================================================================
class TestFeatureVerification:
"""Verify that all features are properly configured and usable."""
def test_all_features_have_info_endpoints(self):
"""Test that all features provide info functions."""
from hyde import get_hyde_info
from hybrid_search import get_hybrid_search_info
from rag_evaluation import get_evaluation_info
from pdf_extraction import get_pdf_extraction_info
from self_rag import get_self_rag_info
infos = [
get_hyde_info(),
get_hybrid_search_info(),
get_evaluation_info(),
get_pdf_extraction_info(),
get_self_rag_info(),
]
for info in infos:
assert isinstance(info, dict)
# Each should have an "enabled" or similar status field
assert any(k in info for k in ["enabled", "configured_backend", "available_backends"])
def test_environment_variables_documented(self):
"""Test that all environment variables are accessible."""
import os
# These env vars should be used by the modules
env_vars = [
"HYDE_ENABLED",
"HYDE_LLM_BACKEND",
"HYBRID_SEARCH_ENABLED",
"HYBRID_DENSE_WEIGHT",
"RAG_EVALUATION_ENABLED",
"PDF_EXTRACTION_BACKEND",
"SELF_RAG_ENABLED",
]
# Just verify they're readable (will use defaults if not set)
for var in env_vars:
os.getenv(var, "default") # Should not raise
# =============================================================================
# Admin API Tests (RAG Documentation with HTML rendering)
# =============================================================================
class TestRAGAdminAPI:
    """Tests for the RAG admin API endpoints (documentation + system info)."""

    @pytest.mark.xfail(reason="get_rag_documentation not yet implemented - Backlog item")
    @pytest.mark.asyncio
    async def test_rag_documentation_markdown_format(self):
        """The documentation endpoint serves markdown content."""
        from admin_api import get_rag_documentation

        payload = await get_rag_documentation(format="markdown")
        assert payload["format"] == "markdown"
        assert "content" in payload
        assert payload["status"] in ["success", "inline"]

    @pytest.mark.xfail(reason="get_rag_documentation not yet implemented - Backlog item")
    @pytest.mark.asyncio
    async def test_rag_documentation_html_format(self):
        """The documentation endpoint serves styled HTML with tables."""
        from admin_api import get_rag_documentation

        payload = await get_rag_documentation(format="html")
        assert payload["format"] == "html"
        assert "content" in payload

        # Rendered HTML must include properly styled table markup.
        markup = payload["content"]
        assert "<table" in markup
        assert "<th>" in markup or "<td>" in markup
        assert "<style>" in markup
        assert "border-collapse" in markup

    @pytest.mark.xfail(reason="get_rag_system_info not yet implemented - Backlog item")
    @pytest.mark.asyncio
    async def test_rag_system_info_has_feature_status(self):
        """System info exposes a feature_status section."""
        from admin_api import get_rag_system_info

        info = await get_rag_system_info()
        assert "feature_status" in info.__dict__ or hasattr(info, 'feature_status')

    @pytest.mark.xfail(reason="get_rag_system_info not yet implemented - Backlog item")
    @pytest.mark.asyncio
    async def test_rag_system_info_has_privacy_notes(self):
        """System info exposes privacy notes."""
        from admin_api import get_rag_system_info

        info = await get_rag_system_info()
        assert hasattr(info, 'privacy_notes') or "privacy_notes" in str(info)
# =============================================================================
# Reranker Tests
# =============================================================================
class TestReranker:
    """Unit tests for the re-ranker module."""

    def test_reranker_config(self):
        """The reranker info dict reports backend, model, and availability."""
        from reranker import RERANKER_BACKEND, LOCAL_RERANKER_MODEL, get_reranker_info

        cfg = get_reranker_info()
        assert "backend" in cfg
        assert "model" in cfg
        # The availability key is "embedding_service_available", not "available".
        assert "embedding_service_available" in cfg

    def test_reranker_model_license(self):
        """The default reranker must be the Apache-2.0 licensed BGE model."""
        from reranker import LOCAL_RERANKER_MODEL

        # BAAI/bge-reranker-v2-m3 is Apache 2.0 licensed.
        assert "bge-reranker" in LOCAL_RERANKER_MODEL
        # MS MARCO cross-encoders carry a non-commercial license - not allowed.
        assert "ms-marco" not in LOCAL_RERANKER_MODEL.lower()

    @pytest.mark.xfail(reason="rerank_results not yet implemented - async wrapper planned")
    @pytest.mark.asyncio
    async def test_rerank_results(self):
        """Reranking trims to top_k and annotates results with a score."""
        from reranker import rerank_results

        candidates = [
            {"text": "Mathematik Bewertungskriterien", "score": 0.5},
            {"text": "Rezept fuer Kuchen", "score": 0.6},
            {"text": "Erwartungshorizont Mathe Abitur", "score": 0.4},
        ]
        ordered = await rerank_results(
            query="Mathematik Abitur Bewertung",
            results=candidates,
            top_k=2,
        )
        # No more than top_k entries may come back.
        assert len(ordered) <= 2
        # Each surviving entry carries some scoring field.
        for entry in ordered:
            assert "rerank_score" in entry or "score" in entry
# =============================================================================
# API Integration Tests (EH RAG Query with rerank param)
# =============================================================================
class TestEHRAGQueryAPI:
    """Structural tests for the EH RAG query API request/response models."""

    def test_eh_rag_query_params(self):
        """The request model accepts the new rerank parameter."""
        # Structural test: mirror the expected request schema and validate it.
        from pydantic import BaseModel
        from typing import Optional

        class RAGQueryRequest(BaseModel):
            query_text: str
            passphrase: str
            subject: Optional[str] = None
            limit: int = 5
            rerank: bool = True  # New param

        # Construction must pass validation without raising.
        request = RAGQueryRequest(
            query_text="Test",
            passphrase="secret",
            rerank=True,
        )
        assert request.rerank is True

    def test_rag_result_has_search_info(self):
        """The response model carries an optional search_info payload."""
        from pydantic import BaseModel
        from typing import Optional, List, Dict, Any

        # Mirror the expected response schema.
        class RAGSource(BaseModel):
            text: str
            score: float
            reranked: Optional[bool] = None

        class RAGResult(BaseModel):
            context: str
            sources: List[RAGSource]
            query: str
            search_info: Optional[Dict[str, Any]] = None

        response = RAGResult(
            context="Test context",
            sources=[RAGSource(text="Test", score=0.8, reranked=True)],
            query="Test query",
            search_info={
                "rerank_applied": True,
                "hybrid_search_applied": True,
                "total_candidates": 20,
                "embedding_model": "BAAI/bge-m3",
            }
        )
        assert response.search_info["rerank_applied"] is True
        assert response.sources[0].reranked is True
# =============================================================================
# Run Tests
# =============================================================================
if __name__ == "__main__":
    # Allow running this file directly instead of invoking pytest from the shell.
    pytest.main([__file__, "-v"])