""" Tests for Legal Templates RAG System. Tests template_sources.py, github_crawler.py, legal_templates_ingestion.py, and the admin API endpoints for legal templates. """ import pytest from unittest.mock import AsyncMock, MagicMock, patch from datetime import datetime import json # ============================================================================= # Template Sources Tests # ============================================================================= class TestLicenseType: """Tests for LicenseType enum.""" def test_license_types_exist(self): """Test that all expected license types are defined.""" from template_sources import LicenseType assert LicenseType.PUBLIC_DOMAIN.value == "public_domain" assert LicenseType.CC0.value == "cc0" assert LicenseType.UNLICENSE.value == "unlicense" assert LicenseType.MIT.value == "mit" assert LicenseType.CC_BY_4.value == "cc_by_4" assert LicenseType.REUSE_NOTICE.value == "reuse_notice" class TestLicenseInfo: """Tests for LicenseInfo dataclass.""" def test_license_info_creation(self): """Test creating a LicenseInfo instance.""" from template_sources import LicenseInfo, LicenseType info = LicenseInfo( id=LicenseType.CC0, name="CC0 1.0 Universal", url="https://creativecommons.org/publicdomain/zero/1.0/", attribution_required=False, ) assert info.id == LicenseType.CC0 assert info.attribution_required is False assert info.training_allowed is True assert info.output_allowed is True def test_get_attribution_text_no_attribution(self): """Test attribution text when not required.""" from template_sources import LicenseInfo, LicenseType info = LicenseInfo( id=LicenseType.CC0, name="CC0", url="https://example.com", attribution_required=False, ) result = info.get_attribution_text("Test Source", "https://test.com") assert result == "" def test_get_attribution_text_with_template(self): """Test attribution text with template.""" from template_sources import LicenseInfo, LicenseType info = LicenseInfo( id=LicenseType.MIT, name="MIT License", url="https://opensource.org/licenses/MIT", attribution_required=True, attribution_template="Based on [{source_name}]({source_url}) - MIT License", ) result = info.get_attribution_text("Test Source", "https://test.com") assert "Test Source" in result assert "https://test.com" in result class TestSourceConfig: """Tests for SourceConfig dataclass.""" def test_source_config_creation(self): """Test creating a SourceConfig instance.""" from template_sources import SourceConfig, LicenseType source = SourceConfig( name="test-source", license_type=LicenseType.CC0, template_types=["privacy_policy", "terms_of_service"], languages=["de", "en"], jurisdiction="DE", description="Test description", repo_url="https://github.com/test/repo", ) assert source.name == "test-source" assert source.license_type == LicenseType.CC0 assert "privacy_policy" in source.template_types assert source.enabled is True def test_source_config_license_info(self): """Test getting license info from source config.""" from template_sources import SourceConfig, LicenseType, LICENSES source = SourceConfig( name="test-source", license_type=LicenseType.MIT, template_types=["privacy_policy"], languages=["en"], jurisdiction="US", description="Test", ) info = source.license_info assert info.id == LicenseType.MIT assert info.attribution_required is True class TestTemplateSources: """Tests for TEMPLATE_SOURCES list.""" def test_template_sources_not_empty(self): """Test that template sources are defined.""" from template_sources import TEMPLATE_SOURCES assert len(TEMPLATE_SOURCES) > 0 def test_github_site_policy_exists(self): """Test that github-site-policy source exists.""" from template_sources import TEMPLATE_SOURCES source = next((s for s in TEMPLATE_SOURCES if s.name == "github-site-policy"), None) assert source is not None assert source.repo_url == "https://github.com/github/site-policy" def test_enabled_sources(self): """Test getting enabled sources.""" from template_sources import get_enabled_sources enabled = get_enabled_sources() assert all(s.enabled for s in enabled) def test_sources_by_priority(self): """Test getting sources by priority.""" from template_sources import get_sources_by_priority # Priority 1 sources only p1 = get_sources_by_priority(1) assert all(s.priority == 1 for s in p1) # Priority 1-2 sources p2 = get_sources_by_priority(2) assert all(s.priority <= 2 for s in p2) def test_sources_by_license(self): """Test getting sources by license type.""" from template_sources import get_sources_by_license, LicenseType cc0_sources = get_sources_by_license(LicenseType.CC0) assert all(s.license_type == LicenseType.CC0 for s in cc0_sources) # ============================================================================= # GitHub Crawler Tests # ============================================================================= class TestMarkdownParser: """Tests for MarkdownParser class.""" def test_parse_simple_markdown(self): """Test parsing simple markdown content.""" from github_crawler import MarkdownParser content = """# Test Title This is some content. ## Section 1 More content here. """ doc = MarkdownParser.parse(content, "test.md") assert doc.title == "Test Title" assert doc.file_type == "markdown" assert "content" in doc.text def test_extract_title_from_heading(self): """Test extracting title from h1 heading.""" from github_crawler import MarkdownParser title = MarkdownParser._extract_title("# My Document\n\nContent", "fallback.md") assert title == "My Document" def test_extract_title_fallback(self): """Test fallback to filename when no heading.""" from github_crawler import MarkdownParser title = MarkdownParser._extract_title("No heading here", "my-document.md") assert title == "My Document" def test_detect_german_language(self): """Test German language detection.""" from github_crawler import MarkdownParser german_text = "Dies ist eine Datenschutzerklaerung fuer die Verarbeitung personenbezogener Daten." lang = MarkdownParser._detect_language(german_text) assert lang == "de" def test_detect_english_language(self): """Test English language detection.""" from github_crawler import MarkdownParser english_text = "This is a privacy policy for processing personal data in our application." lang = MarkdownParser._detect_language(english_text) assert lang == "en" def test_find_placeholders(self): """Test finding placeholder patterns.""" from github_crawler import MarkdownParser content = "Company: [COMPANY_NAME], Contact: {email}, Address: __ADDRESS__" placeholders = MarkdownParser._find_placeholders(content) assert "[COMPANY_NAME]" in placeholders assert "{email}" in placeholders assert "__ADDRESS__" in placeholders class TestHTMLParser: """Tests for HTMLParser class.""" def test_parse_simple_html(self): """Test parsing simple HTML content.""" from github_crawler import HTMLParser content = """
This is content.
""" doc = HTMLParser.parse(content, "test.html") assert doc.title == "Test Page" assert doc.file_type == "html" assert "Welcome" in doc.text assert "content" in doc.text def test_html_to_text_removes_scripts(self): """Test that scripts are removed from HTML.""" from github_crawler import HTMLParser html = "Text
More
" text = HTMLParser._html_to_text(html) assert "alert" not in text assert "Text" in text assert "More" in text class TestJSONParser: """Tests for JSONParser class.""" def test_parse_simple_json(self): """Test parsing simple JSON content.""" from github_crawler import JSONParser content = json.dumps({ "title": "Privacy Policy", "text": "This is the privacy policy content.", "language": "en", }) docs = JSONParser.parse(content, "policy.json") assert len(docs) == 1 assert docs[0].title == "Privacy Policy" assert "privacy policy content" in docs[0].text def test_parse_nested_json(self): """Test parsing nested JSON structures.""" from github_crawler import JSONParser content = json.dumps({ "sections": { "intro": {"title": "Introduction", "text": "Welcome text"}, "data": {"title": "Data Collection", "text": "Collection info"}, } }) docs = JSONParser.parse(content, "nested.json") # Should extract nested documents assert len(docs) >= 2 class TestExtractedDocument: """Tests for ExtractedDocument dataclass.""" def test_extracted_document_hash(self): """Test that source hash is auto-generated.""" from github_crawler import ExtractedDocument doc = ExtractedDocument( text="Some content", title="Test", file_path="test.md", file_type="markdown", source_url="https://example.com", ) assert doc.source_hash != "" assert len(doc.source_hash) == 64 # SHA256 hex # ============================================================================= # Legal Templates Ingestion Tests # ============================================================================= class TestLegalTemplatesIngestion: """Tests for LegalTemplatesIngestion class.""" @pytest.fixture def mock_qdrant(self): """Mock Qdrant client.""" with patch('legal_templates_ingestion.QdrantClient') as mock: client = MagicMock() client.get_collections.return_value.collections = [] mock.return_value = client yield client @pytest.fixture def mock_http_client(self): """Mock HTTP client for embeddings.""" with patch('legal_templates_ingestion.httpx.AsyncClient') as mock: client = AsyncMock() mock.return_value = client yield client def test_chunk_text_short(self): """Test chunking short text.""" from legal_templates_ingestion import LegalTemplatesIngestion with patch('legal_templates_ingestion.QdrantClient'): ingestion = LegalTemplatesIngestion() chunks = ingestion._chunk_text("Short text", chunk_size=1000) assert len(chunks) == 1 assert chunks[0] == "Short text" def test_chunk_text_long(self): """Test chunking long text.""" from legal_templates_ingestion import LegalTemplatesIngestion with patch('legal_templates_ingestion.QdrantClient'): ingestion = LegalTemplatesIngestion() # Create text longer than chunk size long_text = "This is a sentence. " * 100 chunks = ingestion._chunk_text(long_text, chunk_size=200, overlap=50) assert len(chunks) > 1 # Each chunk should be roughly chunk_size for chunk in chunks: assert len(chunk) <= 250 # Allow some buffer def test_split_sentences(self): """Test German sentence splitting.""" from legal_templates_ingestion import LegalTemplatesIngestion with patch('legal_templates_ingestion.QdrantClient'): ingestion = LegalTemplatesIngestion() text = "Dies ist Satz eins. Dies ist Satz zwei. Und Satz drei." sentences = ingestion._split_sentences(text) assert len(sentences) == 3 def test_split_sentences_preserves_abbreviations(self): """Test that abbreviations don't split sentences.""" from legal_templates_ingestion import LegalTemplatesIngestion with patch('legal_templates_ingestion.QdrantClient'): ingestion = LegalTemplatesIngestion() text = "Das ist z.B. ein Beispiel. Und noch ein Satz." sentences = ingestion._split_sentences(text) assert len(sentences) == 2 assert "z.B." in sentences[0] or "z.b." in sentences[0].lower() def test_infer_template_type_privacy(self): """Test inferring privacy policy type.""" from legal_templates_ingestion import LegalTemplatesIngestion from github_crawler import ExtractedDocument from template_sources import SourceConfig, LicenseType with patch('legal_templates_ingestion.QdrantClient'): ingestion = LegalTemplatesIngestion() doc = ExtractedDocument( text="Diese Datenschutzerklaerung informiert Sie ueber die Verarbeitung personenbezogener Daten.", title="Datenschutz", file_path="privacy.md", file_type="markdown", source_url="https://example.com", ) source = SourceConfig( name="test", license_type=LicenseType.CC0, template_types=["privacy_policy"], languages=["de"], jurisdiction="DE", description="Test", ) template_type = ingestion._infer_template_type(doc, source) assert template_type == "privacy_policy" def test_infer_clause_category(self): """Test inferring clause category.""" from legal_templates_ingestion import LegalTemplatesIngestion with patch('legal_templates_ingestion.QdrantClient'): ingestion = LegalTemplatesIngestion() # Test liability clause text = "Die Haftung des Anbieters ist auf grobe Fahrlässigkeit beschränkt." category = ingestion._infer_clause_category(text) assert category == "haftung" # Test privacy clause text = "Wir verarbeiten personenbezogene Daten gemäß der DSGVO." category = ingestion._infer_clause_category(text) assert category == "datenschutz" # ============================================================================= # Admin API Templates Tests # ============================================================================= class TestTemplatesAdminAPI: """Tests for /api/v1/admin/templates/* endpoints.""" def test_templates_status_structure(self): """Test the structure of templates status response.""" from admin_api import _templates_ingestion_status # Reset status _templates_ingestion_status["running"] = False _templates_ingestion_status["last_run"] = None _templates_ingestion_status["current_source"] = None _templates_ingestion_status["results"] = {} assert _templates_ingestion_status["running"] is False assert _templates_ingestion_status["results"] == {} def test_templates_status_running(self): """Test status when ingestion is running.""" from admin_api import _templates_ingestion_status _templates_ingestion_status["running"] = True _templates_ingestion_status["current_source"] = "github-site-policy" _templates_ingestion_status["last_run"] = datetime.now().isoformat() assert _templates_ingestion_status["running"] is True assert _templates_ingestion_status["current_source"] == "github-site-policy" def test_templates_results_tracking(self): """Test that ingestion results are tracked correctly.""" from admin_api import _templates_ingestion_status _templates_ingestion_status["results"] = { "github-site-policy": { "status": "completed", "documents_found": 15, "chunks_indexed": 42, "errors": [], }, "opr-vc": { "status": "failed", "documents_found": 0, "chunks_indexed": 0, "errors": ["Connection timeout"], }, } results = _templates_ingestion_status["results"] assert results["github-site-policy"]["status"] == "completed" assert results["github-site-policy"]["chunks_indexed"] == 42 assert results["opr-vc"]["status"] == "failed" assert len(results["opr-vc"]["errors"]) > 0 class TestTemplateTypeLabels: """Tests for template type labels and constants.""" def test_template_types_defined(self): """Test that template types are properly defined.""" from template_sources import TEMPLATE_TYPES assert "privacy_policy" in TEMPLATE_TYPES assert "terms_of_service" in TEMPLATE_TYPES assert "cookie_banner" in TEMPLATE_TYPES assert "impressum" in TEMPLATE_TYPES assert "widerruf" in TEMPLATE_TYPES assert "dpa" in TEMPLATE_TYPES def test_jurisdictions_defined(self): """Test that jurisdictions are properly defined.""" from template_sources import JURISDICTIONS assert "DE" in JURISDICTIONS assert "AT" in JURISDICTIONS assert "CH" in JURISDICTIONS assert "EU" in JURISDICTIONS assert "US" in JURISDICTIONS # ============================================================================= # Qdrant Service Templates Tests # ============================================================================= class TestQdrantServiceTemplates: """Tests for legal templates Qdrant service functions.""" @pytest.fixture def mock_qdrant_client(self): """Mock Qdrant client for templates.""" with patch('qdrant_service.get_qdrant_client') as mock: client = MagicMock() client.get_collections.return_value.collections = [] mock.return_value = client yield client def test_legal_templates_collection_name(self): """Test that collection name is correct.""" from qdrant_service import LEGAL_TEMPLATES_COLLECTION assert LEGAL_TEMPLATES_COLLECTION == "bp_legal_templates" def test_legal_templates_vector_size(self): """Test that vector size is correct for BGE-M3.""" from qdrant_service import LEGAL_TEMPLATES_VECTOR_SIZE assert LEGAL_TEMPLATES_VECTOR_SIZE == 1024 # ============================================================================= # Integration Tests (require mocking external services) # ============================================================================= class TestTemplatesIntegration: """Integration tests for the templates system.""" @pytest.fixture def mock_all_services(self): """Mock all external services.""" with patch('legal_templates_ingestion.QdrantClient') as qdrant_mock, \ patch('legal_templates_ingestion.httpx.AsyncClient') as http_mock: qdrant = MagicMock() qdrant.get_collections.return_value.collections = [] qdrant_mock.return_value = qdrant http = AsyncMock() http.post.return_value.json.return_value = {"embeddings": [[0.1] * 1024]} http.post.return_value.raise_for_status = MagicMock() http_mock.return_value.__aenter__.return_value = http yield {"qdrant": qdrant, "http": http} def test_full_chunk_creation_pipeline(self, mock_all_services): """Test the full chunk creation pipeline.""" from legal_templates_ingestion import LegalTemplatesIngestion from github_crawler import ExtractedDocument from template_sources import SourceConfig, LicenseType ingestion = LegalTemplatesIngestion() doc = ExtractedDocument( text="# Datenschutzerklaerung\n\nWir nehmen den Schutz Ihrer personenbezogenen Daten sehr ernst. Diese Datenschutzerklaerung informiert Sie ueber die Verarbeitung Ihrer Daten gemaess der DSGVO.", title="Datenschutzerklaerung", file_path="privacy.md", file_type="markdown", source_url="https://example.com/privacy.md", source_commit="abc123", placeholders=["[FIRMENNAME]"], language="de", # Explicitly set language ) source = SourceConfig( name="test-source", license_type=LicenseType.CC0, template_types=["privacy_policy"], languages=["de"], jurisdiction="DE", description="Test source", repo_url="https://github.com/test/repo", ) chunks = ingestion._create_chunks(doc, source) assert len(chunks) >= 1 assert chunks[0].template_type == "privacy_policy" assert chunks[0].language == "de" assert chunks[0].jurisdiction == "DE" assert chunks[0].license_id == "cc0" assert chunks[0].attribution_required is False assert "[FIRMENNAME]" in chunks[0].placeholders # ============================================================================= # Test Runner Configuration # ============================================================================= if __name__ == "__main__": pytest.main([__file__, "-v"])