Services: Admin-Lehrer, Backend-Lehrer, Studio v2, Website, Klausur-Service, School-Service, Voice-Service, Geo-Service, BreakPilot Drive, Agent-Core Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
624 lines
22 KiB
Python
624 lines
22 KiB
Python
"""
|
|
Tests for Legal Templates RAG System.
|
|
|
|
Tests template_sources.py, github_crawler.py, legal_templates_ingestion.py,
|
|
and the admin API endpoints for legal templates.
|
|
"""
|
|
|
|
import pytest
|
|
from unittest.mock import AsyncMock, MagicMock, patch
|
|
from datetime import datetime
|
|
import json
|
|
|
|
|
|
# =============================================================================
|
|
# Template Sources Tests
|
|
# =============================================================================
|
|
|
|
class TestLicenseType:
    """Checks on the string values carried by ``LicenseType`` members."""

    def test_license_types_exist(self):
        """Each expected member exists and exposes its expected ``.value``."""
        from template_sources import LicenseType

        # Member name -> expected serialized value, checked in this order.
        expected_values = {
            "PUBLIC_DOMAIN": "public_domain",
            "CC0": "cc0",
            "UNLICENSE": "unlicense",
            "MIT": "mit",
            "CC_BY_4": "cc_by_4",
            "REUSE_NOTICE": "reuse_notice",
        }

        for member_name, serialized in expected_values.items():
            assert getattr(LicenseType, member_name).value == serialized
|
|
|
|
|
|
class TestLicenseInfo:
    """Checks on ``LicenseInfo`` construction and attribution text."""

    def test_license_info_creation(self):
        """A LicenseInfo built with four fields keeps them and reports the permission flags as True."""
        from template_sources import LicenseInfo, LicenseType

        fields = {
            "id": LicenseType.CC0,
            "name": "CC0 1.0 Universal",
            "url": "https://creativecommons.org/publicdomain/zero/1.0/",
            "attribution_required": False,
        }
        license_info = LicenseInfo(**fields)

        assert license_info.id == LicenseType.CC0
        assert license_info.attribution_required is False
        # training_allowed / output_allowed were not passed in, so these
        # assertions pin whatever LicenseInfo supplies on its own.
        assert license_info.training_allowed is True
        assert license_info.output_allowed is True

    def test_get_attribution_text_no_attribution(self):
        """With attribution_required=False the attribution text is the empty string."""
        from template_sources import LicenseInfo, LicenseType

        license_info = LicenseInfo(
            id=LicenseType.CC0,
            name="CC0",
            url="https://example.com",
            attribution_required=False,
        )

        attribution = license_info.get_attribution_text("Test Source", "https://test.com")

        assert attribution == ""

    def test_get_attribution_text_with_template(self):
        """With a template set, the source name and URL both appear in the result."""
        from template_sources import LicenseInfo, LicenseType

        license_info = LicenseInfo(
            id=LicenseType.MIT,
            name="MIT License",
            url="https://opensource.org/licenses/MIT",
            attribution_required=True,
            attribution_template="Based on [{source_name}]({source_url}) - MIT License",
        )

        attribution = license_info.get_attribution_text("Test Source", "https://test.com")

        for expected_fragment in ("Test Source", "https://test.com"):
            assert expected_fragment in attribution
|
|
|
|
|
|
class TestSourceConfig:
    """Checks on ``SourceConfig`` construction and its ``license_info`` accessor."""

    def test_source_config_creation(self):
        """Fields passed to SourceConfig are stored; ``enabled`` reads as True when not passed."""
        from template_sources import SourceConfig, LicenseType

        config = SourceConfig(
            name="test-source",
            license_type=LicenseType.CC0,
            template_types=["privacy_policy", "terms_of_service"],
            languages=["de", "en"],
            jurisdiction="DE",
            description="Test description",
            repo_url="https://github.com/test/repo",
        )

        assert config.name == "test-source"
        assert config.license_type == LicenseType.CC0
        assert "privacy_policy" in config.template_types
        assert config.enabled is True

    def test_source_config_license_info(self):
        """``license_info`` for an MIT source reports the MIT id and requires attribution."""
        # LICENSES is not referenced below; importing it still verifies that
        # the name is exported by template_sources.
        from template_sources import SourceConfig, LicenseType, LICENSES

        config = SourceConfig(
            name="test-source",
            license_type=LicenseType.MIT,
            template_types=["privacy_policy"],
            languages=["en"],
            jurisdiction="US",
            description="Test",
        )

        resolved_license = config.license_info

        assert resolved_license.id == LicenseType.MIT
        assert resolved_license.attribution_required is True
|
|
|
|
|
|
class TestTemplateSources:
    """Checks on the ``TEMPLATE_SOURCES`` list and its lookup helpers."""

    def test_template_sources_not_empty(self):
        """At least one template source is configured."""
        from template_sources import TEMPLATE_SOURCES

        assert len(TEMPLATE_SOURCES) > 0

    def test_github_site_policy_exists(self):
        """A source named ``github-site-policy`` exists and points at the expected repo."""
        from template_sources import TEMPLATE_SOURCES

        # First source with the wanted name, or None when there is none.
        policy_source = None
        for candidate in TEMPLATE_SOURCES:
            if candidate.name == "github-site-policy":
                policy_source = candidate
                break

        assert policy_source is not None
        assert policy_source.repo_url == "https://github.com/github/site-policy"

    def test_enabled_sources(self):
        """Everything returned by ``get_enabled_sources`` has a truthy ``enabled`` flag."""
        from template_sources import get_enabled_sources

        for source in get_enabled_sources():
            assert source.enabled

    def test_sources_by_priority(self):
        """``get_sources_by_priority`` never returns a source above the requested priority."""
        from template_sources import get_sources_by_priority

        # Asking for 1 must yield priority-1 sources only.
        priority_one = get_sources_by_priority(1)
        for source in priority_one:
            assert source.priority == 1

        # Asking for 2 may yield priorities 1 and 2.
        up_to_two = get_sources_by_priority(2)
        for source in up_to_two:
            assert source.priority <= 2

    def test_sources_by_license(self):
        """``get_sources_by_license`` returns only sources with the requested license type."""
        from template_sources import get_sources_by_license, LicenseType

        for source in get_sources_by_license(LicenseType.CC0):
            assert source.license_type == LicenseType.CC0
|
|
|
|
|
|
# =============================================================================
|
|
# GitHub Crawler Tests
|
|
# =============================================================================
|
|
|
|
class TestMarkdownParser:
    """Checks on ``MarkdownParser`` parsing and its private helper methods."""

    def test_parse_simple_markdown(self):
        """A small document with an h1 yields that title, the markdown type and its body text."""
        from github_crawler import MarkdownParser

        markdown_lines = [
            "# Test Title",
            "",
            "This is some content.",
            "",
            "## Section 1",
            "",
            "More content here.",
        ]
        # The document ends with a single trailing newline.
        content = "\n".join(markdown_lines) + "\n"

        parsed = MarkdownParser.parse(content, "test.md")

        assert parsed.title == "Test Title"
        assert parsed.file_type == "markdown"
        assert "content" in parsed.text

    def test_extract_title_from_heading(self):
        """The h1 heading text is used as the title when one is present."""
        from github_crawler import MarkdownParser

        source_text = "# My Document\n\nContent"

        assert MarkdownParser._extract_title(source_text, "fallback.md") == "My Document"

    def test_extract_title_fallback(self):
        """Without a heading the title is derived from the filename."""
        from github_crawler import MarkdownParser

        source_text = "No heading here"

        assert MarkdownParser._extract_title(source_text, "my-document.md") == "My Document"

    def test_detect_german_language(self):
        """A German sentence is detected as ``de``."""
        from github_crawler import MarkdownParser

        sample = "Dies ist eine Datenschutzerklaerung fuer die Verarbeitung personenbezogener Daten."

        assert MarkdownParser._detect_language(sample) == "de"

    def test_detect_english_language(self):
        """An English sentence is detected as ``en``."""
        from github_crawler import MarkdownParser

        sample = "This is a privacy policy for processing personal data in our application."

        assert MarkdownParser._detect_language(sample) == "en"

    def test_find_placeholders(self):
        """Bracket, brace and double-underscore placeholders are all reported."""
        from github_crawler import MarkdownParser

        sample = "Company: [COMPANY_NAME], Contact: {email}, Address: __ADDRESS__"

        found = MarkdownParser._find_placeholders(sample)

        for token in ("[COMPANY_NAME]", "{email}", "__ADDRESS__"):
            assert token in found
|
|
|
|
|
|
class TestHTMLParser:
    """Checks on ``HTMLParser`` parsing and its text-extraction helper."""

    def test_parse_simple_html(self):
        """A minimal page yields the <title> text, the html type and the visible body text."""
        from github_crawler import HTMLParser

        page_lines = [
            "<!DOCTYPE html>",
            "<html>",
            "<head><title>Test Page</title></head>",
            "<body>",
            "<h1>Welcome</h1>",
            "<p>This is content.</p>",
            "</body>",
            "</html>",
        ]
        content = "\n".join(page_lines)

        parsed = HTMLParser.parse(content, "test.html")

        assert parsed.title == "Test Page"
        assert parsed.file_type == "html"
        for visible_fragment in ("Welcome", "content"):
            assert visible_fragment in parsed.text

    def test_html_to_text_removes_scripts(self):
        """Script bodies are dropped while the surrounding paragraph text is kept."""
        from github_crawler import HTMLParser

        markup = "<p>Text</p><script>alert('bad');</script><p>More</p>"

        plain = HTMLParser._html_to_text(markup)

        assert "alert" not in plain
        assert "Text" in plain
        assert "More" in plain
|
|
|
|
|
|
class TestJSONParser:
    """Checks on ``JSONParser.parse`` for flat and nested payloads."""

    def test_parse_simple_json(self):
        """A flat object with title/text/language produces exactly one document."""
        from github_crawler import JSONParser

        payload = {
            "title": "Privacy Policy",
            "text": "This is the privacy policy content.",
            "language": "en",
        }

        documents = JSONParser.parse(json.dumps(payload), "policy.json")

        assert len(documents) == 1
        only_document = documents[0]
        assert only_document.title == "Privacy Policy"
        assert "privacy policy content" in only_document.text

    def test_parse_nested_json(self):
        """Two title/text objects nested under ``sections`` produce at least two documents."""
        from github_crawler import JSONParser

        payload = {
            "sections": {
                "intro": {"title": "Introduction", "text": "Welcome text"},
                "data": {"title": "Data Collection", "text": "Collection info"},
            }
        }

        documents = JSONParser.parse(json.dumps(payload), "nested.json")

        # Each nested section is expected to surface as its own document.
        assert len(documents) >= 2
|
|
|
|
|
|
class TestExtractedDocument:
    """Checks on the ``ExtractedDocument`` dataclass."""

    def test_extracted_document_hash(self):
        """``source_hash`` is populated without being passed in and is 64 characters long."""
        from github_crawler import ExtractedDocument

        document_fields = {
            "text": "Some content",
            "title": "Test",
            "file_path": "test.md",
            "file_type": "markdown",
            "source_url": "https://example.com",
        }
        document = ExtractedDocument(**document_fields)

        assert document.source_hash != ""
        # 64 characters is the length of a SHA256 hex digest.
        assert len(document.source_hash) == 64
|
|
|
|
|
|
# =============================================================================
|
|
# Legal Templates Ingestion Tests
|
|
# =============================================================================
|
|
|
|
class TestLegalTemplatesIngestion:
    """Unit checks for helper methods on ``LegalTemplatesIngestion``.

    Every test constructs the ingestion object while
    ``legal_templates_ingestion.QdrantClient`` is patched, so no real Qdrant
    client is created.
    """

    @pytest.fixture
    def mock_qdrant(self):
        """Yield a MagicMock Qdrant client whose collection list is empty."""
        with patch('legal_templates_ingestion.QdrantClient') as qdrant_cls:
            fake_client = MagicMock()
            fake_client.get_collections.return_value.collections = []
            qdrant_cls.return_value = fake_client
            yield fake_client

    @pytest.fixture
    def mock_http_client(self):
        """Yield an AsyncMock returned in place of ``httpx.AsyncClient``."""
        with patch('legal_templates_ingestion.httpx.AsyncClient') as http_cls:
            fake_client = AsyncMock()
            http_cls.return_value = fake_client
            yield fake_client

    def test_chunk_text_short(self):
        """Text shorter than ``chunk_size`` comes back as one unchanged chunk."""
        from legal_templates_ingestion import LegalTemplatesIngestion

        with patch('legal_templates_ingestion.QdrantClient'):
            pipeline = LegalTemplatesIngestion()
            pieces = pipeline._chunk_text("Short text", chunk_size=1000)

        assert len(pieces) == 1
        assert pieces[0] == "Short text"

    def test_chunk_text_long(self):
        """Text longer than ``chunk_size`` is split into several bounded chunks."""
        from legal_templates_ingestion import LegalTemplatesIngestion

        with patch('legal_templates_ingestion.QdrantClient'):
            pipeline = LegalTemplatesIngestion()

        # 100 repetitions of a 20-character sentence: well above chunk_size.
        repeated_text = "This is a sentence. " * 100
        pieces = pipeline._chunk_text(repeated_text, chunk_size=200, overlap=50)

        assert len(pieces) > 1
        # chunk_size is 200; the limit of 250 leaves some buffer.
        for piece in pieces:
            assert len(piece) <= 250

    def test_split_sentences(self):
        """Three German sentences are split into three items."""
        from legal_templates_ingestion import LegalTemplatesIngestion

        with patch('legal_templates_ingestion.QdrantClient'):
            pipeline = LegalTemplatesIngestion()
            paragraph = "Dies ist Satz eins. Dies ist Satz zwei. Und Satz drei."
            parts = pipeline._split_sentences(paragraph)

        assert len(parts) == 3

    def test_split_sentences_preserves_abbreviations(self):
        """The abbreviation ``z.B.`` does not end a sentence."""
        from legal_templates_ingestion import LegalTemplatesIngestion

        with patch('legal_templates_ingestion.QdrantClient'):
            pipeline = LegalTemplatesIngestion()
            paragraph = "Das ist z.B. ein Beispiel. Und noch ein Satz."
            parts = pipeline._split_sentences(paragraph)

        assert len(parts) == 2
        first_sentence = parts[0]
        assert "z.B." in first_sentence or "z.b." in first_sentence.lower()

    def test_infer_template_type_privacy(self):
        """A German privacy-policy text is classified as ``privacy_policy``."""
        from legal_templates_ingestion import LegalTemplatesIngestion
        from github_crawler import ExtractedDocument
        from template_sources import SourceConfig, LicenseType

        with patch('legal_templates_ingestion.QdrantClient'):
            pipeline = LegalTemplatesIngestion()

        document = ExtractedDocument(
            text="Diese Datenschutzerklaerung informiert Sie ueber die Verarbeitung personenbezogener Daten.",
            title="Datenschutz",
            file_path="privacy.md",
            file_type="markdown",
            source_url="https://example.com",
        )
        config = SourceConfig(
            name="test",
            license_type=LicenseType.CC0,
            template_types=["privacy_policy"],
            languages=["de"],
            jurisdiction="DE",
            description="Test",
        )

        inferred = pipeline._infer_template_type(document, config)

        assert inferred == "privacy_policy"

    def test_infer_clause_category(self):
        """Liability and data-protection clauses map to ``haftung`` and ``datenschutz``."""
        from legal_templates_ingestion import LegalTemplatesIngestion

        with patch('legal_templates_ingestion.QdrantClient'):
            pipeline = LegalTemplatesIngestion()

        # Liability clause.
        liability_clause = "Die Haftung des Anbieters ist auf grobe Fahrlässigkeit beschränkt."
        assert pipeline._infer_clause_category(liability_clause) == "haftung"

        # Privacy clause.
        privacy_clause = "Wir verarbeiten personenbezogene Daten gemäß der DSGVO."
        assert pipeline._infer_clause_category(privacy_clause) == "datenschutz"
|
|
|
|
|
|
# =============================================================================
|
|
# Admin API Templates Tests
|
|
# =============================================================================
|
|
|
|
class TestTemplatesAdminAPI:
    """Tests for the ingestion-status state of /api/v1/admin/templates/*.

    Only the module-level ``_templates_ingestion_status`` dict from
    ``admin_api`` is exercised here; no HTTP request is made.  That dict is
    shared by every test in the process, so each test writes to it through
    ``patch.dict``, which restores the original contents on exit.  Without
    this, a value such as ``running=True`` set by one test would still be
    present when a later test (or module) reads the dict.
    """

    def test_templates_status_structure(self):
        """Test the structure of the status dict after it has been reset."""
        from admin_api import _templates_ingestion_status

        # Idle state: nothing running, no previous run, no results.
        idle_state = {
            "running": False,
            "last_run": None,
            "current_source": None,
            "results": {},
        }

        with patch.dict(_templates_ingestion_status, idle_state):
            assert _templates_ingestion_status["running"] is False
            assert _templates_ingestion_status["results"] == {}

    def test_templates_status_running(self):
        """Test the status dict while an ingestion is marked as running."""
        from admin_api import _templates_ingestion_status

        running_state = {
            "running": True,
            "current_source": "github-site-policy",
            "last_run": datetime.now().isoformat(),
        }

        with patch.dict(_templates_ingestion_status, running_state):
            assert _templates_ingestion_status["running"] is True
            assert _templates_ingestion_status["current_source"] == "github-site-policy"

    def test_templates_results_tracking(self):
        """Test that per-source ingestion results can be stored and read back."""
        from admin_api import _templates_ingestion_status

        per_source_results = {
            "github-site-policy": {
                "status": "completed",
                "documents_found": 15,
                "chunks_indexed": 42,
                "errors": [],
            },
            "opr-vc": {
                "status": "failed",
                "documents_found": 0,
                "chunks_indexed": 0,
                "errors": ["Connection timeout"],
            },
        }

        with patch.dict(_templates_ingestion_status, {"results": per_source_results}):
            results = _templates_ingestion_status["results"]

            assert results["github-site-policy"]["status"] == "completed"
            assert results["github-site-policy"]["chunks_indexed"] == 42
            assert results["opr-vc"]["status"] == "failed"
            assert len(results["opr-vc"]["errors"]) > 0
|
|
|
|
|
|
class TestTemplateTypeLabels:
    """Checks on the ``TEMPLATE_TYPES`` and ``JURISDICTIONS`` constants."""

    def test_template_types_defined(self):
        """Every expected template type key is present in ``TEMPLATE_TYPES``."""
        from template_sources import TEMPLATE_TYPES

        required_types = (
            "privacy_policy",
            "terms_of_service",
            "cookie_banner",
            "impressum",
            "widerruf",
            "dpa",
        )

        for type_key in required_types:
            assert type_key in TEMPLATE_TYPES

    def test_jurisdictions_defined(self):
        """Every expected jurisdiction code is present in ``JURISDICTIONS``."""
        from template_sources import JURISDICTIONS

        required_codes = ("DE", "AT", "CH", "EU", "US")

        for code in required_codes:
            assert code in JURISDICTIONS
|
|
|
|
|
|
# =============================================================================
|
|
# Qdrant Service Templates Tests
|
|
# =============================================================================
|
|
|
|
class TestQdrantServiceTemplates:
    """Checks on the legal-templates constants exported by ``qdrant_service``."""

    @pytest.fixture
    def mock_qdrant_client(self):
        """Yield a MagicMock returned by ``get_qdrant_client``; its collection list is empty."""
        with patch('qdrant_service.get_qdrant_client') as client_factory:
            fake_client = MagicMock()
            fake_client.get_collections.return_value.collections = []
            client_factory.return_value = fake_client
            yield fake_client

    def test_legal_templates_collection_name(self):
        """The collection constant equals ``bp_legal_templates``."""
        from qdrant_service import LEGAL_TEMPLATES_COLLECTION

        expected_name = "bp_legal_templates"

        assert LEGAL_TEMPLATES_COLLECTION == expected_name

    def test_legal_templates_vector_size(self):
        """The vector size constant equals 1024, the size used for BGE-M3."""
        from qdrant_service import LEGAL_TEMPLATES_VECTOR_SIZE

        expected_size = 1024

        assert LEGAL_TEMPLATES_VECTOR_SIZE == expected_size
|
|
|
|
|
|
# =============================================================================
|
|
# Integration Tests (require mocking external services)
|
|
# =============================================================================
|
|
|
|
class TestTemplatesIntegration:
    """End-to-end style checks that run with Qdrant and HTTP access mocked out."""

    @pytest.fixture
    def mock_all_services(self):
        """Patch the Qdrant client and ``httpx.AsyncClient``; yield both mocks in a dict.

        The yielded dict has the keys ``"qdrant"`` and ``"http"``.
        """
        with patch('legal_templates_ingestion.QdrantClient') as qdrant_cls:
            with patch('legal_templates_ingestion.httpx.AsyncClient') as http_cls:
                fake_qdrant = MagicMock()
                fake_qdrant.get_collections.return_value.collections = []
                qdrant_cls.return_value = fake_qdrant

                fake_http = AsyncMock()
                # POST responses carry a single 1024-dimensional embedding.
                fake_http.post.return_value.json.return_value = {"embeddings": [[0.1] * 1024]}
                fake_http.post.return_value.raise_for_status = MagicMock()
                # The mock is handed out by ``async with AsyncClient() as ...``.
                http_cls.return_value.__aenter__.return_value = fake_http

                yield {"qdrant": fake_qdrant, "http": fake_http}

    def test_full_chunk_creation_pipeline(self, mock_all_services):
        """``_create_chunks`` copies document and source metadata onto the first chunk."""
        from legal_templates_ingestion import LegalTemplatesIngestion
        from github_crawler import ExtractedDocument
        from template_sources import SourceConfig, LicenseType

        pipeline = LegalTemplatesIngestion()

        document = ExtractedDocument(
            text=(
                "# Datenschutzerklaerung\n\n"
                "Wir nehmen den Schutz Ihrer personenbezogenen Daten sehr ernst. "
                "Diese Datenschutzerklaerung informiert Sie ueber die Verarbeitung "
                "Ihrer Daten gemaess der DSGVO."
            ),
            title="Datenschutzerklaerung",
            file_path="privacy.md",
            file_type="markdown",
            source_url="https://example.com/privacy.md",
            source_commit="abc123",
            placeholders=["[FIRMENNAME]"],
            language="de",  # set explicitly rather than left unset
        )
        config = SourceConfig(
            name="test-source",
            license_type=LicenseType.CC0,
            template_types=["privacy_policy"],
            languages=["de"],
            jurisdiction="DE",
            description="Test source",
            repo_url="https://github.com/test/repo",
        )

        produced = pipeline._create_chunks(document, config)

        assert len(produced) >= 1
        assert produced[0].template_type == "privacy_policy"
        assert produced[0].language == "de"
        assert produced[0].jurisdiction == "DE"
        assert produced[0].license_id == "cc0"
        assert produced[0].attribution_required is False
        assert "[FIRMENNAME]" in produced[0].placeholders
|
|
|
|
|
|
# =============================================================================
|
|
# Test Runner Configuration
|
|
# =============================================================================
|
|
|
|
if __name__ == "__main__":
    # Run this file's tests verbosely when executed as a script.  pytest.main
    # returns the exit code instead of exiting, so it is raised as SystemExit;
    # otherwise the process would exit 0 even when tests fail.
    raise SystemExit(pytest.main([__file__, "-v"]))
|