Files
breakpilot-lehrer/klausur-service/backend/tests/test_legal_templates.py
Benjamin Boenisch 5a31f52310 Initial commit: breakpilot-lehrer - Lehrer KI Platform
Services: Admin-Lehrer, Backend-Lehrer, Studio v2, Website,
Klausur-Service, School-Service, Voice-Service, Geo-Service,
BreakPilot Drive, Agent-Core

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-11 23:47:26 +01:00

624 lines
22 KiB
Python

"""
Tests for Legal Templates RAG System.
Tests template_sources.py, github_crawler.py, legal_templates_ingestion.py,
and the admin API endpoints for legal templates.
"""
import pytest
from unittest.mock import AsyncMock, MagicMock, patch
from datetime import datetime
import json
# =============================================================================
# Template Sources Tests
# =============================================================================
class TestLicenseType:
    """Checks the string values carried by the LicenseType enum members."""

    def test_license_types_exist(self):
        """Each expected LicenseType member exposes its expected ``.value``."""
        from template_sources import LicenseType

        # Member name -> serialized value, checked in declaration order.
        expected_values = {
            "PUBLIC_DOMAIN": "public_domain",
            "CC0": "cc0",
            "UNLICENSE": "unlicense",
            "MIT": "mit",
            "CC_BY_4": "cc_by_4",
            "REUSE_NOTICE": "reuse_notice",
        }
        for member_name, serialized in expected_values.items():
            # getattr raises AttributeError for a missing member, exactly as
            # direct attribute access would.
            assert getattr(LicenseType, member_name).value == serialized
class TestLicenseInfo:
    """Exercises LicenseInfo construction and attribution text rendering."""

    def test_license_info_creation(self):
        """A LicenseInfo built from four fields reports the expected attributes."""
        from template_sources import LicenseInfo, LicenseType

        cc0 = LicenseInfo(
            id=LicenseType.CC0,
            name="CC0 1.0 Universal",
            url="https://creativecommons.org/publicdomain/zero/1.0/",
            attribution_required=False,
        )

        assert cc0.id == LicenseType.CC0
        assert cc0.attribution_required is False
        # Neither flag below was passed to the constructor, so these check the
        # values LicenseInfo supplies on its own.
        assert cc0.training_allowed is True
        assert cc0.output_allowed is True

    def test_get_attribution_text_no_attribution(self):
        """No attribution text is produced when attribution is not required."""
        from template_sources import LicenseInfo, LicenseType

        no_attribution = LicenseInfo(
            id=LicenseType.CC0,
            name="CC0",
            url="https://example.com",
            attribution_required=False,
        )

        rendered = no_attribution.get_attribution_text("Test Source", "https://test.com")

        assert rendered == ""

    def test_get_attribution_text_with_template(self):
        """The rendered attribution contains both the source name and its URL."""
        from template_sources import LicenseInfo, LicenseType

        mit = LicenseInfo(
            id=LicenseType.MIT,
            name="MIT License",
            url="https://opensource.org/licenses/MIT",
            attribution_required=True,
            attribution_template="Based on [{source_name}]({source_url}) - MIT License",
        )

        rendered = mit.get_attribution_text("Test Source", "https://test.com")

        for fragment in ("Test Source", "https://test.com"):
            assert fragment in rendered
class TestSourceConfig:
    """Covers SourceConfig construction and its ``license_info`` attribute."""

    def test_source_config_creation(self):
        """A fully specified SourceConfig keeps the fields it was given."""
        from template_sources import SourceConfig, LicenseType

        config = SourceConfig(
            name="test-source",
            license_type=LicenseType.CC0,
            template_types=["privacy_policy", "terms_of_service"],
            languages=["de", "en"],
            jurisdiction="DE",
            description="Test description",
            repo_url="https://github.com/test/repo",
        )

        assert config.name == "test-source"
        assert config.license_type == LicenseType.CC0
        assert "privacy_policy" in config.template_types
        # `enabled` was not passed above; this checks the value SourceConfig
        # supplies on its own.
        assert config.enabled is True

    def test_source_config_license_info(self):
        """``license_info`` of an MIT source reports MIT and requires attribution."""
        # LICENSES is not used below; importing it still makes the test fail
        # if template_sources stops exporting that name.
        from template_sources import SourceConfig, LicenseType, LICENSES

        mit_source = SourceConfig(
            name="test-source",
            license_type=LicenseType.MIT,
            template_types=["privacy_policy"],
            languages=["en"],
            jurisdiction="US",
            description="Test",
        )

        resolved = mit_source.license_info

        assert resolved.id == LicenseType.MIT
        assert resolved.attribution_required is True
class TestTemplateSources:
    """Checks the TEMPLATE_SOURCES registry and its filter helpers."""

    def test_template_sources_not_empty(self):
        """At least one template source is registered."""
        from template_sources import TEMPLATE_SOURCES

        source_count = len(TEMPLATE_SOURCES)
        assert source_count > 0

    def test_github_site_policy_exists(self):
        """The ``github-site-policy`` source is registered with its repo URL."""
        from template_sources import TEMPLATE_SOURCES

        # Take the first entry with the wanted name, or None if there is none.
        match = None
        for candidate in TEMPLATE_SOURCES:
            if candidate.name == "github-site-policy":
                match = candidate
                break

        assert match is not None
        assert match.repo_url == "https://github.com/github/site-policy"

    def test_enabled_sources(self):
        """Every source returned by ``get_enabled_sources`` is enabled."""
        from template_sources import get_enabled_sources

        for source in get_enabled_sources():
            assert source.enabled

    def test_sources_by_priority(self):
        """``get_sources_by_priority(n)`` returns nothing above priority ``n``."""
        from template_sources import get_sources_by_priority

        # Limit 1: only priority-1 sources may come back.
        for source in get_sources_by_priority(1):
            assert source.priority == 1

        # Limit 2: priorities 1 and 2 are both acceptable.
        for source in get_sources_by_priority(2):
            assert source.priority <= 2

    def test_sources_by_license(self):
        """Filtering by CC0 yields only CC0-licensed sources."""
        from template_sources import get_sources_by_license, LicenseType

        for source in get_sources_by_license(LicenseType.CC0):
            assert source.license_type == LicenseType.CC0
# =============================================================================
# GitHub Crawler Tests
# =============================================================================
class TestMarkdownParser:
    """Exercises MarkdownParser parsing, title, language and placeholder helpers."""

    def test_parse_simple_markdown(self):
        """Parsing a small markdown document yields title, file type and text."""
        from github_crawler import MarkdownParser

        markdown_source = (
            "# Test Title\n"
            "This is some content.\n"
            "## Section 1\n"
            "More content here.\n"
        )

        parsed = MarkdownParser.parse(markdown_source, "test.md")

        assert parsed.title == "Test Title"
        assert parsed.file_type == "markdown"
        assert "content" in parsed.text

    def test_extract_title_from_heading(self):
        """An h1 heading is used as the document title."""
        from github_crawler import MarkdownParser

        extracted = MarkdownParser._extract_title("# My Document\n\nContent", "fallback.md")

        assert extracted == "My Document"

    def test_extract_title_fallback(self):
        """Without a heading, the title is derived from the filename."""
        from github_crawler import MarkdownParser

        extracted = MarkdownParser._extract_title("No heading here", "my-document.md")

        assert extracted == "My Document"

    def test_detect_german_language(self):
        """A German sentence is detected as ``de``."""
        from github_crawler import MarkdownParser

        sample = "Dies ist eine Datenschutzerklaerung fuer die Verarbeitung personenbezogener Daten."

        assert MarkdownParser._detect_language(sample) == "de"

    def test_detect_english_language(self):
        """An English sentence is detected as ``en``."""
        from github_crawler import MarkdownParser

        sample = "This is a privacy policy for processing personal data in our application."

        assert MarkdownParser._detect_language(sample) == "en"

    def test_find_placeholders(self):
        """Bracket, brace and double-underscore placeholders are all found."""
        from github_crawler import MarkdownParser

        template_line = "Company: [COMPANY_NAME], Contact: {email}, Address: __ADDRESS__"

        found = MarkdownParser._find_placeholders(template_line)

        for token in ("[COMPANY_NAME]", "{email}", "__ADDRESS__"):
            assert token in found
class TestHTMLParser:
    """Exercises HTMLParser document parsing and text extraction."""

    def test_parse_simple_html(self):
        """Parsing a minimal HTML page yields its title, file type and body text."""
        from github_crawler import HTMLParser

        page = (
            "<!DOCTYPE html>\n"
            "<html>\n"
            "<head><title>Test Page</title></head>\n"
            "<body>\n"
            "<h1>Welcome</h1>\n"
            "<p>This is content.</p>\n"
            "</body>\n"
            "</html>"
        )

        parsed = HTMLParser.parse(page, "test.html")

        assert parsed.title == "Test Page"
        assert parsed.file_type == "html"
        for expected in ("Welcome", "content"):
            assert expected in parsed.text

    def test_html_to_text_removes_scripts(self):
        """Script contents are dropped while surrounding paragraph text is kept."""
        from github_crawler import HTMLParser

        markup = "<p>Text</p><script>alert('bad');</script><p>More</p>"

        plain = HTMLParser._html_to_text(markup)

        assert "alert" not in plain
        for kept in ("Text", "More"):
            assert kept in plain
class TestJSONParser:
    """Exercises JSONParser on flat and nested JSON payloads."""

    def test_parse_simple_json(self):
        """A flat JSON object with title/text produces exactly one document."""
        from github_crawler import JSONParser

        payload = {
            "title": "Privacy Policy",
            "text": "This is the privacy policy content.",
            "language": "en",
        }

        parsed = JSONParser.parse(json.dumps(payload), "policy.json")

        assert len(parsed) == 1
        only_doc = parsed[0]
        assert only_doc.title == "Privacy Policy"
        assert "privacy policy content" in only_doc.text

    def test_parse_nested_json(self):
        """Nested section objects are each extracted as a document."""
        from github_crawler import JSONParser

        payload = {
            "sections": {
                "intro": {"title": "Introduction", "text": "Welcome text"},
                "data": {"title": "Data Collection", "text": "Collection info"},
            }
        }

        parsed = JSONParser.parse(json.dumps(payload), "nested.json")

        # Two nested sections were supplied, so at least two documents are expected.
        assert len(parsed) >= 2
class TestExtractedDocument:
    """Checks the derived fields of the ExtractedDocument dataclass."""

    def test_extracted_document_hash(self):
        """``source_hash`` is populated even though it is not passed in."""
        from github_crawler import ExtractedDocument

        document = ExtractedDocument(
            text="Some content",
            title="Test",
            file_path="test.md",
            file_type="markdown",
            source_url="https://example.com",
        )

        digest = document.source_hash

        assert digest != ""
        # 64 characters is the length of a SHA256 hex digest.
        assert len(digest) == 64
# =============================================================================
# Legal Templates Ingestion Tests
# =============================================================================
class TestLegalTemplatesIngestion:
    """Exercises the text helpers of LegalTemplatesIngestion.

    Every test constructs the ingestion object while
    ``legal_templates_ingestion.QdrantClient`` is patched, so no real Qdrant
    client is created.
    """

    @pytest.fixture
    def mock_qdrant(self):
        """Yield a MagicMock that stands in for the Qdrant client instance."""
        fake_client = MagicMock()
        fake_client.get_collections.return_value.collections = []
        with patch('legal_templates_ingestion.QdrantClient') as client_cls:
            client_cls.return_value = fake_client
            yield fake_client

    @pytest.fixture
    def mock_http_client(self):
        """Yield an AsyncMock returned by the patched ``httpx.AsyncClient``."""
        fake_http = AsyncMock()
        with patch('legal_templates_ingestion.httpx.AsyncClient') as http_cls:
            http_cls.return_value = fake_http
            yield fake_http

    def test_chunk_text_short(self):
        """Text shorter than the chunk size comes back as a single chunk."""
        from legal_templates_ingestion import LegalTemplatesIngestion

        with patch('legal_templates_ingestion.QdrantClient'):
            pipeline = LegalTemplatesIngestion()
            pieces = pipeline._chunk_text("Short text", chunk_size=1000)

            assert len(pieces) == 1
            assert pieces[0] == "Short text"

    def test_chunk_text_long(self):
        """Text longer than the chunk size is split into bounded chunks."""
        from legal_templates_ingestion import LegalTemplatesIngestion

        with patch('legal_templates_ingestion.QdrantClient'):
            pipeline = LegalTemplatesIngestion()
            # 100 repetitions of a 20-character sentence: far above chunk_size.
            repeated = "This is a sentence. " * 100
            pieces = pipeline._chunk_text(repeated, chunk_size=200, overlap=50)

            assert len(pieces) > 1
            # Chunks may exceed chunk_size slightly; 250 is the tolerated ceiling.
            for piece in pieces:
                assert len(piece) <= 250

    def test_split_sentences(self):
        """Three German sentences are split into three items."""
        from legal_templates_ingestion import LegalTemplatesIngestion

        with patch('legal_templates_ingestion.QdrantClient'):
            pipeline = LegalTemplatesIngestion()
            paragraph = "Dies ist Satz eins. Dies ist Satz zwei. Und Satz drei."
            parts = pipeline._split_sentences(paragraph)

            assert len(parts) == 3

    def test_split_sentences_preserves_abbreviations(self):
        """The abbreviation ``z.B.`` does not end a sentence."""
        from legal_templates_ingestion import LegalTemplatesIngestion

        with patch('legal_templates_ingestion.QdrantClient'):
            pipeline = LegalTemplatesIngestion()
            paragraph = "Das ist z.B. ein Beispiel. Und noch ein Satz."
            parts = pipeline._split_sentences(paragraph)

            assert len(parts) == 2
            opening = parts[0]
            assert "z.B." in opening or "z.b." in opening.lower()

    def test_infer_template_type_privacy(self):
        """A German privacy text from a privacy-only source is typed as privacy_policy."""
        from legal_templates_ingestion import LegalTemplatesIngestion
        from github_crawler import ExtractedDocument
        from template_sources import SourceConfig, LicenseType

        with patch('legal_templates_ingestion.QdrantClient'):
            pipeline = LegalTemplatesIngestion()
            privacy_doc = ExtractedDocument(
                text="Diese Datenschutzerklaerung informiert Sie ueber die Verarbeitung personenbezogener Daten.",
                title="Datenschutz",
                file_path="privacy.md",
                file_type="markdown",
                source_url="https://example.com",
            )
            privacy_source = SourceConfig(
                name="test",
                license_type=LicenseType.CC0,
                template_types=["privacy_policy"],
                languages=["de"],
                jurisdiction="DE",
                description="Test",
            )

            inferred = pipeline._infer_template_type(privacy_doc, privacy_source)

            assert inferred == "privacy_policy"

    def test_infer_clause_category(self):
        """Liability and privacy clauses map to their respective categories."""
        from legal_templates_ingestion import LegalTemplatesIngestion

        with patch('legal_templates_ingestion.QdrantClient'):
            pipeline = LegalTemplatesIngestion()
            # (clause text, expected category): liability first, then privacy.
            cases = (
                ("Die Haftung des Anbieters ist auf grobe Fahrlässigkeit beschränkt.", "haftung"),
                ("Wir verarbeiten personenbezogene Daten gemäß der DSGVO.", "datenschutz"),
            )
            for clause, expected_category in cases:
                assert pipeline._infer_clause_category(clause) == expected_category
# =============================================================================
# Admin API Templates Tests
# =============================================================================
class TestTemplatesAdminAPI:
    """Tests for /api/v1/admin/templates/* endpoints.

    These tests only touch ``admin_api._templates_ingestion_status``, a
    module-level mapping shared by every test in the process. All writes go
    through ``patch.dict`` so the mapping is restored to its previous contents
    when each test ends; otherwise a value such as ``running=True`` would leak
    into whichever test runs next.
    """

    def test_templates_status_structure(self):
        """Test the structure of templates status response in the idle state."""
        from admin_api import _templates_ingestion_status

        idle_state = {
            "running": False,
            "last_run": None,
            "current_source": None,
            "results": {},
        }
        with patch.dict(_templates_ingestion_status, idle_state):
            assert _templates_ingestion_status["running"] is False
            assert _templates_ingestion_status["results"] == {}

    def test_templates_status_running(self):
        """Test status when ingestion is running."""
        from admin_api import _templates_ingestion_status

        running_state = {
            "running": True,
            "current_source": "github-site-policy",
            "last_run": datetime.now().isoformat(),
        }
        with patch.dict(_templates_ingestion_status, running_state):
            assert _templates_ingestion_status["running"] is True
            assert _templates_ingestion_status["current_source"] == "github-site-policy"

    def test_templates_results_tracking(self):
        """Test that ingestion results are tracked correctly per source."""
        from admin_api import _templates_ingestion_status

        per_source_results = {
            "github-site-policy": {
                "status": "completed",
                "documents_found": 15,
                "chunks_indexed": 42,
                "errors": [],
            },
            "opr-vc": {
                "status": "failed",
                "documents_found": 0,
                "chunks_indexed": 0,
                "errors": ["Connection timeout"],
            },
        }
        with patch.dict(_templates_ingestion_status, {"results": per_source_results}):
            results = _templates_ingestion_status["results"]

            assert results["github-site-policy"]["status"] == "completed"
            assert results["github-site-policy"]["chunks_indexed"] == 42
            assert results["opr-vc"]["status"] == "failed"
            assert len(results["opr-vc"]["errors"]) > 0
class TestTemplateTypeLabels:
    """Checks that the template type and jurisdiction constants hold expected keys."""

    def test_template_types_defined(self):
        """All expected template type keys are present in TEMPLATE_TYPES."""
        from template_sources import TEMPLATE_TYPES

        required_types = (
            "privacy_policy",
            "terms_of_service",
            "cookie_banner",
            "impressum",
            "widerruf",
            "dpa",
        )
        for type_key in required_types:
            assert type_key in TEMPLATE_TYPES

    def test_jurisdictions_defined(self):
        """All expected jurisdiction codes are present in JURISDICTIONS."""
        from template_sources import JURISDICTIONS

        for code in ("DE", "AT", "CH", "EU", "US"):
            assert code in JURISDICTIONS
# =============================================================================
# Qdrant Service Templates Tests
# =============================================================================
class TestQdrantServiceTemplates:
    """Checks the legal-templates constants exported by qdrant_service."""

    @pytest.fixture
    def mock_qdrant_client(self):
        """Yield a MagicMock returned by the patched ``get_qdrant_client``."""
        fake_client = MagicMock()
        fake_client.get_collections.return_value.collections = []
        with patch('qdrant_service.get_qdrant_client') as client_factory:
            client_factory.return_value = fake_client
            yield fake_client

    def test_legal_templates_collection_name(self):
        """The legal templates collection is named ``bp_legal_templates``."""
        from qdrant_service import LEGAL_TEMPLATES_COLLECTION

        expected_name = "bp_legal_templates"
        assert LEGAL_TEMPLATES_COLLECTION == expected_name

    def test_legal_templates_vector_size(self):
        """The vector size is 1024 (the original test cites BGE-M3 for this value)."""
        from qdrant_service import LEGAL_TEMPLATES_VECTOR_SIZE

        expected_dimensions = 1024
        assert LEGAL_TEMPLATES_VECTOR_SIZE == expected_dimensions
# =============================================================================
# Integration Tests (require mocking external services)
# =============================================================================
class TestTemplatesIntegration:
    """Integration-style tests that run the ingestion code against mocked services."""

    @pytest.fixture
    def mock_all_services(self):
        """Patch the Qdrant and HTTP clients and yield both mocks in a dict."""
        fake_qdrant = MagicMock()
        fake_qdrant.get_collections.return_value.collections = []

        fake_http = AsyncMock()
        # Canned embedding response: a single 1024-element vector.
        fake_http.post.return_value.json.return_value = {"embeddings": [[0.1] * 1024]}
        fake_http.post.return_value.raise_for_status = MagicMock()

        with patch('legal_templates_ingestion.QdrantClient') as qdrant_cls, \
                patch('legal_templates_ingestion.httpx.AsyncClient') as http_cls:
            qdrant_cls.return_value = fake_qdrant
            # The mock is handed out through the async context manager protocol.
            http_cls.return_value.__aenter__.return_value = fake_http
            yield {"qdrant": fake_qdrant, "http": fake_http}

    def test_full_chunk_creation_pipeline(self, mock_all_services):
        """Chunks created from a German privacy document carry source metadata."""
        from legal_templates_ingestion import LegalTemplatesIngestion
        from github_crawler import ExtractedDocument
        from template_sources import SourceConfig, LicenseType

        pipeline = LegalTemplatesIngestion()
        privacy_doc = ExtractedDocument(
            text=(
                "# Datenschutzerklaerung\n\n"
                "Wir nehmen den Schutz Ihrer personenbezogenen Daten sehr ernst. "
                "Diese Datenschutzerklaerung informiert Sie ueber die Verarbeitung "
                "Ihrer Daten gemaess der DSGVO."
            ),
            title="Datenschutzerklaerung",
            file_path="privacy.md",
            file_type="markdown",
            source_url="https://example.com/privacy.md",
            source_commit="abc123",
            placeholders=["[FIRMENNAME]"],
            language="de",  # set explicitly rather than left to detection
        )
        cc0_source = SourceConfig(
            name="test-source",
            license_type=LicenseType.CC0,
            template_types=["privacy_policy"],
            languages=["de"],
            jurisdiction="DE",
            description="Test source",
            repo_url="https://github.com/test/repo",
        )

        produced = pipeline._create_chunks(privacy_doc, cc0_source)

        assert len(produced) >= 1
        assert produced[0].template_type == "privacy_policy"
        assert produced[0].language == "de"
        assert produced[0].jurisdiction == "DE"
        assert produced[0].license_id == "cc0"
        assert produced[0].attribution_required is False
        assert "[FIRMENNAME]" in produced[0].placeholders
# =============================================================================
# Test Runner Configuration
# =============================================================================
if __name__ == "__main__":
    # pytest.main() returns the run's exit code; raise it as the process exit
    # status so running this file directly reports failures to the shell or CI
    # instead of always exiting 0.
    raise SystemExit(pytest.main([__file__, "-v"]))