"""
Tests for the Legal Crawler service.

Covers crawling and parsing of legal education content.
"""

from typing import Optional
from unittest.mock import AsyncMock, MagicMock, patch

import httpx
import pytest
from bs4 import BeautifulSoup

from llm_gateway.services.legal_crawler import (
    CrawledDocument,
    LegalCrawler,
    get_legal_crawler,
)


def _mock_async_client(
    response: Optional[MagicMock] = None,
    error: Optional[Exception] = None,
) -> AsyncMock:
    """Build a stand-in for an ``httpx.AsyncClient`` instance.

    The returned mock can be used as an async context manager that yields
    itself, mirroring ``async with httpx.AsyncClient() as client``.

    Args:
        response: Object returned by ``await client.get(...)``. Ignored when
            ``error`` is given.
        error: Exception raised by ``await client.get(...)`` instead of
            returning a response.

    Returns:
        The configured ``AsyncMock`` client instance.
    """
    client = AsyncMock()
    if error is not None:
        client.get.side_effect = error
    else:
        client.get.return_value = response
    # ``async with`` must hand back the very same mock so that the ``get``
    # configuration above is what the code under test sees.
    client.__aenter__.return_value = client
    client.__aexit__.return_value = None
    return client


class TestLegalCrawler:
    """Tests for the LegalCrawler class."""

    def test_crawler_initialization(self):
        """A crawler built without arguments carries the expected defaults."""
        crawler = LegalCrawler()

        assert crawler.user_agent == "BreakPilot-Crawler/1.0 (Educational Purpose)"
        assert crawler.timeout == 30.0
        assert crawler.rate_limit_delay == 1.0
        assert crawler.db_pool is None

    def test_crawler_with_db_pool(self):
        """A DB pool passed to the constructor is stored on the crawler."""
        mock_pool = MagicMock()

        crawler = LegalCrawler(db_pool=mock_pool)

        # Identity check: the crawler must keep the exact object it was given.
        assert crawler.db_pool is mock_pool


class TestCrawledDocument:
    """Tests for the CrawledDocument dataclass."""

    def test_document_creation(self):
        """A CrawledDocument can be created with all fields populated."""
        doc = CrawledDocument(
            url="https://example.com/schulgesetz",
            canonical_url="https://example.com/schulgesetz",
            title="Schulgesetz NRW",
            content="§ 1 Bildungsauftrag...",
            content_hash="abc123",
            category="legal",
            doc_type="schulgesetz",
            state="NW",
            law_name="SchulG NRW",
            paragraphs=[{"nr": "§ 1", "title": "Bildungsauftrag"}],
            trust_score=0.9,
        )

        assert doc.url == "https://example.com/schulgesetz"
        assert doc.state == "NW"
        assert doc.law_name == "SchulG NRW"
        assert len(doc.paragraphs) == 1

    def test_document_without_optional_fields(self):
        """A CrawledDocument accepts ``None`` for its optional fields."""
        doc = CrawledDocument(
            url="https://example.com/info",
            canonical_url=None,
            title="Info Page",
            content="Some content",
            content_hash="def456",
            category="legal",
            doc_type="info",
            state=None,
            law_name=None,
            paragraphs=None,
            trust_score=0.5,
        )

        assert doc.state is None
        assert doc.paragraphs is None


class TestParagraphExtraction:
    """Tests for paragraph (§) extraction."""

    def test_extract_paragraphs_from_html(self):
        """Paragraphs are extracted from the supplied content."""
        crawler = LegalCrawler()
        # Fixture text is kept verbatim; only the literal is split for width.
        html_content = (
            " § 1 Bildungsauftrag Die Schule hat den Auftrag..."
            " § 2 Erziehungsauftrag Die Schule erzieht..."
            " § 42 Pflichten der Eltern Die Eltern sind verpflichtet... "
        )
        soup = BeautifulSoup("", "html.parser")

        paragraphs = crawler._extract_paragraphs(soup, html_content)

        assert paragraphs is not None
        assert len(paragraphs) >= 3
        # Verify that § 42 was found.
        para_numbers = [p["nr"] for p in paragraphs]
        assert any("42" in nr for nr in para_numbers)

    def test_extract_paragraphs_empty_content(self):
        """Empty content yields no paragraphs (``None`` or empty)."""
        crawler = LegalCrawler()
        soup = BeautifulSoup("", "html.parser")

        paragraphs = crawler._extract_paragraphs(soup, "")

        assert not paragraphs

    def test_extract_paragraphs_no_pattern_match(self):
        """Content without any § pattern yields no paragraphs."""
        crawler = LegalCrawler()
        soup = BeautifulSoup("", "html.parser")

        paragraphs = crawler._extract_paragraphs(
            soup, "Just some text without paragraphs"
        )

        assert not paragraphs


class TestCrawlUrl:
    """Tests for URL crawling."""

    @pytest.mark.asyncio
    async def test_crawl_url_html_success(self):
        """Crawling an HTML URL successfully returns a populated document."""
        crawler = LegalCrawler()
        mock_response = MagicMock()
        mock_response.status_code = 200
        mock_response.headers = {"content-type": "text/html; charset=utf-8"}
        # Fixture text is kept verbatim, including its line breaks.
        mock_response.text = (
            " Schulgesetz NRW\n"
            "§ 1 Bildungsauftrag Die Schule hat den Auftrag...\n"
        )
        mock_response.url = "https://example.com/schulgesetz"
        seed_info = {"name": "SchulG NRW", "state": "NW", "trust_boost": 0.95}

        client = _mock_async_client(response=mock_response)
        with patch("httpx.AsyncClient", return_value=client):
            doc = await crawler.crawl_url(
                "https://example.com/schulgesetz", seed_info
            )

        assert doc is not None
        assert doc.title == "Schulgesetz NRW"
        assert doc.state == "NW"
        assert doc.trust_score == 0.95

    @pytest.mark.asyncio
    async def test_crawl_url_404_returns_none(self):
        """A 404 response makes ``crawl_url`` return ``None``."""
        crawler = LegalCrawler()
        mock_response = MagicMock()
        mock_response.status_code = 404

        client = _mock_async_client(response=mock_response)
        with patch("httpx.AsyncClient", return_value=client):
            doc = await crawler.crawl_url("https://example.com/notfound", {})

        assert doc is None

    @pytest.mark.asyncio
    async def test_crawl_url_network_error_returns_none(self):
        """A network error makes ``crawl_url`` return ``None``."""
        crawler = LegalCrawler()

        client = _mock_async_client(error=httpx.ConnectError("Network error"))
        with patch("httpx.AsyncClient", return_value=client):
            doc = await crawler.crawl_url("https://example.com/error", {})

        assert doc is None

    @pytest.mark.asyncio
    async def test_crawl_url_pdf_returns_none(self):
        """PDF URLs are currently skipped (not implemented)."""
        crawler = LegalCrawler()
        mock_response = MagicMock()
        mock_response.status_code = 200
        mock_response.headers = {"content-type": "application/pdf"}
        mock_response.content = b"%PDF-1.4..."

        client = _mock_async_client(response=mock_response)
        with patch("httpx.AsyncClient", return_value=client):
            doc = await crawler.crawl_url("https://example.com/doc.pdf", {})

        # PDF extraction is not implemented yet.
        assert doc is None


class TestGetLegalCrawler:
    """Tests for the singleton accessor."""

    def test_get_legal_crawler_singleton(self):
        """``get_legal_crawler`` always returns the same instance."""
        crawler1 = get_legal_crawler()
        crawler2 = get_legal_crawler()

        assert crawler1 is crawler2

    def test_get_legal_crawler_returns_crawler(self):
        """``get_legal_crawler`` returns a ``LegalCrawler``."""
        crawler = get_legal_crawler()

        assert isinstance(crawler, LegalCrawler)