This repository was archived on 2026-02-15. You can view and clone its files, but you cannot open issues, create pull requests, or push commits.
Files
breakpilot-pwa/backend/tests/test_llm_gateway/test_legal_crawler.py
Benjamin Admin 21a844cb8a fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.

This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).

Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-09 09:51:32 +01:00

238 lines
7.9 KiB
Python

"""
Tests für den Legal Crawler Service.
Testet das Crawlen und Parsen von rechtlichen Bildungsinhalten.
"""
import pytest
from unittest.mock import AsyncMock, patch, MagicMock
import httpx
from llm_gateway.services.legal_crawler import (
LegalCrawler,
CrawledDocument,
get_legal_crawler,
)
class TestLegalCrawler:
    """Tests for the LegalCrawler class itself (construction behavior)."""

    def test_crawler_initialization(self):
        """A default-constructed crawler carries the documented settings."""
        instance = LegalCrawler()
        # Defaults baked into the crawler: identifying UA, 30s timeout,
        # 1s politeness delay, and no database pool attached.
        assert instance.user_agent == "BreakPilot-Crawler/1.0 (Educational Purpose)"
        assert instance.timeout == 30.0
        assert instance.rate_limit_delay == 1.0
        assert instance.db_pool is None

    def test_crawler_with_db_pool(self):
        """A DB pool passed to the constructor is stored on the instance."""
        fake_pool = MagicMock()
        assert LegalCrawler(db_pool=fake_pool).db_pool == fake_pool
class TestCrawledDocument:
    """Tests for the CrawledDocument dataclass."""

    def test_document_creation(self):
        """Fully-populated document: constructor arguments round-trip unchanged."""
        kwargs = dict(
            url="https://example.com/schulgesetz",
            canonical_url="https://example.com/schulgesetz",
            title="Schulgesetz NRW",
            content="§ 1 Bildungsauftrag...",
            content_hash="abc123",
            category="legal",
            doc_type="schulgesetz",
            state="NW",
            law_name="SchulG NRW",
            paragraphs=[{"nr": "§ 1", "title": "Bildungsauftrag"}],
            trust_score=0.9,
        )
        document = CrawledDocument(**kwargs)
        assert document.url == kwargs["url"]
        assert document.state == "NW"
        assert document.law_name == "SchulG NRW"
        assert len(document.paragraphs) == 1

    def test_document_without_optional_fields(self):
        """Optional fields (canonical_url, state, law_name, paragraphs) accept None."""
        document = CrawledDocument(
            url="https://example.com/info",
            canonical_url=None,
            title="Info Page",
            content="Some content",
            content_hash="def456",
            category="legal",
            doc_type="info",
            state=None,
            law_name=None,
            paragraphs=None,
            trust_score=0.5,
        )
        assert document.state is None
        assert document.paragraphs is None
class TestParagraphExtraction:
    """Tests for extracting § paragraphs from crawled legal text."""

    @staticmethod
    def _empty_soup():
        """Build a minimal BeautifulSoup tree; extraction is driven by the raw text."""
        from bs4 import BeautifulSoup

        return BeautifulSoup("<body></body>", "html.parser")

    def test_extract_paragraphs_from_html(self):
        """Numbered paragraphs (§ n Title) are picked out of the raw content."""
        crawler = LegalCrawler()
        law_text = """
§ 1 Bildungsauftrag
Die Schule hat den Auftrag...
§ 2 Erziehungsauftrag
Die Schule erzieht...
§ 42 Pflichten der Eltern
Die Eltern sind verpflichtet...
"""
        found = crawler._extract_paragraphs(self._empty_soup(), law_text)
        assert found is not None
        assert len(found) >= 3
        # § 42 specifically must be among the extracted paragraph numbers.
        assert any("42" in entry["nr"] for entry in found)

    def test_extract_paragraphs_empty_content(self):
        """Empty content yields no paragraphs (None or an empty list)."""
        found = LegalCrawler()._extract_paragraphs(self._empty_soup(), "")
        assert found is None or len(found) == 0

    def test_extract_paragraphs_no_pattern_match(self):
        """Content without any § pattern yields no paragraphs."""
        found = LegalCrawler()._extract_paragraphs(
            self._empty_soup(), "Just some text without paragraphs"
        )
        assert found is None or len(found) == 0
class TestCrawlUrl:
    """Tests for URL crawling (crawl_url).

    The httpx.AsyncClient mock scaffolding was duplicated across all four
    tests; it is factored into _patched_client so each test states only
    the response it simulates.
    """

    @staticmethod
    def _patched_client(response=None, side_effect=None):
        """Return a patch() for httpx.AsyncClient.

        The patched class, used as ``async with httpx.AsyncClient(...)``,
        yields a mock whose ``get()`` returns *response* — or raises
        *side_effect* when that is given instead.
        """
        mock_instance = AsyncMock()
        if side_effect is not None:
            mock_instance.get.side_effect = side_effect
        else:
            mock_instance.get.return_value = response
        # The async context manager hands back the instance itself.
        mock_instance.__aenter__.return_value = mock_instance
        mock_instance.__aexit__.return_value = None
        return patch("httpx.AsyncClient", return_value=mock_instance)

    @pytest.mark.asyncio
    async def test_crawl_url_html_success(self):
        """Successful crawl of an HTML URL yields a populated document."""
        crawler = LegalCrawler()
        mock_response = MagicMock()
        mock_response.status_code = 200
        mock_response.headers = {"content-type": "text/html; charset=utf-8"}
        mock_response.text = """
        <html>
        <head><title>Schulgesetz NRW</title></head>
        <body>
        <main>
        § 1 Bildungsauftrag
        Die Schule hat den Auftrag...
        </main>
        </body>
        </html>
        """
        mock_response.url = "https://example.com/schulgesetz"
        with self._patched_client(response=mock_response):
            seed_info = {"name": "SchulG NRW", "state": "NW", "trust_boost": 0.95}
            doc = await crawler.crawl_url("https://example.com/schulgesetz", seed_info)
        assert doc is not None
        assert doc.title == "Schulgesetz NRW"
        assert doc.state == "NW"
        assert doc.trust_score == 0.95

    @pytest.mark.asyncio
    async def test_crawl_url_404_returns_none(self):
        """A 404 response makes crawl_url return None."""
        crawler = LegalCrawler()
        mock_response = MagicMock()
        mock_response.status_code = 404
        with self._patched_client(response=mock_response):
            doc = await crawler.crawl_url("https://example.com/notfound", {})
        assert doc is None

    @pytest.mark.asyncio
    async def test_crawl_url_network_error_returns_none(self):
        """A network error makes crawl_url return None."""
        crawler = LegalCrawler()
        with self._patched_client(side_effect=httpx.ConnectError("Network error")):
            doc = await crawler.crawl_url("https://example.com/error", {})
        assert doc is None

    @pytest.mark.asyncio
    async def test_crawl_url_pdf_returns_none(self):
        """PDF URLs are currently skipped (extraction not implemented)."""
        crawler = LegalCrawler()
        mock_response = MagicMock()
        mock_response.status_code = 200
        mock_response.headers = {"content-type": "application/pdf"}
        mock_response.content = b"%PDF-1.4..."
        with self._patched_client(response=mock_response):
            doc = await crawler.crawl_url("https://example.com/doc.pdf", {})
        # PDF extraction is not implemented yet.
        assert doc is None
class TestGetLegalCrawler:
    """Tests for the get_legal_crawler singleton accessor."""

    def test_get_legal_crawler_singleton(self):
        """Repeated calls hand back the identical instance."""
        first, second = get_legal_crawler(), get_legal_crawler()
        assert first is second

    def test_get_legal_crawler_returns_crawler(self):
        """The accessor yields a LegalCrawler instance."""
        assert isinstance(get_legal_crawler(), LegalCrawler)