feat: BreakPilot PWA - Full codebase (clean push without large binaries)
Some checks failed
Tests / Go Tests (push) Has been cancelled
Tests / Python Tests (push) Has been cancelled
Tests / Integration Tests (push) Has been cancelled
Tests / Go Lint (push) Has been cancelled
Tests / Python Lint (push) Has been cancelled
Tests / Security Scan (push) Has been cancelled
Tests / All Checks Passed (push) Has been cancelled
Security Scanning / Secret Scanning (push) Has been cancelled
Security Scanning / Dependency Vulnerability Scan (push) Has been cancelled
Security Scanning / Go Security Scan (push) Has been cancelled
Security Scanning / Python Security Scan (push) Has been cancelled
Security Scanning / Node.js Security Scan (push) Has been cancelled
Security Scanning / Docker Image Security (push) Has been cancelled
Security Scanning / Security Summary (push) Has been cancelled
CI/CD Pipeline / Go Tests (push) Has been cancelled
CI/CD Pipeline / Python Tests (push) Has been cancelled
CI/CD Pipeline / Website Tests (push) Has been cancelled
CI/CD Pipeline / Linting (push) Has been cancelled
CI/CD Pipeline / Security Scan (push) Has been cancelled
CI/CD Pipeline / Docker Build & Push (push) Has been cancelled
CI/CD Pipeline / Integration Tests (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / CI Summary (push) Has been cancelled
ci/woodpecker/manual/build-ci-image Pipeline was successful
ci/woodpecker/manual/main Pipeline failed
Some checks failed
Tests / Go Tests (push) Has been cancelled
Tests / Python Tests (push) Has been cancelled
Tests / Integration Tests (push) Has been cancelled
Tests / Go Lint (push) Has been cancelled
Tests / Python Lint (push) Has been cancelled
Tests / Security Scan (push) Has been cancelled
Tests / All Checks Passed (push) Has been cancelled
Security Scanning / Secret Scanning (push) Has been cancelled
Security Scanning / Dependency Vulnerability Scan (push) Has been cancelled
Security Scanning / Go Security Scan (push) Has been cancelled
Security Scanning / Python Security Scan (push) Has been cancelled
Security Scanning / Node.js Security Scan (push) Has been cancelled
Security Scanning / Docker Image Security (push) Has been cancelled
Security Scanning / Security Summary (push) Has been cancelled
CI/CD Pipeline / Go Tests (push) Has been cancelled
CI/CD Pipeline / Python Tests (push) Has been cancelled
CI/CD Pipeline / Website Tests (push) Has been cancelled
CI/CD Pipeline / Linting (push) Has been cancelled
CI/CD Pipeline / Security Scan (push) Has been cancelled
CI/CD Pipeline / Docker Build & Push (push) Has been cancelled
CI/CD Pipeline / Integration Tests (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / CI Summary (push) Has been cancelled
ci/woodpecker/manual/build-ci-image Pipeline was successful
ci/woodpecker/manual/main Pipeline failed
All services: admin-v2, studio-v2, website, ai-compliance-sdk, consent-service, klausur-service, voice-service, and infrastructure. Large PDFs and compiled binaries excluded via .gitignore.
This commit is contained in:
237
backend/tests/test_llm_gateway/test_legal_crawler.py
Normal file
237
backend/tests/test_llm_gateway/test_legal_crawler.py
Normal file
@@ -0,0 +1,237 @@
|
||||
"""
|
||||
Tests für den Legal Crawler Service.
|
||||
|
||||
Testet das Crawlen und Parsen von rechtlichen Bildungsinhalten.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from unittest.mock import AsyncMock, patch, MagicMock
|
||||
import httpx
|
||||
|
||||
from llm_gateway.services.legal_crawler import (
|
||||
LegalCrawler,
|
||||
CrawledDocument,
|
||||
get_legal_crawler,
|
||||
)
|
||||
|
||||
|
||||
class TestLegalCrawler:
    """Unit tests for construction of the LegalCrawler class."""

    def test_crawler_initialization(self):
        """A freshly constructed crawler exposes the documented defaults."""
        instance = LegalCrawler()

        assert instance.user_agent == "BreakPilot-Crawler/1.0 (Educational Purpose)"
        assert instance.timeout == 30.0
        assert instance.rate_limit_delay == 1.0
        assert instance.db_pool is None

    def test_crawler_with_db_pool(self):
        """A database pool handed to the constructor is stored unchanged."""
        pool = MagicMock()

        instance = LegalCrawler(db_pool=pool)

        assert instance.db_pool == pool
|
||||
|
||||
|
||||
class TestCrawledDocument:
    """Unit tests for the CrawledDocument dataclass."""

    def test_document_creation(self):
        """All constructor fields are stored verbatim on the instance."""
        kwargs = {
            "url": "https://example.com/schulgesetz",
            "canonical_url": "https://example.com/schulgesetz",
            "title": "Schulgesetz NRW",
            "content": "§ 1 Bildungsauftrag...",
            "content_hash": "abc123",
            "category": "legal",
            "doc_type": "schulgesetz",
            "state": "NW",
            "law_name": "SchulG NRW",
            "paragraphs": [{"nr": "§ 1", "title": "Bildungsauftrag"}],
            "trust_score": 0.9,
        }

        document = CrawledDocument(**kwargs)

        assert document.url == "https://example.com/schulgesetz"
        assert document.state == "NW"
        assert document.law_name == "SchulG NRW"
        assert len(document.paragraphs) == 1

    def test_document_without_optional_fields(self):
        """Optional fields may be None without affecting construction."""
        document = CrawledDocument(
            url="https://example.com/info",
            canonical_url=None,
            title="Info Page",
            content="Some content",
            content_hash="def456",
            category="legal",
            doc_type="info",
            state=None,
            law_name=None,
            paragraphs=None,
            trust_score=0.5,
        )

        assert document.state is None
        assert document.paragraphs is None
|
||||
|
||||
|
||||
class TestParagraphExtraction:
    """Unit tests for paragraph (§) extraction from crawled content."""

    @staticmethod
    def _empty_soup():
        """Return a minimal BeautifulSoup document used as a stand-in DOM."""
        from bs4 import BeautifulSoup

        return BeautifulSoup("<body></body>", "html.parser")

    def test_extract_paragraphs_from_html(self):
        """Paragraph markers (§ n) present in the text are all extracted."""
        crawler = LegalCrawler()
        html_content = """
        § 1 Bildungsauftrag
        Die Schule hat den Auftrag...

        § 2 Erziehungsauftrag
        Die Schule erzieht...

        § 42 Pflichten der Eltern
        Die Eltern sind verpflichtet...
        """

        paragraphs = crawler._extract_paragraphs(self._empty_soup(), html_content)

        assert paragraphs is not None
        assert len(paragraphs) >= 3
        # The two-digit paragraph (§ 42) must be found as well.
        extracted_numbers = [entry["nr"] for entry in paragraphs]
        assert any("42" in number for number in extracted_numbers)

    def test_extract_paragraphs_empty_content(self):
        """Empty content yields no paragraphs."""
        crawler = LegalCrawler()

        paragraphs = crawler._extract_paragraphs(self._empty_soup(), "")

        assert paragraphs is None or len(paragraphs) == 0

    def test_extract_paragraphs_no_pattern_match(self):
        """Text without any § pattern yields no paragraphs."""
        crawler = LegalCrawler()

        paragraphs = crawler._extract_paragraphs(
            self._empty_soup(), "Just some text without paragraphs"
        )

        assert paragraphs is None or len(paragraphs) == 0
|
||||
|
||||
|
||||
class TestCrawlUrl:
    """Tests for LegalCrawler.crawl_url covering success and failure paths.

    The identical AsyncMock/``patch("httpx.AsyncClient")`` scaffolding was
    previously duplicated in every test; it is factored into a single
    helper so each test only states the response it simulates.
    """

    @staticmethod
    def _patched_client(response=None, error=None):
        """Return a patcher replacing ``httpx.AsyncClient`` with a mock.

        The mock acts as an async context manager whose ``get`` either
        returns *response* or raises *error* (pass exactly one of the two).
        """
        client = AsyncMock()
        if error is not None:
            client.get.side_effect = error
        else:
            client.get.return_value = response
        # The crawler uses `async with httpx.AsyncClient(...)`, so the mock
        # must yield itself from the async context-manager protocol.
        client.__aenter__.return_value = client
        client.__aexit__.return_value = None
        return patch("httpx.AsyncClient", return_value=client)

    @pytest.mark.asyncio
    async def test_crawl_url_html_success(self):
        """Crawling a reachable HTML page yields a populated document."""
        crawler = LegalCrawler()

        mock_response = MagicMock()
        mock_response.status_code = 200
        mock_response.headers = {"content-type": "text/html; charset=utf-8"}
        mock_response.text = """
        <html>
        <head><title>Schulgesetz NRW</title></head>
        <body>
        <main>
        § 1 Bildungsauftrag
        Die Schule hat den Auftrag...
        </main>
        </body>
        </html>
        """
        mock_response.url = "https://example.com/schulgesetz"

        seed_info = {"name": "SchulG NRW", "state": "NW", "trust_boost": 0.95}
        with self._patched_client(response=mock_response):
            doc = await crawler.crawl_url("https://example.com/schulgesetz", seed_info)

        assert doc is not None
        assert doc.title == "Schulgesetz NRW"
        assert doc.state == "NW"
        assert doc.trust_score == 0.95

    @pytest.mark.asyncio
    async def test_crawl_url_404_returns_none(self):
        """A 404 response makes crawl_url return None."""
        crawler = LegalCrawler()

        mock_response = MagicMock()
        mock_response.status_code = 404

        with self._patched_client(response=mock_response):
            doc = await crawler.crawl_url("https://example.com/notfound", {})

        assert doc is None

    @pytest.mark.asyncio
    async def test_crawl_url_network_error_returns_none(self):
        """A transport-level error makes crawl_url return None."""
        crawler = LegalCrawler()

        with self._patched_client(error=httpx.ConnectError("Network error")):
            doc = await crawler.crawl_url("https://example.com/error", {})

        assert doc is None

    @pytest.mark.asyncio
    async def test_crawl_url_pdf_returns_none(self):
        """PDF URLs are currently skipped (extraction not implemented)."""
        crawler = LegalCrawler()

        mock_response = MagicMock()
        mock_response.status_code = 200
        mock_response.headers = {"content-type": "application/pdf"}
        mock_response.content = b"%PDF-1.4..."

        with self._patched_client(response=mock_response):
            doc = await crawler.crawl_url("https://example.com/doc.pdf", {})

        # PDF extraction is not implemented yet, so the crawler skips it.
        assert doc is None
|
||||
|
||||
|
||||
class TestGetLegalCrawler:
    """Tests for the module-level singleton accessor."""

    def test_get_legal_crawler_singleton(self):
        """Repeated calls hand back the very same instance."""
        first = get_legal_crawler()
        second = get_legal_crawler()

        assert first is second

    def test_get_legal_crawler_returns_crawler(self):
        """The accessor produces a LegalCrawler instance."""
        result = get_legal_crawler()

        assert isinstance(result, LegalCrawler)
|
||||
Reference in New Issue
Block a user