feat: BreakPilot PWA - Full codebase (clean push without large binaries)
Some checks failed
Tests / Go Tests (push) Has been cancelled
Tests / Python Tests (push) Has been cancelled
Tests / Integration Tests (push) Has been cancelled
Tests / Go Lint (push) Has been cancelled
Tests / Python Lint (push) Has been cancelled
Tests / Security Scan (push) Has been cancelled
Tests / All Checks Passed (push) Has been cancelled
Security Scanning / Secret Scanning (push) Has been cancelled
Security Scanning / Dependency Vulnerability Scan (push) Has been cancelled
Security Scanning / Go Security Scan (push) Has been cancelled
Security Scanning / Python Security Scan (push) Has been cancelled
Security Scanning / Node.js Security Scan (push) Has been cancelled
Security Scanning / Docker Image Security (push) Has been cancelled
Security Scanning / Security Summary (push) Has been cancelled
CI/CD Pipeline / Go Tests (push) Has been cancelled
CI/CD Pipeline / Python Tests (push) Has been cancelled
CI/CD Pipeline / Website Tests (push) Has been cancelled
CI/CD Pipeline / Linting (push) Has been cancelled
CI/CD Pipeline / Security Scan (push) Has been cancelled
CI/CD Pipeline / Docker Build & Push (push) Has been cancelled
CI/CD Pipeline / Integration Tests (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / CI Summary (push) Has been cancelled
ci/woodpecker/manual/build-ci-image Pipeline was successful
ci/woodpecker/manual/main Pipeline failed
Some checks failed
Tests / Go Tests (push) Has been cancelled
Tests / Python Tests (push) Has been cancelled
Tests / Integration Tests (push) Has been cancelled
Tests / Go Lint (push) Has been cancelled
Tests / Python Lint (push) Has been cancelled
Tests / Security Scan (push) Has been cancelled
Tests / All Checks Passed (push) Has been cancelled
Security Scanning / Secret Scanning (push) Has been cancelled
Security Scanning / Dependency Vulnerability Scan (push) Has been cancelled
Security Scanning / Go Security Scan (push) Has been cancelled
Security Scanning / Python Security Scan (push) Has been cancelled
Security Scanning / Node.js Security Scan (push) Has been cancelled
Security Scanning / Docker Image Security (push) Has been cancelled
Security Scanning / Security Summary (push) Has been cancelled
CI/CD Pipeline / Go Tests (push) Has been cancelled
CI/CD Pipeline / Python Tests (push) Has been cancelled
CI/CD Pipeline / Website Tests (push) Has been cancelled
CI/CD Pipeline / Linting (push) Has been cancelled
CI/CD Pipeline / Security Scan (push) Has been cancelled
CI/CD Pipeline / Docker Build & Push (push) Has been cancelled
CI/CD Pipeline / Integration Tests (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / CI Summary (push) Has been cancelled
ci/woodpecker/manual/build-ci-image Pipeline was successful
ci/woodpecker/manual/main Pipeline failed
All services: admin-v2, studio-v2, website, ai-compliance-sdk, consent-service, klausur-service, voice-service, and infrastructure. Large PDFs and compiled binaries excluded via .gitignore.
This commit is contained in:
237
backend/tests/test_llm_gateway/test_legal_crawler.py
Normal file
237
backend/tests/test_llm_gateway/test_legal_crawler.py
Normal file
@@ -0,0 +1,237 @@
|
||||
"""
|
||||
Tests für den Legal Crawler Service.
|
||||
|
||||
Testet das Crawlen und Parsen von rechtlichen Bildungsinhalten.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from unittest.mock import AsyncMock, patch, MagicMock
|
||||
import httpx
|
||||
|
||||
from llm_gateway.services.legal_crawler import (
|
||||
LegalCrawler,
|
||||
CrawledDocument,
|
||||
get_legal_crawler,
|
||||
)
|
||||
|
||||
|
||||
class TestLegalCrawler:
    """Unit tests for construction of the LegalCrawler class."""

    def test_crawler_initialization(self):
        """A freshly constructed crawler exposes the documented defaults."""
        instance = LegalCrawler()

        assert instance.user_agent == "BreakPilot-Crawler/1.0 (Educational Purpose)"
        assert instance.timeout == 30.0
        assert instance.rate_limit_delay == 1.0
        assert instance.db_pool is None

    def test_crawler_with_db_pool(self):
        """A database pool handed to the constructor is stored unchanged."""
        pool = MagicMock()

        instance = LegalCrawler(db_pool=pool)

        assert instance.db_pool == pool
|
||||
|
||||
|
||||
class TestCrawledDocument:
    """Unit tests for the CrawledDocument dataclass."""

    def test_document_creation(self):
        """All constructor fields are stored verbatim on the instance."""
        kwargs = {
            "url": "https://example.com/schulgesetz",
            "canonical_url": "https://example.com/schulgesetz",
            "title": "Schulgesetz NRW",
            "content": "§ 1 Bildungsauftrag...",
            "content_hash": "abc123",
            "category": "legal",
            "doc_type": "schulgesetz",
            "state": "NW",
            "law_name": "SchulG NRW",
            "paragraphs": [{"nr": "§ 1", "title": "Bildungsauftrag"}],
            "trust_score": 0.9,
        }

        document = CrawledDocument(**kwargs)

        assert document.url == "https://example.com/schulgesetz"
        assert document.state == "NW"
        assert document.law_name == "SchulG NRW"
        assert len(document.paragraphs) == 1

    def test_document_without_optional_fields(self):
        """Optional fields may be None without affecting construction."""
        document = CrawledDocument(
            url="https://example.com/info",
            canonical_url=None,
            title="Info Page",
            content="Some content",
            content_hash="def456",
            category="legal",
            doc_type="info",
            state=None,
            law_name=None,
            paragraphs=None,
            trust_score=0.5,
        )

        assert document.state is None
        assert document.paragraphs is None
|
||||
|
||||
|
||||
class TestParagraphExtraction:
    """Unit tests for paragraph (§) extraction from crawled content."""

    @staticmethod
    def _empty_soup():
        """Return a minimal BeautifulSoup document used as a stand-in DOM."""
        from bs4 import BeautifulSoup

        return BeautifulSoup("<body></body>", "html.parser")

    def test_extract_paragraphs_from_html(self):
        """Paragraph markers (§ n) present in the text are all extracted."""
        crawler = LegalCrawler()
        html_content = """
        § 1 Bildungsauftrag
        Die Schule hat den Auftrag...

        § 2 Erziehungsauftrag
        Die Schule erzieht...

        § 42 Pflichten der Eltern
        Die Eltern sind verpflichtet...
        """

        paragraphs = crawler._extract_paragraphs(self._empty_soup(), html_content)

        assert paragraphs is not None
        assert len(paragraphs) >= 3
        # The two-digit paragraph (§ 42) must be found as well.
        extracted_numbers = [entry["nr"] for entry in paragraphs]
        assert any("42" in number for number in extracted_numbers)

    def test_extract_paragraphs_empty_content(self):
        """Empty content yields no paragraphs."""
        crawler = LegalCrawler()

        paragraphs = crawler._extract_paragraphs(self._empty_soup(), "")

        assert paragraphs is None or len(paragraphs) == 0

    def test_extract_paragraphs_no_pattern_match(self):
        """Text without any § pattern yields no paragraphs."""
        crawler = LegalCrawler()

        paragraphs = crawler._extract_paragraphs(
            self._empty_soup(), "Just some text without paragraphs"
        )

        assert paragraphs is None or len(paragraphs) == 0
|
||||
|
||||
|
||||
class TestCrawlUrl:
    """Tests for LegalCrawler.crawl_url covering success and failure paths.

    The identical AsyncMock/``patch("httpx.AsyncClient")`` scaffolding was
    previously duplicated in every test; it is factored into a single
    helper so each test only states the response it simulates.
    """

    @staticmethod
    def _patched_client(response=None, error=None):
        """Return a patcher replacing ``httpx.AsyncClient`` with a mock.

        The mock acts as an async context manager whose ``get`` either
        returns *response* or raises *error* (pass exactly one of the two).
        """
        client = AsyncMock()
        if error is not None:
            client.get.side_effect = error
        else:
            client.get.return_value = response
        # The crawler uses `async with httpx.AsyncClient(...)`, so the mock
        # must yield itself from the async context-manager protocol.
        client.__aenter__.return_value = client
        client.__aexit__.return_value = None
        return patch("httpx.AsyncClient", return_value=client)

    @pytest.mark.asyncio
    async def test_crawl_url_html_success(self):
        """Crawling a reachable HTML page yields a populated document."""
        crawler = LegalCrawler()

        mock_response = MagicMock()
        mock_response.status_code = 200
        mock_response.headers = {"content-type": "text/html; charset=utf-8"}
        mock_response.text = """
        <html>
        <head><title>Schulgesetz NRW</title></head>
        <body>
        <main>
        § 1 Bildungsauftrag
        Die Schule hat den Auftrag...
        </main>
        </body>
        </html>
        """
        mock_response.url = "https://example.com/schulgesetz"

        seed_info = {"name": "SchulG NRW", "state": "NW", "trust_boost": 0.95}
        with self._patched_client(response=mock_response):
            doc = await crawler.crawl_url("https://example.com/schulgesetz", seed_info)

        assert doc is not None
        assert doc.title == "Schulgesetz NRW"
        assert doc.state == "NW"
        assert doc.trust_score == 0.95

    @pytest.mark.asyncio
    async def test_crawl_url_404_returns_none(self):
        """A 404 response makes crawl_url return None."""
        crawler = LegalCrawler()

        mock_response = MagicMock()
        mock_response.status_code = 404

        with self._patched_client(response=mock_response):
            doc = await crawler.crawl_url("https://example.com/notfound", {})

        assert doc is None

    @pytest.mark.asyncio
    async def test_crawl_url_network_error_returns_none(self):
        """A transport-level error makes crawl_url return None."""
        crawler = LegalCrawler()

        with self._patched_client(error=httpx.ConnectError("Network error")):
            doc = await crawler.crawl_url("https://example.com/error", {})

        assert doc is None

    @pytest.mark.asyncio
    async def test_crawl_url_pdf_returns_none(self):
        """PDF URLs are currently skipped (extraction not implemented)."""
        crawler = LegalCrawler()

        mock_response = MagicMock()
        mock_response.status_code = 200
        mock_response.headers = {"content-type": "application/pdf"}
        mock_response.content = b"%PDF-1.4..."

        with self._patched_client(response=mock_response):
            doc = await crawler.crawl_url("https://example.com/doc.pdf", {})

        # PDF extraction is not implemented yet, so the crawler skips it.
        assert doc is None
|
||||
|
||||
|
||||
class TestGetLegalCrawler:
    """Tests for the module-level singleton accessor."""

    def test_get_legal_crawler_singleton(self):
        """Repeated calls hand back the very same instance."""
        first = get_legal_crawler()
        second = get_legal_crawler()

        assert first is second

    def test_get_legal_crawler_returns_crawler(self):
        """The accessor produces a LegalCrawler instance."""
        result = get_legal_crawler()

        assert isinstance(result, LegalCrawler)
|
||||
Reference in New Issue
Block a user