fix: Restore all files lost during destructive rebase

A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.

This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).

Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-02-09 09:51:32 +01:00
parent f7487ee240
commit bfdaf63ba9
2009 changed files with 749983 additions and 1731 deletions

View File

@@ -0,0 +1,237 @@
"""
Tests für den Legal Crawler Service.
Testet das Crawlen und Parsen von rechtlichen Bildungsinhalten.
"""
import pytest
from unittest.mock import AsyncMock, patch, MagicMock
import httpx
from llm_gateway.services.legal_crawler import (
LegalCrawler,
CrawledDocument,
get_legal_crawler,
)
class TestLegalCrawler:
    """Tests for the LegalCrawler class."""

    def test_crawler_initialization(self):
        """A freshly constructed crawler carries the expected defaults."""
        instance = LegalCrawler()
        assert instance.user_agent == "BreakPilot-Crawler/1.0 (Educational Purpose)"
        assert instance.timeout == 30.0
        assert instance.rate_limit_delay == 1.0
        assert instance.db_pool is None

    def test_crawler_with_db_pool(self):
        """A DB pool passed at construction time is stored on the crawler."""
        pool = MagicMock()
        instance = LegalCrawler(db_pool=pool)
        assert instance.db_pool == pool
class TestCrawledDocument:
    """Tests for the CrawledDocument dataclass."""

    def test_document_creation(self):
        """A fully populated document retains every supplied field."""
        kwargs = dict(
            url="https://example.com/schulgesetz",
            canonical_url="https://example.com/schulgesetz",
            title="Schulgesetz NRW",
            content="§ 1 Bildungsauftrag...",
            content_hash="abc123",
            category="legal",
            doc_type="schulgesetz",
            state="NW",
            law_name="SchulG NRW",
            paragraphs=[{"nr": "§ 1", "title": "Bildungsauftrag"}],
            trust_score=0.9,
        )
        document = CrawledDocument(**kwargs)
        assert document.url == "https://example.com/schulgesetz"
        assert document.state == "NW"
        assert document.law_name == "SchulG NRW"
        assert len(document.paragraphs) == 1

    def test_document_without_optional_fields(self):
        """Construction succeeds when all optional fields are None."""
        document = CrawledDocument(
            url="https://example.com/info",
            canonical_url=None,
            title="Info Page",
            content="Some content",
            content_hash="def456",
            category="legal",
            doc_type="info",
            state=None,
            law_name=None,
            paragraphs=None,
            trust_score=0.5,
        )
        assert document.state is None
        assert document.paragraphs is None
class TestParagraphExtraction:
    """Tests for paragraph (§) extraction from crawled content."""

    def test_extract_paragraphs_from_html(self):
        """Section markers like '§ 42' are detected in raw text content."""
        crawler = LegalCrawler()
        html_content = """
        § 1 Bildungsauftrag
        Die Schule hat den Auftrag...
        § 2 Erziehungsauftrag
        Die Schule erzieht...
        § 42 Pflichten der Eltern
        Die Eltern sind verpflichtet...
        """
        from bs4 import BeautifulSoup
        soup = BeautifulSoup("<body></body>", "html.parser")
        result = crawler._extract_paragraphs(soup, html_content)
        assert result is not None
        assert len(result) >= 3
        # § 42 in particular must be among the extracted section numbers.
        numbers = [entry["nr"] for entry in result]
        assert any("42" in nr for nr in numbers)

    def test_extract_paragraphs_empty_content(self):
        """Empty content yields no paragraphs."""
        crawler = LegalCrawler()
        from bs4 import BeautifulSoup
        soup = BeautifulSoup("<body></body>", "html.parser")
        result = crawler._extract_paragraphs(soup, "")
        assert result is None or len(result) == 0

    def test_extract_paragraphs_no_pattern_match(self):
        """Text without § markers produces no paragraphs."""
        crawler = LegalCrawler()
        from bs4 import BeautifulSoup
        soup = BeautifulSoup("<body></body>", "html.parser")
        result = crawler._extract_paragraphs(soup, "Just some text without paragraphs")
        assert result is None or len(result) == 0
class TestCrawlUrl:
    """Tests for URL crawling."""

    @staticmethod
    def _client_patch(*, response=None, error=None):
        """Patch httpx.AsyncClient so its async context yields a mock client.

        Either a canned *response* is returned from ``get`` or *error* is
        raised as its side effect.
        """
        instance = AsyncMock()
        if error is not None:
            instance.get.side_effect = error
        else:
            instance.get.return_value = response
        instance.__aenter__.return_value = instance
        instance.__aexit__.return_value = None
        return patch("httpx.AsyncClient", return_value=instance)

    @pytest.mark.asyncio
    async def test_crawl_url_html_success(self):
        """Crawling an HTML URL returns a populated document."""
        crawler = LegalCrawler()
        response = MagicMock()
        response.status_code = 200
        response.headers = {"content-type": "text/html; charset=utf-8"}
        response.text = """
        <html>
        <head><title>Schulgesetz NRW</title></head>
        <body>
        <main>
        § 1 Bildungsauftrag
        Die Schule hat den Auftrag...
        </main>
        </body>
        </html>
        """
        response.url = "https://example.com/schulgesetz"
        with self._client_patch(response=response):
            seed_info = {"name": "SchulG NRW", "state": "NW", "trust_boost": 0.95}
            doc = await crawler.crawl_url("https://example.com/schulgesetz", seed_info)
        assert doc is not None
        assert doc.title == "Schulgesetz NRW"
        assert doc.state == "NW"
        assert doc.trust_score == 0.95

    @pytest.mark.asyncio
    async def test_crawl_url_404_returns_none(self):
        """A 404 status results in None."""
        crawler = LegalCrawler()
        response = MagicMock()
        response.status_code = 404
        with self._client_patch(response=response):
            doc = await crawler.crawl_url("https://example.com/notfound", {})
        assert doc is None

    @pytest.mark.asyncio
    async def test_crawl_url_network_error_returns_none(self):
        """A network failure results in None instead of an exception."""
        crawler = LegalCrawler()
        with self._client_patch(error=httpx.ConnectError("Network error")):
            doc = await crawler.crawl_url("https://example.com/error", {})
        assert doc is None

    @pytest.mark.asyncio
    async def test_crawl_url_pdf_returns_none(self):
        """PDF URLs are currently skipped (extraction not implemented)."""
        crawler = LegalCrawler()
        response = MagicMock()
        response.status_code = 200
        response.headers = {"content-type": "application/pdf"}
        response.content = b"%PDF-1.4..."
        with self._client_patch(response=response):
            doc = await crawler.crawl_url("https://example.com/doc.pdf", {})
        # PDF extraction is not implemented yet.
        assert doc is None
class TestGetLegalCrawler:
    """Tests for the module-level singleton accessor."""

    def test_get_legal_crawler_singleton(self):
        """Repeated calls hand back the very same instance."""
        first = get_legal_crawler()
        second = get_legal_crawler()
        assert first is second

    def test_get_legal_crawler_returns_crawler(self):
        """The accessor yields a LegalCrawler instance."""
        result = get_legal_crawler()
        assert isinstance(result, LegalCrawler)