A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.
This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).
Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
291 lines
9.6 KiB
Python
"""
|
|
Legal Content Crawler Service.
|
|
|
|
Crawlt Schulgesetze und rechtliche Inhalte von den Seed-URLs
|
|
und speichert sie in der Datenbank für den Communication-Service.
|
|
"""
|
|
|
|
import asyncio
import hashlib
import json
import logging
import re
from dataclasses import dataclass
from datetime import datetime
from typing import Dict, List, Optional

import httpx
from bs4 import BeautifulSoup
|
|
|
|
# Module-level logger; configuration is inherited from the application root logger.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
class CrawledDocument:
    """Represents one crawled document ready for DB persistence."""
    url: str  # URL the crawl was started from
    canonical_url: Optional[str]  # final URL after redirects (str(response.url))
    title: str  # page <title> text; "" when absent
    content: str  # extracted plain text, capped at 100k chars by the crawler
    content_hash: str  # sha256 hex digest of the full extracted text
    category: str  # seed category, e.g. "legal"
    doc_type: str  # document kind, e.g. "schulgesetz"
    state: Optional[str]  # German federal state from the seed, if known
    law_name: Optional[str]  # short law identifier, e.g. "BayEUG"
    paragraphs: Optional[List[Dict]]  # [{"nr": "§ 1", "title": ...}, ...] or None
    trust_score: float  # source trust weighting (seed trust_boost, default 0.9)
|
|
|
|
|
|
class LegalCrawler:
    """Crawler for legal education content (school laws and related texts).

    Downloads seed URLs with httpx, extracts plain text and paragraph
    structure with BeautifulSoup, and upserts the results into the
    ``edu_search_documents`` table.
    """

    def __init__(self, db_pool=None):
        """
        Args:
            db_pool: Optional DB connection pool. ``crawl_legal_seeds``
                receives its own pool argument; this attribute is kept for
                callers that construct the crawler with a pool up front.
        """
        self.db_pool = db_pool
        self.user_agent = "BreakPilot-Crawler/1.0 (Educational Purpose)"
        self.timeout = 30.0  # per-request timeout in seconds
        self.rate_limit_delay = 1.0  # seconds between requests (politeness)

    async def crawl_url(self, url: str, seed_info: Dict) -> Optional["CrawledDocument"]:
        """Crawl one URL and extract its content.

        Args:
            url: The URL to crawl.
            seed_info: Seed metadata (category, state, trust_boost, name).

        Returns:
            A CrawledDocument, or None on HTTP errors, unsupported
            content types, or any exception.
        """
        try:
            async with httpx.AsyncClient(
                follow_redirects=True,
                timeout=self.timeout,
                headers={"User-Agent": self.user_agent},
            ) as client:
                response = await client.get(url)

                if response.status_code != 200:
                    logger.warning(f"HTTP {response.status_code} für {url}")
                    return None

                content_type = response.headers.get("content-type", "")

                # PDF handling (e.g. Saarland publishes its laws as PDF).
                if "pdf" in content_type.lower():
                    return await self._process_pdf(response, url, seed_info)

                # HTML handling.
                if "html" in content_type.lower():
                    return await self._process_html(response, url, seed_info)

                logger.warning(f"Unbekannter Content-Type: {content_type} für {url}")
                return None

        except Exception as e:
            # Broad catch by design: one broken seed must not abort a crawl run.
            logger.error(f"Fehler beim Crawlen von {url}: {e}")
            return None

    async def _process_html(
        self,
        response: "httpx.Response",
        url: str,
        seed_info: Dict,
    ) -> Optional["CrawledDocument"]:
        """Extract title, main text and law paragraphs from an HTML response."""
        html = response.text
        soup = BeautifulSoup(html, "html.parser")

        # Extract the page title.
        title = ""
        title_tag = soup.find("title")
        if title_tag:
            title = title_tag.get_text(strip=True)

        # Extract the main content (several strategies).
        content = ""

        # Strategy 1: a <main> or <article> tag.
        main = soup.find("main") or soup.find("article")
        if main:
            content = main.get_text(separator="\n", strip=True)
        else:
            # Strategy 2: <body> with navigation/boilerplate removed.
            for tag in soup.find_all(["nav", "header", "footer", "aside", "script", "style"]):
                tag.decompose()
            body = soup.find("body")
            if body:
                content = body.get_text(separator="\n", strip=True)

        if not content:
            return None

        # Extract § paragraphs (for school laws).
        paragraphs = self._extract_paragraphs(soup, content)

        # Determine the law name: prefer seed metadata, else guess from the title.
        law_name = seed_info.get("name", "")
        if not law_name and title:
            law_patterns = [
                r"(SchulG\s+\w+)",
                r"(Schulgesetz\s+\w+)",
                r"(BayEUG)",
                r"(\w+SchulG)",
            ]
            for pattern in law_patterns:
                match = re.search(pattern, title)
                if match:
                    law_name = match.group(1)
                    break

        # Content hash for change detection. A sha256 hexdigest is exactly
        # 64 chars, so the former [:64] truncation was a no-op and is dropped.
        content_hash = hashlib.sha256(content.encode()).hexdigest()

        return CrawledDocument(
            url=url,
            canonical_url=str(response.url),
            title=title,
            content=content[:100000],  # cap at 100k characters
            content_hash=content_hash,
            category=seed_info.get("category", "legal"),
            doc_type="schulgesetz",
            state=seed_info.get("state"),
            law_name=law_name,
            paragraphs=paragraphs,
            trust_score=seed_info.get("trust_boost", 0.9),
        )

    async def _process_pdf(
        self,
        response: "httpx.Response",
        url: str,
        seed_info: Dict,
    ) -> Optional["CrawledDocument"]:
        """Process PDF content (placeholder — requires a PDF library)."""
        # TODO: PDF extraction with PyPDF2 or pdfplumber.
        logger.info(f"PDF erkannt: {url} - PDF-Extraktion noch nicht implementiert")
        return None

    def _extract_paragraphs(
        self,
        soup: "BeautifulSoup",
        content: str,
    ) -> Optional[List[Dict]]:
        """Extract paragraphs (§) from legal texts.

        Matches patterns such as ``§ 42 Titel``. The *soup* argument is
        currently unused; it is kept for future structure-aware extraction.

        Returns:
            Up to 50 dicts with keys "nr" and "title", or None when no
            paragraph markers were found.
        """
        paragraphs = []

        # One § marker followed by the paragraph heading on the same line.
        paragraph_pattern = r"(§\s*\d+[a-z]?)\s*([^\n§]+)"
        matches = re.findall(paragraph_pattern, content, re.MULTILINE)

        for nr, title in matches[:50]:  # max 50 paragraphs
            paragraphs.append({
                "nr": nr.strip(),
                "title": title.strip()[:200],  # clamp over-long headings
            })

        return paragraphs if paragraphs else None

    async def crawl_legal_seeds(self, db_pool) -> Dict:
        """Crawl all seeds of category 'legal' and persist the results.

        Args:
            db_pool: asyncpg-style pool providing ``acquire()`` and
                connections with ``fetch``/``execute``.

        Returns:
            Statistics dict with keys total/success/failed/skipped.
        """
        stats = {
            "total": 0,
            "success": 0,
            "failed": 0,
            "skipped": 0,
        }

        # Load enabled 'legal' seeds from the DB.
        async with db_pool.acquire() as conn:
            seeds = await conn.fetch("""
                SELECT s.id, s.url, s.name, s.state, s.trust_boost,
                       c.name as category
                FROM edu_search_seeds s
                LEFT JOIN edu_search_categories c ON s.category_id = c.id
                WHERE c.name = 'legal' AND s.enabled = true
            """)

            stats["total"] = len(seeds)
            logger.info(f"Crawle {len(seeds)} Legal-Seeds...")

            for seed in seeds:
                # Rate limiting between requests.
                await asyncio.sleep(self.rate_limit_delay)

                seed_info = {
                    "name": seed["name"],
                    "state": seed["state"],
                    "trust_boost": seed["trust_boost"],
                    "category": seed["category"],
                }

                doc = await self.crawl_url(seed["url"], seed_info)

                if doc:
                    # Upsert the document; refresh timestamps on content change.
                    try:
                        await conn.execute("""
                            INSERT INTO edu_search_documents
                                (url, canonical_url, title, content, content_hash,
                                 category, doc_type, state, law_name, paragraphs,
                                 trust_score, seed_id, last_crawled_at)
                            VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10::jsonb, $11, $12, NOW())
                            ON CONFLICT (url) DO UPDATE SET
                                title = EXCLUDED.title,
                                content = EXCLUDED.content,
                                content_hash = EXCLUDED.content_hash,
                                paragraphs = EXCLUDED.paragraphs,
                                last_crawled_at = NOW(),
                                content_updated_at = CASE
                                    WHEN edu_search_documents.content_hash != EXCLUDED.content_hash
                                    THEN NOW()
                                    ELSE edu_search_documents.content_updated_at
                                END
                        """,
                            doc.url, doc.canonical_url, doc.title, doc.content,
                            doc.content_hash, doc.category, doc.doc_type, doc.state,
                            doc.law_name,
                            # BUGFIX: str() renders the Python repr (single
                            # quotes), which the ::jsonb cast rejects; emit
                            # real JSON instead.
                            json.dumps(doc.paragraphs) if doc.paragraphs else None,
                            doc.trust_score, seed["id"],
                        )
                        stats["success"] += 1
                        logger.info(f"✓ Gecrawlt: {doc.title[:50]}...")
                    except Exception as e:
                        logger.error(f"DB-Fehler für {doc.url}: {e}")
                        stats["failed"] += 1
                else:
                    stats["failed"] += 1

                # Update the seed's crawl status either way.
                await conn.execute("""
                    UPDATE edu_search_seeds
                    SET last_crawled_at = NOW(),
                        last_crawl_status = $1
                    WHERE id = $2
                """, "success" if doc else "failed", seed["id"])

        logger.info(f"Crawl abgeschlossen: {stats}")
        return stats
|
|
|
|
|
|
# Singleton instance, created lazily by get_legal_crawler().
_crawler_instance: Optional[LegalCrawler] = None
|
|
|
|
|
|
def get_legal_crawler() -> LegalCrawler:
    """Return the process-wide LegalCrawler, creating it on first use."""
    global _crawler_instance
    if _crawler_instance is not None:
        return _crawler_instance
    _crawler_instance = LegalCrawler()
    return _crawler_instance
|