fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.
This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).
Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
290
backend/llm_gateway/services/legal_crawler.py
Normal file
290
backend/llm_gateway/services/legal_crawler.py
Normal file
@@ -0,0 +1,290 @@
|
||||
"""
|
||||
Legal Content Crawler Service.
|
||||
|
||||
Crawlt Schulgesetze und rechtliche Inhalte von den Seed-URLs
|
||||
und speichert sie in der Datenbank für den Communication-Service.
|
||||
"""
|
||||
|
||||
|
||||
|
||||
# Module-level logger named after this module (standard logging convention).
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class CrawledDocument:
    """A single document fetched and extracted by the legal crawler.

    Field meanings (as produced by ``LegalCrawler._process_html``):
    """

    url: str                          # seed URL that was requested
    canonical_url: Optional[str]      # final URL after redirects
    title: str                        # page <title> text ("" if absent)
    content: str                      # extracted main text, capped at 100k chars
    content_hash: str                 # sha256 hex digest of the full content
    category: str                     # seed category, e.g. "legal"
    doc_type: str                     # document type, e.g. "schulgesetz"
    state: Optional[str]              # German federal state, if known
    law_name: Optional[str]           # law identifier, e.g. "BayEUG"
    paragraphs: Optional[List[Dict]]  # [{"nr": "§ 1", "title": ...}, ...] or None
    trust_score: float                # seed trust boost (defaults to 0.9 upstream)
|
||||
|
||||
|
||||
class LegalCrawler:
    """Crawler for legal education content (German school laws).

    Downloads the configured seed URLs, extracts the main text plus the
    individual law paragraphs, and upserts the results into the
    ``edu_search_documents`` table consumed by the communication service.
    """

    def __init__(self, db_pool=None):
        """Create a crawler.

        Args:
            db_pool: Optional database pool. ``crawl_legal_seeds`` takes a
                pool explicitly; this attribute is kept for callers that
                configure the pool up front.
        """
        self.db_pool = db_pool
        self.user_agent = "BreakPilot-Crawler/1.0 (Educational Purpose)"
        self.timeout = 30.0
        self.rate_limit_delay = 1.0  # seconds between requests (politeness delay)

    async def crawl_url(self, url: str, seed_info: Dict) -> Optional[CrawledDocument]:
        """Crawl a single URL and extract its content.

        Args:
            url: The URL to fetch.
            seed_info: Seed metadata (keys: name, category, state, trust_boost).

        Returns:
            A CrawledDocument, or None on any error, non-200 status, or
            unsupported content type.
        """
        try:
            async with httpx.AsyncClient(
                follow_redirects=True,
                timeout=self.timeout,
                headers={"User-Agent": self.user_agent},
            ) as client:
                response = await client.get(url)

                if response.status_code != 200:
                    logger.warning(f"HTTP {response.status_code} für {url}")
                    return None

                content_type = response.headers.get("content-type", "")

                # PDF handling (some states, e.g. Saarland, publish laws as PDF).
                if "pdf" in content_type.lower():
                    return await self._process_pdf(response, url, seed_info)

                # HTML handling.
                if "html" in content_type.lower():
                    return await self._process_html(response, url, seed_info)

                logger.warning(f"Unbekannter Content-Type: {content_type} für {url}")
                return None

        except Exception as e:
            # Broad on purpose: network errors, timeouts and parse failures are
            # logged and skipped so one bad seed cannot abort a whole crawl run.
            logger.error(f"Fehler beim Crawlen von {url}: {e}")
            return None

    async def _process_html(
        self,
        response: httpx.Response,
        url: str,
        seed_info: Dict,
    ) -> Optional[CrawledDocument]:
        """Extract title, main text and law paragraphs from an HTML response.

        Returns None when no usable text content could be extracted.
        """
        html = response.text
        soup = BeautifulSoup(html, "html.parser")

        # Title from the <title> tag, empty string if absent.
        title = ""
        title_tag = soup.find("title")
        if title_tag:
            title = title_tag.get_text(strip=True)

        content = ""

        # Strategy 1: prefer a semantic <main>/<article> container.
        main = soup.find("main") or soup.find("article")
        if main:
            content = main.get_text(separator="\n", strip=True)
        else:
            # Strategy 2: fall back to <body> with boilerplate removed.
            for tag in soup.find_all(["nav", "header", "footer", "aside", "script", "style"]):
                tag.decompose()
            body = soup.find("body")
            if body:
                content = body.get_text(separator="\n", strip=True)

        if not content:
            return None

        # Extract numbered law paragraphs (for Schulgesetze).
        paragraphs = self._extract_paragraphs(soup, content)

        # Law name: prefer the seed's configured name, else mine the title.
        law_name = seed_info.get("name", "")
        if not law_name and title:
            law_patterns = [
                r"(SchulG\s+\w+)",
                r"(Schulgesetz\s+\w+)",
                r"(BayEUG)",
                r"(\w+SchulG)",
            ]
            for pattern in law_patterns:
                match = re.search(pattern, title)
                if match:
                    law_name = match.group(1)
                    break

        # Change-detection hash (sha256 hex digest is already 64 chars;
        # the slice just guards the DB column width).
        content_hash = hashlib.sha256(content.encode()).hexdigest()[:64]

        return CrawledDocument(
            url=url,
            canonical_url=str(response.url),  # final URL after redirects
            title=title,
            content=content[:100000],  # cap stored text at 100k characters
            content_hash=content_hash,
            category=seed_info.get("category", "legal"),
            doc_type="schulgesetz",
            state=seed_info.get("state"),
            law_name=law_name,
            paragraphs=paragraphs,
            trust_score=seed_info.get("trust_boost", 0.9),
        )

    async def _process_pdf(
        self,
        response: httpx.Response,
        url: str,
        seed_info: Dict,
    ) -> Optional[CrawledDocument]:
        """Process PDF content (placeholder — needs a PDF library).

        Always returns None until PDF extraction is implemented.
        """
        # TODO: PDF extraction with PyPDF2 or pdfplumber.
        logger.info(f"PDF erkannt: {url} - PDF-Extraktion noch nicht implementiert")
        return None

    def _extract_paragraphs(
        self,
        soup: BeautifulSoup,
        content: str,
    ) -> Optional[List[Dict]]:
        """Extract law paragraphs from legal text.

        Matches patterns such as ``§ 42 Titel`` / ``§ 42a Titel``.
        The ``soup`` parameter is currently unused (kept for interface
        stability); extraction works on the plain text.

        Returns:
            A list of ``{"nr": ..., "title": ...}`` dicts (max 50 entries,
            titles truncated to 200 chars), or None if nothing matched.
        """
        # Captures "§ 42a" plus the rest of the line up to the next "§".
        paragraph_pattern = r"(§\s*\d+[a-z]?)\s*([^\n§]+)"
        matches = re.findall(paragraph_pattern, content, re.MULTILINE)

        paragraphs = [
            {"nr": nr.strip(), "title": par_title.strip()[:200]}
            for nr, par_title in matches[:50]  # cap at 50 paragraphs
        ]

        return paragraphs if paragraphs else None

    async def crawl_legal_seeds(self, db_pool) -> Dict:
        """Crawl all enabled seeds of the 'legal' category.

        Args:
            db_pool: asyncpg-style pool used both to read the seeds and to
                upsert the crawled documents.

        Returns:
            Statistics dict with ``total``/``success``/``failed``/``skipped``
            counts (``skipped`` is kept for API compatibility and is
            currently never incremented).
        """
        stats = {
            "total": 0,
            "success": 0,
            "failed": 0,
            "skipped": 0,
        }

        # Load seeds from the DB.
        async with db_pool.acquire() as conn:
            seeds = await conn.fetch("""
                SELECT s.id, s.url, s.name, s.state, s.trust_boost,
                       c.name as category
                FROM edu_search_seeds s
                LEFT JOIN edu_search_categories c ON s.category_id = c.id
                WHERE c.name = 'legal' AND s.enabled = true
            """)

            stats["total"] = len(seeds)
            logger.info(f"Crawle {len(seeds)} Legal-Seeds...")

            for seed in seeds:
                # Rate limiting between requests.
                await asyncio.sleep(self.rate_limit_delay)

                seed_info = {
                    "name": seed["name"],
                    "state": seed["state"],
                    "trust_boost": seed["trust_boost"],
                    "category": seed["category"],
                }

                doc = await self.crawl_url(seed["url"], seed_info)

                if doc:
                    # Upsert the document, keyed by URL.
                    try:
                        await conn.execute("""
                            INSERT INTO edu_search_documents
                            (url, canonical_url, title, content, content_hash,
                             category, doc_type, state, law_name, paragraphs,
                             trust_score, seed_id, last_crawled_at)
                            VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10::jsonb, $11, $12, NOW())
                            ON CONFLICT (url) DO UPDATE SET
                                title = EXCLUDED.title,
                                content = EXCLUDED.content,
                                content_hash = EXCLUDED.content_hash,
                                paragraphs = EXCLUDED.paragraphs,
                                last_crawled_at = NOW(),
                                content_updated_at = CASE
                                    WHEN edu_search_documents.content_hash != EXCLUDED.content_hash
                                    THEN NOW()
                                    ELSE edu_search_documents.content_updated_at
                                END
                            """,
                            doc.url, doc.canonical_url, doc.title, doc.content,
                            doc.content_hash, doc.category, doc.doc_type, doc.state,
                            doc.law_name,
                            # BUGFIX: was str(doc.paragraphs), which yields a
                            # Python repr with single quotes — invalid JSON, so
                            # the $10::jsonb cast failed whenever paragraphs
                            # were extracted. json.dumps emits real JSON.
                            json.dumps(doc.paragraphs) if doc.paragraphs else None,
                            doc.trust_score, seed["id"]
                        )
                        stats["success"] += 1
                        logger.info(f"✓ Gecrawlt: {doc.title[:50]}...")
                    except Exception as e:
                        logger.error(f"DB-Fehler für {doc.url}: {e}")
                        stats["failed"] += 1
                else:
                    stats["failed"] += 1

                # Record the crawl outcome on the seed row.
                await conn.execute("""
                    UPDATE edu_search_seeds
                    SET last_crawled_at = NOW(),
                        last_crawl_status = $1
                    WHERE id = $2
                """, "success" if doc else "failed", seed["id"])

        logger.info(f"Crawl abgeschlossen: {stats}")
        return stats
|
||||
|
||||
|
||||
# Module-level singleton slot, populated on first access.
_crawler_instance: Optional[LegalCrawler] = None


def get_legal_crawler() -> LegalCrawler:
    """Lazily create and return the process-wide LegalCrawler instance."""
    global _crawler_instance
    instance = _crawler_instance
    if instance is None:
        instance = LegalCrawler()
        _crawler_instance = instance
    return instance
|
||||
Reference in New Issue
Block a user