breakpilot-pwa/backend/llm_gateway/services/legal_crawler.py
Benjamin Admin 21a844cb8a fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.

This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).

Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-09 09:51:32 +01:00


"""
Legal Content Crawler Service.
Crawlt Schulgesetze und rechtliche Inhalte von den Seed-URLs
und speichert sie in der Datenbank für den Communication-Service.
"""
import asyncio
import hashlib
import json
import logging
import re
from datetime import datetime
from typing import Dict, List, Optional
from dataclasses import dataclass
import httpx
from bs4 import BeautifulSoup
logger = logging.getLogger(__name__)
@dataclass
class CrawledDocument:
"""Repräsentiert ein gecrawltes Dokument."""
url: str
canonical_url: Optional[str]
title: str
content: str
content_hash: str
category: str
doc_type: str
state: Optional[str]
law_name: Optional[str]
paragraphs: Optional[List[Dict]]
trust_score: float
class LegalCrawler:
"""Crawler für rechtliche Bildungsinhalte."""
def __init__(self, db_pool=None):
self.db_pool = db_pool
self.user_agent = "BreakPilot-Crawler/1.0 (Educational Purpose)"
self.timeout = 30.0
        self.rate_limit_delay = 1.0  # seconds between requests
async def crawl_url(self, url: str, seed_info: Dict) -> Optional[CrawledDocument]:
"""
Crawlt eine URL und extrahiert den Inhalt.
Args:
url: Die zu crawlende URL
seed_info: Metadaten vom Seed (category, state, trust_boost)
Returns:
CrawledDocument oder None bei Fehler
"""
try:
async with httpx.AsyncClient(
follow_redirects=True,
timeout=self.timeout,
headers={"User-Agent": self.user_agent}
) as client:
response = await client.get(url)
if response.status_code != 200:
logger.warning(f"HTTP {response.status_code} für {url}")
return None
content_type = response.headers.get("content-type", "")
                # PDF handling (for Saarland etc.)
if "pdf" in content_type.lower():
return await self._process_pdf(response, url, seed_info)
                # HTML handling
if "html" in content_type.lower():
return await self._process_html(response, url, seed_info)
logger.warning(f"Unbekannter Content-Type: {content_type} für {url}")
return None
except Exception as e:
logger.error(f"Fehler beim Crawlen von {url}: {e}")
return None
async def _process_html(
self,
response: httpx.Response,
url: str,
seed_info: Dict
) -> Optional[CrawledDocument]:
"""Verarbeitet HTML-Inhalte."""
html = response.text
soup = BeautifulSoup(html, "html.parser")
        # Extract the title
title = ""
title_tag = soup.find("title")
if title_tag:
title = title_tag.get_text(strip=True)
        # Extract the main content (multiple strategies)
        content = ""
        # Strategy 1: <main> or <article> tag
main = soup.find("main") or soup.find("article")
if main:
content = main.get_text(separator="\n", strip=True)
else:
            # Strategy 2: body without navigation, scripts, etc.
for tag in soup.find_all(["nav", "header", "footer", "aside", "script", "style"]):
tag.decompose()
body = soup.find("body")
if body:
content = body.get_text(separator="\n", strip=True)
if not content:
return None
        # Extract paragraphs (for school law texts)
        paragraphs = self._extract_paragraphs(soup, content)
        # Determine the law name
law_name = seed_info.get("name", "")
if not law_name and title:
            # Extract it from the title
law_patterns = [
r"(SchulG\s+\w+)",
r"(Schulgesetz\s+\w+)",
r"(BayEUG)",
r"(\w+SchulG)",
]
for pattern in law_patterns:
match = re.search(pattern, title)
if match:
law_name = match.group(1)
break
        # Compute the content hash
content_hash = hashlib.sha256(content.encode()).hexdigest()[:64]
return CrawledDocument(
url=url,
canonical_url=str(response.url),
title=title,
            content=content[:100000],  # max. 100k characters
content_hash=content_hash,
category=seed_info.get("category", "legal"),
doc_type="schulgesetz",
state=seed_info.get("state"),
law_name=law_name,
paragraphs=paragraphs,
trust_score=seed_info.get("trust_boost", 0.9),
)
async def _process_pdf(
self,
response: httpx.Response,
url: str,
seed_info: Dict
) -> Optional[CrawledDocument]:
"""Verarbeitet PDF-Inhalte (Placeholder - benötigt PDF-Library)."""
# TODO: PDF-Extraktion mit PyPDF2 oder pdfplumber
logger.info(f"PDF erkannt: {url} - PDF-Extraktion noch nicht implementiert")
return None
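
    # A minimal sketch of the missing PDF step above, assuming pdfplumber is the
    # chosen library (the TODO also mentions PyPDF2 as an alternative). The name
    # _extract_pdf_text is hypothetical; the import is kept local so the module
    # still loads when the optional dependency is not installed.
    def _extract_pdf_text(self, pdf_bytes: bytes) -> str:
        """Extract plain text from PDF bytes, page by page (illustrative sketch)."""
        import io
        import pdfplumber  # assumed optional dependency

        with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
            # Concatenate the text of all pages; pages without a text layer yield "".
            return "\n".join(page.extract_text() or "" for page in pdf.pages)
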
def _extract_paragraphs(
self,
soup: BeautifulSoup,
content: str
) -> Optional[List[Dict]]:
"""
Extrahiert Paragraphen aus Gesetzestexten.
Sucht nach Mustern wie:
- § 42 Titel
- Paragraph 42
"""
paragraphs = []
        # Pattern for sections such as "§ 42 Titel"
        paragraph_pattern = r"§\s*(\d+[a-z]?)\s*([^\n§]+)"
matches = re.findall(paragraph_pattern, content, re.MULTILINE)
        for nr, title in matches[:50]:  # max. 50 sections
paragraphs.append({
"nr": nr.strip(),
"title": title.strip()[:200],
})
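        # Illustrative example (hypothetical input): a content string containing
        #   "§ 1 Geltungsbereich" and "§ 2a Bildungsauftrag"
        # would yield
        #   [{"nr": "1", "title": "Geltungsbereich"},
        #    {"nr": "2a", "title": "Bildungsauftrag"}]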
return paragraphs if paragraphs else None
async def crawl_legal_seeds(self, db_pool) -> Dict:
"""
Crawlt alle Seeds der Kategorie 'legal'.
Returns:
Statistik über gecrawlte Dokumente
"""
stats = {
"total": 0,
"success": 0,
"failed": 0,
"skipped": 0,
}
        # Load the seeds from the database
async with db_pool.acquire() as conn:
seeds = await conn.fetch("""
SELECT s.id, s.url, s.name, s.state, s.trust_boost,
c.name as category
FROM edu_search_seeds s
LEFT JOIN edu_search_categories c ON s.category_id = c.id
WHERE c.name = 'legal' AND s.enabled = true
""")
stats["total"] = len(seeds)
logger.info(f"Crawle {len(seeds)} Legal-Seeds...")
for seed in seeds:
# Rate Limiting
await asyncio.sleep(self.rate_limit_delay)
seed_info = {
"name": seed["name"],
"state": seed["state"],
"trust_boost": seed["trust_boost"],
"category": seed["category"],
}
doc = await self.crawl_url(seed["url"], seed_info)
if doc:
                    # Store the document in the database
try:
await conn.execute("""
INSERT INTO edu_search_documents
(url, canonical_url, title, content, content_hash,
category, doc_type, state, law_name, paragraphs,
trust_score, seed_id, last_crawled_at)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10::jsonb, $11, $12, NOW())
ON CONFLICT (url) DO UPDATE SET
title = EXCLUDED.title,
content = EXCLUDED.content,
content_hash = EXCLUDED.content_hash,
paragraphs = EXCLUDED.paragraphs,
last_crawled_at = NOW(),
content_updated_at = CASE
WHEN edu_search_documents.content_hash != EXCLUDED.content_hash
THEN NOW()
ELSE edu_search_documents.content_updated_at
END
""",
                        doc.url, doc.canonical_url, doc.title, doc.content,
                        doc.content_hash, doc.category, doc.doc_type, doc.state,
                        doc.law_name,
                        # serialize as JSON text for the $10::jsonb cast
                        json.dumps(doc.paragraphs) if doc.paragraphs else None,
                        doc.trust_score, seed["id"]
)
stats["success"] += 1
logger.info(f"✓ Gecrawlt: {doc.title[:50]}...")
except Exception as e:
logger.error(f"DB-Fehler für {doc.url}: {e}")
stats["failed"] += 1
else:
stats["failed"] += 1
                # Update the seed status
await conn.execute("""
UPDATE edu_search_seeds
SET last_crawled_at = NOW(),
last_crawl_status = $1
WHERE id = $2
""", "success" if doc else "failed", seed["id"])
logger.info(f"Crawl abgeschlossen: {stats}")
return stats
# Singleton instance
_crawler_instance: Optional[LegalCrawler] = None
def get_legal_crawler() -> LegalCrawler:
"""Gibt die Singleton-Instanz des Legal Crawlers zurück."""
global _crawler_instance
if _crawler_instance is None:
_crawler_instance = LegalCrawler()
return _crawler_instance
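

# Minimal usage sketch (assumptions: asyncpg provides the connection pool, since
# the queries above use $1-style parameters, and the edu_search_* tables already
# exist; the helper name run_legal_crawl is hypothetical):
#
#     import asyncpg
#
#     async def run_legal_crawl(dsn: str) -> Dict:
#         pool = await asyncpg.create_pool(dsn)
#         try:
#             crawler = get_legal_crawler()
#             return await crawler.crawl_legal_seeds(pool)
#         finally:
#             await pool.close()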