""" Legal Content Crawler Service. Crawlt Schulgesetze und rechtliche Inhalte von den Seed-URLs und speichert sie in der Datenbank für den Communication-Service. """ import asyncio import hashlib import logging import re from datetime import datetime from typing import Dict, List, Optional from dataclasses import dataclass import httpx from bs4 import BeautifulSoup logger = logging.getLogger(__name__) @dataclass class CrawledDocument: """Repräsentiert ein gecrawltes Dokument.""" url: str canonical_url: Optional[str] title: str content: str content_hash: str category: str doc_type: str state: Optional[str] law_name: Optional[str] paragraphs: Optional[List[Dict]] trust_score: float class LegalCrawler: """Crawler für rechtliche Bildungsinhalte.""" def __init__(self, db_pool=None): self.db_pool = db_pool self.user_agent = "BreakPilot-Crawler/1.0 (Educational Purpose)" self.timeout = 30.0 self.rate_limit_delay = 1.0 # Sekunden zwischen Requests async def crawl_url(self, url: str, seed_info: Dict) -> Optional[CrawledDocument]: """ Crawlt eine URL und extrahiert den Inhalt. Args: url: Die zu crawlende URL seed_info: Metadaten vom Seed (category, state, trust_boost) Returns: CrawledDocument oder None bei Fehler """ try: async with httpx.AsyncClient( follow_redirects=True, timeout=self.timeout, headers={"User-Agent": self.user_agent} ) as client: response = await client.get(url) if response.status_code != 200: logger.warning(f"HTTP {response.status_code} für {url}") return None content_type = response.headers.get("content-type", "") # PDF-Handling (für Saarland etc.) if "pdf" in content_type.lower(): return await self._process_pdf(response, url, seed_info) # HTML-Handling if "html" in content_type.lower(): return await self._process_html(response, url, seed_info) logger.warning(f"Unbekannter Content-Type: {content_type} für {url}") return None except Exception as e: logger.error(f"Fehler beim Crawlen von {url}: {e}") return None async def _process_html( self, response: httpx.Response, url: str, seed_info: Dict ) -> Optional[CrawledDocument]: """Verarbeitet HTML-Inhalte.""" html = response.text soup = BeautifulSoup(html, "html.parser") # Titel extrahieren title = "" title_tag = soup.find("title") if title_tag: title = title_tag.get_text(strip=True) # Haupt-Content extrahieren (verschiedene Strategien) content = "" # Strategie 1: main oder article Tag main = soup.find("main") or soup.find("article") if main: content = main.get_text(separator="\n", strip=True) else: # Strategie 2: Body ohne Navigation etc. 
for tag in soup.find_all(["nav", "header", "footer", "aside", "script", "style"]): tag.decompose() body = soup.find("body") if body: content = body.get_text(separator="\n", strip=True) if not content: return None # Paragraphen extrahieren (für Schulgesetze) paragraphs = self._extract_paragraphs(soup, content) # Law name ermitteln law_name = seed_info.get("name", "") if not law_name and title: # Aus Titel extrahieren law_patterns = [ r"(SchulG\s+\w+)", r"(Schulgesetz\s+\w+)", r"(BayEUG)", r"(\w+SchulG)", ] for pattern in law_patterns: match = re.search(pattern, title) if match: law_name = match.group(1) break # Content Hash berechnen content_hash = hashlib.sha256(content.encode()).hexdigest()[:64] return CrawledDocument( url=url, canonical_url=str(response.url), title=title, content=content[:100000], # Max 100k Zeichen content_hash=content_hash, category=seed_info.get("category", "legal"), doc_type="schulgesetz", state=seed_info.get("state"), law_name=law_name, paragraphs=paragraphs, trust_score=seed_info.get("trust_boost", 0.9), ) async def _process_pdf( self, response: httpx.Response, url: str, seed_info: Dict ) -> Optional[CrawledDocument]: """Verarbeitet PDF-Inhalte (Placeholder - benötigt PDF-Library).""" # TODO: PDF-Extraktion mit PyPDF2 oder pdfplumber logger.info(f"PDF erkannt: {url} - PDF-Extraktion noch nicht implementiert") return None def _extract_paragraphs( self, soup: BeautifulSoup, content: str ) -> Optional[List[Dict]]: """ Extrahiert Paragraphen aus Gesetzestexten. Sucht nach Mustern wie: - § 42 Titel - Paragraph 42 """ paragraphs = [] # Pattern für Paragraphen paragraph_pattern = r"(§\s*\d+[a-z]?)\s*([^\n§]+)" matches = re.findall(paragraph_pattern, content, re.MULTILINE) for nr, title in matches[:50]: # Max 50 Paragraphen paragraphs.append({ "nr": nr.strip(), "title": title.strip()[:200], }) return paragraphs if paragraphs else None async def crawl_legal_seeds(self, db_pool) -> Dict: """ Crawlt alle Seeds der Kategorie 'legal'. 
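
    # A possible shape for the PDF path that the TODO in _process_pdf leaves
    # open: a minimal sketch, assuming pdfplumber (one of the libraries named
    # there) is available. The helper name _extract_pdf_text is hypothetical
    # and not wired into the service yet.
    def _extract_pdf_text(self, pdf_bytes: bytes) -> str:
        """Extracts plain text from PDF bytes via pdfplumber (sketch)."""
        import io

        import pdfplumber  # optional dependency, only needed for PDF seeds

        text_parts: List[str] = []
        with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    text_parts.append(page_text)
        return "\n".join(text_parts)
    # _process_pdf could call this with response.content and then build the
    # CrawledDocument the same way _process_html does.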

    async def crawl_legal_seeds(self, db_pool) -> Dict:
        """
        Crawls all seeds in the 'legal' category.

        Returns:
            Statistics about the crawled documents
        """
        stats = {
            "total": 0,
            "success": 0,
            "failed": 0,
            "skipped": 0,
        }

        # Load the seeds from the DB
        async with db_pool.acquire() as conn:
            seeds = await conn.fetch("""
                SELECT s.id, s.url, s.name, s.state, s.trust_boost,
                       c.name as category
                FROM edu_search_seeds s
                LEFT JOIN edu_search_categories c ON s.category_id = c.id
                WHERE c.name = 'legal' AND s.enabled = true
            """)

            stats["total"] = len(seeds)
            logger.info(f"Crawling {len(seeds)} legal seeds...")

            for seed in seeds:
                # Rate limiting
                await asyncio.sleep(self.rate_limit_delay)

                seed_info = {
                    "name": seed["name"],
                    "state": seed["state"],
                    "trust_boost": seed["trust_boost"],
                    "category": seed["category"],
                }

                doc = await self.crawl_url(seed["url"], seed_info)

                if doc:
                    # Store the document in the DB
                    try:
                        await conn.execute("""
                            INSERT INTO edu_search_documents
                                (url, canonical_url, title, content, content_hash,
                                 category, doc_type, state, law_name, paragraphs,
                                 trust_score, seed_id, last_crawled_at)
                            VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10::jsonb,
                                    $11, $12, NOW())
                            ON CONFLICT (url) DO UPDATE SET
                                title = EXCLUDED.title,
                                content = EXCLUDED.content,
                                content_hash = EXCLUDED.content_hash,
                                paragraphs = EXCLUDED.paragraphs,
                                last_crawled_at = NOW(),
                                content_updated_at = CASE
                                    WHEN edu_search_documents.content_hash != EXCLUDED.content_hash
                                        THEN NOW()
                                    ELSE edu_search_documents.content_updated_at
                                END
                        """,
                            doc.url, doc.canonical_url, doc.title, doc.content,
                            doc.content_hash, doc.category, doc.doc_type, doc.state,
                            doc.law_name,
                            # jsonb needs valid JSON, so serialize with json.dumps
                            # instead of str()
                            json.dumps(doc.paragraphs) if doc.paragraphs else None,
                            doc.trust_score, seed["id"],
                        )
                        stats["success"] += 1
                        logger.info(f"✓ Crawled: {doc.title[:50]}...")
                    except Exception as e:
                        logger.error(f"DB error for {doc.url}: {e}")
                        stats["failed"] += 1
                else:
                    stats["failed"] += 1

                # Update the seed status
                await conn.execute("""
                    UPDATE edu_search_seeds
                    SET last_crawled_at = NOW(), last_crawl_status = $1
                    WHERE id = $2
                """, "success" if doc else "failed", seed["id"])

        logger.info(f"Crawl finished: {stats}")
        return stats


# Singleton instance
_crawler_instance: Optional[LegalCrawler] = None


def get_legal_crawler() -> LegalCrawler:
    """Returns the singleton instance of the legal crawler."""
    global _crawler_instance
    if _crawler_instance is None:
        _crawler_instance = LegalCrawler()
    return _crawler_instance
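
# Minimal standalone entry point: a sketch for running one crawl pass over
# the legal seeds. Assumes the pool is created with asyncpg (matching the
# conn.fetch/conn.execute calls above); the DSN is a placeholder to adjust
# to the actual environment.
if __name__ == "__main__":
    import asyncpg

    async def _main() -> None:
        pool = await asyncpg.create_pool(
            dsn="postgresql://user:password@localhost:5432/breakpilot"  # placeholder
        )
        try:
            crawler = get_legal_crawler()
            stats = await crawler.crawl_legal_seeds(pool)
            print(stats)
        finally:
            await pool.close()

    logging.basicConfig(level=logging.INFO)
    asyncio.run(_main())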