fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.
This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).
Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
290
backend/llm_gateway/services/legal_crawler.py
Normal file
290
backend/llm_gateway/services/legal_crawler.py
Normal file
@@ -0,0 +1,290 @@
|
||||
"""
|
||||
Legal Content Crawler Service.
|
||||
|
||||
Crawlt Schulgesetze und rechtliche Inhalte von den Seed-URLs
|
||||
und speichert sie in der Datenbank für den Communication-Service.
|
||||
"""
|
||||
|
||||
|
||||
|
||||
# Module-level logger named after this module (standard logging convention).
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class CrawledDocument:
    """A single document fetched and extracted by the legal crawler.

    Field meanings (as produced by ``LegalCrawler._process_html``):
    """

    url: str                          # seed URL that was requested
    canonical_url: Optional[str]      # final URL after redirects
    title: str                        # page <title> text ("" if absent)
    content: str                      # extracted main text, capped at 100k chars
    content_hash: str                 # sha256 hex digest of the full content
    category: str                     # seed category, e.g. "legal"
    doc_type: str                     # document type, e.g. "schulgesetz"
    state: Optional[str]              # German federal state, if known
    law_name: Optional[str]           # law identifier, e.g. "BayEUG"
    paragraphs: Optional[List[Dict]]  # [{"nr": "§ 1", "title": ...}, ...] or None
    trust_score: float                # seed trust boost (defaults to 0.9 upstream)
|
||||
|
||||
|
||||
class LegalCrawler:
    """Crawler for legal education content (German school laws).

    Downloads the configured seed URLs, extracts the main text plus the
    individual law paragraphs, and upserts the results into the
    ``edu_search_documents`` table consumed by the communication service.
    """

    def __init__(self, db_pool=None):
        """Create a crawler.

        Args:
            db_pool: Optional database pool. ``crawl_legal_seeds`` takes a
                pool explicitly; this attribute is kept for callers that
                configure the pool up front.
        """
        self.db_pool = db_pool
        self.user_agent = "BreakPilot-Crawler/1.0 (Educational Purpose)"
        self.timeout = 30.0
        self.rate_limit_delay = 1.0  # seconds between requests (politeness delay)

    async def crawl_url(self, url: str, seed_info: Dict) -> Optional[CrawledDocument]:
        """Crawl a single URL and extract its content.

        Args:
            url: The URL to fetch.
            seed_info: Seed metadata (keys: name, category, state, trust_boost).

        Returns:
            A CrawledDocument, or None on any error, non-200 status, or
            unsupported content type.
        """
        try:
            async with httpx.AsyncClient(
                follow_redirects=True,
                timeout=self.timeout,
                headers={"User-Agent": self.user_agent},
            ) as client:
                response = await client.get(url)

                if response.status_code != 200:
                    logger.warning(f"HTTP {response.status_code} für {url}")
                    return None

                content_type = response.headers.get("content-type", "")

                # PDF handling (some states, e.g. Saarland, publish laws as PDF).
                if "pdf" in content_type.lower():
                    return await self._process_pdf(response, url, seed_info)

                # HTML handling.
                if "html" in content_type.lower():
                    return await self._process_html(response, url, seed_info)

                logger.warning(f"Unbekannter Content-Type: {content_type} für {url}")
                return None

        except Exception as e:
            # Broad on purpose: network errors, timeouts and parse failures are
            # logged and skipped so one bad seed cannot abort a whole crawl run.
            logger.error(f"Fehler beim Crawlen von {url}: {e}")
            return None

    async def _process_html(
        self,
        response: httpx.Response,
        url: str,
        seed_info: Dict,
    ) -> Optional[CrawledDocument]:
        """Extract title, main text and law paragraphs from an HTML response.

        Returns None when no usable text content could be extracted.
        """
        html = response.text
        soup = BeautifulSoup(html, "html.parser")

        # Title from the <title> tag, empty string if absent.
        title = ""
        title_tag = soup.find("title")
        if title_tag:
            title = title_tag.get_text(strip=True)

        content = ""

        # Strategy 1: prefer a semantic <main>/<article> container.
        main = soup.find("main") or soup.find("article")
        if main:
            content = main.get_text(separator="\n", strip=True)
        else:
            # Strategy 2: fall back to <body> with boilerplate removed.
            for tag in soup.find_all(["nav", "header", "footer", "aside", "script", "style"]):
                tag.decompose()
            body = soup.find("body")
            if body:
                content = body.get_text(separator="\n", strip=True)

        if not content:
            return None

        # Extract numbered law paragraphs (for Schulgesetze).
        paragraphs = self._extract_paragraphs(soup, content)

        # Law name: prefer the seed's configured name, else mine the title.
        law_name = seed_info.get("name", "")
        if not law_name and title:
            law_patterns = [
                r"(SchulG\s+\w+)",
                r"(Schulgesetz\s+\w+)",
                r"(BayEUG)",
                r"(\w+SchulG)",
            ]
            for pattern in law_patterns:
                match = re.search(pattern, title)
                if match:
                    law_name = match.group(1)
                    break

        # Change-detection hash (sha256 hex digest is already 64 chars;
        # the slice just guards the DB column width).
        content_hash = hashlib.sha256(content.encode()).hexdigest()[:64]

        return CrawledDocument(
            url=url,
            canonical_url=str(response.url),  # final URL after redirects
            title=title,
            content=content[:100000],  # cap stored text at 100k characters
            content_hash=content_hash,
            category=seed_info.get("category", "legal"),
            doc_type="schulgesetz",
            state=seed_info.get("state"),
            law_name=law_name,
            paragraphs=paragraphs,
            trust_score=seed_info.get("trust_boost", 0.9),
        )

    async def _process_pdf(
        self,
        response: httpx.Response,
        url: str,
        seed_info: Dict,
    ) -> Optional[CrawledDocument]:
        """Process PDF content (placeholder — needs a PDF library).

        Always returns None until PDF extraction is implemented.
        """
        # TODO: PDF extraction with PyPDF2 or pdfplumber.
        logger.info(f"PDF erkannt: {url} - PDF-Extraktion noch nicht implementiert")
        return None

    def _extract_paragraphs(
        self,
        soup: BeautifulSoup,
        content: str,
    ) -> Optional[List[Dict]]:
        """Extract law paragraphs from legal text.

        Matches patterns such as ``§ 42 Titel`` / ``§ 42a Titel``.
        The ``soup`` parameter is currently unused (kept for interface
        stability); extraction works on the plain text.

        Returns:
            A list of ``{"nr": ..., "title": ...}`` dicts (max 50 entries,
            titles truncated to 200 chars), or None if nothing matched.
        """
        # Captures "§ 42a" plus the rest of the line up to the next "§".
        paragraph_pattern = r"(§\s*\d+[a-z]?)\s*([^\n§]+)"
        matches = re.findall(paragraph_pattern, content, re.MULTILINE)

        paragraphs = [
            {"nr": nr.strip(), "title": par_title.strip()[:200]}
            for nr, par_title in matches[:50]  # cap at 50 paragraphs
        ]

        return paragraphs if paragraphs else None

    async def crawl_legal_seeds(self, db_pool) -> Dict:
        """Crawl all enabled seeds of the 'legal' category.

        Args:
            db_pool: asyncpg-style pool used both to read the seeds and to
                upsert the crawled documents.

        Returns:
            Statistics dict with ``total``/``success``/``failed``/``skipped``
            counts (``skipped`` is kept for API compatibility and is
            currently never incremented).
        """
        stats = {
            "total": 0,
            "success": 0,
            "failed": 0,
            "skipped": 0,
        }

        # Load seeds from the DB.
        async with db_pool.acquire() as conn:
            seeds = await conn.fetch("""
                SELECT s.id, s.url, s.name, s.state, s.trust_boost,
                       c.name as category
                FROM edu_search_seeds s
                LEFT JOIN edu_search_categories c ON s.category_id = c.id
                WHERE c.name = 'legal' AND s.enabled = true
            """)

            stats["total"] = len(seeds)
            logger.info(f"Crawle {len(seeds)} Legal-Seeds...")

            for seed in seeds:
                # Rate limiting between requests.
                await asyncio.sleep(self.rate_limit_delay)

                seed_info = {
                    "name": seed["name"],
                    "state": seed["state"],
                    "trust_boost": seed["trust_boost"],
                    "category": seed["category"],
                }

                doc = await self.crawl_url(seed["url"], seed_info)

                if doc:
                    # Upsert the document, keyed by URL.
                    try:
                        await conn.execute("""
                            INSERT INTO edu_search_documents
                            (url, canonical_url, title, content, content_hash,
                             category, doc_type, state, law_name, paragraphs,
                             trust_score, seed_id, last_crawled_at)
                            VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10::jsonb, $11, $12, NOW())
                            ON CONFLICT (url) DO UPDATE SET
                                title = EXCLUDED.title,
                                content = EXCLUDED.content,
                                content_hash = EXCLUDED.content_hash,
                                paragraphs = EXCLUDED.paragraphs,
                                last_crawled_at = NOW(),
                                content_updated_at = CASE
                                    WHEN edu_search_documents.content_hash != EXCLUDED.content_hash
                                    THEN NOW()
                                    ELSE edu_search_documents.content_updated_at
                                END
                            """,
                            doc.url, doc.canonical_url, doc.title, doc.content,
                            doc.content_hash, doc.category, doc.doc_type, doc.state,
                            doc.law_name,
                            # BUGFIX: was str(doc.paragraphs), which yields a
                            # Python repr with single quotes — invalid JSON, so
                            # the $10::jsonb cast failed whenever paragraphs
                            # were extracted. json.dumps emits real JSON.
                            json.dumps(doc.paragraphs) if doc.paragraphs else None,
                            doc.trust_score, seed["id"]
                        )
                        stats["success"] += 1
                        logger.info(f"✓ Gecrawlt: {doc.title[:50]}...")
                    except Exception as e:
                        logger.error(f"DB-Fehler für {doc.url}: {e}")
                        stats["failed"] += 1
                else:
                    stats["failed"] += 1

                # Record the crawl outcome on the seed row.
                await conn.execute("""
                    UPDATE edu_search_seeds
                    SET last_crawled_at = NOW(),
                        last_crawl_status = $1
                    WHERE id = $2
                """, "success" if doc else "failed", seed["id"])

        logger.info(f"Crawl abgeschlossen: {stats}")
        return stats
|
||||
|
||||
|
||||
# Module-level singleton slot, populated on first access.
_crawler_instance: Optional[LegalCrawler] = None


def get_legal_crawler() -> LegalCrawler:
    """Lazily create and return the process-wide LegalCrawler instance."""
    global _crawler_instance
    instance = _crawler_instance
    if instance is None:
        instance = LegalCrawler()
        _crawler_instance = instance
    return instance
|
||||
Reference in New Issue
Block a user