feat: edu-search-service migriert, voice-service/geo-service entfernt

- edu-search-service von breakpilot-pwa nach breakpilot-lehrer kopiert (ohne vendor) - opensearch + edu-search-service in docker-compose.yml hinzugefuegt - voice-service aus docker-compose.yml entfernt (jetzt in breakpilot-core) - geo-service aus docker-compose.yml entfernt (nicht mehr benoetigt) - CI/CD: edu-search-service zu Gitea Actions und Woodpecker hinzugefuegt (Go lint, test mit go mod download, build, SBOM) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-15 18:36:38 +01:00
parent d4e1d6bab6
commit 414e0f5ec0
73 changed files with 23938 additions and 92 deletions
@@ -0,0 +1,320 @@
+#!/usr/bin/env python3
+"""
+vast.ai Profile Extractor Script
+Dieses Skript läuft auf vast.ai und extrahiert Profildaten von Universitäts-Webseiten.
+
+Verwendung auf vast.ai:
+1. Lade dieses Skript auf deine vast.ai Instanz
+2. Installiere Abhängigkeiten: pip install requests beautifulsoup4 openai
+3. Setze Umgebungsvariablen:
+   - BREAKPILOT_API_URL=http://deine-ip:8086
+   - BREAKPILOT_API_KEY=dev-key
+   - OPENAI_API_KEY=sk-...
+4. Starte: python vast_ai_extractor.py
+"""
+
+import os
+import sys
+import json
+import time
+import logging
+import requests
+from bs4 import BeautifulSoup
+from typing import Optional, Dict, Any, List
+
+# Logging setup: INFO level, lines formatted as "<time> - <level> - <message>".
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+# Configuration (read from the environment once, at import time).
+# NOTE(review): API_KEY defaults to 'dev-key', so the "not set" check in
+# main() only fires when the variable is explicitly set to an empty string.
+API_URL = os.environ.get('BREAKPILOT_API_URL', 'http://localhost:8086')
+API_KEY = os.environ.get('BREAKPILOT_API_KEY', 'dev-key')
+# An empty OPENAI_API_KEY makes extract_with_ai() use BeautifulSoup only.
+OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY', '')
+# Number of tasks requested per fetch_pending_profiles() call.
+BATCH_SIZE = 10
+SLEEP_BETWEEN_REQUESTS = 1  # Seconds between requests (respect rate limits)
+
+
+def fetch_pending_profiles(limit: int = 50) -> List[Dict]:
+    """Fetch the extraction tasks that are still pending.
+
+    Sends a GET request to ``{API_URL}/api/v1/ai/extraction/pending`` and
+    returns the list stored under the ``"tasks"`` key of the JSON response.
+
+    Args:
+        limit: Maximum number of tasks to request (sent as the ``limit``
+            query parameter).
+
+    Returns:
+        The list of task dicts. An empty list is returned when the request
+        fails, when the body is not a JSON object, or when ``"tasks"`` is
+        missing, null or not a list, so callers always receive a list.
+    """
+    try:
+        response = requests.get(
+            f"{API_URL}/api/v1/ai/extraction/pending",
+            params={"limit": limit},
+            headers={"Authorization": f"Bearer {API_KEY}"},
+            timeout=30
+        )
+        response.raise_for_status()
+        data = response.json()
+    except Exception as e:
+        # Best effort: the polling loop treats an empty list as "nothing to do".
+        logger.error(f"Fehler beim Abrufen der Profile: {e}")
+        return []
+
+    # Validate the shape explicitly instead of relying on an AttributeError.
+    tasks = data.get("tasks") if isinstance(data, dict) else data
+    if isinstance(data, dict) and tasks is None:
+        # Key missing or explicitly null: no pending work.
+        return []
+    if not isinstance(data, dict) or not isinstance(tasks, list):
+        logger.error(
+            "Fehler beim Abrufen der Profile: unerwartetes Antwortformat "
+            f"({type(data).__name__})"
+        )
+        return []
+    return tasks
+
+
+def fetch_profile_page(url: str) -> Optional[str]:
+    """Download the HTML content of a profile page.
+
+    Args:
+        url: Address of the profile page to fetch.
+
+    Returns:
+        The decoded response body, or ``None`` when the request fails or the
+        server answers with an HTTP error status (the error is logged).
+    """
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (compatible; BreakPilot-Crawler/1.0; +https://breakpilot.de)',
+        'Accept': 'text/html,application/xhtml+xml',
+        'Accept-Language': 'de-DE,de;q=0.9,en;q=0.8',
+    }
+    try:
+        response = requests.get(url, headers=headers, timeout=30)
+        response.raise_for_status()
+        # When the server sends no charset, requests decodes text/* bodies as
+        # ISO-8859-1, which garbles umlauts on UTF-8 pages. Let requests
+        # detect the encoding from the body in that case.
+        content_type = response.headers.get('Content-Type', '')
+        if 'charset' not in content_type.lower():
+            response.encoding = response.apparent_encoding
+        return response.text
+    except Exception as e:
+        logger.error(f"Fehler beim Laden von {url}: {e}")
+        return None
+
+
+def extract_with_beautifulsoup(html: str, url: str) -> Dict[str, Any]:
+    """Extract basic profile information from anchor tags (no AI involved).
+
+    Only ``<a href=...>`` elements are inspected. For each kind of link the
+    first usable match in document order wins.
+
+    Args:
+        html: HTML source of the profile page.
+        url: URL the page was loaded from; used to resolve relative
+            department links.
+
+    Returns:
+        A dict containing any subset of the keys ``email``, ``phone``,
+        ``orcid``, ``google_scholar_id``, ``researchgate_url``,
+        ``linkedin_url``, ``department_url`` and ``department_name``.
+        Keys for which nothing was found are omitted.
+    """
+    # Function-scope imports keep this block self-contained.
+    import re
+    from urllib.parse import parse_qs, unquote, urljoin, urlparse
+
+    soup = BeautifulSoup(html, 'html.parser')
+    data: Dict[str, Any] = {}
+    anchors = soup.find_all('a', href=True)
+    hrefs = [anchor['href'].strip() for anchor in anchors]
+
+    def first_href(predicate):
+        """Return the first href whose lower-cased form satisfies predicate."""
+        return next((h for h in hrefs if predicate(h.lower())), None)
+
+    # Email: first mailto: link, without the scheme and any ?subject=... part.
+    mailto = first_href(lambda h: h.startswith('mailto:'))
+    if mailto:
+        email = unquote(mailto[len('mailto:'):].split('?')[0]).strip()
+        if email:
+            data['email'] = email
+
+    # Phone: first tel: link, without the scheme.
+    tel = first_href(lambda h: h.startswith('tel:'))
+    if tel:
+        phone = unquote(tel[len('tel:'):]).strip()
+        if phone:
+            data['phone'] = phone
+
+    # ORCID: match the 16-character iD itself rather than taking the last
+    # path segment, which is '' for a trailing slash and 'orcid.org' for a
+    # plain link to the ORCID homepage.
+    orcid_pattern = re.compile(r'\d{4}-\d{4}-\d{4}-\d{3}[\dX]', re.IGNORECASE)
+    for href in hrefs:
+        if 'orcid.org' not in href.lower():
+            continue
+        match = orcid_pattern.search(href)
+        if match:
+            data['orcid'] = match.group(0).upper()
+            break
+
+    # Google Scholar: value of the "user" query parameter.
+    for href in hrefs:
+        if 'scholar.google' not in href.lower():
+            continue
+        user_ids = parse_qs(urlparse(href).query).get('user')
+        if user_ids:
+            data['google_scholar_id'] = user_ids[0]
+            break
+
+    # ResearchGate
+    researchgate = first_href(lambda h: 'researchgate.net' in h)
+    if researchgate:
+        data['researchgate_url'] = researchgate
+
+    # LinkedIn
+    linkedin = first_href(lambda h: 'linkedin.com' in h)
+    if linkedin:
+        data['linkedin_url'] = linkedin
+
+    # Department: first link whose text hints at an institute/faculty (used
+    # for hierarchy detection). urljoin resolves root-relative,
+    # page-relative and protocol-relative hrefs against the page URL.
+    keywords = ('institut', 'fakultät', 'fachbereich', 'abteilung', 'lehrstuhl')
+    for anchor in anchors:
+        href = anchor['href'].strip()
+        if not href or href.startswith('#'):
+            # Empty or fragment-only links point back at this very page.
+            continue
+        text = anchor.get_text(strip=True)
+        lowered = text.lower()
+        if not any(kw in lowered for kw in keywords):
+            continue
+        absolute = urljoin(url, href)
+        # Drops mailto:, javascript: and other non-web schemes.
+        if absolute.lower().startswith(('http://', 'https://')):
+            data['department_url'] = absolute
+            data['department_name'] = text
+            break
+
+    return data
+
+
+def _parse_ai_json(result_text: str) -> Dict[str, Any]:
+    """Parse the JSON object contained in a model reply.
+
+    Takes the span from the first ``{`` to the last ``}``, so Markdown code
+    fences or explanatory text around the object do not break parsing.
+
+    Raises:
+        ValueError: If the reply contains no ``{...}`` span or the span is
+            not valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
+    """
+    start = result_text.find('{')
+    end = result_text.rfind('}')
+    if start == -1 or end < start:
+        raise ValueError("AI-Antwort enthält kein JSON-Objekt")
+    # A span that starts with '{' and parses successfully is always a dict.
+    return json.loads(result_text[start:end + 1])
+
+
+def extract_with_ai(html: str, url: str, full_name: str) -> Dict[str, Any]:
+    """Extract structured profile data with OpenAI GPT.
+
+    The page is reduced to plain text (scripts, styles, nav, header and
+    footer removed, truncated to 8000 characters) and sent to the model,
+    which is asked to answer with a JSON object. Link-based identifiers found
+    by ``extract_with_beautifulsoup`` (ORCID, Google Scholar, ResearchGate,
+    LinkedIn) are then copied into the result.
+
+    Args:
+        html: HTML source of the profile page.
+        url: URL of the profile page (included in the prompt).
+        full_name: Name of the person the profile belongs to.
+
+    Returns:
+        The merged dict. Falls back to the pure BeautifulSoup result when no
+        ``OPENAI_API_KEY`` is configured or any step of the AI extraction
+        fails (the failure is logged).
+    """
+    if not OPENAI_API_KEY:
+        logger.warning("Kein OPENAI_API_KEY gesetzt - nutze nur BeautifulSoup")
+        return extract_with_beautifulsoup(html, url)
+
+    try:
+        # Imported lazily so the script also runs without the openai package
+        # when no API key is set.
+        import openai
+        client = openai.OpenAI(api_key=OPENAI_API_KEY)
+
+        # Reduce the HTML to the relevant text
+        soup = BeautifulSoup(html, 'html.parser')
+
+        # Remove scripts, styles and page chrome
+        for tag in soup(['script', 'style', 'nav', 'footer', 'header']):
+            tag.decompose()
+
+        text = soup.get_text(separator='\n', strip=True)
+        # Limit to 8000 characters for the API
+        text = text[:8000]
+
+        prompt = f"""Analysiere diese Universitäts-Profilseite für {full_name} und extrahiere folgende Informationen im JSON-Format:
+
+{{
+  "email": "email@uni.de oder null",
+  "phone": "Telefonnummer oder null",
+  "office": "Raum/Büro oder null",
+  "position": "Position/Titel (z.B. Wissenschaftlicher Mitarbeiter, Professorin) oder null",
+  "department_name": "Name des Instituts/der Abteilung oder null",
+  "research_interests": ["Liste", "der", "Forschungsthemen"] oder [],
+  "teaching_topics": ["Liste", "der", "Lehrveranstaltungen/Fächer"] oder [],
+  "supervisor_name": "Name des Vorgesetzten/Lehrstuhlinhabers falls erkennbar oder null"
+}}
+
+Profilseite von {url}:
+
+{text}
+
+Antworte NUR mit dem JSON-Objekt, keine Erklärungen."""
+
+        response = client.chat.completions.create(
+            model="gpt-4o-mini",  # Cheap and fast
+            messages=[{"role": "user", "content": prompt}],
+            temperature=0.1,
+            max_tokens=500
+        )
+
+        # The message content may be None; treat that as an empty reply so
+        # the parser reports it as a missing JSON object.
+        result_text = (response.choices[0].message.content or '').strip()
+        ai_data = _parse_ai_json(result_text)
+
+        # Combine with the BeautifulSoup results (for links such as ORCID)
+        bs_data = extract_with_beautifulsoup(html, url)
+
+        # AI data has priority, except for identifiers read from hrefs
+        for key in ('orcid', 'google_scholar_id', 'researchgate_url', 'linkedin_url'):
+            if bs_data.get(key):
+                ai_data[key] = bs_data[key]
+
+        return ai_data
+
+    except Exception as e:
+        # Deliberate best effort: any AI failure degrades to link extraction.
+        logger.error(f"AI-Extraktion fehlgeschlagen: {e}")
+        return extract_with_beautifulsoup(html, url)
+
+
+def submit_extracted_data(staff_id: str, data: Dict[str, Any]) -> bool:
+    """Send extracted data back to BreakPilot.
+
+    Posts ``data`` together with ``staff_id`` as JSON to
+    ``{API_URL}/api/v1/ai/extraction/submit``. Entries whose value is
+    ``None`` are left out of the payload.
+
+    Args:
+        staff_id: Identifier of the task/staff member the data belongs to.
+        data: Extracted fields.
+
+    Returns:
+        ``True`` when the server accepted the data, ``False`` when the
+        request failed or returned an HTTP error status (the error is logged).
+    """
+    try:
+        # staff_id goes last so that a "staff_id" key inside the extracted
+        # data (e.g. invented by the model) can never overwrite the real id.
+        payload = {**data, "staff_id": staff_id}
+
+        # Remove None values
+        payload = {k: v for k, v in payload.items() if v is not None}
+
+        response = requests.post(
+            f"{API_URL}/api/v1/ai/extraction/submit",
+            json=payload,
+            headers={
+                "Authorization": f"Bearer {API_KEY}",
+                "Content-Type": "application/json"
+            },
+            timeout=30
+        )
+        response.raise_for_status()
+        return True
+    except Exception as e:
+        logger.error(f"Fehler beim Senden der Daten für {staff_id}: {e}")
+        return False
+
+
+def _process_single_profile(profile: Any) -> bool:
+    """Fetch, extract and submit one task; return True only on full success."""
+    staff_id = profile.get('staff_id') if isinstance(profile, dict) else None
+    url = profile.get('profile_url') if isinstance(profile, dict) else None
+    if not staff_id or not url:
+        # A malformed task must not take the whole worker down.
+        logger.error(f"Ungültiger Task ohne staff_id/profile_url übersprungen: {profile!r}")
+        return False
+
+    full_name = profile.get('full_name', 'Unbekannt')
+    logger.info(f"Verarbeite: {full_name} - {url}")
+
+    # Load the profile page
+    html = fetch_profile_page(url)
+    if not html:
+        return False
+
+    # Extract data; an empty result is counted as an error and not submitted
+    extracted = extract_with_ai(html, url, full_name)
+    if not extracted:
+        return False
+
+    # Send the result back
+    if not submit_extracted_data(staff_id, extracted):
+        return False
+
+    logger.info(f"Erfolgreich: {full_name} - Email: {extracted.get('email', 'N/A')}")
+    return True
+
+
+def process_profiles():
+    """Main loop: fetch pending tasks, extract data, send the results back.
+
+    Runs until interrupted. Tasks are requested in batches of ``BATCH_SIZE``;
+    when none are returned the loop waits 60 seconds before asking again.
+    Running totals of successful and failed tasks are logged after each batch.
+    """
+    logger.info(f"Starte Extraktion - API: {API_URL}")
+
+    processed = 0
+    errors = 0
+
+    while True:
+        # Fetch new tasks
+        profiles = fetch_pending_profiles(limit=BATCH_SIZE)
+
+        if not profiles:
+            logger.info("Keine weiteren Profile zum Verarbeiten. Warte 60 Sekunden...")
+            time.sleep(60)
+            continue
+
+        logger.info(f"Verarbeite {len(profiles)} Profile...")
+
+        for profile in profiles:
+            if _process_single_profile(profile):
+                processed += 1
+            else:
+                errors += 1
+
+            # Rate limiting applies after failures too, so a batch of
+            # unreachable pages is not retried in a tight loop.
+            time.sleep(SLEEP_BETWEEN_REQUESTS)
+
+        logger.info(f"Batch abgeschlossen. Gesamt: {processed} erfolgreich, {errors} Fehler")
+
+
+def main():
+    """Entry point: check the configuration, probe the API, start the loop.
+
+    Exits with status 1 when ``API_KEY`` is empty, when the API cannot be
+    reached, or when the processing loop raises an unexpected exception.
+    A keyboard interrupt ends the script without an error status.
+    """
+    logger.info("=" * 60)
+    logger.info("BreakPilot vast.ai Profile Extractor")
+    logger.info("=" * 60)
+
+    # Check configuration
+    if not API_KEY:
+        logger.error("BREAKPILOT_API_KEY nicht gesetzt!")
+        sys.exit(1)
+
+    if not OPENAI_API_KEY:
+        logger.warning("OPENAI_API_KEY nicht gesetzt - nutze nur BeautifulSoup-Extraktion")
+
+    # Test the connection
+    try:
+        response = requests.get(
+            f"{API_URL}/v1/health",
+            headers={"Authorization": f"Bearer {API_KEY}"},
+            timeout=10
+        )
+    except Exception as e:
+        logger.error(f"Kann API nicht erreichen: {e}")
+        logger.error(f"Stelle sicher dass {API_URL} erreichbar ist!")
+        sys.exit(1)
+
+    if response.ok:
+        logger.info(f"API-Verbindung OK: {response.status_code}")
+    else:
+        # Reachable but unhealthy/unauthorized: do not report this as "OK".
+        # Processing still starts, as it did before.
+        logger.warning(
+            f"API erreichbar, aber Health-Check meldet Status {response.status_code}"
+        )
+
+    # Start processing
+    try:
+        process_profiles()
+    except KeyboardInterrupt:
+        logger.info("Beendet durch Benutzer")
+    except Exception as e:
+        # logger.exception logs at ERROR level and keeps the traceback.
+        logger.exception(f"Unerwarteter Fehler: {e}")
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()