This repository has been archived on 2026-02-15. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
breakpilot-pwa/edu-search-service/scripts/vast_ai_extractor.py
Benjamin Admin 21a844cb8a fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.

This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).

Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-09 09:51:32 +01:00

321 lines
10 KiB
Python

#!/usr/bin/env python3
"""
vast.ai Profile Extractor Script
This script runs on vast.ai and extracts profile data from university websites.
Usage on vast.ai:
1. Upload this script to your vast.ai instance
2. Install dependencies: pip install requests beautifulsoup4 openai
3. Set environment variables:
- BREAKPILOT_API_URL=http://your-ip:8086
- BREAKPILOT_API_KEY=dev-key
- OPENAI_API_KEY=sk-...
4. Start: python vast_ai_extractor.py
"""
import os
import sys
import json
import time
import logging
import requests
from bs4 import BeautifulSoup
from typing import Optional, Dict, Any, List

# Logging setup: timestamped INFO-level messages.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Configuration (each value can be overridden via environment variables).
API_URL = os.environ.get('BREAKPILOT_API_URL', 'http://localhost:8086')
API_KEY = os.environ.get('BREAKPILOT_API_KEY', 'dev-key')
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY', '')
BATCH_SIZE = 10
SLEEP_BETWEEN_REQUESTS = 1  # seconds between requests (respect rate limits)
def fetch_pending_profiles(limit: int = 50) -> List[Dict]:
    """Ask the BreakPilot API for profiles that still await extraction.

    Args:
        limit: Maximum number of pending tasks to request.

    Returns:
        The list of pending task dicts; an empty list if the request
        fails for any reason (network error, bad status, bad JSON).
    """
    try:
        resp = requests.get(
            f"{API_URL}/api/v1/ai/extraction/pending",
            params={"limit": limit},
            headers={"Authorization": f"Bearer {API_KEY}"},
            timeout=30,
        )
        resp.raise_for_status()
        payload = resp.json()
    except Exception as exc:
        logger.error(f"Fehler beim Abrufen der Profile: {exc}")
        return []
    return payload.get("tasks", [])
def fetch_profile_page(url: str) -> Optional[str]:
    """Download a profile page and return its HTML text, or None on failure."""
    crawler_headers = {
        'User-Agent': 'Mozilla/5.0 (compatible; BreakPilot-Crawler/1.0; +https://breakpilot.de)',
        'Accept': 'text/html,application/xhtml+xml',
        'Accept-Language': 'de-DE,de;q=0.9,en;q=0.8',
    }
    try:
        resp = requests.get(url, headers=crawler_headers, timeout=30)
        resp.raise_for_status()
    except Exception as exc:
        logger.error(f"Fehler beim Laden von {url}: {exc}")
        return None
    return resp.text
def extract_with_beautifulsoup(html: str, url: str) -> Dict[str, Any]:
    """Extract basic profile information with BeautifulSoup (no AI).

    Parses mailto:/tel: links, researcher IDs (ORCID, Google Scholar),
    social profile URLs (ResearchGate, LinkedIn), and candidate
    department links from a university profile page.

    Args:
        html: Raw HTML of the profile page.
        url: URL the page was fetched from (used to resolve relative links).

    Returns:
        Dict possibly containing: email, phone, orcid, google_scholar_id,
        researchgate_url, linkedin_url, department_url, department_name.
        Keys are omitted when nothing was found.
    """
    from urllib.parse import urljoin  # stdlib; local import keeps module deps unchanged

    soup = BeautifulSoup(html, 'html.parser')
    data: Dict[str, Any] = {}

    # Email: first mailto: link, stripped of scheme and any ?subject=... query.
    email_links = soup.find_all('a', href=lambda x: x and x.startswith('mailto:'))
    if email_links:
        data['email'] = email_links[0]['href'].replace('mailto:', '').split('?')[0]

    # Phone: first tel: link.
    phone_links = soup.find_all('a', href=lambda x: x and x.startswith('tel:'))
    if phone_links:
        data['phone'] = phone_links[0]['href'].replace('tel:', '')

    # ORCID: last path segment of the first orcid.org link.
    orcid_links = soup.find_all('a', href=lambda x: x and 'orcid.org' in x)
    if orcid_links:
        orcid = orcid_links[0]['href']
        if '/' in orcid:
            data['orcid'] = orcid.split('/')[-1]

    # Google Scholar: the `user` query parameter identifies the profile.
    scholar_links = soup.find_all('a', href=lambda x: x and 'scholar.google' in x)
    if scholar_links:
        href = scholar_links[0]['href']
        if 'user=' in href:
            data['google_scholar_id'] = href.split('user=')[1].split('&')[0]

    # ResearchGate / LinkedIn: keep the raw profile URLs.
    rg_links = soup.find_all('a', href=lambda x: x and 'researchgate.net' in x)
    if rg_links:
        data['researchgate_url'] = rg_links[0]['href']
    linkedin_links = soup.find_all('a', href=lambda x: x and 'linkedin.com' in x)
    if linkedin_links:
        data['linkedin_url'] = linkedin_links[0]['href']

    # Department links (for hierarchy detection): anchors whose visible text
    # suggests an institute/faculty/department/chair.
    # FIX: resolve relative hrefs with urljoin() instead of only handling
    # root-relative ('/...') links via string concatenation, so document-relative
    # links like 'institut.html' are resolved and kept as well.
    department_links = []
    for link in soup.find_all('a', href=True):
        text = link.get_text(strip=True)
        if any(kw in text.lower() for kw in
               ['institut', 'fakultät', 'fachbereich', 'abteilung', 'lehrstuhl']):
            resolved = urljoin(url, link['href'])
            # urljoin leaves non-HTTP schemes (mailto:, javascript:) untouched,
            # so this filter still excludes them.
            if resolved.startswith('http'):
                department_links.append({'url': resolved, 'name': text})
    if department_links:
        # Use the first department-like link found on the page.
        data['department_url'] = department_links[0]['url']
        data['department_name'] = department_links[0]['name']
    return data
def extract_with_ai(html: str, url: str, full_name: str) -> Dict[str, Any]:
    """Extract structured profile data with OpenAI GPT.

    Falls back to the pure BeautifulSoup extraction when no API key is
    configured or when the AI call / JSON parsing fails.

    Args:
        html: Raw HTML of the profile page.
        url: URL of the profile page (embedded in the prompt).
        full_name: Person's name, used to focus the prompt.

    Returns:
        Dict of extracted fields; AI output merged with the link-based
        BeautifulSoup results for ORCID/Scholar/ResearchGate/LinkedIn.
    """
    if not OPENAI_API_KEY:
        logger.warning("Kein OPENAI_API_KEY gesetzt - nutze nur BeautifulSoup")
        return extract_with_beautifulsoup(html, url)
    try:
        import openai
        client = openai.OpenAI(api_key=OPENAI_API_KEY)
        # Reduce the HTML to the relevant visible text.
        soup = BeautifulSoup(html, 'html.parser')
        # Remove scripts, styles, navigation chrome, etc.
        for tag in soup(['script', 'style', 'nav', 'footer', 'header']):
            tag.decompose()
        # Extract plain text.
        text = soup.get_text(separator='\n', strip=True)
        # Limit to 8000 characters for the API.
        text = text[:8000]
        prompt = f"""Analysiere diese Universitäts-Profilseite für {full_name} und extrahiere folgende Informationen im JSON-Format:
{{
"email": "email@uni.de oder null",
"phone": "Telefonnummer oder null",
"office": "Raum/Büro oder null",
"position": "Position/Titel (z.B. Wissenschaftlicher Mitarbeiter, Professorin) oder null",
"department_name": "Name des Instituts/der Abteilung oder null",
"research_interests": ["Liste", "der", "Forschungsthemen"] oder [],
"teaching_topics": ["Liste", "der", "Lehrveranstaltungen/Fächer"] oder [],
"supervisor_name": "Name des Vorgesetzten/Lehrstuhlinhabers falls erkennbar oder null"
}}
Profilseite von {url}:
{text}
Antworte NUR mit dem JSON-Objekt, keine Erklärungen."""
        response = client.chat.completions.create(
            model="gpt-4o-mini",  # cost-effective and fast
            messages=[{"role": "user", "content": prompt}],
            temperature=0.1,
            max_tokens=500
        )
        result_text = response.choices[0].message.content.strip()
        # Parse JSON (strip any surrounding Markdown code fences).
        if result_text.startswith('```'):
            result_text = result_text.split('```')[1]
        if result_text.startswith('json'):
            result_text = result_text[4:]
        ai_data = json.loads(result_text)
        # Combine with BeautifulSoup results (for links such as ORCID).
        bs_data = extract_with_beautifulsoup(html, url)
        # AI data takes priority, but BS data wins for the specific link fields.
        for key in ['orcid', 'google_scholar_id', 'researchgate_url', 'linkedin_url']:
            if key in bs_data and bs_data[key]:
                ai_data[key] = bs_data[key]
        return ai_data
    except Exception as e:
        logger.error(f"AI-Extraktion fehlgeschlagen: {e}")
        return extract_with_beautifulsoup(html, url)
def submit_extracted_data(staff_id: str, data: Dict[str, Any]) -> bool:
    """POST extracted profile fields back to the BreakPilot API.

    Args:
        staff_id: Identifier of the staff record the data belongs to.
        data: Extracted fields to submit.

    Returns:
        True on success, False if the request failed.
    """
    merged = {"staff_id": staff_id, **data}
    # Drop None values before sending.
    body = {key: value for key, value in merged.items() if value is not None}
    try:
        resp = requests.post(
            f"{API_URL}/api/v1/ai/extraction/submit",
            json=body,
            headers={
                "Authorization": f"Bearer {API_KEY}",
                "Content-Type": "application/json",
            },
            timeout=30,
        )
        resp.raise_for_status()
    except Exception as exc:
        logger.error(f"Fehler beim Senden der Daten für {staff_id}: {exc}")
        return False
    return True
def process_profiles():
    """Main loop: fetch pending profiles, extract their data, submit results.

    Runs indefinitely. Sleeps 60s when the queue is empty and
    SLEEP_BETWEEN_REQUESTS between individual profiles.
    """
    logger.info(f"Starte Extraktion - API: {API_URL}")
    processed = 0
    errors = 0
    while True:
        # Pull the next batch of work.
        batch = fetch_pending_profiles(limit=BATCH_SIZE)
        if not batch:
            logger.info("Keine weiteren Profile zum Verarbeiten. Warte 60 Sekunden...")
            time.sleep(60)
            continue
        logger.info(f"Verarbeite {len(batch)} Profile...")
        for task in batch:
            staff_id = task['staff_id']
            url = task['profile_url']
            full_name = task.get('full_name', 'Unbekannt')
            logger.info(f"Verarbeite: {full_name} - {url}")
            # Download the profile page; count a failure and move on.
            html = fetch_profile_page(url)
            if not html:
                errors += 1
                continue
            # Extract and submit; any falsy result or failed submit is an error.
            extracted = extract_with_ai(html, url, full_name)
            if extracted and submit_extracted_data(staff_id, extracted):
                processed += 1
                logger.info(f"Erfolgreich: {full_name} - Email: {extracted.get('email', 'N/A')}")
            else:
                errors += 1
            # Rate limiting between profile requests.
            time.sleep(SLEEP_BETWEEN_REQUESTS)
        logger.info(f"Batch abgeschlossen. Gesamt: {processed} erfolgreich, {errors} Fehler")
def main():
    """Entry point: validate configuration, ping the API, then run the loop."""
    banner = "=" * 60
    logger.info(banner)
    logger.info("BreakPilot vast.ai Profile Extractor")
    logger.info(banner)
    # Configuration checks.
    if not API_KEY:
        logger.error("BREAKPILOT_API_KEY nicht gesetzt!")
        sys.exit(1)
    if not OPENAI_API_KEY:
        logger.warning("OPENAI_API_KEY nicht gesetzt - nutze nur BeautifulSoup-Extraktion")
    # Connectivity check. Any status code is accepted here; only an
    # unreachable API aborts startup.
    # NOTE(review): this uses '/v1/health' while every other call uses
    # '/api/v1/...' — confirm the asymmetry is intentional on the server side.
    try:
        response = requests.get(
            f"{API_URL}/v1/health",
            headers={"Authorization": f"Bearer {API_KEY}"},
            timeout=10,
        )
        logger.info(f"API-Verbindung OK: {response.status_code}")
    except Exception as exc:
        logger.error(f"Kann API nicht erreichen: {exc}")
        logger.error(f"Stelle sicher dass {API_URL} erreichbar ist!")
        sys.exit(1)
    # Run the extraction loop until interrupted.
    try:
        process_profiles()
    except KeyboardInterrupt:
        logger.info("Beendet durch Benutzer")
    except Exception as exc:
        logger.error(f"Unerwarteter Fehler: {exc}")
        sys.exit(1)


if __name__ == "__main__":
    main()