This repository has been archived on 2026-02-15. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
breakpilot-pwa/edu-search-service/scripts/vast_ai_extractor.py
Benjamin Admin 21a844cb8a fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.

This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).

Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-09 09:51:32 +01:00

321 lines
10 KiB
Python

#!/usr/bin/env python3
"""
vast.ai Profile Extractor Script
This script runs on vast.ai and extracts profile data from university websites.
Usage on vast.ai:
1. Upload this script to your vast.ai instance
2. Install dependencies: pip install requests beautifulsoup4 openai
3. Set environment variables:
- BREAKPILOT_API_URL=http://your-ip:8086
- BREAKPILOT_API_KEY=dev-key
- OPENAI_API_KEY=sk-...
4. Start: python vast_ai_extractor.py
"""
import os
import sys
import json
import time
import logging
import requests
from bs4 import BeautifulSoup
from typing import Optional, Dict, Any, List

# Logging setup: timestamped INFO-level messages.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Configuration (each value can be overridden via environment variables).
API_URL = os.environ.get('BREAKPILOT_API_URL', 'http://localhost:8086')
API_KEY = os.environ.get('BREAKPILOT_API_KEY', 'dev-key')
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY', '')
BATCH_SIZE = 10
SLEEP_BETWEEN_REQUESTS = 1  # seconds between requests (respect rate limits)
def fetch_pending_profiles(limit: int = 50) -> List[Dict]:
    """Ask the BreakPilot API for profiles that still await extraction.

    Args:
        limit: Maximum number of pending tasks to request.

    Returns:
        The list of pending task dicts; an empty list if the request
        fails for any reason (network error, bad status, bad JSON).
    """
    try:
        resp = requests.get(
            f"{API_URL}/api/v1/ai/extraction/pending",
            params={"limit": limit},
            headers={"Authorization": f"Bearer {API_KEY}"},
            timeout=30,
        )
        resp.raise_for_status()
        payload = resp.json()
    except Exception as exc:
        logger.error(f"Fehler beim Abrufen der Profile: {exc}")
        return []
    return payload.get("tasks", [])
def fetch_profile_page(url: str) -> Optional[str]:
    """Download a profile page and return its HTML text, or None on failure."""
    crawler_headers = {
        'User-Agent': 'Mozilla/5.0 (compatible; BreakPilot-Crawler/1.0; +https://breakpilot.de)',
        'Accept': 'text/html,application/xhtml+xml',
        'Accept-Language': 'de-DE,de;q=0.9,en;q=0.8',
    }
    try:
        resp = requests.get(url, headers=crawler_headers, timeout=30)
        resp.raise_for_status()
    except Exception as exc:
        logger.error(f"Fehler beim Laden von {url}: {exc}")
        return None
    return resp.text
def extract_with_beautifulsoup(html: str, url: str) -> Dict[str, Any]:
    """Extract basic profile information with BeautifulSoup (no AI).

    Parses mailto:/tel: links, researcher IDs (ORCID, Google Scholar),
    social profile URLs (ResearchGate, LinkedIn), and candidate
    department links from a university profile page.

    Args:
        html: Raw HTML of the profile page.
        url: URL the page was fetched from (used to resolve relative links).

    Returns:
        Dict possibly containing: email, phone, orcid, google_scholar_id,
        researchgate_url, linkedin_url, department_url, department_name.
        Keys are omitted when nothing was found.
    """
    from urllib.parse import urljoin  # stdlib; local import keeps module deps unchanged

    soup = BeautifulSoup(html, 'html.parser')
    data: Dict[str, Any] = {}

    # Email: first mailto: link, stripped of scheme and any ?subject=... query.
    email_links = soup.find_all('a', href=lambda x: x and x.startswith('mailto:'))
    if email_links:
        data['email'] = email_links[0]['href'].replace('mailto:', '').split('?')[0]

    # Phone: first tel: link.
    phone_links = soup.find_all('a', href=lambda x: x and x.startswith('tel:'))
    if phone_links:
        data['phone'] = phone_links[0]['href'].replace('tel:', '')

    # ORCID: last path segment of the first orcid.org link.
    orcid_links = soup.find_all('a', href=lambda x: x and 'orcid.org' in x)
    if orcid_links:
        orcid = orcid_links[0]['href']
        if '/' in orcid:
            data['orcid'] = orcid.split('/')[-1]

    # Google Scholar: the `user` query parameter identifies the profile.
    scholar_links = soup.find_all('a', href=lambda x: x and 'scholar.google' in x)
    if scholar_links:
        href = scholar_links[0]['href']
        if 'user=' in href:
            data['google_scholar_id'] = href.split('user=')[1].split('&')[0]

    # ResearchGate / LinkedIn: keep the raw profile URLs.
    rg_links = soup.find_all('a', href=lambda x: x and 'researchgate.net' in x)
    if rg_links:
        data['researchgate_url'] = rg_links[0]['href']
    linkedin_links = soup.find_all('a', href=lambda x: x and 'linkedin.com' in x)
    if linkedin_links:
        data['linkedin_url'] = linkedin_links[0]['href']

    # Department links (for hierarchy detection): anchors whose visible text
    # suggests an institute/faculty/department/chair.
    # FIX: resolve relative hrefs with urljoin() instead of only handling
    # root-relative ('/...') links via string concatenation, so document-relative
    # links like 'institut.html' are resolved and kept as well.
    department_links = []
    for link in soup.find_all('a', href=True):
        text = link.get_text(strip=True)
        if any(kw in text.lower() for kw in
               ['institut', 'fakultät', 'fachbereich', 'abteilung', 'lehrstuhl']):
            resolved = urljoin(url, link['href'])
            # urljoin leaves non-HTTP schemes (mailto:, javascript:) untouched,
            # so this filter still excludes them.
            if resolved.startswith('http'):
                department_links.append({'url': resolved, 'name': text})
    if department_links:
        # Use the first department-like link found on the page.
        data['department_url'] = department_links[0]['url']
        data['department_name'] = department_links[0]['name']
    return data
def extract_with_ai(html: str, url: str, full_name: str) -> Dict[str, Any]:
    """Extract structured profile data with OpenAI GPT.

    Falls back to the pure BeautifulSoup extraction when no API key is
    configured or when the AI call / JSON parsing fails.

    Args:
        html: Raw HTML of the profile page.
        url: URL of the profile page (embedded in the prompt).
        full_name: Person's name, used to focus the prompt.

    Returns:
        Dict of extracted fields; AI output merged with the link-based
        BeautifulSoup results for ORCID/Scholar/ResearchGate/LinkedIn.
    """
    if not OPENAI_API_KEY:
        logger.warning("Kein OPENAI_API_KEY gesetzt - nutze nur BeautifulSoup")
        return extract_with_beautifulsoup(html, url)
    try:
        import openai
        client = openai.OpenAI(api_key=OPENAI_API_KEY)
        # Reduce the HTML to the relevant visible text.
        soup = BeautifulSoup(html, 'html.parser')
        # Remove scripts, styles, navigation chrome, etc.
        for tag in soup(['script', 'style', 'nav', 'footer', 'header']):
            tag.decompose()
        # Extract plain text.
        text = soup.get_text(separator='\n', strip=True)
        # Limit to 8000 characters for the API.
        text = text[:8000]
        prompt = f"""Analysiere diese Universitäts-Profilseite für {full_name} und extrahiere folgende Informationen im JSON-Format:
{{
"email": "email@uni.de oder null",
"phone": "Telefonnummer oder null",
"office": "Raum/Büro oder null",
"position": "Position/Titel (z.B. Wissenschaftlicher Mitarbeiter, Professorin) oder null",
"department_name": "Name des Instituts/der Abteilung oder null",
"research_interests": ["Liste", "der", "Forschungsthemen"] oder [],
"teaching_topics": ["Liste", "der", "Lehrveranstaltungen/Fächer"] oder [],
"supervisor_name": "Name des Vorgesetzten/Lehrstuhlinhabers falls erkennbar oder null"
}}
Profilseite von {url}:
{text}
Antworte NUR mit dem JSON-Objekt, keine Erklärungen."""
        response = client.chat.completions.create(
            model="gpt-4o-mini",  # cost-effective and fast
            messages=[{"role": "user", "content": prompt}],
            temperature=0.1,
            max_tokens=500
        )
        result_text = response.choices[0].message.content.strip()
        # Parse JSON (strip any surrounding Markdown code fences).
        if result_text.startswith('```'):
            result_text = result_text.split('```')[1]
        if result_text.startswith('json'):
            result_text = result_text[4:]
        ai_data = json.loads(result_text)
        # Combine with BeautifulSoup results (for links such as ORCID).
        bs_data = extract_with_beautifulsoup(html, url)
        # AI data takes priority, but BS data wins for the specific link fields.
        for key in ['orcid', 'google_scholar_id', 'researchgate_url', 'linkedin_url']:
            if key in bs_data and bs_data[key]:
                ai_data[key] = bs_data[key]
        return ai_data
    except Exception as e:
        logger.error(f"AI-Extraktion fehlgeschlagen: {e}")
        return extract_with_beautifulsoup(html, url)
def submit_extracted_data(staff_id: str, data: Dict[str, Any]) -> bool:
    """POST extracted profile fields back to the BreakPilot API.

    Args:
        staff_id: Identifier of the staff record the data belongs to.
        data: Extracted fields to submit.

    Returns:
        True on success, False if the request failed.
    """
    merged = {"staff_id": staff_id, **data}
    # Drop None values before sending.
    body = {key: value for key, value in merged.items() if value is not None}
    try:
        resp = requests.post(
            f"{API_URL}/api/v1/ai/extraction/submit",
            json=body,
            headers={
                "Authorization": f"Bearer {API_KEY}",
                "Content-Type": "application/json",
            },
            timeout=30,
        )
        resp.raise_for_status()
    except Exception as exc:
        logger.error(f"Fehler beim Senden der Daten für {staff_id}: {exc}")
        return False
    return True
def process_profiles():
    """Main loop: fetch pending profiles, extract their data, submit results.

    Runs indefinitely. Sleeps 60s when the queue is empty and
    SLEEP_BETWEEN_REQUESTS between individual profiles.
    """
    logger.info(f"Starte Extraktion - API: {API_URL}")
    processed = 0
    errors = 0
    while True:
        # Pull the next batch of work.
        batch = fetch_pending_profiles(limit=BATCH_SIZE)
        if not batch:
            logger.info("Keine weiteren Profile zum Verarbeiten. Warte 60 Sekunden...")
            time.sleep(60)
            continue
        logger.info(f"Verarbeite {len(batch)} Profile...")
        for task in batch:
            staff_id = task['staff_id']
            url = task['profile_url']
            full_name = task.get('full_name', 'Unbekannt')
            logger.info(f"Verarbeite: {full_name} - {url}")
            # Download the profile page; count a failure and move on.
            html = fetch_profile_page(url)
            if not html:
                errors += 1
                continue
            # Extract and submit; any falsy result or failed submit is an error.
            extracted = extract_with_ai(html, url, full_name)
            if extracted and submit_extracted_data(staff_id, extracted):
                processed += 1
                logger.info(f"Erfolgreich: {full_name} - Email: {extracted.get('email', 'N/A')}")
            else:
                errors += 1
            # Rate limiting between profile requests.
            time.sleep(SLEEP_BETWEEN_REQUESTS)
        logger.info(f"Batch abgeschlossen. Gesamt: {processed} erfolgreich, {errors} Fehler")
def main():
    """Entry point: validate configuration, ping the API, then run the loop."""
    banner = "=" * 60
    logger.info(banner)
    logger.info("BreakPilot vast.ai Profile Extractor")
    logger.info(banner)
    # Configuration checks.
    if not API_KEY:
        logger.error("BREAKPILOT_API_KEY nicht gesetzt!")
        sys.exit(1)
    if not OPENAI_API_KEY:
        logger.warning("OPENAI_API_KEY nicht gesetzt - nutze nur BeautifulSoup-Extraktion")
    # Connectivity check. Any status code is accepted here; only an
    # unreachable API aborts startup.
    # NOTE(review): this uses '/v1/health' while every other call uses
    # '/api/v1/...' — confirm the asymmetry is intentional on the server side.
    try:
        response = requests.get(
            f"{API_URL}/v1/health",
            headers={"Authorization": f"Bearer {API_KEY}"},
            timeout=10,
        )
        logger.info(f"API-Verbindung OK: {response.status_code}")
    except Exception as exc:
        logger.error(f"Kann API nicht erreichen: {exc}")
        logger.error(f"Stelle sicher dass {API_URL} erreichbar ist!")
        sys.exit(1)
    # Run the extraction loop until interrupted.
    try:
        process_profiles()
    except KeyboardInterrupt:
        logger.info("Beendet durch Benutzer")
    except Exception as exc:
        logger.error(f"Unerwarteter Fehler: {exc}")
        sys.exit(1)


if __name__ == "__main__":
    main()