A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.
This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).
Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
321 lines
10 KiB
Python
321 lines
10 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
vast.ai Profile Extractor Script
|
|
Dieses Skript läuft auf vast.ai und extrahiert Profildaten von Universitäts-Webseiten.
|
|
|
|
Verwendung auf vast.ai:
|
|
1. Lade dieses Skript auf deine vast.ai Instanz
|
|
2. Installiere Abhängigkeiten: pip install requests beautifulsoup4 openai
|
|
3. Setze Umgebungsvariablen:
|
|
- BREAKPILOT_API_URL=http://deine-ip:8086
|
|
- BREAKPILOT_API_KEY=dev-key
|
|
- OPENAI_API_KEY=sk-...
|
|
4. Starte: python vast_ai_extractor.py
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import json
|
|
import time
|
|
import logging
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
from typing import Optional, Dict, Any, List
|
|
|
|
# Logging setup: timestamped INFO-level messages via the root handler.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Configuration, read once at import time from the environment.
# Base URL of the BreakPilot backend (e.g. http://your-ip:8086).
API_URL = os.environ.get('BREAKPILOT_API_URL', 'http://localhost:8086')
# Bearer token for the BreakPilot API; falls back to the dev key.
API_KEY = os.environ.get('BREAKPILOT_API_KEY', 'dev-key')
# OpenAI key; when empty, extraction falls back to BeautifulSoup only.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY', '')
# How many pending profiles to request per batch.
BATCH_SIZE = 10
SLEEP_BETWEEN_REQUESTS = 1  # seconds between requests (respect rate limits)
|
|
|
|
|
|
def fetch_pending_profiles(limit: int = 50) -> List[Dict]:
    """Fetch profiles that still need extraction from the BreakPilot API.

    Args:
        limit: Maximum number of pending tasks to request.

    Returns:
        List of task dicts from the API, or an empty list on any error.
    """
    endpoint = f"{API_URL}/api/v1/ai/extraction/pending"
    try:
        resp = requests.get(
            endpoint,
            params={"limit": limit},
            headers={"Authorization": f"Bearer {API_KEY}"},
            timeout=30,
        )
        resp.raise_for_status()
        # The API wraps the task list under the "tasks" key.
        return resp.json().get("tasks", [])
    except Exception as e:
        # Best-effort: log and return an empty batch so the caller can retry.
        logger.error(f"Fehler beim Abrufen der Profile: {e}")
        return []
|
|
|
|
|
|
def fetch_profile_page(url: str) -> Optional[str]:
    """Download the HTML body of a profile page.

    Args:
        url: Absolute URL of the profile page.

    Returns:
        The page text on success, or None if the request failed.
    """
    # Identify the crawler politely and prefer German content.
    crawler_headers = {
        'User-Agent': 'Mozilla/5.0 (compatible; BreakPilot-Crawler/1.0; +https://breakpilot.de)',
        'Accept': 'text/html,application/xhtml+xml',
        'Accept-Language': 'de-DE,de;q=0.9,en;q=0.8',
    }
    try:
        resp = requests.get(url, headers=crawler_headers, timeout=30)
        resp.raise_for_status()
        return resp.text
    except Exception as e:
        logger.error(f"Fehler beim Laden von {url}: {e}")
        return None
|
|
|
|
|
|
def extract_with_beautifulsoup(html: str, url: str) -> Dict[str, Any]:
    """Extract basic profile information with BeautifulSoup (no AI).

    Looks for mailto:/tel: links, ORCID / Google Scholar / ResearchGate /
    LinkedIn profile links, and the first anchor whose text suggests an
    institute or department (used downstream for hierarchy detection).

    Args:
        html: Raw HTML of the profile page.
        url: The page's own URL, used to resolve relative department links.

    Returns:
        Dict that may contain: email, phone, orcid, google_scholar_id,
        researchgate_url, linkedin_url, department_url, department_name.
        Keys are present only when the corresponding link was found.
    """
    soup = BeautifulSoup(html, 'html.parser')
    data: Dict[str, Any] = {}

    def first_href(predicate) -> Optional[str]:
        # href of the first <a> whose href satisfies *predicate*, else None.
        for link in soup.find_all('a', href=lambda x: x and predicate(x)):
            return link['href']
        return None

    # Email: mailto: link, stripped of the scheme and any ?subject=... suffix.
    mailto = first_href(lambda h: h.startswith('mailto:'))
    if mailto:
        data['email'] = mailto.replace('mailto:', '').split('?')[0]

    # Phone: tel: link.
    tel = first_href(lambda h: h.startswith('tel:'))
    if tel:
        data['phone'] = tel.replace('tel:', '')

    # ORCID: the ID is the last path segment of an orcid.org URL.
    orcid = first_href(lambda h: 'orcid.org' in h)
    if orcid and '/' in orcid:
        data['orcid'] = orcid.split('/')[-1]

    # Google Scholar: user ID from the user= query parameter.
    scholar = first_href(lambda h: 'scholar.google' in h)
    if scholar and 'user=' in scholar:
        data['google_scholar_id'] = scholar.split('user=')[1].split('&')[0]

    # ResearchGate / LinkedIn: keep the full profile URL.
    researchgate = first_href(lambda h: 'researchgate.net' in h)
    if researchgate:
        data['researchgate_url'] = researchgate

    linkedin = first_href(lambda h: 'linkedin.com' in h)
    if linkedin:
        data['linkedin_url'] = linkedin

    # Institute/department link (for hierarchy detection): first anchor whose
    # visible text mentions an institute/faculty/department keyword.
    base_domain = '/'.join(url.split('/')[:3])  # scheme://host
    keywords = ('institut', 'fakultät', 'fachbereich', 'abteilung', 'lehrstuhl')
    for link in soup.find_all('a', href=True):
        href = link['href']
        text = link.get_text(strip=True)
        if not any(kw in text.lower() for kw in keywords):
            continue
        if href.startswith('//'):
            # Protocol-relative URL: inherit the page's scheme.
            # (Previously this was mis-joined as base_domain + '//host/...'.)
            href = url.split(':', 1)[0] + ':' + href
        elif href.startswith('/'):
            href = base_domain + href
        if href.startswith('http'):
            # First match wins, matching the original behavior.
            data['department_url'] = href
            data['department_name'] = text
            break

    return data
|
|
|
|
|
|
def extract_with_ai(html: str, url: str, full_name: str) -> Dict[str, Any]:
    """Extract structured profile data with OpenAI GPT.

    Falls back to plain BeautifulSoup extraction when no OPENAI_API_KEY is
    configured or when any step of the AI path raises.

    Args:
        html: Raw HTML of the profile page.
        url: Profile page URL (included in the prompt for context).
        full_name: The person's name, used to focus the prompt.

    Returns:
        Dict of extracted fields: the AI's JSON answer, with link-derived
        fields (ORCID etc.) overridden by the BeautifulSoup results; or the
        BeautifulSoup-only result on fallback.
    """
    if not OPENAI_API_KEY:
        logger.warning("Kein OPENAI_API_KEY gesetzt - nutze nur BeautifulSoup")
        return extract_with_beautifulsoup(html, url)

    try:
        # Imported lazily so the script runs without openai installed
        # when no API key is configured.
        import openai
        client = openai.OpenAI(api_key=OPENAI_API_KEY)

        # Reduce the HTML to relevant text before prompting.
        soup = BeautifulSoup(html, 'html.parser')

        # Remove scripts, styles and page chrome.
        for tag in soup(['script', 'style', 'nav', 'footer', 'header']):
            tag.decompose()

        # Extract the visible text.
        text = soup.get_text(separator='\n', strip=True)
        # Cap at 8000 characters to bound prompt size for the API.
        text = text[:8000]

        prompt = f"""Analysiere diese Universitäts-Profilseite für {full_name} und extrahiere folgende Informationen im JSON-Format:

{{
"email": "email@uni.de oder null",
"phone": "Telefonnummer oder null",
"office": "Raum/Büro oder null",
"position": "Position/Titel (z.B. Wissenschaftlicher Mitarbeiter, Professorin) oder null",
"department_name": "Name des Instituts/der Abteilung oder null",
"research_interests": ["Liste", "der", "Forschungsthemen"] oder [],
"teaching_topics": ["Liste", "der", "Lehrveranstaltungen/Fächer"] oder [],
"supervisor_name": "Name des Vorgesetzten/Lehrstuhlinhabers falls erkennbar oder null"
}}

Profilseite von {url}:

{text}

Antworte NUR mit dem JSON-Objekt, keine Erklärungen."""

        response = client.chat.completions.create(
            model="gpt-4o-mini",  # inexpensive and fast
            messages=[{"role": "user", "content": prompt}],
            temperature=0.1,
            max_tokens=500
        )

        result_text = response.choices[0].message.content.strip()

        # Parse the JSON (strip any Markdown code fences the model may add).
        if result_text.startswith('```'):
            result_text = result_text.split('```')[1]
            if result_text.startswith('json'):
                result_text = result_text[4:]

        ai_data = json.loads(result_text)

        # Combine with BeautifulSoup results (for link fields such as ORCID).
        bs_data = extract_with_beautifulsoup(html, url)

        # AI data takes priority overall, but BS data wins for specific links.
        for key in ['orcid', 'google_scholar_id', 'researchgate_url', 'linkedin_url']:
            if key in bs_data and bs_data[key]:
                ai_data[key] = bs_data[key]

        return ai_data

    except Exception as e:
        logger.error(f"AI-Extraktion fehlgeschlagen: {e}")
        return extract_with_beautifulsoup(html, url)
|
|
|
|
|
|
def submit_extracted_data(staff_id: str, data: Dict[str, Any]) -> bool:
    """Send extracted fields back to BreakPilot.

    Args:
        staff_id: ID of the staff member the data belongs to.
        data: Extracted fields; None values are dropped before sending.

    Returns:
        True when the API accepted the submission, False otherwise.
    """
    try:
        # Merge the staff id into the payload and drop empty (None) fields.
        payload = {
            k: v
            for k, v in {"staff_id": staff_id, **data}.items()
            if v is not None
        }

        resp = requests.post(
            f"{API_URL}/api/v1/ai/extraction/submit",
            json=payload,
            headers={
                "Authorization": f"Bearer {API_KEY}",
                "Content-Type": "application/json"
            },
            timeout=30,
        )
        resp.raise_for_status()
        return True
    except Exception as e:
        logger.error(f"Fehler beim Senden der Daten für {staff_id}: {e}")
        return False
|
|
|
|
|
|
def process_profiles():
    """Main loop: fetch pending profiles, extract their data, submit results.

    Runs forever; sleeps 60 s when the queue is empty and pauses
    SLEEP_BETWEEN_REQUESTS seconds between profile fetches.
    """
    logger.info(f"Starte Extraktion - API: {API_URL}")

    processed = 0
    errors = 0

    while True:
        batch = fetch_pending_profiles(limit=BATCH_SIZE)

        # Empty queue: wait before polling again.
        if not batch:
            logger.info("Keine weiteren Profile zum Verarbeiten. Warte 60 Sekunden...")
            time.sleep(60)
            continue

        logger.info(f"Verarbeite {len(batch)} Profile...")

        for entry in batch:
            staff_id = entry['staff_id']
            url = entry['profile_url']
            full_name = entry.get('full_name', 'Unbekannt')

            logger.info(f"Verarbeite: {full_name} - {url}")

            # Download the profile page; skip (without sleeping) on failure.
            page = fetch_profile_page(url)
            if not page:
                errors += 1
                continue

            # Extract and submit; both steps must succeed to count.
            extracted = extract_with_ai(page, url, full_name)
            if extracted and submit_extracted_data(staff_id, extracted):
                processed += 1
                logger.info(f"Erfolgreich: {full_name} - Email: {extracted.get('email', 'N/A')}")
            else:
                errors += 1

            # Rate limiting between page fetches.
            time.sleep(SLEEP_BETWEEN_REQUESTS)

        logger.info(f"Batch abgeschlossen. Gesamt: {processed} erfolgreich, {errors} Fehler")
|
|
|
|
|
|
def main():
    """Entry point: validate configuration, probe the API, run the loop."""
    banner = "=" * 60
    logger.info(banner)
    logger.info("BreakPilot vast.ai Profile Extractor")
    logger.info(banner)

    # Configuration checks.
    # NOTE(review): API_KEY defaults to 'dev-key' above, so this branch is
    # unreachable unless that default is removed — confirm intent.
    if not API_KEY:
        logger.error("BREAKPILOT_API_KEY nicht gesetzt!")
        sys.exit(1)

    if not OPENAI_API_KEY:
        logger.warning("OPENAI_API_KEY nicht gesetzt - nutze nur BeautifulSoup-Extraktion")

    # Connectivity probe before starting the loop.
    # NOTE(review): this hits /v1/health while every other endpoint uses the
    # /api/v1/... prefix — verify the health route's actual path.
    try:
        health = requests.get(
            f"{API_URL}/v1/health",
            headers={"Authorization": f"Bearer {API_KEY}"},
            timeout=10,
        )
        logger.info(f"API-Verbindung OK: {health.status_code}")
    except Exception as e:
        logger.error(f"Kann API nicht erreichen: {e}")
        logger.error(f"Stelle sicher dass {API_URL} erreichbar ist!")
        sys.exit(1)

    # Run until interrupted; Ctrl-C exits cleanly, anything else exits 1.
    try:
        process_profiles()
    except KeyboardInterrupt:
        logger.info("Beendet durch Benutzer")
    except Exception as e:
        logger.error(f"Unerwarteter Fehler: {e}")
        sys.exit(1)
|
|
|
|
|
|
# Run only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|