feat: edu-search-service migriert, voice-service/geo-service entfernt
All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Successful in 1m45s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 21s
All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Successful in 1m45s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 21s
- edu-search-service von breakpilot-pwa nach breakpilot-lehrer kopiert (ohne vendor) - opensearch + edu-search-service in docker-compose.yml hinzugefuegt - voice-service aus docker-compose.yml entfernt (jetzt in breakpilot-core) - geo-service aus docker-compose.yml entfernt (nicht mehr benoetigt) - CI/CD: edu-search-service zu Gitea Actions und Woodpecker hinzugefuegt (Go lint, test mit go mod download, build, SBOM) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
320
edu-search-service/scripts/vast_ai_extractor.py
Normal file
320
edu-search-service/scripts/vast_ai_extractor.py
Normal file
@@ -0,0 +1,320 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
vast.ai Profile Extractor Script
|
||||
Dieses Skript läuft auf vast.ai und extrahiert Profildaten von Universitäts-Webseiten.
|
||||
|
||||
Verwendung auf vast.ai:
|
||||
1. Lade dieses Skript auf deine vast.ai Instanz
|
||||
2. Installiere Abhängigkeiten: pip install requests beautifulsoup4 openai
|
||||
3. Setze Umgebungsvariablen:
|
||||
- BREAKPILOT_API_URL=http://deine-ip:8086
|
||||
- BREAKPILOT_API_KEY=dev-key
|
||||
- OPENAI_API_KEY=sk-...
|
||||
4. Starte: python vast_ai_extractor.py
|
||||
"""
|
||||
|
||||
import json
import logging
import os
import re
import sys
import time
from typing import Optional, Dict, Any, List
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
|
||||
|
||||
# Logging: timestamped, level-tagged records at INFO and above.
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT)
logger = logging.getLogger(__name__)

# Runtime configuration; each value can be overridden via the environment.
API_URL = os.getenv('BREAKPILOT_API_URL', 'http://localhost:8086')
# NOTE(review): the fallback 'dev-key' means API_KEY is only ever empty when
# the variable is explicitly set to an empty string.
API_KEY = os.getenv('BREAKPILOT_API_KEY', 'dev-key')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY', '')

# Number of pending profiles requested per polling round.
BATCH_SIZE = 10
# Pause in seconds between two processed profiles (respect rate limits).
SLEEP_BETWEEN_REQUESTS = 1
|
||||
|
||||
|
||||
def fetch_pending_profiles(limit: int = 50) -> List[Dict]:
    """Fetch the profiles that still have to be extracted.

    Sends a GET request to ``{API_URL}/api/v1/ai/extraction/pending`` with
    ``limit`` as query parameter and the API key as bearer token.

    Args:
        limit: Maximum number of tasks to request from the API.

    Returns:
        The task dictionaries found under the ``"tasks"`` key of the JSON
        response. An empty list is returned when the request fails, the
        body is not valid JSON, or the payload does not contain a list of
        tasks -- this function never returns ``None``.
    """
    try:
        response = requests.get(
            f"{API_URL}/api/v1/ai/extraction/pending",
            params={"limit": limit},
            headers={"Authorization": f"Bearer {API_KEY}"},
            timeout=30,
        )
        response.raise_for_status()
        payload = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers JSON decoding errors on older requests versions.
        logger.error(f"Fehler beim Abrufen der Profile: {e}")
        return []

    tasks = payload.get("tasks") if isinstance(payload, dict) else None
    if tasks is None:
        # A missing or null "tasks" field is treated as "nothing to do".
        # NOTE(review): presumably the backend may serialize an empty list
        # as null -- confirm against the API implementation.
        if not isinstance(payload, dict):
            logger.error("Unerwartetes Antwortformat beim Abrufen der Profile")
        return []
    if not isinstance(tasks, list):
        logger.error("Unerwartetes Antwortformat beim Abrufen der Profile")
        return []

    # Drop malformed entries so callers can rely on dict access.
    return [task for task in tasks if isinstance(task, dict)]
|
||||
|
||||
|
||||
def fetch_profile_page(url: str) -> Optional[str]:
    """Download the HTML content of a profile page.

    Args:
        url: Address of the profile page to load.

    Returns:
        The decoded response body, or ``None`` when the request fails or the
        server answers with an HTTP error status.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (compatible; BreakPilot-Crawler/1.0; +https://breakpilot.de)',
        'Accept': 'text/html,application/xhtml+xml',
        'Accept-Language': 'de-DE,de;q=0.9,en;q=0.8',
    }
    try:
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
    except (requests.RequestException, ValueError) as e:
        logger.error(f"Fehler beim Laden von {url}: {e}")
        return None

    # When the server sends no charset, requests decodes text/* bodies as
    # ISO-8859-1, which garbles umlauts on UTF-8 pages. Fall back to the
    # encoding detected from the body in that case.
    content_type = response.headers.get('Content-Type', '')
    if 'charset' not in content_type.lower():
        response.encoding = response.apparent_encoding

    return response.text
|
||||
|
||||
|
||||
# ORCID iDs consist of four hyphen-separated groups of four characters; the
# last character is a checksum that may be "X".
_ORCID_ID_RE = re.compile(r'\d{4}-\d{4}-\d{4}-\d{3}[\dXx]')

# Lower-case keywords in a link text that hint at an organisational unit.
_DEPARTMENT_KEYWORDS = ('institut', 'fakultät', 'fachbereich', 'abteilung', 'lehrstuhl')


def _hrefs_matching(soup, predicate) -> List[str]:
    """Return the href of every <a> whose href satisfies *predicate*, in document order."""
    return [a['href'] for a in soup.find_all('a', href=True) if predicate(a['href'])]


def extract_with_beautifulsoup(html: str, url: str) -> Dict[str, Any]:
    """Extract basic contact information with BeautifulSoup (no AI involved).

    Only hyperlinks are inspected. Depending on what the page contains, the
    result may hold the keys ``email``, ``phone``, ``orcid``,
    ``google_scholar_id``, ``researchgate_url``, ``linkedin_url``,
    ``department_url`` and ``department_name``; keys without a match are
    omitted.

    Args:
        html: Markup of the profile page.
        url: Address the markup was loaded from; used to resolve relative
            department links.

    Returns:
        A dictionary with the fields that could be found (possibly empty).
    """
    soup = BeautifulSoup(html, 'html.parser')
    data: Dict[str, Any] = {}

    # E-mail: first mailto link with a non-empty address (query part removed).
    for href in _hrefs_matching(soup, lambda h: h.lower().startswith('mailto:')):
        address = href[len('mailto:'):].split('?')[0].strip()
        if address:
            data['email'] = address
            break

    # Phone: first tel link with a non-empty number.
    for href in _hrefs_matching(soup, lambda h: h.lower().startswith('tel:')):
        number = href[len('tel:'):].strip()
        if number:
            data['phone'] = number
            break

    # ORCID: accept only links that actually contain a well-formed iD, so
    # generic links to orcid.org or URLs with a trailing slash are skipped.
    for href in _hrefs_matching(soup, lambda h: 'orcid.org' in h):
        match = _ORCID_ID_RE.search(href)
        if match:
            data['orcid'] = match.group(0).upper()
            break

    # Google Scholar: the profile id is carried in the "user" query parameter.
    for href in _hrefs_matching(soup, lambda h: 'scholar.google' in h):
        if 'user=' in href:
            scholar_id = href.split('user=')[1].split('&')[0].split('#')[0]
            if scholar_id:
                data['google_scholar_id'] = scholar_id
                break

    # ResearchGate / LinkedIn: keep the first matching link as-is.
    rg_links = _hrefs_matching(soup, lambda h: 'researchgate.net' in h)
    if rg_links:
        data['researchgate_url'] = rg_links[0]

    linkedin_links = _hrefs_matching(soup, lambda h: 'linkedin.com' in h)
    if linkedin_links:
        data['linkedin_url'] = linkedin_links[0]

    # Institute/department link (used for hierarchy detection): take the
    # first link whose text mentions an organisational unit.
    for link in soup.find_all('a', href=True):
        href = link['href'].strip()
        # Skip empty and in-page anchors; they do not identify a department.
        if not href or href.startswith('#'):
            continue
        text = link.get_text(strip=True)
        if not any(kw in text.lower() for kw in _DEPARTMENT_KEYWORDS):
            continue
        # urljoin resolves relative and protocol-relative links correctly.
        absolute = urljoin(url, href)
        if absolute.startswith(('http://', 'https://')):
            data['department_url'] = absolute
            data['department_name'] = text
            break

    return data
|
||||
|
||||
|
||||
# Fields for which a link parsed from the markup always wins over AI output.
_LINK_KEYS = ('orcid', 'google_scholar_id', 'researchgate_url', 'linkedin_url')


def _parse_json_object(raw: str) -> Dict[str, Any]:
    """Parse the JSON object embedded in *raw*, ignoring fences or prose around it."""
    start = raw.find('{')
    end = raw.rfind('}')
    if start == -1 or end < start:
        raise ValueError("Antwort enthält kein JSON-Objekt")
    return json.loads(raw[start:end + 1])


def extract_with_ai(html: str, url: str, full_name: str) -> Dict[str, Any]:
    """Extract structured profile data with OpenAI GPT.

    The page is reduced to plain text (at most 8000 characters) and sent to
    the chat completion API. The JSON answer is then combined with the
    link-based BeautifulSoup extraction: link fields (ORCID, Google Scholar,
    ResearchGate, LinkedIn) are taken from the markup, every other field
    comes from the AI and is filled from the markup only when the AI left
    it empty.

    Args:
        html: Markup of the profile page.
        url: Address the markup was loaded from.
        full_name: Name of the person the profile belongs to.

    Returns:
        The merged data. If no ``OPENAI_API_KEY`` is configured or the AI
        extraction fails for any reason, the BeautifulSoup result alone.
    """
    # Needed on every path (fallback and merge), so compute it once.
    bs_data = extract_with_beautifulsoup(html, url)

    if not OPENAI_API_KEY:
        logger.warning("Kein OPENAI_API_KEY gesetzt - nutze nur BeautifulSoup")
        return bs_data

    try:
        import openai
        client = openai.OpenAI(api_key=OPENAI_API_KEY)

        # Reduce the HTML to the relevant text.
        soup = BeautifulSoup(html, 'html.parser')

        # Remove scripts, styles and page chrome.
        for tag in soup(['script', 'style', 'nav', 'footer', 'header']):
            tag.decompose()

        text = soup.get_text(separator='\n', strip=True)
        # Limit to 8000 characters for the API.
        text = text[:8000]

        prompt = f"""Analysiere diese Universitäts-Profilseite für {full_name} und extrahiere folgende Informationen im JSON-Format:

{{
"email": "email@uni.de oder null",
"phone": "Telefonnummer oder null",
"office": "Raum/Büro oder null",
"position": "Position/Titel (z.B. Wissenschaftlicher Mitarbeiter, Professorin) oder null",
"department_name": "Name des Instituts/der Abteilung oder null",
"research_interests": ["Liste", "der", "Forschungsthemen"] oder [],
"teaching_topics": ["Liste", "der", "Lehrveranstaltungen/Fächer"] oder [],
"supervisor_name": "Name des Vorgesetzten/Lehrstuhlinhabers falls erkennbar oder null"
}}

Profilseite von {url}:

{text}

Antworte NUR mit dem JSON-Objekt, keine Erklärungen."""

        response = client.chat.completions.create(
            model="gpt-4o-mini",  # inexpensive and fast
            messages=[{"role": "user", "content": prompt}],
            temperature=0.1,
            max_tokens=500
        )

        # The content may be None; an empty string makes the parser raise
        # and triggers the fallback below.
        ai_data = _parse_json_object(response.choices[0].message.content or '')
    except Exception as e:
        # Deliberate best-effort boundary: a missing openai package, API
        # errors and unparsable answers all fall back to the markup result.
        logger.error(f"AI-Extraktion fehlgeschlagen: {e}")
        return bs_data

    for key, value in bs_data.items():
        if not value:
            continue
        if key in _LINK_KEYS:
            # Links read from the markup are authoritative.
            ai_data[key] = value
        elif ai_data.get(key) in (None, '', 'null'):
            # AI data has priority, but do not throw away what the markup
            # provided when the model returned nothing for the field.
            ai_data[key] = value

    return ai_data
|
||||
|
||||
|
||||
def submit_extracted_data(staff_id: str, data: Dict[str, Any]) -> bool:
    """Send extracted data back to BreakPilot.

    Posts the fields of ``data`` together with ``staff_id`` as JSON to
    ``{API_URL}/api/v1/ai/extraction/submit``. Fields whose value is ``None``
    are left out.

    Args:
        staff_id: Identifier of the staff record the data belongs to.
        data: Extracted fields.

    Returns:
        ``True`` when the API accepted the data, ``False`` when the request
        failed or the payload could not be serialized.
    """
    # Remove None values.
    payload = {k: v for k, v in data.items() if v is not None}
    # Set the id last: extracted data may come from model output and must
    # never be able to redirect the submission to a different record.
    payload["staff_id"] = staff_id

    try:
        response = requests.post(
            f"{API_URL}/api/v1/ai/extraction/submit",
            json=payload,
            headers={
                "Authorization": f"Bearer {API_KEY}",
                "Content-Type": "application/json"
            },
            timeout=30
        )
        response.raise_for_status()
    except (requests.RequestException, TypeError, ValueError) as e:
        # TypeError/ValueError cover payloads that cannot be JSON-encoded.
        logger.error(f"Fehler beim Senden der Daten für {staff_id}: {e}")
        return False
    return True
|
||||
|
||||
|
||||
def _process_single_profile(profile: Dict[str, Any]) -> bool:
    """Download, extract and submit one pending profile; return True on success."""
    try:
        staff_id = profile['staff_id']
        url = profile['profile_url']
    except (KeyError, TypeError):
        # A malformed task must not take down the whole worker.
        logger.error(f"Ungültiger Task ohne staff_id/profile_url: {profile!r}")
        return False
    full_name = profile.get('full_name', 'Unbekannt')

    logger.info(f"Verarbeite: {full_name} - {url}")

    # Load the profile page.
    html = fetch_profile_page(url)
    if not html:
        return False

    # Extract the data; an empty result counts as an error.
    extracted = extract_with_ai(html, url, full_name)
    if not extracted:
        return False

    # Send the result back.
    if not submit_extracted_data(staff_id, extracted):
        return False

    logger.info(f"Erfolgreich: {full_name} - Email: {extracted.get('email', 'N/A')}")
    return True


def process_profiles() -> None:
    """Main loop: fetch pending profiles, extract data, send it back.

    Runs until interrupted. Each round requests up to ``BATCH_SIZE`` tasks;
    when none are pending the loop waits 60 seconds before asking again.
    After every profile -- successful or not -- the loop pauses for
    ``SLEEP_BETWEEN_REQUESTS`` seconds.
    """
    logger.info(f"Starte Extraktion - API: {API_URL}")

    processed = 0
    errors = 0

    while True:
        # Fetch new profiles.
        profiles = fetch_pending_profiles(limit=BATCH_SIZE)

        if not profiles:
            logger.info("Keine weiteren Profile zum Verarbeiten. Warte 60 Sekunden...")
            time.sleep(60)
            continue

        logger.info(f"Verarbeite {len(profiles)} Profile...")

        for profile in profiles:
            if _process_single_profile(profile):
                processed += 1
            else:
                errors += 1

            # Rate limiting applies to failures as well: a failed download
            # still hit the remote server.
            time.sleep(SLEEP_BETWEEN_REQUESTS)

        logger.info(f"Batch abgeschlossen. Gesamt: {processed} erfolgreich, {errors} Fehler")
|
||||
|
||||
|
||||
def main():
    """Entry point: check the configuration, test the API, start processing.

    Exits with status 1 when no API key is configured, when the API cannot
    be reached, or when processing stops with an unexpected error. A
    keyboard interrupt ends the program without an error status.
    """
    logger.info("=" * 60)
    logger.info("BreakPilot vast.ai Profile Extractor")
    logger.info("=" * 60)

    # Check the configuration.
    if not API_KEY:
        logger.error("BREAKPILOT_API_KEY nicht gesetzt!")
        sys.exit(1)

    if not OPENAI_API_KEY:
        logger.warning("OPENAI_API_KEY nicht gesetzt - nutze nur BeautifulSoup-Extraktion")

    # Test the connection.
    # NOTE(review): the health check uses "/v1/health" while the other
    # endpoints live under "/api/v1/..." -- confirm the path is intended.
    try:
        response = requests.get(
            f"{API_URL}/v1/health",
            headers={"Authorization": f"Bearer {API_KEY}"},
            timeout=10
        )
    except requests.RequestException as e:
        logger.error(f"Kann API nicht erreichen: {e}")
        logger.error(f"Stelle sicher dass {API_URL} erreichbar ist!")
        sys.exit(1)

    if response.ok:
        logger.info(f"API-Verbindung OK: {response.status_code}")
    else:
        # The server answered, so keep going, but do not report an error
        # status (e.g. 401 or 500) as a healthy connection.
        logger.warning(
            f"API erreichbar, aber Health-Check lieferte Status {response.status_code}"
        )

    # Start processing.
    try:
        process_profiles()
    except KeyboardInterrupt:
        logger.info("Beendet durch Benutzer")
    except Exception as e:
        # logger.exception keeps the traceback for diagnosing the crash.
        logger.exception(f"Unerwarteter Fehler: {e}")
        sys.exit(1)
|
||||
|
||||
|
||||
# Run the extractor only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user