feat: edu-search-service migriert, voice-service/geo-service entfernt
All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Successful in 1m45s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 21s

- edu-search-service von breakpilot-pwa nach breakpilot-lehrer kopiert (ohne vendor)
- opensearch + edu-search-service in docker-compose.yml hinzugefuegt
- voice-service aus docker-compose.yml entfernt (jetzt in breakpilot-core)
- geo-service aus docker-compose.yml entfernt (nicht mehr benoetigt)
- CI/CD: edu-search-service zu Gitea Actions und Woodpecker hinzugefuegt
  (Go lint, test mit go mod download, build, SBOM)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Boenisch
2026-02-15 18:36:38 +01:00
parent d4e1d6bab6
commit 414e0f5ec0
73 changed files with 23938 additions and 92 deletions

View File

@@ -0,0 +1,282 @@
#!/usr/bin/env python3
"""
Add all major German universities to the edu-search-service database.
Based on HRK (Hochschulrektorenkonferenz) list.
"""
import requests
import json
import time
import sys
# Base URL of the edu-search-service REST API; every request in this script is
# built from it.
# NOTE(review): "macmini" looks like a host name on a local network — confirm
# before running the script from another machine.
API_BASE = "https://macmini:8089/api/v1"
# German Universities - categorized
GERMAN_UNIVERSITIES = [
# === UNIVERSITIES (Universitäten) ===
# Already in DB (skip or update)
# {"name": "TUM", "url": "https://www.tum.de", "type": "university"},
# {"name": "LMU München", "url": "https://www.lmu.de", "type": "university"},
# {"name": "UOL", "url": "https://uol.de", "type": "university"},
# {"name": "KIT Karlsruhe", "url": "https://www.kit.edu", "type": "university"},
# TU9 Universities
{"name": "TU Dresden", "url": "https://tu-dresden.de", "type": "university"},
{"name": "TU Braunschweig", "url": "https://www.tu-braunschweig.de", "type": "university"},
{"name": "TU Darmstadt", "url": "https://www.tu-darmstadt.de", "type": "university"},
{"name": "Leibniz Universität Hannover", "url": "https://www.uni-hannover.de", "type": "university"},
{"name": "Universität Stuttgart", "url": "https://www.uni-stuttgart.de", "type": "university"},
# Excellence Universities
{"name": "Universität Bonn", "url": "https://www.uni-bonn.de", "type": "university"},
{"name": "Universität Konstanz", "url": "https://www.uni-konstanz.de", "type": "university"},
{"name": "Universität Tübingen", "url": "https://uni-tuebingen.de", "type": "university"},
{"name": "Universität Freiburg", "url": "https://www.uni-freiburg.de", "type": "university"},
# Large State Universities
{"name": "Universität Münster", "url": "https://www.uni-muenster.de", "type": "university"},
{"name": "Universität Frankfurt", "url": "https://www.uni-frankfurt.de", "type": "university"},
{"name": "Universität Mainz", "url": "https://www.uni-mainz.de", "type": "university"},
{"name": "Universität Würzburg", "url": "https://www.uni-wuerzburg.de", "type": "university"},
{"name": "Universität Erlangen-Nürnberg", "url": "https://www.fau.de", "type": "university"},
{"name": "Universität Leipzig", "url": "https://www.uni-leipzig.de", "type": "university"},
{"name": "Universität Jena", "url": "https://www.uni-jena.de", "type": "university"},
{"name": "Universität Halle", "url": "https://www.uni-halle.de", "type": "university"},
{"name": "Universität Rostock", "url": "https://www.uni-rostock.de", "type": "university"},
{"name": "Universität Greifswald", "url": "https://www.uni-greifswald.de", "type": "university"},
{"name": "Universität Kiel", "url": "https://www.uni-kiel.de", "type": "university"},
{"name": "Universität Bremen", "url": "https://www.uni-bremen.de", "type": "university"},
{"name": "Universität Bielefeld", "url": "https://www.uni-bielefeld.de", "type": "university"},
{"name": "Universität Duisburg-Essen", "url": "https://www.uni-due.de", "type": "university"},
{"name": "Universität Dortmund", "url": "https://www.tu-dortmund.de", "type": "university"},
{"name": "Universität Bochum", "url": "https://www.ruhr-uni-bochum.de", "type": "university"},
{"name": "Universität Düsseldorf", "url": "https://www.hhu.de", "type": "university"},
{"name": "Universität Wuppertal", "url": "https://www.uni-wuppertal.de", "type": "university"},
{"name": "Universität Siegen", "url": "https://www.uni-siegen.de", "type": "university"},
{"name": "Universität Paderborn", "url": "https://www.uni-paderborn.de", "type": "university"},
{"name": "Universität Kassel", "url": "https://www.uni-kassel.de", "type": "university"},
{"name": "Universität Marburg", "url": "https://www.uni-marburg.de", "type": "university"},
{"name": "Universität Gießen", "url": "https://www.uni-giessen.de", "type": "university"},
{"name": "Universität Saarbrücken", "url": "https://www.uni-saarland.de", "type": "university"},
{"name": "Universität Trier", "url": "https://www.uni-trier.de", "type": "university"},
{"name": "Universität Koblenz", "url": "https://www.uni-koblenz.de", "type": "university"},
{"name": "Universität Landau", "url": "https://rptu.de", "type": "university"},
{"name": "Universität Mannheim", "url": "https://www.uni-mannheim.de", "type": "university"},
{"name": "Universität Ulm", "url": "https://www.uni-ulm.de", "type": "university"},
{"name": "Universität Hohenheim", "url": "https://www.uni-hohenheim.de", "type": "university"},
{"name": "Universität Regensburg", "url": "https://www.uni-regensburg.de", "type": "university"},
{"name": "Universität Passau", "url": "https://www.uni-passau.de", "type": "university"},
{"name": "Universität Bayreuth", "url": "https://www.uni-bayreuth.de", "type": "university"},
{"name": "Universität Bamberg", "url": "https://www.uni-bamberg.de", "type": "university"},
{"name": "Universität Augsburg", "url": "https://www.uni-augsburg.de", "type": "university"},
{"name": "Universität Potsdam", "url": "https://www.uni-potsdam.de", "type": "university"},
{"name": "Universität Magdeburg", "url": "https://www.ovgu.de", "type": "university"},
{"name": "TU Chemnitz", "url": "https://www.tu-chemnitz.de", "type": "university"},
{"name": "TU Ilmenau", "url": "https://www.tu-ilmenau.de", "type": "university"},
{"name": "TU Freiberg", "url": "https://tu-freiberg.de", "type": "university"},
{"name": "TU Clausthal", "url": "https://www.tu-clausthal.de", "type": "university"},
{"name": "TU Kaiserslautern", "url": "https://rptu.de", "type": "university"},
{"name": "BTU Cottbus-Senftenberg", "url": "https://www.b-tu.de", "type": "university"},
{"name": "Universität der Bundeswehr München", "url": "https://www.unibw.de", "type": "university"},
{"name": "Universität der Bundeswehr Hamburg", "url": "https://www.hsu-hh.de", "type": "university"},
# === FACHHOCHSCHULEN / HAW ===
{"name": "HAW Hamburg", "url": "https://www.haw-hamburg.de", "type": "haw"},
{"name": "HTW Berlin", "url": "https://www.htw-berlin.de", "type": "haw"},
{"name": "Beuth Hochschule Berlin", "url": "https://www.bht-berlin.de", "type": "haw"},
{"name": "FH Aachen", "url": "https://www.fh-aachen.de", "type": "haw"},
{"name": "TH Köln", "url": "https://www.th-koeln.de", "type": "haw"},
{"name": "Hochschule Düsseldorf", "url": "https://www.hs-duesseldorf.de", "type": "haw"},
{"name": "FH Dortmund", "url": "https://www.fh-dortmund.de", "type": "haw"},
{"name": "Hochschule Bochum", "url": "https://www.hochschule-bochum.de", "type": "haw"},
{"name": "Westfälische Hochschule", "url": "https://www.w-hs.de", "type": "haw"},
{"name": "FH Bielefeld", "url": "https://www.fh-bielefeld.de", "type": "haw"},
{"name": "FH Münster", "url": "https://www.fh-muenster.de", "type": "haw"},
{"name": "Hochschule Osnabrück", "url": "https://www.hs-osnabrueck.de", "type": "haw"},
{"name": "Hochschule Bremen", "url": "https://www.hs-bremen.de", "type": "haw"},
{"name": "Hochschule Hannover", "url": "https://www.hs-hannover.de", "type": "haw"},
{"name": "Ostfalia Hochschule", "url": "https://www.ostfalia.de", "type": "haw"},
{"name": "Hochschule Emden/Leer", "url": "https://www.hs-emden-leer.de", "type": "haw"},
{"name": "HAWK Hildesheim", "url": "https://www.hawk.de", "type": "haw"},
{"name": "Hochschule Fulda", "url": "https://www.hs-fulda.de", "type": "haw"},
{"name": "Frankfurt UAS", "url": "https://www.frankfurt-university.de", "type": "haw"},
{"name": "Hochschule Darmstadt", "url": "https://www.h-da.de", "type": "haw"},
{"name": "Hochschule RheinMain", "url": "https://www.hs-rm.de", "type": "haw"},
{"name": "Hochschule Mainz", "url": "https://www.hs-mainz.de", "type": "haw"},
{"name": "Hochschule Trier", "url": "https://www.hochschule-trier.de", "type": "haw"},
{"name": "Hochschule Koblenz", "url": "https://www.hs-koblenz.de", "type": "haw"},
{"name": "Hochschule Karlsruhe", "url": "https://www.h-ka.de", "type": "haw"},
{"name": "Hochschule Mannheim", "url": "https://www.hs-mannheim.de", "type": "haw"},
{"name": "Hochschule Heilbronn", "url": "https://www.hs-heilbronn.de", "type": "haw"},
{"name": "Hochschule Esslingen", "url": "https://www.hs-esslingen.de", "type": "haw"},
{"name": "Hochschule Reutlingen", "url": "https://www.reutlingen-university.de", "type": "haw"},
{"name": "Hochschule Konstanz", "url": "https://www.htwg-konstanz.de", "type": "haw"},
{"name": "Hochschule Offenburg", "url": "https://www.hs-offenburg.de", "type": "haw"},
{"name": "Hochschule Pforzheim", "url": "https://www.hs-pforzheim.de", "type": "haw"},
{"name": "Hochschule Albstadt-Sigmaringen", "url": "https://www.hs-albsig.de", "type": "haw"},
{"name": "Hochschule München", "url": "https://www.hm.edu", "type": "haw"},
{"name": "TH Nürnberg", "url": "https://www.th-nuernberg.de", "type": "haw"},
{"name": "TH Ingolstadt", "url": "https://www.thi.de", "type": "haw"},
{"name": "Hochschule Augsburg", "url": "https://www.hs-augsburg.de", "type": "haw"},
{"name": "Hochschule Rosenheim", "url": "https://www.th-rosenheim.de", "type": "haw"},
{"name": "Hochschule Regensburg", "url": "https://www.oth-regensburg.de", "type": "haw"},
{"name": "Hochschule Landshut", "url": "https://www.haw-landshut.de", "type": "haw"},
{"name": "Hochschule Coburg", "url": "https://www.hs-coburg.de", "type": "haw"},
{"name": "Hochschule Hof", "url": "https://www.hof-university.de", "type": "haw"},
{"name": "Hochschule Würzburg-Schweinfurt", "url": "https://www.thws.de", "type": "haw"},
{"name": "Hochschule Aschaffenburg", "url": "https://www.th-ab.de", "type": "haw"},
{"name": "Hochschule Ansbach", "url": "https://www.hs-ansbach.de", "type": "haw"},
{"name": "OTH Amberg-Weiden", "url": "https://www.oth-aw.de", "type": "haw"},
{"name": "Hochschule Deggendorf", "url": "https://www.th-deg.de", "type": "haw"},
{"name": "Hochschule Kempten", "url": "https://www.hs-kempten.de", "type": "haw"},
{"name": "Hochschule Neu-Ulm", "url": "https://www.hnu.de", "type": "haw"},
{"name": "HTW Dresden", "url": "https://www.htw-dresden.de", "type": "haw"},
{"name": "HTWK Leipzig", "url": "https://www.htwk-leipzig.de", "type": "haw"},
{"name": "Hochschule Mittweida", "url": "https://www.hs-mittweida.de", "type": "haw"},
{"name": "Hochschule Zittau/Görlitz", "url": "https://www.hszg.de", "type": "haw"},
{"name": "Westsächsische Hochschule Zwickau", "url": "https://www.fh-zwickau.de", "type": "haw"},
{"name": "Hochschule Merseburg", "url": "https://www.hs-merseburg.de", "type": "haw"},
{"name": "Hochschule Anhalt", "url": "https://www.hs-anhalt.de", "type": "haw"},
{"name": "Hochschule Magdeburg-Stendal", "url": "https://www.h2.de", "type": "haw"},
{"name": "Hochschule Harz", "url": "https://www.hs-harz.de", "type": "haw"},
{"name": "Ernst-Abbe-Hochschule Jena", "url": "https://www.eah-jena.de", "type": "haw"},
{"name": "FH Erfurt", "url": "https://www.fh-erfurt.de", "type": "haw"},
{"name": "Hochschule Nordhausen", "url": "https://www.hs-nordhausen.de", "type": "haw"},
{"name": "Hochschule Schmalkalden", "url": "https://www.hs-schmalkalden.de", "type": "haw"},
{"name": "TH Brandenburg", "url": "https://www.th-brandenburg.de", "type": "haw"},
{"name": "FH Potsdam", "url": "https://www.fh-potsdam.de", "type": "haw"},
{"name": "TH Wildau", "url": "https://www.th-wildau.de", "type": "haw"},
{"name": "Hochschule Neubrandenburg", "url": "https://www.hs-nb.de", "type": "haw"},
{"name": "Hochschule Stralsund", "url": "https://www.hochschule-stralsund.de", "type": "haw"},
{"name": "Hochschule Wismar", "url": "https://www.hs-wismar.de", "type": "haw"},
{"name": "FH Kiel", "url": "https://www.fh-kiel.de", "type": "haw"},
{"name": "FH Westküste", "url": "https://www.fh-westkueste.de", "type": "haw"},
{"name": "TH Lübeck", "url": "https://www.th-luebeck.de", "type": "haw"},
{"name": "FH Flensburg", "url": "https://hs-flensburg.de", "type": "haw"},
{"name": "Hochschule Bremerhaven", "url": "https://www.hs-bremerhaven.de", "type": "haw"},
# === PRIVATE HOCHSCHULEN ===
{"name": "WHU Vallendar", "url": "https://www.whu.edu", "type": "private"},
{"name": "HHL Leipzig", "url": "https://www.hhl.de", "type": "private"},
{"name": "EBS Universität", "url": "https://www.ebs.edu", "type": "private"},
{"name": "Frankfurt School", "url": "https://www.frankfurt-school.de", "type": "private"},
{"name": "ESMT Berlin", "url": "https://esmt.berlin", "type": "private"},
{"name": "Jacobs University Bremen", "url": "https://www.jacobs-university.de", "type": "private"},
{"name": "Zeppelin Universität", "url": "https://www.zu.de", "type": "private"},
{"name": "Bucerius Law School", "url": "https://www.law-school.de", "type": "private"},
{"name": "Universität Witten/Herdecke", "url": "https://www.uni-wh.de", "type": "private"},
{"name": "IUBH", "url": "https://www.iu.de", "type": "private"},
{"name": "SRH Hochschule Heidelberg", "url": "https://www.srh-hochschule-heidelberg.de", "type": "private"},
{"name": "FOM Hochschule", "url": "https://www.fom.de", "type": "private"},
# === FRAUNHOFER INSTITUTE ===
{"name": "Fraunhofer IIS", "url": "https://www.iis.fraunhofer.de", "type": "research"},
{"name": "Fraunhofer IAIS", "url": "https://www.iais.fraunhofer.de", "type": "research"},
{"name": "Fraunhofer IML", "url": "https://www.iml.fraunhofer.de", "type": "research"},
{"name": "Fraunhofer ISI", "url": "https://www.isi.fraunhofer.de", "type": "research"},
{"name": "Fraunhofer IPA", "url": "https://www.ipa.fraunhofer.de", "type": "research"},
{"name": "Fraunhofer IAO", "url": "https://www.iao.fraunhofer.de", "type": "research"},
{"name": "Fraunhofer IWS", "url": "https://www.iws.fraunhofer.de", "type": "research"},
{"name": "Fraunhofer IPT", "url": "https://www.ipt.fraunhofer.de", "type": "research"},
{"name": "Fraunhofer FOKUS", "url": "https://www.fokus.fraunhofer.de", "type": "research"},
{"name": "Fraunhofer HHI", "url": "https://www.hhi.fraunhofer.de", "type": "research"},
{"name": "Fraunhofer IESE", "url": "https://www.iese.fraunhofer.de", "type": "research"},
{"name": "Fraunhofer IOSB", "url": "https://www.iosb.fraunhofer.de", "type": "research"},
{"name": "Fraunhofer IDMT", "url": "https://www.idmt.fraunhofer.de", "type": "research"},
{"name": "Fraunhofer IKTS", "url": "https://www.ikts.fraunhofer.de", "type": "research"},
{"name": "Fraunhofer IGD", "url": "https://www.igd.fraunhofer.de", "type": "research"},
# === MAX-PLANCK-INSTITUTE ===
{"name": "MPI für Informatik", "url": "https://www.mpi-inf.mpg.de", "type": "research"},
{"name": "MPI für Software Systeme", "url": "https://www.mpi-sws.org", "type": "research"},
{"name": "MPI für intelligente Systeme", "url": "https://is.mpg.de", "type": "research"},
{"name": "MPI für Mathematik", "url": "https://www.mpim-bonn.mpg.de", "type": "research"},
{"name": "MPI für Physik", "url": "https://www.mpp.mpg.de", "type": "research"},
{"name": "MPI für Quantenoptik", "url": "https://www.mpq.mpg.de", "type": "research"},
{"name": "MPI für Biophysik", "url": "https://www.biophys.mpg.de", "type": "research"},
{"name": "MPI für Biochemie", "url": "https://www.biochem.mpg.de", "type": "research"},
{"name": "MPI für Neurobiologie", "url": "https://www.neuro.mpg.de", "type": "research"},
{"name": "MPI für Hirnforschung", "url": "https://brain.mpg.de", "type": "research"},
# === HELMHOLTZ-ZENTREN ===
{"name": "DESY Hamburg", "url": "https://www.desy.de", "type": "research"},
{"name": "FZ Jülich", "url": "https://www.fz-juelich.de", "type": "research"},
{"name": "GSI Darmstadt", "url": "https://www.gsi.de", "type": "research"},
{"name": "DKFZ Heidelberg", "url": "https://www.dkfz.de", "type": "research"},
{"name": "DLR", "url": "https://www.dlr.de", "type": "research"},
{"name": "AWI Bremerhaven", "url": "https://www.awi.de", "type": "research"},
{"name": "GFZ Potsdam", "url": "https://www.gfz-potsdam.de", "type": "research"},
{"name": "UFZ Leipzig", "url": "https://www.ufz.de", "type": "research"},
{"name": "GEOMAR Kiel", "url": "https://www.geomar.de", "type": "research"},
]
def get_existing_universities():
    """Fetch the universities that are already registered with the API.

    Returns:
        dict: Maps each university's normalised URL (trailing slashes
        removed, lower-cased) to the record returned by the API.  The dict
        is empty when the request fails, the API answers with a status other
        than 200, or the payload has no usable ``universities`` list.
        Records without a string ``url`` are skipped instead of discarding
        the whole result.
    """
    try:
        # verify=False: the script's entry point notes that the API uses a
        # self-signed certificate.
        response = requests.get(f"{API_BASE}/universities", verify=False, timeout=10)
    except requests.RequestException as e:
        print(f"Error fetching existing universities: {e}")
        return {}

    if response.status_code != 200:
        # Report the failure: an empty result makes the caller try to add
        # every entry again.
        print(f"Error fetching existing universities: HTTP {response.status_code}")
        return {}

    try:
        data = response.json()
    except ValueError as e:
        print(f"Error fetching existing universities: {e}")
        return {}

    records = data.get('universities') if isinstance(data, dict) else None
    existing = {}
    for record in records or []:
        url = record.get('url') if isinstance(record, dict) else None
        if isinstance(url, str):
            existing[url.rstrip('/').lower()] = record
    return existing
def add_university(uni):
    """POST one institution to the API.

    Args:
        uni: Dict with the keys "name" and "url"; the optional key "type"
            defaults to "university".

    Returns:
        bool: True when the API answers with status 200 or 201, False when
        the request fails or any other status is returned.  The reason for a
        failure is printed.
    """
    payload = {
        "name": uni["name"],
        "url": uni["url"],
        "type": uni.get("type", "university"),
        "country": "DE",
    }
    try:
        response = requests.post(
            f"{API_BASE}/universities",
            json=payload,
            verify=False,
            timeout=10,
        )
    except requests.RequestException as e:
        print(f"Error adding {uni['name']}: {e}")
        return False

    if response.status_code in (200, 201):
        return True
    # Say why the entry was counted as failed instead of failing silently.
    print(f"Error adding {uni['name']}: HTTP {response.status_code}")
    return False
def main():
    """Add every GERMAN_UNIVERSITIES entry the API does not know yet.

    Entries whose normalised URL (trailing slashes removed, lower-cased) is
    already present are skipped.  A URL added during this run is remembered
    as well, so a later list entry that shares it is skipped instead of
    being POSTed a second time.  A summary is printed at the end.
    """
    print("Fetching existing universities...")
    existing = get_existing_universities()
    print(f"Found {len(existing)} existing universities")

    added = 0
    skipped = 0
    failed = 0
    for uni in GERMAN_UNIVERSITIES:
        url_key = uni["url"].rstrip('/').lower()
        if url_key in existing:
            print(f"SKIP: {uni['name']} (already exists)")
            skipped += 1
            continue

        print(f"ADD: {uni['name']} ({uni['url']})")
        if add_university(uni):
            added += 1
            # The list contains entries that share one URL (two names point
            # at https://rptu.de); remember what was just added.
            existing[url_key] = uni
        else:
            failed += 1
        # Rate limiting
        time.sleep(0.2)

    print("\n=== SUMMARY ===")
    print(f"Added: {added}")
    print(f"Skipped: {skipped}")
    print(f"Failed: {failed}")
    print(f"Total: {len(GERMAN_UNIVERSITIES)}")
if __name__ == "__main__":
    # Disable SSL warnings for self-signed cert: the requests in this script
    # pass verify=False, which would otherwise emit an InsecureRequestWarning
    # on every call.
    import urllib3
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    main()

View File

@@ -0,0 +1,125 @@
#!/usr/bin/env python3
"""
Fix university types in the database.
This script updates uni_type based on university names.
"""
import requests
import json
import sys
# Base URL of the edu-search-service REST API queried by get_all_universities.
# NOTE(review): "macmini" looks like a host name on a local network — confirm
# before running the script from another machine.
API_BASE = "https://macmini:8089/api/v1"
# Classification rules based on name patterns.
# Patterns are compared case-insensitively as substrings of the name.  Some
# carry a trailing space (e.g. "TU "), so they only match when a space follows.
UNI_TYPE_RULES = {
    "UNI": [
        "Universität", "University", "TU ", "TUM", "LMU", "RWTH",
        "Humboldt", "FU Berlin", "HU Berlin", "TH ", "KIT"
    ],
    "FH": [
        "Hochschule", "Fachhochschule", "FH ", "HAW ", "HS ",
        "University of Applied", "Beuth", "HTW"
    ],
    "RESEARCH": [
        "Fraunhofer", "Max-Planck", "Helmholtz", "DLR", "DESY",
        "DKFZ", "FZ Jülich", "AWI", "GFZ", "GSI", "Leibniz"
    ],
    "PRIVATE": [
        "EBS", "ESMT", "Bucerius", "WHU", "HHL", "FOM", "IUBH",
        "SRH", "International School", "Business School"
    ],
    "KUNST": [
        "Kunsthochschule", "Musikhochschule", "Filmhochschule",
        "Kunstakademie", "HfK", "HfM", "HfG", "UdK", "Bauhaus"
    ],
    "PH": [
        "Pädagogische Hochschule", "PH "
    ]
}


def classify_university(name):
    """Classify an institution by the name patterns in UNI_TYPE_RULES.

    Every pattern is tested as a case-insensitive substring of ``name``.
    When several patterns match, the longest one decides.  A plain
    first-match scan in dictionary order can never reach "Pädagogische
    Hochschule", "Kunsthochschule" or "University of Applied", because the
    generic "Hochschule" / "University" of an earlier category matches
    first.  On equal length the category listed first wins.

    Args:
        name: Institution name.

    Returns:
        str: A key of UNI_TYPE_RULES.  If no pattern matches, "UNI" when the
        name contains "universität" or "university", otherwise "FH".
    """
    name_lower = name.lower()

    best_type = None
    best_len = 0
    for uni_type, patterns in UNI_TYPE_RULES.items():
        for pattern in patterns:
            # Strictly longer only: ties keep the earlier category.
            if len(pattern) > best_len and pattern.lower() in name_lower:
                best_type = uni_type
                best_len = len(pattern)
    if best_type is not None:
        return best_type

    # Only reachable if the generic university patterns are removed from the
    # rules above; kept so the default stays sensible in that case.
    if "universität" in name_lower or "university" in name_lower:
        return "UNI"
    return "FH"  # Default
def get_all_universities():
    """Return the ``universities`` list reported by the API.

    An empty list is returned when the API answers with a status other than
    200, when the payload has no ``universities`` key, or when the request
    raises (the error is printed in that case).
    """
    try:
        reply = requests.get(f"{API_BASE}/universities", verify=False, timeout=30)
        if reply.status_code != 200:
            return []
        return reply.json().get('universities', [])
    except Exception as exc:
        print(f"Error fetching universities: {exc}")
        return []
def update_university_type(uni_id, uni_type, uni_state=None):
    """Placeholder for persisting a type change; performs no update itself.

    No request or database write happens here: ``uni_id`` and ``uni_state``
    are ignored and ``uni_type`` is handed back unchanged.
    """
    # There is no update endpoint in the API, so the changes are emitted as
    # SQL statements instead of being applied from this function.
    return uni_type
def main():
    """Classify every university from the API and write the fixing SQL file.

    For each record an ``UPDATE universities SET uni_type = ...`` statement
    is generated, wrapped in one BEGIN/COMMIT pair and written to
    ``/tmp/fix_uni_types.sql``.  Names whose current ``uni_type`` differs
    from the new classification are printed, followed by a count per type.
    Nothing is written when the API returns no universities.
    """
    print("=== University Type Fixer ===\n")
    # Disable SSL warnings
    import urllib3
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    universities = get_all_universities()
    if not universities:
        print("ERROR: No universities found!")
        return
    print(f"Found {len(universities)} universities\n")

    def sql_literal(value):
        """Quote ``value`` as an SQL string literal, doubling single quotes."""
        return "'" + str(value).replace("'", "''") + "'"

    # Classify and generate SQL
    sql_statements = []
    type_counts = {}
    for uni in universities:
        uni_id = uni['id']
        uni_name = uni['name']
        current_type = uni.get('uni_type', 'unknown')

        new_type = classify_university(uni_name)
        type_counts[new_type] = type_counts.get(new_type, 0) + 1

        # The id comes from the API response, so it is escaped rather than
        # pasted into the statement verbatim.
        sql_statements.append(
            f"UPDATE universities SET uni_type = {sql_literal(new_type)} "
            f"WHERE id = {sql_literal(uni_id)};"
        )
        if current_type != new_type:
            print(f" {uni_name[:50]:<50} -> {new_type}")

    print("\n=== Summary ===")
    for t, c in sorted(type_counts.items()):
        print(f" {t}: {c}")

    # Write SQL file
    sql_file = "/tmp/fix_uni_types.sql"
    with open(sql_file, 'w', encoding='utf-8') as f:
        f.write("-- Fix university types\n")
        f.write("BEGIN;\n\n")
        f.writelines(sql + "\n" for sql in sql_statements)
        f.write("\nCOMMIT;\n")
    print(f"\nSQL written to: {sql_file}")
    print(f"Run: cat {sql_file} | docker exec -i breakpilot-pwa-postgres psql -U <user> -d edu_search")
# Run the type fixer only when the file is executed as a script.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,147 @@
#!/usr/bin/env python3
"""
Seed German Universities directly into the edu-search-service universities table.
This script imports the same university data as load_university_seeds.py
but writes directly to the PostgreSQL universities table used by the crawler.
"""
import psycopg2
import os
import sys
# Add the backend scripts path to import university data
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '../../backend/scripts'))
from load_university_seeds import (
UNIVERSITAETEN, FACHHOCHSCHULEN, PAEDAGOGISCHE_HOCHSCHULEN,
KUNSTHOCHSCHULEN, PRIVATE_HOCHSCHULEN
)
# Database connection from environment or defaults.
# NOTE(review): the fallback URL embeds a user name and password — presumably
# local-development credentials; confirm, and set DATABASE_URL explicitly in
# any other environment.
DATABASE_URL = os.environ.get(
    'DATABASE_URL',
    'postgresql://breakpilot:breakpilot@localhost:5432/breakpilot_db'
)
def get_uni_type(original_type: str) -> str:
    """Translate a seed-data category into the database ``uni_type`` value.

    'FH' becomes 'HAW'; 'UNI', 'PH', 'KUNST' and 'PRIVATE' are kept as they
    are; every other value falls back to 'UNI'.
    """
    if original_type == 'FH':
        # Fachhochschule -> HAW (Hochschule für Angewandte Wissenschaften)
        return 'HAW'
    if original_type in ('PH', 'KUNST', 'PRIVATE'):
        return original_type
    return 'UNI'
def _derive_short_name(name):
    """Return a short display name derived from ``name``, or None if no rule applies."""
    # Try to extract common abbreviations
    if 'KIT' in name:
        return 'KIT'
    if 'TUM' in name or name == 'Technische Universität München':
        return 'TUM'
    if 'LMU' in name or 'Ludwig-Maximilians' in name:
        return 'LMU'
    if 'RWTH' in name:
        return 'RWTH'
    if 'FAU' in name or 'Friedrich-Alexander' in name:
        return 'FAU'
    # Generic prefixes: abbreviate the prefix and truncate the remainder.
    for prefix, abbreviation, max_len in (
        ('Universität ', 'Uni ', 15),
        ('Technische Universität ', 'TU ', 12),
        ('Hochschule ', 'HS ', 15),
    ):
        if name.startswith(prefix):
            return abbreviation + name.replace(prefix, '')[:max_len]
    return None


def seed_universities():
    """Insert all seed universities into the ``universities`` table.

    Rows whose ``url`` already exists are skipped (ON CONFLICT DO NOTHING).
    Each row runs inside its own savepoint: after a failed statement
    PostgreSQL rejects every further command in the transaction until a
    rollback, so without the savepoint one bad row would make all following
    inserts fail and nothing would be committed.  The connection is closed
    on every path.

    Returns:
        bool: True when the run was committed (individual row errors are
        reported but do not change the result), False on a database error
        such as a failed connection or commit.
    """
    # Collect all universities with their types
    all_unis = (
        [{**uni, 'uni_type': 'UNI'} for uni in UNIVERSITAETEN]
        + [{**uni, 'uni_type': 'HAW'} for uni in FACHHOCHSCHULEN]
        + [{**uni, 'uni_type': 'PH'} for uni in PAEDAGOGISCHE_HOCHSCHULEN]
        + [{**uni, 'uni_type': 'KUNST'} for uni in KUNSTHOCHSCHULEN]
        + [{**uni, 'uni_type': 'PRIVATE'} for uni in PRIVATE_HOCHSCHULEN]
    )
    print(f"Total universities to seed: {len(all_unis)}")
    print(f" - Universitäten: {len(UNIVERSITAETEN)}")
    print(f" - Fachhochschulen: {len(FACHHOCHSCHULEN)}")
    print(f" - Pädagogische Hochschulen: {len(PAEDAGOGISCHE_HOCHSCHULEN)}")
    print(f" - Kunst-/Musikhochschulen: {len(KUNSTHOCHSCHULEN)}")
    print(f" - Private Hochschulen: {len(PRIVATE_HOCHSCHULEN)}")

    try:
        conn = psycopg2.connect(DATABASE_URL)
    except psycopg2.Error as e:
        print(f"Database error: {e}")
        return False

    inserted = 0
    skipped = 0
    errors = []
    try:
        cur = conn.cursor()
        for uni in all_unis:
            cur.execute("SAVEPOINT seed_row")
            try:
                cur.execute("""
                    INSERT INTO universities (name, short_name, url, state, uni_type)
                    VALUES (%s, %s, %s, %s, %s)
                    ON CONFLICT (url) DO NOTHING
                    RETURNING id
                """, (
                    uni['name'],
                    _derive_short_name(uni['name']),
                    uni['url'],
                    uni.get('state'),
                    uni['uni_type'],
                ))
                result = cur.fetchone()
            except Exception as e:
                # Undo only this row so the transaction stays usable.
                cur.execute("ROLLBACK TO SAVEPOINT seed_row")
                errors.append(f"{uni.get('name', '?')}: {str(e)}")
                continue
            cur.execute("RELEASE SAVEPOINT seed_row")
            # RETURNING yields a row only when the insert actually happened.
            if result:
                inserted += 1
            else:
                skipped += 1
        conn.commit()
        cur.close()
    except psycopg2.Error as e:
        print(f"Database error: {e}")
        return False
    finally:
        conn.close()

    print("\nResults:")
    print(f" Inserted: {inserted}")
    print(f" Skipped (duplicates): {skipped}")
    if errors:
        print(f" Errors: {len(errors)}")
        for err in errors[:5]:
            print(f" - {err}")
    print(f"\nDone! Total universities in database: {inserted + skipped}")
    return True
if __name__ == "__main__":
    # Banner, then seed; the exit status is 0 on success and 1 otherwise.
    rule = "=" * 60
    print(rule)
    print("Seeding Universities into edu-search-service database")
    print(rule)
    success = seed_universities()
    if success:
        sys.exit(0)
    sys.exit(1)

View File

@@ -0,0 +1,320 @@
#!/usr/bin/env python3
"""
vast.ai Profile Extractor Script
Dieses Skript läuft auf vast.ai und extrahiert Profildaten von Universitäts-Webseiten.
Verwendung auf vast.ai:
1. Lade dieses Skript auf deine vast.ai Instanz
2. Installiere Abhängigkeiten: pip install requests beautifulsoup4 openai
3. Setze Umgebungsvariablen:
- BREAKPILOT_API_URL=http://deine-ip:8086
- BREAKPILOT_API_KEY=dev-key
- OPENAI_API_KEY=sk-...
4. Starte: python vast_ai_extractor.py
"""
import os
import sys
import json
import time
import logging
import requests
from bs4 import BeautifulSoup
from typing import Optional, Dict, Any, List
# Logging setup: INFO level, "timestamp - level - message" lines.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# Configuration, read once at import time from the environment.
API_URL = os.environ.get('BREAKPILOT_API_URL', 'http://localhost:8086')
# Falls back to 'dev-key' when the variable is unset, so it is only empty when
# the variable is explicitly set to an empty string.
API_KEY = os.environ.get('BREAKPILOT_API_KEY', 'dev-key')
# Empty string when unset; extract_with_ai then uses BeautifulSoup only.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY', '')
# Number of pending tasks process_profiles requests per round.
BATCH_SIZE = 10
SLEEP_BETWEEN_REQUESTS = 1 # Seconds between requests (respect rate limits)
def fetch_pending_profiles(limit: int = 50) -> List[Dict]:
    """Fetch the profiles that still have to be extracted.

    Requests up to ``limit`` tasks from the pending-extraction endpoint and
    returns the ``tasks`` list of the reply.  Any failure is logged and
    results in an empty list.
    """
    try:
        reply = requests.get(
            f"{API_URL}/api/v1/ai/extraction/pending",
            headers={"Authorization": f"Bearer {API_KEY}"},
            params={"limit": limit},
            timeout=30,
        )
        reply.raise_for_status()
        return reply.json().get("tasks", [])
    except Exception as exc:
        logger.error(f"Fehler beim Abrufen der Profile: {exc}")
    return []
def fetch_profile_page(url: str) -> Optional[str]:
    """Download the HTML of a profile page.

    Returns the response text, or None when the request fails or the server
    answers with an error status (the error is logged).
    """
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (compatible; BreakPilot-Crawler/1.0; +https://breakpilot.de)',
        'Accept': 'text/html,application/xhtml+xml',
        'Accept-Language': 'de-DE,de;q=0.9,en;q=0.8',
    }
    try:
        page = requests.get(url, headers=request_headers, timeout=30)
        page.raise_for_status()
    except Exception as exc:
        logger.error(f"Fehler beim Laden von {url}: {exc}")
        return None
    return page.text
def extract_with_beautifulsoup(html: str, url: str) -> Dict[str, Any]:
    """Extract basic contact and link information with BeautifulSoup (no AI).

    Args:
        html: HTML of the profile page.
        url: URL the page was loaded from; relative department links are
            resolved against it.

    Returns:
        Dict with any of the keys ``email``, ``phone``, ``orcid``,
        ``google_scholar_id``, ``researchgate_url``, ``linkedin_url``,
        ``department_url`` and ``department_name``.  A key is only present
        when a matching link was found.
    """
    # Local imports: only this function needs them.
    import re
    from urllib.parse import urljoin

    soup = BeautifulSoup(html, 'html.parser')
    data = {}

    def first_href(predicate):
        """Return the href of the first <a> whose href satisfies ``predicate``."""
        link = soup.find('a', href=lambda x: x and predicate(x))
        return link['href'] if link else None

    # Email
    mailto = first_href(lambda h: h.startswith('mailto:'))
    if mailto:
        # Drop the scheme and any "?subject=..." suffix.
        data['email'] = mailto.replace('mailto:', '').split('?')[0]

    # Phone
    tel = first_href(lambda h: h.startswith('tel:'))
    if tel:
        data['phone'] = tel.replace('tel:', '')

    # ORCID: search for the identifier pattern instead of taking the last
    # path segment, which is empty when the link ends with a slash.
    orcid_href = first_href(lambda h: 'orcid.org' in h)
    if orcid_href:
        orcid_match = re.search(r'\d{4}-\d{4}-\d{4}-\d{3}[\dXx]', orcid_href)
        if orcid_match:
            data['orcid'] = orcid_match.group(0)

    # Google Scholar
    scholar_href = first_href(lambda h: 'scholar.google' in h)
    if scholar_href and 'user=' in scholar_href:
        data['google_scholar_id'] = scholar_href.split('user=')[1].split('&')[0]

    # ResearchGate
    rg_href = first_href(lambda h: 'researchgate.net' in h)
    if rg_href:
        data['researchgate_url'] = rg_href

    # LinkedIn
    linkedin_href = first_href(lambda h: 'linkedin.com' in h)
    if linkedin_href:
        data['linkedin_url'] = linkedin_href

    # Institute/department link (for hierarchy detection): take the first
    # link whose text hints at an institute or faculty.
    department_keywords = ('institut', 'fakultät', 'fachbereich', 'abteilung', 'lehrstuhl')
    for link in soup.find_all('a', href=True):
        text = link.get_text(strip=True)
        if not any(kw in text.lower() for kw in department_keywords):
            continue
        href = link['href'].strip()
        if not href or href.startswith('#'):
            continue  # in-page anchors do not identify a department page
        # urljoin resolves "/path", "path" and protocol-relative "//host/path"
        # against the page URL and leaves absolute URLs untouched.
        absolute = urljoin(url, href)
        if absolute.startswith('http'):
            data['department_url'] = absolute
            data['department_name'] = text
            break
    return data
def extract_with_ai(html: str, url: str, full_name: str) -> Dict[str, Any]:
    """Extract structured profile data with an OpenAI chat model.

    Without an OPENAI_API_KEY, or when any step of the AI path raises, the
    result of extract_with_beautifulsoup is returned instead.  On success
    the parsed JSON answer is returned, with the ORCID, Google Scholar,
    ResearchGate and LinkedIn values taken from the BeautifulSoup pass
    whenever that pass found them.
    """
    if not OPENAI_API_KEY:
        logger.warning("Kein OPENAI_API_KEY gesetzt - nutze nur BeautifulSoup")
        return extract_with_beautifulsoup(html, url)

    try:
        import openai
        client = openai.OpenAI(api_key=OPENAI_API_KEY)

        # Reduce the HTML to its relevant text: drop scripts, styles and
        # page chrome, then cap the text at 8000 characters for the API.
        soup = BeautifulSoup(html, 'html.parser')
        for element in soup(['script', 'style', 'nav', 'footer', 'header']):
            element.decompose()
        page_text = soup.get_text(separator='\n', strip=True)[:8000]

        prompt = f"""Analysiere diese Universitäts-Profilseite für {full_name} und extrahiere folgende Informationen im JSON-Format:
{{
"email": "email@uni.de oder null",
"phone": "Telefonnummer oder null",
"office": "Raum/Büro oder null",
"position": "Position/Titel (z.B. Wissenschaftlicher Mitarbeiter, Professorin) oder null",
"department_name": "Name des Instituts/der Abteilung oder null",
"research_interests": ["Liste", "der", "Forschungsthemen"] oder [],
"teaching_topics": ["Liste", "der", "Lehrveranstaltungen/Fächer"] oder [],
"supervisor_name": "Name des Vorgesetzten/Lehrstuhlinhabers falls erkennbar oder null"
}}
Profilseite von {url}:
{page_text}
Antworte NUR mit dem JSON-Objekt, keine Erklärungen."""

        completion = client.chat.completions.create(
            model="gpt-4o-mini",  # inexpensive and fast
            messages=[{"role": "user", "content": prompt}],
            temperature=0.1,
            max_tokens=500,
        )
        answer = completion.choices[0].message.content.strip()

        # Parse JSON (strip a surrounding Markdown code fence, if any)
        if answer.startswith('```'):
            fenced = answer.split('```')[1]
            answer = fenced[4:] if fenced.startswith('json') else fenced
        ai_data = json.loads(answer)

        # AI data has priority, but the profile links come from the
        # BeautifulSoup pass whenever it found them.
        bs_data = extract_with_beautifulsoup(html, url)
        for link_key in ('orcid', 'google_scholar_id', 'researchgate_url', 'linkedin_url'):
            link_value = bs_data.get(link_key)
            if link_value:
                ai_data[link_key] = link_value
        return ai_data
    except Exception as exc:
        logger.error(f"AI-Extraktion fehlgeschlagen: {exc}")
        return extract_with_beautifulsoup(html, url)
def submit_extracted_data(staff_id: str, data: Dict[str, Any]) -> bool:
    """Send the extracted data back to BreakPilot.

    The payload is ``data`` plus ``staff_id``, with every None value left
    out.  Returns True when the API accepts the POST, False (after logging)
    on any error.
    """
    try:
        # Drop None values before sending
        body = {
            key: value
            for key, value in {"staff_id": staff_id, **data}.items()
            if value is not None
        }
        auth_headers = {
            "Authorization": f"Bearer {API_KEY}",
            "Content-Type": "application/json"
        }
        reply = requests.post(
            f"{API_URL}/api/v1/ai/extraction/submit",
            json=body,
            headers=auth_headers,
            timeout=30,
        )
        reply.raise_for_status()
    except Exception as exc:
        logger.error(f"Fehler beim Senden der Daten für {staff_id}: {exc}")
        return False
    return True
def _process_profile(profile) -> bool:
    """Fetch, extract and submit one pending profile; return True on success."""
    staff_id = profile.get('staff_id')
    url = profile.get('profile_url')
    if not staff_id or not url:
        # A malformed task must not take the whole worker down.
        logger.error(f"Unvollständiger Task übersprungen: {profile}")
        return False
    full_name = profile.get('full_name', 'Unbekannt')
    logger.info(f"Verarbeite: {full_name} - {url}")

    html = fetch_profile_page(url)
    if not html:
        return False

    extracted = extract_with_ai(html, url, full_name)
    if not extracted:
        return False

    if not submit_extracted_data(staff_id, extracted):
        return False
    logger.info(f"Erfolgreich: {full_name} - Email: {extracted.get('email', 'N/A')}")
    return True


def process_profiles():
    """Main loop: fetch pending profiles, extract data, send it back.

    Runs until interrupted.  When no profiles are pending it waits 60
    seconds before asking again.  After every profile, whether it succeeded
    or not, it sleeps SLEEP_BETWEEN_REQUESTS seconds, so a batch of failing
    pages cannot turn into an unthrottled request loop.
    """
    logger.info(f"Starte Extraktion - API: {API_URL}")
    processed = 0
    errors = 0
    while True:
        # Fetch new profiles
        profiles = fetch_pending_profiles(limit=BATCH_SIZE)
        if not profiles:
            logger.info("Keine weiteren Profile zum Verarbeiten. Warte 60 Sekunden...")
            time.sleep(60)
            continue

        logger.info(f"Verarbeite {len(profiles)} Profile...")
        for profile in profiles:
            if _process_profile(profile):
                processed += 1
            else:
                errors += 1
            # Rate limiting, also after a failure
            time.sleep(SLEEP_BETWEEN_REQUESTS)
        logger.info(f"Batch abgeschlossen. Gesamt: {processed} erfolgreich, {errors} Fehler")
def main():
    """Entry point: check the configuration, probe the API, start processing.

    Exits with status 1 when no API key is configured, when the API cannot
    be reached, or when processing stops with an unexpected error.  A
    reachable API that answers the health probe with an error status is
    reported as a warning and processing still starts.
    """
    logger.info("=" * 60)
    logger.info("BreakPilot vast.ai Profile Extractor")
    logger.info("=" * 60)

    # Check configuration
    if not API_KEY:
        logger.error("BREAKPILOT_API_KEY nicht gesetzt!")
        sys.exit(1)
    if not OPENAI_API_KEY:
        logger.warning("OPENAI_API_KEY nicht gesetzt - nutze nur BeautifulSoup-Extraktion")

    # Test the connection
    # NOTE(review): the other endpoints in this script live under /api/v1/,
    # this probe uses /v1/health — confirm the path.
    try:
        response = requests.get(
            f"{API_URL}/v1/health",
            headers={"Authorization": f"Bearer {API_KEY}"},
            timeout=10
        )
    except Exception as e:
        logger.error(f"Kann API nicht erreichen: {e}")
        logger.error(f"Stelle sicher dass {API_URL} erreichbar ist!")
        sys.exit(1)

    if response.ok:
        logger.info(f"API-Verbindung OK: {response.status_code}")
    else:
        # Do not report "OK" for 4xx/5xx answers.
        logger.warning(f"API erreichbar, aber Health-Check lieferte Status {response.status_code}")

    # Start processing
    try:
        process_profiles()
    except KeyboardInterrupt:
        logger.info("Beendet durch Benutzer")
    except Exception as e:
        # logger.exception keeps the traceback for diagnosis.
        logger.exception(f"Unerwarteter Fehler: {e}")
        sys.exit(1)
# Start the extractor only when the file is executed as a script.
if __name__ == "__main__":
    main()