Files
breakpilot-lehrer/edu-search-service/scripts/seed_universities.py
Benjamin Boenisch 414e0f5ec0
All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Successful in 1m45s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 21s
feat: edu-search-service migriert, voice-service/geo-service entfernt
- edu-search-service von breakpilot-pwa nach breakpilot-lehrer kopiert (ohne vendor)
- opensearch + edu-search-service in docker-compose.yml hinzugefuegt
- voice-service aus docker-compose.yml entfernt (jetzt in breakpilot-core)
- geo-service aus docker-compose.yml entfernt (nicht mehr benoetigt)
- CI/CD: edu-search-service zu Gitea Actions und Woodpecker hinzugefuegt
  (Go lint, test mit go mod download, build, SBOM)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-15 18:36:38 +01:00

148 lines
4.7 KiB
Python

#!/usr/bin/env python3
"""
Seed German Universities directly into the edu-search-service universities table.
This script imports the same university data as load_university_seeds.py
but writes directly to the PostgreSQL universities table used by the crawler.
"""
import psycopg2
import os
import sys
# Add the backend scripts path to import university data
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '../../backend/scripts'))
from load_university_seeds import (
UNIVERSITAETEN, FACHHOCHSCHULEN, PAEDAGOGISCHE_HOCHSCHULEN,
KUNSTHOCHSCHULEN, PRIVATE_HOCHSCHULEN
)
# PostgreSQL DSN: taken from the DATABASE_URL environment variable when it is
# set, otherwise the local development default below is used.
_DEFAULT_DATABASE_URL = 'postgresql://breakpilot:breakpilot@localhost:5432/breakpilot_db'
DATABASE_URL = os.getenv('DATABASE_URL', _DEFAULT_DATABASE_URL)
def get_uni_type(original_type: str) -> str:
    """Map a seed-data type code to the database ``uni_type`` value.

    The lookup ignores letter case and surrounding whitespace, so codes such
    as ``'fh'`` or ``' PH '`` resolve to their proper category instead of
    silently falling through to the default.

    Args:
        original_type: Type code from the seed data
            (``'UNI'``, ``'FH'``, ``'PH'``, ``'KUNST'`` or ``'PRIVATE'``).

    Returns:
        The matching ``uni_type``; ``'UNI'`` for unrecognised codes and for
        ``None``.
    """
    type_map = {
        'UNI': 'UNI',
        'FH': 'HAW',  # Fachhochschule -> HAW (Hochschule für Angewandte Wissenschaften)
        'PH': 'PH',  # Pädagogische Hochschule
        'KUNST': 'KUNST',
        'PRIVATE': 'PRIVATE',
    }
    # Normalise only real strings; other values (e.g. None) go through the
    # lookup unchanged and end up on the 'UNI' default as before.
    if isinstance(original_type, str):
        original_type = original_type.strip().upper()
    return type_map.get(original_type, 'UNI')
def _derive_short_name(name):
    """Return a short display name for a university, or None if no rule applies."""
    # Well-known institutions get their established abbreviation.
    if 'KIT' in name:
        return 'KIT'
    if 'TUM' in name or name == 'Technische Universität München':
        return 'TUM'
    if 'LMU' in name or 'Ludwig-Maximilians' in name:
        return 'LMU'
    if 'RWTH' in name:
        return 'RWTH'
    if 'FAU' in name or 'Friedrich-Alexander' in name:
        return 'FAU'

    # Generic rules: swap a leading institution word for its abbreviation and
    # truncate the remainder. Only the leading prefix is removed, so a later
    # repetition of the same word inside the name is left intact.
    prefix_rules = (
        ('Universität ', 'Uni ', 15),
        ('Technische Universität ', 'TU ', 12),
        ('Hochschule ', 'HS ', 15),
    )
    for prefix, label, max_len in prefix_rules:
        if name.startswith(prefix):
            return label + name[len(prefix):][:max_len]
    return None


def seed_universities():
    """Insert all seed universities into the ``universities`` table.

    Rows whose ``url`` already exists are skipped (``ON CONFLICT DO NOTHING``).
    Each row is inserted inside its own savepoint: PostgreSQL aborts the whole
    transaction after a failed statement, so without the savepoint one bad
    record would make every following insert fail and nothing would be
    committed.

    Returns:
        True when the run finished and was committed (individual row errors
        are reported on stdout but do not change the result); False when a
        database-level error (connect, savepoint handling, commit) occurred.
    """
    # (label for the summary, seed list, type code used by the seed data)
    seed_groups = (
        ("Universitäten", UNIVERSITAETEN, 'UNI'),
        ("Fachhochschulen", FACHHOCHSCHULEN, 'FH'),
        ("Pädagogische Hochschulen", PAEDAGOGISCHE_HOCHSCHULEN, 'PH'),
        ("Kunst-/Musikhochschulen", KUNSTHOCHSCHULEN, 'KUNST'),
        ("Private Hochschulen", PRIVATE_HOCHSCHULEN, 'PRIVATE'),
    )

    # Collect all universities, tagging each with its database uni_type.
    all_unis = []
    for _, group, seed_type in seed_groups:
        uni_type = get_uni_type(seed_type)
        all_unis.extend({**uni, 'uni_type': uni_type} for uni in group)

    print(f"Total universities to seed: {len(all_unis)}")
    for label, group, _ in seed_groups:
        print(f" - {label}: {len(group)}")

    inserted = 0
    skipped = 0
    errors = []
    conn = None
    try:
        conn = psycopg2.connect(DATABASE_URL)
        with conn.cursor() as cur:
            for uni in all_unis:
                # Isolate this row so a failure can be undone without
                # aborting the surrounding transaction.
                cur.execute("SAVEPOINT seed_row")
                try:
                    cur.execute("""
                        INSERT INTO universities (name, short_name, url, state, uni_type)
                        VALUES (%s, %s, %s, %s, %s)
                        ON CONFLICT (url) DO NOTHING
                        RETURNING id
                    """, (
                        uni['name'],
                        _derive_short_name(uni['name']),
                        uni['url'],
                        uni.get('state'),
                        uni['uni_type'],
                    ))
                    # RETURNING yields a row only when the INSERT happened.
                    row = cur.fetchone()
                except Exception as e:
                    # Best-effort per row: undo just this row and keep going.
                    # If the connection itself is broken, this rollback raises
                    # psycopg2.Error and the outer handler reports it.
                    cur.execute("ROLLBACK TO SAVEPOINT seed_row")
                    errors.append(f"{uni.get('name', '<unnamed>')}: {str(e)}")
                    continue
                cur.execute("RELEASE SAVEPOINT seed_row")
                if row:
                    inserted += 1
                else:
                    skipped += 1
        conn.commit()
    except psycopg2.Error as e:
        print(f"Database error: {e}")
        return False
    finally:
        # Always release the connection; closing without a commit discards
        # any uncommitted work.
        if conn is not None:
            conn.close()

    print(f"\nResults:")
    print(f" Inserted: {inserted}")
    print(f" Skipped (duplicates): {skipped}")
    if errors:
        print(f" Errors: {len(errors)}")
        for err in errors[:5]:
            print(f" - {err}")
    # NOTE: this is the number of seed rows now present (new + pre-existing),
    # not a COUNT(*) over the whole table.
    print(f"\nDone! Total universities in database: {inserted + skipped}")
    return True
if __name__ == "__main__":
    # Script entry point: print a banner, run the seeding and translate the
    # boolean outcome into a process exit status (0 = success, 1 = failure).
    banner = "=" * 60
    print(banner)
    print("Seeding Universities into edu-search-service database")
    print(banner)
    if seed_universities():
        sys.exit(0)
    sys.exit(1)