#!/usr/bin/env python3 """ Seed German Universities directly into the edu-search-service universities table. This script imports the same university data as load_university_seeds.py but writes directly to the PostgreSQL universities table used by the crawler. """ import psycopg2 import os import sys # Add the backend scripts path to import university data sys.path.insert(0, os.path.join(os.path.dirname(__file__), '../../backend/scripts')) from load_university_seeds import ( UNIVERSITAETEN, FACHHOCHSCHULEN, PAEDAGOGISCHE_HOCHSCHULEN, KUNSTHOCHSCHULEN, PRIVATE_HOCHSCHULEN ) # Database connection from environment or defaults DATABASE_URL = os.environ.get( 'DATABASE_URL', 'postgresql://breakpilot:breakpilot@localhost:5432/breakpilot_db' ) def get_uni_type(original_type: str) -> str: """Map the type from seed data to database uni_type.""" type_map = { 'UNI': 'UNI', 'FH': 'HAW', # Fachhochschule -> HAW (Hochschule für Angewandte Wissenschaften) 'PH': 'PH', # Pädagogische Hochschule 'KUNST': 'KUNST', 'PRIVATE': 'PRIVATE', } return type_map.get(original_type, 'UNI') def seed_universities(): """Load all universities into the database.""" # Collect all universities with their types all_unis = [] for uni in UNIVERSITAETEN: all_unis.append({**uni, 'uni_type': 'UNI'}) for uni in FACHHOCHSCHULEN: all_unis.append({**uni, 'uni_type': 'HAW'}) for uni in PAEDAGOGISCHE_HOCHSCHULEN: all_unis.append({**uni, 'uni_type': 'PH'}) for uni in KUNSTHOCHSCHULEN: all_unis.append({**uni, 'uni_type': 'KUNST'}) for uni in PRIVATE_HOCHSCHULEN: all_unis.append({**uni, 'uni_type': 'PRIVATE'}) print(f"Total universities to seed: {len(all_unis)}") print(f" - Universitäten: {len(UNIVERSITAETEN)}") print(f" - Fachhochschulen: {len(FACHHOCHSCHULEN)}") print(f" - Pädagogische Hochschulen: {len(PAEDAGOGISCHE_HOCHSCHULEN)}") print(f" - Kunst-/Musikhochschulen: {len(KUNSTHOCHSCHULEN)}") print(f" - Private Hochschulen: {len(PRIVATE_HOCHSCHULEN)}") try: conn = psycopg2.connect(DATABASE_URL) cur = conn.cursor() inserted = 0 skipped = 0 errors = [] for uni in all_unis: try: # Generate a short name from the full name name = uni['name'] short_name = None # Try to extract common abbreviations if 'KIT' in name: short_name = 'KIT' elif 'TUM' in name or name == 'Technische Universität München': short_name = 'TUM' elif 'LMU' in name or 'Ludwig-Maximilians' in name: short_name = 'LMU' elif 'RWTH' in name: short_name = 'RWTH' elif 'FAU' in name or 'Friedrich-Alexander' in name: short_name = 'FAU' elif name.startswith('Universität '): short_name = 'Uni ' + name.replace('Universität ', '')[:15] elif name.startswith('Technische Universität '): short_name = 'TU ' + name.replace('Technische Universität ', '')[:12] elif name.startswith('Hochschule '): short_name = 'HS ' + name.replace('Hochschule ', '')[:15] cur.execute(""" INSERT INTO universities (name, short_name, url, state, uni_type) VALUES (%s, %s, %s, %s, %s) ON CONFLICT (url) DO NOTHING RETURNING id """, ( uni['name'], short_name, uni['url'], uni.get('state'), uni['uni_type'] )) result = cur.fetchone() if result: inserted += 1 else: skipped += 1 except Exception as e: errors.append(f"{uni['name']}: {str(e)}") conn.commit() cur.close() conn.close() print(f"\nResults:") print(f" Inserted: {inserted}") print(f" Skipped (duplicates): {skipped}") if errors: print(f" Errors: {len(errors)}") for err in errors[:5]: print(f" - {err}") print(f"\nDone! Total universities in database: {inserted + skipped}") return True except psycopg2.Error as e: print(f"Database error: {e}") return False if __name__ == "__main__": print("=" * 60) print("Seeding Universities into edu-search-service database") print("=" * 60) success = seed_universities() sys.exit(0 if success else 1)