package database

import (
	"context"
	"errors"
	"fmt"
	"strings"

	"github.com/google/uuid"
	"github.com/jackc/pgx/v5"
)

// Repository provides database operations for universities, departments,
// staff, publications and crawl status.
type Repository struct {
	db *DB
}

// NewRepository creates a new repository backed by db.
func NewRepository(db *DB) *Repository {
	return &Repository{db: db}
}

// ============================================================================
// UNIVERSITIES
// ============================================================================

// CreateUniversity inserts a university, or updates the existing row when one
// with the same url already exists. On success u.ID, u.CreatedAt and
// u.UpdatedAt are filled from the row returned by the database.
func (r *Repository) CreateUniversity(ctx context.Context, u *University) error {
	query := `
		INSERT INTO universities (name, short_name, url, state, uni_type, staff_page_pattern)
		VALUES ($1, $2, $3, $4, $5, $6)
		ON CONFLICT (url) DO UPDATE SET
			name = EXCLUDED.name,
			short_name = EXCLUDED.short_name,
			state = EXCLUDED.state,
			uni_type = EXCLUDED.uni_type,
			staff_page_pattern = EXCLUDED.staff_page_pattern,
			updated_at = NOW()
		RETURNING id, created_at, updated_at
	`
	return r.db.Pool.QueryRow(ctx, query,
		u.Name, u.ShortName, u.URL, u.State, u.UniType, u.StaffPagePattern,
	).Scan(&u.ID, &u.CreatedAt, &u.UpdatedAt)
}

// GetUniversity retrieves a university by ID. It returns (nil, nil) when no
// row matches.
func (r *Repository) GetUniversity(ctx context.Context, id uuid.UUID) (*University, error) {
	query := `SELECT id, name, short_name, url, state, uni_type, staff_page_pattern, created_at, updated_at
		FROM universities WHERE id = $1`

	u := &University{}
	err := r.db.Pool.QueryRow(ctx, query, id).Scan(
		&u.ID, &u.Name, &u.ShortName, &u.URL, &u.State, &u.UniType,
		&u.StaffPagePattern, &u.CreatedAt, &u.UpdatedAt,
	)
	// errors.Is rather than == so a wrapped sentinel is still recognised.
	if errors.Is(err, pgx.ErrNoRows) {
		return nil, nil
	}
	if err != nil {
		return nil, err
	}
	return u, nil
}

// GetUniversityByID is an alias for GetUniversity (for interface compatibility).
func (r *Repository) GetUniversityByID(ctx context.Context, id uuid.UUID) (*University, error) {
	return r.GetUniversity(ctx, id)
}

// GetUniversityByURL retrieves a university by URL.
//
// Unlike GetUniversity, a missing row is not translated: the error produced
// by Scan is returned unchanged, so callers see the driver's no-rows error.
func (r *Repository) GetUniversityByURL(ctx context.Context, url string) (*University, error) {
	query := `SELECT id, name, short_name, url, state, uni_type, staff_page_pattern, created_at, updated_at
		FROM universities WHERE url = $1`

	u := &University{}
	err := r.db.Pool.QueryRow(ctx, query, url).Scan(
		&u.ID, &u.Name, &u.ShortName, &u.URL, &u.State, &u.UniType,
		&u.StaffPagePattern, &u.CreatedAt, &u.UpdatedAt,
	)
	if err != nil {
		return nil, err
	}
	return u, nil
}

// ListUniversities lists all universities ordered by name. The returned slice
// is nil when the table is empty.
func (r *Repository) ListUniversities(ctx context.Context) ([]University, error) {
	query := `SELECT id, name, short_name, url, state, uni_type, staff_page_pattern, created_at, updated_at
		FROM universities ORDER BY name`

	rows, err := r.db.Pool.Query(ctx, query)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	var universities []University
	for rows.Next() {
		var u University
		if err := rows.Scan(
			&u.ID, &u.Name, &u.ShortName, &u.URL, &u.State, &u.UniType,
			&u.StaffPagePattern, &u.CreatedAt, &u.UpdatedAt,
		); err != nil {
			return nil, err
		}
		universities = append(universities, u)
	}
	return universities, rows.Err()
}

// ============================================================================
// DEPARTMENTS
// ============================================================================

// CreateDepartment inserts a department, or updates the existing row when one
// with the same (university_id, name) already exists. On success d.ID,
// d.CreatedAt and d.UpdatedAt are filled from the returned row.
func (r *Repository) CreateDepartment(ctx context.Context, d *Department) error {
	query := `
		INSERT INTO departments (university_id, name, name_en, url, category, parent_id)
		VALUES ($1, $2, $3, $4, $5, $6)
		ON CONFLICT (university_id, name) DO UPDATE SET
			name_en = EXCLUDED.name_en,
			url = EXCLUDED.url,
			category = EXCLUDED.category,
			parent_id = EXCLUDED.parent_id,
			updated_at = NOW()
		RETURNING id, created_at, updated_at
	`
	return r.db.Pool.QueryRow(ctx, query,
		d.UniversityID, d.Name, d.NameEN, d.URL, d.Category, d.ParentID,
	).Scan(&d.ID, &d.CreatedAt, &d.UpdatedAt)
}

// GetDepartmentByName retrieves a department by university and name.
//
// A missing row is not translated: the error produced by Scan is returned
// unchanged.
func (r *Repository) GetDepartmentByName(ctx context.Context, uniID uuid.UUID, name string) (*Department, error) {
	query := `SELECT id, university_id, name, name_en, url, category, parent_id, created_at, updated_at
		FROM departments WHERE university_id = $1 AND name = $2`

	d := &Department{}
	err := r.db.Pool.QueryRow(ctx, query, uniID, name).Scan(
		&d.ID, &d.UniversityID, &d.Name, &d.NameEN, &d.URL, &d.Category,
		&d.ParentID, &d.CreatedAt, &d.UpdatedAt,
	)
	if err != nil {
		return nil, err
	}
	return d, nil
}

// ============================================================================
// STAFF
// ============================================================================

// CreateStaff inserts a staff member, or updates the existing row that matches
// on (university_id, first_name, last_name, department_id), where a NULL
// department is treated as the all-zero UUID for the conflict check.
//
// On update, name/title/position fields are overwritten, while contact,
// profile and research fields keep their stored value when the incoming value
// is NULL (COALESCE). crawled_at and updated_at are set to NOW(). On success
// s.ID, s.CrawledAt, s.CreatedAt and s.UpdatedAt are filled from the returned
// row.
func (r *Repository) CreateStaff(ctx context.Context, s *UniversityStaff) error {
	query := `
		INSERT INTO university_staff (
			university_id, department_id, first_name, last_name, full_name,
			title, academic_title, position, position_type, is_professor,
			email, phone, office, profile_url, photo_url,
			orcid, google_scholar_id, researchgate_url, linkedin_url, personal_website,
			research_interests, research_summary, supervisor_id, team_role, source_url
		) VALUES (
			$1, $2, $3, $4, $5, $6, $7, $8, $9, $10,
			$11, $12, $13, $14, $15, $16, $17, $18, $19, $20,
			$21, $22, $23, $24, $25
		)
		ON CONFLICT (university_id, first_name, last_name, COALESCE(department_id, '00000000-0000-0000-0000-000000000000'::uuid))
		DO UPDATE SET
			full_name = EXCLUDED.full_name,
			title = EXCLUDED.title,
			academic_title = EXCLUDED.academic_title,
			position = EXCLUDED.position,
			position_type = EXCLUDED.position_type,
			is_professor = EXCLUDED.is_professor,
			email = COALESCE(EXCLUDED.email, university_staff.email),
			phone = COALESCE(EXCLUDED.phone, university_staff.phone),
			office = COALESCE(EXCLUDED.office, university_staff.office),
			profile_url = COALESCE(EXCLUDED.profile_url, university_staff.profile_url),
			photo_url = COALESCE(EXCLUDED.photo_url, university_staff.photo_url),
			orcid = COALESCE(EXCLUDED.orcid, university_staff.orcid),
			google_scholar_id = COALESCE(EXCLUDED.google_scholar_id, university_staff.google_scholar_id),
			researchgate_url = COALESCE(EXCLUDED.researchgate_url, university_staff.researchgate_url),
			linkedin_url = COALESCE(EXCLUDED.linkedin_url, university_staff.linkedin_url),
			personal_website = COALESCE(EXCLUDED.personal_website, university_staff.personal_website),
			research_interests = COALESCE(EXCLUDED.research_interests, university_staff.research_interests),
			research_summary = COALESCE(EXCLUDED.research_summary, university_staff.research_summary),
			supervisor_id = COALESCE(EXCLUDED.supervisor_id, university_staff.supervisor_id),
			team_role = COALESCE(EXCLUDED.team_role, university_staff.team_role),
			source_url = COALESCE(EXCLUDED.source_url, university_staff.source_url),
			crawled_at = NOW(),
			updated_at = NOW()
		RETURNING id, crawled_at, created_at, updated_at
	`
	return r.db.Pool.QueryRow(ctx, query,
		s.UniversityID, s.DepartmentID, s.FirstName, s.LastName, s.FullName,
		s.Title, s.AcademicTitle, s.Position, s.PositionType, s.IsProfessor,
		s.Email, s.Phone, s.Office, s.ProfileURL, s.PhotoURL,
		s.ORCID, s.GoogleScholarID, s.ResearchgateURL, s.LinkedInURL, s.PersonalWebsite,
		s.ResearchInterests, s.ResearchSummary, s.SupervisorID, s.TeamRole, s.SourceURL,
	).Scan(&s.ID, &s.CrawledAt, &s.CreatedAt, &s.UpdatedAt)
}

// GetStaff retrieves a staff member by ID from the v_staff_full view.
//
// A missing row is not translated: the error produced by Scan is returned
// unchanged.
func (r *Repository) GetStaff(ctx context.Context, id uuid.UUID) (*UniversityStaff, error) {
	// NOTE(review): SELECT * ties the Scan order below to the view's column
	// order; a change to v_staff_full must be mirrored here.
	query := `SELECT * FROM v_staff_full WHERE id = $1`

	s := &UniversityStaff{}
	err := r.db.Pool.QueryRow(ctx, query, id).Scan(
		&s.ID, &s.UniversityID, &s.DepartmentID, &s.FirstName, &s.LastName, &s.FullName,
		&s.Title, &s.AcademicTitle, &s.Position, &s.PositionType, &s.IsProfessor,
		&s.Email, &s.Phone, &s.Office, &s.ProfileURL, &s.PhotoURL,
		&s.ORCID, &s.GoogleScholarID, &s.ResearchgateURL, &s.LinkedInURL, &s.PersonalWebsite,
		&s.ResearchInterests, &s.ResearchSummary, &s.CrawledAt, &s.LastVerified, &s.IsActive, &s.SourceURL,
		&s.CreatedAt, &s.UpdatedAt,
		// nil destinations skip view columns that have no struct field here.
		&s.UniversityName, &s.UniversityShort, nil, nil, &s.DepartmentName, nil, &s.PublicationCount,
	)
	if err != nil {
		return nil, err
	}
	return s, nil
}

// SearchStaff searches for staff members using the optional filters in params.
//
// All set filters are combined with AND. params.Query matches either a German
// full-text search over full_name and research_summary, or a case-insensitive
// substring of full_name or last_name. Limit defaults to 20 and is capped at
// 100; a negative Offset is treated as 0. The result carries the total number
// of matching rows alongside the requested page, ordered professors first and
// then by last name.
func (r *Repository) SearchStaff(ctx context.Context, params StaffSearchParams) (*StaffSearchResult, error) {
	var conditions []string
	var args []interface{}

	// add appends one bound value and a condition referring to it. The
	// placeholder number is derived from len(args) so it cannot drift from the
	// argument list; format must use %[1]d for the placeholder.
	add := func(format string, value interface{}) {
		args = append(args, value)
		conditions = append(conditions, fmt.Sprintf(format, len(args)))
	}

	baseQuery := `
		SELECT s.id, s.university_id, s.department_id, s.first_name, s.last_name, s.full_name,
			s.title, s.academic_title, s.position, s.position_type, s.is_professor,
			s.email, s.profile_url, s.photo_url, s.orcid,
			s.research_interests, s.crawled_at, s.is_active,
			u.name as university_name, u.short_name as university_short, u.state as university_state,
			d.name as department_name,
			(SELECT COUNT(*) FROM staff_publications sp WHERE sp.staff_id = s.id) as publication_count
		FROM university_staff s
		JOIN universities u ON s.university_id = u.id
		LEFT JOIN departments d ON s.department_id = d.id
	`

	if params.Query != "" {
		add(`(to_tsvector('german', COALESCE(s.full_name, '') || ' ' || COALESCE(s.research_summary, '')) @@ plainto_tsquery('german', $%[1]d) OR s.full_name ILIKE '%%' || $%[1]d || '%%' OR s.last_name ILIKE '%%' || $%[1]d || '%%')`, params.Query)
	}
	if params.UniversityID != nil {
		add("s.university_id = $%[1]d", *params.UniversityID)
	}
	if params.DepartmentID != nil {
		add("s.department_id = $%[1]d", *params.DepartmentID)
	}
	if params.State != nil {
		add("u.state = $%[1]d", *params.State)
	}
	if params.UniType != nil {
		add("u.uni_type = $%[1]d", *params.UniType)
	}
	if params.PositionType != nil {
		add("s.position_type = $%[1]d", *params.PositionType)
	}
	if params.IsProfessor != nil {
		add("s.is_professor = $%[1]d", *params.IsProfessor)
	}

	whereClause := ""
	if len(conditions) > 0 {
		whereClause = "WHERE " + strings.Join(conditions, " AND ")
	}

	// Count total matches using the same joins and filters as the page query.
	countQuery := fmt.Sprintf("SELECT COUNT(*) FROM university_staff s JOIN universities u ON s.university_id = u.id LEFT JOIN departments d ON s.department_id = d.id %s", whereClause)
	var total int
	if err := r.db.Pool.QueryRow(ctx, countQuery, args...).Scan(&total); err != nil {
		return nil, err
	}

	// Apply pagination bounds.
	limit := params.Limit
	if limit <= 0 {
		limit = 20
	}
	if limit > 100 {
		limit = 100
	}
	offset := params.Offset
	if offset < 0 {
		offset = 0
	}

	// limit and offset are ints formatted with %d, so they cannot carry SQL.
	fullQuery := fmt.Sprintf("%s %s ORDER BY s.is_professor DESC, s.last_name ASC LIMIT %d OFFSET %d",
		baseQuery, whereClause, limit, offset)

	rows, err := r.db.Pool.Query(ctx, fullQuery, args...)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	var staff []UniversityStaff
	for rows.Next() {
		var s UniversityStaff
		// university_state is selected but not stored on the result; it is
		// scanned into a throwaway so the column count lines up.
		var uniState *string
		if err := rows.Scan(
			&s.ID, &s.UniversityID, &s.DepartmentID, &s.FirstName, &s.LastName, &s.FullName,
			&s.Title, &s.AcademicTitle, &s.Position, &s.PositionType, &s.IsProfessor,
			&s.Email, &s.ProfileURL, &s.PhotoURL, &s.ORCID,
			&s.ResearchInterests, &s.CrawledAt, &s.IsActive,
			&s.UniversityName, &s.UniversityShort, &uniState,
			&s.DepartmentName, &s.PublicationCount,
		); err != nil {
			return nil, err
		}
		staff = append(staff, s)
	}

	return &StaffSearchResult{
		Staff:  staff,
		Total:  total,
		Limit:  limit,
		Offset: offset,
		Query:  params.Query,
	}, rows.Err()
}

// ============================================================================
// PUBLICATIONS
// ============================================================================

// CreatePublication inserts a publication, or updates title, abstract, year,
// venue and citation_count of the existing row when a non-NULL doi conflicts.
// On success p.ID, p.CrawledAt, p.CreatedAt and p.UpdatedAt are filled from
// the returned row.
//
// If the insert fails with an error whose text contains "duplicate", the
// publication is looked up by exact title and year instead; in that case only
// p.ID is populated and the lookup's error (if any) is returned.
func (r *Repository) CreatePublication(ctx context.Context, p *Publication) error {
	query := `
		INSERT INTO publications (
			title, title_en, abstract, abstract_en, year, month,
			pub_type, venue, venue_short, publisher,
			doi, isbn, issn, arxiv_id, pubmed_id,
			url, pdf_url, citation_count, keywords, topics, source, raw_data
		) VALUES (
			$1, $2, $3, $4, $5, $6, $7, $8, $9, $10,
			$11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22
		)
		ON CONFLICT (doi) WHERE doi IS NOT NULL DO UPDATE SET
			title = EXCLUDED.title,
			abstract = EXCLUDED.abstract,
			year = EXCLUDED.year,
			venue = EXCLUDED.venue,
			citation_count = EXCLUDED.citation_count,
			updated_at = NOW()
		RETURNING id, crawled_at, created_at, updated_at
	`

	err := r.db.Pool.QueryRow(ctx, query,
		p.Title, p.TitleEN, p.Abstract, p.AbstractEN, p.Year, p.Month,
		p.PubType, p.Venue, p.VenueShort, p.Publisher,
		p.DOI, p.ISBN, p.ISSN, p.ArxivID, p.PubmedID,
		p.URL, p.PDFURL, p.CitationCount, p.Keywords, p.Topics, p.Source, p.RawData,
	).Scan(&p.ID, &p.CrawledAt, &p.CreatedAt, &p.UpdatedAt)

	// Handle a potential duplicate that was not caught by the doi conflict
	// target.
	// NOTE(review): matching on the error text is fragile; checking the
	// driver's structured error code would be more robust, but needs an
	// import this file does not currently have.
	if err != nil && strings.Contains(err.Error(), "duplicate") {
		findQuery := `SELECT id FROM publications WHERE title = $1 AND year = $2`
		err = r.db.Pool.QueryRow(ctx, findQuery, p.Title, p.Year).Scan(&p.ID)
	}

	return err
}

// LinkStaffPublication creates a link between a staff member and a
// publication, or updates author_position and is_corresponding when the link
// already exists.
func (r *Repository) LinkStaffPublication(ctx context.Context, sp *StaffPublication) error {
	query := `
		INSERT INTO staff_publications (staff_id, publication_id, author_position, is_corresponding)
		VALUES ($1, $2, $3, $4)
		ON CONFLICT (staff_id, publication_id) DO UPDATE SET
			author_position = EXCLUDED.author_position,
			is_corresponding = EXCLUDED.is_corresponding
	`
	_, err := r.db.Pool.Exec(ctx, query,
		sp.StaffID, sp.PublicationID, sp.AuthorPosition, sp.IsCorresponding,
	)
	return err
}

// GetStaffPublications retrieves all publications linked to a staff member,
// newest year first (rows without a year last), then by title. Only the
// columns selected below are populated on each Publication.
func (r *Repository) GetStaffPublications(ctx context.Context, staffID uuid.UUID) ([]Publication, error) {
	query := `
		SELECT p.id, p.title, p.abstract, p.year, p.pub_type, p.venue, p.doi, p.url, p.citation_count
		FROM publications p
		JOIN staff_publications sp ON p.id = sp.publication_id
		WHERE sp.staff_id = $1
		ORDER BY p.year DESC NULLS LAST, p.title
	`

	rows, err := r.db.Pool.Query(ctx, query, staffID)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	var pubs []Publication
	for rows.Next() {
		var p Publication
		if err := rows.Scan(
			&p.ID, &p.Title, &p.Abstract, &p.Year, &p.PubType, &p.Venue, &p.DOI, &p.URL, &p.CitationCount,
		); err != nil {
			return nil, err
		}
		pubs = append(pubs, p)
	}
	return pubs, rows.Err()
}

// SearchPublications searches for publications using the optional filters in
// params.
//
// All set filters are combined with AND. params.Query is a German full-text
// search over title and abstract. Limit defaults to 20; a negative Offset is
// treated as 0. The result carries the total number of matching rows
// alongside the requested page, ordered by year (newest first, rows without a
// year last) and then by citation count.
func (r *Repository) SearchPublications(ctx context.Context, params PublicationSearchParams) (*PublicationSearchResult, error) {
	var conditions []string
	var args []interface{}

	// add appends one bound value and a condition referring to it. The
	// placeholder number is derived from len(args) so it cannot drift from the
	// argument list; format must use %[1]d for the placeholder.
	add := func(format string, value interface{}) {
		args = append(args, value)
		conditions = append(conditions, fmt.Sprintf(format, len(args)))
	}

	if params.Query != "" {
		add(`to_tsvector('german', COALESCE(title, '') || ' ' || COALESCE(abstract, '')) @@ plainto_tsquery('german', $%[1]d)`, params.Query)
	}
	if params.StaffID != nil {
		add(`id IN (SELECT publication_id FROM staff_publications WHERE staff_id = $%[1]d)`, *params.StaffID)
	}
	if params.Year != nil {
		add("year = $%[1]d", *params.Year)
	}
	if params.YearFrom != nil {
		add("year >= $%[1]d", *params.YearFrom)
	}
	if params.YearTo != nil {
		add("year <= $%[1]d", *params.YearTo)
	}
	if params.PubType != nil {
		add("pub_type = $%[1]d", *params.PubType)
	}

	whereClause := ""
	if len(conditions) > 0 {
		whereClause = "WHERE " + strings.Join(conditions, " AND ")
	}

	// Count total matches with the same filters as the page query.
	countQuery := fmt.Sprintf("SELECT COUNT(*) FROM publications %s", whereClause)
	var total int
	if err := r.db.Pool.QueryRow(ctx, countQuery, args...).Scan(&total); err != nil {
		return nil, err
	}

	// Pagination.
	// NOTE(review): unlike SearchStaff there is no upper bound on limit; it is
	// left uncapped here so existing callers keep their page sizes.
	limit := params.Limit
	if limit <= 0 {
		limit = 20
	}
	// PostgreSQL rejects a negative OFFSET, so clamp it as SearchStaff does
	// instead of letting the query fail.
	offset := params.Offset
	if offset < 0 {
		offset = 0
	}

	// limit and offset are ints formatted with %d, so they cannot carry SQL.
	query := fmt.Sprintf(`
		SELECT id, title, abstract, year, pub_type, venue, doi, url, citation_count, keywords
		FROM publications
		%s
		ORDER BY year DESC NULLS LAST, citation_count DESC
		LIMIT %d OFFSET %d
	`, whereClause, limit, offset)

	rows, err := r.db.Pool.Query(ctx, query, args...)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	var pubs []Publication
	for rows.Next() {
		var p Publication
		if err := rows.Scan(
			&p.ID, &p.Title, &p.Abstract, &p.Year, &p.PubType, &p.Venue, &p.DOI, &p.URL, &p.CitationCount, &p.Keywords,
		); err != nil {
			return nil, err
		}
		pubs = append(pubs, p)
	}

	return &PublicationSearchResult{
		Publications: pubs,
		Total:        total,
		Limit:        limit,
		Offset:       offset,
		Query:        params.Query,
	}, rows.Err()
}

// ============================================================================
// CRAWL STATUS
// ============================================================================

// UpdateCrawlStatus inserts the crawl status for a university, or overwrites
// every status column of the existing row for the same university_id and sets
// updated_at to NOW().
func (r *Repository) UpdateCrawlStatus(ctx context.Context, status *UniversityCrawlStatus) error {
	query := `
		INSERT INTO university_crawl_status (
			university_id, last_staff_crawl, staff_crawl_status, staff_count, staff_errors,
			last_pub_crawl, pub_crawl_status, pub_count, pub_errors,
			next_scheduled_crawl, crawl_priority
		) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11)
		ON CONFLICT (university_id) DO UPDATE SET
			last_staff_crawl = EXCLUDED.last_staff_crawl,
			staff_crawl_status = EXCLUDED.staff_crawl_status,
			staff_count = EXCLUDED.staff_count,
			staff_errors = EXCLUDED.staff_errors,
			last_pub_crawl = EXCLUDED.last_pub_crawl,
			pub_crawl_status = EXCLUDED.pub_crawl_status,
			pub_count = EXCLUDED.pub_count,
			pub_errors = EXCLUDED.pub_errors,
			next_scheduled_crawl = EXCLUDED.next_scheduled_crawl,
			crawl_priority = EXCLUDED.crawl_priority,
			updated_at = NOW()
	`
	_, err := r.db.Pool.Exec(ctx, query,
		status.UniversityID, status.LastStaffCrawl, status.StaffCrawlStatus, status.StaffCount, status.StaffErrors,
		status.LastPubCrawl, status.PubCrawlStatus, status.PubCount, status.PubErrors,
		status.NextScheduledCrawl, status.CrawlPriority,
	)
	return err
}

// GetCrawlStatus retrieves the crawl status for a university. It returns
// (nil, nil) when no row matches.
func (r *Repository) GetCrawlStatus(ctx context.Context, uniID uuid.UUID) (*UniversityCrawlStatus, error) {
	// NOTE(review): SELECT * ties the Scan order below to the table's column
	// order; a schema change must be mirrored here.
	query := `SELECT * FROM university_crawl_status WHERE university_id = $1`

	s := &UniversityCrawlStatus{}
	err := r.db.Pool.QueryRow(ctx, query, uniID).Scan(
		&s.UniversityID, &s.LastStaffCrawl, &s.StaffCrawlStatus, &s.StaffCount, &s.StaffErrors,
		&s.LastPubCrawl, &s.PubCrawlStatus, &s.PubCount, &s.PubErrors,
		&s.NextScheduledCrawl, &s.CrawlPriority, &s.CreatedAt, &s.UpdatedAt,
	)
	// errors.Is rather than == so a wrapped sentinel is still recognised.
	if errors.Is(err, pgx.ErrNoRows) {
		return nil, nil
	}
	if err != nil {
		return nil, err
	}
	return s, nil
}

// ============================================================================
// STATS
// ============================================================================

// GetStaffStats retrieves aggregate statistics: total active staff, total
// active professors, total publications, total universities, and active staff
// counts grouped by university state, university type and position type. A
// NULL grouping value is reported under the key "unknown".
//
// Any query, scan or row-iteration error aborts the call and returns
// (nil, err); partial statistics are never returned.
func (r *Repository) GetStaffStats(ctx context.Context) (*StaffStats, error) {
	stats := &StaffStats{
		ByState:        make(map[string]int),
		ByUniType:      make(map[string]int),
		ByPositionType: make(map[string]int),
	}

	// Scalar totals.
	totals := []struct {
		query string
		dest  *int
	}{
		{"SELECT COUNT(*) FROM university_staff WHERE is_active = true", &stats.TotalStaff},
		{"SELECT COUNT(*) FROM university_staff WHERE is_professor = true AND is_active = true", &stats.TotalProfessors},
		{"SELECT COUNT(*) FROM publications", &stats.TotalPublications},
		{"SELECT COUNT(*) FROM universities", &stats.TotalUniversities},
	}
	for _, t := range totals {
		if err := r.db.Pool.QueryRow(ctx, t.query).Scan(t.dest); err != nil {
			return nil, err
		}
	}

	// Grouped breakdowns; each query yields (key, count) rows.
	breakdowns := []struct {
		query string
		dest  map[string]int
	}{
		{`
			SELECT COALESCE(u.state, 'unknown'), COUNT(*)
			FROM university_staff s
			JOIN universities u ON s.university_id = u.id
			WHERE s.is_active = true
			GROUP BY u.state
		`, stats.ByState},
		{`
			SELECT COALESCE(u.uni_type, 'unknown'), COUNT(*)
			FROM university_staff s
			JOIN universities u ON s.university_id = u.id
			WHERE s.is_active = true
			GROUP BY u.uni_type
		`, stats.ByUniType},
		{`
			SELECT COALESCE(position_type, 'unknown'), COUNT(*)
			FROM university_staff
			WHERE is_active = true
			GROUP BY position_type
		`, stats.ByPositionType},
	}
	for _, b := range breakdowns {
		if err := r.groupCounts(ctx, b.query, b.dest); err != nil {
			return nil, err
		}
	}

	return stats, nil
}

// groupCounts runs a query that returns (text key, integer count) rows and
// stores each pair in dest. It reports rows.Err() after iteration so that a
// failure part-way through the result set is not mistaken for a complete,
// shorter result.
func (r *Repository) groupCounts(ctx context.Context, query string, dest map[string]int) error {
	rows, err := r.db.Pool.Query(ctx, query)
	if err != nil {
		return err
	}
	defer rows.Close()

	for rows.Next() {
		var key string
		var count int
		if err := rows.Scan(&key, &count); err != nil {
			return err
		}
		dest[key] = count
	}
	return rows.Err()
}