feat: edu-search-service migriert, voice-service/geo-service entfernt
All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Successful in 1m45s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 21s

- edu-search-service von breakpilot-pwa nach breakpilot-lehrer kopiert (ohne vendor)
- opensearch + edu-search-service in docker-compose.yml hinzugefuegt
- voice-service aus docker-compose.yml entfernt (jetzt in breakpilot-core)
- geo-service aus docker-compose.yml entfernt (nicht mehr benoetigt)
- CI/CD: edu-search-service zu Gitea Actions und Woodpecker hinzugefuegt
  (Go lint, test mit go mod download, build, SBOM)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Boenisch
2026-02-15 18:36:38 +01:00
parent d4e1d6bab6
commit 414e0f5ec0
73 changed files with 23938 additions and 92 deletions

View File

@@ -0,0 +1,133 @@
package database
import (
	"context"
	"fmt"
	"log"
	"net/url"
	"os"
	"path/filepath"
	"time"

	"github.com/jackc/pgx/v5/pgxpool"
)
// DB wraps the pgx connection pool used by this package.
// The pool is exported, so callers can run queries on it directly.
type DB struct {
// Pool is the underlying pgx connection pool. Close tolerates a nil Pool;
// Health and RunMigrations use it without a nil check.
Pool *pgxpool.Pool
}
// Config holds the PostgreSQL connection settings.
// Every field is a plain string; ConnectionString combines them into a
// postgres:// URL.
type Config struct {
// Host is the database server host (NewConfig reads DB_HOST).
Host string
// Port is the database server port, kept as a string (NewConfig reads DB_PORT).
Port string
// User is the database role to connect as (NewConfig reads DB_USER).
User string
// Password is the role's password (NewConfig reads DB_PASSWORD).
Password string
// DBName is the database to connect to (NewConfig reads DB_NAME).
DBName string
// SSLMode is passed through as the sslmode URL parameter (NewConfig reads DB_SSLMODE).
SSLMode string
}
// NewConfig builds a Config from the DB_* environment variables.
//
// Each setting falls back to a default when its variable is unset or empty:
// DB_HOST=localhost, DB_PORT=5432, DB_USER=postgres, DB_PASSWORD=postgres,
// DB_NAME=breakpilot, DB_SSLMODE=disable.
func NewConfig() *Config {
	cfg := new(Config)
	cfg.Host = getEnv("DB_HOST", "localhost")
	cfg.Port = getEnv("DB_PORT", "5432")
	cfg.User = getEnv("DB_USER", "postgres")
	cfg.Password = getEnv("DB_PASSWORD", "postgres")
	cfg.DBName = getEnv("DB_NAME", "breakpilot")
	cfg.SSLMode = getEnv("DB_SSLMODE", "disable")
	return cfg
}
// getEnv returns the value of the environment variable key, or defaultValue
// when the variable is unset or set to the empty string.
func getEnv(key, defaultValue string) string {
	value := os.Getenv(key)
	if value == "" {
		return defaultValue
	}
	return value
}
// ConnectionString returns the PostgreSQL connection URL for this config, in
// the form postgres://user:password@host:port/dbname?sslmode=mode.
//
// The URL is assembled with net/url so that user, password, database name and
// sslmode are percent-encoded. Without that, a password containing characters
// such as '@', ':', '/' or '?' would produce a URL that parses incorrectly.
// For values made only of unreserved characters the result is identical to
// plain string interpolation.
func (c *Config) ConnectionString() string {
	dsn := url.URL{
		Scheme: "postgres",
		User:   url.UserPassword(c.User, c.Password),
		// Host and port are joined verbatim, as before.
		Host:     c.Host + ":" + c.Port,
		Path:     "/" + c.DBName,
		RawQuery: url.Values{"sslmode": []string{c.SSLMode}}.Encode(),
	}
	return dsn.String()
}
// New opens a pgx connection pool for cfg and verifies it with a ping.
//
// The pool is limited to 10 connections with a minimum of 2; connections are
// recycled after one hour and closed after 30 idle minutes. If the ping
// fails, the pool is closed before the error is returned. On success a log
// line naming the database, host and port is written.
func New(ctx context.Context, cfg *Config) (*DB, error) {
	poolCfg, parseErr := pgxpool.ParseConfig(cfg.ConnectionString())
	if parseErr != nil {
		return nil, fmt.Errorf("failed to parse database config: %w", parseErr)
	}

	// Pool sizing and connection lifetimes.
	poolCfg.MaxConns, poolCfg.MinConns = 10, 2
	poolCfg.MaxConnLifetime = time.Hour
	poolCfg.MaxConnIdleTime = 30 * time.Minute

	pool, poolErr := pgxpool.NewWithConfig(ctx, poolCfg)
	if poolErr != nil {
		return nil, fmt.Errorf("failed to create connection pool: %w", poolErr)
	}

	// Confirm that the database is actually reachable before handing the pool out.
	pingErr := pool.Ping(ctx)
	if pingErr != nil {
		pool.Close()
		return nil, fmt.Errorf("failed to ping database: %w", pingErr)
	}

	log.Printf("Connected to database %s on %s:%s", cfg.DBName, cfg.Host, cfg.Port)
	database := &DB{Pool: pool}
	return database, nil
}
// Close shuts down the connection pool. It is a no-op when Pool is nil.
func (db *DB) Close() {
	if db.Pool == nil {
		return
	}
	db.Pool.Close()
}
// RunMigrations loads the 001_university_staff.sql migration and executes it
// against the pool in a single Exec call.
//
// The file is looked up relative to the current working directory, trying
// migrations/, ../migrations/ and ../../migrations/ in that order; the first
// path that can be read is used. If none can be read, the error from the last
// attempt is wrapped and returned.
func (db *DB) RunMigrations(ctx context.Context) error {
	// Candidate locations, tried in order.
	migrationPaths := []string{
		"migrations/001_university_staff.sql",
		"../migrations/001_university_staff.sql",
		"../../migrations/001_university_staff.sql",
	}

	var (
		content   []byte
		foundPath string
		lastErr   error
	)
	for _, path := range migrationPaths {
		absPath, err := filepath.Abs(path)
		if err != nil {
			lastErr = err
			continue
		}
		data, err := os.ReadFile(absPath)
		if err != nil {
			// Never keep data from a failed read: a partial file must not be
			// executed as a migration.
			lastErr = err
			continue
		}
		content, foundPath = data, absPath
		break
	}

	// Success is tracked by foundPath rather than by content being non-nil,
	// so a failed read can never be mistaken for a usable file.
	if foundPath == "" {
		return fmt.Errorf("failed to read migration file from any path: %w", lastErr)
	}

	log.Printf("Running migrations from: %s", foundPath)

	if _, err := db.Pool.Exec(ctx, string(content)); err != nil {
		return fmt.Errorf("failed to execute migration: %w", err)
	}

	log.Println("Database migrations completed successfully")
	return nil
}
// Health reports whether the database is reachable by pinging the pool.
// It returns the ping error unchanged, or nil when the ping succeeds.
func (db *DB) Health(ctx context.Context) error {
	pingErr := db.Pool.Ping(ctx)
	return pingErr
}

View File

@@ -0,0 +1,205 @@
package database
import (
"time"
"github.com/google/uuid"
)
// University represents a German university/Hochschule, as stored in the
// universities table. Pointer-typed fields are optional; a nil pointer is
// omitted from the JSON encoding.
type University struct {
// ID, CreatedAt and UpdatedAt are filled in from the database by CreateUniversity.
ID uuid.UUID `json:"id"`
Name string `json:"name"`
ShortName *string `json:"short_name,omitempty"`
// URL is the conflict key used by CreateUniversity's upsert.
URL string `json:"url"`
State *string `json:"state,omitempty"`
UniType *string `json:"uni_type,omitempty"`
// NOTE(review): presumably a pattern for locating staff pages on the
// university site — confirm against the crawler that consumes it.
StaffPagePattern *string `json:"staff_page_pattern,omitempty"`
CreatedAt time.Time `json:"created_at"`
UpdatedAt time.Time `json:"updated_at"`
}
// Department represents a faculty/department at a university, as stored in
// the departments table. Pointer-typed fields are optional; a nil pointer is
// omitted from the JSON encoding.
type Department struct {
// ID, CreatedAt and UpdatedAt are filled in from the database by CreateDepartment.
ID uuid.UUID `json:"id"`
// UniversityID together with Name is the conflict key of CreateDepartment's upsert.
UniversityID uuid.UUID `json:"university_id"`
Name string `json:"name"`
// NameEN is an optional English name.
NameEN *string `json:"name_en,omitempty"`
URL *string `json:"url,omitempty"`
Category *string `json:"category,omitempty"`
// ParentID optionally references another department.
// NOTE(review): presumably the enclosing faculty — confirm against the schema.
ParentID *uuid.UUID `json:"parent_id,omitempty"`
CreatedAt time.Time `json:"created_at"`
UpdatedAt time.Time `json:"updated_at"`
}
// UniversityStaff represents a staff member at a university, as stored in the
// university_staff table, plus a few read-only fields filled in by joins.
// Pointer-typed fields are optional; a nil pointer is omitted from the JSON
// encoding.
type UniversityStaff struct {
// ID is assigned by the database (returned by CreateStaff).
ID uuid.UUID `json:"id"`
UniversityID uuid.UUID `json:"university_id"`
DepartmentID *uuid.UUID `json:"department_id,omitempty"`
// Name and title fields. LastName is the only one that is not optional.
FirstName *string `json:"first_name,omitempty"`
LastName string `json:"last_name"`
FullName *string `json:"full_name,omitempty"`
Title *string `json:"title,omitempty"`
AcademicTitle *string `json:"academic_title,omitempty"`
Position *string `json:"position,omitempty"`
PositionType *string `json:"position_type,omitempty"`
IsProfessor bool `json:"is_professor"`
// Contact details.
Email *string `json:"email,omitempty"`
Phone *string `json:"phone,omitempty"`
Office *string `json:"office,omitempty"`
ProfileURL *string `json:"profile_url,omitempty"`
PhotoURL *string `json:"photo_url,omitempty"`
// External profiles and identifiers.
ORCID *string `json:"orcid,omitempty"`
GoogleScholarID *string `json:"google_scholar_id,omitempty"`
ResearchgateURL *string `json:"researchgate_url,omitempty"`
LinkedInURL *string `json:"linkedin_url,omitempty"`
PersonalWebsite *string `json:"personal_website,omitempty"`
// Research description.
ResearchInterests []string `json:"research_interests,omitempty"`
ResearchSummary *string `json:"research_summary,omitempty"`
// SupervisorID optionally references another staff member.
SupervisorID *uuid.UUID `json:"supervisor_id,omitempty"`
// TeamRole values are German terms: leitung (lead), mitarbeiter (staff
// member), sekretariat (secretariat), hiwi (student assistant),
// doktorand (doctoral candidate).
TeamRole *string `json:"team_role,omitempty"` // leitung, mitarbeiter, sekretariat, hiwi, doktorand
// Crawl bookkeeping. CrawledAt is set to NOW() by CreateStaff's upsert.
CrawledAt time.Time `json:"crawled_at"`
LastVerified *time.Time `json:"last_verified,omitempty"`
IsActive bool `json:"is_active"`
SourceURL *string `json:"source_url,omitempty"`
CreatedAt time.Time `json:"created_at"`
UpdatedAt time.Time `json:"updated_at"`
// Joined fields (from views)
// These are populated by read queries (GetStaff, SearchStaff) and are not
// written by CreateStaff.
UniversityName *string `json:"university_name,omitempty"`
UniversityShort *string `json:"university_short,omitempty"`
DepartmentName *string `json:"department_name,omitempty"`
// PublicationCount is omitted from JSON when it is zero (omitempty on an int).
PublicationCount int `json:"publication_count,omitempty"`
// SupervisorName is not set by any query in this package's repository code
// shown alongside this type.
SupervisorName *string `json:"supervisor_name,omitempty"`
}
// Publication represents an academic publication, as stored in the
// publications table. Pointer-typed fields are optional; a nil pointer is
// omitted from the JSON encoding.
type Publication struct {
// ID is assigned by the database (returned by CreatePublication).
ID uuid.UUID `json:"id"`
Title string `json:"title"`
TitleEN *string `json:"title_en,omitempty"`
Abstract *string `json:"abstract,omitempty"`
AbstractEN *string `json:"abstract_en,omitempty"`
Year *int `json:"year,omitempty"`
Month *int `json:"month,omitempty"`
PubType *string `json:"pub_type,omitempty"`
Venue *string `json:"venue,omitempty"`
VenueShort *string `json:"venue_short,omitempty"`
Publisher *string `json:"publisher,omitempty"`
// DOI is the conflict key of CreatePublication's upsert when it is non-NULL.
DOI *string `json:"doi,omitempty"`
ISBN *string `json:"isbn,omitempty"`
ISSN *string `json:"issn,omitempty"`
ArxivID *string `json:"arxiv_id,omitempty"`
PubmedID *string `json:"pubmed_id,omitempty"`
URL *string `json:"url,omitempty"`
PDFURL *string `json:"pdf_url,omitempty"`
CitationCount int `json:"citation_count"`
Keywords []string `json:"keywords,omitempty"`
Topics []string `json:"topics,omitempty"`
Source *string `json:"source,omitempty"`
// RawData is a byte slice, which encoding/json emits as a base64 string.
// NOTE(review): if this holds a JSON document, json.RawMessage would emit
// it verbatim — confirm what API consumers expect.
RawData []byte `json:"raw_data,omitempty"`
CrawledAt time.Time `json:"crawled_at"`
CreatedAt time.Time `json:"created_at"`
UpdatedAt time.Time `json:"updated_at"`
// Joined fields
// Not written by CreatePublication and not populated by the publication
// queries in this package's repository code shown alongside this type.
Authors []string `json:"authors,omitempty"`
AuthorCount int `json:"author_count,omitempty"`
}
// StaffPublication represents the N:M relationship between staff and
// publications (one row of staff_publications).
type StaffPublication struct {
// StaffID and PublicationID together form the conflict key used by
// LinkStaffPublication.
StaffID uuid.UUID `json:"staff_id"`
PublicationID uuid.UUID `json:"publication_id"`
// AuthorPosition is optional; nil is omitted from JSON.
// NOTE(review): whether positions are 0- or 1-based is not visible here.
AuthorPosition *int `json:"author_position,omitempty"`
IsCorresponding bool `json:"is_corresponding"`
// CreatedAt is not written or read by LinkStaffPublication.
CreatedAt time.Time `json:"created_at"`
}
// UniversityCrawlStatus tracks crawl progress for a university (one row of
// university_crawl_status, keyed by UniversityID). It keeps separate state
// for the staff crawl and the publication crawl.
type UniversityCrawlStatus struct {
// UniversityID is the conflict key of UpdateCrawlStatus's upsert.
UniversityID uuid.UUID `json:"university_id"`
// Staff crawl state.
LastStaffCrawl *time.Time `json:"last_staff_crawl,omitempty"`
StaffCrawlStatus string `json:"staff_crawl_status"`
StaffCount int `json:"staff_count"`
StaffErrors []string `json:"staff_errors,omitempty"`
// Publication crawl state.
LastPubCrawl *time.Time `json:"last_pub_crawl,omitempty"`
PubCrawlStatus string `json:"pub_crawl_status"`
PubCount int `json:"pub_count"`
PubErrors []string `json:"pub_errors,omitempty"`
// Scheduling.
// NOTE(review): the ordering semantics of CrawlPriority (higher or lower
// first) are not visible here.
NextScheduledCrawl *time.Time `json:"next_scheduled_crawl,omitempty"`
CrawlPriority int `json:"crawl_priority"`
// CreatedAt and UpdatedAt are read by GetCrawlStatus; UpdateCrawlStatus
// sets updated_at to NOW() on conflict and does not send these fields.
CreatedAt time.Time `json:"created_at"`
UpdatedAt time.Time `json:"updated_at"`
}
// CrawlHistory represents a crawl audit log entry.
type CrawlHistory struct {
ID uuid.UUID `json:"id"`
// UniversityID is optional; nil is omitted from JSON.
UniversityID *uuid.UUID `json:"university_id,omitempty"`
CrawlType string `json:"crawl_type"`
Status string `json:"status"`
StartedAt time.Time `json:"started_at"`
// CompletedAt is optional; nil is omitted from JSON.
CompletedAt *time.Time `json:"completed_at,omitempty"`
// Item counters for the crawl run.
ItemsFound int `json:"items_found"`
ItemsNew int `json:"items_new"`
ItemsUpdated int `json:"items_updated"`
// Errors and Metadata are byte slices, which encoding/json emits as base64
// strings.
// NOTE(review): if these hold JSON documents, json.RawMessage would emit
// them verbatim — confirm what API consumers expect.
Errors []byte `json:"errors,omitempty"`
Metadata []byte `json:"metadata,omitempty"`
}
// StaffSearchParams contains parameters for searching staff.
// A nil pointer or empty Query means "do not filter on this field"; all
// filters that are set are combined with AND by SearchStaff.
type StaffSearchParams struct {
// Query is matched against staff names and research summary text.
Query string `json:"query,omitempty"`
UniversityID *uuid.UUID `json:"university_id,omitempty"`
DepartmentID *uuid.UUID `json:"department_id,omitempty"`
// State and UniType filter on the staff member's university.
State *string `json:"state,omitempty"`
UniType *string `json:"uni_type,omitempty"`
PositionType *string `json:"position_type,omitempty"`
IsProfessor *bool `json:"is_professor,omitempty"`
// Limit defaults to 20 when <= 0 and is capped at 100 by SearchStaff.
Limit int `json:"limit,omitempty"`
// Offset is treated as 0 when negative by SearchStaff.
Offset int `json:"offset,omitempty"`
}
// StaffSearchResult contains one page of staff search results.
type StaffSearchResult struct {
// Staff is the current page. SearchStaff leaves it nil when nothing
// matches, which encodes as JSON null rather than [].
Staff []UniversityStaff `json:"staff"`
// Total is the number of matches across all pages.
Total int `json:"total"`
// Limit and Offset are the effective values used for the query, after
// defaults and bounds were applied.
Limit int `json:"limit"`
Offset int `json:"offset"`
// Query echoes the search text from the request.
Query string `json:"query,omitempty"`
}
// PublicationSearchParams contains parameters for searching publications.
// A nil pointer or empty Query means "do not filter on this field"; all
// filters that are set are combined with AND by SearchPublications.
type PublicationSearchParams struct {
// Query is matched against publication title and abstract text.
Query string `json:"query,omitempty"`
// StaffID restricts results to publications linked to that staff member.
StaffID *uuid.UUID `json:"staff_id,omitempty"`
// Year is an exact match; YearFrom and YearTo are inclusive bounds.
Year *int `json:"year,omitempty"`
YearFrom *int `json:"year_from,omitempty"`
YearTo *int `json:"year_to,omitempty"`
PubType *string `json:"pub_type,omitempty"`
// Limit defaults to 20 when <= 0 in SearchPublications.
Limit int `json:"limit,omitempty"`
Offset int `json:"offset,omitempty"`
}
// PublicationSearchResult contains one page of publication search results.
type PublicationSearchResult struct {
// Publications is the current page. SearchPublications leaves it nil when
// nothing matches, which encodes as JSON null rather than [].
Publications []Publication `json:"publications"`
// Total is the number of matches across all pages.
Total int `json:"total"`
// Limit and Offset are the values used for the query.
Limit int `json:"limit"`
Offset int `json:"offset"`
// Query echoes the search text from the request.
Query string `json:"query,omitempty"`
}
// StaffStats contains statistics about staff data, as assembled by
// GetStaffStats.
type StaffStats struct {
// TotalStaff and TotalProfessors count only rows with is_active = true.
TotalStaff int `json:"total_staff"`
TotalProfessors int `json:"total_professors"`
// TotalPublications and TotalUniversities count all rows of their tables.
TotalPublications int `json:"total_publications"`
TotalUniversities int `json:"total_universities"`
// Active-staff counts grouped by university state, university type and
// position type. A NULL group value is reported under the key "unknown".
ByState map[string]int `json:"by_state,omitempty"`
ByUniType map[string]int `json:"by_uni_type,omitempty"`
ByPositionType map[string]int `json:"by_position_type,omitempty"`
// RecentCrawls is not populated by GetStaffStats.
RecentCrawls []CrawlHistory `json:"recent_crawls,omitempty"`
}

View File

@@ -0,0 +1,684 @@
package database
import (
"context"
"fmt"
"strings"
"github.com/google/uuid"
"github.com/jackc/pgx/v5"
)
// Repository provides database operations for universities, departments,
// staff, publications and crawl status. All methods run their queries on the
// pool of the wrapped DB.
type Repository struct {
// db is the connection wrapper supplied to NewRepository.
db *DB
}
// NewRepository returns a Repository that runs its queries through db.
func NewRepository(db *DB) *Repository {
	repo := new(Repository)
	repo.db = db
	return repo
}
// ============================================================================
// UNIVERSITIES
// ============================================================================
// CreateUniversity inserts a university, or updates the existing row when one
// with the same url already exists (upsert keyed on url).
//
// On conflict the name, short name, state, type and staff page pattern are
// overwritten and updated_at is set to NOW(). The database-assigned ID,
// CreatedAt and UpdatedAt are written back into u.
func (r *Repository) CreateUniversity(ctx context.Context, u *University) error {
	const upsert = `
INSERT INTO universities (name, short_name, url, state, uni_type, staff_page_pattern)
VALUES ($1, $2, $3, $4, $5, $6)
ON CONFLICT (url) DO UPDATE SET
name = EXCLUDED.name,
short_name = EXCLUDED.short_name,
state = EXCLUDED.state,
uni_type = EXCLUDED.uni_type,
staff_page_pattern = EXCLUDED.staff_page_pattern,
updated_at = NOW()
RETURNING id, created_at, updated_at
`
	row := r.db.Pool.QueryRow(ctx, upsert,
		u.Name, u.ShortName, u.URL, u.State, u.UniType, u.StaffPagePattern,
	)
	return row.Scan(&u.ID, &u.CreatedAt, &u.UpdatedAt)
}
// GetUniversity looks up a university by its ID.
//
// It returns (nil, nil) when no row matches, and (nil, err) for any other
// query or scan error.
func (r *Repository) GetUniversity(ctx context.Context, id uuid.UUID) (*University, error) {
	const selectByID = `SELECT id, name, short_name, url, state, uni_type, staff_page_pattern, created_at, updated_at
FROM universities WHERE id = $1`

	uni := new(University)
	row := r.db.Pool.QueryRow(ctx, selectByID, id)
	err := row.Scan(
		&uni.ID, &uni.Name, &uni.ShortName, &uni.URL, &uni.State, &uni.UniType,
		&uni.StaffPagePattern, &uni.CreatedAt, &uni.UpdatedAt,
	)
	switch {
	case err == pgx.ErrNoRows:
		// "Not found" is reported as a nil result, not as an error.
		return nil, nil
	case err != nil:
		return nil, err
	}
	return uni, nil
}
// GetUniversityByID delegates to GetUniversity; it exists under this name for
// interface compatibility and returns exactly what GetUniversity returns.
func (r *Repository) GetUniversityByID(ctx context.Context, id uuid.UUID) (*University, error) {
	uni, err := r.GetUniversity(ctx, id)
	return uni, err
}
// GetUniversityByURL looks up a university by its URL.
//
// Any error from the query or scan is returned as-is. In particular, a
// missing row is reported through the error returned by Scan; it is not
// translated into a (nil, nil) result here.
func (r *Repository) GetUniversityByURL(ctx context.Context, url string) (*University, error) {
	const selectByURL = `SELECT id, name, short_name, url, state, uni_type, staff_page_pattern, created_at, updated_at
FROM universities WHERE url = $1`

	uni := new(University)
	row := r.db.Pool.QueryRow(ctx, selectByURL, url)
	if scanErr := row.Scan(
		&uni.ID, &uni.Name, &uni.ShortName, &uni.URL, &uni.State, &uni.UniType,
		&uni.StaffPagePattern, &uni.CreatedAt, &uni.UpdatedAt,
	); scanErr != nil {
		return nil, scanErr
	}
	return uni, nil
}
// ListUniversities returns every university, ordered by name.
//
// The slice is nil when the table is empty. A scan failure aborts with
// (nil, err); an error surfaced after iteration is returned together with the
// rows collected so far.
func (r *Repository) ListUniversities(ctx context.Context) ([]University, error) {
	const selectAll = `SELECT id, name, short_name, url, state, uni_type, staff_page_pattern, created_at, updated_at
FROM universities ORDER BY name`

	rows, queryErr := r.db.Pool.Query(ctx, selectAll)
	if queryErr != nil {
		return nil, queryErr
	}
	defer rows.Close()

	var result []University
	for rows.Next() {
		uni := University{}
		scanErr := rows.Scan(
			&uni.ID, &uni.Name, &uni.ShortName, &uni.URL, &uni.State, &uni.UniType,
			&uni.StaffPagePattern, &uni.CreatedAt, &uni.UpdatedAt,
		)
		if scanErr != nil {
			return nil, scanErr
		}
		result = append(result, uni)
	}
	return result, rows.Err()
}
// ============================================================================
// DEPARTMENTS
// ============================================================================
// CreateDepartment inserts a department, or updates the existing row when one
// with the same (university_id, name) already exists.
//
// On conflict the English name, URL, category and parent are overwritten and
// updated_at is set to NOW(). The database-assigned ID, CreatedAt and
// UpdatedAt are written back into d.
func (r *Repository) CreateDepartment(ctx context.Context, d *Department) error {
	const upsert = `
INSERT INTO departments (university_id, name, name_en, url, category, parent_id)
VALUES ($1, $2, $3, $4, $5, $6)
ON CONFLICT (university_id, name) DO UPDATE SET
name_en = EXCLUDED.name_en,
url = EXCLUDED.url,
category = EXCLUDED.category,
parent_id = EXCLUDED.parent_id,
updated_at = NOW()
RETURNING id, created_at, updated_at
`
	row := r.db.Pool.QueryRow(ctx, upsert,
		d.UniversityID, d.Name, d.NameEN, d.URL, d.Category, d.ParentID,
	)
	return row.Scan(&d.ID, &d.CreatedAt, &d.UpdatedAt)
}
// GetDepartmentByName looks up a department by its university and exact name.
//
// Any error from the query or scan is returned as-is, including the one Scan
// reports when no row matches.
func (r *Repository) GetDepartmentByName(ctx context.Context, uniID uuid.UUID, name string) (*Department, error) {
	const selectByName = `SELECT id, university_id, name, name_en, url, category, parent_id, created_at, updated_at
FROM departments WHERE university_id = $1 AND name = $2`

	dept := new(Department)
	row := r.db.Pool.QueryRow(ctx, selectByName, uniID, name)
	if scanErr := row.Scan(
		&dept.ID, &dept.UniversityID, &dept.Name, &dept.NameEN, &dept.URL, &dept.Category,
		&dept.ParentID, &dept.CreatedAt, &dept.UpdatedAt,
	); scanErr != nil {
		return nil, scanErr
	}
	return dept, nil
}
// ============================================================================
// STAFF
// ============================================================================
// CreateStaff inserts a staff member, or updates the existing row that has
// the same university, first name, last name and department (a NULL
// department is compared as the all-zero UUID).
//
// On conflict, the name/title/position fields and is_professor are always
// overwritten. The contact, profile, research, supervisor, team-role and
// source fields are only overwritten when the new value is non-NULL;
// otherwise the stored value is kept. crawled_at and updated_at are set to
// NOW(). The database-assigned ID and timestamps are written back into s.
func (r *Repository) CreateStaff(ctx context.Context, s *UniversityStaff) error {
	const upsert = `
INSERT INTO university_staff (
university_id, department_id, first_name, last_name, full_name,
title, academic_title, position, position_type, is_professor,
email, phone, office, profile_url, photo_url,
orcid, google_scholar_id, researchgate_url, linkedin_url, personal_website,
research_interests, research_summary, supervisor_id, team_role, source_url
) VALUES (
$1, $2, $3, $4, $5, $6, $7, $8, $9, $10,
$11, $12, $13, $14, $15, $16, $17, $18, $19, $20,
$21, $22, $23, $24, $25
)
ON CONFLICT (university_id, first_name, last_name, COALESCE(department_id, '00000000-0000-0000-0000-000000000000'::uuid))
DO UPDATE SET
full_name = EXCLUDED.full_name,
title = EXCLUDED.title,
academic_title = EXCLUDED.academic_title,
position = EXCLUDED.position,
position_type = EXCLUDED.position_type,
is_professor = EXCLUDED.is_professor,
email = COALESCE(EXCLUDED.email, university_staff.email),
phone = COALESCE(EXCLUDED.phone, university_staff.phone),
office = COALESCE(EXCLUDED.office, university_staff.office),
profile_url = COALESCE(EXCLUDED.profile_url, university_staff.profile_url),
photo_url = COALESCE(EXCLUDED.photo_url, university_staff.photo_url),
orcid = COALESCE(EXCLUDED.orcid, university_staff.orcid),
google_scholar_id = COALESCE(EXCLUDED.google_scholar_id, university_staff.google_scholar_id),
researchgate_url = COALESCE(EXCLUDED.researchgate_url, university_staff.researchgate_url),
linkedin_url = COALESCE(EXCLUDED.linkedin_url, university_staff.linkedin_url),
personal_website = COALESCE(EXCLUDED.personal_website, university_staff.personal_website),
research_interests = COALESCE(EXCLUDED.research_interests, university_staff.research_interests),
research_summary = COALESCE(EXCLUDED.research_summary, university_staff.research_summary),
supervisor_id = COALESCE(EXCLUDED.supervisor_id, university_staff.supervisor_id),
team_role = COALESCE(EXCLUDED.team_role, university_staff.team_role),
source_url = COALESCE(EXCLUDED.source_url, university_staff.source_url),
crawled_at = NOW(),
updated_at = NOW()
RETURNING id, crawled_at, created_at, updated_at
`
	// Positional arguments; the order must match $1..$25 in the INSERT column list.
	values := []interface{}{
		s.UniversityID, s.DepartmentID, s.FirstName, s.LastName, s.FullName,
		s.Title, s.AcademicTitle, s.Position, s.PositionType, s.IsProfessor,
		s.Email, s.Phone, s.Office, s.ProfileURL, s.PhotoURL,
		s.ORCID, s.GoogleScholarID, s.ResearchgateURL, s.LinkedInURL, s.PersonalWebsite,
		s.ResearchInterests, s.ResearchSummary, s.SupervisorID, s.TeamRole, s.SourceURL,
	}
	row := r.db.Pool.QueryRow(ctx, upsert, values...)
	return row.Scan(&s.ID, &s.CrawledAt, &s.CreatedAt, &s.UpdatedAt)
}
// GetStaff retrieves a staff member by ID from the v_staff_full view.
//
// Any error from the query or scan is returned as-is, including the one Scan
// reports when no row matches. SupervisorID, TeamRole and SupervisorName are
// not among the scan targets and stay at their zero values.
//
// NOTE(review): the query uses SELECT * with positional scan targets, so this
// only works while the view exposes exactly these 36 columns in this order.
// The view definition is not visible here — confirm it, or list the columns
// explicitly.
func (r *Repository) GetStaff(ctx context.Context, id uuid.UUID) (*UniversityStaff, error) {
query := `SELECT * FROM v_staff_full WHERE id = $1`
s := &UniversityStaff{}
err := r.db.Pool.QueryRow(ctx, query, id).Scan(
&s.ID, &s.UniversityID, &s.DepartmentID, &s.FirstName, &s.LastName, &s.FullName,
&s.Title, &s.AcademicTitle, &s.Position, &s.PositionType, &s.IsProfessor,
&s.Email, &s.Phone, &s.Office, &s.ProfileURL, &s.PhotoURL,
&s.ORCID, &s.GoogleScholarID, &s.ResearchgateURL, &s.LinkedInURL, &s.PersonalWebsite,
&s.ResearchInterests, &s.ResearchSummary, &s.CrawledAt, &s.LastVerified, &s.IsActive, &s.SourceURL,
// The nil targets below stand in for view columns that have no field on
// UniversityStaff; pgx skips a column whose scan target is nil.
&s.CreatedAt, &s.UpdatedAt, &s.UniversityName, &s.UniversityShort, nil, nil,
&s.DepartmentName, nil, &s.PublicationCount,
)
if err != nil {
return nil, err
}
return s, nil
}
// SearchStaff returns one page of staff members matching params.
//
// Every filter that is set is combined with AND. Query matches the German
// full-text index over full name and research summary, or a case-insensitive
// substring of the full name or last name. All user-supplied values are sent
// as bound parameters. Limit defaults to 20 and is capped at 100; a negative
// offset is treated as 0. Results are ordered professors first, then by last
// name. Total is the match count across all pages.
func (r *Repository) SearchStaff(ctx context.Context, params StaffSearchParams) (*StaffSearchResult, error) {
	var (
		filters []string
		values  []interface{}
	)
	// addEquals registers "column = $N", where N is the position of value in
	// the argument list.
	addEquals := func(column string, value interface{}) {
		values = append(values, value)
		filters = append(filters, fmt.Sprintf("%s = $%d", column, len(values)))
	}

	const selectStaff = `
SELECT s.id, s.university_id, s.department_id, s.first_name, s.last_name, s.full_name,
s.title, s.academic_title, s.position, s.position_type, s.is_professor,
s.email, s.profile_url, s.photo_url, s.orcid,
s.research_interests, s.crawled_at, s.is_active,
u.name as university_name, u.short_name as university_short, u.state as university_state,
d.name as department_name,
(SELECT COUNT(*) FROM staff_publications sp WHERE sp.staff_id = s.id) as publication_count
FROM university_staff s
JOIN universities u ON s.university_id = u.id
LEFT JOIN departments d ON s.department_id = d.id
`

	if params.Query != "" {
		// The same bound parameter is referenced three times.
		values = append(values, params.Query)
		n := len(values)
		filters = append(filters, fmt.Sprintf(
			`(to_tsvector('german', COALESCE(s.full_name, '') || ' ' || COALESCE(s.research_summary, '')) @@ plainto_tsquery('german', $%d)
OR s.full_name ILIKE '%%' || $%d || '%%'
OR s.last_name ILIKE '%%' || $%d || '%%')`,
			n, n, n))
	}
	if params.UniversityID != nil {
		addEquals("s.university_id", *params.UniversityID)
	}
	if params.DepartmentID != nil {
		addEquals("s.department_id", *params.DepartmentID)
	}
	if params.State != nil {
		addEquals("u.state", *params.State)
	}
	if params.UniType != nil {
		addEquals("u.uni_type", *params.UniType)
	}
	if params.PositionType != nil {
		addEquals("s.position_type", *params.PositionType)
	}
	if params.IsProfessor != nil {
		addEquals("s.is_professor", *params.IsProfessor)
	}

	var where string
	if len(filters) > 0 {
		where = "WHERE " + strings.Join(filters, " AND ")
	}

	// Total number of matches, independent of pagination.
	countSQL := "SELECT COUNT(*) FROM university_staff s JOIN universities u ON s.university_id = u.id LEFT JOIN departments d ON s.department_id = d.id " + where
	var total int
	if countErr := r.db.Pool.QueryRow(ctx, countSQL, values...).Scan(&total); countErr != nil {
		return nil, countErr
	}

	// Page size: default 20, at most 100.
	limit := params.Limit
	switch {
	case limit <= 0:
		limit = 20
	case limit > 100:
		limit = 100
	}
	offset := params.Offset
	if offset < 0 {
		offset = 0
	}

	pageSQL := fmt.Sprintf("%s %s ORDER BY s.is_professor DESC, s.last_name ASC LIMIT %d OFFSET %d",
		selectStaff, where, limit, offset)

	rows, queryErr := r.db.Pool.Query(ctx, pageSQL, values...)
	if queryErr != nil {
		return nil, queryErr
	}
	defer rows.Close()

	var page []UniversityStaff
	for rows.Next() {
		var (
			member UniversityStaff
			// university_state is selected but has no field on UniversityStaff.
			discardedState *string
		)
		scanErr := rows.Scan(
			&member.ID, &member.UniversityID, &member.DepartmentID, &member.FirstName, &member.LastName, &member.FullName,
			&member.Title, &member.AcademicTitle, &member.Position, &member.PositionType, &member.IsProfessor,
			&member.Email, &member.ProfileURL, &member.PhotoURL, &member.ORCID,
			&member.ResearchInterests, &member.CrawledAt, &member.IsActive,
			&member.UniversityName, &member.UniversityShort, &discardedState,
			&member.DepartmentName, &member.PublicationCount,
		)
		if scanErr != nil {
			return nil, scanErr
		}
		page = append(page, member)
	}

	result := &StaffSearchResult{
		Staff:  page,
		Total:  total,
		Limit:  limit,
		Offset: offset,
		Query:  params.Query,
	}
	return result, rows.Err()
}
// ============================================================================
// PUBLICATIONS
// ============================================================================
// CreatePublication inserts a publication, or updates the existing row when
// one with the same non-NULL DOI already exists.
//
// On a DOI conflict the title, abstract, year, venue and citation count are
// overwritten and updated_at is set to NOW(). The database-assigned ID and
// timestamps are written back into p.
//
// If the insert fails with an error whose message contains "duplicate", the
// existing publication is looked up by title and year instead and its ID and
// timestamps are written into p. The year comparison is NULL-safe, so a
// publication without a year can be found as well. Any other error, or a
// failed fallback lookup, is returned.
func (r *Repository) CreatePublication(ctx context.Context, p *Publication) error {
	query := `
INSERT INTO publications (
title, title_en, abstract, abstract_en, year, month,
pub_type, venue, venue_short, publisher,
doi, isbn, issn, arxiv_id, pubmed_id,
url, pdf_url, citation_count, keywords, topics, source, raw_data
) VALUES (
$1, $2, $3, $4, $5, $6, $7, $8, $9, $10,
$11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22
)
ON CONFLICT (doi) WHERE doi IS NOT NULL DO UPDATE SET
title = EXCLUDED.title,
abstract = EXCLUDED.abstract,
year = EXCLUDED.year,
venue = EXCLUDED.venue,
citation_count = EXCLUDED.citation_count,
updated_at = NOW()
RETURNING id, crawled_at, created_at, updated_at
`
	err := r.db.Pool.QueryRow(ctx, query,
		p.Title, p.TitleEN, p.Abstract, p.AbstractEN, p.Year, p.Month,
		p.PubType, p.Venue, p.VenueShort, p.Publisher,
		p.DOI, p.ISBN, p.ISSN, p.ArxivID, p.PubmedID,
		p.URL, p.PDFURL, p.CitationCount, p.Keywords, p.Topics, p.Source, p.RawData,
	).Scan(&p.ID, &p.CrawledAt, &p.CreatedAt, &p.UpdatedAt)

	// Handle a duplicate that was not resolved by the DOI conflict clause.
	// NOTE(review): matching on the error text is fragile; checking the
	// PostgreSQL error code for a unique violation would be more robust.
	if err != nil && strings.Contains(err.Error(), "duplicate") {
		// "year = $2" never matches when p.Year is nil, because comparing
		// with NULL yields NULL. IS NOT DISTINCT FROM treats two NULLs as
		// equal. The timestamps are fetched too, so p ends up populated the
		// same way as on the main path.
		findQuery := `SELECT id, crawled_at, created_at, updated_at
FROM publications WHERE title = $1 AND year IS NOT DISTINCT FROM $2`
		err = r.db.Pool.QueryRow(ctx, findQuery, p.Title, p.Year).
			Scan(&p.ID, &p.CrawledAt, &p.CreatedAt, &p.UpdatedAt)
	}
	return err
}
// LinkStaffPublication records that a staff member authored a publication.
//
// If the (staff_id, publication_id) link already exists, its author position
// and corresponding-author flag are overwritten.
func (r *Repository) LinkStaffPublication(ctx context.Context, sp *StaffPublication) error {
	const upsert = `
INSERT INTO staff_publications (staff_id, publication_id, author_position, is_corresponding)
VALUES ($1, $2, $3, $4)
ON CONFLICT (staff_id, publication_id) DO UPDATE SET
author_position = EXCLUDED.author_position,
is_corresponding = EXCLUDED.is_corresponding
`
	if _, execErr := r.db.Pool.Exec(ctx, upsert,
		sp.StaffID, sp.PublicationID, sp.AuthorPosition, sp.IsCorresponding,
	); execErr != nil {
		return execErr
	}
	return nil
}
// GetStaffPublications returns all publications linked to a staff member,
// newest year first (publications without a year last), then by title.
//
// Only id, title, abstract, year, type, venue, DOI, URL and citation count
// are loaded; the remaining Publication fields stay at their zero values.
// The slice is nil when the staff member has no publications.
func (r *Repository) GetStaffPublications(ctx context.Context, staffID uuid.UUID) ([]Publication, error) {
	const selectForStaff = `
SELECT p.id, p.title, p.abstract, p.year, p.pub_type, p.venue, p.doi, p.url, p.citation_count
FROM publications p
JOIN staff_publications sp ON p.id = sp.publication_id
WHERE sp.staff_id = $1
ORDER BY p.year DESC NULLS LAST, p.title
`
	rows, queryErr := r.db.Pool.Query(ctx, selectForStaff, staffID)
	if queryErr != nil {
		return nil, queryErr
	}
	defer rows.Close()

	var result []Publication
	for rows.Next() {
		pub := Publication{}
		scanErr := rows.Scan(
			&pub.ID, &pub.Title, &pub.Abstract, &pub.Year, &pub.PubType, &pub.Venue, &pub.DOI, &pub.URL, &pub.CitationCount,
		)
		if scanErr != nil {
			return nil, scanErr
		}
		result = append(result, pub)
	}
	return result, rows.Err()
}
// SearchPublications returns one page of publications matching params.
//
// Every filter that is set is combined with AND. Query matches the German
// full-text index over title and abstract; StaffID restricts results to
// publications linked to that staff member; Year is an exact match, while
// YearFrom and YearTo are inclusive bounds. All user-supplied values are sent
// as bound parameters.
//
// Pagination uses the same bounds as SearchStaff: Limit defaults to 20 and is
// capped at 100, and a negative Offset is treated as 0 (PostgreSQL rejects a
// negative OFFSET). The effective values are echoed in the result. Results
// are ordered newest year first (NULL years last), then by citation count.
// Total is the match count across all pages.
func (r *Repository) SearchPublications(ctx context.Context, params PublicationSearchParams) (*PublicationSearchResult, error) {
	var conditions []string
	var args []interface{}
	argNum := 1

	if params.Query != "" {
		conditions = append(conditions, fmt.Sprintf(
			`to_tsvector('german', COALESCE(title, '') || ' ' || COALESCE(abstract, '')) @@ plainto_tsquery('german', $%d)`,
			argNum))
		args = append(args, params.Query)
		argNum++
	}
	if params.StaffID != nil {
		conditions = append(conditions, fmt.Sprintf(
			`id IN (SELECT publication_id FROM staff_publications WHERE staff_id = $%d)`,
			argNum))
		args = append(args, *params.StaffID)
		argNum++
	}
	if params.Year != nil {
		conditions = append(conditions, fmt.Sprintf("year = $%d", argNum))
		args = append(args, *params.Year)
		argNum++
	}
	if params.YearFrom != nil {
		conditions = append(conditions, fmt.Sprintf("year >= $%d", argNum))
		args = append(args, *params.YearFrom)
		argNum++
	}
	if params.YearTo != nil {
		conditions = append(conditions, fmt.Sprintf("year <= $%d", argNum))
		args = append(args, *params.YearTo)
		argNum++
	}
	if params.PubType != nil {
		conditions = append(conditions, fmt.Sprintf("pub_type = $%d", argNum))
		args = append(args, *params.PubType)
		argNum++
	}

	whereClause := ""
	if len(conditions) > 0 {
		whereClause = "WHERE " + strings.Join(conditions, " AND ")
	}

	// Total number of matches, independent of pagination.
	countQuery := fmt.Sprintf("SELECT COUNT(*) FROM publications %s", whereClause)
	var total int
	if err := r.db.Pool.QueryRow(ctx, countQuery, args...).Scan(&total); err != nil {
		return nil, err
	}

	// Bound the page size so a caller cannot request an arbitrarily large
	// result set.
	limit := params.Limit
	if limit <= 0 {
		limit = 20
	}
	if limit > 100 {
		limit = 100
	}
	// A negative offset would make the query fail, so clamp it to zero.
	offset := params.Offset
	if offset < 0 {
		offset = 0
	}

	query := fmt.Sprintf(`
SELECT id, title, abstract, year, pub_type, venue, doi, url, citation_count, keywords
FROM publications %s
ORDER BY year DESC NULLS LAST, citation_count DESC
LIMIT %d OFFSET %d
`, whereClause, limit, offset)

	rows, err := r.db.Pool.Query(ctx, query, args...)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	var pubs []Publication
	for rows.Next() {
		var p Publication
		if err := rows.Scan(
			&p.ID, &p.Title, &p.Abstract, &p.Year, &p.PubType, &p.Venue, &p.DOI, &p.URL, &p.CitationCount, &p.Keywords,
		); err != nil {
			return nil, err
		}
		pubs = append(pubs, p)
	}

	return &PublicationSearchResult{
		Publications: pubs,
		Total:        total,
		Limit:        limit,
		Offset:       offset,
		Query:        params.Query,
	}, rows.Err()
}
// ============================================================================
// CRAWL STATUS
// ============================================================================
// UpdateCrawlStatus stores the crawl status for a university, inserting the
// row if none exists for status.UniversityID and otherwise overwriting every
// staff, publication and scheduling column with the supplied values and
// setting updated_at to NOW().
func (r *Repository) UpdateCrawlStatus(ctx context.Context, status *UniversityCrawlStatus) error {
	const upsert = `
INSERT INTO university_crawl_status (
university_id, last_staff_crawl, staff_crawl_status, staff_count, staff_errors,
last_pub_crawl, pub_crawl_status, pub_count, pub_errors,
next_scheduled_crawl, crawl_priority
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11)
ON CONFLICT (university_id) DO UPDATE SET
last_staff_crawl = EXCLUDED.last_staff_crawl,
staff_crawl_status = EXCLUDED.staff_crawl_status,
staff_count = EXCLUDED.staff_count,
staff_errors = EXCLUDED.staff_errors,
last_pub_crawl = EXCLUDED.last_pub_crawl,
pub_crawl_status = EXCLUDED.pub_crawl_status,
pub_count = EXCLUDED.pub_count,
pub_errors = EXCLUDED.pub_errors,
next_scheduled_crawl = EXCLUDED.next_scheduled_crawl,
crawl_priority = EXCLUDED.crawl_priority,
updated_at = NOW()
`
	// Positional arguments; the order must match $1..$11 above.
	values := []interface{}{
		status.UniversityID, status.LastStaffCrawl, status.StaffCrawlStatus, status.StaffCount, status.StaffErrors,
		status.LastPubCrawl, status.PubCrawlStatus, status.PubCount, status.PubErrors,
		status.NextScheduledCrawl, status.CrawlPriority,
	}
	if _, execErr := r.db.Pool.Exec(ctx, upsert, values...); execErr != nil {
		return execErr
	}
	return nil
}
// GetCrawlStatus looks up the crawl status row for a university.
//
// It returns (nil, nil) when the university has no status row, and (nil, err)
// for any other query or scan error. The query uses SELECT * with positional
// scan targets, so it relies on the table's columns being in the order of the
// Scan call below.
func (r *Repository) GetCrawlStatus(ctx context.Context, uniID uuid.UUID) (*UniversityCrawlStatus, error) {
	const selectStatus = `SELECT * FROM university_crawl_status WHERE university_id = $1`

	status := new(UniversityCrawlStatus)
	row := r.db.Pool.QueryRow(ctx, selectStatus, uniID)
	err := row.Scan(
		&status.UniversityID, &status.LastStaffCrawl, &status.StaffCrawlStatus, &status.StaffCount, &status.StaffErrors,
		&status.LastPubCrawl, &status.PubCrawlStatus, &status.PubCount, &status.PubErrors,
		&status.NextScheduledCrawl, &status.CrawlPriority, &status.CreatedAt, &status.UpdatedAt,
	)
	switch {
	case err == pgx.ErrNoRows:
		// "Not found" is reported as a nil result, not as an error.
		return nil, nil
	case err != nil:
		return nil, err
	}
	return status, nil
}
// ============================================================================
// STATS
// ============================================================================
// GetStaffStats gathers aggregate statistics about the stored staff data.
//
// It counts active staff, active professors, all publications and all
// universities, then breaks the active staff down by university state,
// university type and position type. A NULL group value is reported under the
// key "unknown". RecentCrawls is not populated. The first failing query
// aborts the call with (nil, err); this includes errors that surface while
// iterating a breakdown's rows, which would otherwise leave that breakdown
// silently incomplete.
func (r *Repository) GetStaffStats(ctx context.Context) (*StaffStats, error) {
	stats := &StaffStats{
		ByState:        make(map[string]int),
		ByUniType:      make(map[string]int),
		ByPositionType: make(map[string]int),
	}

	// Scalar totals.
	totals := []struct {
		query string
		dest  *int
	}{
		{"SELECT COUNT(*) FROM university_staff WHERE is_active = true", &stats.TotalStaff},
		{"SELECT COUNT(*) FROM university_staff WHERE is_professor = true AND is_active = true", &stats.TotalProfessors},
		{"SELECT COUNT(*) FROM publications", &stats.TotalPublications},
		{"SELECT COUNT(*) FROM universities", &stats.TotalUniversities},
	}
	for _, q := range totals {
		if err := r.db.Pool.QueryRow(ctx, q.query).Scan(q.dest); err != nil {
			return nil, err
		}
	}

	// Grouped breakdowns; each query yields (label, count) rows.
	breakdowns := []struct {
		query string
		dest  map[string]int
	}{
		{`
SELECT COALESCE(u.state, 'unknown'), COUNT(*)
FROM university_staff s
JOIN universities u ON s.university_id = u.id
WHERE s.is_active = true
GROUP BY u.state
`, stats.ByState},
		{`
SELECT COALESCE(u.uni_type, 'unknown'), COUNT(*)
FROM university_staff s
JOIN universities u ON s.university_id = u.id
WHERE s.is_active = true
GROUP BY u.uni_type
`, stats.ByUniType},
		{`
SELECT COALESCE(position_type, 'unknown'), COUNT(*)
FROM university_staff
WHERE is_active = true
GROUP BY position_type
`, stats.ByPositionType},
	}
	for _, b := range breakdowns {
		if err := r.collectGroupCounts(ctx, b.query, b.dest); err != nil {
			return nil, err
		}
	}

	return stats, nil
}

// collectGroupCounts runs a query that yields (label, count) rows and stores
// each row in dest, returning any query, scan or iteration error.
func (r *Repository) collectGroupCounts(ctx context.Context, query string, dest map[string]int) error {
	rows, err := r.db.Pool.Query(ctx, query)
	if err != nil {
		return err
	}
	defer rows.Close()

	for rows.Next() {
		var label string
		var count int
		if err := rows.Scan(&label, &count); err != nil {
			return err
		}
		dest[label] = count
	}
	// Next returning false can mean "done" or "failed"; Err tells them apart.
	return rows.Err()
}