feat: edu-search-service migriert, voice-service/geo-service entfernt

- edu-search-service von breakpilot-pwa nach breakpilot-lehrer kopiert (ohne vendor)
- opensearch + edu-search-service in docker-compose.yml hinzugefuegt
- voice-service aus docker-compose.yml entfernt (jetzt in breakpilot-core)
- geo-service aus docker-compose.yml entfernt (nicht mehr benoetigt)
- CI/CD: edu-search-service zu Gitea Actions und Woodpecker hinzugefuegt (Go lint, test mit go mod download, build, SBOM)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-15 18:36:38 +01:00
parent d4e1d6bab6
commit 414e0f5ec0
73 changed files with 23938 additions and 92 deletions
@@ -0,0 +1,424 @@
+// Package orchestrator implements multi-phase university crawling with queue management
+package orchestrator
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"time"
+
+	"github.com/google/uuid"
+)
+
+// Audience is a saved, named set of staff filters together with a cached
+// count of the staff members matching them.
+//
+// In the PostgreSQL implementation ID, MemberCount, CreatedAt and UpdatedAt
+// are returned by the database on insert, Filters is persisted as JSON, and
+// MemberCount/LastCountUpdate are refreshed by UpdateAudienceCount.
+// DeleteAudience only clears IsActive (soft delete).
+type Audience struct {
+	ID              uuid.UUID       `json:"id"`
+	Name            string          `json:"name"`
+	Description     string          `json:"description,omitempty"`
+	Filters         AudienceFilters `json:"filters"`
+	MemberCount     int             `json:"member_count"`
+	LastCountUpdate *time.Time      `json:"last_count_update,omitempty"`
+	CreatedBy       string          `json:"created_by,omitempty"`
+	IsActive        bool            `json:"is_active"`
+	CreatedAt       time.Time       `json:"created_at"`
+	UpdatedAt       time.Time       `json:"updated_at"`
+}
+
+// AudienceFilters defines the filter criteria for an audience; buildAudienceMemberQuery ANDs every criterion that is set, and nil or empty fields impose no restriction
+type AudienceFilters struct {
+	PositionTypes []string    `json:"position_types,omitempty"` // professor, researcher, lecturer
+	SubjectAreas  []uuid.UUID `json:"subject_areas,omitempty"`  // Subject area UUIDs
+	States        []string    `json:"states,omitempty"`         // BW, BY, etc.
+	UniTypes      []string    `json:"uni_types,omitempty"`      // UNI, PH, HAW
+	Universities  []uuid.UUID `json:"universities,omitempty"`   // University UUIDs
+	HasEmail      *bool       `json:"has_email,omitempty"`      // only true filters (non-empty email required); nil and false are ignored
+	IsActive      *bool       `json:"is_active,omitempty"`      // only true filters (active staff only); nil and false are ignored
+	Keywords      []string    `json:"keywords,omitempty"`       // each keyword must match first name, last name or research areas (case-insensitive substring)
+}
+
+// AudienceExport records one export of an audience's data.
+//
+// In the PostgreSQL implementation ID and CreatedAt are returned by the
+// database when the record is inserted via CreateExport.
+type AudienceExport struct {
+	ID          uuid.UUID `json:"id"`
+	AudienceID  uuid.UUID `json:"audience_id"`
+	ExportType  string    `json:"export_type"` // csv, json, email_list
+	RecordCount int       `json:"record_count"`
+	FilePath    string    `json:"file_path,omitempty"`
+	ExportedBy  string    `json:"exported_by,omitempty"`
+	Purpose     string    `json:"purpose,omitempty"`
+	CreatedAt   time.Time `json:"created_at"`
+}
+
+// AudienceMember is one staff member row of an audience preview as produced
+// by GetAudienceMembers.
+//
+// Name is the display name assembled in SQL (optional title, first name,
+// last name). Email, Position, Department and SubjectArea are empty strings
+// when the underlying value is NULL. PublicationCount is the number of
+// staff_publications rows for the staff member.
+type AudienceMember struct {
+	ID               uuid.UUID `json:"id"`
+	Name             string    `json:"name"`
+	Email            string    `json:"email,omitempty"`
+	Position         string    `json:"position,omitempty"`
+	University       string    `json:"university"`
+	Department       string    `json:"department,omitempty"`
+	SubjectArea      string    `json:"subject_area,omitempty"`
+	PublicationCount int       `json:"publication_count"`
+}
+
+// AudienceRepository declares the audience persistence operations (audiences, their matching members, and export records); PostgresRepository provides the implementation
+type AudienceRepository interface {
+	// Audience CRUD (the PostgreSQL DeleteAudience is a soft delete that only clears is_active)
+	CreateAudience(ctx context.Context, audience *Audience) error
+	GetAudience(ctx context.Context, id uuid.UUID) (*Audience, error)
+	ListAudiences(ctx context.Context, activeOnly bool) ([]Audience, error)
+	UpdateAudience(ctx context.Context, audience *Audience) error
+	DeleteAudience(ctx context.Context, id uuid.UUID) error
+
+	// Audience members: paged evaluation of the stored filters (the int result is the total match count) and refresh of the cached member count
+	GetAudienceMembers(ctx context.Context, id uuid.UUID, limit, offset int) ([]AudienceMember, int, error)
+	UpdateAudienceCount(ctx context.Context, id uuid.UUID) (int, error)
+
+	// Exports: records tracking exports of audience data
+	CreateExport(ctx context.Context, export *AudienceExport) error
+	ListExports(ctx context.Context, audienceID uuid.UUID) ([]AudienceExport, error)
+}
+
+// ============================================================================
+// POSTGRES IMPLEMENTATION
+// ============================================================================
+
+// CreateAudience creates a new audience
+func (r *PostgresRepository) CreateAudience(ctx context.Context, audience *Audience) error {
+	filtersJSON, err := json.Marshal(audience.Filters)
+	if err != nil {
+		return fmt.Errorf("failed to marshal filters: %w", err)
+	}
+
+	query := `
+		INSERT INTO audiences (name, description, filters, created_by, is_active)
+		VALUES ($1, $2, $3, $4, $5)
+		RETURNING id, member_count, created_at, updated_at
+	`
+
+	return r.pool.QueryRow(ctx, query,
+		audience.Name,
+		audience.Description,
+		filtersJSON,
+		audience.CreatedBy,
+		audience.IsActive,
+	).Scan(&audience.ID, &audience.MemberCount, &audience.CreatedAt, &audience.UpdatedAt)
+}
+
+// GetAudience loads a single audience row by its ID and decodes the stored
+// JSON filters. A query/scan error is returned unwrapped; a filter decoding
+// error is wrapped.
+func (r *PostgresRepository) GetAudience(ctx context.Context, id uuid.UUID) (*Audience, error) {
+	const selectAudience = `
+		SELECT id, name, description, filters, member_count, last_count_update,
+		       created_by, is_active, created_at, updated_at
+		FROM audiences
+		WHERE id = $1
+	`
+
+	var (
+		result     Audience
+		rawFilters []byte
+	)
+
+	row := r.pool.QueryRow(ctx, selectAudience, id)
+	if scanErr := row.Scan(
+		&result.ID, &result.Name, &result.Description, &rawFilters,
+		&result.MemberCount, &result.LastCountUpdate,
+		&result.CreatedBy, &result.IsActive,
+		&result.CreatedAt, &result.UpdatedAt,
+	); scanErr != nil {
+		return nil, scanErr
+	}
+
+	// Filters are persisted as JSON and need a separate decoding step.
+	if decodeErr := json.Unmarshal(rawFilters, &result.Filters); decodeErr != nil {
+		return nil, fmt.Errorf("failed to unmarshal filters: %w", decodeErr)
+	}
+
+	return &result, nil
+}
+
+// ListAudiences returns the stored audiences, newest first. With activeOnly
+// set, only rows with is_active = TRUE are returned.
+func (r *PostgresRepository) ListAudiences(ctx context.Context, activeOnly bool) ([]Audience, error) {
+	stmt := `
+		SELECT id, name, description, filters, member_count, last_count_update,
+		       created_by, is_active, created_at, updated_at
+		FROM audiences
+	`
+	if activeOnly {
+		stmt += ` WHERE is_active = TRUE`
+	}
+	stmt += ` ORDER BY created_at DESC`
+
+	rows, queryErr := r.pool.Query(ctx, stmt)
+	if queryErr != nil {
+		return nil, fmt.Errorf("failed to query audiences: %w", queryErr)
+	}
+	defer rows.Close()
+
+	var result []Audience
+	for rows.Next() {
+		var (
+			entry      Audience
+			rawFilters []byte
+		)
+
+		scanErr := rows.Scan(
+			&entry.ID, &entry.Name, &entry.Description, &rawFilters,
+			&entry.MemberCount, &entry.LastCountUpdate,
+			&entry.CreatedBy, &entry.IsActive,
+			&entry.CreatedAt, &entry.UpdatedAt,
+		)
+		if scanErr != nil {
+			return nil, fmt.Errorf("failed to scan audience: %w", scanErr)
+		}
+
+		// Filters are persisted as JSON and decoded per row.
+		if decodeErr := json.Unmarshal(rawFilters, &entry.Filters); decodeErr != nil {
+			return nil, fmt.Errorf("failed to unmarshal filters: %w", decodeErr)
+		}
+
+		result = append(result, entry)
+	}
+
+	// An iteration error is reported together with the rows read so far.
+	return result, rows.Err()
+}
+
+// UpdateAudience updates an existing audience
+func (r *PostgresRepository) UpdateAudience(ctx context.Context, audience *Audience) error {
+	filtersJSON, err := json.Marshal(audience.Filters)
+	if err != nil {
+		return fmt.Errorf("failed to marshal filters: %w", err)
+	}
+
+	query := `
+		UPDATE audiences
+		SET name = $2, description = $3, filters = $4, is_active = $5, updated_at = NOW()
+		WHERE id = $1
+		RETURNING updated_at
+	`
+
+	return r.pool.QueryRow(ctx, query,
+		audience.ID,
+		audience.Name,
+		audience.Description,
+		filtersJSON,
+		audience.IsActive,
+	).Scan(&audience.UpdatedAt)
+}
+
+// DeleteAudience soft-deletes an audience (sets is_active = false)
+func (r *PostgresRepository) DeleteAudience(ctx context.Context, id uuid.UUID) error {
+	query := `UPDATE audiences SET is_active = FALSE, updated_at = NOW() WHERE id = $1`
+	_, err := r.pool.Exec(ctx, query, id)
+	return err
+}
+
+// GetAudienceMembers evaluates the stored filters of the given audience and
+// returns one page of matching staff members plus the total number of
+// matches. limit and offset are only applied when greater than zero.
+func (r *PostgresRepository) GetAudienceMembers(ctx context.Context, id uuid.UUID, limit, offset int) ([]AudienceMember, int, error) {
+	// The filters live on the audience row, so load that first.
+	stored, err := r.GetAudience(ctx, id)
+	if err != nil {
+		return nil, 0, fmt.Errorf("failed to get audience: %w", err)
+	}
+
+	// Same filters, two statements: the page itself and the total count.
+	pageSQL, pageArgs := r.buildAudienceMemberQuery(stored.Filters, limit, offset, false)
+	countSQL, countArgs := r.buildAudienceMemberQuery(stored.Filters, 0, 0, true)
+
+	var total int
+	countRow := r.pool.QueryRow(ctx, countSQL, countArgs...)
+	if err = countRow.Scan(&total); err != nil {
+		return nil, 0, fmt.Errorf("failed to count members: %w", err)
+	}
+
+	rows, err := r.pool.Query(ctx, pageSQL, pageArgs...)
+	if err != nil {
+		return nil, 0, fmt.Errorf("failed to query members: %w", err)
+	}
+	defer rows.Close()
+
+	var page []AudienceMember
+	for rows.Next() {
+		member := AudienceMember{}
+		scanErr := rows.Scan(
+			&member.ID, &member.Name, &member.Email, &member.Position,
+			&member.University, &member.Department, &member.SubjectArea, &member.PublicationCount,
+		)
+		if scanErr != nil {
+			return nil, 0, fmt.Errorf("failed to scan member: %w", scanErr)
+		}
+		page = append(page, member)
+	}
+
+	return page, total, rows.Err()
+}
+
+// buildAudienceMemberQuery builds the SQL text and the positional arguments
+// for selecting (or, with countOnly, counting) the staff members that match
+// the given filters.
+//
+// Every filter that is set adds one AND condition; empty slices and nil
+// pointers add nothing. HasEmail and IsActive only take effect when they
+// point to true. Each keyword must match the first name, last name or
+// research areas (ILIKE substring match). For the row query the result is
+// ordered by last name, first name and id; limit and offset are appended
+// only when greater than zero. They are ignored for the count query.
+//
+// All user-controlled values are passed as bind parameters, never
+// interpolated into the SQL text.
+func (r *PostgresRepository) buildAudienceMemberQuery(filters AudienceFilters, limit, offset int, countOnly bool) (string, []interface{}) {
+	var args []interface{}
+
+	// bind registers a query argument and returns its placeholder number,
+	// so placeholder numbering can never drift from the argument list.
+	bind := func(value interface{}) int {
+		args = append(args, value)
+		return len(args)
+	}
+
+	selectClause := "SELECT COUNT(*)"
+	if !countOnly {
+		selectClause = `
+			SELECT
+				s.id,
+				COALESCE(s.title || ' ', '') || s.first_name || ' ' || s.last_name as name,
+				COALESCE(s.email, '') as email,
+				COALESCE(s.position_type, '') as position,
+				u.name as university,
+				COALESCE(d.name, '') as department,
+				COALESCE(sa.name, '') as subject_area,
+				(SELECT COUNT(*) FROM staff_publications sp WHERE sp.staff_id = s.id) as publication_count
+		`
+	}
+
+	query := selectClause + `
+		FROM university_staff s
+		JOIN universities u ON s.university_id = u.id
+		LEFT JOIN departments d ON s.department_id = d.id
+		LEFT JOIN subject_areas sa ON s.subject_area_id = sa.id
+		WHERE 1=1
+	`
+
+	// Position types filter
+	if len(filters.PositionTypes) > 0 {
+		query += fmt.Sprintf(" AND s.position_type = ANY($%d)", bind(filters.PositionTypes))
+	}
+
+	// Subject areas filter
+	if len(filters.SubjectAreas) > 0 {
+		query += fmt.Sprintf(" AND s.subject_area_id = ANY($%d)", bind(filters.SubjectAreas))
+	}
+
+	// States filter
+	if len(filters.States) > 0 {
+		query += fmt.Sprintf(" AND u.state = ANY($%d)", bind(filters.States))
+	}
+
+	// Uni types filter
+	if len(filters.UniTypes) > 0 {
+		query += fmt.Sprintf(" AND u.uni_type = ANY($%d)", bind(filters.UniTypes))
+	}
+
+	// Universities filter
+	if len(filters.Universities) > 0 {
+		query += fmt.Sprintf(" AND s.university_id = ANY($%d)", bind(filters.Universities))
+	}
+
+	// Has email filter.
+	// NOTE(review): an explicit false is treated like "not set"; confirm
+	// whether "staff without email" should be selectable.
+	if filters.HasEmail != nil && *filters.HasEmail {
+		query += " AND s.email IS NOT NULL AND s.email != ''"
+	}
+
+	// Is active filter (same note as above applies to an explicit false).
+	if filters.IsActive != nil && *filters.IsActive {
+		query += " AND s.is_active = TRUE"
+	}
+
+	// Keywords filter (search in name and research_areas). The same bind
+	// parameter is referenced three times per keyword.
+	// NOTE(review): '%' and '_' inside a keyword act as LIKE wildcards.
+	for _, keyword := range filters.Keywords {
+		n := bind("%" + keyword + "%")
+		query += fmt.Sprintf(" AND (s.first_name ILIKE $%d OR s.last_name ILIKE $%d OR s.research_areas ILIKE $%d)", n, n, n)
+	}
+
+	if !countOnly {
+		// s.id is the final tiebreaker: without a total order, LIMIT/OFFSET
+		// paging may skip or repeat staff members that share the same name.
+		query += " ORDER BY s.last_name, s.first_name, s.id"
+
+		if limit > 0 {
+			query += fmt.Sprintf(" LIMIT $%d", bind(limit))
+		}
+
+		if offset > 0 {
+			query += fmt.Sprintf(" OFFSET $%d", bind(offset))
+		}
+	}
+
+	return query, args
+}
+
+// UpdateAudienceCount recomputes how many staff members currently match the
+// audience's filters, stores that number (together with the refresh
+// timestamps) on the audience row and returns it.
+func (r *PostgresRepository) UpdateAudienceCount(ctx context.Context, id uuid.UUID) (int, error) {
+	const storeCount = `
+		UPDATE audiences
+		SET member_count = $2, last_count_update = NOW(), updated_at = NOW()
+		WHERE id = $1
+	`
+
+	// The filters live on the audience row.
+	stored, err := r.GetAudience(ctx, id)
+	if err != nil {
+		return 0, fmt.Errorf("failed to get audience: %w", err)
+	}
+
+	// Count the matching members with the count-only variant of the query.
+	var matched int
+	countSQL, countArgs := r.buildAudienceMemberQuery(stored.Filters, 0, 0, true)
+	if err = r.pool.QueryRow(ctx, countSQL, countArgs...).Scan(&matched); err != nil {
+		return 0, fmt.Errorf("failed to count members: %w", err)
+	}
+
+	// Persist the freshly computed count.
+	if _, err = r.pool.Exec(ctx, storeCount, id, matched); err != nil {
+		return 0, fmt.Errorf("failed to update count: %w", err)
+	}
+
+	return matched, nil
+}
+
+// CreateExport creates a new export record
+func (r *PostgresRepository) CreateExport(ctx context.Context, export *AudienceExport) error {
+	query := `
+		INSERT INTO audience_exports (audience_id, export_type, record_count, file_path, exported_by, purpose)
+		VALUES ($1, $2, $3, $4, $5, $6)
+		RETURNING id, created_at
+	`
+
+	return r.pool.QueryRow(ctx, query,
+		export.AudienceID,
+		export.ExportType,
+		export.RecordCount,
+		export.FilePath,
+		export.ExportedBy,
+		export.Purpose,
+	).Scan(&export.ID, &export.CreatedAt)
+}
+
+// ListExports returns the export records of one audience, newest first.
+func (r *PostgresRepository) ListExports(ctx context.Context, audienceID uuid.UUID) ([]AudienceExport, error) {
+	const selectExports = `
+		SELECT id, audience_id, export_type, record_count, file_path, exported_by, purpose, created_at
+		FROM audience_exports
+		WHERE audience_id = $1
+		ORDER BY created_at DESC
+	`
+
+	rows, queryErr := r.pool.Query(ctx, selectExports, audienceID)
+	if queryErr != nil {
+		return nil, fmt.Errorf("failed to query exports: %w", queryErr)
+	}
+	defer rows.Close()
+
+	var result []AudienceExport
+	for rows.Next() {
+		record := AudienceExport{}
+		scanErr := rows.Scan(
+			&record.ID, &record.AudienceID, &record.ExportType, &record.RecordCount,
+			&record.FilePath, &record.ExportedBy, &record.Purpose, &record.CreatedAt,
+		)
+		if scanErr != nil {
+			return nil, fmt.Errorf("failed to scan export: %w", scanErr)
+		}
+		result = append(result, record)
+	}
+
+	// An iteration error is reported together with the rows read so far.
+	return result, rows.Err()
+}
@@ -0,0 +1,407 @@
+// Package orchestrator implements multi-phase university crawling with queue management
+package orchestrator
+
+import (
+	"context"
+	"fmt"
+	"log"
+	"sync"
+	"time"
+
+	"github.com/google/uuid"
+)
+
+// CrawlPhase names a state of a university's crawl: the working phases run in the order discovery, professors, all_staff, publications
+type CrawlPhase string
+
+const (
+	PhasePending      CrawlPhase = "pending"      // Queued; the orchestrator starts the discovery phase next
+	PhaseDiscovery    CrawlPhase = "discovery"    // Find sample professor to validate crawling works
+	PhaseProfessors   CrawlPhase = "professors"   // Crawl all professors
+	PhaseAllStaff     CrawlPhase = "all_staff"    // Crawl all staff members
+	PhasePublications CrawlPhase = "publications" // Crawl publications for all staff
+	PhaseCompleted    CrawlPhase = "completed"    // All phases done; set by completeUniversity
+	PhaseFailed       CrawlPhase = "failed"       // Set on the item once its retry count reaches MaxRetries
+	PhasePaused       CrawlPhase = "paused"       // Skipped by the orchestrator loop
+)
+
+// CrawlQueueItem is one university's entry in the crawl queue together with
+// its per-phase bookkeeping.
+//
+// For each of the four working phases there is a completed flag, a
+// completion timestamp and an item count. QueuePosition is nil once the
+// university has been completed (see completeUniversity). The GetQueueItems
+// query selects UniversityName/UniversityShort from the joined universities
+// table and derives ProgressPercent from the current phase.
+type CrawlQueueItem struct {
+	ID                      uuid.UUID  `json:"id"`
+	UniversityID            uuid.UUID  `json:"university_id"`
+	UniversityName          string     `json:"university_name"`
+	UniversityShort         string     `json:"university_short"`
+	QueuePosition           *int       `json:"queue_position"`
+	Priority                int        `json:"priority"`
+	CurrentPhase            CrawlPhase `json:"current_phase"`
+	DiscoveryCompleted      bool       `json:"discovery_completed"`
+	DiscoveryCompletedAt    *time.Time `json:"discovery_completed_at,omitempty"`
+	ProfessorsCompleted     bool       `json:"professors_completed"`
+	ProfessorsCompletedAt   *time.Time `json:"professors_completed_at,omitempty"`
+	AllStaffCompleted       bool       `json:"all_staff_completed"`
+	AllStaffCompletedAt     *time.Time `json:"all_staff_completed_at,omitempty"`
+	PublicationsCompleted   bool       `json:"publications_completed"`
+	PublicationsCompletedAt *time.Time `json:"publications_completed_at,omitempty"`
+	DiscoveryCount          int        `json:"discovery_count"`
+	ProfessorsCount         int        `json:"professors_count"`
+	StaffCount              int        `json:"staff_count"`
+	PublicationsCount       int        `json:"publications_count"`
+	RetryCount              int        `json:"retry_count"`
+	MaxRetries              int        `json:"max_retries"`
+	LastError               string     `json:"last_error,omitempty"`
+	StartedAt               *time.Time `json:"started_at,omitempty"`
+	CompletedAt             *time.Time `json:"completed_at,omitempty"`
+	ProgressPercent         int        `json:"progress_percent"`
+	CreatedAt               time.Time  `json:"created_at"`
+	UpdatedAt               time.Time  `json:"updated_at"`
+}
+
+// CrawlProgress is the result a crawler reports for a single phase run
+type CrawlProgress struct {
+	Phase          CrawlPhase `json:"phase"`
+	ItemsFound     int        `json:"items_found"` // recorded by runPhase as the phase's count
+	ItemsProcessed int        `json:"items_processed"`
+	Errors         []string   `json:"errors,omitempty"`
+	StartedAt      time.Time  `json:"started_at"`
+	CompletedAt    *time.Time `json:"completed_at,omitempty"`
+}
+
+// OrchestratorStatus is the snapshot returned by Orchestrator.Status.
+//
+// IsRunning, CurrentUniversity, CurrentPhase and LastActivity reflect the
+// orchestrator's in-memory state; CurrentPhase is PhasePending while no
+// university is being processed. QueueLength, CompletedToday and
+// TotalProcessed are read from the repository.
+type OrchestratorStatus struct {
+	IsRunning         bool            `json:"is_running"`
+	CurrentUniversity *CrawlQueueItem `json:"current_university,omitempty"`
+	CurrentPhase      CrawlPhase      `json:"current_phase"`
+	QueueLength       int             `json:"queue_length"`
+	CompletedToday    int             `json:"completed_today"`
+	TotalProcessed    int             `json:"total_processed"`
+	LastActivity      *time.Time      `json:"last_activity,omitempty"`
+}
+
+// StaffCrawlerInterface defines what the staff crawler must implement; the orchestrator calls one method per staff-related phase
+type StaffCrawlerInterface interface {
+	// DiscoverSampleProfessor finds at least one professor to validate crawling works (discovery phase)
+	DiscoverSampleProfessor(ctx context.Context, universityID uuid.UUID) (*CrawlProgress, error)
+	// CrawlProfessors crawls all professors at a university (professors phase)
+	CrawlProfessors(ctx context.Context, universityID uuid.UUID) (*CrawlProgress, error)
+	// CrawlAllStaff crawls all staff members at a university (all_staff phase)
+	CrawlAllStaff(ctx context.Context, universityID uuid.UUID) (*CrawlProgress, error)
+}
+
+// PublicationCrawlerInterface defines what the publication crawler must implement; the orchestrator calls it for the publications phase
+type PublicationCrawlerInterface interface {
+	// CrawlPublicationsForUniversity crawls publications for all staff at a university; a nil progress is recorded as a count of 0
+	CrawlPublicationsForUniversity(ctx context.Context, universityID uuid.UUID) (*CrawlProgress, error)
+}
+
+// Repository defines the database operations the orchestrator needs: queue management, phase bookkeeping and statistics
+type Repository interface {
+	// Queue operations (the orchestrator treats a nil item with a nil error from GetNextInQueue as "nothing to process")
+	GetQueueItems(ctx context.Context) ([]CrawlQueueItem, error)
+	GetNextInQueue(ctx context.Context) (*CrawlQueueItem, error)
+	AddToQueue(ctx context.Context, universityID uuid.UUID, priority int, initiatedBy string) (*CrawlQueueItem, error)
+	RemoveFromQueue(ctx context.Context, universityID uuid.UUID) error
+	UpdateQueueItem(ctx context.Context, item *CrawlQueueItem) error
+	PauseQueueItem(ctx context.Context, universityID uuid.UUID) error
+	ResumeQueueItem(ctx context.Context, universityID uuid.UUID) error
+
+	// Phase updates (CompletePhase receives the number of items found, FailPhase the error text)
+	CompletePhase(ctx context.Context, universityID uuid.UUID, phase CrawlPhase, count int) error
+	FailPhase(ctx context.Context, universityID uuid.UUID, phase CrawlPhase, err string) error
+
+	// Stats (reported by Orchestrator.Status as CompletedToday and TotalProcessed)
+	GetCompletedTodayCount(ctx context.Context) (int, error)
+	GetTotalProcessedCount(ctx context.Context) (int, error)
+}
+
+// Orchestrator manages the multi-phase crawl process: a background loop
+// takes the next university from the repository's queue and runs one crawl
+// phase for it at a time.
+//
+// The runtime state fields are guarded by mu. The struct contains a mutex
+// and must therefore not be copied; use the pointer returned by
+// NewOrchestrator.
+type Orchestrator struct {
+	repo         Repository
+	staffCrawler StaffCrawlerInterface
+	pubCrawler   PublicationCrawlerInterface
+
+	// Runtime state
+	mu           sync.RWMutex
+	isRunning    bool
+	stopChan     chan struct{}
+	currentItem  *CrawlQueueItem
+	lastActivity time.Time
+
+	// Configuration
+	phaseCooldown time.Duration // Wait time between phases
+	retryCooldown time.Duration // Wait time after failure before retry
+	maxConcurrent int           // Max concurrent crawls (always 1 for now)
+}
+
+// NewOrchestrator wires the repository and the two crawlers into a new,
+// not yet started Orchestrator using the default timing configuration.
+func NewOrchestrator(repo Repository, staffCrawler StaffCrawlerInterface, pubCrawler PublicationCrawlerInterface) *Orchestrator {
+	o := new(Orchestrator)
+
+	// Collaborators
+	o.repo = repo
+	o.staffCrawler = staffCrawler
+	o.pubCrawler = pubCrawler
+
+	// Defaults
+	o.phaseCooldown = 5 * time.Second  // Small pause between phases
+	o.retryCooldown = 30 * time.Second // Wait before retry after failure
+	o.maxConcurrent = 1                // Sequential processing
+
+	return o
+}
+
+// Start launches the orchestration loop in a background goroutine.
+// It returns an error if the orchestrator is already running.
+func (o *Orchestrator) Start() error {
+	o.mu.Lock()
+	alreadyRunning := o.isRunning
+	if !alreadyRunning {
+		// Every run gets a fresh stop channel.
+		o.stopChan = make(chan struct{})
+		o.isRunning = true
+	}
+	o.mu.Unlock()
+
+	if alreadyRunning {
+		return fmt.Errorf("orchestrator already running")
+	}
+
+	log.Println("[Orchestrator] Starting crawl orchestration loop")
+	go o.runLoop()
+
+	return nil
+}
+
+// Stop signals the orchestration loop to exit by closing the stop channel.
+// It does not wait for the loop goroutine to finish. It returns an error if
+// the orchestrator is not running.
+func (o *Orchestrator) Stop() error {
+	o.mu.Lock()
+	wasRunning := o.isRunning
+	if wasRunning {
+		close(o.stopChan)
+		o.isRunning = false
+	}
+	o.mu.Unlock()
+
+	if !wasRunning {
+		return fmt.Errorf("orchestrator not running")
+	}
+
+	log.Println("[Orchestrator] Stopped")
+	return nil
+}
+
+// Status returns a snapshot of the orchestrator: whether the loop is
+// running, the university currently being processed (if any) with its
+// phase, the time of the last activity, and queue statistics read from the
+// repository.
+//
+// The in-memory state is copied while holding the read lock and the lock is
+// released before any repository call, so slow database queries never block
+// Start, Stop or the processing loop. The returned status holds copies, not
+// pointers into the orchestrator's internal state.
+//
+// Repository errors are deliberately not reported: the statistics are
+// best-effort and the returned error is always nil.
+func (o *Orchestrator) Status(ctx context.Context) (*OrchestratorStatus, error) {
+	status := &OrchestratorStatus{
+		CurrentPhase: PhasePending,
+	}
+
+	o.mu.RLock()
+	status.IsRunning = o.isRunning
+	if o.currentItem != nil {
+		// Copy so callers cannot observe later changes made to the item
+		// that is still being processed.
+		current := *o.currentItem
+		status.CurrentUniversity = &current
+		status.CurrentPhase = current.CurrentPhase
+	}
+	if !o.lastActivity.IsZero() {
+		last := o.lastActivity
+		status.LastActivity = &last
+	}
+	o.mu.RUnlock()
+
+	// Get queue stats from DB (best-effort: on error the zero value stays).
+	if items, err := o.repo.GetQueueItems(ctx); err == nil {
+		status.QueueLength = len(items)
+	}
+
+	// Errors are ignored on purpose; whatever count the repository returned
+	// alongside the error is reported.
+	completedToday, _ := o.repo.GetCompletedTodayCount(ctx)
+	status.CompletedToday = completedToday
+
+	totalProcessed, _ := o.repo.GetTotalProcessedCount(ctx)
+	status.TotalProcessed = totalProcessed
+
+	return status, nil
+}
+
+// AddUniversity enqueues a university for crawling with the given priority
+// and returns the created queue item. initiatedBy is passed through to the
+// repository.
+func (o *Orchestrator) AddUniversity(ctx context.Context, universityID uuid.UUID, priority int, initiatedBy string) (*CrawlQueueItem, error) {
+	queued, err := o.repo.AddToQueue(ctx, universityID, priority, initiatedBy)
+	if err == nil {
+		log.Printf("[Orchestrator] Added university %s to queue with priority %d", universityID, priority)
+		return queued, nil
+	}
+
+	return nil, fmt.Errorf("failed to add to queue: %w", err)
+}
+
+// RemoveUniversity takes a university out of the crawl queue by delegating
+// to the repository; the repository's error is returned unchanged.
+func (o *Orchestrator) RemoveUniversity(ctx context.Context, universityID uuid.UUID) error {
+	err := o.repo.RemoveFromQueue(ctx, universityID)
+	return err
+}
+
+// PauseUniversity pauses the queue item of a university by delegating to
+// the repository; the repository's error is returned unchanged.
+func (o *Orchestrator) PauseUniversity(ctx context.Context, universityID uuid.UUID) error {
+	err := o.repo.PauseQueueItem(ctx, universityID)
+	return err
+}
+
+// ResumeUniversity resumes a paused queue item by delegating to the
+// repository; the repository's error is returned unchanged.
+func (o *Orchestrator) ResumeUniversity(ctx context.Context, universityID uuid.UUID) error {
+	err := o.repo.ResumeQueueItem(ctx, universityID)
+	return err
+}
+
+// GetQueue returns every queue item known to the repository; result and
+// error are passed through unchanged.
+func (o *Orchestrator) GetQueue(ctx context.Context) ([]CrawlQueueItem, error) {
+	items, err := o.repo.GetQueueItems(ctx)
+	return items, err
+}
+
+// runLoop is the main orchestration loop: every 10 seconds it processes the
+// next university in the queue, until the stop channel of this run is
+// closed.
+//
+// The stop channel is read once, under the lock, instead of re-reading
+// o.stopChan on every iteration. Start replaces that field for each new
+// run, so re-reading it would be a data race and could let a loop from an
+// earlier run keep going on the next run's channel.
+func (o *Orchestrator) runLoop() {
+	o.mu.RLock()
+	stop := o.stopChan
+	o.mu.RUnlock()
+
+	ticker := time.NewTicker(10 * time.Second) // Check queue every 10 seconds
+	defer ticker.Stop()
+
+	for {
+		select {
+		case <-stop:
+			return
+		case <-ticker.C:
+			// select chooses randomly when both cases are ready, so give
+			// a pending stop request priority over starting new work.
+			select {
+			case <-stop:
+				return
+			default:
+			}
+			o.processNextInQueue()
+		}
+	}
+}
+
+// processNextInQueue fetches the next queue item and advances it by one
+// step: it runs the first phase that is not yet completed or, once the
+// publications phase is done, marks the university as completed. Paused
+// items and an empty queue are skipped; items in any other phase (for
+// example failed or completed) are left untouched.
+func (o *Orchestrator) processNextInQueue() {
+	ctx := context.Background()
+
+	next, err := o.repo.GetNextInQueue(ctx)
+	if err != nil {
+		log.Printf("[Orchestrator] Error getting next item: %v", err)
+		return
+	}
+
+	// Nothing queued, or the head of the queue is paused.
+	if next == nil || next.CurrentPhase == PhasePaused {
+		return
+	}
+
+	// Publish the item as "current" for the duration of this call.
+	o.mu.Lock()
+	o.currentItem = next
+	o.lastActivity = time.Now()
+	o.mu.Unlock()
+
+	defer func() {
+		o.mu.Lock()
+		o.currentItem = nil
+		o.mu.Unlock()
+	}()
+
+	log.Printf("[Orchestrator] Processing university: %s (Phase: %s)", next.UniversityName, next.CurrentPhase)
+
+	// A phase whose completed flag is set hands over to its successor;
+	// otherwise the same phase is (re-)run.
+	switch next.CurrentPhase {
+	case PhasePending:
+		o.runPhase(ctx, next, PhaseDiscovery)
+
+	case PhaseDiscovery:
+		target := PhaseDiscovery
+		if next.DiscoveryCompleted {
+			target = PhaseProfessors
+		}
+		o.runPhase(ctx, next, target)
+
+	case PhaseProfessors:
+		target := PhaseProfessors
+		if next.ProfessorsCompleted {
+			target = PhaseAllStaff
+		}
+		o.runPhase(ctx, next, target)
+
+	case PhaseAllStaff:
+		target := PhaseAllStaff
+		if next.AllStaffCompleted {
+			target = PhasePublications
+		}
+		o.runPhase(ctx, next, target)
+
+	case PhasePublications:
+		if !next.PublicationsCompleted {
+			o.runPhase(ctx, next, PhasePublications)
+			break
+		}
+		o.completeUniversity(ctx, next)
+	}
+}
+
+// runPhase executes one crawl phase for the given queue item and records
+// the outcome.
+//
+// Only the four working phases (discovery, professors, all_staff,
+// publications) are accepted; any other value is rejected before anything
+// is persisted. The item's current phase is stored first, then the matching
+// crawler is called. A crawler error is handed to handlePhaseFailure. On
+// success the phase is marked complete with the number of items found
+// (0 when the crawler returned no progress), followed by the phase
+// cooldown. The "completed" message is only logged when the completion was
+// actually persisted.
+func (o *Orchestrator) runPhase(ctx context.Context, item *CrawlQueueItem, phase CrawlPhase) {
+	// Reject phases that have no crawler; otherwise they would be persisted
+	// and then marked complete without any work having been done.
+	switch phase {
+	case PhaseDiscovery, PhaseProfessors, PhaseAllStaff, PhasePublications:
+	default:
+		log.Printf("[Orchestrator] Unsupported crawl phase %s for %s", phase, item.UniversityName)
+		return
+	}
+
+	log.Printf("[Orchestrator] Running phase %s for %s", phase, item.UniversityName)
+
+	// Update current phase
+	item.CurrentPhase = phase
+	if err := o.repo.UpdateQueueItem(ctx, item); err != nil {
+		log.Printf("[Orchestrator] Failed to update phase: %v", err)
+		return
+	}
+
+	var progress *CrawlProgress
+	var err error
+
+	// Execute phase
+	switch phase {
+	case PhaseDiscovery:
+		progress, err = o.staffCrawler.DiscoverSampleProfessor(ctx, item.UniversityID)
+	case PhaseProfessors:
+		progress, err = o.staffCrawler.CrawlProfessors(ctx, item.UniversityID)
+	case PhaseAllStaff:
+		progress, err = o.staffCrawler.CrawlAllStaff(ctx, item.UniversityID)
+	case PhasePublications:
+		progress, err = o.pubCrawler.CrawlPublicationsForUniversity(ctx, item.UniversityID)
+	}
+
+	// Handle result
+	if err != nil {
+		log.Printf("[Orchestrator] Phase %s failed: %v", phase, err)
+		o.handlePhaseFailure(ctx, item, phase, err)
+		return
+	}
+
+	// Mark phase complete
+	count := 0
+	if progress != nil {
+		count = progress.ItemsFound
+	}
+
+	if err := o.repo.CompletePhase(ctx, item.UniversityID, phase, count); err != nil {
+		// The completion was not stored, so do not claim success in the log.
+		log.Printf("[Orchestrator] Failed to complete phase: %v", err)
+	} else {
+		log.Printf("[Orchestrator] Phase %s completed for %s (found: %d)", phase, item.UniversityName, count)
+	}
+
+	// Wait before next phase
+	time.Sleep(o.phaseCooldown)
+}
+
+// handlePhaseFailure records a failed phase: it bumps the in-memory retry
+// counter and last error of the item, switches the item to the failed phase
+// once the retry limit is reached, reports the failure to the repository
+// and then blocks for the retry cooldown.
+func (o *Orchestrator) handlePhaseFailure(ctx context.Context, item *CrawlQueueItem, phase CrawlPhase, err error) {
+	reason := err.Error()
+
+	item.LastError = reason
+	item.RetryCount++
+
+	// Max retries reached, mark as failed
+	if exhausted := item.RetryCount >= item.MaxRetries; exhausted {
+		item.CurrentPhase = PhaseFailed
+		log.Printf("[Orchestrator] University %s failed after %d retries", item.UniversityName, item.RetryCount)
+	}
+
+	updateErr := o.repo.FailPhase(ctx, item.UniversityID, phase, reason)
+	if updateErr != nil {
+		log.Printf("[Orchestrator] Failed to update failure status: %v", updateErr)
+	}
+
+	// Wait before potential retry
+	time.Sleep(o.retryCooldown)
+}
+
+// completeUniversity finalizes a fully crawled university: the item is set
+// to the completed phase with a completion timestamp, its queue position is
+// cleared, and the change is persisted. The summary is only logged when the
+// update succeeded.
+func (o *Orchestrator) completeUniversity(ctx context.Context, item *CrawlQueueItem) {
+	finishedAt := time.Now()
+
+	item.QueuePosition = nil // Remove from active queue
+	item.CompletedAt = &finishedAt
+	item.CurrentPhase = PhaseCompleted
+
+	err := o.repo.UpdateQueueItem(ctx, item)
+	if err != nil {
+		log.Printf("[Orchestrator] Failed to complete university: %v", err)
+		return
+	}
+
+	log.Printf(
+		"[Orchestrator] University %s completed! Professors: %d, Staff: %d, Publications: %d",
+		item.UniversityName, item.ProfessorsCount, item.StaffCount, item.PublicationsCount,
+	)
+}
@@ -0,0 +1,316 @@
+// Package orchestrator implements multi-phase university crawling with queue management
+package orchestrator
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"time"
+
+	"github.com/google/uuid"
+	"github.com/jackc/pgx/v5"
+	"github.com/jackc/pgx/v5/pgxpool"
+)
+
+// PostgresRepository implements the Repository interface using PostgreSQL.
+// The methods in this file run all of their statements through the wrapped
+// pgx connection pool.
+type PostgresRepository struct {
+	pool *pgxpool.Pool // connection pool used for every query and statement
+}
+
+// NewPostgresRepository wraps the given connection pool in a
+// PostgresRepository.
+func NewPostgresRepository(pool *pgxpool.Pool) *PostgresRepository {
+	repo := new(PostgresRepository)
+	repo.pool = pool
+	return repo
+}
+
+// ============================================================================
+// QUEUE OPERATIONS
+// ============================================================================
+
+// GetQueueItems retrieves all items in the crawl queue, joined with the name
+// and short name of their university.
+//
+// Rows are ordered by queue position (items without a position last) and then
+// by descending priority. ProgressPercent is derived in SQL from the current
+// phase. On a query, scan or iteration error the returned slice is nil and the
+// error is wrapped with context.
+func (r *PostgresRepository) GetQueueItems(ctx context.Context) ([]CrawlQueueItem, error) {
+	query := `
+		SELECT
+			cq.id, cq.university_id, u.name, COALESCE(u.short_name, ''),
+			cq.queue_position, cq.priority, cq.current_phase,
+			cq.discovery_completed, cq.discovery_completed_at,
+			cq.professors_completed, cq.professors_completed_at,
+			cq.all_staff_completed, cq.all_staff_completed_at,
+			cq.publications_completed, cq.publications_completed_at,
+			cq.discovery_count, cq.professors_count, cq.staff_count, cq.publications_count,
+			cq.retry_count, cq.max_retries, COALESCE(cq.last_error, ''),
+			cq.started_at, cq.completed_at,
+			CASE
+				WHEN cq.current_phase = 'pending' THEN 0
+				WHEN cq.current_phase = 'discovery' THEN 10
+				WHEN cq.current_phase = 'professors' THEN 30
+				WHEN cq.current_phase = 'all_staff' THEN 60
+				WHEN cq.current_phase = 'publications' THEN 90
+				WHEN cq.current_phase = 'completed' THEN 100
+				ELSE 0
+			END as progress_percent,
+			cq.created_at, cq.updated_at
+		FROM crawl_queue cq
+		JOIN universities u ON cq.university_id = u.id
+		ORDER BY cq.queue_position NULLS LAST, cq.priority DESC
+	`
+
+	rows, err := r.pool.Query(ctx, query)
+	if err != nil {
+		return nil, fmt.Errorf("failed to query queue items: %w", err)
+	}
+	defer rows.Close()
+
+	var items []CrawlQueueItem
+	for rows.Next() {
+		var item CrawlQueueItem
+		// The phase column is scanned as a plain string and converted below.
+		var phase string
+		if err := rows.Scan(
+			&item.ID, &item.UniversityID, &item.UniversityName, &item.UniversityShort,
+			&item.QueuePosition, &item.Priority, &phase,
+			&item.DiscoveryCompleted, &item.DiscoveryCompletedAt,
+			&item.ProfessorsCompleted, &item.ProfessorsCompletedAt,
+			&item.AllStaffCompleted, &item.AllStaffCompletedAt,
+			&item.PublicationsCompleted, &item.PublicationsCompletedAt,
+			&item.DiscoveryCount, &item.ProfessorsCount, &item.StaffCount, &item.PublicationsCount,
+			&item.RetryCount, &item.MaxRetries, &item.LastError,
+			&item.StartedAt, &item.CompletedAt,
+			&item.ProgressPercent,
+			&item.CreatedAt, &item.UpdatedAt,
+		); err != nil {
+			return nil, fmt.Errorf("failed to scan queue item: %w", err)
+		}
+		item.CurrentPhase = CrawlPhase(phase)
+		items = append(items, item)
+	}
+
+	// An error that ended iteration early would otherwise hand the caller a
+	// truncated list next to a bare, context-free error.
+	if err := rows.Err(); err != nil {
+		return nil, fmt.Errorf("failed to iterate queue items: %w", err)
+	}
+
+	return items, nil
+}
+
+// GetNextInQueue retrieves the next item to process.
+//
+// Only items that still have a queue position and whose phase is not
+// 'completed', 'failed' or 'paused' are considered; the one with the lowest
+// queue position (ties broken by highest priority) is returned. When no such
+// item exists it returns (nil, nil). ProgressPercent is not selected by this
+// query and is left at its zero value.
+func (r *PostgresRepository) GetNextInQueue(ctx context.Context) (*CrawlQueueItem, error) {
+	query := `
+		SELECT
+			cq.id, cq.university_id, u.name, COALESCE(u.short_name, ''),
+			cq.queue_position, cq.priority, cq.current_phase,
+			cq.discovery_completed, cq.discovery_completed_at,
+			cq.professors_completed, cq.professors_completed_at,
+			cq.all_staff_completed, cq.all_staff_completed_at,
+			cq.publications_completed, cq.publications_completed_at,
+			cq.discovery_count, cq.professors_count, cq.staff_count, cq.publications_count,
+			cq.retry_count, cq.max_retries, COALESCE(cq.last_error, ''),
+			cq.started_at, cq.completed_at,
+			cq.created_at, cq.updated_at
+		FROM crawl_queue cq
+		JOIN universities u ON cq.university_id = u.id
+		WHERE cq.current_phase NOT IN ('completed', 'failed', 'paused')
+			AND cq.queue_position IS NOT NULL
+		ORDER BY cq.queue_position ASC, cq.priority DESC
+		LIMIT 1
+	`
+
+	var item CrawlQueueItem
+	var phase string
+	err := r.pool.QueryRow(ctx, query).Scan(
+		&item.ID, &item.UniversityID, &item.UniversityName, &item.UniversityShort,
+		&item.QueuePosition, &item.Priority, &phase,
+		&item.DiscoveryCompleted, &item.DiscoveryCompletedAt,
+		&item.ProfessorsCompleted, &item.ProfessorsCompletedAt,
+		&item.AllStaffCompleted, &item.AllStaffCompletedAt,
+		&item.PublicationsCompleted, &item.PublicationsCompletedAt,
+		&item.DiscoveryCount, &item.ProfessorsCount, &item.StaffCount, &item.PublicationsCount,
+		&item.RetryCount, &item.MaxRetries, &item.LastError,
+		&item.StartedAt, &item.CompletedAt,
+		&item.CreatedAt, &item.UpdatedAt,
+	)
+
+	// errors.Is also matches a wrapped ErrNoRows, so an empty queue is never
+	// misreported as a failure.
+	if errors.Is(err, pgx.ErrNoRows) {
+		return nil, nil
+	}
+	if err != nil {
+		return nil, fmt.Errorf("failed to get next queue item: %w", err)
+	}
+
+	item.CurrentPhase = CrawlPhase(phase)
+	return &item, nil
+}
+
+// AddToQueue adds a university to the end of the crawl queue.
+//
+// The new position is the current maximum queue position plus one. If the
+// university already has a queue row, that row is reused: its position and
+// priority are overwritten, the phase is reset to 'pending', the retry count
+// to 0 and the last error is cleared. The returned item carries the row's id
+// and timestamps from the database; MaxRetries is set to 3 in Go rather than
+// read back. University name and short name are filled in on a best-effort
+// basis.
+//
+// NOTE(review): the position lookup and the INSERT are separate statements, so
+// concurrent callers could compute the same position. Confirm whether a
+// constraint or caller-side serialization covers this.
+func (r *PostgresRepository) AddToQueue(ctx context.Context, universityID uuid.UUID, priority int, initiatedBy string) (*CrawlQueueItem, error) {
+	// Get next queue position
+	var nextPosition int
+	err := r.pool.QueryRow(ctx, `SELECT COALESCE(MAX(queue_position), 0) + 1 FROM crawl_queue WHERE queue_position IS NOT NULL`).Scan(&nextPosition)
+	if err != nil {
+		return nil, fmt.Errorf("failed to get next queue position: %w", err)
+	}
+
+	query := `
+		INSERT INTO crawl_queue (university_id, queue_position, priority, initiated_by)
+		VALUES ($1, $2, $3, $4)
+		ON CONFLICT (university_id) DO UPDATE SET
+			queue_position = EXCLUDED.queue_position,
+			priority = EXCLUDED.priority,
+			current_phase = 'pending',
+			retry_count = 0,
+			last_error = NULL,
+			updated_at = NOW()
+		RETURNING id, created_at, updated_at
+	`
+
+	item := &CrawlQueueItem{
+		UniversityID:  universityID,
+		QueuePosition: &nextPosition,
+		Priority:      priority,
+		CurrentPhase:  PhasePending,
+		MaxRetries:    3,
+	}
+
+	err = r.pool.QueryRow(ctx, query, universityID, nextPosition, priority, initiatedBy).Scan(
+		&item.ID, &item.CreatedAt, &item.UpdatedAt,
+	)
+	if err != nil {
+		return nil, fmt.Errorf("failed to add to queue: %w", err)
+	}
+
+	// Get university name. short_name is wrapped in COALESCE like in the other
+	// queue queries so that a NULL short name does not abort the scan. The
+	// lookup stays best-effort: the row is already queued at this point, so a
+	// failure here must not turn a successful enqueue into an error.
+	_ = r.pool.QueryRow(ctx, `SELECT name, COALESCE(short_name, '') FROM universities WHERE id = $1`, universityID).Scan(
+		&item.UniversityName, &item.UniversityShort,
+	)
+
+	return item, nil
+}
+
+// RemoveFromQueue removes a university from the queue
+func (r *PostgresRepository) RemoveFromQueue(ctx context.Context, universityID uuid.UUID) error {
+	_, err := r.pool.Exec(ctx, `DELETE FROM crawl_queue WHERE university_id = $1`, universityID)
+	return err
+}
+
+// UpdateQueueItem updates a queue item
+func (r *PostgresRepository) UpdateQueueItem(ctx context.Context, item *CrawlQueueItem) error {
+	query := `
+		UPDATE crawl_queue SET
+			queue_position = $2,
+			priority = $3,
+			current_phase = $4,
+			discovery_completed = $5,
+			discovery_completed_at = $6,
+			professors_completed = $7,
+			professors_completed_at = $8,
+			all_staff_completed = $9,
+			all_staff_completed_at = $10,
+			publications_completed = $11,
+			publications_completed_at = $12,
+			discovery_count = $13,
+			professors_count = $14,
+			staff_count = $15,
+			publications_count = $16,
+			retry_count = $17,
+			last_error = $18,
+			started_at = $19,
+			completed_at = $20,
+			updated_at = NOW()
+		WHERE university_id = $1
+	`
+
+	_, err := r.pool.Exec(ctx, query,
+		item.UniversityID,
+		item.QueuePosition, item.Priority, string(item.CurrentPhase),
+		item.DiscoveryCompleted, item.DiscoveryCompletedAt,
+		item.ProfessorsCompleted, item.ProfessorsCompletedAt,
+		item.AllStaffCompleted, item.AllStaffCompletedAt,
+		item.PublicationsCompleted, item.PublicationsCompletedAt,
+		item.DiscoveryCount, item.ProfessorsCount, item.StaffCount, item.PublicationsCount,
+		item.RetryCount, item.LastError,
+		item.StartedAt, item.CompletedAt,
+	)
+	return err
+}
+
+// PauseQueueItem pauses a crawl
+func (r *PostgresRepository) PauseQueueItem(ctx context.Context, universityID uuid.UUID) error {
+	_, err := r.pool.Exec(ctx, `UPDATE crawl_queue SET current_phase = 'paused', updated_at = NOW() WHERE university_id = $1`, universityID)
+	return err
+}
+
+// ResumeQueueItem resumes a paused crawl
+func (r *PostgresRepository) ResumeQueueItem(ctx context.Context, universityID uuid.UUID) error {
+	// Determine what phase to resume from
+	query := `
+		UPDATE crawl_queue SET
+			current_phase = CASE
+				WHEN NOT discovery_completed THEN 'discovery'
+				WHEN NOT professors_completed THEN 'professors'
+				WHEN NOT all_staff_completed THEN 'all_staff'
+				WHEN NOT publications_completed THEN 'publications'
+				ELSE 'pending'
+			END,
+			updated_at = NOW()
+		WHERE university_id = $1 AND current_phase = 'paused'
+	`
+	_, err := r.pool.Exec(ctx, query, universityID)
+	return err
+}
+
+// ============================================================================
+// PHASE UPDATES
+// ============================================================================
+
+// CompletePhase marks a phase as completed
+func (r *PostgresRepository) CompletePhase(ctx context.Context, universityID uuid.UUID, phase CrawlPhase, count int) error {
+	now := time.Now()
+	var query string
+
+	switch phase {
+	case PhaseDiscovery:
+		query = `UPDATE crawl_queue SET discovery_completed = true, discovery_completed_at = $2, discovery_count = $3, updated_at = NOW() WHERE university_id = $1`
+	case PhaseProfessors:
+		query = `UPDATE crawl_queue SET professors_completed = true, professors_completed_at = $2, professors_count = $3, updated_at = NOW() WHERE university_id = $1`
+	case PhaseAllStaff:
+		query = `UPDATE crawl_queue SET all_staff_completed = true, all_staff_completed_at = $2, staff_count = $3, updated_at = NOW() WHERE university_id = $1`
+	case PhasePublications:
+		query = `UPDATE crawl_queue SET publications_completed = true, publications_completed_at = $2, publications_count = $3, updated_at = NOW() WHERE university_id = $1`
+	default:
+		return fmt.Errorf("unknown phase: %s", phase)
+	}
+
+	_, err := r.pool.Exec(ctx, query, universityID, now, count)
+	return err
+}
+
+// FailPhase records a phase failure
+func (r *PostgresRepository) FailPhase(ctx context.Context, universityID uuid.UUID, phase CrawlPhase, errMsg string) error {
+	query := `
+		UPDATE crawl_queue SET
+			retry_count = retry_count + 1,
+			last_error = $2,
+			current_phase = CASE
+				WHEN retry_count + 1 >= max_retries THEN 'failed'
+				ELSE current_phase
+			END,
+			updated_at = NOW()
+		WHERE university_id = $1
+	`
+	_, err := r.pool.Exec(ctx, query, universityID, errMsg)
+	return err
+}
+
+// ============================================================================
+// STATS
+// ============================================================================
+
+// GetCompletedTodayCount counts the queue rows in the 'completed' phase whose
+// completed_at is not earlier than the database's CURRENT_DATE.
+func (r *PostgresRepository) GetCompletedTodayCount(ctx context.Context) (int, error) {
+	const query = `
+		SELECT COUNT(*) FROM crawl_queue
+		WHERE current_phase = 'completed'
+		AND completed_at >= CURRENT_DATE
+	`
+
+	var completed int
+	row := r.pool.QueryRow(ctx, query)
+	err := row.Scan(&completed)
+	return completed, err
+}
+
+// GetTotalProcessedCount counts all queue rows that are in the 'completed'
+// phase.
+func (r *PostgresRepository) GetTotalProcessedCount(ctx context.Context) (int, error) {
+	const query = `SELECT COUNT(*) FROM crawl_queue WHERE current_phase = 'completed'`
+
+	var total int
+	row := r.pool.QueryRow(ctx, query)
+	err := row.Scan(&total)
+	return total, err
+}