// Package orchestrator implements multi-phase university crawling with queue management package orchestrator import ( "context" "fmt" "log" "sync" "time" "github.com/google/uuid" ) // CrawlPhase represents a phase in the crawl process type CrawlPhase string const ( PhasePending CrawlPhase = "pending" PhaseDiscovery CrawlPhase = "discovery" // Find sample professor to validate crawling works PhaseProfessors CrawlPhase = "professors" // Crawl all professors PhaseAllStaff CrawlPhase = "all_staff" // Crawl all staff members PhasePublications CrawlPhase = "publications" // Crawl publications for all staff PhaseCompleted CrawlPhase = "completed" PhaseFailed CrawlPhase = "failed" PhasePaused CrawlPhase = "paused" ) // CrawlQueueItem represents a university in the crawl queue type CrawlQueueItem struct { ID uuid.UUID `json:"id"` UniversityID uuid.UUID `json:"university_id"` UniversityName string `json:"university_name"` UniversityShort string `json:"university_short"` QueuePosition *int `json:"queue_position"` Priority int `json:"priority"` CurrentPhase CrawlPhase `json:"current_phase"` DiscoveryCompleted bool `json:"discovery_completed"` DiscoveryCompletedAt *time.Time `json:"discovery_completed_at,omitempty"` ProfessorsCompleted bool `json:"professors_completed"` ProfessorsCompletedAt *time.Time `json:"professors_completed_at,omitempty"` AllStaffCompleted bool `json:"all_staff_completed"` AllStaffCompletedAt *time.Time `json:"all_staff_completed_at,omitempty"` PublicationsCompleted bool `json:"publications_completed"` PublicationsCompletedAt *time.Time `json:"publications_completed_at,omitempty"` DiscoveryCount int `json:"discovery_count"` ProfessorsCount int `json:"professors_count"` StaffCount int `json:"staff_count"` PublicationsCount int `json:"publications_count"` RetryCount int `json:"retry_count"` MaxRetries int `json:"max_retries"` LastError string `json:"last_error,omitempty"` StartedAt *time.Time `json:"started_at,omitempty"` CompletedAt *time.Time `json:"completed_at,omitempty"` ProgressPercent int `json:"progress_percent"` CreatedAt time.Time `json:"created_at"` UpdatedAt time.Time `json:"updated_at"` } // CrawlProgress represents progress for a single phase type CrawlProgress struct { Phase CrawlPhase `json:"phase"` ItemsFound int `json:"items_found"` ItemsProcessed int `json:"items_processed"` Errors []string `json:"errors,omitempty"` StartedAt time.Time `json:"started_at"` CompletedAt *time.Time `json:"completed_at,omitempty"` } // OrchestratorStatus represents the current state of the orchestrator type OrchestratorStatus struct { IsRunning bool `json:"is_running"` CurrentUniversity *CrawlQueueItem `json:"current_university,omitempty"` CurrentPhase CrawlPhase `json:"current_phase"` QueueLength int `json:"queue_length"` CompletedToday int `json:"completed_today"` TotalProcessed int `json:"total_processed"` LastActivity *time.Time `json:"last_activity,omitempty"` } // StaffCrawlerInterface defines what the staff crawler must implement type StaffCrawlerInterface interface { // DiscoverSampleProfessor finds at least one professor to validate crawling works DiscoverSampleProfessor(ctx context.Context, universityID uuid.UUID) (*CrawlProgress, error) // CrawlProfessors crawls all professors at a university CrawlProfessors(ctx context.Context, universityID uuid.UUID) (*CrawlProgress, error) // CrawlAllStaff crawls all staff members at a university CrawlAllStaff(ctx context.Context, universityID uuid.UUID) (*CrawlProgress, error) } // PublicationCrawlerInterface defines what the publication crawler must implement type PublicationCrawlerInterface interface { // CrawlPublicationsForUniversity crawls publications for all staff at a university CrawlPublicationsForUniversity(ctx context.Context, universityID uuid.UUID) (*CrawlProgress, error) } // Repository defines database operations for the orchestrator type Repository interface { // Queue operations GetQueueItems(ctx context.Context) ([]CrawlQueueItem, error) GetNextInQueue(ctx context.Context) (*CrawlQueueItem, error) AddToQueue(ctx context.Context, universityID uuid.UUID, priority int, initiatedBy string) (*CrawlQueueItem, error) RemoveFromQueue(ctx context.Context, universityID uuid.UUID) error UpdateQueueItem(ctx context.Context, item *CrawlQueueItem) error PauseQueueItem(ctx context.Context, universityID uuid.UUID) error ResumeQueueItem(ctx context.Context, universityID uuid.UUID) error // Phase updates CompletePhase(ctx context.Context, universityID uuid.UUID, phase CrawlPhase, count int) error FailPhase(ctx context.Context, universityID uuid.UUID, phase CrawlPhase, err string) error // Stats GetCompletedTodayCount(ctx context.Context) (int, error) GetTotalProcessedCount(ctx context.Context) (int, error) } // Orchestrator manages the multi-phase crawl process type Orchestrator struct { repo Repository staffCrawler StaffCrawlerInterface pubCrawler PublicationCrawlerInterface // Runtime state mu sync.RWMutex isRunning bool stopChan chan struct{} currentItem *CrawlQueueItem lastActivity time.Time // Configuration phaseCooldown time.Duration // Wait time between phases retryCooldown time.Duration // Wait time after failure before retry maxConcurrent int // Max concurrent crawls (always 1 for now) } // NewOrchestrator creates a new orchestrator instance func NewOrchestrator(repo Repository, staffCrawler StaffCrawlerInterface, pubCrawler PublicationCrawlerInterface) *Orchestrator { return &Orchestrator{ repo: repo, staffCrawler: staffCrawler, pubCrawler: pubCrawler, phaseCooldown: 5 * time.Second, // Small pause between phases retryCooldown: 30 * time.Second, // Wait before retry after failure maxConcurrent: 1, // Sequential processing } } // Start begins the orchestrator loop func (o *Orchestrator) Start() error { o.mu.Lock() if o.isRunning { o.mu.Unlock() return fmt.Errorf("orchestrator already running") } o.isRunning = true o.stopChan = make(chan struct{}) o.mu.Unlock() log.Println("[Orchestrator] Starting crawl orchestration loop") go o.runLoop() return nil } // Stop gracefully stops the orchestrator func (o *Orchestrator) Stop() error { o.mu.Lock() if !o.isRunning { o.mu.Unlock() return fmt.Errorf("orchestrator not running") } close(o.stopChan) o.isRunning = false o.mu.Unlock() log.Println("[Orchestrator] Stopped") return nil } // Status returns the current orchestrator status func (o *Orchestrator) Status(ctx context.Context) (*OrchestratorStatus, error) { o.mu.RLock() defer o.mu.RUnlock() status := &OrchestratorStatus{ IsRunning: o.isRunning, CurrentPhase: PhasePending, } if o.currentItem != nil { status.CurrentUniversity = o.currentItem status.CurrentPhase = o.currentItem.CurrentPhase } if !o.lastActivity.IsZero() { status.LastActivity = &o.lastActivity } // Get queue stats from DB items, err := o.repo.GetQueueItems(ctx) if err == nil { status.QueueLength = len(items) } completedToday, _ := o.repo.GetCompletedTodayCount(ctx) status.CompletedToday = completedToday totalProcessed, _ := o.repo.GetTotalProcessedCount(ctx) status.TotalProcessed = totalProcessed return status, nil } // AddUniversity adds a university to the crawl queue func (o *Orchestrator) AddUniversity(ctx context.Context, universityID uuid.UUID, priority int, initiatedBy string) (*CrawlQueueItem, error) { item, err := o.repo.AddToQueue(ctx, universityID, priority, initiatedBy) if err != nil { return nil, fmt.Errorf("failed to add to queue: %w", err) } log.Printf("[Orchestrator] Added university %s to queue with priority %d", universityID, priority) return item, nil } // RemoveUniversity removes a university from the queue func (o *Orchestrator) RemoveUniversity(ctx context.Context, universityID uuid.UUID) error { return o.repo.RemoveFromQueue(ctx, universityID) } // PauseUniversity pauses crawling for a university func (o *Orchestrator) PauseUniversity(ctx context.Context, universityID uuid.UUID) error { return o.repo.PauseQueueItem(ctx, universityID) } // ResumeUniversity resumes crawling for a paused university func (o *Orchestrator) ResumeUniversity(ctx context.Context, universityID uuid.UUID) error { return o.repo.ResumeQueueItem(ctx, universityID) } // GetQueue returns all items in the queue func (o *Orchestrator) GetQueue(ctx context.Context) ([]CrawlQueueItem, error) { return o.repo.GetQueueItems(ctx) } // runLoop is the main orchestration loop func (o *Orchestrator) runLoop() { ticker := time.NewTicker(10 * time.Second) // Check queue every 10 seconds defer ticker.Stop() for { select { case <-o.stopChan: return case <-ticker.C: o.processNextInQueue() } } } // processNextInQueue processes the next university in the queue func (o *Orchestrator) processNextInQueue() { ctx := context.Background() // Get next item in queue item, err := o.repo.GetNextInQueue(ctx) if err != nil { log.Printf("[Orchestrator] Error getting next item: %v", err) return } if item == nil { // No items to process return } // Check if paused if item.CurrentPhase == PhasePaused { return } // Set current item o.mu.Lock() o.currentItem = item o.lastActivity = time.Now() o.mu.Unlock() defer func() { o.mu.Lock() o.currentItem = nil o.mu.Unlock() }() log.Printf("[Orchestrator] Processing university: %s (Phase: %s)", item.UniversityName, item.CurrentPhase) // Process based on current phase switch item.CurrentPhase { case PhasePending: o.runPhase(ctx, item, PhaseDiscovery) case PhaseDiscovery: if item.DiscoveryCompleted { o.runPhase(ctx, item, PhaseProfessors) } else { o.runPhase(ctx, item, PhaseDiscovery) } case PhaseProfessors: if item.ProfessorsCompleted { o.runPhase(ctx, item, PhaseAllStaff) } else { o.runPhase(ctx, item, PhaseProfessors) } case PhaseAllStaff: if item.AllStaffCompleted { o.runPhase(ctx, item, PhasePublications) } else { o.runPhase(ctx, item, PhaseAllStaff) } case PhasePublications: if item.PublicationsCompleted { o.completeUniversity(ctx, item) } else { o.runPhase(ctx, item, PhasePublications) } } } // runPhase executes a specific crawl phase func (o *Orchestrator) runPhase(ctx context.Context, item *CrawlQueueItem, phase CrawlPhase) { log.Printf("[Orchestrator] Running phase %s for %s", phase, item.UniversityName) // Update current phase item.CurrentPhase = phase if err := o.repo.UpdateQueueItem(ctx, item); err != nil { log.Printf("[Orchestrator] Failed to update phase: %v", err) return } var progress *CrawlProgress var err error // Execute phase switch phase { case PhaseDiscovery: progress, err = o.staffCrawler.DiscoverSampleProfessor(ctx, item.UniversityID) case PhaseProfessors: progress, err = o.staffCrawler.CrawlProfessors(ctx, item.UniversityID) case PhaseAllStaff: progress, err = o.staffCrawler.CrawlAllStaff(ctx, item.UniversityID) case PhasePublications: progress, err = o.pubCrawler.CrawlPublicationsForUniversity(ctx, item.UniversityID) } // Handle result if err != nil { log.Printf("[Orchestrator] Phase %s failed: %v", phase, err) o.handlePhaseFailure(ctx, item, phase, err) return } // Mark phase complete count := 0 if progress != nil { count = progress.ItemsFound } if err := o.repo.CompletePhase(ctx, item.UniversityID, phase, count); err != nil { log.Printf("[Orchestrator] Failed to complete phase: %v", err) } log.Printf("[Orchestrator] Phase %s completed for %s (found: %d)", phase, item.UniversityName, count) // Wait before next phase time.Sleep(o.phaseCooldown) } // handlePhaseFailure handles a phase failure func (o *Orchestrator) handlePhaseFailure(ctx context.Context, item *CrawlQueueItem, phase CrawlPhase, err error) { item.RetryCount++ item.LastError = err.Error() if item.RetryCount >= item.MaxRetries { // Max retries reached, mark as failed item.CurrentPhase = PhaseFailed log.Printf("[Orchestrator] University %s failed after %d retries", item.UniversityName, item.RetryCount) } if updateErr := o.repo.FailPhase(ctx, item.UniversityID, phase, err.Error()); updateErr != nil { log.Printf("[Orchestrator] Failed to update failure status: %v", updateErr) } // Wait before potential retry time.Sleep(o.retryCooldown) } // completeUniversity marks a university as fully crawled func (o *Orchestrator) completeUniversity(ctx context.Context, item *CrawlQueueItem) { now := time.Now() item.CurrentPhase = PhaseCompleted item.CompletedAt = &now item.QueuePosition = nil // Remove from active queue if err := o.repo.UpdateQueueItem(ctx, item); err != nil { log.Printf("[Orchestrator] Failed to complete university: %v", err) return } log.Printf("[Orchestrator] University %s completed! Professors: %d, Staff: %d, Publications: %d", item.UniversityName, item.ProfessorsCount, item.StaffCount, item.PublicationsCount) }