// breakpilot-lehrer/edu-search-service/internal/orchestrator/orchestrator.go

// Package orchestrator implements multi-phase university crawling with queue management
package orchestrator

import (
	"context"
	"fmt"
	"log"
	"sync"
	"time"

	"github.com/google/uuid"
)

// CrawlPhase names the state a queued university is in. The underlying string
// is the value serialized in CrawlQueueItem.CurrentPhase.
type CrawlPhase string

// Known phases. The orchestrator runs discovery, professors, all_staff and
// publications in that order; the remaining values are bookkeeping states.
const (
	// PhasePending: queued, no crawl phase has been run yet.
	PhasePending = CrawlPhase("pending")
	// PhaseDiscovery: find a sample professor to validate that crawling works.
	PhaseDiscovery = CrawlPhase("discovery")
	// PhaseProfessors: crawl all professors.
	PhaseProfessors = CrawlPhase("professors")
	// PhaseAllStaff: crawl all staff members.
	PhaseAllStaff = CrawlPhase("all_staff")
	// PhasePublications: crawl publications for all staff.
	PhasePublications = CrawlPhase("publications")
	// PhaseCompleted: every crawl phase has finished.
	PhaseCompleted = CrawlPhase("completed")
	// PhaseFailed: the retry budget was exhausted.
	PhaseFailed = CrawlPhase("failed")
	// PhasePaused: the orchestrator skips the item while it is in this state.
	PhasePaused = CrawlPhase("paused")
)

// CrawlQueueItem is one university's entry in the crawl queue, including the
// per-phase bookkeeping the orchestrator uses to decide what to run next.
type CrawlQueueItem struct {
	// Identity of the queue entry and of the university it refers to.
	ID              uuid.UUID `json:"id"`
	UniversityID    uuid.UUID `json:"university_id"`
	UniversityName  string    `json:"university_name"`
	UniversityShort string    `json:"university_short"`

	// Scheduling state. QueuePosition is set to nil once the university has
	// been completed; CurrentPhase drives the orchestrator's dispatch.
	QueuePosition *int       `json:"queue_position"`
	Priority      int        `json:"priority"`
	CurrentPhase  CrawlPhase `json:"current_phase"`

	// Per-phase completion flags with their optional timestamps. The
	// orchestrator moves on to the following phase only when the flag of the
	// current one is set.
	DiscoveryCompleted      bool       `json:"discovery_completed"`
	DiscoveryCompletedAt    *time.Time `json:"discovery_completed_at,omitempty"`
	ProfessorsCompleted     bool       `json:"professors_completed"`
	ProfessorsCompletedAt   *time.Time `json:"professors_completed_at,omitempty"`
	AllStaffCompleted       bool       `json:"all_staff_completed"`
	AllStaffCompletedAt     *time.Time `json:"all_staff_completed_at,omitempty"`
	PublicationsCompleted   bool       `json:"publications_completed"`
	PublicationsCompletedAt *time.Time `json:"publications_completed_at,omitempty"`

	// Per-phase result counts.
	DiscoveryCount    int `json:"discovery_count"`
	ProfessorsCount   int `json:"professors_count"`
	StaffCount        int `json:"staff_count"`
	PublicationsCount int `json:"publications_count"`

	// Failure tracking. The item is marked failed once RetryCount reaches
	// MaxRetries; LastError carries the most recent error text.
	RetryCount int    `json:"retry_count"`
	MaxRetries int    `json:"max_retries"`
	LastError  string `json:"last_error,omitempty"`

	// Lifecycle timestamps and overall progress.
	StartedAt       *time.Time `json:"started_at,omitempty"`
	CompletedAt     *time.Time `json:"completed_at,omitempty"`
	ProgressPercent int        `json:"progress_percent"`
	CreatedAt       time.Time  `json:"created_at"`
	UpdatedAt       time.Time  `json:"updated_at"`
}

// CrawlProgress is the report a crawler returns after running one phase.
type CrawlProgress struct {
	// Phase is the crawl phase this report belongs to.
	Phase CrawlPhase `json:"phase"`

	// ItemsFound is the number the orchestrator stores as the phase count.
	// ItemsProcessed is not read by the orchestrator in this file.
	ItemsFound     int `json:"items_found"`
	ItemsProcessed int `json:"items_processed"`

	// Errors holds error messages from the crawler; it is left out of the
	// JSON output when empty.
	Errors []string `json:"errors,omitempty"`

	// Timing of the phase. CompletedAt is left out of the JSON output while nil.
	StartedAt   time.Time  `json:"started_at"`
	CompletedAt *time.Time `json:"completed_at,omitempty"`
}

// OrchestratorStatus is the snapshot returned by Orchestrator.Status.
type OrchestratorStatus struct {
	// In-memory state of the loop. CurrentUniversity is nil (and CurrentPhase
	// is PhasePending) while no university is being processed.
	IsRunning         bool            `json:"is_running"`
	CurrentUniversity *CrawlQueueItem `json:"current_university,omitempty"`
	CurrentPhase      CrawlPhase      `json:"current_phase"`

	// Counters obtained from the repository.
	QueueLength    int `json:"queue_length"`
	CompletedToday int `json:"completed_today"`
	TotalProcessed int `json:"total_processed"`

	// LastActivity is the time the loop last picked up an item; nil if it has
	// not picked one up yet.
	LastActivity *time.Time `json:"last_activity,omitempty"`
}

// StaffCrawlerInterface is the staff-crawling dependency of the orchestrator.
// Each method backs one crawl phase (discovery, professors, all_staff). For
// every method the orchestrator treats a non-nil error as a phase failure and,
// on success, records ItemsFound of the returned progress (0 when the progress
// is nil) as the phase count.
type StaffCrawlerInterface interface {
	// DiscoverSampleProfessor finds at least one professor to validate crawling works.
	// It is invoked for PhaseDiscovery.
	DiscoverSampleProfessor(ctx context.Context, universityID uuid.UUID) (*CrawlProgress, error)
	// CrawlProfessors crawls all professors at a university.
	// It is invoked for PhaseProfessors.
	CrawlProfessors(ctx context.Context, universityID uuid.UUID) (*CrawlProgress, error)
	// CrawlAllStaff crawls all staff members at a university.
	// It is invoked for PhaseAllStaff.
	CrawlAllStaff(ctx context.Context, universityID uuid.UUID) (*CrawlProgress, error)
}

// PublicationCrawlerInterface is the publication-crawling dependency of the
// orchestrator. Its single method backs PhasePublications.
type PublicationCrawlerInterface interface {
	// CrawlPublicationsForUniversity crawls publications for all staff at a university.
	// A non-nil error is handled as a phase failure; otherwise ItemsFound of
	// the returned progress (0 when nil) is recorded as the phase count.
	CrawlPublicationsForUniversity(ctx context.Context, universityID uuid.UUID) (*CrawlProgress, error)
}

// Repository defines the persistence operations the orchestrator relies on.
// How each operation is stored is up to the implementation; the comments below
// describe only how the orchestrator uses them.
type Repository interface {
	// Queue operations

	// GetQueueItems returns the queue entries; Status uses the slice length as
	// the queue length.
	GetQueueItems(ctx context.Context) ([]CrawlQueueItem, error)
	// GetNextInQueue returns the item to process next. The orchestrator treats
	// a nil item with a nil error as "nothing to do".
	GetNextInQueue(ctx context.Context) (*CrawlQueueItem, error)
	// AddToQueue enqueues a university and returns the created entry.
	AddToQueue(ctx context.Context, universityID uuid.UUID, priority int, initiatedBy string) (*CrawlQueueItem, error)
	// RemoveFromQueue removes the entry for the given university.
	RemoveFromQueue(ctx context.Context, universityID uuid.UUID) error
	// UpdateQueueItem persists the given item; the orchestrator calls it after
	// changing CurrentPhase and when marking a university completed.
	UpdateQueueItem(ctx context.Context, item *CrawlQueueItem) error
	// PauseQueueItem pauses the entry for the given university.
	PauseQueueItem(ctx context.Context, universityID uuid.UUID) error
	// ResumeQueueItem resumes a paused entry for the given university.
	ResumeQueueItem(ctx context.Context, universityID uuid.UUID) error

	// Phase updates

	// CompletePhase is called after a phase's crawler returned without error;
	// count is the crawler's ItemsFound (0 if it returned no progress).
	CompletePhase(ctx context.Context, universityID uuid.UUID, phase CrawlPhase, count int) error
	// FailPhase is called after a phase's crawler returned an error; err is
	// that error's message.
	// NOTE(review): the orchestrator does not persist RetryCount or the failed
	// phase itself on failure, so this presumably has to - confirm against the
	// implementation.
	FailPhase(ctx context.Context, universityID uuid.UUID, phase CrawlPhase, err string) error

	// Stats

	// GetCompletedTodayCount feeds OrchestratorStatus.CompletedToday.
	GetCompletedTodayCount(ctx context.Context) (int, error)
	// GetTotalProcessedCount feeds OrchestratorStatus.TotalProcessed.
	GetTotalProcessedCount(ctx context.Context) (int, error)
}

// Orchestrator drives queued universities through the crawl phases, one
// university and one phase at a time, from a background loop.
type Orchestrator struct {
	// Collaborators supplied through NewOrchestrator.
	repo         Repository
	staffCrawler StaffCrawlerInterface
	pubCrawler   PublicationCrawlerInterface

	// Runtime state. Start, Stop, Status and processNextInQueue hold mu while
	// reading or writing the fields below it.
	mu           sync.RWMutex
	isRunning    bool
	stopChan     chan struct{}
	currentItem  *CrawlQueueItem
	lastActivity time.Time

	// Configuration.

	// phaseCooldown is slept after a phase finishes.
	phaseCooldown time.Duration
	// retryCooldown is slept after a phase fails, before a potential retry.
	retryCooldown time.Duration
	// maxConcurrent is the maximum number of concurrent crawls. NewOrchestrator
	// sets it to 1, and no code in this file reads it.
	maxConcurrent int
}

// NewOrchestrator wires the repository and the two crawlers into an idle
// Orchestrator with the default timings. The background loop is not started
// until Start is called.
func NewOrchestrator(repo Repository, staffCrawler StaffCrawlerInterface, pubCrawler PublicationCrawlerInterface) *Orchestrator {
	const (
		defaultPhaseCooldown = 5 * time.Second  // small pause between phases
		defaultRetryCooldown = 30 * time.Second // wait before retry after failure
		defaultMaxConcurrent = 1                // sequential processing
	)

	orch := new(Orchestrator)
	orch.repo = repo
	orch.staffCrawler = staffCrawler
	orch.pubCrawler = pubCrawler
	orch.phaseCooldown = defaultPhaseCooldown
	orch.retryCooldown = defaultRetryCooldown
	orch.maxConcurrent = defaultMaxConcurrent
	return orch
}

// Start launches the background orchestration loop in its own goroutine.
// It returns an error, and starts nothing, if the orchestrator is already
// running.
func (o *Orchestrator) Start() error {
	// Flip the running flag and create a fresh stop channel atomically.
	claimed := func() bool {
		o.mu.Lock()
		defer o.mu.Unlock()

		if o.isRunning {
			return false
		}
		o.stopChan = make(chan struct{})
		o.isRunning = true
		return true
	}()
	if !claimed {
		return fmt.Errorf("orchestrator already running")
	}

	log.Println("[Orchestrator] Starting crawl orchestration loop")
	go o.runLoop()
	return nil
}

// Stop signals the background loop to exit by closing the stop channel and
// clears the running flag. It returns an error if the orchestrator is not
// running. Stop does not wait for the loop goroutine to finish.
func (o *Orchestrator) Stop() error {
	// The flag check guarantees the channel is closed at most once per Start.
	halted := func() bool {
		o.mu.Lock()
		defer o.mu.Unlock()

		if !o.isRunning {
			return false
		}
		close(o.stopChan)
		o.isRunning = false
		return true
	}()
	if !halted {
		return fmt.Errorf("orchestrator not running")
	}

	log.Println("[Orchestrator] Stopped")
	return nil
}

// Status returns a snapshot of the orchestrator: whether the loop is running,
// the university currently being processed (if any) with its phase, the time
// of the last activity, and queue statistics read from the repository.
//
// The in-memory part is copied while holding the read lock and the lock is
// released before any repository call, so slow database queries never block
// the orchestration loop. The returned status holds copies, not pointers into
// the orchestrator, and can be read or serialized without further locking.
//
// Repository statistics are best effort: a failing query is logged and leaves
// the corresponding field at zero. The returned error is always nil.
func (o *Orchestrator) Status(ctx context.Context) (*OrchestratorStatus, error) {
	status := &OrchestratorStatus{CurrentPhase: PhasePending}

	o.mu.RLock()
	status.IsRunning = o.isRunning
	if o.currentItem != nil {
		// Shallow copy: the caller must not share the struct the loop works on.
		// NOTE(review): runPhase changes the item's CurrentPhase without
		// holding mu, so this copy can still observe a phase change mid-flight.
		current := *o.currentItem
		status.CurrentUniversity = &current
		status.CurrentPhase = current.CurrentPhase
	}
	if !o.lastActivity.IsZero() {
		// Copy the value; a pointer to the field would be read after unlock.
		last := o.lastActivity
		status.LastActivity = &last
	}
	o.mu.RUnlock()

	// Get queue stats from DB (no lock held from here on).
	if items, err := o.repo.GetQueueItems(ctx); err != nil {
		log.Printf("[Orchestrator] Status: failed to load queue items: %v", err)
	} else {
		status.QueueLength = len(items)
	}

	if completedToday, err := o.repo.GetCompletedTodayCount(ctx); err != nil {
		log.Printf("[Orchestrator] Status: failed to load completed-today count: %v", err)
	} else {
		status.CompletedToday = completedToday
	}

	if totalProcessed, err := o.repo.GetTotalProcessedCount(ctx); err != nil {
		log.Printf("[Orchestrator] Status: failed to load total-processed count: %v", err)
	} else {
		status.TotalProcessed = totalProcessed
	}

	return status, nil
}

// AddUniversity enqueues a university for crawling via the repository and
// returns the created queue entry. A repository error is wrapped and returned
// with a nil item.
func (o *Orchestrator) AddUniversity(ctx context.Context, universityID uuid.UUID, priority int, initiatedBy string) (*CrawlQueueItem, error) {
	queued, addErr := o.repo.AddToQueue(ctx, universityID, priority, initiatedBy)
	if addErr != nil {
		return nil, fmt.Errorf("failed to add to queue: %w", addErr)
	}

	log.Printf("[Orchestrator] Added university %s to queue with priority %d", universityID, priority)

	return queued, nil
}

// RemoveUniversity removes a university from the queue by delegating to
// Repository.RemoveFromQueue; the repository's error is returned unchanged.
// This method itself does not touch the orchestrator's in-memory state.
func (o *Orchestrator) RemoveUniversity(ctx context.Context, universityID uuid.UUID) error {
	return o.repo.RemoveFromQueue(ctx, universityID)
}

// PauseUniversity pauses crawling for a university by delegating to
// Repository.PauseQueueItem; the repository's error is returned unchanged.
// This method itself does not touch the orchestrator's in-memory state.
func (o *Orchestrator) PauseUniversity(ctx context.Context, universityID uuid.UUID) error {
	return o.repo.PauseQueueItem(ctx, universityID)
}

// ResumeUniversity resumes crawling for a paused university by delegating to
// Repository.ResumeQueueItem; the repository's error is returned unchanged.
func (o *Orchestrator) ResumeUniversity(ctx context.Context, universityID uuid.UUID) error {
	return o.repo.ResumeQueueItem(ctx, universityID)
}

// GetQueue returns the queue items exactly as reported by
// Repository.GetQueueItems, together with the repository's error.
func (o *Orchestrator) GetQueue(ctx context.Context) ([]CrawlQueueItem, error) {
	return o.repo.GetQueueItems(ctx)
}

// runLoop is the main orchestration loop. It polls the queue every 10 seconds
// and processes at most one queue step per tick until its stop channel is
// closed.
//
// The stop channel is captured once, under the read lock, when the loop
// starts. Start replaces o.stopChan on every call, so re-reading the field on
// each iteration would race with Start. It would also let a loop that was busy
// during Stop pick up the next run's channel and keep running next to the new
// loop. With the captured channel, a stopped loop always sees its own closed
// channel and exits.
func (o *Orchestrator) runLoop() {
	o.mu.RLock()
	stop := o.stopChan
	o.mu.RUnlock()

	ticker := time.NewTicker(10 * time.Second) // Check queue every 10 seconds
	defer ticker.Stop()

	for {
		select {
		case <-stop:
			return
		case <-ticker.C:
			// select chooses at random when both cases are ready; give the
			// stop signal priority so no new work starts after Stop.
			select {
			case <-stop:
				return
			default:
			}
			o.processNextInQueue()
		}
	}
}

// processNextInQueue fetches the next queue item from the repository and
// advances it by one step. A phase that is not yet marked completed is run
// (again), a completed phase hands over to the following one, and a completed
// publications phase finishes the university. Items in any other phase, such
// as completed or failed, are left untouched.
//
// While the item is being worked on it is exposed through o.currentItem so
// Status can report it.
func (o *Orchestrator) processNextInQueue() {
	ctx := context.Background()

	item, err := o.repo.GetNextInQueue(ctx)
	if err != nil {
		log.Printf("[Orchestrator] Error getting next item: %v", err)
		return
	}
	// An empty queue and a paused item are both a no-op for this tick.
	if item == nil || item.CurrentPhase == PhasePaused {
		return
	}

	// Publish the item for the duration of this call and clear it again
	// however the function returns.
	o.mu.Lock()
	o.currentItem, o.lastActivity = item, time.Now()
	o.mu.Unlock()
	defer func() {
		o.mu.Lock()
		defer o.mu.Unlock()
		o.currentItem = nil
	}()

	log.Printf("[Orchestrator] Processing university: %s (Phase: %s)", item.UniversityName, item.CurrentPhase)

	// phaseToRun picks the following phase once the current one is done and
	// repeats the current phase otherwise.
	phaseToRun := func(done bool, current, following CrawlPhase) CrawlPhase {
		if done {
			return following
		}
		return current
	}

	switch item.CurrentPhase {
	case PhasePending:
		o.runPhase(ctx, item, PhaseDiscovery)
	case PhaseDiscovery:
		o.runPhase(ctx, item, phaseToRun(item.DiscoveryCompleted, PhaseDiscovery, PhaseProfessors))
	case PhaseProfessors:
		o.runPhase(ctx, item, phaseToRun(item.ProfessorsCompleted, PhaseProfessors, PhaseAllStaff))
	case PhaseAllStaff:
		o.runPhase(ctx, item, phaseToRun(item.AllStaffCompleted, PhaseAllStaff, PhasePublications))
	case PhasePublications:
		// Publications is the last crawl phase: once done, close the item.
		if !item.PublicationsCompleted {
			o.runPhase(ctx, item, PhasePublications)
			return
		}
		o.completeUniversity(ctx, item)
	}
}

// runPhase executes one crawl phase for item and records the outcome.
//
// It proceeds as follows:
//  1. Reject phases that have no crawler behind them (anything other than
//     discovery, professors, all_staff and publications). Nothing is changed
//     in that case, so bookkeeping phases are never recorded as crawled.
//  2. Persist phase as the item's current phase; if that fails, return
//     without crawling.
//  3. Invoke the crawler for the phase. An error is passed to
//     handlePhaseFailure.
//  4. On success, store the crawler's ItemsFound (0 if it returned no
//     progress) through Repository.CompletePhase. The success message is
//     logged only if that call succeeded.
//  5. Sleep for phaseCooldown before returning.
func (o *Orchestrator) runPhase(ctx context.Context, item *CrawlQueueItem, phase CrawlPhase) {
	switch phase {
	case PhaseDiscovery, PhaseProfessors, PhaseAllStaff, PhasePublications:
		// Runnable crawl phase.
	default:
		log.Printf("[Orchestrator] Phase %s is not runnable, skipping %s", phase, item.UniversityName)
		return
	}

	log.Printf("[Orchestrator] Running phase %s for %s", phase, item.UniversityName)

	// Update current phase
	item.CurrentPhase = phase
	if err := o.repo.UpdateQueueItem(ctx, item); err != nil {
		log.Printf("[Orchestrator] Failed to update phase: %v", err)
		return
	}

	var progress *CrawlProgress
	var err error

	// Execute phase
	switch phase {
	case PhaseDiscovery:
		progress, err = o.staffCrawler.DiscoverSampleProfessor(ctx, item.UniversityID)
	case PhaseProfessors:
		progress, err = o.staffCrawler.CrawlProfessors(ctx, item.UniversityID)
	case PhaseAllStaff:
		progress, err = o.staffCrawler.CrawlAllStaff(ctx, item.UniversityID)
	case PhasePublications:
		progress, err = o.pubCrawler.CrawlPublicationsForUniversity(ctx, item.UniversityID)
	}

	// Handle result
	if err != nil {
		log.Printf("[Orchestrator] Phase %s failed: %v", phase, err)
		o.handlePhaseFailure(ctx, item, phase, err)
		return
	}

	// Mark phase complete
	count := 0
	if progress != nil {
		count = progress.ItemsFound
	}

	if err := o.repo.CompletePhase(ctx, item.UniversityID, phase, count); err != nil {
		// The crawl ran but its completion was not stored, so do not report
		// the phase as completed.
		log.Printf("[Orchestrator] Failed to complete phase: %v", err)
	} else {
		log.Printf("[Orchestrator] Phase %s completed for %s (found: %d)", phase, item.UniversityName, count)
	}

	// Wait before next phase
	time.Sleep(o.phaseCooldown)
}

// handlePhaseFailure records a failed phase. It updates the retry bookkeeping
// on the in-memory item, reports the failure through Repository.FailPhase and
// then sleeps for retryCooldown.
//
// The item is switched to PhaseFailed in memory once RetryCount has reached
// MaxRetries.
// NOTE(review): the in-memory changes are not written back with
// UpdateQueueItem here; persisting them is presumably left to FailPhase -
// confirm against the repository.
func (o *Orchestrator) handlePhaseFailure(ctx context.Context, item *CrawlQueueItem, phase CrawlPhase, err error) {
	reason := err.Error()

	item.LastError = reason
	item.RetryCount++

	if exhausted := item.RetryCount >= item.MaxRetries; exhausted {
		item.CurrentPhase = PhaseFailed
		log.Printf("[Orchestrator] University %s failed after %d retries", item.UniversityName, item.RetryCount)
	}

	updateErr := o.repo.FailPhase(ctx, item.UniversityID, phase, reason)
	if updateErr != nil {
		log.Printf("[Orchestrator] Failed to update failure status: %v", updateErr)
	}

	// Back off before the loop can pick the item up again.
	time.Sleep(o.retryCooldown)
}

// completeUniversity closes out a university. It sets the phase to completed,
// stamps the completion time, clears the queue position and persists the item.
// The summary line is logged only if the update was stored; otherwise the
// failure is logged instead.
func (o *Orchestrator) completeUniversity(ctx context.Context, item *CrawlQueueItem) {
	finishedAt := time.Now()

	item.QueuePosition = nil // no longer part of the active queue
	item.CompletedAt = &finishedAt
	item.CurrentPhase = PhaseCompleted

	updateErr := o.repo.UpdateQueueItem(ctx, item)
	if updateErr != nil {
		log.Printf("[Orchestrator] Failed to complete university: %v", updateErr)
		return
	}

	log.Printf(
		"[Orchestrator] University %s completed! Professors: %d, Staff: %d, Publications: %d",
		item.UniversityName,
		item.ProfessorsCount,
		item.StaffCount,
		item.PublicationsCount,
	)
}