breakpilot-lehrer/edu-search-service/internal/scheduler/scheduler.go

package scheduler

import (
	"context"
	"log"
	"sync"
	"time"
)

// CrawlFunc is the signature of the callback the scheduler invokes to perform
// a single crawl. The scheduler passes a context that carries a four-hour
// timeout (see executeCrawl and TriggerCrawl); a non-nil error marks the run
// as failed in the scheduler's status.
type CrawlFunc func(ctx context.Context) error

// Status is a snapshot of the scheduler state as produced by Scheduler.Status.
// The struct tags define the JSON field names.
//
//   - Enabled mirrors the scheduler's enabled flag.
//   - Running is true while a scheduled or manual crawl is in progress.
//   - LastRun is the start time of the most recently finished crawl.
//   - LastRunStatus is "success" or "failed: <error text>" for that crawl.
//   - NextRun is LastRun plus the interval; it is only set when the scheduler
//     is enabled and at least one crawl has finished.
//   - Interval is the configured interval rendered with Duration.String.
//
// NOTE(review): LastRun and NextRun are tagged omitempty, but encoding/json
// does not treat struct values such as time.Time as empty, so a zero time is
// presumably still emitted rather than omitted — confirm what consumers expect
// before changing the tags.
type Status struct {
	Enabled       bool      `json:"enabled"`
	Running       bool      `json:"running"`
	LastRun       time.Time `json:"last_run,omitempty"`
	LastRunStatus string    `json:"last_run_status,omitempty"`
	NextRun       time.Time `json:"next_run,omitempty"`
	Interval      string    `json:"interval"`
}

// Scheduler runs a crawl callback on a recurring schedule and also allows
// crawls to be triggered manually.
//
// Field roles, as used by the methods in this file:
//   - mu is held around reads and writes of running, lastRun and
//     lastRunStatus.
//   - enabled, interval and crawlFunc are set once by NewScheduler.
//   - running is true while a crawl (scheduled or manual) is in progress and
//     prevents a second crawl from starting.
//   - lastRun and lastRunStatus describe the most recently finished crawl.
//   - stopChan is closed by Stop to ask the run loop to exit.
//   - doneChan is closed by the run loop when it returns, which unblocks Stop.
//
// The struct contains a mutex and must therefore be used through a pointer.
type Scheduler struct {
	mu            sync.RWMutex
	enabled       bool
	interval      time.Duration
	crawlFunc     CrawlFunc
	running       bool
	lastRun       time.Time
	lastRunStatus string
	stopChan      chan struct{}
	doneChan      chan struct{}
}

// Config holds the settings passed to NewScheduler.
//
// Enabled controls whether Start launches the scheduling loop at all; when it
// is false Start only logs a message and Stop returns immediately.
//
// Interval is the spacing between scheduled crawls. Intervals of 24 hours or
// more cause the first crawl to be planned for 2:00 AM (see calculateNextRun);
// shorter intervals get a one-minute start-up delay instead.
type Config struct {
	Enabled  bool
	Interval time.Duration
}

// NewScheduler builds a Scheduler from cfg that invokes crawlFunc for every
// crawl. The stop and done signalling channels are allocated here; no
// goroutine is started until Start is called.
func NewScheduler(cfg Config, crawlFunc CrawlFunc) *Scheduler {
	sched := new(Scheduler)

	sched.enabled = cfg.Enabled
	sched.interval = cfg.Interval
	sched.crawlFunc = crawlFunc

	sched.stopChan = make(chan struct{})
	sched.doneChan = make(chan struct{})

	return sched
}

// Start launches the scheduling loop in its own goroutine when the scheduler
// is enabled. For a disabled scheduler it only logs that fact and returns.
func (s *Scheduler) Start() {
	if s.enabled {
		log.Printf("Scheduler starting with interval: %v", s.interval)
		go s.run()
		return
	}

	log.Println("Scheduler is disabled")
}

// Stop signals the scheduler loop to exit and blocks until it has done so.
// It is a no-op when the scheduler is disabled, and it may be called more
// than once or from several goroutines: only the first call closes stopChan,
// later calls simply wait for the loop to finish as well.
//
// The loop only observes the stop signal between crawls, so Stop waits for a
// scheduled crawl that is currently in progress. Stop also blocks
// indefinitely if Start was never called on an enabled scheduler, because
// doneChan is closed only by the loop itself.
func (s *Scheduler) Stop() {
	s.mu.Lock()
	if !s.enabled {
		s.mu.Unlock()
		return
	}
	// Closing an already closed channel panics, so check under the lock
	// whether an earlier Stop has closed stopChan before closing it here.
	select {
	case <-s.stopChan:
		// Already closed by a previous call.
	default:
		close(s.stopChan)
	}
	s.mu.Unlock()

	<-s.doneChan
	log.Println("Scheduler stopped")
}

// run is the scheduler loop started by Start. It waits for the first slot
// chosen by calculateNextRun, executes a crawl, and then re-arms the timer for
// the following slot until stopChan is closed. doneChan is closed when the
// loop exits so that Stop can return.
//
// Slots are spaced s.interval apart counted from the planned time of the
// previous slot, not from the moment a crawl finishes. Otherwise every run
// would be pushed back by the duration of the preceding crawl and a daily
// schedule would drift away from its 2:00 AM anchor. Slots that elapse while
// a crawl is still in progress are skipped.
func (s *Scheduler) run() {
	defer close(s.doneChan)

	// A non-positive interval would re-arm the timer with no delay and crawl
	// back-to-back, so treat it as a misconfiguration: schedule nothing and
	// only wait for Stop.
	if s.interval <= 0 {
		log.Printf("Scheduler: invalid interval %v, no crawls will be scheduled", s.interval)
		<-s.stopChan
		return
	}

	nextRun := s.calculateNextRun(time.Now())
	log.Printf("Scheduler: first crawl scheduled for %v", nextRun)

	timer := time.NewTimer(time.Until(nextRun))
	defer timer.Stop()

	for {
		select {
		case <-s.stopChan:
			return
		case <-timer.C:
			s.executeCrawl()

			nextRun = nextRun.Add(s.interval)
			if now := time.Now(); !nextRun.After(now) {
				// The crawl overran one or more slots; jump to the first
				// slot that still lies in the future.
				missed := now.Sub(nextRun)/s.interval + 1
				nextRun = nextRun.Add(missed * s.interval)
			}
			// The timer channel was drained by the receive above, so Reset
			// is safe here.
			timer.Reset(time.Until(nextRun))
		}
	}
}

// calculateNextRun returns the time of the first scheduled crawl relative to
// from.
//
// For intervals of 24 hours or more the crawl is anchored to 2:00 AM in
// from's location: today if that moment is still ahead of from, otherwise the
// next calendar day. Shorter intervals get a one-minute start-up delay.
func (s *Scheduler) calculateNextRun(from time.Time) time.Time {
	if s.interval < 24*time.Hour {
		return from.Add(time.Minute)
	}

	year, month, day := from.Date()
	next := time.Date(year, month, day, 2, 0, 0, 0, from.Location())
	if !next.After(from) {
		// Advance by one calendar day instead of a fixed 24 hours so the
		// result keeps the 2:00 AM wall-clock time (where it exists) on days
		// that are 23 or 25 hours long due to a daylight-saving transition.
		next = next.AddDate(0, 0, 1)
	}
	return next
}

// executeCrawl performs one scheduled crawl synchronously on the calling
// goroutine. If another crawl is already marked as running it logs that and
// returns without doing anything. Otherwise it invokes the crawl callback
// with a four-hour timeout and records the start time and outcome.
func (s *Scheduler) executeCrawl() {
	// Claim the running flag, or find out that someone else already holds it.
	s.mu.Lock()
	busy := s.running
	if !busy {
		s.running = true
	}
	s.mu.Unlock()

	if busy {
		log.Println("Scheduler: crawl already running, skipping")
		return
	}

	log.Println("Scheduler: starting scheduled crawl")
	began := time.Now()

	ctx, cancel := context.WithTimeout(context.Background(), 4*time.Hour)
	defer cancel()

	crawlErr := s.crawlFunc(ctx)

	// Publish the result and release the running flag in one critical section.
	s.mu.Lock()
	defer s.mu.Unlock()

	s.running = false
	s.lastRun = began
	if crawlErr == nil {
		s.lastRunStatus = "success"
		log.Printf("Scheduler: crawl completed successfully in %v", time.Since(began))
		return
	}
	s.lastRunStatus = "failed: " + crawlErr.Error()
	log.Printf("Scheduler: crawl failed after %v: %v", time.Since(began), crawlErr)
}

// TriggerCrawl starts a crawl in a background goroutine outside the regular
// schedule and returns immediately. It returns ErrCrawlAlreadyRunning when a
// crawl is already in progress, and nil once the crawl has been launched.
// The crawl runs with a four-hour timeout; its start time and outcome are
// recorded when it finishes.
func (s *Scheduler) TriggerCrawl() error {
	// Claim the running flag, or find out that a crawl already holds it.
	s.mu.Lock()
	busy := s.running
	if !busy {
		s.running = true
	}
	s.mu.Unlock()

	if busy {
		return ErrCrawlAlreadyRunning
	}

	log.Println("Scheduler: manual crawl triggered")

	crawl := func() {
		began := time.Now()

		ctx, cancel := context.WithTimeout(context.Background(), 4*time.Hour)
		defer cancel()

		crawlErr := s.crawlFunc(ctx)

		// Publish the result and release the running flag together.
		s.mu.Lock()
		defer s.mu.Unlock()

		s.running = false
		s.lastRun = began
		if crawlErr == nil {
			s.lastRunStatus = "success"
			log.Printf("Scheduler: manual crawl completed successfully in %v", time.Since(began))
			return
		}
		s.lastRunStatus = "failed: " + crawlErr.Error()
		log.Printf("Scheduler: manual crawl failed after %v: %v", time.Since(began), crawlErr)
	}
	go crawl()

	return nil
}

// Status returns a snapshot of the scheduler state taken under the read lock.
// NextRun is filled in as the last run's start time plus the interval, and
// only when the scheduler is enabled and a crawl has already finished;
// otherwise it is left at its zero value.
func (s *Scheduler) Status() Status {
	s.mu.RLock()
	defer s.mu.RUnlock()

	snapshot := Status{
		Enabled:       s.enabled,
		Running:       s.running,
		LastRun:       s.lastRun,
		LastRunStatus: s.lastRunStatus,
		Interval:      s.interval.String(),
	}

	// Without a previous run (or with scheduling disabled) there is nothing
	// to base a next-run estimate on.
	if !s.enabled || s.lastRun.IsZero() {
		return snapshot
	}

	snapshot.NextRun = s.lastRun.Add(s.interval)
	return snapshot
}

// IsRunning reports whether a crawl, scheduled or manual, is currently in
// progress. The flag is read under the read lock.
func (s *Scheduler) IsRunning() bool {
	s.mu.RLock()
	active := s.running
	s.mu.RUnlock()

	return active
}

// SchedulerError is a string-backed error type. Because its underlying type
// is string, the package's sentinel errors can be declared as constants.
type SchedulerError string

// Error implements the error interface by returning the underlying string.
func (e SchedulerError) Error() string {
	return string(e)
}

// ErrCrawlAlreadyRunning is returned by TriggerCrawl when a crawl is already
// in progress.
const ErrCrawlAlreadyRunning SchedulerError = "crawl already running"