package scheduler import ( "context" "log" "sync" "time" ) // CrawlFunc is the function signature for executing a crawl type CrawlFunc func(ctx context.Context) error // Status represents the current scheduler status type Status struct { Enabled bool `json:"enabled"` Running bool `json:"running"` LastRun time.Time `json:"last_run,omitempty"` LastRunStatus string `json:"last_run_status,omitempty"` NextRun time.Time `json:"next_run,omitempty"` Interval string `json:"interval"` } // Scheduler handles automatic crawl scheduling type Scheduler struct { mu sync.RWMutex enabled bool interval time.Duration crawlFunc CrawlFunc running bool lastRun time.Time lastRunStatus string stopChan chan struct{} doneChan chan struct{} } // Config holds scheduler configuration type Config struct { Enabled bool Interval time.Duration } // NewScheduler creates a new crawler scheduler func NewScheduler(cfg Config, crawlFunc CrawlFunc) *Scheduler { return &Scheduler{ enabled: cfg.Enabled, interval: cfg.Interval, crawlFunc: crawlFunc, stopChan: make(chan struct{}), doneChan: make(chan struct{}), } } // Start begins the scheduler loop func (s *Scheduler) Start() { if !s.enabled { log.Println("Scheduler is disabled") return } log.Printf("Scheduler starting with interval: %v", s.interval) go s.run() } // Stop gracefully stops the scheduler func (s *Scheduler) Stop() { s.mu.Lock() if !s.enabled { s.mu.Unlock() return } s.mu.Unlock() close(s.stopChan) <-s.doneChan log.Println("Scheduler stopped") } // run is the main scheduler loop func (s *Scheduler) run() { defer close(s.doneChan) // Calculate time until first run // Default: run at 2:00 AM to minimize impact now := time.Now() nextRun := s.calculateNextRun(now) log.Printf("Scheduler: first crawl scheduled for %v", nextRun) timer := time.NewTimer(time.Until(nextRun)) defer timer.Stop() for { select { case <-s.stopChan: return case <-timer.C: s.executeCrawl() // Schedule next run nextRun = time.Now().Add(s.interval) timer.Reset(s.interval) } } } // calculateNextRun determines when the next crawl should occur func (s *Scheduler) calculateNextRun(from time.Time) time.Time { // If interval is 24h or more, schedule for 2:00 AM if s.interval >= 24*time.Hour { next := time.Date(from.Year(), from.Month(), from.Day(), 2, 0, 0, 0, from.Location()) if next.Before(from) || next.Equal(from) { next = next.Add(24 * time.Hour) } return next } // For shorter intervals, start immediately return from.Add(1 * time.Minute) } // executeCrawl runs the crawl function func (s *Scheduler) executeCrawl() { s.mu.Lock() if s.running { s.mu.Unlock() log.Println("Scheduler: crawl already running, skipping") return } s.running = true s.mu.Unlock() log.Println("Scheduler: starting scheduled crawl") startTime := time.Now() ctx, cancel := context.WithTimeout(context.Background(), 4*time.Hour) defer cancel() err := s.crawlFunc(ctx) s.mu.Lock() s.running = false s.lastRun = startTime if err != nil { s.lastRunStatus = "failed: " + err.Error() log.Printf("Scheduler: crawl failed after %v: %v", time.Since(startTime), err) } else { s.lastRunStatus = "success" log.Printf("Scheduler: crawl completed successfully in %v", time.Since(startTime)) } s.mu.Unlock() } // TriggerCrawl manually triggers a crawl func (s *Scheduler) TriggerCrawl() error { s.mu.Lock() if s.running { s.mu.Unlock() return ErrCrawlAlreadyRunning } s.running = true s.mu.Unlock() log.Println("Scheduler: manual crawl triggered") go func() { startTime := time.Now() ctx, cancel := context.WithTimeout(context.Background(), 4*time.Hour) defer cancel() err := s.crawlFunc(ctx) s.mu.Lock() s.running = false s.lastRun = startTime if err != nil { s.lastRunStatus = "failed: " + err.Error() log.Printf("Scheduler: manual crawl failed after %v: %v", time.Since(startTime), err) } else { s.lastRunStatus = "success" log.Printf("Scheduler: manual crawl completed successfully in %v", time.Since(startTime)) } s.mu.Unlock() }() return nil } // Status returns the current scheduler status func (s *Scheduler) Status() Status { s.mu.RLock() defer s.mu.RUnlock() status := Status{ Enabled: s.enabled, Running: s.running, LastRun: s.lastRun, LastRunStatus: s.lastRunStatus, Interval: s.interval.String(), } if s.enabled && !s.lastRun.IsZero() { status.NextRun = s.lastRun.Add(s.interval) } return status } // IsRunning returns true if a crawl is currently in progress func (s *Scheduler) IsRunning() bool { s.mu.RLock() defer s.mu.RUnlock() return s.running } // Errors type SchedulerError string func (e SchedulerError) Error() string { return string(e) } const ( ErrCrawlAlreadyRunning = SchedulerError("crawl already running") )