All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Successful in 1m45s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 21s
- edu-search-service von breakpilot-pwa nach breakpilot-lehrer kopiert (ohne vendor) - opensearch + edu-search-service in docker-compose.yml hinzugefuegt - voice-service aus docker-compose.yml entfernt (jetzt in breakpilot-core) - geo-service aus docker-compose.yml entfernt (nicht mehr benoetigt) - CI/CD: edu-search-service zu Gitea Actions und Woodpecker hinzugefuegt (Go lint, test mit go mod download, build, SBOM) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
408 lines
13 KiB
Go
408 lines
13 KiB
Go
// Package orchestrator implements multi-phase university crawling with queue management
|
|
package orchestrator
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"log"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/google/uuid"
|
|
)
|
|
|
|
// CrawlPhase names a stage in the lifecycle of a university crawl. The
// string value is what appears in JSON via the structs that embed it.
type CrawlPhase string

// Known crawl phases. The orchestrator loop in this file advances an item
// from pending through discovery, professors, all_staff and publications
// before marking it completed; failed and paused are off-path states.
const (
	// PhasePending marks an item that has not started crawling yet.
	PhasePending CrawlPhase = "pending"

	// PhaseDiscovery finds a sample professor to validate that crawling works.
	PhaseDiscovery CrawlPhase = "discovery"

	// PhaseProfessors crawls all professors.
	PhaseProfessors CrawlPhase = "professors"

	// PhaseAllStaff crawls all staff members.
	PhaseAllStaff CrawlPhase = "all_staff"

	// PhasePublications crawls publications for all staff.
	PhasePublications CrawlPhase = "publications"

	// PhaseCompleted marks an item whose crawl has finished.
	PhaseCompleted CrawlPhase = "completed"

	// PhaseFailed marks an item that exhausted its retries.
	PhaseFailed CrawlPhase = "failed"

	// PhasePaused marks an item the orchestrator loop must skip.
	PhasePaused CrawlPhase = "paused"
)
|
|
|
|
// CrawlQueueItem represents a university in the crawl queue
|
|
type CrawlQueueItem struct {
|
|
ID uuid.UUID `json:"id"`
|
|
UniversityID uuid.UUID `json:"university_id"`
|
|
UniversityName string `json:"university_name"`
|
|
UniversityShort string `json:"university_short"`
|
|
QueuePosition *int `json:"queue_position"`
|
|
Priority int `json:"priority"`
|
|
CurrentPhase CrawlPhase `json:"current_phase"`
|
|
DiscoveryCompleted bool `json:"discovery_completed"`
|
|
DiscoveryCompletedAt *time.Time `json:"discovery_completed_at,omitempty"`
|
|
ProfessorsCompleted bool `json:"professors_completed"`
|
|
ProfessorsCompletedAt *time.Time `json:"professors_completed_at,omitempty"`
|
|
AllStaffCompleted bool `json:"all_staff_completed"`
|
|
AllStaffCompletedAt *time.Time `json:"all_staff_completed_at,omitempty"`
|
|
PublicationsCompleted bool `json:"publications_completed"`
|
|
PublicationsCompletedAt *time.Time `json:"publications_completed_at,omitempty"`
|
|
DiscoveryCount int `json:"discovery_count"`
|
|
ProfessorsCount int `json:"professors_count"`
|
|
StaffCount int `json:"staff_count"`
|
|
PublicationsCount int `json:"publications_count"`
|
|
RetryCount int `json:"retry_count"`
|
|
MaxRetries int `json:"max_retries"`
|
|
LastError string `json:"last_error,omitempty"`
|
|
StartedAt *time.Time `json:"started_at,omitempty"`
|
|
CompletedAt *time.Time `json:"completed_at,omitempty"`
|
|
ProgressPercent int `json:"progress_percent"`
|
|
CreatedAt time.Time `json:"created_at"`
|
|
UpdatedAt time.Time `json:"updated_at"`
|
|
}
|
|
|
|
// CrawlProgress represents progress for a single phase
|
|
type CrawlProgress struct {
|
|
Phase CrawlPhase `json:"phase"`
|
|
ItemsFound int `json:"items_found"`
|
|
ItemsProcessed int `json:"items_processed"`
|
|
Errors []string `json:"errors,omitempty"`
|
|
StartedAt time.Time `json:"started_at"`
|
|
CompletedAt *time.Time `json:"completed_at,omitempty"`
|
|
}
|
|
|
|
// OrchestratorStatus represents the current state of the orchestrator
|
|
type OrchestratorStatus struct {
|
|
IsRunning bool `json:"is_running"`
|
|
CurrentUniversity *CrawlQueueItem `json:"current_university,omitempty"`
|
|
CurrentPhase CrawlPhase `json:"current_phase"`
|
|
QueueLength int `json:"queue_length"`
|
|
CompletedToday int `json:"completed_today"`
|
|
TotalProcessed int `json:"total_processed"`
|
|
LastActivity *time.Time `json:"last_activity,omitempty"`
|
|
}
|
|
|
|
// StaffCrawlerInterface defines what the staff crawler must implement
|
|
type StaffCrawlerInterface interface {
|
|
// DiscoverSampleProfessor finds at least one professor to validate crawling works
|
|
DiscoverSampleProfessor(ctx context.Context, universityID uuid.UUID) (*CrawlProgress, error)
|
|
// CrawlProfessors crawls all professors at a university
|
|
CrawlProfessors(ctx context.Context, universityID uuid.UUID) (*CrawlProgress, error)
|
|
// CrawlAllStaff crawls all staff members at a university
|
|
CrawlAllStaff(ctx context.Context, universityID uuid.UUID) (*CrawlProgress, error)
|
|
}
|
|
|
|
// PublicationCrawlerInterface defines what the publication crawler must implement
|
|
type PublicationCrawlerInterface interface {
|
|
// CrawlPublicationsForUniversity crawls publications for all staff at a university
|
|
CrawlPublicationsForUniversity(ctx context.Context, universityID uuid.UUID) (*CrawlProgress, error)
|
|
}
|
|
|
|
// Repository defines database operations for the orchestrator
|
|
type Repository interface {
|
|
// Queue operations
|
|
GetQueueItems(ctx context.Context) ([]CrawlQueueItem, error)
|
|
GetNextInQueue(ctx context.Context) (*CrawlQueueItem, error)
|
|
AddToQueue(ctx context.Context, universityID uuid.UUID, priority int, initiatedBy string) (*CrawlQueueItem, error)
|
|
RemoveFromQueue(ctx context.Context, universityID uuid.UUID) error
|
|
UpdateQueueItem(ctx context.Context, item *CrawlQueueItem) error
|
|
PauseQueueItem(ctx context.Context, universityID uuid.UUID) error
|
|
ResumeQueueItem(ctx context.Context, universityID uuid.UUID) error
|
|
|
|
// Phase updates
|
|
CompletePhase(ctx context.Context, universityID uuid.UUID, phase CrawlPhase, count int) error
|
|
FailPhase(ctx context.Context, universityID uuid.UUID, phase CrawlPhase, err string) error
|
|
|
|
// Stats
|
|
GetCompletedTodayCount(ctx context.Context) (int, error)
|
|
GetTotalProcessedCount(ctx context.Context) (int, error)
|
|
}
|
|
|
|
// Orchestrator manages the multi-phase crawl process
|
|
type Orchestrator struct {
|
|
repo Repository
|
|
staffCrawler StaffCrawlerInterface
|
|
pubCrawler PublicationCrawlerInterface
|
|
|
|
// Runtime state
|
|
mu sync.RWMutex
|
|
isRunning bool
|
|
stopChan chan struct{}
|
|
currentItem *CrawlQueueItem
|
|
lastActivity time.Time
|
|
|
|
// Configuration
|
|
phaseCooldown time.Duration // Wait time between phases
|
|
retryCooldown time.Duration // Wait time after failure before retry
|
|
maxConcurrent int // Max concurrent crawls (always 1 for now)
|
|
}
|
|
|
|
// NewOrchestrator creates a new orchestrator instance
|
|
func NewOrchestrator(repo Repository, staffCrawler StaffCrawlerInterface, pubCrawler PublicationCrawlerInterface) *Orchestrator {
|
|
return &Orchestrator{
|
|
repo: repo,
|
|
staffCrawler: staffCrawler,
|
|
pubCrawler: pubCrawler,
|
|
phaseCooldown: 5 * time.Second, // Small pause between phases
|
|
retryCooldown: 30 * time.Second, // Wait before retry after failure
|
|
maxConcurrent: 1, // Sequential processing
|
|
}
|
|
}
|
|
|
|
// Start begins the orchestrator loop
|
|
func (o *Orchestrator) Start() error {
|
|
o.mu.Lock()
|
|
if o.isRunning {
|
|
o.mu.Unlock()
|
|
return fmt.Errorf("orchestrator already running")
|
|
}
|
|
o.isRunning = true
|
|
o.stopChan = make(chan struct{})
|
|
o.mu.Unlock()
|
|
|
|
log.Println("[Orchestrator] Starting crawl orchestration loop")
|
|
|
|
go o.runLoop()
|
|
return nil
|
|
}
|
|
|
|
// Stop gracefully stops the orchestrator
|
|
func (o *Orchestrator) Stop() error {
|
|
o.mu.Lock()
|
|
if !o.isRunning {
|
|
o.mu.Unlock()
|
|
return fmt.Errorf("orchestrator not running")
|
|
}
|
|
close(o.stopChan)
|
|
o.isRunning = false
|
|
o.mu.Unlock()
|
|
|
|
log.Println("[Orchestrator] Stopped")
|
|
return nil
|
|
}
|
|
|
|
// Status returns the current orchestrator status
|
|
func (o *Orchestrator) Status(ctx context.Context) (*OrchestratorStatus, error) {
|
|
o.mu.RLock()
|
|
defer o.mu.RUnlock()
|
|
|
|
status := &OrchestratorStatus{
|
|
IsRunning: o.isRunning,
|
|
CurrentPhase: PhasePending,
|
|
}
|
|
|
|
if o.currentItem != nil {
|
|
status.CurrentUniversity = o.currentItem
|
|
status.CurrentPhase = o.currentItem.CurrentPhase
|
|
}
|
|
|
|
if !o.lastActivity.IsZero() {
|
|
status.LastActivity = &o.lastActivity
|
|
}
|
|
|
|
// Get queue stats from DB
|
|
items, err := o.repo.GetQueueItems(ctx)
|
|
if err == nil {
|
|
status.QueueLength = len(items)
|
|
}
|
|
|
|
completedToday, _ := o.repo.GetCompletedTodayCount(ctx)
|
|
status.CompletedToday = completedToday
|
|
|
|
totalProcessed, _ := o.repo.GetTotalProcessedCount(ctx)
|
|
status.TotalProcessed = totalProcessed
|
|
|
|
return status, nil
|
|
}
|
|
|
|
// AddUniversity adds a university to the crawl queue
|
|
func (o *Orchestrator) AddUniversity(ctx context.Context, universityID uuid.UUID, priority int, initiatedBy string) (*CrawlQueueItem, error) {
|
|
item, err := o.repo.AddToQueue(ctx, universityID, priority, initiatedBy)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to add to queue: %w", err)
|
|
}
|
|
|
|
log.Printf("[Orchestrator] Added university %s to queue with priority %d", universityID, priority)
|
|
return item, nil
|
|
}
|
|
|
|
// RemoveUniversity removes a university from the queue
|
|
func (o *Orchestrator) RemoveUniversity(ctx context.Context, universityID uuid.UUID) error {
|
|
return o.repo.RemoveFromQueue(ctx, universityID)
|
|
}
|
|
|
|
// PauseUniversity pauses crawling for a university
|
|
func (o *Orchestrator) PauseUniversity(ctx context.Context, universityID uuid.UUID) error {
|
|
return o.repo.PauseQueueItem(ctx, universityID)
|
|
}
|
|
|
|
// ResumeUniversity resumes crawling for a paused university
|
|
func (o *Orchestrator) ResumeUniversity(ctx context.Context, universityID uuid.UUID) error {
|
|
return o.repo.ResumeQueueItem(ctx, universityID)
|
|
}
|
|
|
|
// GetQueue returns all items in the queue
|
|
func (o *Orchestrator) GetQueue(ctx context.Context) ([]CrawlQueueItem, error) {
|
|
return o.repo.GetQueueItems(ctx)
|
|
}
|
|
|
|
// runLoop is the main orchestration loop
|
|
func (o *Orchestrator) runLoop() {
|
|
ticker := time.NewTicker(10 * time.Second) // Check queue every 10 seconds
|
|
defer ticker.Stop()
|
|
|
|
for {
|
|
select {
|
|
case <-o.stopChan:
|
|
return
|
|
case <-ticker.C:
|
|
o.processNextInQueue()
|
|
}
|
|
}
|
|
}
|
|
|
|
// processNextInQueue processes the next university in the queue
|
|
func (o *Orchestrator) processNextInQueue() {
|
|
ctx := context.Background()
|
|
|
|
// Get next item in queue
|
|
item, err := o.repo.GetNextInQueue(ctx)
|
|
if err != nil {
|
|
log.Printf("[Orchestrator] Error getting next item: %v", err)
|
|
return
|
|
}
|
|
|
|
if item == nil {
|
|
// No items to process
|
|
return
|
|
}
|
|
|
|
// Check if paused
|
|
if item.CurrentPhase == PhasePaused {
|
|
return
|
|
}
|
|
|
|
// Set current item
|
|
o.mu.Lock()
|
|
o.currentItem = item
|
|
o.lastActivity = time.Now()
|
|
o.mu.Unlock()
|
|
|
|
defer func() {
|
|
o.mu.Lock()
|
|
o.currentItem = nil
|
|
o.mu.Unlock()
|
|
}()
|
|
|
|
log.Printf("[Orchestrator] Processing university: %s (Phase: %s)", item.UniversityName, item.CurrentPhase)
|
|
|
|
// Process based on current phase
|
|
switch item.CurrentPhase {
|
|
case PhasePending:
|
|
o.runPhase(ctx, item, PhaseDiscovery)
|
|
case PhaseDiscovery:
|
|
if item.DiscoveryCompleted {
|
|
o.runPhase(ctx, item, PhaseProfessors)
|
|
} else {
|
|
o.runPhase(ctx, item, PhaseDiscovery)
|
|
}
|
|
case PhaseProfessors:
|
|
if item.ProfessorsCompleted {
|
|
o.runPhase(ctx, item, PhaseAllStaff)
|
|
} else {
|
|
o.runPhase(ctx, item, PhaseProfessors)
|
|
}
|
|
case PhaseAllStaff:
|
|
if item.AllStaffCompleted {
|
|
o.runPhase(ctx, item, PhasePublications)
|
|
} else {
|
|
o.runPhase(ctx, item, PhaseAllStaff)
|
|
}
|
|
case PhasePublications:
|
|
if item.PublicationsCompleted {
|
|
o.completeUniversity(ctx, item)
|
|
} else {
|
|
o.runPhase(ctx, item, PhasePublications)
|
|
}
|
|
}
|
|
}
|
|
|
|
// runPhase executes a specific crawl phase
|
|
func (o *Orchestrator) runPhase(ctx context.Context, item *CrawlQueueItem, phase CrawlPhase) {
|
|
log.Printf("[Orchestrator] Running phase %s for %s", phase, item.UniversityName)
|
|
|
|
// Update current phase
|
|
item.CurrentPhase = phase
|
|
if err := o.repo.UpdateQueueItem(ctx, item); err != nil {
|
|
log.Printf("[Orchestrator] Failed to update phase: %v", err)
|
|
return
|
|
}
|
|
|
|
var progress *CrawlProgress
|
|
var err error
|
|
|
|
// Execute phase
|
|
switch phase {
|
|
case PhaseDiscovery:
|
|
progress, err = o.staffCrawler.DiscoverSampleProfessor(ctx, item.UniversityID)
|
|
case PhaseProfessors:
|
|
progress, err = o.staffCrawler.CrawlProfessors(ctx, item.UniversityID)
|
|
case PhaseAllStaff:
|
|
progress, err = o.staffCrawler.CrawlAllStaff(ctx, item.UniversityID)
|
|
case PhasePublications:
|
|
progress, err = o.pubCrawler.CrawlPublicationsForUniversity(ctx, item.UniversityID)
|
|
}
|
|
|
|
// Handle result
|
|
if err != nil {
|
|
log.Printf("[Orchestrator] Phase %s failed: %v", phase, err)
|
|
o.handlePhaseFailure(ctx, item, phase, err)
|
|
return
|
|
}
|
|
|
|
// Mark phase complete
|
|
count := 0
|
|
if progress != nil {
|
|
count = progress.ItemsFound
|
|
}
|
|
|
|
if err := o.repo.CompletePhase(ctx, item.UniversityID, phase, count); err != nil {
|
|
log.Printf("[Orchestrator] Failed to complete phase: %v", err)
|
|
}
|
|
|
|
log.Printf("[Orchestrator] Phase %s completed for %s (found: %d)", phase, item.UniversityName, count)
|
|
|
|
// Wait before next phase
|
|
time.Sleep(o.phaseCooldown)
|
|
}
|
|
|
|
// handlePhaseFailure handles a phase failure
|
|
func (o *Orchestrator) handlePhaseFailure(ctx context.Context, item *CrawlQueueItem, phase CrawlPhase, err error) {
|
|
item.RetryCount++
|
|
item.LastError = err.Error()
|
|
|
|
if item.RetryCount >= item.MaxRetries {
|
|
// Max retries reached, mark as failed
|
|
item.CurrentPhase = PhaseFailed
|
|
log.Printf("[Orchestrator] University %s failed after %d retries", item.UniversityName, item.RetryCount)
|
|
}
|
|
|
|
if updateErr := o.repo.FailPhase(ctx, item.UniversityID, phase, err.Error()); updateErr != nil {
|
|
log.Printf("[Orchestrator] Failed to update failure status: %v", updateErr)
|
|
}
|
|
|
|
// Wait before potential retry
|
|
time.Sleep(o.retryCooldown)
|
|
}
|
|
|
|
// completeUniversity marks a university as fully crawled
|
|
func (o *Orchestrator) completeUniversity(ctx context.Context, item *CrawlQueueItem) {
|
|
now := time.Now()
|
|
item.CurrentPhase = PhaseCompleted
|
|
item.CompletedAt = &now
|
|
item.QueuePosition = nil // Remove from active queue
|
|
|
|
if err := o.repo.UpdateQueueItem(ctx, item); err != nil {
|
|
log.Printf("[Orchestrator] Failed to complete university: %v", err)
|
|
return
|
|
}
|
|
|
|
log.Printf("[Orchestrator] University %s completed! Professors: %d, Staff: %d, Publications: %d",
|
|
item.UniversityName, item.ProfessorsCount, item.StaffCount, item.PublicationsCount)
|
|
}
|