// Package staff provides university staff crawling functionality package staff import ( "context" "fmt" "log" "time" "github.com/google/uuid" "github.com/breakpilot/edu-search-service/internal/database" "github.com/breakpilot/edu-search-service/internal/orchestrator" ) // OrchestratorAdapter adapts the StaffCrawler to the orchestrator.StaffCrawlerInterface // This bridges the gap between the generic StaffCrawler and the multi-phase orchestrator type OrchestratorAdapter struct { crawler *StaffCrawler repo *database.Repository } // NewOrchestratorAdapter creates a new adapter that connects StaffCrawler to the orchestrator func NewOrchestratorAdapter(crawler *StaffCrawler, repo *database.Repository) *OrchestratorAdapter { return &OrchestratorAdapter{ crawler: crawler, repo: repo, } } // DiscoverSampleProfessor finds at least one professor to validate crawling works for this university // This is Phase 1: Quick validation that the university website is crawlable func (a *OrchestratorAdapter) DiscoverSampleProfessor(ctx context.Context, universityID uuid.UUID) (*orchestrator.CrawlProgress, error) { start := time.Now() progress := &orchestrator.CrawlProgress{ Phase: orchestrator.PhaseDiscovery, StartedAt: start, } log.Printf("[OrchestratorAdapter] Discovery phase for university %s", universityID) // Get university from database uni, err := a.repo.GetUniversityByID(ctx, universityID) if err != nil { progress.Errors = append(progress.Errors, fmt.Sprintf("Failed to get university: %v", err)) return progress, fmt.Errorf("failed to get university: %w", err) } if uni == nil { progress.Errors = append(progress.Errors, "University not found") return progress, fmt.Errorf("university not found: %s", universityID) } log.Printf("[OrchestratorAdapter] Discovering staff pages for %s (%s)", uni.Name, uni.URL) // Use the crawler to find staff pages (discovery phase) staffPages, err := a.crawler.findStaffPages(ctx, uni) if err != nil { progress.Errors = append(progress.Errors, fmt.Sprintf("Failed to find staff pages: %v", err)) return progress, fmt.Errorf("failed to find staff pages: %w", err) } log.Printf("[OrchestratorAdapter] Found %d staff pages for %s", len(staffPages), uni.Name) // Try to extract at least one professor as validation var sampleFound int for _, pageURL := range staffPages { if sampleFound > 0 { break // We just need to validate one works } staffMembers, err := a.crawler.extractStaffFromPage(ctx, pageURL, uni) if err != nil { log.Printf("[OrchestratorAdapter] Error extracting from %s: %v", pageURL, err) continue } // Count professors found for _, staff := range staffMembers { if staff.IsProfessor { sampleFound++ log.Printf("[OrchestratorAdapter] Found sample professor: %s %s", stringValue(staff.FirstName), staff.LastName) break } } // Even non-professors validate the crawler works if sampleFound == 0 && len(staffMembers) > 0 { sampleFound = 1 log.Printf("[OrchestratorAdapter] Found sample staff member (not professor): %s %s", stringValue(staffMembers[0].FirstName), staffMembers[0].LastName) } } progress.ItemsFound = len(staffPages) // Number of crawlable pages found now := time.Now() progress.CompletedAt = &now if sampleFound == 0 && len(staffPages) > 0 { // Pages found but no staff extracted - still consider it successful log.Printf("[OrchestratorAdapter] Discovery completed: %d pages found, extraction may need tuning", len(staffPages)) } else if sampleFound == 0 { progress.Errors = append(progress.Errors, "No staff pages found") return progress, fmt.Errorf("no staff pages found for %s", uni.Name) } log.Printf("[OrchestratorAdapter] Discovery completed for %s: %d pages found", uni.Name, len(staffPages)) return progress, nil } // CrawlProfessors crawls all professors at a university // This is Phase 2: Focus on finding professors specifically func (a *OrchestratorAdapter) CrawlProfessors(ctx context.Context, universityID uuid.UUID) (*orchestrator.CrawlProgress, error) { start := time.Now() progress := &orchestrator.CrawlProgress{ Phase: orchestrator.PhaseProfessors, StartedAt: start, } log.Printf("[OrchestratorAdapter] Professors phase for university %s", universityID) // Get university uni, err := a.repo.GetUniversityByID(ctx, universityID) if err != nil || uni == nil { progress.Errors = append(progress.Errors, fmt.Sprintf("Failed to get university: %v", err)) return progress, fmt.Errorf("failed to get university: %w", err) } // Perform full crawl result, err := a.crawler.CrawlUniversity(ctx, uni) if err != nil { progress.Errors = append(progress.Errors, fmt.Sprintf("Crawl failed: %v", err)) return progress, err } // Count professors specifically professorCount := 0 staffList, err := a.repo.SearchStaff(ctx, database.StaffSearchParams{ UniversityID: &universityID, IsProfessor: boolPtr(true), Limit: 10000, }) if err == nil { professorCount = staffList.Total } progress.ItemsFound = professorCount progress.ItemsProcessed = result.StaffFound progress.Errors = result.Errors now := time.Now() progress.CompletedAt = &now log.Printf("[OrchestratorAdapter] Professors phase completed for %s: %d professors found", uni.Name, professorCount) return progress, nil } // CrawlAllStaff crawls all staff members at a university // This is Phase 3: Get all staff (already done in Phase 2, but we verify/extend) func (a *OrchestratorAdapter) CrawlAllStaff(ctx context.Context, universityID uuid.UUID) (*orchestrator.CrawlProgress, error) { start := time.Now() progress := &orchestrator.CrawlProgress{ Phase: orchestrator.PhaseAllStaff, StartedAt: start, } log.Printf("[OrchestratorAdapter] All Staff phase for university %s", universityID) // Get university uni, err := a.repo.GetUniversityByID(ctx, universityID) if err != nil || uni == nil { progress.Errors = append(progress.Errors, fmt.Sprintf("Failed to get university: %v", err)) return progress, fmt.Errorf("failed to get university: %w", err) } // Run another crawl pass to catch any missed staff result, err := a.crawler.CrawlUniversity(ctx, uni) if err != nil { progress.Errors = result.Errors // Don't fail completely - we may have some staff already log.Printf("[OrchestratorAdapter] All Staff crawl had errors: %v", err) } // Get total staff count staffCount := 0 staffList, err := a.repo.SearchStaff(ctx, database.StaffSearchParams{ UniversityID: &universityID, Limit: 1, // Just need count }) if err == nil { staffCount = staffList.Total } progress.ItemsFound = staffCount if result != nil { progress.ItemsProcessed = result.StaffFound progress.Errors = result.Errors } now := time.Now() progress.CompletedAt = &now log.Printf("[OrchestratorAdapter] All Staff phase completed for %s: %d total staff", uni.Name, staffCount) return progress, nil } // Helper functions func stringValue(s *string) string { if s == nil { return "" } return *s } func boolPtr(b bool) *bool { return &b }