This repository has been archived on 2026-02-15. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
breakpilot-pwa/edu-search-service/internal/staff/orchestrator_adapter.go
Benjamin Admin 21a844cb8a fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.

This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).

Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-09 09:51:32 +01:00

218 lines
7.0 KiB
Go

// Package staff provides university staff crawling functionality
package staff
import (
"context"
"fmt"
"log"
"time"
"github.com/google/uuid"
"github.com/breakpilot/edu-search-service/internal/database"
"github.com/breakpilot/edu-search-service/internal/orchestrator"
)
// OrchestratorAdapter adapts the StaffCrawler to the orchestrator.StaffCrawlerInterface.
// This bridges the gap between the generic StaffCrawler and the multi-phase orchestrator:
// each interface method maps one orchestrator phase (discovery, professors, all-staff)
// onto the crawler and reports results via *orchestrator.CrawlProgress.
type OrchestratorAdapter struct {
	crawler *StaffCrawler        // performs the actual page discovery and staff extraction
	repo    *database.Repository // used to resolve universities and count stored staff
}
// NewOrchestratorAdapter creates a new adapter that connects StaffCrawler to the orchestrator.
// Both dependencies are stored as-is; the adapter performs no validation on them.
func NewOrchestratorAdapter(crawler *StaffCrawler, repo *database.Repository) *OrchestratorAdapter {
	adapter := &OrchestratorAdapter{crawler: crawler, repo: repo}
	return adapter
}
// DiscoverSampleProfessor finds at least one professor to validate crawling works for this university.
// This is Phase 1: Quick validation that the university website is crawlable.
//
// It resolves the university, asks the crawler for candidate staff pages, and then
// attempts to extract one staff member (professor preferred) as proof of crawlability.
// ItemsFound reports the number of staff pages discovered, not staff members.
func (a *OrchestratorAdapter) DiscoverSampleProfessor(ctx context.Context, universityID uuid.UUID) (*orchestrator.CrawlProgress, error) {
	began := time.Now()
	progress := &orchestrator.CrawlProgress{
		Phase:     orchestrator.PhaseDiscovery,
		StartedAt: began,
	}
	log.Printf("[OrchestratorAdapter] Discovery phase for university %s", universityID)

	// Resolve the university record; distinguish lookup failure from absence.
	uni, err := a.repo.GetUniversityByID(ctx, universityID)
	if err != nil {
		progress.Errors = append(progress.Errors, fmt.Sprintf("Failed to get university: %v", err))
		return progress, fmt.Errorf("failed to get university: %w", err)
	}
	if uni == nil {
		progress.Errors = append(progress.Errors, "University not found")
		return progress, fmt.Errorf("university not found: %s", universityID)
	}
	log.Printf("[OrchestratorAdapter] Discovering staff pages for %s (%s)", uni.Name, uni.URL)

	// Discovery: locate candidate staff listing pages on the university site.
	staffPages, err := a.crawler.findStaffPages(ctx, uni)
	if err != nil {
		progress.Errors = append(progress.Errors, fmt.Sprintf("Failed to find staff pages: %v", err))
		return progress, fmt.Errorf("failed to find staff pages: %w", err)
	}
	log.Printf("[OrchestratorAdapter] Found %d staff pages for %s", len(staffPages), uni.Name)

	// Validation: extracting a single person from any page is enough.
	validated := 0
	for _, page := range staffPages {
		members, extractErr := a.crawler.extractStaffFromPage(ctx, page, uni)
		if extractErr != nil {
			log.Printf("[OrchestratorAdapter] Error extracting from %s: %v", page, extractErr)
			continue
		}
		// Prefer a professor as the validation sample.
		for _, member := range members {
			if member.IsProfessor {
				validated++
				log.Printf("[OrchestratorAdapter] Found sample professor: %s %s",
					stringValue(member.FirstName), member.LastName)
				break
			}
		}
		// Any extracted staff member still proves the crawler works here.
		if validated == 0 && len(members) > 0 {
			validated = 1
			log.Printf("[OrchestratorAdapter] Found sample staff member (not professor): %s %s",
				stringValue(members[0].FirstName), members[0].LastName)
		}
		if validated > 0 {
			break // one successful extraction is all the validation we need
		}
	}

	progress.ItemsFound = len(staffPages) // number of crawlable pages found
	finished := time.Now()
	progress.CompletedAt = &finished

	switch {
	case validated == 0 && len(staffPages) > 0:
		// Pages exist but nothing was extracted - treat as success; extraction may need tuning.
		log.Printf("[OrchestratorAdapter] Discovery completed: %d pages found, extraction may need tuning", len(staffPages))
	case validated == 0:
		progress.Errors = append(progress.Errors, "No staff pages found")
		return progress, fmt.Errorf("no staff pages found for %s", uni.Name)
	}
	log.Printf("[OrchestratorAdapter] Discovery completed for %s: %d pages found", uni.Name, len(staffPages))
	return progress, nil
}
// CrawlProfessors crawls all professors at a university.
// This is Phase 2: Focus on finding professors specifically.
//
// It runs a full crawl via the underlying StaffCrawler, then queries the
// repository for how many professors are now stored for this university.
// ItemsFound is the stored professor count; ItemsProcessed is the number of
// staff the crawl itself reported.
func (a *OrchestratorAdapter) CrawlProfessors(ctx context.Context, universityID uuid.UUID) (*orchestrator.CrawlProgress, error) {
	start := time.Now()
	progress := &orchestrator.CrawlProgress{
		Phase:     orchestrator.PhaseProfessors,
		StartedAt: start,
	}
	log.Printf("[OrchestratorAdapter] Professors phase for university %s", universityID)
	// Get university. Distinguish a lookup error from a missing record:
	// the original combined check wrapped a nil error with %w when uni was
	// nil, which renders as "%!w(<nil>)" and misreports the cause.
	uni, err := a.repo.GetUniversityByID(ctx, universityID)
	if err != nil {
		progress.Errors = append(progress.Errors, fmt.Sprintf("Failed to get university: %v", err))
		return progress, fmt.Errorf("failed to get university: %w", err)
	}
	if uni == nil {
		progress.Errors = append(progress.Errors, "University not found")
		return progress, fmt.Errorf("university not found: %s", universityID)
	}
	// Perform full crawl.
	result, err := a.crawler.CrawlUniversity(ctx, uni)
	if err != nil {
		progress.Errors = append(progress.Errors, fmt.Sprintf("Crawl failed: %v", err))
		return progress, err
	}
	// Count professors specifically. A count failure is non-fatal: the crawl
	// succeeded, so we simply report 0 rather than aborting the phase.
	professorCount := 0
	staffList, err := a.repo.SearchStaff(ctx, database.StaffSearchParams{
		UniversityID: &universityID,
		IsProfessor:  boolPtr(true),
		Limit:        10000,
	})
	if err == nil {
		professorCount = staffList.Total
	}
	progress.ItemsFound = professorCount
	progress.ItemsProcessed = result.StaffFound
	progress.Errors = result.Errors
	now := time.Now()
	progress.CompletedAt = &now
	log.Printf("[OrchestratorAdapter] Professors phase completed for %s: %d professors found", uni.Name, professorCount)
	return progress, nil
}
// CrawlAllStaff crawls all staff members at a university.
// This is Phase 3: Get all staff (already done in Phase 2, but we verify/extend).
//
// A crawl failure in this phase is tolerated: earlier phases may already have
// stored staff, so the error is recorded in progress.Errors and the phase
// still completes with the current stored staff count.
func (a *OrchestratorAdapter) CrawlAllStaff(ctx context.Context, universityID uuid.UUID) (*orchestrator.CrawlProgress, error) {
	start := time.Now()
	progress := &orchestrator.CrawlProgress{
		Phase:     orchestrator.PhaseAllStaff,
		StartedAt: start,
	}
	log.Printf("[OrchestratorAdapter] All Staff phase for university %s", universityID)
	// Get university. Split the checks so a nil uni with a nil err is not
	// wrapped with %w (which would render as "%!w(<nil>)").
	uni, err := a.repo.GetUniversityByID(ctx, universityID)
	if err != nil {
		progress.Errors = append(progress.Errors, fmt.Sprintf("Failed to get university: %v", err))
		return progress, fmt.Errorf("failed to get university: %w", err)
	}
	if uni == nil {
		progress.Errors = append(progress.Errors, "University not found")
		return progress, fmt.Errorf("university not found: %s", universityID)
	}
	// Run another crawl pass to catch any missed staff.
	result, err := a.crawler.CrawlUniversity(ctx, uni)
	if err != nil {
		// BUG FIX: result may be nil when err != nil; the previous code read
		// result.Errors here and could panic. Record the error message instead.
		progress.Errors = append(progress.Errors, fmt.Sprintf("Crawl failed: %v", err))
		// Don't fail completely - we may have some staff already.
		log.Printf("[OrchestratorAdapter] All Staff crawl had errors: %v", err)
	}
	// Get total staff count. A count failure is non-fatal; we report 0.
	staffCount := 0
	staffList, searchErr := a.repo.SearchStaff(ctx, database.StaffSearchParams{
		UniversityID: &universityID,
		Limit:        1, // Just need count
	})
	if searchErr == nil {
		staffCount = staffList.Total
	}
	progress.ItemsFound = staffCount
	if result != nil {
		progress.ItemsProcessed = result.StaffFound
		// Append rather than overwrite so a recorded "Crawl failed" entry is kept.
		progress.Errors = append(progress.Errors, result.Errors...)
	}
	now := time.Now()
	progress.CompletedAt = &now
	log.Printf("[OrchestratorAdapter] All Staff phase completed for %s: %d total staff", uni.Name, staffCount)
	return progress, nil
}
// Helper functions

// stringValue dereferences an optional string, mapping nil to the empty string.
func stringValue(s *string) string {
	if s != nil {
		return *s
	}
	return ""
}
// boolPtr returns a pointer to a fresh copy of b, for optional boolean params.
func boolPtr(b bool) *bool {
	v := b
	return &v
}