package publications import ( "context" "fmt" "log" "sync" "time" "github.com/breakpilot/edu-search-service/internal/database" "github.com/google/uuid" ) // PublicationCrawler crawls publications for university staff type PublicationCrawler struct { repo *database.Repository crossref *CrossRefClient rateLimit time.Duration mu sync.Mutex lastRequest time.Time } // CrawlResult contains the result of a publication crawl type CrawlResult struct { StaffID uuid.UUID PubsFound int PubsNew int PubsUpdated int Errors []string Duration time.Duration } // NewPublicationCrawler creates a new publication crawler func NewPublicationCrawler(repo *database.Repository, email string) *PublicationCrawler { return &PublicationCrawler{ repo: repo, crossref: NewCrossRefClient(email), rateLimit: time.Second, // CrossRef polite pool: 50 req/sec max } } // CrawlForStaff crawls publications for a single staff member func (c *PublicationCrawler) CrawlForStaff(ctx context.Context, staff *database.UniversityStaff) (*CrawlResult, error) { start := time.Now() result := &CrawlResult{ StaffID: staff.ID, } log.Printf("Starting publication crawl for %s", *staff.FullName) var pubs []*database.Publication // Strategy 1: Search by ORCID (most reliable) if staff.ORCID != nil && *staff.ORCID != "" { c.waitForRateLimit() orcidPubs, err := c.crossref.SearchByORCID(ctx, *staff.ORCID, 100) if err != nil { result.Errors = append(result.Errors, fmt.Sprintf("ORCID search error: %v", err)) } else { pubs = append(pubs, orcidPubs...) log.Printf("Found %d publications via ORCID for %s", len(orcidPubs), *staff.FullName) } } // Strategy 2: Search by author name if staff.FullName != nil && *staff.FullName != "" { c.waitForRateLimit() namePubs, err := c.crossref.SearchByAuthor(ctx, *staff.FullName, 50) if err != nil { result.Errors = append(result.Errors, fmt.Sprintf("Name search error: %v", err)) } else { // Deduplicate for _, pub := range namePubs { if !containsPub(pubs, pub) { pubs = append(pubs, pub) } } log.Printf("Found %d additional publications via name search for %s", len(namePubs), *staff.FullName) } } // Save publications and create links for _, pub := range pubs { // Save publication err := c.repo.CreatePublication(ctx, pub) if err != nil { result.Errors = append(result.Errors, fmt.Sprintf("Save error for %s: %v", pub.Title, err)) continue } result.PubsFound++ // Link to staff link := &database.StaffPublication{ StaffID: staff.ID, PublicationID: pub.ID, } // Determine author position pos := findAuthorPosition(pub, staff) if pos > 0 { link.AuthorPosition = &pos } if err := c.repo.LinkStaffPublication(ctx, link); err != nil { result.Errors = append(result.Errors, fmt.Sprintf("Link error: %v", err)) } } result.Duration = time.Since(start) log.Printf("Completed publication crawl for %s: found=%d, duration=%v", *staff.FullName, result.PubsFound, result.Duration) return result, nil } // CrawlForUniversity crawls publications for all staff at a university func (c *PublicationCrawler) CrawlForUniversity(ctx context.Context, uniID uuid.UUID, limit int) (*database.UniversityCrawlStatus, error) { log.Printf("Starting publication crawl for university %s", uniID) // Get staff with ORCID first (more reliable) params := database.StaffSearchParams{ UniversityID: &uniID, Limit: limit, } result, err := c.repo.SearchStaff(ctx, params) if err != nil { return nil, err } status := &database.UniversityCrawlStatus{ UniversityID: uniID, PubCrawlStatus: "running", } var totalPubs int var errors []string for _, staff := range result.Staff { select { case <-ctx.Done(): status.PubCrawlStatus = "cancelled" status.PubErrors = append(errors, "Crawl cancelled") return status, ctx.Err() default: } crawlResult, err := c.CrawlForStaff(ctx, &staff) if err != nil { errors = append(errors, fmt.Sprintf("%s: %v", staff.LastName, err)) continue } totalPubs += crawlResult.PubsFound errors = append(errors, crawlResult.Errors...) } now := time.Now() status.LastPubCrawl = &now status.PubCrawlStatus = "completed" status.PubCount = totalPubs status.PubErrors = errors // Update status in database if err := c.repo.UpdateCrawlStatus(ctx, status); err != nil { log.Printf("Warning: Failed to update crawl status: %v", err) } log.Printf("Completed publication crawl for university %s: %d publications found", uniID, totalPubs) return status, nil } // ResolveDOI resolves a DOI and saves the publication func (c *PublicationCrawler) ResolveDOI(ctx context.Context, doi string) (*database.Publication, error) { c.waitForRateLimit() pub, err := c.crossref.GetWorkByDOI(ctx, doi) if err != nil { return nil, err } if err := c.repo.CreatePublication(ctx, pub); err != nil { return nil, err } return pub, nil } // waitForRateLimit enforces rate limiting func (c *PublicationCrawler) waitForRateLimit() { c.mu.Lock() defer c.mu.Unlock() elapsed := time.Since(c.lastRequest) if elapsed < c.rateLimit { time.Sleep(c.rateLimit - elapsed) } c.lastRequest = time.Now() } // containsPub checks if a publication is already in the list (by DOI or title) func containsPub(pubs []*database.Publication, pub *database.Publication) bool { for _, existing := range pubs { // Check DOI if pub.DOI != nil && existing.DOI != nil && *pub.DOI == *existing.DOI { return true } // Check title (rough match) if pub.Title == existing.Title { return true } } return false } // findAuthorPosition finds the position of a staff member in the author list func findAuthorPosition(pub *database.Publication, staff *database.UniversityStaff) int { for i, author := range pub.Authors { // Check if author name matches staff if staff.LastName != "" && containsIgnoreCase(author, staff.LastName) { return i + 1 } } return 0 } // containsIgnoreCase checks if s contains substr (case insensitive) func containsIgnoreCase(s, substr string) bool { return len(s) >= len(substr) && (s == substr || len(substr) == 0 || (len(s) > 0 && containsIgnoreCaseHelper(s, substr))) } func containsIgnoreCaseHelper(s, substr string) bool { for i := 0; i <= len(s)-len(substr); i++ { if equalFold(s[i:i+len(substr)], substr) { return true } } return false } func equalFold(s1, s2 string) bool { if len(s1) != len(s2) { return false } for i := 0; i < len(s1); i++ { c1, c2 := s1[i], s2[i] if c1 != c2 { // Simple ASCII case folding if c1 >= 'A' && c1 <= 'Z' { c1 += 'a' - 'A' } if c2 >= 'A' && c2 <= 'Z' { c2 += 'a' - 'A' } if c1 != c2 { return false } } } return true }