package staff

import (
	"bytes"
	"context"
	"fmt"
	"log"
	"regexp"
	"strings"

	"github.com/PuerkitoBio/goquery"

	"github.com/breakpilot/edu-search-service/internal/database"
)

// Patterns compiled once at package scope so the per-profile extraction
// does not pay regexp compilation cost inside goquery callbacks, which
// run once per matched DOM node.
var (
	emailRe = regexp.MustCompile(`[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-z]{2,10}`)
	phoneRe = regexp.MustCompile(`\+?[\d\s\-/()]{8,20}`)
	orcidRe = regexp.MustCompile(`\d{4}-\d{4}-\d{4}-\d{3}[\dX]`)
)

// setIfMissing stores value into *dst when value is non-empty and *dst is
// still unset (nil). It reports whether it made a change, so callers can
// track whether the record needs persisting.
func setIfMissing(dst **string, value string) bool {
	if value == "" || *dst != nil {
		return false
	}
	v := value
	*dst = &v
	return true
}

// EnrichStaffProfiles fetches individual profile pages and extracts detailed info
// like email, phone, office, research interests, and publication links.
//
// It only visits staff that have a profile URL and no email yet (a present
// email is treated as "already enriched"). It returns the number of records
// actually updated; on context cancellation it returns the count so far
// together with ctx.Err(). Per-profile fetch/update failures are logged and
// skipped so one bad page does not abort the whole run.
func (c *StaffCrawler) EnrichStaffProfiles(ctx context.Context, uni *database.University) (int, error) {
	// Get all staff for this university that have profile URLs.
	staffList, err := c.repo.SearchStaff(ctx, database.StaffSearchParams{
		UniversityID: &uni.ID,
		Limit:        10000,
	})
	if err != nil {
		return 0, fmt.Errorf("failed to search staff: %w", err)
	}

	log.Printf("[Profile Enrichment] Starting enrichment for %d staff members at %s", staffList.Total, uni.Name)

	enriched := 0
	for _, staff := range staffList.Staff {
		// Bail out promptly when the caller cancels.
		select {
		case <-ctx.Done():
			return enriched, ctx.Err()
		default:
		}

		// Skip if no profile URL to fetch.
		if staff.ProfileURL == nil || *staff.ProfileURL == "" {
			continue
		}
		// Skip if already has email (already enriched).
		if staff.Email != nil && *staff.Email != "" {
			continue
		}

		details, err := c.extractProfileDetails(ctx, *staff.ProfileURL)
		if err != nil {
			log.Printf("[Profile Enrichment] Error fetching %s: %v", *staff.ProfileURL, err)
			continue
		}

		// Fill in only fields the record does not have yet; never
		// overwrite existing data.
		updated := setIfMissing(&staff.Email, details.Email)
		updated = setIfMissing(&staff.Phone, details.Phone) || updated
		updated = setIfMissing(&staff.Office, details.Office) || updated
		updated = setIfMissing(&staff.ORCID, details.ORCID) || updated
		updated = setIfMissing(&staff.GoogleScholarID, details.GoogleScholarID) || updated
		updated = setIfMissing(&staff.ResearchgateURL, details.ResearchgateURL) || updated
		updated = setIfMissing(&staff.LinkedInURL, details.LinkedInURL) || updated
		updated = setIfMissing(&staff.PersonalWebsite, details.PersonalWebsite) || updated
		if len(details.ResearchInterests) > 0 && len(staff.ResearchInterests) == 0 {
			staff.ResearchInterests = details.ResearchInterests
			updated = true
		}
		updated = setIfMissing(&staff.PhotoURL, details.PhotoURL) || updated

		if !updated {
			continue
		}

		// NOTE(review): this persists an update via CreateStaff (the log
		// message even says "updating") — presumably CreateStaff upserts;
		// if the repo exposes an UpdateStaff method, it should be used
		// here instead. TODO confirm against the repository interface.
		if err := c.repo.CreateStaff(ctx, &staff); err != nil {
			log.Printf("[Profile Enrichment] Error updating %s: %v", staff.LastName, err)
			continue
		}
		enriched++
		log.Printf("[Profile Enrichment] Enriched: %s %s (email=%v)",
			stringValue(staff.FirstName), staff.LastName, details.Email != "")
	}

	log.Printf("[Profile Enrichment] Completed: enriched %d of %d staff members", enriched, staffList.Total)
	return enriched, nil
}

// ProfileDetails contains extracted details from a profile page.
type ProfileDetails struct {
	Email             string
	Phone             string
	Office            string
	ORCID             string
	GoogleScholarID   string
	ResearchgateURL   string
	LinkedInURL       string
	PersonalWebsite   string
	ResearchInterests []string
	PhotoURL          string
}

// extractProfileDetails extracts contact info from an individual profile page.
// It fetches profileURL, parses the HTML, and runs a series of extractors:
// a UOL-specific dt/dd definition-list pass first, then generic fallbacks
// (mailto:/tel: links, academic profile links, photo, research interests).
func (c *StaffCrawler) extractProfileDetails(ctx context.Context, profileURL string) (*ProfileDetails, error) {
	body, err := c.fetchPage(ctx, profileURL)
	if err != nil {
		return nil, err
	}

	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
	if err != nil {
		return nil, err
	}

	details := &ProfileDetails{}

	extractDefinitionListContacts(doc, details)
	extractFallbackEmail(doc, details)
	extractFallbackPhone(doc, details)
	extractAcademicLinks(doc, details)
	extractPersonalWebsite(doc, details)
	extractPhoto(doc, details, profileURL)
	extractResearchInterests(doc, details)

	return details, nil
}

// extractDefinitionListContacts scans dt/dd pairs for email, phone, and
// office labels (German and English). This is the most reliable way to get
// contact info on UOL pages.
func extractDefinitionListContacts(doc *goquery.Document, details *ProfileDetails) {
	doc.Find("dt").Each(func(i int, dt *goquery.Selection) {
		label := strings.TrimSpace(strings.ToLower(dt.Text()))
		dd := dt.Next()
		if dd.Length() == 0 || goquery.NodeName(dd) != "dd" {
			return
		}
		value := strings.TrimSpace(dd.Text())

		switch {
		case strings.Contains(label, "email") || strings.Contains(label, "e-mail"):
			if details.Email != "" {
				return
			}
			// Get email from mailto link if present.
			dd.Find("a[href^='mailto:']").Each(func(j int, a *goquery.Selection) {
				if details.Email != "" {
					return
				}
				href, _ := a.Attr("href")
				email := strings.TrimPrefix(href, "mailto:")
				email = strings.Split(email, "?")[0] // strip ?subject=... query
				if strings.Contains(email, "@") {
					details.Email = strings.TrimSpace(email)
				}
			})
			// Fallback: extract from the dd text.
			if details.Email == "" && strings.Contains(value, "@") {
				if match := emailRe.FindString(value); match != "" {
					details.Email = match
				}
			}
		case strings.Contains(label, "telefon") || strings.Contains(label, "phone") || strings.Contains(label, "tel"):
			if details.Phone != "" {
				return
			}
			// Get phone from tel: link if present.
			dd.Find("a[href^='tel:']").Each(func(j int, a *goquery.Selection) {
				if details.Phone != "" {
					return
				}
				href, _ := a.Attr("href")
				phone := strings.TrimPrefix(href, "tel:")
				if len(phone) >= 8 { // too-short numbers are likely noise
					details.Phone = phone
				}
			})
			// Fallback: extract from the dd text.
			if details.Phone == "" {
				if match := phoneRe.FindString(value); match != "" {
					details.Phone = strings.TrimSpace(match)
				}
			}
		case strings.Contains(label, "raum") || strings.Contains(label, "büro") || strings.Contains(label, "office"):
			if details.Office == "" {
				details.Office = value
			}
		}
	})
}

// extractFallbackEmail scans all mailto: links when the dt/dd pass found no
// email, rejecting generic mailbox prefixes (info@, sekretariat@, ...) so we
// only keep a personal address.
func extractFallbackEmail(doc *goquery.Document, details *ProfileDetails) {
	if details.Email != "" {
		return
	}
	doc.Find("a[href^='mailto:']").Each(func(i int, s *goquery.Selection) {
		if details.Email != "" {
			return
		}
		href, _ := s.Attr("href")
		email := strings.TrimPrefix(href, "mailto:")
		email = strings.Split(email, "?")[0]
		if !strings.Contains(email, "@") {
			return
		}
		emailLower := strings.ToLower(email)
		isGeneric := strings.HasPrefix(emailLower, "info@") ||
			strings.HasPrefix(emailLower, "sekretariat@") ||
			strings.HasPrefix(emailLower, "kontakt@") ||
			strings.HasPrefix(emailLower, "office@") ||
			strings.HasPrefix(emailLower, "fachschaft@")
		if !isGeneric {
			details.Email = strings.TrimSpace(email)
		}
	})
}

// extractFallbackPhone scans all tel: links when the dt/dd pass found no phone.
func extractFallbackPhone(doc *goquery.Document, details *ProfileDetails) {
	if details.Phone != "" {
		return
	}
	doc.Find("a[href^='tel:']").Each(func(i int, s *goquery.Selection) {
		if details.Phone != "" {
			return
		}
		href, _ := s.Attr("href")
		phone := strings.TrimPrefix(href, "tel:")
		if len(phone) >= 8 {
			details.Phone = phone
		}
	})
}

// extractAcademicLinks pulls ORCID, Google Scholar, ResearchGate, and
// LinkedIn identifiers/URLs from anchor hrefs. First match wins for each.
func extractAcademicLinks(doc *goquery.Document, details *ProfileDetails) {
	// ORCID: take the 16-digit identifier out of the URL.
	doc.Find("a[href*='orcid.org']").Each(func(i int, s *goquery.Selection) {
		if details.ORCID != "" {
			return
		}
		href, _ := s.Attr("href")
		if match := orcidRe.FindString(href); match != "" {
			details.ORCID = match
		}
	})

	// Google Scholar: extract user ID from URL like
	// scholar.google.com/citations?user=XXXXX
	doc.Find("a[href*='scholar.google']").Each(func(i int, s *goquery.Selection) {
		if details.GoogleScholarID != "" {
			return
		}
		href, _ := s.Attr("href")
		if strings.Contains(href, "user=") {
			parts := strings.Split(href, "user=")
			if len(parts) > 1 {
				details.GoogleScholarID = strings.Split(parts[1], "&")[0]
			}
		}
	})

	// ResearchGate: keep the full profile URL.
	doc.Find("a[href*='researchgate.net']").Each(func(i int, s *goquery.Selection) {
		if details.ResearchgateURL != "" {
			return
		}
		href, _ := s.Attr("href")
		if strings.Contains(href, "researchgate.net") {
			details.ResearchgateURL = href
		}
	})

	// LinkedIn: keep the full profile URL.
	doc.Find("a[href*='linkedin.com']").Each(func(i int, s *goquery.Selection) {
		if details.LinkedInURL != "" {
			return
		}
		href, _ := s.Attr("href")
		if strings.Contains(href, "linkedin.com") {
			details.LinkedInURL = href
		}
	})
}

// extractPersonalWebsite looks for an external (non-university, non-social)
// link whose anchor text suggests a personal homepage.
func extractPersonalWebsite(doc *goquery.Document, details *ProfileDetails) {
	doc.Find("a[href^='http']").Each(func(i int, s *goquery.Selection) {
		if details.PersonalWebsite != "" {
			return
		}
		href, _ := s.Attr("href")
		text := strings.ToLower(s.Text())
		// Skip university links, social media, etc.
		if strings.Contains(href, "uni-oldenburg.de") ||
			strings.Contains(href, "uol.de") ||
			strings.Contains(href, "linkedin") ||
			strings.Contains(href, "researchgate") ||
			strings.Contains(href, "orcid.org") ||
			strings.Contains(href, "scholar.google") ||
			strings.Contains(href, "twitter") ||
			strings.Contains(href, "facebook") {
			return
		}
		// Look for personal website indicators in the link text.
		if strings.Contains(text, "homepage") || strings.Contains(text, "website") ||
			strings.Contains(text, "personal") || strings.Contains(text, "www") {
			details.PersonalWebsite = href
		}
	})
}

// extractPhoto finds a likely profile photo: it skips icons/logos/placeholders
// and accepts images whose alt text or class names look like a portrait.
// The src is resolved against profileURL to produce an absolute URL.
func extractPhoto(doc *goquery.Document, details *ProfileDetails, profileURL string) {
	doc.Find("img").Each(func(i int, s *goquery.Selection) {
		if details.PhotoURL != "" {
			return
		}
		src, exists := s.Attr("src")
		if !exists {
			return
		}
		// Skip icons, logos, etc.
		srcLower := strings.ToLower(src)
		if strings.Contains(srcLower, "icon") || strings.Contains(srcLower, "logo") ||
			strings.Contains(srcLower, "placeholder") || strings.Contains(srcLower, "default") {
			return
		}
		// Look for images that might be profile photos (German + English hints).
		alt, _ := s.Attr("alt")
		altLower := strings.ToLower(alt)
		classes, _ := s.Attr("class")
		classesLower := strings.ToLower(classes)
		if strings.Contains(altLower, "foto") || strings.Contains(altLower, "photo") ||
			strings.Contains(altLower, "portrait") || strings.Contains(altLower, "bild") ||
			strings.Contains(classesLower, "photo") || strings.Contains(classesLower, "portrait") ||
			strings.Contains(classesLower, "profile") {
			details.PhotoURL = resolveURL(profileURL, src)
		}
	})
}

// extractResearchInterests looks for sections mentioning research keywords
// (forschung, research, schwerpunkt, interest) and collects list items from
// the surrounding container.
//
// NOTE(review): Find("*") + Text() visits every node and concatenates each
// node's full subtree text, which is quadratic in page size — acceptable for
// single profile pages, but worth narrowing to headings if this becomes hot.
func extractResearchInterests(doc *goquery.Document, details *ProfileDetails) {
	doc.Find("*").Each(func(i int, s *goquery.Selection) {
		if len(details.ResearchInterests) > 0 {
			return
		}
		text := strings.ToLower(s.Text())
		if strings.Contains(text, "forschung") || strings.Contains(text, "research") ||
			strings.Contains(text, "schwerpunkt") || strings.Contains(text, "interest") {
			// Check if the parent has a list of items; keep plausible lengths only.
			s.Parent().Find("li").Each(func(j int, li *goquery.Selection) {
				interest := strings.TrimSpace(li.Text())
				if len(interest) > 3 && len(interest) < 200 {
					details.ResearchInterests = append(details.ResearchInterests, interest)
				}
			})
		}
	})
}