Files
breakpilot-lehrer/edu-search-service/internal/staff/staff_crawler_enrich.go
Benjamin Admin 9ba420fa91
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
Fix: Remove broken getKlausurApiUrl and clean up empty lines
sed replacement left orphaned hostname references in story page
and empty lines in getApiBase functions.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-24 16:02:04 +02:00

365 lines
11 KiB
Go

package staff
import (
"bytes"
"context"
"fmt"
"log"
"regexp"
"strings"
"github.com/PuerkitoBio/goquery"
"github.com/breakpilot/edu-search-service/internal/database"
)
// EnrichStaffProfiles fetches individual profile pages and extracts detailed info
// like email, phone, office, research interests, and publication links.
//
// It loads up to 10000 staff records for the given university, skips entries
// without a profile URL and entries that already carry an email address
// (treated as "already enriched"), and persists every record that gained at
// least one new field. It returns the number of records updated. If the
// context is cancelled the loop stops and the count so far is returned
// together with ctx.Err().
func (c *StaffCrawler) EnrichStaffProfiles(ctx context.Context, uni *database.University) (int, error) {
	// Get all staff for this university that have profile URLs.
	staffList, err := c.repo.SearchStaff(ctx, database.StaffSearchParams{
		UniversityID: &uni.ID,
		Limit:        10000,
	})
	if err != nil {
		return 0, fmt.Errorf("failed to search staff: %w", err)
	}
	log.Printf("[Profile Enrichment] Starting enrichment for %d staff members at %s", staffList.Total, uni.Name)
	enriched := 0
	for _, staff := range staffList.Staff {
		// Bail out promptly if the caller cancelled.
		select {
		case <-ctx.Done():
			return enriched, ctx.Err()
		default:
		}
		// Skip if there is no profile page to fetch.
		if staff.ProfileURL == nil || *staff.ProfileURL == "" {
			continue
		}
		// Skip if already has an email (already enriched).
		if staff.Email != nil && *staff.Email != "" {
			continue
		}
		// Fetch and extract profile details; a single bad page must not
		// abort the whole run, so errors are logged and skipped.
		details, err := c.extractProfileDetails(ctx, *staff.ProfileURL)
		if err != nil {
			log.Printf("[Profile Enrichment] Error fetching %s: %v", *staff.ProfileURL, err)
			continue
		}
		// Copy every extracted value into fields that are still unset.
		// setIfMissing is placed first in each expression so it always runs.
		updated := false
		updated = setIfMissing(&staff.Email, details.Email) || updated
		updated = setIfMissing(&staff.Phone, details.Phone) || updated
		updated = setIfMissing(&staff.Office, details.Office) || updated
		updated = setIfMissing(&staff.ORCID, details.ORCID) || updated
		updated = setIfMissing(&staff.GoogleScholarID, details.GoogleScholarID) || updated
		updated = setIfMissing(&staff.ResearchgateURL, details.ResearchgateURL) || updated
		updated = setIfMissing(&staff.LinkedInURL, details.LinkedInURL) || updated
		updated = setIfMissing(&staff.PersonalWebsite, details.PersonalWebsite) || updated
		if len(details.ResearchInterests) > 0 && len(staff.ResearchInterests) == 0 {
			staff.ResearchInterests = details.ResearchInterests
			updated = true
		}
		updated = setIfMissing(&staff.PhotoURL, details.PhotoURL) || updated
		if updated {
			// NOTE(review): CreateStaff is used here to persist an update to
			// an existing record — presumably the repo implements it as an
			// upsert; confirm against the repository implementation.
			err = c.repo.CreateStaff(ctx, &staff)
			if err != nil {
				log.Printf("[Profile Enrichment] Error updating %s: %v", staff.LastName, err)
				continue
			}
			enriched++
			log.Printf("[Profile Enrichment] Enriched: %s %s (email=%v)", stringValue(staff.FirstName), staff.LastName, details.Email != "")
		}
	}
	log.Printf("[Profile Enrichment] Completed: enriched %d of %d staff members", enriched, staffList.Total)
	return enriched, nil
}

// setIfMissing stores v into *dst when v is non-empty and *dst is still nil,
// mirroring the "only fill unset fields" enrichment policy. It reports
// whether an assignment happened.
func setIfMissing(dst **string, v string) bool {
	if v == "" || *dst != nil {
		return false
	}
	val := v
	*dst = &val
	return true
}
// ProfileDetails contains extracted details from a profile page
type ProfileDetails struct {
	Email             string   // first personal (non-generic, e.g. not info@/sekretariat@) email found
	Phone             string   // phone number from a tel: link or free text (min. 8 chars)
	Office            string   // room/office label taken verbatim from a dt/dd pair
	ORCID             string   // bare ORCID identifier (0000-0000-0000-000X), not the full URL
	GoogleScholarID   string   // "user" query parameter of a scholar.google citations link
	ResearchgateURL   string   // full href of the first researchgate.net link on the page
	LinkedInURL       string   // full href of the first linkedin.com link on the page
	PersonalWebsite   string   // first non-university, non-social external link labelled as a homepage
	ResearchInterests []string // list-item texts found near a research/forschung heading (3-199 chars each)
	PhotoURL          string   // profile image src resolved against the profile page URL
}
// Patterns used while scraping profile pages. Compiled once at package scope
// instead of inside the per-element Each callbacks, where the originals were
// recompiled for every visited DOM node.
var (
	profileEmailPattern = regexp.MustCompile(`[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-z]{2,10}`)
	profilePhonePattern = regexp.MustCompile(`\+?[\d\s\-/()]{8,20}`)
	profileORCIDPattern = regexp.MustCompile(`\d{4}-\d{4}-\d{4}-\d{3}[\dX]`)
)

// extractProfileDetails extracts contact info from an individual profile page.
//
// It fetches profileURL via c.fetchPage, parses the HTML and fills a
// ProfileDetails from, in order: UOL-style dt/dd definition lists (email,
// phone, office), page-wide mailto:/tel: fallbacks, research-identity links
// (ORCID, Google Scholar, ResearchGate, LinkedIn), a heuristic personal
// website, a heuristic profile photo, and research-interest list items near a
// research-related heading. Every extractor keeps only the first match.
func (c *StaffCrawler) extractProfileDetails(ctx context.Context, profileURL string) (*ProfileDetails, error) {
	body, err := c.fetchPage(ctx, profileURL)
	if err != nil {
		return nil, err
	}
	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
	if err != nil {
		return nil, err
	}
	details := &ProfileDetails{}

	// UOL-specific: Look for definition list pattern (dt/dd pairs).
	// This is the most reliable way to get contact info on UOL pages.
	doc.Find("dt").Each(func(i int, dt *goquery.Selection) {
		label := strings.TrimSpace(strings.ToLower(dt.Text()))
		dd := dt.Next()
		if dd.Length() == 0 || goquery.NodeName(dd) != "dd" {
			return
		}
		value := strings.TrimSpace(dd.Text())
		switch {
		case strings.Contains(label, "email") || strings.Contains(label, "e-mail"):
			if details.Email == "" {
				// Prefer the address from a mailto: link; strip any
				// "?subject=..." query part.
				dd.Find("a[href^='mailto:']").Each(func(j int, a *goquery.Selection) {
					if details.Email != "" {
						return
					}
					href, _ := a.Attr("href")
					email := strings.TrimPrefix(href, "mailto:")
					email = strings.Split(email, "?")[0]
					if strings.Contains(email, "@") {
						details.Email = strings.TrimSpace(email)
					}
				})
				// Fallback: extract from the dd's text content.
				if details.Email == "" && strings.Contains(value, "@") {
					if match := profileEmailPattern.FindString(value); match != "" {
						details.Email = match
					}
				}
			}
		case strings.Contains(label, "telefon") || strings.Contains(label, "phone") || strings.Contains(label, "tel"):
			if details.Phone == "" {
				// Prefer the number from a tel: link.
				dd.Find("a[href^='tel:']").Each(func(j int, a *goquery.Selection) {
					if details.Phone != "" {
						return
					}
					href, _ := a.Attr("href")
					phone := strings.TrimPrefix(href, "tel:")
					if len(phone) >= 8 {
						details.Phone = phone
					}
				})
				// Fallback: extract from the dd's text content.
				if details.Phone == "" {
					if match := profilePhonePattern.FindString(value); match != "" {
						details.Phone = strings.TrimSpace(match)
					}
				}
			}
		case strings.Contains(label, "raum") || strings.Contains(label, "büro") || strings.Contains(label, "office"):
			if details.Office == "" {
				details.Office = value
			}
		}
	})

	// Fallback: Extract email from any mailto: link if not found via dt/dd.
	if details.Email == "" {
		doc.Find("a[href^='mailto:']").Each(func(i int, s *goquery.Selection) {
			if details.Email != "" {
				return
			}
			href, _ := s.Attr("href")
			email := strings.TrimPrefix(href, "mailto:")
			email = strings.Split(email, "?")[0]
			// Only accept personal email addresses (not generic like info@, sekretariat@).
			if strings.Contains(email, "@") {
				emailLower := strings.ToLower(email)
				isGeneric := strings.HasPrefix(emailLower, "info@") ||
					strings.HasPrefix(emailLower, "sekretariat@") ||
					strings.HasPrefix(emailLower, "kontakt@") ||
					strings.HasPrefix(emailLower, "office@") ||
					strings.HasPrefix(emailLower, "fachschaft@")
				if !isGeneric {
					details.Email = strings.TrimSpace(email)
				}
			}
		})
	}

	// Fallback: Extract phone from any tel: link if not found via dt/dd.
	if details.Phone == "" {
		doc.Find("a[href^='tel:']").Each(func(i int, s *goquery.Selection) {
			if details.Phone != "" {
				return
			}
			href, _ := s.Attr("href")
			phone := strings.TrimPrefix(href, "tel:")
			if len(phone) >= 8 {
				details.Phone = phone
			}
		})
	}

	// Extract ORCID: keep only the bare identifier, not the full URL.
	doc.Find("a[href*='orcid.org']").Each(func(i int, s *goquery.Selection) {
		if details.ORCID != "" {
			return
		}
		href, _ := s.Attr("href")
		if match := profileORCIDPattern.FindString(href); match != "" {
			details.ORCID = match
		}
	})

	// Extract Google Scholar ID from URLs like
	// scholar.google.com/citations?user=XXXXX[&...].
	doc.Find("a[href*='scholar.google']").Each(func(i int, s *goquery.Selection) {
		if details.GoogleScholarID != "" {
			return
		}
		href, _ := s.Attr("href")
		if strings.Contains(href, "user=") {
			parts := strings.Split(href, "user=")
			if len(parts) > 1 {
				userID := strings.Split(parts[1], "&")[0]
				details.GoogleScholarID = userID
			}
		}
	})

	// Extract ResearchGate URL (first matching link, kept verbatim).
	doc.Find("a[href*='researchgate.net']").Each(func(i int, s *goquery.Selection) {
		if details.ResearchgateURL != "" {
			return
		}
		href, _ := s.Attr("href")
		if strings.Contains(href, "researchgate.net") {
			details.ResearchgateURL = href
		}
	})

	// Extract LinkedIn URL (first matching link, kept verbatim).
	doc.Find("a[href*='linkedin.com']").Each(func(i int, s *goquery.Selection) {
		if details.LinkedInURL != "" {
			return
		}
		href, _ := s.Attr("href")
		if strings.Contains(href, "linkedin.com") {
			details.LinkedInURL = href
		}
	})

	// Extract personal website: an external link that is neither a
	// university domain nor a known social/academic platform, whose anchor
	// text suggests a homepage.
	doc.Find("a[href^='http']").Each(func(i int, s *goquery.Selection) {
		if details.PersonalWebsite != "" {
			return
		}
		href, _ := s.Attr("href")
		text := strings.ToLower(s.Text())
		// Skip university links, social media, etc.
		if strings.Contains(href, "uni-oldenburg.de") || strings.Contains(href, "uol.de") ||
			strings.Contains(href, "linkedin") || strings.Contains(href, "researchgate") ||
			strings.Contains(href, "orcid.org") || strings.Contains(href, "scholar.google") ||
			strings.Contains(href, "twitter") || strings.Contains(href, "facebook") {
			return
		}
		// Look for personal website indicators in the anchor text.
		if strings.Contains(text, "homepage") || strings.Contains(text, "website") ||
			strings.Contains(text, "personal") || strings.Contains(text, "www") {
			details.PersonalWebsite = href
		}
	})

	// Extract photo URL: first <img> that is not an icon/logo/placeholder
	// and whose alt text or class names suggest a portrait photo.
	doc.Find("img").Each(func(i int, s *goquery.Selection) {
		if details.PhotoURL != "" {
			return
		}
		src, exists := s.Attr("src")
		if !exists {
			return
		}
		srcLower := strings.ToLower(src)
		if strings.Contains(srcLower, "icon") || strings.Contains(srcLower, "logo") ||
			strings.Contains(srcLower, "placeholder") || strings.Contains(srcLower, "default") {
			return
		}
		alt, _ := s.Attr("alt")
		altLower := strings.ToLower(alt)
		classes, _ := s.Attr("class")
		classesLower := strings.ToLower(classes)
		if strings.Contains(altLower, "foto") || strings.Contains(altLower, "photo") ||
			strings.Contains(altLower, "portrait") || strings.Contains(altLower, "bild") ||
			strings.Contains(classesLower, "photo") || strings.Contains(classesLower, "portrait") ||
			strings.Contains(classesLower, "profile") {
			details.PhotoURL = resolveURL(profileURL, src)
		}
	})

	// Extract research interests/areas: look for a heading/label element
	// mentioning research/forschung/schwerpunkte and collect the list items
	// of its parent section.
	//
	// Fix: the previous selector was "*", whose first match is <html> — its
	// Text() contains the whole page, so any occurrence of a keyword anywhere
	// caused Parent().Find("li") to harvest every <li> on the page
	// (navigation menus included). Restricting the scan to heading/label
	// elements keeps the harvest local to the research section.
	doc.Find("h1, h2, h3, h4, h5, h6, dt, strong, b, caption, legend").Each(func(i int, s *goquery.Selection) {
		if len(details.ResearchInterests) > 0 {
			return
		}
		text := strings.ToLower(s.Text())
		if strings.Contains(text, "forschung") || strings.Contains(text, "research") ||
			strings.Contains(text, "schwerpunkt") || strings.Contains(text, "interest") {
			// Check if the surrounding section has a list of items.
			s.Parent().Find("li").Each(func(j int, li *goquery.Selection) {
				interest := strings.TrimSpace(li.Text())
				if len(interest) > 3 && len(interest) < 200 {
					details.ResearchInterests = append(details.ResearchInterests, interest)
				}
			})
		}
	})
	return details, nil
}