Fix: Remove broken getKlausurApiUrl and clean up empty lines
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
The sed replacement left orphaned hostname references in the story page and empty lines in the getApiBase functions.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
364
edu-search-service/internal/staff/staff_crawler_enrich.go
Normal file
364
edu-search-service/internal/staff/staff_crawler_enrich.go
Normal file
@@ -0,0 +1,364 @@
|
||||
package staff
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"fmt"
|
||||
"log"
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
|
||||
"github.com/breakpilot/edu-search-service/internal/database"
|
||||
)
|
||||
|
||||
// EnrichStaffProfiles fetches individual profile pages and extracts detailed info
|
||||
// like email, phone, office, research interests, and publication links
|
||||
func (c *StaffCrawler) EnrichStaffProfiles(ctx context.Context, uni *database.University) (int, error) {
|
||||
// Get all staff for this university that have profile URLs
|
||||
staffList, err := c.repo.SearchStaff(ctx, database.StaffSearchParams{
|
||||
UniversityID: &uni.ID,
|
||||
Limit: 10000,
|
||||
})
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("failed to search staff: %w", err)
|
||||
}
|
||||
|
||||
log.Printf("[Profile Enrichment] Starting enrichment for %d staff members at %s", staffList.Total, uni.Name)
|
||||
|
||||
enriched := 0
|
||||
for _, staff := range staffList.Staff {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return enriched, ctx.Err()
|
||||
default:
|
||||
}
|
||||
|
||||
// Skip if no profile URL
|
||||
if staff.ProfileURL == nil || *staff.ProfileURL == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
// Skip if already has email (already enriched)
|
||||
if staff.Email != nil && *staff.Email != "" {
|
||||
continue
|
||||
}
|
||||
|
||||
// Fetch and extract profile details
|
||||
details, err := c.extractProfileDetails(ctx, *staff.ProfileURL)
|
||||
if err != nil {
|
||||
log.Printf("[Profile Enrichment] Error fetching %s: %v", *staff.ProfileURL, err)
|
||||
continue
|
||||
}
|
||||
|
||||
// Update staff record with new details
|
||||
updated := false
|
||||
if details.Email != "" && staff.Email == nil {
|
||||
staff.Email = &details.Email
|
||||
updated = true
|
||||
}
|
||||
if details.Phone != "" && staff.Phone == nil {
|
||||
staff.Phone = &details.Phone
|
||||
updated = true
|
||||
}
|
||||
if details.Office != "" && staff.Office == nil {
|
||||
staff.Office = &details.Office
|
||||
updated = true
|
||||
}
|
||||
if details.ORCID != "" && staff.ORCID == nil {
|
||||
staff.ORCID = &details.ORCID
|
||||
updated = true
|
||||
}
|
||||
if details.GoogleScholarID != "" && staff.GoogleScholarID == nil {
|
||||
staff.GoogleScholarID = &details.GoogleScholarID
|
||||
updated = true
|
||||
}
|
||||
if details.ResearchgateURL != "" && staff.ResearchgateURL == nil {
|
||||
staff.ResearchgateURL = &details.ResearchgateURL
|
||||
updated = true
|
||||
}
|
||||
if details.LinkedInURL != "" && staff.LinkedInURL == nil {
|
||||
staff.LinkedInURL = &details.LinkedInURL
|
||||
updated = true
|
||||
}
|
||||
if details.PersonalWebsite != "" && staff.PersonalWebsite == nil {
|
||||
staff.PersonalWebsite = &details.PersonalWebsite
|
||||
updated = true
|
||||
}
|
||||
if len(details.ResearchInterests) > 0 && len(staff.ResearchInterests) == 0 {
|
||||
staff.ResearchInterests = details.ResearchInterests
|
||||
updated = true
|
||||
}
|
||||
if details.PhotoURL != "" && staff.PhotoURL == nil {
|
||||
staff.PhotoURL = &details.PhotoURL
|
||||
updated = true
|
||||
}
|
||||
|
||||
if updated {
|
||||
err = c.repo.CreateStaff(ctx, &staff)
|
||||
if err != nil {
|
||||
log.Printf("[Profile Enrichment] Error updating %s: %v", staff.LastName, err)
|
||||
continue
|
||||
}
|
||||
enriched++
|
||||
log.Printf("[Profile Enrichment] Enriched: %s %s (email=%v)", stringValue(staff.FirstName), staff.LastName, details.Email != "")
|
||||
}
|
||||
}
|
||||
|
||||
log.Printf("[Profile Enrichment] Completed: enriched %d of %d staff members", enriched, staffList.Total)
|
||||
return enriched, nil
|
||||
}
|
||||
|
||||
// ProfileDetails holds the pieces of information scraped from a single staff
// profile page. A field keeps its zero value when nothing was found for it.
type ProfileDetails struct {
	// Direct contact data.
	Email, Phone, Office string

	// Academic identifiers: ORCID iD and Google Scholar user ID.
	ORCID, GoogleScholarID string

	// Links to external profiles and to a personal homepage.
	ResearchgateURL, LinkedInURL, PersonalWebsite string

	// Research topics, one entry per list item taken from the page.
	ResearchInterests []string

	// Address of the profile photo.
	PhotoURL string
}
|
||||
|
||||
// extractProfileDetails extracts contact info from an individual profile page
|
||||
func (c *StaffCrawler) extractProfileDetails(ctx context.Context, profileURL string) (*ProfileDetails, error) {
|
||||
body, err := c.fetchPage(ctx, profileURL)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
details := &ProfileDetails{}
|
||||
|
||||
// UOL-specific: Look for definition list pattern (dt/dd pairs)
|
||||
// This is the most reliable way to get contact info on UOL pages
|
||||
doc.Find("dt").Each(func(i int, dt *goquery.Selection) {
|
||||
label := strings.TrimSpace(strings.ToLower(dt.Text()))
|
||||
dd := dt.Next()
|
||||
if dd.Length() == 0 || goquery.NodeName(dd) != "dd" {
|
||||
return
|
||||
}
|
||||
value := strings.TrimSpace(dd.Text())
|
||||
|
||||
switch {
|
||||
case strings.Contains(label, "email") || strings.Contains(label, "e-mail"):
|
||||
if details.Email == "" {
|
||||
// Get email from mailto link if present
|
||||
dd.Find("a[href^='mailto:']").Each(func(j int, a *goquery.Selection) {
|
||||
if details.Email != "" {
|
||||
return
|
||||
}
|
||||
href, _ := a.Attr("href")
|
||||
email := strings.TrimPrefix(href, "mailto:")
|
||||
email = strings.Split(email, "?")[0]
|
||||
if strings.Contains(email, "@") {
|
||||
details.Email = strings.TrimSpace(email)
|
||||
}
|
||||
})
|
||||
// Fallback: extract from text
|
||||
if details.Email == "" && strings.Contains(value, "@") {
|
||||
emailPattern := regexp.MustCompile(`[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-z]{2,10}`)
|
||||
if match := emailPattern.FindString(value); match != "" {
|
||||
details.Email = match
|
||||
}
|
||||
}
|
||||
}
|
||||
case strings.Contains(label, "telefon") || strings.Contains(label, "phone") || strings.Contains(label, "tel"):
|
||||
if details.Phone == "" {
|
||||
// Get phone from tel: link if present
|
||||
dd.Find("a[href^='tel:']").Each(func(j int, a *goquery.Selection) {
|
||||
if details.Phone != "" {
|
||||
return
|
||||
}
|
||||
href, _ := a.Attr("href")
|
||||
phone := strings.TrimPrefix(href, "tel:")
|
||||
if len(phone) >= 8 {
|
||||
details.Phone = phone
|
||||
}
|
||||
})
|
||||
// Fallback: extract from text
|
||||
if details.Phone == "" {
|
||||
phonePattern := regexp.MustCompile(`\+?[\d\s\-/()]{8,20}`)
|
||||
if match := phonePattern.FindString(value); match != "" {
|
||||
details.Phone = strings.TrimSpace(match)
|
||||
}
|
||||
}
|
||||
}
|
||||
case strings.Contains(label, "raum") || strings.Contains(label, "büro") || strings.Contains(label, "office"):
|
||||
if details.Office == "" {
|
||||
details.Office = value
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
// Fallback: Extract email from mailto links if not found via dt/dd
|
||||
if details.Email == "" {
|
||||
doc.Find("a[href^='mailto:']").Each(func(i int, s *goquery.Selection) {
|
||||
if details.Email != "" {
|
||||
return
|
||||
}
|
||||
href, _ := s.Attr("href")
|
||||
email := strings.TrimPrefix(href, "mailto:")
|
||||
email = strings.Split(email, "?")[0]
|
||||
// Only accept personal email addresses (not generic like info@, sekretariat@)
|
||||
if strings.Contains(email, "@") {
|
||||
emailLower := strings.ToLower(email)
|
||||
isGeneric := strings.HasPrefix(emailLower, "info@") ||
|
||||
strings.HasPrefix(emailLower, "sekretariat@") ||
|
||||
strings.HasPrefix(emailLower, "kontakt@") ||
|
||||
strings.HasPrefix(emailLower, "office@") ||
|
||||
strings.HasPrefix(emailLower, "fachschaft@")
|
||||
if !isGeneric {
|
||||
details.Email = strings.TrimSpace(email)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// Fallback: Extract phone if not found via dt/dd
|
||||
if details.Phone == "" {
|
||||
doc.Find("a[href^='tel:']").Each(func(i int, s *goquery.Selection) {
|
||||
if details.Phone != "" {
|
||||
return
|
||||
}
|
||||
href, _ := s.Attr("href")
|
||||
phone := strings.TrimPrefix(href, "tel:")
|
||||
if len(phone) >= 8 {
|
||||
details.Phone = phone
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// Extract ORCID
|
||||
doc.Find("a[href*='orcid.org']").Each(func(i int, s *goquery.Selection) {
|
||||
if details.ORCID != "" {
|
||||
return
|
||||
}
|
||||
href, _ := s.Attr("href")
|
||||
orcidPattern := regexp.MustCompile(`\d{4}-\d{4}-\d{4}-\d{3}[\dX]`)
|
||||
if match := orcidPattern.FindString(href); match != "" {
|
||||
details.ORCID = match
|
||||
}
|
||||
})
|
||||
|
||||
// Extract Google Scholar ID
|
||||
doc.Find("a[href*='scholar.google']").Each(func(i int, s *goquery.Selection) {
|
||||
if details.GoogleScholarID != "" {
|
||||
return
|
||||
}
|
||||
href, _ := s.Attr("href")
|
||||
// Extract user ID from URL like scholar.google.com/citations?user=XXXXX
|
||||
if strings.Contains(href, "user=") {
|
||||
parts := strings.Split(href, "user=")
|
||||
if len(parts) > 1 {
|
||||
userID := strings.Split(parts[1], "&")[0]
|
||||
details.GoogleScholarID = userID
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
// Extract ResearchGate URL
|
||||
doc.Find("a[href*='researchgate.net']").Each(func(i int, s *goquery.Selection) {
|
||||
if details.ResearchgateURL != "" {
|
||||
return
|
||||
}
|
||||
href, _ := s.Attr("href")
|
||||
if strings.Contains(href, "researchgate.net") {
|
||||
details.ResearchgateURL = href
|
||||
}
|
||||
})
|
||||
|
||||
// Extract LinkedIn URL
|
||||
doc.Find("a[href*='linkedin.com']").Each(func(i int, s *goquery.Selection) {
|
||||
if details.LinkedInURL != "" {
|
||||
return
|
||||
}
|
||||
href, _ := s.Attr("href")
|
||||
if strings.Contains(href, "linkedin.com") {
|
||||
details.LinkedInURL = href
|
||||
}
|
||||
})
|
||||
|
||||
// Extract personal website (non-university links)
|
||||
doc.Find("a[href^='http']").Each(func(i int, s *goquery.Selection) {
|
||||
if details.PersonalWebsite != "" {
|
||||
return
|
||||
}
|
||||
href, _ := s.Attr("href")
|
||||
text := strings.ToLower(s.Text())
|
||||
|
||||
// Skip university links, social media, etc.
|
||||
if strings.Contains(href, "uni-oldenburg.de") || strings.Contains(href, "uol.de") ||
|
||||
strings.Contains(href, "linkedin") || strings.Contains(href, "researchgate") ||
|
||||
strings.Contains(href, "orcid.org") || strings.Contains(href, "scholar.google") ||
|
||||
strings.Contains(href, "twitter") || strings.Contains(href, "facebook") {
|
||||
return
|
||||
}
|
||||
|
||||
// Look for personal website indicators
|
||||
if strings.Contains(text, "homepage") || strings.Contains(text, "website") ||
|
||||
strings.Contains(text, "personal") || strings.Contains(text, "www") {
|
||||
details.PersonalWebsite = href
|
||||
}
|
||||
})
|
||||
|
||||
// Extract photo URL
|
||||
doc.Find("img").Each(func(i int, s *goquery.Selection) {
|
||||
if details.PhotoURL != "" {
|
||||
return
|
||||
}
|
||||
src, exists := s.Attr("src")
|
||||
if !exists {
|
||||
return
|
||||
}
|
||||
|
||||
// Skip icons, logos, etc.
|
||||
srcLower := strings.ToLower(src)
|
||||
if strings.Contains(srcLower, "icon") || strings.Contains(srcLower, "logo") ||
|
||||
strings.Contains(srcLower, "placeholder") || strings.Contains(srcLower, "default") {
|
||||
return
|
||||
}
|
||||
|
||||
// Look for images that might be profile photos
|
||||
alt, _ := s.Attr("alt")
|
||||
altLower := strings.ToLower(alt)
|
||||
classes, _ := s.Attr("class")
|
||||
classesLower := strings.ToLower(classes)
|
||||
|
||||
if strings.Contains(altLower, "foto") || strings.Contains(altLower, "photo") ||
|
||||
strings.Contains(altLower, "portrait") || strings.Contains(altLower, "bild") ||
|
||||
strings.Contains(classesLower, "photo") || strings.Contains(classesLower, "portrait") ||
|
||||
strings.Contains(classesLower, "profile") {
|
||||
details.PhotoURL = resolveURL(profileURL, src)
|
||||
}
|
||||
})
|
||||
|
||||
// Extract research interests/areas
|
||||
// Look for sections about research, forschung, schwerpunkte
|
||||
doc.Find("*").Each(func(i int, s *goquery.Selection) {
|
||||
if len(details.ResearchInterests) > 0 {
|
||||
return
|
||||
}
|
||||
text := strings.ToLower(s.Text())
|
||||
if strings.Contains(text, "forschung") || strings.Contains(text, "research") ||
|
||||
strings.Contains(text, "schwerpunkt") || strings.Contains(text, "interest") {
|
||||
// Check if parent has a list of items
|
||||
s.Parent().Find("li").Each(func(j int, li *goquery.Selection) {
|
||||
interest := strings.TrimSpace(li.Text())
|
||||
if len(interest) > 3 && len(interest) < 200 {
|
||||
details.ResearchInterests = append(details.ResearchInterests, interest)
|
||||
}
|
||||
})
|
||||
}
|
||||
})
|
||||
|
||||
return details, nil
|
||||
}
|
||||
Reference in New Issue
Block a user