Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
sed replacement left orphaned hostname references in story page and empty lines in getApiBase functions. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
496 lines
14 KiB
Go
496 lines
14 KiB
Go
package staff
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"log"
|
|
"strings"
|
|
|
|
"github.com/PuerkitoBio/goquery"
|
|
|
|
"github.com/breakpilot/edu-search-service/internal/database"
|
|
)
|
|
|
|
// extractStaffFromPage extracts staff information from a staff listing page
|
|
func (c *StaffCrawler) extractStaffFromPage(ctx context.Context, pageURL string, uni *database.University) ([]*database.UniversityStaff, error) {
|
|
body, err := c.fetchPage(ctx, pageURL)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
var staff []*database.UniversityStaff
|
|
|
|
// Try different extraction strategies
|
|
extractors := []func(*goquery.Document, string) []*database.UniversityStaff{
|
|
c.extractFromUOLPatterns, // UOL-specific patterns first
|
|
c.extractFromPersonCards,
|
|
c.extractFromTable,
|
|
c.extractFromList,
|
|
c.extractFromVCard,
|
|
}
|
|
|
|
for _, extractor := range extractors {
|
|
extracted := extractor(doc, pageURL)
|
|
if len(extracted) > 0 {
|
|
staff = append(staff, extracted...)
|
|
}
|
|
}
|
|
|
|
return staff, nil
|
|
}
|
|
|
|
// extractFromUOLPatterns extracts staff using Uni Oldenburg specific patterns
|
|
// UOL uses: nav#left-nav for person lists, p.mit-icon.person for person links,
|
|
// and /suche/person?username=XXX for person API
|
|
// Also captures hierarchy from section headers (Leitung, Mitarbeiter, etc.)
|
|
func (c *StaffCrawler) extractFromUOLPatterns(doc *goquery.Document, baseURL string) []*database.UniversityStaff {
|
|
var staff []*database.UniversityStaff
|
|
seen := make(map[string]bool)
|
|
|
|
// Extract department name from page title or breadcrumb
|
|
deptName := ""
|
|
doc.Find("h1").First().Each(func(i int, s *goquery.Selection) {
|
|
deptName = strings.TrimSpace(s.Text())
|
|
})
|
|
|
|
// Pattern 5 (NEW): Parse content with hierarchy headers
|
|
// UOL pages have structure like:
|
|
// #### Leitung
|
|
// <ul><li><a href="...">Prof. Dr. Name</a></li></ul>
|
|
// #### Wissenschaftliche Mitarbeiterinnen und Mitarbeiter
|
|
// <ul><li><a href="...">M. Sc. Name</a></li></ul>
|
|
currentRole := ""
|
|
var leaderName string // Track the department head for supervisor assignment
|
|
|
|
// Walk through content area looking for headers and lists
|
|
doc.Find("#content h4, #content h3, #content ul li a, .inhalt h4, .inhalt h3, .inhalt ul li a").Each(func(i int, s *goquery.Selection) {
|
|
tagName := goquery.NodeName(s)
|
|
|
|
// Check if this is a section header
|
|
if tagName == "h3" || tagName == "h4" {
|
|
headerText := strings.ToLower(strings.TrimSpace(s.Text()))
|
|
if strings.Contains(headerText, "leitung") {
|
|
currentRole = "leitung"
|
|
} else if strings.Contains(headerText, "sekretariat") {
|
|
currentRole = "sekretariat"
|
|
} else if strings.Contains(headerText, "wissenschaftlich") || strings.Contains(headerText, "mitarbeiter") {
|
|
currentRole = "mitarbeiter"
|
|
} else if strings.Contains(headerText, "doktorand") || strings.Contains(headerText, "promovierend") {
|
|
currentRole = "doktorand"
|
|
} else if strings.Contains(headerText, "technisch") {
|
|
currentRole = "technisch"
|
|
} else if strings.Contains(headerText, "extern") {
|
|
currentRole = "extern"
|
|
} else if strings.Contains(headerText, "student") || strings.Contains(headerText, "hilfskr") || strings.Contains(headerText, "hiwi") {
|
|
currentRole = "hiwi"
|
|
}
|
|
return
|
|
}
|
|
|
|
// Process person links under current header
|
|
if tagName == "a" {
|
|
href, exists := s.Attr("href")
|
|
if !exists {
|
|
return
|
|
}
|
|
|
|
// Check if this looks like a person page link
|
|
if !strings.Contains(href, "/personen/") && !strings.Contains(href, "suche/person") {
|
|
return
|
|
}
|
|
|
|
name := strings.TrimSpace(s.Text())
|
|
if name == "" || seen[name] || !c.looksLikeName(name) {
|
|
return
|
|
}
|
|
seen[name] = true
|
|
|
|
person := &database.UniversityStaff{}
|
|
person.FullName = &name
|
|
c.parseName(name, person)
|
|
|
|
if person.LastName != "" {
|
|
fullURL := resolveURL(baseURL, href)
|
|
person.ProfileURL = &fullURL
|
|
|
|
// Set team role based on current section
|
|
if currentRole != "" {
|
|
person.TeamRole = ¤tRole
|
|
}
|
|
|
|
// Track leader for supervisor assignment
|
|
if currentRole == "leitung" && leaderName == "" {
|
|
leaderName = name
|
|
person.IsProfessor = true
|
|
posType := "professor"
|
|
person.PositionType = &posType
|
|
}
|
|
|
|
staff = append(staff, person)
|
|
}
|
|
}
|
|
})
|
|
|
|
// Pattern 1: nav#left-nav ul li a - side navigation with person links
|
|
// Format: /abteilung/personen/prof-dr-name or /abteilung/personen/m-sc-name
|
|
doc.Find("nav#left-nav ul li a, #left-navi li a").Each(func(i int, s *goquery.Selection) {
|
|
href, exists := s.Attr("href")
|
|
if !exists {
|
|
return
|
|
}
|
|
|
|
// Check if this looks like a person page link
|
|
if !strings.Contains(href, "/personen/") {
|
|
return
|
|
}
|
|
|
|
name := strings.TrimSpace(s.Text())
|
|
if name == "" || seen[name] {
|
|
return
|
|
}
|
|
seen[name] = true
|
|
|
|
person := &database.UniversityStaff{}
|
|
person.FullName = &name
|
|
c.parseName(name, person)
|
|
|
|
if person.LastName != "" {
|
|
fullURL := resolveURL(baseURL, href)
|
|
person.ProfileURL = &fullURL
|
|
staff = append(staff, person)
|
|
}
|
|
})
|
|
|
|
// Pattern 2: p.mit-icon.person a - inline person references
|
|
// Format: <p class="mit-icon person"><a href="/suche/person/USERNAME">Prof. Dr. Name</a></p>
|
|
// OR: <p class="mit-icon person"><a href="/abteilung/personen/prof-dr-name">Prof. Dr. Name</a></p>
|
|
doc.Find("p.mit-icon.person a, .mit-icon.person a").Each(func(i int, s *goquery.Selection) {
|
|
name := strings.TrimSpace(s.Text())
|
|
if name == "" || seen[name] {
|
|
return
|
|
}
|
|
seen[name] = true
|
|
|
|
person := &database.UniversityStaff{}
|
|
person.FullName = &name
|
|
c.parseName(name, person)
|
|
|
|
if person.LastName != "" {
|
|
href, exists := s.Attr("href")
|
|
if exists {
|
|
fullURL := resolveURL(baseURL, href)
|
|
person.ProfileURL = &fullURL
|
|
}
|
|
staff = append(staff, person)
|
|
}
|
|
})
|
|
|
|
// Pattern 3: Links to /suche/person?username=XXX
|
|
doc.Find("a[href*='suche/person']").Each(func(i int, s *goquery.Selection) {
|
|
name := strings.TrimSpace(s.Text())
|
|
// Skip non-person text like "Internetkoordinator"
|
|
if name == "" || seen[name] || !c.looksLikeName(name) {
|
|
return
|
|
}
|
|
seen[name] = true
|
|
|
|
person := &database.UniversityStaff{}
|
|
person.FullName = &name
|
|
c.parseName(name, person)
|
|
|
|
if person.LastName != "" {
|
|
href, exists := s.Attr("href")
|
|
if exists {
|
|
fullURL := resolveURL(baseURL, href)
|
|
person.ProfileURL = &fullURL
|
|
}
|
|
staff = append(staff, person)
|
|
}
|
|
})
|
|
|
|
// Pattern 4: Breadcrumb navigation sublinks with person names
|
|
// Format: <ul class="sublinks"><li><a href="/dept/personen/name">Prof. Dr. Name</a></li>
|
|
doc.Find(".sublinks li a, nav#navizeile .sublinks li a").Each(func(i int, s *goquery.Selection) {
|
|
href, exists := s.Attr("href")
|
|
if !exists || !strings.Contains(href, "/personen/") {
|
|
return
|
|
}
|
|
|
|
name := strings.TrimSpace(s.Text())
|
|
if name == "" || seen[name] {
|
|
return
|
|
}
|
|
seen[name] = true
|
|
|
|
person := &database.UniversityStaff{}
|
|
person.FullName = &name
|
|
c.parseName(name, person)
|
|
|
|
if person.LastName != "" {
|
|
fullURL := resolveURL(baseURL, href)
|
|
person.ProfileURL = &fullURL
|
|
staff = append(staff, person)
|
|
}
|
|
})
|
|
|
|
if len(staff) > 0 {
|
|
log.Printf("[UOL Extractor] Found %d staff members using UOL patterns (dept: %s)", len(staff), deptName)
|
|
}
|
|
|
|
return staff
|
|
}
|
|
|
|
// extractFromPersonCards extracts staff from card-style layouts
|
|
func (c *StaffCrawler) extractFromPersonCards(doc *goquery.Document, baseURL string) []*database.UniversityStaff {
|
|
var staff []*database.UniversityStaff
|
|
|
|
// Common card selectors
|
|
cardSelectors := []string{
|
|
".person-card",
|
|
".staff-card",
|
|
".team-member",
|
|
".mitarbeiter",
|
|
".person",
|
|
".employee",
|
|
"[itemtype='http://schema.org/Person']",
|
|
".vcard",
|
|
}
|
|
|
|
for _, selector := range cardSelectors {
|
|
doc.Find(selector).Each(func(i int, s *goquery.Selection) {
|
|
person := c.extractPersonFromElement(s, baseURL)
|
|
if person != nil && person.LastName != "" {
|
|
staff = append(staff, person)
|
|
}
|
|
})
|
|
|
|
if len(staff) > 0 {
|
|
break
|
|
}
|
|
}
|
|
|
|
return staff
|
|
}
|
|
|
|
// extractFromTable extracts staff from table layouts
|
|
func (c *StaffCrawler) extractFromTable(doc *goquery.Document, baseURL string) []*database.UniversityStaff {
|
|
var staff []*database.UniversityStaff
|
|
|
|
doc.Find("table").Each(func(i int, table *goquery.Selection) {
|
|
// Check if this looks like a staff table
|
|
headerText := strings.ToLower(table.Find("th").Text())
|
|
if !strings.Contains(headerText, "name") && !strings.Contains(headerText, "person") {
|
|
return
|
|
}
|
|
|
|
table.Find("tr").Each(func(j int, row *goquery.Selection) {
|
|
if row.Find("th").Length() > 0 {
|
|
return // Skip header row
|
|
}
|
|
|
|
cells := row.Find("td")
|
|
if cells.Length() < 2 {
|
|
return
|
|
}
|
|
|
|
person := &database.UniversityStaff{}
|
|
|
|
// First cell usually contains name
|
|
nameCell := cells.First()
|
|
name := strings.TrimSpace(nameCell.Text())
|
|
person.FullName = &name
|
|
c.parseName(name, person)
|
|
|
|
// Look for email
|
|
row.Find("a[href^='mailto:']").Each(func(k int, a *goquery.Selection) {
|
|
href, _ := a.Attr("href")
|
|
email := strings.TrimPrefix(href, "mailto:")
|
|
person.Email = &email
|
|
})
|
|
|
|
// Look for profile link
|
|
nameCell.Find("a[href]").Each(func(k int, a *goquery.Selection) {
|
|
href, exists := a.Attr("href")
|
|
if exists && !strings.HasPrefix(href, "mailto:") {
|
|
fullURL := resolveURL(baseURL, href)
|
|
person.ProfileURL = &fullURL
|
|
}
|
|
})
|
|
|
|
// Extract position from other cells
|
|
cells.Each(func(k int, cell *goquery.Selection) {
|
|
text := strings.TrimSpace(cell.Text())
|
|
if c.looksLikePosition(text) {
|
|
person.Position = &text
|
|
person.PositionType = c.classifyPosition(text)
|
|
person.IsProfessor = c.isProfessor(text)
|
|
}
|
|
})
|
|
|
|
if person.LastName != "" {
|
|
staff = append(staff, person)
|
|
}
|
|
})
|
|
})
|
|
|
|
return staff
|
|
}
|
|
|
|
// extractFromList extracts staff from list layouts
|
|
func (c *StaffCrawler) extractFromList(doc *goquery.Document, baseURL string) []*database.UniversityStaff {
|
|
var staff []*database.UniversityStaff
|
|
|
|
listSelectors := []string{"ul.staff", "ul.team", "ul.mitarbeiter", ".staff-list li", ".team-list li"}
|
|
|
|
for _, selector := range listSelectors {
|
|
doc.Find(selector).Each(func(i int, li *goquery.Selection) {
|
|
person := c.extractPersonFromElement(li, baseURL)
|
|
if person != nil && person.LastName != "" {
|
|
staff = append(staff, person)
|
|
}
|
|
})
|
|
|
|
if len(staff) > 0 {
|
|
break
|
|
}
|
|
}
|
|
|
|
return staff
|
|
}
|
|
|
|
// extractFromVCard extracts staff from vCard/hCard microformats
|
|
func (c *StaffCrawler) extractFromVCard(doc *goquery.Document, baseURL string) []*database.UniversityStaff {
|
|
var staff []*database.UniversityStaff
|
|
|
|
doc.Find(".vcard, .h-card").Each(func(i int, s *goquery.Selection) {
|
|
person := &database.UniversityStaff{}
|
|
|
|
// Name
|
|
fn := s.Find(".fn, .p-name").Text()
|
|
if fn != "" {
|
|
person.FullName = &fn
|
|
c.parseName(fn, person)
|
|
}
|
|
|
|
// Email
|
|
email := s.Find(".email, .u-email").Text()
|
|
if email == "" {
|
|
s.Find("a[href^='mailto:']").Each(func(j int, a *goquery.Selection) {
|
|
href, _ := a.Attr("href")
|
|
email = strings.TrimPrefix(href, "mailto:")
|
|
})
|
|
}
|
|
if email != "" {
|
|
person.Email = &email
|
|
}
|
|
|
|
// Title/Position
|
|
title := s.Find(".title, .p-job-title, .role").Text()
|
|
if title != "" {
|
|
person.Position = &title
|
|
person.PositionType = c.classifyPosition(title)
|
|
person.IsProfessor = c.isProfessor(title)
|
|
}
|
|
|
|
// Photo
|
|
s.Find(".photo, .u-photo, img").Each(func(j int, img *goquery.Selection) {
|
|
src, exists := img.Attr("src")
|
|
if exists {
|
|
fullURL := resolveURL(baseURL, src)
|
|
person.PhotoURL = &fullURL
|
|
}
|
|
})
|
|
|
|
// Profile URL
|
|
s.Find("a[href].url, a[href].u-url").Each(func(j int, a *goquery.Selection) {
|
|
href, exists := a.Attr("href")
|
|
if exists {
|
|
fullURL := resolveURL(baseURL, href)
|
|
person.ProfileURL = &fullURL
|
|
}
|
|
})
|
|
|
|
if person.LastName != "" {
|
|
staff = append(staff, person)
|
|
}
|
|
})
|
|
|
|
return staff
|
|
}
|
|
|
|
// extractPersonFromElement extracts a person from a generic HTML element
|
|
func (c *StaffCrawler) extractPersonFromElement(s *goquery.Selection, baseURL string) *database.UniversityStaff {
|
|
person := &database.UniversityStaff{}
|
|
|
|
// Try to find name
|
|
nameSelectors := []string{".name", ".person-name", "h2", "h3", "h4", ".title", "strong", "b"}
|
|
for _, sel := range nameSelectors {
|
|
name := strings.TrimSpace(s.Find(sel).First().Text())
|
|
if name != "" && len(name) < 100 && !c.looksLikePosition(name) {
|
|
person.FullName = &name
|
|
c.parseName(name, person)
|
|
break
|
|
}
|
|
}
|
|
|
|
// If no name found, try the whole text
|
|
if person.LastName == "" {
|
|
text := strings.TrimSpace(s.Text())
|
|
lines := strings.Split(text, "\n")
|
|
if len(lines) > 0 {
|
|
firstLine := strings.TrimSpace(lines[0])
|
|
if len(firstLine) > 0 && len(firstLine) < 100 {
|
|
person.FullName = &firstLine
|
|
c.parseName(firstLine, person)
|
|
}
|
|
}
|
|
}
|
|
|
|
// Extract email
|
|
s.Find("a[href^='mailto:']").Each(func(i int, a *goquery.Selection) {
|
|
href, _ := a.Attr("href")
|
|
email := strings.TrimPrefix(href, "mailto:")
|
|
email = strings.Split(email, "?")[0] // Remove query params
|
|
person.Email = &email
|
|
})
|
|
|
|
// Extract position
|
|
positionSelectors := []string{".position", ".role", ".job-title", ".funktion", "small", ".subtitle"}
|
|
for _, sel := range positionSelectors {
|
|
pos := strings.TrimSpace(s.Find(sel).First().Text())
|
|
if pos != "" && c.looksLikePosition(pos) {
|
|
person.Position = &pos
|
|
person.PositionType = c.classifyPosition(pos)
|
|
person.IsProfessor = c.isProfessor(pos)
|
|
break
|
|
}
|
|
}
|
|
|
|
// Extract photo
|
|
s.Find("img").Each(func(i int, img *goquery.Selection) {
|
|
src, exists := img.Attr("src")
|
|
if exists && !strings.Contains(src, "placeholder") && !strings.Contains(src, "icon") {
|
|
fullURL := resolveURL(baseURL, src)
|
|
person.PhotoURL = &fullURL
|
|
}
|
|
})
|
|
|
|
// Extract profile link
|
|
s.Find("a[href]").Each(func(i int, a *goquery.Selection) {
|
|
href, exists := a.Attr("href")
|
|
if exists && !strings.HasPrefix(href, "mailto:") && !strings.HasPrefix(href, "tel:") {
|
|
fullURL := resolveURL(baseURL, href)
|
|
if person.ProfileURL == nil {
|
|
person.ProfileURL = &fullURL
|
|
}
|
|
}
|
|
})
|
|
|
|
return person
|
|
}
|