package staff import ( "bytes" "context" "log" "strings" "github.com/PuerkitoBio/goquery" "github.com/breakpilot/edu-search-service/internal/database" ) // extractStaffFromPage extracts staff information from a staff listing page func (c *StaffCrawler) extractStaffFromPage(ctx context.Context, pageURL string, uni *database.University) ([]*database.UniversityStaff, error) { body, err := c.fetchPage(ctx, pageURL) if err != nil { return nil, err } doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body)) if err != nil { return nil, err } var staff []*database.UniversityStaff // Try different extraction strategies extractors := []func(*goquery.Document, string) []*database.UniversityStaff{ c.extractFromUOLPatterns, // UOL-specific patterns first c.extractFromPersonCards, c.extractFromTable, c.extractFromList, c.extractFromVCard, } for _, extractor := range extractors { extracted := extractor(doc, pageURL) if len(extracted) > 0 { staff = append(staff, extracted...) } } return staff, nil } // extractFromUOLPatterns extracts staff using Uni Oldenburg specific patterns // UOL uses: nav#left-nav for person lists, p.mit-icon.person for person links, // and /suche/person?username=XXX for person API // Also captures hierarchy from section headers (Leitung, Mitarbeiter, etc.) 
func (c *StaffCrawler) extractFromUOLPatterns(doc *goquery.Document, baseURL string) []*database.UniversityStaff { var staff []*database.UniversityStaff seen := make(map[string]bool) // Extract department name from page title or breadcrumb deptName := "" doc.Find("h1").First().Each(func(i int, s *goquery.Selection) { deptName = strings.TrimSpace(s.Text()) }) // Pattern 5 (NEW): Parse content with hierarchy headers // UOL pages have structure like: // #### Leitung // // #### Wissenschaftliche Mitarbeiterinnen und Mitarbeiter // currentRole := "" var leaderName string // Track the department head for supervisor assignment // Walk through content area looking for headers and lists doc.Find("#content h4, #content h3, #content ul li a, .inhalt h4, .inhalt h3, .inhalt ul li a").Each(func(i int, s *goquery.Selection) { tagName := goquery.NodeName(s) // Check if this is a section header if tagName == "h3" || tagName == "h4" { headerText := strings.ToLower(strings.TrimSpace(s.Text())) if strings.Contains(headerText, "leitung") { currentRole = "leitung" } else if strings.Contains(headerText, "sekretariat") { currentRole = "sekretariat" } else if strings.Contains(headerText, "wissenschaftlich") || strings.Contains(headerText, "mitarbeiter") { currentRole = "mitarbeiter" } else if strings.Contains(headerText, "doktorand") || strings.Contains(headerText, "promovierend") { currentRole = "doktorand" } else if strings.Contains(headerText, "technisch") { currentRole = "technisch" } else if strings.Contains(headerText, "extern") { currentRole = "extern" } else if strings.Contains(headerText, "student") || strings.Contains(headerText, "hilfskr") || strings.Contains(headerText, "hiwi") { currentRole = "hiwi" } return } // Process person links under current header if tagName == "a" { href, exists := s.Attr("href") if !exists { return } // Check if this looks like a person page link if !strings.Contains(href, "/personen/") && !strings.Contains(href, "suche/person") { return } name := 
strings.TrimSpace(s.Text()) if name == "" || seen[name] || !c.looksLikeName(name) { return } seen[name] = true person := &database.UniversityStaff{} person.FullName = &name c.parseName(name, person) if person.LastName != "" { fullURL := resolveURL(baseURL, href) person.ProfileURL = &fullURL // Set team role based on current section if currentRole != "" { person.TeamRole = ¤tRole } // Track leader for supervisor assignment if currentRole == "leitung" && leaderName == "" { leaderName = name person.IsProfessor = true posType := "professor" person.PositionType = &posType } staff = append(staff, person) } } }) // Pattern 1: nav#left-nav ul li a - side navigation with person links // Format: /abteilung/personen/prof-dr-name or /abteilung/personen/m-sc-name doc.Find("nav#left-nav ul li a, #left-navi li a").Each(func(i int, s *goquery.Selection) { href, exists := s.Attr("href") if !exists { return } // Check if this looks like a person page link if !strings.Contains(href, "/personen/") { return } name := strings.TrimSpace(s.Text()) if name == "" || seen[name] { return } seen[name] = true person := &database.UniversityStaff{} person.FullName = &name c.parseName(name, person) if person.LastName != "" { fullURL := resolveURL(baseURL, href) person.ProfileURL = &fullURL staff = append(staff, person) } }) // Pattern 2: p.mit-icon.person a - inline person references // Format:

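	//   <p class="mit-icon person"><a href="...">Prof. Dr. Name</a></p>
	//
	// OR (any other element carrying both classes; markup reconstructed from
	// the selectors used below, the original example was lost):
	//
	//   <span class="mit-icon person"><a href="...">Prof. Dr. Name</a></span>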

doc.Find("p.mit-icon.person a, .mit-icon.person a").Each(func(i int, s *goquery.Selection) { name := strings.TrimSpace(s.Text()) if name == "" || seen[name] { return } seen[name] = true person := &database.UniversityStaff{} person.FullName = &name c.parseName(name, person) if person.LastName != "" { href, exists := s.Attr("href") if exists { fullURL := resolveURL(baseURL, href) person.ProfileURL = &fullURL } staff = append(staff, person) } }) // Pattern 3: Links to /suche/person?username=XXX doc.Find("a[href*='suche/person']").Each(func(i int, s *goquery.Selection) { name := strings.TrimSpace(s.Text()) // Skip non-person text like "Internetkoordinator" if name == "" || seen[name] || !c.looksLikeName(name) { return } seen[name] = true person := &database.UniversityStaff{} person.FullName = &name c.parseName(name, person) if person.LastName != "" { href, exists := s.Attr("href") if exists { fullURL := resolveURL(baseURL, href) person.ProfileURL = &fullURL } staff = append(staff, person) } }) // Pattern 4: Breadcrumb navigation sublinks with person names // Format: