package staff

import (
	"bytes"
	"context"
	"fmt"
	"io"
	"log"
	"net/http"
	"net/url"
	"regexp"
	"strings"
	"sync"
	"time"

	"github.com/PuerkitoBio/goquery"
	"github.com/google/uuid"
	"github.com/breakpilot/edu-search-service/internal/database"
)

// StaffCrawler crawls university websites for staff information.
// A single instance is reused across universities; per-domain request
// pacing state lives in lastFetch.
type StaffCrawler struct {
	client    *http.Client
	userAgent string
	rateLimit time.Duration        // minimum pause between two requests to the same domain
	lastFetch map[string]time.Time // NOTE(review): presumably keyed by host — confirm in waitForRateLimit
	mu        sync.Mutex           // guards lastFetch
	repo      *database.Repository
	patterns  *UniversityPatterns
}

// CrawlResult contains the result of a staff crawl.
type CrawlResult struct {
	UniversityID uuid.UUID
	StaffFound   int      // staff records successfully persisted via the repository
	StaffNew     int      // subset of StaffFound that was newly created
	StaffUpdated int      // subset of StaffFound that already existed
	Errors       []string // non-fatal errors collected while crawling
	Duration     time.Duration
}

// NewStaffCrawler creates a new staff crawler with a 30s HTTP timeout,
// a bounded redirect policy, and a 2s per-domain rate limit.
func NewStaffCrawler(repo *database.Repository) *StaffCrawler {
	return &StaffCrawler{
		client: &http.Client{
			Timeout: 30 * time.Second,
			CheckRedirect: func(req *http.Request, via []*http.Request) error {
				// Follow at most 5 redirects to avoid redirect loops.
				if len(via) >= 5 {
					return fmt.Errorf("too many redirects")
				}
				return nil
			},
		},
		userAgent: "BreakPilot-EduBot/1.0 (+https://breakpilot.de/bot)",
		rateLimit: 2 * time.Second, // 0.5 requests/second per domain
		lastFetch: make(map[string]time.Time),
		repo:      repo,
		patterns:  NewUniversityPatterns(),
	}
}

// CrawlUniversity crawls a single university for staff: it discovers staff
// listing pages, extracts staff from each, persists them, and records the
// crawl status. Page-level failures are collected in result.Errors rather
// than aborting the whole crawl.
func (c *StaffCrawler) CrawlUniversity(ctx context.Context, uni *database.University) (*CrawlResult, error) {
	start := time.Now()
	result := &CrawlResult{
		UniversityID: uni.ID,
	}
	log.Printf("Starting staff crawl for %s (%s)", uni.Name, uni.URL)

	// Find staff pages
	staffPages, err := c.findStaffPages(ctx, uni)
	if err != nil {
		result.Errors = append(result.Errors, fmt.Sprintf("Failed to find staff pages: %v", err))
		return result, err
	}
	log.Printf("Found %d potential staff pages for %s", len(staffPages), uni.Name)

	// Crawl each staff page
	for _, pageURL := range staffPages {
		// Bail out promptly if the caller cancelled the crawl.
		select {
		case <-ctx.Done():
			result.Errors = append(result.Errors, "Crawl cancelled")
			return result, ctx.Err()
		default:
		}
		staffMembers, err :=
c.extractStaffFromPage(ctx, pageURL, uni) if err != nil { result.Errors = append(result.Errors, fmt.Sprintf("Error crawling %s: %v", pageURL, err)) continue } for _, staff := range staffMembers { staff.UniversityID = uni.ID staff.SourceURL = &pageURL err := c.repo.CreateStaff(ctx, staff) if err != nil { errMsg := fmt.Sprintf("Error saving staff %s %s: %v", stringValue(staff.FirstName), staff.LastName, err) result.Errors = append(result.Errors, errMsg) // Log first few errors for debugging if len(result.Errors) <= 5 { log.Printf("[StaffCrawler] %s", errMsg) } continue } result.StaffFound++ if staff.CreatedAt.Equal(staff.UpdatedAt) { result.StaffNew++ } else { result.StaffUpdated++ } } } result.Duration = time.Since(start) // Note: Profile enrichment is now handled by AI (vast.ai) // The crawler only extracts profile links, AI extracts detailed data log.Printf("Crawl complete. Profile enrichment will be handled by AI extraction.") // Update crawl status now := time.Now() status := &database.UniversityCrawlStatus{ UniversityID: uni.ID, LastStaffCrawl: &now, StaffCrawlStatus: "completed", StaffCount: result.StaffFound, StaffErrors: result.Errors, } if err := c.repo.UpdateCrawlStatus(ctx, status); err != nil { log.Printf("Warning: Failed to update crawl status: %v", err) } log.Printf("Completed staff crawl for %s: found=%d, new=%d, updated=%d, errors=%d, duration=%v", uni.Name, result.StaffFound, result.StaffNew, result.StaffUpdated, len(result.Errors), result.Duration) return result, nil } // findStaffPages discovers staff listing pages on a university website func (c *StaffCrawler) findStaffPages(ctx context.Context, uni *database.University) ([]string, error) { var pages []string // Use custom pattern if available if uni.StaffPagePattern != nil && *uni.StaffPagePattern != "" { pages = append(pages, *uni.StaffPagePattern) return pages, nil } // Try common patterns baseURL := strings.TrimSuffix(uni.URL, "/") commonPaths := []string{ "/personen", "/team", 
"/mitarbeiter", "/mitarbeitende", "/staff", "/people", "/ueber-uns/team", "/about/team", "/fakultaet/personen", "/institute", } for _, path := range commonPaths { testURL := baseURL + path exists, err := c.checkPageExists(ctx, testURL) if err == nil && exists { pages = append(pages, testURL) } } // Also try to find staff links on the main page mainPageLinks, err := c.findStaffLinksOnPage(ctx, baseURL) if err == nil { pages = append(pages, mainPageLinks...) } // UOL-specific: Find department/personen pages through navigation // Check for both uol.de and uni-oldenburg.de (they are the same university) if strings.Contains(baseURL, "uol.de") || strings.Contains(baseURL, "uni-oldenburg.de") { log.Printf("[UOL] Detected Uni Oldenburg, using UOL-specific crawler for %s", baseURL) uolPages, err := c.findUOLDepartmentPages(ctx, baseURL) if err == nil { log.Printf("[UOL] Found %d department pages", len(uolPages)) pages = append(pages, uolPages...) } else { log.Printf("[UOL] Error finding department pages: %v", err) } } // Deduplicate seen := make(map[string]bool) var unique []string for _, p := range pages { if !seen[p] { seen[p] = true unique = append(unique, p) } } return unique, nil } // findUOLDepartmentPages finds department person pages for Uni Oldenburg func (c *StaffCrawler) findUOLDepartmentPages(ctx context.Context, baseURL string) ([]string, error) { var pages []string // UOL uses both uol.de and uni-oldenburg.de domains // Departments have /personen or /team subpages // Helper to check if URL is UOL-related isUOLURL := func(url string) bool { lower := strings.ToLower(url) return strings.Contains(lower, "uol.de") || strings.Contains(lower, "uni-oldenburg.de") } // First try to find department links from known starting points startPages := []string{ "https://uol.de/informatik/department/abteilungen-und-einrichtungen", "https://uol.de/fk2", "https://uol.de/fk1", "https://uol.de/fk3", "https://uol.de/fk4", "https://uol.de/fk5", "https://uol.de/fk6", baseURL, } 
deptPaths := make(map[string]bool) // candidate URLs; map gives set semantics
	for _, startURL := range startPages {
		log.Printf("[UOL] Scanning start page: %s", startURL)
		body, err := c.fetchPage(ctx, startURL)
		if err != nil {
			log.Printf("[UOL] Error fetching %s: %v", startURL, err)
			continue
		}
		doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
		if err != nil {
			// Unparseable HTML — skip this start page silently.
			continue
		}
		// Find links to department pages (they typically have /personen subpages)
		doc.Find("a[href]").Each(func(i int, s *goquery.Selection) {
			href, exists := s.Attr("href")
			if !exists {
				return
			}
			// Look for department-like paths: any UOL URL that is not an
			// obviously non-department section. Over-generation is fine —
			// every candidate is verified with checkPageExists later.
			hrefLower := strings.ToLower(href)
			isDeptPath := isUOLURL(href) &&
				!strings.Contains(hrefLower, "/studium") &&
				!strings.Contains(hrefLower, "/forschung") &&
				!strings.Contains(hrefLower, "/aktuelles") &&
				!strings.Contains(hrefLower, "/kontakt")
			if isDeptPath {
				fullURL := resolveURL(startURL, href)
				if fullURL != "" && isUOLURL(fullURL) {
					// Add personen page for this department
					personenURL := strings.TrimSuffix(fullURL, "/") + "/personen"
					deptPaths[personenURL] = true
				}
			}
		})
		// Also look for direct /personen or /team links
		doc.Find("a[href*='/personen'], a[href*='/team']").Each(func(i int, s *goquery.Selection) {
			href, exists := s.Attr("href")
			if exists {
				fullURL := resolveURL(startURL, href)
				if fullURL != "" && isUOLURL(fullURL) {
					deptPaths[fullURL] = true
				}
			}
		})
	}

	// Add well-known department personen pages directly (these exist for sure)
	knownDepts := []string{
		"https://uol.de/socps/personen",
		"https://uol.de/vlba/team",
		"https://uol.de/informatik/department",
		"https://uol.de/se/team",
		"https://uol.de/ei/personen",
		"https://uol.de/is/team",
		"https://uol.de/paedagogik/personen",
		"https://uol.de/psychologie/personen",
		"https://uol.de/germanistik/personen",
		"https://uol.de/physik/personen",
		"https://uol.de/chemie/personen",
		"https://uol.de/biologie/personen",
		"https://uol.de/mathe/personen",
	}
	for _, dept := range knownDepts {
		deptPaths[dept] = true
	}
	log.Printf("[UOL] Checking %d potential department pages",
len(deptPaths)) // Verify which pages actually exist for path := range deptPaths { exists, err := c.checkPageExists(ctx, path) if err == nil && exists { log.Printf("[UOL] Found valid page: %s", path) pages = append(pages, path) } } log.Printf("[UOL] Found %d valid department/personen pages", len(pages)) return pages, nil } // checkPageExists checks if a URL returns a 200 status func (c *StaffCrawler) checkPageExists(ctx context.Context, urlStr string) (bool, error) { c.waitForRateLimit(urlStr) req, err := http.NewRequestWithContext(ctx, "HEAD", urlStr, nil) if err != nil { return false, err } req.Header.Set("User-Agent", c.userAgent) resp, err := c.client.Do(req) if err != nil { return false, err } defer resp.Body.Close() return resp.StatusCode == http.StatusOK, nil } // findStaffLinksOnPage finds links to staff pages on a given page func (c *StaffCrawler) findStaffLinksOnPage(ctx context.Context, pageURL string) ([]string, error) { body, err := c.fetchPage(ctx, pageURL) if err != nil { return nil, err } doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body)) if err != nil { return nil, err } var links []string staffKeywords := []string{"team", "personen", "mitarbeiter", "staff", "people", "dozent", "professor"} doc.Find("a[href]").Each(func(i int, s *goquery.Selection) { href, exists := s.Attr("href") if !exists { return } text := strings.ToLower(s.Text()) hrefLower := strings.ToLower(href) for _, keyword := range staffKeywords { if strings.Contains(text, keyword) || strings.Contains(hrefLower, keyword) { fullURL := resolveURL(pageURL, href) if fullURL != "" { links = append(links, fullURL) } break } } }) return links, nil } // extractStaffFromPage extracts staff information from a staff listing page func (c *StaffCrawler) extractStaffFromPage(ctx context.Context, pageURL string, uni *database.University) ([]*database.UniversityStaff, error) { body, err := c.fetchPage(ctx, pageURL) if err != nil { return nil, err } doc, err := 
goquery.NewDocumentFromReader(bytes.NewReader(body))
	if err != nil {
		return nil, err
	}

	var staff []*database.UniversityStaff

	// Try different extraction strategies. Every strategy that matches
	// contributes results. NOTE(review): duplicates across strategies are
	// possible here — presumably resolved downstream (e.g. by CreateStaff);
	// confirm.
	extractors := []func(*goquery.Document, string) []*database.UniversityStaff{
		c.extractFromUOLPatterns, // UOL-specific patterns first
		c.extractFromPersonCards,
		c.extractFromTable,
		c.extractFromList,
		c.extractFromVCard,
	}
	for _, extractor := range extractors {
		extracted := extractor(doc, pageURL)
		if len(extracted) > 0 {
			staff = append(staff, extracted...)
		}
	}
	return staff, nil
}

// extractFromUOLPatterns extracts staff using Uni Oldenburg specific patterns
// UOL uses: nav#left-nav for person lists, p.mit-icon.person for person links,
// and /suche/person?username=XXX for person API
// Also captures hierarchy from section headers (Leitung, Mitarbeiter, etc.)
func (c *StaffCrawler) extractFromUOLPatterns(doc *goquery.Document, baseURL string) []*database.UniversityStaff {
	var staff []*database.UniversityStaff
	seen := make(map[string]bool) // names already emitted; dedups across patterns

	// Extract department name from page title or breadcrumb.
	// NOTE(review): deptName is not used in the visible part of this
	// function — confirm it is consumed further down.
	deptName := ""
	doc.Find("h1").First().Each(func(i int, s *goquery.Selection) {
		deptName = strings.TrimSpace(s.Text())
	})

	// Pattern 5 (NEW): Parse content with hierarchy headers
	// UOL pages have structure like:
	// #### Leitung
	//
	// #### Wissenschaftliche Mitarbeiterinnen und Mitarbeiter
	currentRole := ""
	var leaderName string // Track the department head for supervisor assignment

	// Walk through content area looking for headers and lists; headers set
	// currentRole, subsequent links are attributed to that section.
	doc.Find("#content h4, #content h3, #content ul li a, .inhalt h4, .inhalt h3, .inhalt ul li a").Each(func(i int, s *goquery.Selection) {
		tagName := goquery.NodeName(s)
		// Check if this is a section header
		if tagName == "h3" || tagName == "h4" {
			headerText := strings.ToLower(strings.TrimSpace(s.Text()))
			if strings.Contains(headerText, "leitung") {
				currentRole = "leitung"
			} else if strings.Contains(headerText, "sekretariat") {
				currentRole = "sekretariat"
			} else if
strings.Contains(headerText, "wissenschaftlich") || strings.Contains(headerText, "mitarbeiter") {
				currentRole = "mitarbeiter"
			} else if strings.Contains(headerText, "doktorand") || strings.Contains(headerText, "promovierend") {
				currentRole = "doktorand"
			} else if strings.Contains(headerText, "technisch") {
				currentRole = "technisch"
			} else if strings.Contains(headerText, "extern") {
				currentRole = "extern"
			} else if strings.Contains(headerText, "student") || strings.Contains(headerText, "hilfskr") || strings.Contains(headerText, "hiwi") {
				currentRole = "hiwi"
			}
			return
		}

		// Process person links under current header
		if tagName == "a" {
			href, exists := s.Attr("href")
			if !exists {
				return
			}
			// Check if this looks like a person page link
			if !strings.Contains(href, "/personen/") && !strings.Contains(href, "suche/person") {
				return
			}
			name := strings.TrimSpace(s.Text())
			if name == "" || seen[name] || !c.looksLikeName(name) {
				return
			}
			seen[name] = true
			person := &database.UniversityStaff{}
			person.FullName = &name
			c.parseName(name, person)
			if person.LastName != "" {
				fullURL := resolveURL(baseURL, href)
				person.ProfileURL = &fullURL
				// Set team role based on current section.
				// BUG FIX: the source contained the mojibake "¤tRole"
				// (an entity-mangled "&currentRole"). Restoring the plain
				// address-of would still be wrong: currentRole keeps mutating
				// as later section headers are visited, so each person needs
				// an independent copy (same pattern as posType below).
				if currentRole != "" {
					role := currentRole
					person.TeamRole = &role
				}
				// Track leader for supervisor assignment
				if currentRole == "leitung" && leaderName == "" {
					leaderName = name
					person.IsProfessor = true
					posType := "professor"
					person.PositionType = &posType
				}
				staff = append(staff, person)
			}
		}
	})

	// Pattern 1: nav#left-nav ul li a - side navigation with person links
	// Format: /abteilung/personen/prof-dr-name or /abteilung/personen/m-sc-name
	doc.Find("nav#left-nav ul li a, #left-navi li a").Each(func(i int, s *goquery.Selection) {
		href, exists := s.Attr("href")
		if !exists {
			return
		}
		// Check if this looks like a person page link
		if !strings.Contains(href, "/personen/") {
			return
		}
		name := strings.TrimSpace(s.Text())
		if name == "" || seen[name] {
			return
		}
		seen[name] = true
		person := &database.UniversityStaff{}
		person.FullName =
&name c.parseName(name, person) if person.LastName != "" { fullURL := resolveURL(baseURL, href) person.ProfileURL = &fullURL staff = append(staff, person) } }) // Pattern 2: p.mit-icon.person a - inline person references // Format:

//   <p class="mit-icon person"><a href="/pfad/zur/person">Prof. Dr. Name</a></p>
// OR:
//   <p class="mit-icon person"><a href="/suche/person?username=xyz">Prof. Dr. Name</a></p>
// (NOTE(review): HTML examples reconstructed from the selectors below — the
// original markup in this comment was stripped during extraction.)

doc.Find("p.mit-icon.person a, .mit-icon.person a").Each(func(i int, s *goquery.Selection) { name := strings.TrimSpace(s.Text()) if name == "" || seen[name] { return } seen[name] = true person := &database.UniversityStaff{} person.FullName = &name c.parseName(name, person) if person.LastName != "" { href, exists := s.Attr("href") if exists { fullURL := resolveURL(baseURL, href) person.ProfileURL = &fullURL } staff = append(staff, person) } }) // Pattern 3: Links to /suche/person?username=XXX doc.Find("a[href*='suche/person']").Each(func(i int, s *goquery.Selection) { name := strings.TrimSpace(s.Text()) // Skip non-person text like "Internetkoordinator" if name == "" || seen[name] || !c.looksLikeName(name) { return } seen[name] = true person := &database.UniversityStaff{} person.FullName = &name c.parseName(name, person) if person.LastName != "" { href, exists := s.Attr("href") if exists { fullURL := resolveURL(baseURL, href) person.ProfileURL = &fullURL } staff = append(staff, person) } }) // Pattern 4: Breadcrumb navigation sublinks with person names // Format: