package staff

import (
	"bytes"
	"context"
	"log"
	"net/http"
	"strings"

	"github.com/PuerkitoBio/goquery"

	"github.com/breakpilot/edu-search-service/internal/database"
)

// findStaffPages discovers staff listing pages on a university website.
func (c *StaffCrawler) findStaffPages(ctx context.Context, uni *database.University) ([]string, error) {
	var pages []string

	// Use the custom pattern if one is configured for this university.
	if uni.StaffPagePattern != nil && *uni.StaffPagePattern != "" {
		pages = append(pages, *uni.StaffPagePattern)
		return pages, nil
	}

	// Probe common staff-page paths (German and English variants).
	baseURL := strings.TrimSuffix(uni.URL, "/")
	commonPaths := []string{
		"/personen",
		"/team",
		"/mitarbeiter",
		"/mitarbeitende",
		"/staff",
		"/people",
		"/ueber-uns/team",
		"/about/team",
		"/fakultaet/personen",
		"/institute",
	}

	for _, path := range commonPaths {
		testURL := baseURL + path
		exists, err := c.checkPageExists(ctx, testURL)
		if err == nil && exists {
			pages = append(pages, testURL)
		}
	}

	// Also look for staff links on the main page.
	mainPageLinks, err := c.findStaffLinksOnPage(ctx, baseURL)
	if err == nil {
		pages = append(pages, mainPageLinks...)
	}

	// UOL-specific: find department/personen pages through site navigation.
	// Match both uol.de and uni-oldenburg.de; they are the same university.
	if strings.Contains(baseURL, "uol.de") || strings.Contains(baseURL, "uni-oldenburg.de") {
		log.Printf("[UOL] Detected Uni Oldenburg, using UOL-specific crawler for %s", baseURL)
		uolPages, err := c.findUOLDepartmentPages(ctx, baseURL)
		if err == nil {
			log.Printf("[UOL] Found %d department pages", len(uolPages))
			pages = append(pages, uolPages...)
		} else {
			log.Printf("[UOL] Error finding department pages: %v", err)
		}
	}

	// Deduplicate while preserving discovery order.
	seen := make(map[string]bool)
	var unique []string
	for _, p := range pages {
		if !seen[p] {
			seen[p] = true
			unique = append(unique, p)
		}
	}

	return unique, nil
}

// findUOLDepartmentPages finds department person pages for Uni Oldenburg.
// UOL serves content under both uol.de and uni-oldenburg.de, and its
// departments typically expose /personen or /team subpages.
func (c *StaffCrawler) findUOLDepartmentPages(ctx context.Context, baseURL string) ([]string, error) {
	var pages []string

	// Helper to check whether a URL belongs to UOL.
	isUOLURL := func(u string) bool {
		lower := strings.ToLower(u)
		return strings.Contains(lower, "uol.de") || strings.Contains(lower, "uni-oldenburg.de")
	}

	// First, collect department links from known starting points.
	startPages := []string{
		"https://uol.de/informatik/department/abteilungen-und-einrichtungen",
		"https://uol.de/fk2",
		"https://uol.de/fk1",
		"https://uol.de/fk3",
		"https://uol.de/fk4",
		"https://uol.de/fk5",
		"https://uol.de/fk6",
		baseURL,
	}

	deptPaths := make(map[string]bool)

	for _, startURL := range startPages {
		log.Printf("[UOL] Scanning start page: %s", startURL)

		body, err := c.fetchPage(ctx, startURL)
		if err != nil {
			log.Printf("[UOL] Error fetching %s: %v", startURL, err)
			continue
		}

		doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
		if err != nil {
			continue
		}

		// Collect department-like links; departments typically have /personen
		// subpages. The filter is deliberately broad and only excludes obvious
		// non-department sections; candidates that do not exist are weeded out
		// by checkPageExists below.
		doc.Find("a[href]").Each(func(i int, s *goquery.Selection) {
			href, exists := s.Attr("href")
			if !exists {
				return
			}

			hrefLower := strings.ToLower(href)
			isDeptPath := isUOLURL(href) &&
				!strings.Contains(hrefLower, "/studium") &&
				!strings.Contains(hrefLower, "/forschung") &&
				!strings.Contains(hrefLower, "/aktuelles") &&
				!strings.Contains(hrefLower, "/kontakt")

			if isDeptPath {
				fullURL := resolveURL(startURL, href)
				if fullURL != "" && isUOLURL(fullURL) {
					// Add the personen page for this department.
					personenURL := strings.TrimSuffix(fullURL, "/") + "/personen"
					deptPaths[personenURL] = true
				}
			}
		})

		// Also pick up direct /personen or /team links.
		doc.Find("a[href*='/personen'], a[href*='/team']").Each(func(i int, s *goquery.Selection) {
			href, exists := s.Attr("href")
			if exists {
				fullURL := resolveURL(startURL, href)
				if fullURL != "" && isUOLURL(fullURL) {
					deptPaths[fullURL] = true
				}
			}
		})
	}

	// Seed well-known department personen pages directly; these are known to exist.
	knownDepts := []string{
		"https://uol.de/socps/personen",
		"https://uol.de/vlba/team",
		"https://uol.de/informatik/department",
		"https://uol.de/se/team",
		"https://uol.de/ei/personen",
		"https://uol.de/is/team",
		"https://uol.de/paedagogik/personen",
		"https://uol.de/psychologie/personen",
		"https://uol.de/germanistik/personen",
		"https://uol.de/physik/personen",
		"https://uol.de/chemie/personen",
		"https://uol.de/biologie/personen",
		"https://uol.de/mathe/personen",
	}
	for _, dept := range knownDepts {
		deptPaths[dept] = true
	}

	log.Printf("[UOL] Checking %d potential department pages", len(deptPaths))

	// Keep only the candidate pages that actually exist.
	for path := range deptPaths {
		exists, err := c.checkPageExists(ctx, path)
		if err == nil && exists {
			log.Printf("[UOL] Found valid page: %s", path)
			pages = append(pages, path)
		}
	}

	log.Printf("[UOL] Found %d valid department/personen pages", len(pages))
	return pages, nil
}

// checkPageExists reports whether a URL answers a HEAD request with HTTP 200.
func (c *StaffCrawler) checkPageExists(ctx context.Context, urlStr string) (bool, error) {
	c.waitForRateLimit(urlStr)

	req, err := http.NewRequestWithContext(ctx, http.MethodHead, urlStr, nil)
	if err != nil {
		return false, err
	}
	req.Header.Set("User-Agent", c.userAgent)

	resp, err := c.client.Do(req)
	if err != nil {
		return false, err
	}
	defer resp.Body.Close()

	return resp.StatusCode == http.StatusOK, nil
}

// findStaffLinksOnPage finds links to staff pages on a given page by matching
// staff-related keywords in either the link text or the href.
func (c *StaffCrawler) findStaffLinksOnPage(ctx context.Context, pageURL string) ([]string, error) {
	body, err := c.fetchPage(ctx, pageURL)
	if err != nil {
		return nil, err
	}

	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
	if err != nil {
		return nil, err
	}

	var links []string
	staffKeywords := []string{"team", "personen", "mitarbeiter", "staff", "people", "dozent", "professor"}

	doc.Find("a[href]").Each(func(i int, s *goquery.Selection) {
		href, exists := s.Attr("href")
		if !exists {
			return
		}

		text := strings.ToLower(s.Text())
		hrefLower := strings.ToLower(href)

		for _, keyword := range staffKeywords {
			if strings.Contains(text, keyword) || strings.Contains(hrefLower, keyword) {
				fullURL := resolveURL(pageURL, href)
				if fullURL != "" {
					links = append(links, fullURL)
				}
				break
			}
		}
	})

	return links, nil
}
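
// resolveURL is called throughout this file but defined elsewhere in the
// package. For orientation, a minimal sketch of the assumed contract
// (resolve a possibly relative href against the page it was found on,
// returning "" when either URL fails to parse) could look like the
// following, using the standard net/url package. This is an assumption
// about the helper's behavior, not necessarily its actual implementation:
//
//	func resolveURL(base, href string) string {
//		baseURL, err := url.Parse(base)
//		if err != nil {
//			return ""
//		}
//		ref, err := url.Parse(href)
//		if err != nil {
//			return ""
//		}
//		// ResolveReference handles absolute hrefs, relative paths, and
//		// fragment-only links per RFC 3986.
//		return baseURL.ResolveReference(ref).String()
//	}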