Fix: Remove broken getKlausurApiUrl and clean up empty lines
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
sed replacement left orphaned hostname references in story page and empty lines in getApiBase functions. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
247
edu-search-service/internal/staff/staff_crawler_discovery.go
Normal file
247
edu-search-service/internal/staff/staff_crawler_discovery.go
Normal file
@@ -0,0 +1,247 @@
|
||||
package staff
|
||||
|
||||
import (
	"bytes"
	"context"
	"log"
	"net/http"
	"sort"
	"strings"

	"github.com/PuerkitoBio/goquery"

	"github.com/breakpilot/edu-search-service/internal/database"
)
|
||||
|
||||
// findStaffPages discovers staff listing pages on a university website
|
||||
func (c *StaffCrawler) findStaffPages(ctx context.Context, uni *database.University) ([]string, error) {
|
||||
var pages []string
|
||||
|
||||
// Use custom pattern if available
|
||||
if uni.StaffPagePattern != nil && *uni.StaffPagePattern != "" {
|
||||
pages = append(pages, *uni.StaffPagePattern)
|
||||
return pages, nil
|
||||
}
|
||||
|
||||
// Try common patterns
|
||||
baseURL := strings.TrimSuffix(uni.URL, "/")
|
||||
commonPaths := []string{
|
||||
"/personen",
|
||||
"/team",
|
||||
"/mitarbeiter",
|
||||
"/mitarbeitende",
|
||||
"/staff",
|
||||
"/people",
|
||||
"/ueber-uns/team",
|
||||
"/about/team",
|
||||
"/fakultaet/personen",
|
||||
"/institute",
|
||||
}
|
||||
|
||||
for _, path := range commonPaths {
|
||||
testURL := baseURL + path
|
||||
exists, err := c.checkPageExists(ctx, testURL)
|
||||
if err == nil && exists {
|
||||
pages = append(pages, testURL)
|
||||
}
|
||||
}
|
||||
|
||||
// Also try to find staff links on the main page
|
||||
mainPageLinks, err := c.findStaffLinksOnPage(ctx, baseURL)
|
||||
if err == nil {
|
||||
pages = append(pages, mainPageLinks...)
|
||||
}
|
||||
|
||||
// UOL-specific: Find department/personen pages through navigation
|
||||
// Check for both uol.de and uni-oldenburg.de (they are the same university)
|
||||
if strings.Contains(baseURL, "uol.de") || strings.Contains(baseURL, "uni-oldenburg.de") {
|
||||
log.Printf("[UOL] Detected Uni Oldenburg, using UOL-specific crawler for %s", baseURL)
|
||||
uolPages, err := c.findUOLDepartmentPages(ctx, baseURL)
|
||||
if err == nil {
|
||||
log.Printf("[UOL] Found %d department pages", len(uolPages))
|
||||
pages = append(pages, uolPages...)
|
||||
} else {
|
||||
log.Printf("[UOL] Error finding department pages: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Deduplicate
|
||||
seen := make(map[string]bool)
|
||||
var unique []string
|
||||
for _, p := range pages {
|
||||
if !seen[p] {
|
||||
seen[p] = true
|
||||
unique = append(unique, p)
|
||||
}
|
||||
}
|
||||
|
||||
return unique, nil
|
||||
}
|
||||
|
||||
// findUOLDepartmentPages finds department person pages for Uni Oldenburg
|
||||
func (c *StaffCrawler) findUOLDepartmentPages(ctx context.Context, baseURL string) ([]string, error) {
|
||||
var pages []string
|
||||
|
||||
// UOL uses both uol.de and uni-oldenburg.de domains
|
||||
// Departments have /personen or /team subpages
|
||||
|
||||
// Helper to check if URL is UOL-related
|
||||
isUOLURL := func(url string) bool {
|
||||
lower := strings.ToLower(url)
|
||||
return strings.Contains(lower, "uol.de") || strings.Contains(lower, "uni-oldenburg.de")
|
||||
}
|
||||
|
||||
// First try to find department links from known starting points
|
||||
startPages := []string{
|
||||
"https://uol.de/informatik/department/abteilungen-und-einrichtungen",
|
||||
"https://uol.de/fk2",
|
||||
"https://uol.de/fk1",
|
||||
"https://uol.de/fk3",
|
||||
"https://uol.de/fk4",
|
||||
"https://uol.de/fk5",
|
||||
"https://uol.de/fk6",
|
||||
baseURL,
|
||||
}
|
||||
|
||||
deptPaths := make(map[string]bool)
|
||||
|
||||
for _, startURL := range startPages {
|
||||
log.Printf("[UOL] Scanning start page: %s", startURL)
|
||||
body, err := c.fetchPage(ctx, startURL)
|
||||
if err != nil {
|
||||
log.Printf("[UOL] Error fetching %s: %v", startURL, err)
|
||||
continue
|
||||
}
|
||||
|
||||
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
// Find links to department pages (they typically have /personen subpages)
|
||||
doc.Find("a[href]").Each(func(i int, s *goquery.Selection) {
|
||||
href, exists := s.Attr("href")
|
||||
if !exists {
|
||||
return
|
||||
}
|
||||
|
||||
// Look for department-like paths
|
||||
hrefLower := strings.ToLower(href)
|
||||
isDeptPath := isUOLURL(href) &&
|
||||
!strings.Contains(hrefLower, "/studium") &&
|
||||
!strings.Contains(hrefLower, "/forschung") &&
|
||||
!strings.Contains(hrefLower, "/aktuelles") &&
|
||||
!strings.Contains(hrefLower, "/kontakt")
|
||||
|
||||
if isDeptPath {
|
||||
fullURL := resolveURL(startURL, href)
|
||||
if fullURL != "" && isUOLURL(fullURL) {
|
||||
// Add personen page for this department
|
||||
personenURL := strings.TrimSuffix(fullURL, "/") + "/personen"
|
||||
deptPaths[personenURL] = true
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
// Also look for direct /personen or /team links
|
||||
doc.Find("a[href*='/personen'], a[href*='/team']").Each(func(i int, s *goquery.Selection) {
|
||||
href, exists := s.Attr("href")
|
||||
if exists {
|
||||
fullURL := resolveURL(startURL, href)
|
||||
if fullURL != "" && isUOLURL(fullURL) {
|
||||
deptPaths[fullURL] = true
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// Add well-known department personen pages directly (these exist for sure)
|
||||
knownDepts := []string{
|
||||
"https://uol.de/socps/personen",
|
||||
"https://uol.de/vlba/team",
|
||||
"https://uol.de/informatik/department",
|
||||
"https://uol.de/se/team",
|
||||
"https://uol.de/ei/personen",
|
||||
"https://uol.de/is/team",
|
||||
"https://uol.de/paedagogik/personen",
|
||||
"https://uol.de/psychologie/personen",
|
||||
"https://uol.de/germanistik/personen",
|
||||
"https://uol.de/physik/personen",
|
||||
"https://uol.de/chemie/personen",
|
||||
"https://uol.de/biologie/personen",
|
||||
"https://uol.de/mathe/personen",
|
||||
}
|
||||
for _, dept := range knownDepts {
|
||||
deptPaths[dept] = true
|
||||
}
|
||||
|
||||
log.Printf("[UOL] Checking %d potential department pages", len(deptPaths))
|
||||
|
||||
// Verify which pages actually exist
|
||||
for path := range deptPaths {
|
||||
exists, err := c.checkPageExists(ctx, path)
|
||||
if err == nil && exists {
|
||||
log.Printf("[UOL] Found valid page: %s", path)
|
||||
pages = append(pages, path)
|
||||
}
|
||||
}
|
||||
|
||||
log.Printf("[UOL] Found %d valid department/personen pages", len(pages))
|
||||
return pages, nil
|
||||
}
|
||||
|
||||
// checkPageExists checks if a URL returns a 200 status
|
||||
func (c *StaffCrawler) checkPageExists(ctx context.Context, urlStr string) (bool, error) {
|
||||
c.waitForRateLimit(urlStr)
|
||||
|
||||
req, err := http.NewRequestWithContext(ctx, "HEAD", urlStr, nil)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
req.Header.Set("User-Agent", c.userAgent)
|
||||
|
||||
resp, err := c.client.Do(req)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
return resp.StatusCode == http.StatusOK, nil
|
||||
}
|
||||
|
||||
// findStaffLinksOnPage finds links to staff pages on a given page
|
||||
func (c *StaffCrawler) findStaffLinksOnPage(ctx context.Context, pageURL string) ([]string, error) {
|
||||
body, err := c.fetchPage(ctx, pageURL)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var links []string
|
||||
staffKeywords := []string{"team", "personen", "mitarbeiter", "staff", "people", "dozent", "professor"}
|
||||
|
||||
doc.Find("a[href]").Each(func(i int, s *goquery.Selection) {
|
||||
href, exists := s.Attr("href")
|
||||
if !exists {
|
||||
return
|
||||
}
|
||||
|
||||
text := strings.ToLower(s.Text())
|
||||
hrefLower := strings.ToLower(href)
|
||||
|
||||
for _, keyword := range staffKeywords {
|
||||
if strings.Contains(text, keyword) || strings.Contains(hrefLower, keyword) {
|
||||
fullURL := resolveURL(pageURL, href)
|
||||
if fullURL != "" {
|
||||
links = append(links, fullURL)
|
||||
}
|
||||
break
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
return links, nil
|
||||
}
|
||||
Reference in New Issue
Block a user