package staff

import (
	"context"
	"fmt"
	"io"
	"log"
	"net/http"
	"net/url"
	"regexp"
	"strings"
	"sync"
	"time"

	"github.com/google/uuid"

	"github.com/breakpilot/edu-search-service/internal/database"
)

// StaffCrawler crawls university websites for staff information.
type StaffCrawler struct {
	client    *http.Client
	userAgent string
	rateLimit time.Duration
	lastFetch map[string]time.Time
	mu        sync.Mutex
	repo      *database.Repository
	patterns  *UniversityPatterns
}

// CrawlResult contains the result of a staff crawl.
type CrawlResult struct {
	UniversityID uuid.UUID
	StaffFound   int
	StaffNew     int
	StaffUpdated int
	Errors       []string
	Duration     time.Duration
}

// NewStaffCrawler creates a new staff crawler.
func NewStaffCrawler(repo *database.Repository) *StaffCrawler {
	return &StaffCrawler{
		client: &http.Client{
			Timeout: 30 * time.Second,
			CheckRedirect: func(req *http.Request, via []*http.Request) error {
				if len(via) >= 5 {
					return fmt.Errorf("too many redirects")
				}
				return nil
			},
		},
		userAgent: "BreakPilot-EduBot/1.0 (+https://breakpilot.de/bot)",
		rateLimit: 2 * time.Second, // 0.5 requests/second per domain
		lastFetch: make(map[string]time.Time),
		repo:      repo,
		patterns:  NewUniversityPatterns(),
	}
}
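
// Hypothetical usage sketch (not part of this file): how a caller might drive
// a crawl. The repository construction and university lookup are assumptions,
// not APIs defined here.
//
//	crawler := NewStaffCrawler(repo) // repo: *database.Repository, built elsewhere
//	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
//	defer cancel()
//	result, err := crawler.CrawlUniversity(ctx, uni) // uni: *database.University
//	if err != nil {
//		log.Printf("crawl failed: %v (partial result kept: %+v)", err, result)
//	}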

// CrawlUniversity crawls a single university for staff.
func (c *StaffCrawler) CrawlUniversity(ctx context.Context, uni *database.University) (*CrawlResult, error) {
	start := time.Now()
	result := &CrawlResult{
		UniversityID: uni.ID,
	}

	log.Printf("Starting staff crawl for %s (%s)", uni.Name, uni.URL)

	// Find staff pages
	staffPages, err := c.findStaffPages(ctx, uni)
	if err != nil {
		result.Errors = append(result.Errors, fmt.Sprintf("Failed to find staff pages: %v", err))
		return result, err
	}

	log.Printf("Found %d potential staff pages for %s", len(staffPages), uni.Name)

	// Crawl each staff page
	for _, pageURL := range staffPages {
		select {
		case <-ctx.Done():
			result.Errors = append(result.Errors, "Crawl cancelled")
			return result, ctx.Err()
		default:
		}

		staffMembers, err := c.extractStaffFromPage(ctx, pageURL, uni)
		if err != nil {
			result.Errors = append(result.Errors, fmt.Sprintf("Error crawling %s: %v", pageURL, err))
			continue
		}

		// Copy the loop variable before taking its address: before Go 1.22,
		// &pageURL would alias one variable shared by all iterations.
		sourceURL := pageURL
		for _, staff := range staffMembers {
			staff.UniversityID = uni.ID
			staff.SourceURL = &sourceURL

			err := c.repo.CreateStaff(ctx, staff)
			if err != nil {
				errMsg := fmt.Sprintf("Error saving staff %s %s: %v", stringValue(staff.FirstName), staff.LastName, err)
				result.Errors = append(result.Errors, errMsg)
				// Log the first few errors for debugging
				if len(result.Errors) <= 5 {
					log.Printf("[StaffCrawler] %s", errMsg)
				}
				continue
			}

			result.StaffFound++
			if staff.CreatedAt.Equal(staff.UpdatedAt) {
				result.StaffNew++
			} else {
				result.StaffUpdated++
			}
		}
	}

	result.Duration = time.Since(start)

	// Note: profile enrichment is now handled by AI (vast.ai).
	// The crawler only extracts profile links; AI extracts detailed data.
	log.Printf("Crawl complete. Profile enrichment will be handled by AI extraction.")

	// Update crawl status
	now := time.Now()
	status := &database.UniversityCrawlStatus{
		UniversityID:     uni.ID,
		LastStaffCrawl:   &now,
		StaffCrawlStatus: "completed",
		StaffCount:       result.StaffFound,
		StaffErrors:      result.Errors,
	}
	if err := c.repo.UpdateCrawlStatus(ctx, status); err != nil {
		log.Printf("Warning: Failed to update crawl status: %v", err)
	}

	log.Printf("Completed staff crawl for %s: found=%d, new=%d, updated=%d, errors=%d, duration=%v",
		uni.Name, result.StaffFound, result.StaffNew, result.StaffUpdated, len(result.Errors), result.Duration)

	return result, nil
}

// fetchPage fetches a page with rate limiting.
func (c *StaffCrawler) fetchPage(ctx context.Context, urlStr string) ([]byte, error) {
	c.waitForRateLimit(urlStr)

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, urlStr, nil)
	if err != nil {
		return nil, err
	}

	req.Header.Set("User-Agent", c.userAgent)
	req.Header.Set("Accept", "text/html,application/xhtml+xml")
	req.Header.Set("Accept-Language", "de-DE,de;q=0.9,en;q=0.8")

	resp, err := c.client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("HTTP %d", resp.StatusCode)
	}

	// Limit response bodies to 10 MB to guard against runaway pages
	body, err := io.ReadAll(io.LimitReader(resp.Body, 10*1024*1024))
	if err != nil {
		return nil, err
	}

	return body, nil
}

// waitForRateLimit enforces rate limiting per domain.
func (c *StaffCrawler) waitForRateLimit(urlStr string) {
	u, err := url.Parse(urlStr)
	if err != nil {
		return
	}

	host := u.Host

	// Compute the wait under the lock, but sleep outside it: sleeping while
	// holding c.mu would stall fetches for every domain, not just this one.
	c.mu.Lock()
	var wait time.Duration
	if last, ok := c.lastFetch[host]; ok {
		if elapsed := time.Since(last); elapsed < c.rateLimit {
			wait = c.rateLimit - elapsed
		}
	}
	// Reserve the slot before unlocking so concurrent callers for the same
	// host queue behind this request instead of waking at the same time.
	c.lastFetch[host] = time.Now().Add(wait)
	c.mu.Unlock()

	if wait > 0 {
		time.Sleep(wait)
	}
}

// resolveURL resolves a relative URL against a base URL.
func resolveURL(baseURL, href string) string {
	if href == "" {
		return ""
	}

	// Already absolute
	if strings.HasPrefix(href, "http://") || strings.HasPrefix(href, "https://") {
		return href
	}

	base, err := url.Parse(baseURL)
	if err != nil {
		return ""
	}

	ref, err := url.Parse(href)
	if err != nil {
		return ""
	}

	return base.ResolveReference(ref).String()
}
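
// Illustrative resolutions (hypothetical URLs, for documentation only):
//
//	resolveURL("https://uni.example/fakultaet/staff/", "/kontakt")
//	    → "https://uni.example/kontakt"
//	resolveURL("https://uni.example/fakultaet/staff/", "../personen/mueller.html")
//	    → "https://uni.example/fakultaet/personen/mueller.html"
//	resolveURL("https://uni.example/staff/", "https://other.example/x")
//	    → "https://other.example/x" (absolute hrefs pass through unchanged)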

// parseName parses a full name into first and last name.
func (c *StaffCrawler) parseName(fullName string, person *database.UniversityStaff) {
	// Strip title prefixes. Each title token must end in a dot or be followed
	// by whitespace, so that names like "Drescher" (Dr…) or "Maike" (M.A. …)
	// are never partially matched.
	titlePattern := regexp.MustCompile(`(?i)^(Prof(essor)?\.?\s+|Prof\.\s*|Dr\.\s*|Dr\s+|M\.A\.\s+|M\.Sc\.\s+|Dipl\.-?\w*\.?\s*|PD\s+)+`)

	title := titlePattern.FindString(fullName)
	if title != "" {
		person.Title = &title
		fullName = strings.TrimSpace(titlePattern.ReplaceAllString(fullName, ""))
	}

	// Normalize whitespace
	fullName = strings.TrimSpace(fullName)
	fullName = regexp.MustCompile(`\s+`).ReplaceAllString(fullName, " ")

	parts := strings.Fields(fullName)
	if len(parts) == 0 {
		return
	}

	if len(parts) == 1 {
		person.LastName = parts[0]
	} else {
		// The last word is usually the last name
		person.LastName = parts[len(parts)-1]
		firstName := strings.Join(parts[:len(parts)-1], " ")
		person.FirstName = &firstName
	}
}
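
// Illustrative parses (hypothetical names, for documentation only):
//
//	"Prof. Dr. Anna Müller" → Title "Prof. Dr. ", FirstName "Anna", LastName "Müller"
//	"Hans-Peter Schmidt"    → FirstName "Hans-Peter", LastName "Schmidt"
//	"Drescher"              → LastName "Drescher" (no title stripped)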

// looksLikeName checks whether a string looks like a person's name
// (rather than a role or navigation label).
func (c *StaffCrawler) looksLikeName(text string) bool {
	text = strings.TrimSpace(text)
	if len(text) < 3 || len(text) > 100 {
		return false
	}

	// Skip common non-name texts. Note this is a substring match, so a real
	// name containing one of these words (e.g. "Mehrtens" contains "mehr")
	// is also rejected.
	skipWords := []string{
		"internetkoordinator", "sekretariat", "verwaltung", "büro",
		"übersicht", "kontakt", "impressum", "datenschutz",
		"startseite", "home", "zurück", "mehr",
	}
	lower := strings.ToLower(text)
	for _, skip := range skipWords {
		if strings.Contains(lower, skip) {
			return false
		}
	}

	// Common title prefixes strongly indicate a name
	titlePrefixes := []string{"prof", "dr", "m.a.", "m.sc.", "dipl", "b.a.", "b.sc."}
	for _, prefix := range titlePrefixes {
		if strings.HasPrefix(lower, prefix) {
			return true
		}
	}

	// Otherwise require at least two words (first name, last name)
	parts := strings.Fields(text)
	return len(parts) >= 2
}
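
// Illustrative checks (hypothetical inputs, for documentation only):
//
//	looksLikeName("Prof. Dr. Anna Müller") → true  (title prefix)
//	looksLikeName("Anna Müller")           → true  (two words)
//	looksLikeName("Sekretariat")           → false (skip word)
//	looksLikeName("Kontakt")               → false (skip word)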

// looksLikePosition checks whether a text looks like a position or title.
func (c *StaffCrawler) looksLikePosition(text string) bool {
	text = strings.ToLower(text)
	positionKeywords := []string{
		"professor", "prof.", "dozent", "lektor",
		"wissenschaftlich", "mitarbeiter", "assistent",
		"sekretär", "manager", "leiter", "direktor",
		"researcher", "postdoc", "doktorand", "phd",
		"student", "hilfskraft", "tutor",
	}

	for _, keyword := range positionKeywords {
		if strings.Contains(text, keyword) {
			return true
		}
	}
	return false
}

// classifyPosition classifies a position into a coarse type.
func (c *StaffCrawler) classifyPosition(position string) *string {
	pos := strings.ToLower(position)

	var posType string
	switch {
	case strings.Contains(pos, "professor") || strings.Contains(pos, "prof."):
		posType = "professor"
	case strings.Contains(pos, "postdoc") || strings.Contains(pos, "post-doc"):
		posType = "postdoc"
	case strings.Contains(pos, "doktorand") || strings.Contains(pos, "phd") || strings.Contains(pos, "promovend"):
		posType = "phd_student"
	case strings.Contains(pos, "wissenschaftlich") || strings.Contains(pos, "researcher"):
		posType = "researcher"
	case strings.Contains(pos, "sekretär") || strings.Contains(pos, "verwaltung") || strings.Contains(pos, "admin"):
		posType = "admin"
	case strings.Contains(pos, "student") || strings.Contains(pos, "hilfskraft"):
		posType = "student"
	default:
		posType = "staff"
	}

	return &posType
}
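
// Example mappings (hypothetical inputs; the first matching case wins):
//
//	"Professor für Informatik"       → "professor"
//	"Wissenschaftlicher Mitarbeiter" → "researcher"
//	"Doktorandin"                    → "phd_student"
//	"Studentische Hilfskraft"        → "student"
//	"Gastdozent"                     → "staff" (no case keyword matches)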

// isProfessor checks whether a position indicates a professor.
func (c *StaffCrawler) isProfessor(position string) bool {
	pos := strings.ToLower(position)
	return strings.Contains(pos, "professor") || strings.Contains(pos, "prof.")
}