package staff

import (
	"context"
	"fmt"
	"io"
	"log"
	"net/http"
	"net/url"
	"regexp"
	"strings"
	"sync"
	"time"

	"github.com/google/uuid"

	"github.com/breakpilot/edu-search-service/internal/database"
)

// StaffCrawler crawls university websites for staff information.
type StaffCrawler struct {
	client    *http.Client         // shared HTTP client (timeout and redirect cap set in NewStaffCrawler)
	userAgent string               // value sent in the User-Agent header of every request
	rateLimit time.Duration        // minimum spacing between two requests to the same host
	lastFetch map[string]time.Time // per-host time of the most recent (or next reserved) request
	mu        sync.Mutex           // guards lastFetch
	repo      *database.Repository
	patterns  *UniversityPatterns
}

// CrawlResult contains the result of a staff crawl.
type CrawlResult struct {
	UniversityID uuid.UUID
	StaffFound   int      // staff records that were saved successfully
	StaffNew     int      // saved records whose CreatedAt equals UpdatedAt after the save
	StaffUpdated int      // saved records whose CreatedAt differs from UpdatedAt
	Errors       []string // human-readable messages for every page or record that failed
	Duration     time.Duration
}

// NewStaffCrawler creates a new staff crawler backed by repo.
//
// The HTTP client uses a 30 second timeout and aborts after 5 redirects.
func NewStaffCrawler(repo *database.Repository) *StaffCrawler {
	return &StaffCrawler{
		client: &http.Client{
			Timeout: 30 * time.Second,
			CheckRedirect: func(req *http.Request, via []*http.Request) error {
				if len(via) >= 5 {
					return fmt.Errorf("too many redirects")
				}
				return nil
			},
		},
		userAgent: "BreakPilot-EduBot/1.0 (+https://breakpilot.de/bot)",
		rateLimit: 2 * time.Second, // 0.5 requests/second per domain
		lastFetch: make(map[string]time.Time),
		repo:      repo,
		patterns:  NewUniversityPatterns(),
	}
}

// CrawlUniversity crawls a single university for staff.
//
// It discovers candidate staff pages, extracts staff members from each page and
// saves them through the repository. Failures on individual pages or records
// are collected in CrawlResult.Errors and do not abort the crawl. A non-nil
// error is returned only when page discovery fails or ctx is cancelled; the
// partial result is returned alongside it.
func (c *StaffCrawler) CrawlUniversity(ctx context.Context, uni *database.University) (*CrawlResult, error) {
	start := time.Now()
	result := &CrawlResult{
		UniversityID: uni.ID,
	}

	log.Printf("Starting staff crawl for %s (%s)", uni.Name, uni.URL)

	// Find staff pages
	staffPages, err := c.findStaffPages(ctx, uni)
	if err != nil {
		result.Errors = append(result.Errors, fmt.Sprintf("Failed to find staff pages: %v", err))
		result.Duration = time.Since(start)
		return result, err
	}

	log.Printf("Found %d potential staff pages for %s", len(staffPages), uni.Name)

	// Crawl each staff page
	for _, pageURL := range staffPages {
		select {
		case <-ctx.Done():
			result.Errors = append(result.Errors, "Crawl cancelled")
			result.Duration = time.Since(start)
			return result, ctx.Err()
		default:
		}

		// Before Go 1.22 the range variable is shared by all iterations. Take a
		// private copy so the address stored in SourceURL keeps pointing at this
		// page's URL.
		pageURL := pageURL

		staffMembers, err := c.extractStaffFromPage(ctx, pageURL, uni)
		if err != nil {
			result.Errors = append(result.Errors, fmt.Sprintf("Error crawling %s: %v", pageURL, err))
			continue
		}

		for _, staff := range staffMembers {
			staff.UniversityID = uni.ID
			staff.SourceURL = &pageURL

			if err := c.repo.CreateStaff(ctx, staff); err != nil {
				errMsg := fmt.Sprintf("Error saving staff %s %s: %v",
					stringValue(staff.FirstName), staff.LastName, err)
				result.Errors = append(result.Errors, errMsg)
				// Log first few errors for debugging
				if len(result.Errors) <= 5 {
					log.Printf("[StaffCrawler] %s", errMsg)
				}
				continue
			}

			result.StaffFound++
			if staff.CreatedAt.Equal(staff.UpdatedAt) {
				result.StaffNew++
			} else {
				result.StaffUpdated++
			}
		}
	}

	result.Duration = time.Since(start)

	// Note: Profile enrichment is now handled by AI (vast.ai)
	// The crawler only extracts profile links, AI extracts detailed data
	log.Printf("Crawl complete. Profile enrichment will be handled by AI extraction.")

	// Update crawl status
	now := time.Now()
	status := &database.UniversityCrawlStatus{
		UniversityID:     uni.ID,
		LastStaffCrawl:   &now,
		StaffCrawlStatus: "completed",
		StaffCount:       result.StaffFound,
		StaffErrors:      result.Errors,
	}
	if err := c.repo.UpdateCrawlStatus(ctx, status); err != nil {
		// Best effort: a failed status update does not fail the crawl.
		log.Printf("Warning: Failed to update crawl status: %v", err)
	}

	log.Printf("Completed staff crawl for %s: found=%d, new=%d, updated=%d, errors=%d, duration=%v",
		uni.Name, result.StaffFound, result.StaffNew, result.StaffUpdated, len(result.Errors), result.Duration)

	return result, nil
}

// fetchPage fetches a page with per-host rate limiting.
//
// It returns the response body, truncated to 10MB, for a 200 response. Any
// other status yields an "HTTP <code>" error. If ctx is cancelled while waiting
// for the rate limit, ctx.Err() is returned without issuing the request.
func (c *StaffCrawler) fetchPage(ctx context.Context, urlStr string) ([]byte, error) {
	const maxBodyBytes = 10 * 1024 * 1024 // Limit to 10MB

	if err := c.waitForRateLimitCtx(ctx, urlStr); err != nil {
		return nil, err
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, urlStr, nil)
	if err != nil {
		return nil, err
	}

	req.Header.Set("User-Agent", c.userAgent)
	req.Header.Set("Accept", "text/html,application/xhtml+xml")
	req.Header.Set("Accept-Language", "de-DE,de;q=0.9,en;q=0.8")

	resp, err := c.client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// Drain a bounded amount of the error body so the transport can reuse
		// the connection; the content itself is not needed, so the copy error
		// is deliberately ignored.
		_, _ = io.Copy(io.Discard, io.LimitReader(resp.Body, 64*1024))
		return nil, fmt.Errorf("HTTP %d", resp.StatusCode)
	}

	body, err := io.ReadAll(io.LimitReader(resp.Body, maxBodyBytes))
	if err != nil {
		return nil, err
	}

	return body, nil
}

// waitForRateLimit enforces rate limiting per domain.
//
// It blocks until the next request to urlStr's host is allowed. URLs that fail
// to parse are not rate limited.
func (c *StaffCrawler) waitForRateLimit(urlStr string) {
	// With a background context the wait cannot be cancelled, so the returned
	// error is always nil.
	_ = c.waitForRateLimitCtx(context.Background(), urlStr)
}

// waitForRateLimitCtx is the cancellable form of waitForRateLimit.
//
// The request slot is reserved while holding the mutex, but the wait itself
// happens after the mutex is released, so waiting on one host does not stall
// fetches to other hosts. It returns ctx.Err() if ctx is done before the slot
// is reached; the reserved slot is not given back in that case.
func (c *StaffCrawler) waitForRateLimitCtx(ctx context.Context, urlStr string) error {
	u, err := url.Parse(urlStr)
	if err != nil {
		return nil
	}
	host := u.Host

	c.mu.Lock()
	now := time.Now()
	slot := now
	if last, ok := c.lastFetch[host]; ok {
		if next := last.Add(c.rateLimit); next.After(now) {
			slot = next
		}
	}
	c.lastFetch[host] = slot
	c.mu.Unlock()

	wait := slot.Sub(now)
	if wait <= 0 {
		return nil
	}

	timer := time.NewTimer(wait)
	defer timer.Stop()

	select {
	case <-timer.C:
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}

// resolveURL resolves a relative URL against a base URL.
//
// An href that already starts with "http://" or "https://" is returned
// unchanged. An empty href, or a base or href that cannot be parsed, yields "".
func resolveURL(baseURL, href string) string {
	if href == "" {
		return ""
	}

	// Already absolute
	if strings.HasPrefix(href, "http://") || strings.HasPrefix(href, "https://") {
		return href
	}

	base, err := url.Parse(baseURL)
	if err != nil {
		return ""
	}

	ref, err := url.Parse(href)
	if err != nil {
		return ""
	}

	return base.ResolveReference(ref).String()
}

// parseNameTitleRe matches one or more leading academic titles.
//
// "Prof" and "Dr" must be followed by a dot, whitespace or the end of the
// input so that names merely starting with those letters (e.g. "Dragan") are
// not mistaken for titles. "M.A." and "M.Sc." require trailing whitespace for
// the same reason.
var parseNameTitleRe = regexp.MustCompile(
	`(?i)^(?:(?:Prof|Dr)(?:\.\s*|\s+|$)|M\.A\.\s+|M\.Sc\.\s+|Dipl\.?-?\w*\.?\s*|PD\s+)+`)

// parseName parses a full name into title, first name and last name and stores
// them on person.
//
// Leading titles are stored in person.Title without surrounding whitespace.
// The last remaining word becomes person.LastName and any words before it are
// joined with single spaces into person.FirstName. Fields for which no value
// is found are left untouched.
func (c *StaffCrawler) parseName(fullName string, person *database.UniversityStaff) {
	// Trim first so leading whitespace cannot defeat the ^ anchor of the title
	// pattern.
	fullName = strings.TrimSpace(fullName)

	if title := parseNameTitleRe.FindString(fullName); title != "" {
		// The pattern is anchored at the start, so the match is a prefix.
		fullName = fullName[len(title):]
		title = strings.TrimSpace(title)
		person.Title = &title
	}

	// strings.Fields splits on any run of whitespace, so no separate
	// whitespace clean-up is needed.
	parts := strings.Fields(fullName)
	switch len(parts) {
	case 0:
		return
	case 1:
		person.LastName = parts[0]
	default:
		// Last word is usually the last name
		person.LastName = parts[len(parts)-1]
		firstName := strings.Join(parts[:len(parts)-1], " ")
		person.FirstName = &firstName
	}
}

// looksLikeName checks if a string looks like a person's name (not a
// role/function).
//
// Text shorter than 3 or longer than 100 bytes, or containing one of the
// navigation/role skip words, is rejected. Otherwise the text is accepted if
// it starts with a known title prefix or consists of at least two words.
func (c *StaffCrawler) looksLikeName(text string) bool {
	text = strings.TrimSpace(text)
	if len(text) < 3 || len(text) > 100 {
		return false
	}

	// Skip common non-name texts
	skipWords := []string{
		"internetkoordinator", "sekretariat", "verwaltung", "büro",
		"übersicht", "kontakt", "impressum", "datenschutz",
		"startseite", "home", "zurück", "mehr",
	}
	lower := strings.ToLower(text)
	for _, skip := range skipWords {
		if strings.Contains(lower, skip) {
			return false
		}
	}

	// Check for common title prefixes that indicate a name
	titlePrefixes := []string{"prof", "dr", "m.a.", "m.sc.", "dipl", "b.a.", "b.sc."}
	for _, prefix := range titlePrefixes {
		if strings.HasPrefix(lower, prefix) {
			return true
		}
	}

	// Check if it has at least two words (first name, last name)
	return len(strings.Fields(text)) >= 2
}

// looksLikePosition checks if a text looks like a position/title, i.e. whether
// it contains one of the known position keywords (case-insensitive).
func (c *StaffCrawler) looksLikePosition(text string) bool {
	text = strings.ToLower(text)
	positionKeywords := []string{
		"professor", "prof.", "dozent", "lektor",
		"wissenschaftlich", "mitarbeiter", "assistent",
		"sekretär", "manager", "leiter", "direktor",
		"researcher", "postdoc", "doktorand", "phd",
		"student", "hilfskraft", "tutor",
	}

	for _, keyword := range positionKeywords {
		if strings.Contains(text, keyword) {
			return true
		}
	}
	return false
}

// classifyPosition classifies a position into a type.
//
// The result is one of "professor", "postdoc", "phd_student", "researcher",
// "admin", "student" or "staff" (the fallback). Cases are evaluated in that
// order, so the first matching keyword group wins.
func (c *StaffCrawler) classifyPosition(position string) *string {
	pos := strings.ToLower(position)

	var posType string
	switch {
	case c.isProfessor(position):
		posType = "professor"
	case strings.Contains(pos, "postdoc") || strings.Contains(pos, "post-doc"):
		posType = "postdoc"
	case strings.Contains(pos, "doktorand") || strings.Contains(pos, "phd") || strings.Contains(pos, "promovend"):
		posType = "phd_student"
	case strings.Contains(pos, "wissenschaftlich") || strings.Contains(pos, "researcher"):
		posType = "researcher"
	case strings.Contains(pos, "sekretär") || strings.Contains(pos, "verwaltung") || strings.Contains(pos, "admin"):
		posType = "admin"
	case strings.Contains(pos, "student") || strings.Contains(pos, "hilfskraft"):
		posType = "student"
	default:
		posType = "staff"
	}

	return &posType
}

// isProfessor checks if a position indicates a professor, i.e. whether it
// contains "professor" or "prof." (case-insensitive).
func (c *StaffCrawler) isProfessor(position string) bool {
	pos := strings.ToLower(position)
	return strings.Contains(pos, "professor") || strings.Contains(pos, "prof.")
}