Fix: Remove broken getKlausurApiUrl and clean up empty lines

A sed replacement left orphaned hostname references in the story page and empty lines in the getApiBase functions. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-24 16:02:04 +02:00
parent b07f802c24
commit 9ba420fa91
150 changed files with 30231 additions and 32053 deletions
@@ -0,0 +1,247 @@
+package staff
+
+import (
+	"bytes"
+	"context"
+	"log"
+	"net/http"
+	"strings"
+
+	"github.com/PuerkitoBio/goquery"
+
+	"github.com/breakpilot/edu-search-service/internal/database"
+)
+
+// findStaffPages assembles the list of URLs that may hold staff listings for uni.
+//
+// When uni.StaffPagePattern is set and non-empty it is returned as the sole
+// entry and nothing is probed. Otherwise the result is the de-duplicated union,
+// in this order, of: well-known paths below uni.URL that checkPageExists
+// confirms, links collected from the main page by findStaffLinksOnPage, and,
+// for Uni Oldenburg hosts, the pages reported by findUOLDepartmentPages.
+// Failures of the individual probes are tolerated, so the returned error is
+// always nil.
+func (c *StaffCrawler) findStaffPages(ctx context.Context, uni *database.University) ([]string, error) {
+	// An explicitly configured page wins over any discovery.
+	if pattern := uni.StaffPagePattern; pattern != nil && *pattern != "" {
+		return []string{*pattern}, nil
+	}
+
+	root := strings.TrimSuffix(uni.URL, "/")
+	var candidates []string
+
+	// Probe the paths universities commonly use for staff listings.
+	for _, suffix := range []string{
+		"/personen",
+		"/team",
+		"/mitarbeiter",
+		"/mitarbeitende",
+		"/staff",
+		"/people",
+		"/ueber-uns/team",
+		"/about/team",
+		"/fakultaet/personen",
+		"/institute",
+	} {
+		candidate := root + suffix
+		if ok, err := c.checkPageExists(ctx, candidate); err == nil && ok {
+			candidates = append(candidates, candidate)
+		}
+	}
+
+	// Staff-looking links on the landing page are candidates as well.
+	if links, err := c.findStaffLinksOnPage(ctx, root); err == nil {
+		candidates = append(candidates, links...)
+	}
+
+	// Uni Oldenburg is reachable as uol.de and uni-oldenburg.de; both trigger
+	// the dedicated department crawl.
+	isOldenburg := strings.Contains(root, "uol.de") || strings.Contains(root, "uni-oldenburg.de")
+	if isOldenburg {
+		log.Printf("[UOL] Detected Uni Oldenburg, using UOL-specific crawler for %s", root)
+		deptPages, err := c.findUOLDepartmentPages(ctx, root)
+		if err != nil {
+			log.Printf("[UOL] Error finding department pages: %v", err)
+		} else {
+			log.Printf("[UOL] Found %d department pages", len(deptPages))
+			candidates = append(candidates, deptPages...)
+		}
+	}
+
+	// Keep the first occurrence of every URL, preserving discovery order.
+	var unique []string
+	known := make(map[string]bool)
+	for _, page := range candidates {
+		if known[page] {
+			continue
+		}
+		known[page] = true
+		unique = append(unique, page)
+	}
+
+	return unique, nil
+}
+
+// findUOLDepartmentPages gathers person/team pages of Uni Oldenburg departments.
+//
+// It scans a fixed set of faculty start pages plus baseURL for links, derives a
+// "<link>/personen" candidate from every absolute UOL link that is not a
+// study/research/news/contact page, adds links that already point at a
+// /personen or /team path, merges a list of well-known department pages, and
+// finally keeps only the candidates for which checkPageExists succeeds.
+// Start pages that cannot be fetched or parsed are skipped; the returned error
+// is always nil. Candidates are held in a map, so the order of the result is
+// not stable between runs.
+func (c *StaffCrawler) findUOLDepartmentPages(ctx context.Context, baseURL string) ([]string, error) {
+	// belongsToUOL reports whether link mentions one of the two UOL domains.
+	belongsToUOL := func(link string) bool {
+		link = strings.ToLower(link)
+		return strings.Contains(link, "uol.de") || strings.Contains(link, "uni-oldenburg.de")
+	}
+
+	// Path fragments that mark a link as something other than a department.
+	nonDeptFragments := []string{"/studium", "/forschung", "/aktuelles", "/kontakt"}
+
+	// Known entry points; the caller's base URL is scanned last.
+	entryPoints := []string{
+		"https://uol.de/informatik/department/abteilungen-und-einrichtungen",
+		"https://uol.de/fk2",
+		"https://uol.de/fk1",
+		"https://uol.de/fk3",
+		"https://uol.de/fk4",
+		"https://uol.de/fk5",
+		"https://uol.de/fk6",
+		baseURL,
+	}
+
+	candidates := map[string]bool{}
+
+	for _, entry := range entryPoints {
+		log.Printf("[UOL] Scanning start page: %s", entry)
+		raw, err := c.fetchPage(ctx, entry)
+		if err != nil {
+			log.Printf("[UOL] Error fetching %s: %v", entry, err)
+			continue
+		}
+
+		doc, err := goquery.NewDocumentFromReader(bytes.NewReader(raw))
+		if err != nil {
+			continue
+		}
+
+		// Department links: assume each department has a /personen subpage.
+		doc.Find("a[href]").Each(func(_ int, anchor *goquery.Selection) {
+			href, ok := anchor.Attr("href")
+			if !ok || !belongsToUOL(href) {
+				return
+			}
+			lowered := strings.ToLower(href)
+			for _, fragment := range nonDeptFragments {
+				if strings.Contains(lowered, fragment) {
+					return
+				}
+			}
+
+			resolved := resolveURL(entry, href)
+			if resolved == "" || !belongsToUOL(resolved) {
+				return
+			}
+			candidates[strings.TrimSuffix(resolved, "/")+"/personen"] = true
+		})
+
+		// Links that already point at a person or team page are taken as-is.
+		doc.Find("a[href*='/personen'], a[href*='/team']").Each(func(_ int, anchor *goquery.Selection) {
+			href, ok := anchor.Attr("href")
+			if !ok {
+				return
+			}
+			if resolved := resolveURL(entry, href); resolved != "" && belongsToUOL(resolved) {
+				candidates[resolved] = true
+			}
+		})
+	}
+
+	// Well-known department pages are always checked, even if no scan found them.
+	for _, known := range []string{
+		"https://uol.de/socps/personen",
+		"https://uol.de/vlba/team",
+		"https://uol.de/informatik/department",
+		"https://uol.de/se/team",
+		"https://uol.de/ei/personen",
+		"https://uol.de/is/team",
+		"https://uol.de/paedagogik/personen",
+		"https://uol.de/psychologie/personen",
+		"https://uol.de/germanistik/personen",
+		"https://uol.de/physik/personen",
+		"https://uol.de/chemie/personen",
+		"https://uol.de/biologie/personen",
+		"https://uol.de/mathe/personen",
+	} {
+		candidates[known] = true
+	}
+
+	log.Printf("[UOL] Checking %d potential department pages", len(candidates))
+
+	// Only candidates that answer the existence probe are reported.
+	var pages []string
+	for candidate := range candidates {
+		if ok, err := c.checkPageExists(ctx, candidate); err == nil && ok {
+			log.Printf("[UOL] Found valid page: %s", candidate)
+			pages = append(pages, candidate)
+		}
+	}
+
+	log.Printf("[UOL] Found %d valid department/personen pages", len(pages))
+	return pages, nil
+}
+
+// checkPageExists reports whether a HEAD request to urlStr is answered with
+// HTTP 200.
+//
+// Any other status yields (false, nil). An error is returned when ctx is
+// already done, when the request cannot be built, or when the request itself
+// fails.
+func (c *StaffCrawler) checkPageExists(ctx context.Context, urlStr string) (bool, error) {
+	// waitForRateLimit takes no context, so bail out before it when the caller
+	// has already cancelled; otherwise every remaining probe of a cancelled
+	// crawl would still go through the rate-limit wait.
+	if err := ctx.Err(); err != nil {
+		return false, err
+	}
+
+	c.waitForRateLimit(urlStr)
+
+	req, err := http.NewRequestWithContext(ctx, http.MethodHead, urlStr, nil)
+	if err != nil {
+		return false, err
+	}
+	req.Header.Set("User-Agent", c.userAgent)
+
+	resp, err := c.client.Do(req)
+	if err != nil {
+		return false, err
+	}
+	defer resp.Body.Close()
+
+	return resp.StatusCode == http.StatusOK, nil
+}
+
+// findStaffLinksOnPage fetches pageURL and returns every link whose visible
+// text or href contains one of the staff keywords (case-insensitive).
+//
+// Links are resolved against pageURL; links that resolveURL turns into an empty
+// string are dropped. Duplicates are not removed here. Fetch and parse errors
+// are returned to the caller.
+func (c *StaffCrawler) findStaffLinksOnPage(ctx context.Context, pageURL string) ([]string, error) {
+	raw, err := c.fetchPage(ctx, pageURL)
+	if err != nil {
+		return nil, err
+	}
+
+	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(raw))
+	if err != nil {
+		return nil, err
+	}
+
+	keywords := []string{"team", "personen", "mitarbeiter", "staff", "people", "dozent", "professor"}
+
+	// mentionsStaff reports whether the lower-cased label or target of a link
+	// contains any keyword.
+	mentionsStaff := func(label, target string) bool {
+		for _, kw := range keywords {
+			if strings.Contains(label, kw) || strings.Contains(target, kw) {
+				return true
+			}
+		}
+		return false
+	}
+
+	var links []string
+	doc.Find("a[href]").Each(func(_ int, anchor *goquery.Selection) {
+		href, ok := anchor.Attr("href")
+		if !ok {
+			return
+		}
+		if !mentionsStaff(strings.ToLower(anchor.Text()), strings.ToLower(href)) {
+			return
+		}
+		if resolved := resolveURL(pageURL, href); resolved != "" {
+			links = append(links, resolved)
+		}
+	})
+
+	return links, nil
+}
@@ -0,0 +1,364 @@
+package staff
+
+import (
+	"bytes"
+	"context"
+	"fmt"
+	"log"
+	"regexp"
+	"strings"
+
+	"github.com/PuerkitoBio/goquery"
+
+	"github.com/breakpilot/edu-search-service/internal/database"
+)
+
+// EnrichStaffProfiles visits the profile page of each staff member of uni and
+// fills in contact and research details that are still missing on the record.
+//
+// Staff without a profile URL, or with an e-mail address already on file, are
+// skipped. For everyone else the profile page is scraped with
+// extractProfileDetails and every field that is currently blank (nil or empty)
+// is populated from the scraped value; existing values are never overwritten.
+// Changed records are persisted through c.repo.CreateStaff.
+//
+// It returns the number of records that were updated and stored. Scrape and
+// store failures for a single person are logged and skipped. An error is
+// returned only when the initial staff search fails or ctx is cancelled, in
+// which case the count reached so far is returned alongside it.
+func (c *StaffCrawler) EnrichStaffProfiles(ctx context.Context, uni *database.University) (int, error) {
+	staffList, err := c.repo.SearchStaff(ctx, database.StaffSearchParams{
+		UniversityID: &uni.ID,
+		Limit:        10000,
+	})
+	if err != nil {
+		return 0, fmt.Errorf("failed to search staff: %w", err)
+	}
+
+	log.Printf("[Profile Enrichment] Starting enrichment for %d staff members at %s", staffList.Total, uni.Name)
+
+	// blank treats a nil pointer and a pointer to "" alike. The skip checks
+	// below already did so; the per-field checks used to test for nil only,
+	// which left empty-string fields permanently unfilled.
+	blank := func(p *string) bool { return p == nil || *p == "" }
+
+	enriched := 0
+	for _, member := range staffList.Staff {
+		if err := ctx.Err(); err != nil {
+			return enriched, err
+		}
+
+		// Nothing to scrape without a profile page; an existing e-mail marks
+		// the record as already enriched.
+		if blank(member.ProfileURL) || !blank(member.Email) {
+			continue
+		}
+
+		details, err := c.extractProfileDetails(ctx, *member.ProfileURL)
+		if err != nil {
+			log.Printf("[Profile Enrichment] Error fetching %s: %v", *member.ProfileURL, err)
+			continue
+		}
+
+		// Pair every optional text field of the record with its scraped value.
+		textFields := []struct {
+			dst **string
+			src *string
+		}{
+			{&member.Email, &details.Email},
+			{&member.Phone, &details.Phone},
+			{&member.Office, &details.Office},
+			{&member.ORCID, &details.ORCID},
+			{&member.GoogleScholarID, &details.GoogleScholarID},
+			{&member.ResearchgateURL, &details.ResearchgateURL},
+			{&member.LinkedInURL, &details.LinkedInURL},
+			{&member.PersonalWebsite, &details.PersonalWebsite},
+			{&member.PhotoURL, &details.PhotoURL},
+		}
+
+		updated := false
+		for _, f := range textFields {
+			if *f.src != "" && blank(*f.dst) {
+				*f.dst = f.src
+				updated = true
+			}
+		}
+		if len(details.ResearchInterests) > 0 && len(member.ResearchInterests) == 0 {
+			member.ResearchInterests = details.ResearchInterests
+			updated = true
+		}
+
+		if !updated {
+			continue
+		}
+
+		if err := c.repo.CreateStaff(ctx, &member); err != nil {
+			log.Printf("[Profile Enrichment] Error updating %s: %v", member.LastName, err)
+			continue
+		}
+		enriched++
+		log.Printf("[Profile Enrichment] Enriched: %s %s (email=%v)", stringValue(member.FirstName), member.LastName, details.Email != "")
+	}
+
+	log.Printf("[Profile Enrichment] Completed: enriched %d of %d staff members", enriched, staffList.Total)
+	return enriched, nil
+}
+
+// ProfileDetails holds what extractProfileDetails could scrape from a single
+// staff profile page. A field left at its zero value means nothing matching
+// was found on the page.
+type ProfileDetails struct {
+	Email             string // address from a mailto: link or matched in a labelled text value
+	Phone             string // number from a tel: link or matched in a labelled text value
+	Office            string // text of the entry labelled raum/büro/office
+	ORCID             string // identifier matched inside an orcid.org link
+	GoogleScholarID   string // value of the user= parameter of a scholar.google link
+	ResearchgateURL   string // href of the first researchgate.net link
+	LinkedInURL       string // href of the first linkedin.com link
+	PersonalWebsite   string // first external http link labelled as homepage/website/personal/www
+	ResearchInterests []string // list-item texts found near research-related content
+	PhotoURL          string // resolved src of the first image that looks like a portrait
+}
+
+// extractProfileDetails downloads profileURL and scrapes contact and research
+// information from it.
+//
+// Extraction runs in passes, each filling a field only once:
+//  1. dt/dd definition lists (the layout used on UOL pages) for e-mail, phone
+//     and office;
+//  2. page-wide mailto: and tel: links as fallback, ignoring generic mailboxes
+//     such as info@ or sekretariat@;
+//  3. links to ORCID, Google Scholar, ResearchGate and LinkedIn;
+//  4. an external link labelled as homepage/website, a portrait-looking image,
+//     and list items near research-related text.
+//
+// The returned ProfileDetails is never nil on success; fields without a match
+// stay at their zero value. Fetch and parse errors are returned unchanged.
+func (c *StaffCrawler) extractProfileDetails(ctx context.Context, profileURL string) (*ProfileDetails, error) {
+	body, err := c.fetchPage(ctx, profileURL)
+	if err != nil {
+		return nil, err
+	}
+
+	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
+	if err != nil {
+		return nil, err
+	}
+
+	// Compile the patterns once per call instead of once per visited element.
+	// The TLD class accepts both cases so upper-case addresses are not dropped.
+	emailPattern := regexp.MustCompile(`[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,10}`)
+	phonePattern := regexp.MustCompile(`\+?[\d\s\-/()]{8,20}`)
+	orcidPattern := regexp.MustCompile(`\d{4}-\d{4}-\d{4}-\d{3}[\dX]`)
+
+	// mailtoAddress strips the scheme and any ?subject=... query from a mailto href.
+	mailtoAddress := func(href string) string {
+		return strings.Split(strings.TrimPrefix(href, "mailto:"), "?")[0]
+	}
+
+	details := &ProfileDetails{}
+
+	// Pass 1: definition lists. Each dt is a label, the dd right after it the value.
+	doc.Find("dt").Each(func(i int, dt *goquery.Selection) {
+		label := strings.TrimSpace(strings.ToLower(dt.Text()))
+		dd := dt.Next()
+		if dd.Length() == 0 || goquery.NodeName(dd) != "dd" {
+			return
+		}
+		value := strings.TrimSpace(dd.Text())
+
+		switch {
+		case strings.Contains(label, "email") || strings.Contains(label, "e-mail"):
+			if details.Email != "" {
+				return
+			}
+			// Prefer the address of a mailto link inside the value.
+			dd.Find("a[href^='mailto:']").Each(func(j int, a *goquery.Selection) {
+				if details.Email != "" {
+					return
+				}
+				href, _ := a.Attr("href")
+				if email := mailtoAddress(href); strings.Contains(email, "@") {
+					details.Email = strings.TrimSpace(email)
+				}
+			})
+			// Otherwise look for an address in the plain text.
+			if details.Email == "" && strings.Contains(value, "@") {
+				if match := emailPattern.FindString(value); match != "" {
+					details.Email = match
+				}
+			}
+		// NOTE(review): "tel" also matches labels such as "titel"; such values
+		// only become a phone number if they contain a long enough digit run.
+		case strings.Contains(label, "telefon") || strings.Contains(label, "phone") || strings.Contains(label, "tel"):
+			if details.Phone != "" {
+				return
+			}
+			// Prefer the number of a tel: link inside the value.
+			dd.Find("a[href^='tel:']").Each(func(j int, a *goquery.Selection) {
+				if details.Phone != "" {
+					return
+				}
+				href, _ := a.Attr("href")
+				if phone := strings.TrimPrefix(href, "tel:"); len(phone) >= 8 {
+					details.Phone = phone
+				}
+			})
+			// Otherwise take the first phone-looking run from the text.
+			if details.Phone == "" {
+				if match := phonePattern.FindString(value); match != "" {
+					details.Phone = strings.TrimSpace(match)
+				}
+			}
+		case strings.Contains(label, "raum") || strings.Contains(label, "büro") || strings.Contains(label, "office"):
+			if details.Office == "" {
+				details.Office = value
+			}
+		}
+	})
+
+	// Pass 2a: any mailto link on the page, skipping shared mailboxes.
+	if details.Email == "" {
+		genericPrefixes := []string{"info@", "sekretariat@", "kontakt@", "office@", "fachschaft@"}
+		doc.Find("a[href^='mailto:']").Each(func(i int, s *goquery.Selection) {
+			if details.Email != "" {
+				return
+			}
+			href, _ := s.Attr("href")
+			email := mailtoAddress(href)
+			if !strings.Contains(email, "@") {
+				return
+			}
+			emailLower := strings.ToLower(email)
+			for _, prefix := range genericPrefixes {
+				if strings.HasPrefix(emailLower, prefix) {
+					return
+				}
+			}
+			details.Email = strings.TrimSpace(email)
+		})
+	}
+
+	// Pass 2b: any tel: link on the page.
+	if details.Phone == "" {
+		doc.Find("a[href^='tel:']").Each(func(i int, s *goquery.Selection) {
+			if details.Phone != "" {
+				return
+			}
+			href, _ := s.Attr("href")
+			if phone := strings.TrimPrefix(href, "tel:"); len(phone) >= 8 {
+				details.Phone = phone
+			}
+		})
+	}
+
+	// Pass 3: academic and social profiles.
+	doc.Find("a[href*='orcid.org']").Each(func(i int, s *goquery.Selection) {
+		if details.ORCID != "" {
+			return
+		}
+		href, _ := s.Attr("href")
+		if match := orcidPattern.FindString(href); match != "" {
+			details.ORCID = match
+		}
+	})
+
+	doc.Find("a[href*='scholar.google']").Each(func(i int, s *goquery.Selection) {
+		if details.GoogleScholarID != "" {
+			return
+		}
+		href, _ := s.Attr("href")
+		// The ID is the user parameter, e.g. scholar.google.com/citations?user=XXXXX
+		if parts := strings.Split(href, "user="); len(parts) > 1 {
+			details.GoogleScholarID = strings.Split(parts[1], "&")[0]
+		}
+	})
+
+	doc.Find("a[href*='researchgate.net']").Each(func(i int, s *goquery.Selection) {
+		if details.ResearchgateURL != "" {
+			return
+		}
+		href, _ := s.Attr("href")
+		if strings.Contains(href, "researchgate.net") {
+			details.ResearchgateURL = href
+		}
+	})
+
+	doc.Find("a[href*='linkedin.com']").Each(func(i int, s *goquery.Selection) {
+		if details.LinkedInURL != "" {
+			return
+		}
+		href, _ := s.Attr("href")
+		if strings.Contains(href, "linkedin.com") {
+			details.LinkedInURL = href
+		}
+	})
+
+	// Pass 4a: personal website, i.e. an external link that is neither a
+	// university page nor one of the profile/social sites handled above.
+	notPersonal := []string{
+		"uni-oldenburg.de", "uol.de",
+		"linkedin", "researchgate",
+		"orcid.org", "scholar.google",
+		"twitter", "facebook",
+	}
+	doc.Find("a[href^='http']").Each(func(i int, s *goquery.Selection) {
+		if details.PersonalWebsite != "" {
+			return
+		}
+		href, _ := s.Attr("href")
+		for _, marker := range notPersonal {
+			if strings.Contains(href, marker) {
+				return
+			}
+		}
+
+		text := strings.ToLower(s.Text())
+		if strings.Contains(text, "homepage") || strings.Contains(text, "website") ||
+			strings.Contains(text, "personal") || strings.Contains(text, "www") {
+			details.PersonalWebsite = href
+		}
+	})
+
+	// Pass 4b: portrait photo, recognised by its alt text or CSS classes.
+	doc.Find("img").Each(func(i int, s *goquery.Selection) {
+		if details.PhotoURL != "" {
+			return
+		}
+		src, exists := s.Attr("src")
+		if !exists {
+			return
+		}
+
+		// Icons, logos and placeholder graphics are never portraits.
+		srcLower := strings.ToLower(src)
+		if strings.Contains(srcLower, "icon") || strings.Contains(srcLower, "logo") ||
+			strings.Contains(srcLower, "placeholder") || strings.Contains(srcLower, "default") {
+			return
+		}
+
+		alt, _ := s.Attr("alt")
+		altLower := strings.ToLower(alt)
+		classes, _ := s.Attr("class")
+		classesLower := strings.ToLower(classes)
+
+		if strings.Contains(altLower, "foto") || strings.Contains(altLower, "photo") ||
+			strings.Contains(altLower, "portrait") || strings.Contains(altLower, "bild") ||
+			strings.Contains(classesLower, "photo") || strings.Contains(classesLower, "portrait") ||
+			strings.Contains(classesLower, "profile") {
+			details.PhotoURL = resolveURL(profileURL, src)
+		}
+	})
+
+	// Pass 4c: research interests. The first element whose text mentions
+	// research and whose parent contains suitable list items supplies them.
+	// NOTE(review): this visits every element, so an outer container such as
+	// <body> can match first and contribute list items from the whole page
+	// (navigation included) - confirm whether narrowing to headings is wanted.
+	doc.Find("*").Each(func(i int, s *goquery.Selection) {
+		if len(details.ResearchInterests) > 0 {
+			return
+		}
+		text := strings.ToLower(s.Text())
+		if strings.Contains(text, "forschung") || strings.Contains(text, "research") ||
+			strings.Contains(text, "schwerpunkt") || strings.Contains(text, "interest") {
+			s.Parent().Find("li").Each(func(j int, li *goquery.Selection) {
+				interest := strings.TrimSpace(li.Text())
+				if len(interest) > 3 && len(interest) < 200 {
+					details.ResearchInterests = append(details.ResearchInterests, interest)
+				}
+			})
+		}
+	})
+
+	return details, nil
+}
@@ -0,0 +1,495 @@
+package staff
+
+import (
+	"bytes"
+	"context"
+	"log"
+	"strings"
+
+	"github.com/PuerkitoBio/goquery"
+
+	"github.com/breakpilot/edu-search-service/internal/database"
+)
+
+// extractStaffFromPage downloads pageURL and runs every extraction strategy
+// over it, returning the concatenated results in strategy order.
+//
+// All strategies are always executed; their results are not de-duplicated
+// against each other. Fetch and parse errors are returned to the caller.
+func (c *StaffCrawler) extractStaffFromPage(ctx context.Context, pageURL string, uni *database.University) ([]*database.UniversityStaff, error) {
+	raw, err := c.fetchPage(ctx, pageURL)
+	if err != nil {
+		return nil, err
+	}
+
+	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(raw))
+	if err != nil {
+		return nil, err
+	}
+
+	// Strategies in the order they are applied; the UOL-specific one goes first.
+	strategies := []func(*goquery.Document, string) []*database.UniversityStaff{
+		c.extractFromUOLPatterns,
+		c.extractFromPersonCards,
+		c.extractFromTable,
+		c.extractFromList,
+		c.extractFromVCard,
+	}
+
+	var found []*database.UniversityStaff
+	for _, run := range strategies {
+		found = append(found, run(doc, pageURL)...)
+	}
+
+	return found, nil
+}
+
+// extractFromUOLPatterns extracts staff using markup patterns found on
+// Uni Oldenburg pages.
+//
+// Five sources are scanned in turn, and a name is only recorded the first time
+// it is seen:
+//   - content-area lists grouped under h3/h4 section headers (Leitung,
+//     Mitarbeiter, ...), which also yield the person's team role;
+//   - the left-hand navigation (links containing /personen/);
+//   - inline "mit-icon person" references;
+//   - links to the person search (suche/person);
+//   - breadcrumb sublinks containing /personen/.
+//
+// Entries for which parseName produces no last name are dropped. Links are
+// resolved against baseURL.
+func (c *StaffCrawler) extractFromUOLPatterns(doc *goquery.Document, baseURL string) []*database.UniversityStaff {
+	var staff []*database.UniversityStaff
+	seen := make(map[string]bool)
+
+	// The first h1 serves as department name; it is only used for logging.
+	deptName := strings.TrimSpace(doc.Find("h1").First().Text())
+
+	// register marks name as seen and, if it parses to a last name, appends a
+	// new entry to staff and returns it so the caller can fill in more fields
+	// (the slice holds the same pointer). It returns nil otherwise.
+	register := func(name string) *database.UniversityStaff {
+		seen[name] = true
+		person := &database.UniversityStaff{}
+		person.FullName = &name
+		c.parseName(name, person)
+		if person.LastName == "" {
+			return nil
+		}
+		staff = append(staff, person)
+		return person
+	}
+
+	// setProfile stores href, resolved against baseURL, as the profile URL.
+	setProfile := func(person *database.UniversityStaff, href string) {
+		fullURL := resolveURL(baseURL, href)
+		person.ProfileURL = &fullURL
+	}
+
+	// Headers and person links of the content area are visited together in
+	// document order, so each link falls under the most recent header.
+	currentRole := ""
+	haveLeader := false // only the first person under "Leitung" is marked as professor
+
+	doc.Find("#content h4, #content h3, #content ul li a, .inhalt h4, .inhalt h3, .inhalt ul li a").Each(func(i int, s *goquery.Selection) {
+		switch goquery.NodeName(s) {
+		case "h3", "h4":
+			// Section header: switch the role for the links that follow.
+			// Unrecognised headers leave the current role untouched.
+			headerText := strings.ToLower(strings.TrimSpace(s.Text()))
+			switch {
+			case strings.Contains(headerText, "leitung"):
+				currentRole = "leitung"
+			case strings.Contains(headerText, "sekretariat"):
+				currentRole = "sekretariat"
+			case strings.Contains(headerText, "wissenschaftlich") || strings.Contains(headerText, "mitarbeiter"):
+				currentRole = "mitarbeiter"
+			case strings.Contains(headerText, "doktorand") || strings.Contains(headerText, "promovierend"):
+				currentRole = "doktorand"
+			case strings.Contains(headerText, "technisch"):
+				currentRole = "technisch"
+			case strings.Contains(headerText, "extern"):
+				currentRole = "extern"
+			case strings.Contains(headerText, "student") || strings.Contains(headerText, "hilfskr") || strings.Contains(headerText, "hiwi"):
+				currentRole = "hiwi"
+			}
+
+		case "a":
+			href, exists := s.Attr("href")
+			if !exists {
+				return
+			}
+			// Only links to person pages or the person search count.
+			if !strings.Contains(href, "/personen/") && !strings.Contains(href, "suche/person") {
+				return
+			}
+
+			name := strings.TrimSpace(s.Text())
+			if name == "" || seen[name] || !c.looksLikeName(name) {
+				return
+			}
+
+			person := register(name)
+			if person == nil {
+				return
+			}
+			setProfile(person, href)
+
+			if currentRole != "" {
+				// Store a copy: pointing at currentRole itself would make every
+				// person share one variable and end up with the role of the
+				// last header on the page.
+				role := currentRole
+				person.TeamRole = &role
+			}
+
+			if currentRole == "leitung" && !haveLeader {
+				haveLeader = true
+				person.IsProfessor = true
+				posType := "professor"
+				person.PositionType = &posType
+			}
+		}
+	})
+
+	// Side navigation with person links, e.g. /abteilung/personen/prof-dr-name.
+	doc.Find("nav#left-nav ul li a, #left-navi li a").Each(func(i int, s *goquery.Selection) {
+		href, exists := s.Attr("href")
+		if !exists || !strings.Contains(href, "/personen/") {
+			return
+		}
+
+		name := strings.TrimSpace(s.Text())
+		if name == "" || seen[name] {
+			return
+		}
+
+		if person := register(name); person != nil {
+			setProfile(person, href)
+		}
+	})
+
+	// Inline person references:
+	// <p class="mit-icon person"><a href="/suche/person/USERNAME">Prof. Dr. Name</a></p>
+	doc.Find("p.mit-icon.person a, .mit-icon.person a").Each(func(i int, s *goquery.Selection) {
+		name := strings.TrimSpace(s.Text())
+		if name == "" || seen[name] {
+			return
+		}
+
+		person := register(name)
+		if person == nil {
+			return
+		}
+		if href, exists := s.Attr("href"); exists {
+			setProfile(person, href)
+		}
+	})
+
+	// Links to the person search, /suche/person?username=XXX. The link text is
+	// checked with looksLikeName to skip labels such as "Internetkoordinator".
+	doc.Find("a[href*='suche/person']").Each(func(i int, s *goquery.Selection) {
+		name := strings.TrimSpace(s.Text())
+		if name == "" || seen[name] || !c.looksLikeName(name) {
+			return
+		}
+
+		person := register(name)
+		if person == nil {
+			return
+		}
+		if href, exists := s.Attr("href"); exists {
+			setProfile(person, href)
+		}
+	})
+
+	// Breadcrumb sublinks:
+	// <ul class="sublinks"><li><a href="/dept/personen/name">Prof. Dr. Name</a></li>
+	doc.Find(".sublinks li a, nav#navizeile .sublinks li a").Each(func(i int, s *goquery.Selection) {
+		href, exists := s.Attr("href")
+		if !exists || !strings.Contains(href, "/personen/") {
+			return
+		}
+
+		name := strings.TrimSpace(s.Text())
+		if name == "" || seen[name] {
+			return
+		}
+
+		if person := register(name); person != nil {
+			setProfile(person, href)
+		}
+	})
+
+	if len(staff) > 0 {
+		log.Printf("[UOL Extractor] Found %d staff members using UOL patterns (dept: %s)", len(staff), deptName)
+	}
+
+	return staff
+}
+
+// extractFromPersonCards extracts staff from card-style layouts.
+//
+// The card selectors are tried in order; the first selector that yields at
+// least one person with a last name decides the result and later selectors
+// are not consulted. Nil is returned when none of them produces anyone.
+func (c *StaffCrawler) extractFromPersonCards(doc *goquery.Document, baseURL string) []*database.UniversityStaff {
+	for _, selector := range []string{
+		".person-card",
+		".staff-card",
+		".team-member",
+		".mitarbeiter",
+		".person",
+		".employee",
+		"[itemtype='http://schema.org/Person']",
+		".vcard",
+	} {
+		var cards []*database.UniversityStaff
+		doc.Find(selector).Each(func(_ int, card *goquery.Selection) {
+			if person := c.extractPersonFromElement(card, baseURL); person != nil && person.LastName != "" {
+				cards = append(cards, person)
+			}
+		})
+
+		if len(cards) > 0 {
+			return cards
+		}
+	}
+
+	return nil
+}
+
+// extractFromTable extracts staff from table layouts.
+//
+// Only tables whose header cells mention "name" or "person" are considered.
+// Every data row with at least two cells becomes one candidate: the first cell
+// provides the name (and, if it contains a link, the profile URL), a mailto
+// link anywhere in the row provides the e-mail address, and any cell whose
+// text looksLikePosition provides the position. Rows for which parseName
+// yields no last name are dropped.
+func (c *StaffCrawler) extractFromTable(doc *goquery.Document, baseURL string) []*database.UniversityStaff {
+	var staff []*database.UniversityStaff
+
+	doc.Find("table").Each(func(i int, table *goquery.Selection) {
+		// Check if this looks like a staff table.
+		headerText := strings.ToLower(table.Find("th").Text())
+		if !strings.Contains(headerText, "name") && !strings.Contains(headerText, "person") {
+			return
+		}
+
+		table.Find("tr").Each(func(j int, row *goquery.Selection) {
+			if row.Find("th").Length() > 0 {
+				return // header row
+			}
+
+			cells := row.Find("td")
+			if cells.Length() < 2 {
+				return
+			}
+
+			person := &database.UniversityStaff{}
+
+			// First cell usually contains the name.
+			nameCell := cells.First()
+			name := strings.TrimSpace(nameCell.Text())
+			person.FullName = &name
+			c.parseName(name, person)
+
+			// E-mail: strip a "?subject=..." query the same way
+			// extractPersonFromElement does, and ignore empty addresses.
+			row.Find("a[href^='mailto:']").Each(func(k int, a *goquery.Selection) {
+				href, _ := a.Attr("href")
+				email := strings.TrimPrefix(href, "mailto:")
+				email = strings.Split(email, "?")[0]
+				if email != "" {
+					person.Email = &email
+				}
+			})
+
+			// Profile link: a non-mailto link inside the name cell.
+			nameCell.Find("a[href]").Each(func(k int, a *goquery.Selection) {
+				href, exists := a.Attr("href")
+				if exists && !strings.HasPrefix(href, "mailto:") {
+					fullURL := resolveURL(baseURL, href)
+					person.ProfileURL = &fullURL
+				}
+			})
+
+			// Position: any cell whose text reads like a job title.
+			cells.Each(func(k int, cell *goquery.Selection) {
+				text := strings.TrimSpace(cell.Text())
+				if c.looksLikePosition(text) {
+					person.Position = &text
+					person.PositionType = c.classifyPosition(text)
+					person.IsProfessor = c.isProfessor(text)
+				}
+			})
+
+			if person.LastName != "" {
+				staff = append(staff, person)
+			}
+		})
+	})
+
+	return staff
+}
+
+// extractFromList extracts staff from list layouts.
+//
+// Each selector targets the individual list items, so every <li> is handed to
+// extractPersonFromElement as one person. Selectors are tried in order and the
+// first one that yields at least one person with a last name decides the
+// result.
+func (c *StaffCrawler) extractFromList(doc *goquery.Document, baseURL string) []*database.UniversityStaff {
+	var staff []*database.UniversityStaff
+
+	// The ul.* selectors must select the items, not the list: selecting the
+	// <ul> itself would collapse the whole list into a single person.
+	listSelectors := []string{"ul.staff li", "ul.team li", "ul.mitarbeiter li", ".staff-list li", ".team-list li"}
+
+	for _, selector := range listSelectors {
+		doc.Find(selector).Each(func(i int, li *goquery.Selection) {
+			person := c.extractPersonFromElement(li, baseURL)
+			if person != nil && person.LastName != "" {
+				staff = append(staff, person)
+			}
+		})
+
+		if len(staff) > 0 {
+			break
+		}
+	}
+
+	return staff
+}
+
+// extractFromVCard extracts staff from vCard/hCard microformats.
+//
+// For every .vcard / .h-card element it reads the formatted name, e-mail
+// address (microformat class first, mailto link as fallback), title, photo and
+// profile URL. Text values are trimmed, and a "?..." query is removed from
+// mailto addresses, matching the other extractors. Cards for which parseName
+// yields no last name are dropped.
+func (c *StaffCrawler) extractFromVCard(doc *goquery.Document, baseURL string) []*database.UniversityStaff {
+	var staff []*database.UniversityStaff
+
+	doc.Find(".vcard, .h-card").Each(func(i int, s *goquery.Selection) {
+		person := &database.UniversityStaff{}
+
+		// Name
+		fn := strings.TrimSpace(s.Find(".fn, .p-name").Text())
+		if fn != "" {
+			person.FullName = &fn
+			c.parseName(fn, person)
+		}
+
+		// Email
+		email := strings.TrimSpace(s.Find(".email, .u-email").Text())
+		if email == "" {
+			s.Find("a[href^='mailto:']").Each(func(j int, a *goquery.Selection) {
+				href, _ := a.Attr("href")
+				addr := strings.TrimPrefix(href, "mailto:")
+				email = strings.TrimSpace(strings.Split(addr, "?")[0])
+			})
+		}
+		if email != "" {
+			person.Email = &email
+		}
+
+		// Title/Position
+		title := strings.TrimSpace(s.Find(".title, .p-job-title, .role").Text())
+		if title != "" {
+			person.Position = &title
+			person.PositionType = c.classifyPosition(title)
+			person.IsProfessor = c.isProfessor(title)
+		}
+
+		// Photo: the last matching element that carries a src wins.
+		s.Find(".photo, .u-photo, img").Each(func(j int, img *goquery.Selection) {
+			src, exists := img.Attr("src")
+			if exists {
+				fullURL := resolveURL(baseURL, src)
+				person.PhotoURL = &fullURL
+			}
+		})
+
+		// Profile URL
+		s.Find("a[href].url, a[href].u-url").Each(func(j int, a *goquery.Selection) {
+			href, exists := a.Attr("href")
+			if exists {
+				fullURL := resolveURL(baseURL, href)
+				person.ProfileURL = &fullURL
+			}
+		})
+
+		if person.LastName != "" {
+			staff = append(staff, person)
+		}
+	})
+
+	return staff
+}
+
+// extractPersonFromElement builds a staff record from one generic HTML element
+// (a card, list item, ...).
+//
+// Name: the first name-like child whose text is non-empty, shorter than 100
+// bytes and does not read like a position; if that gives no last name, the
+// first line of the element's text is tried. E-mail: the last mailto link,
+// without its query part. Position: the first position-like child accepted by
+// looksLikePosition. Photo: the last image that is neither placeholder nor
+// icon. Profile URL: the first link that is neither mailto: nor tel:.
+//
+// The result is never nil; callers check LastName to see whether a person was
+// recognised.
+func (c *StaffCrawler) extractPersonFromElement(s *goquery.Selection, baseURL string) *database.UniversityStaff {
+	person := &database.UniversityStaff{}
+
+	// Name from a dedicated child element.
+	for _, sel := range []string{".name", ".person-name", "h2", "h3", "h4", ".title", "strong", "b"} {
+		candidate := strings.TrimSpace(s.Find(sel).First().Text())
+		if candidate == "" || len(candidate) >= 100 || c.looksLikePosition(candidate) {
+			continue
+		}
+		person.FullName = &candidate
+		c.parseName(candidate, person)
+		break
+	}
+
+	// Fallback: the first line of the element's whole text.
+	if person.LastName == "" {
+		whole := strings.TrimSpace(s.Text())
+		firstLine := strings.TrimSpace(strings.Split(whole, "\n")[0])
+		if firstLine != "" && len(firstLine) < 100 {
+			person.FullName = &firstLine
+			c.parseName(firstLine, person)
+		}
+	}
+
+	// E-mail address without "?subject=..." style parameters.
+	s.Find("a[href^='mailto:']").Each(func(_ int, link *goquery.Selection) {
+		href, _ := link.Attr("href")
+		addr := strings.Split(strings.TrimPrefix(href, "mailto:"), "?")[0]
+		person.Email = &addr
+	})
+
+	// Position from the first child that reads like a job title.
+	for _, sel := range []string{".position", ".role", ".job-title", ".funktion", "small", ".subtitle"} {
+		title := strings.TrimSpace(s.Find(sel).First().Text())
+		if title == "" || !c.looksLikePosition(title) {
+			continue
+		}
+		person.Position = &title
+		person.PositionType = c.classifyPosition(title)
+		person.IsProfessor = c.isProfessor(title)
+		break
+	}
+
+	// Photo, ignoring placeholder and icon graphics.
+	s.Find("img").Each(func(_ int, img *goquery.Selection) {
+		src, ok := img.Attr("src")
+		if !ok || strings.Contains(src, "placeholder") || strings.Contains(src, "icon") {
+			return
+		}
+		photo := resolveURL(baseURL, src)
+		person.PhotoURL = &photo
+	})
+
+	// Profile link: the first ordinary hyperlink.
+	s.Find("a[href]").Each(func(_ int, link *goquery.Selection) {
+		href, ok := link.Attr("href")
+		if !ok || strings.HasPrefix(href, "mailto:") || strings.HasPrefix(href, "tel:") {
+			return
+		}
+		profile := resolveURL(baseURL, href)
+		if person.ProfileURL == nil {
+			person.ProfileURL = &profile
+		}
+	})
+
+	return person
+}