feat: edu-search-service migriert, voice-service/geo-service entfernt
All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Successful in 1m45s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 21s
All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Successful in 1m45s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 21s
- edu-search-service von breakpilot-pwa nach breakpilot-lehrer kopiert (ohne vendor) - opensearch + edu-search-service in docker-compose.yml hinzugefuegt - voice-service aus docker-compose.yml entfernt (jetzt in breakpilot-core) - geo-service aus docker-compose.yml entfernt (nicht mehr benoetigt) - CI/CD: edu-search-service zu Gitea Actions und Woodpecker hinzugefuegt (Go lint, test mit go mod download, build, SBOM) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
217
edu-search-service/internal/staff/orchestrator_adapter.go
Normal file
217
edu-search-service/internal/staff/orchestrator_adapter.go
Normal file
@@ -0,0 +1,217 @@
|
||||
// Package staff provides university staff crawling functionality
|
||||
package staff
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"log"
|
||||
"time"
|
||||
|
||||
"github.com/google/uuid"
|
||||
|
||||
"github.com/breakpilot/edu-search-service/internal/database"
|
||||
"github.com/breakpilot/edu-search-service/internal/orchestrator"
|
||||
)
|
||||
|
||||
// OrchestratorAdapter adapts the StaffCrawler to the orchestrator.StaffCrawlerInterface
// This bridges the gap between the generic StaffCrawler and the multi-phase orchestrator:
// each phase method runs the crawler and reports results as a *orchestrator.CrawlProgress.
type OrchestratorAdapter struct {
	crawler *StaffCrawler        // performs page discovery and staff extraction
	repo    *database.Repository // looks up universities and counts stored staff
}
|
||||
|
||||
// NewOrchestratorAdapter creates a new adapter that connects StaffCrawler to the orchestrator
|
||||
func NewOrchestratorAdapter(crawler *StaffCrawler, repo *database.Repository) *OrchestratorAdapter {
|
||||
return &OrchestratorAdapter{
|
||||
crawler: crawler,
|
||||
repo: repo,
|
||||
}
|
||||
}
|
||||
|
||||
// DiscoverSampleProfessor finds at least one professor to validate crawling works for this university.
// This is Phase 1: Quick validation that the university website is crawlable.
//
// It loads the university, discovers candidate staff pages, and tries to extract
// at least one person from them. The returned CrawlProgress always carries the
// phase, start/completion timestamps, and any accumulated error strings, even
// when a non-nil error is also returned.
func (a *OrchestratorAdapter) DiscoverSampleProfessor(ctx context.Context, universityID uuid.UUID) (*orchestrator.CrawlProgress, error) {
	start := time.Now()
	progress := &orchestrator.CrawlProgress{
		Phase:     orchestrator.PhaseDiscovery,
		StartedAt: start,
	}

	log.Printf("[OrchestratorAdapter] Discovery phase for university %s", universityID)

	// Get university from database
	uni, err := a.repo.GetUniversityByID(ctx, universityID)
	if err != nil {
		progress.Errors = append(progress.Errors, fmt.Sprintf("Failed to get university: %v", err))
		return progress, fmt.Errorf("failed to get university: %w", err)
	}

	// Repository signals "not found" with a nil university and nil error.
	if uni == nil {
		progress.Errors = append(progress.Errors, "University not found")
		return progress, fmt.Errorf("university not found: %s", universityID)
	}

	log.Printf("[OrchestratorAdapter] Discovering staff pages for %s (%s)", uni.Name, uni.URL)

	// Use the crawler to find staff pages (discovery phase)
	staffPages, err := a.crawler.findStaffPages(ctx, uni)
	if err != nil {
		progress.Errors = append(progress.Errors, fmt.Sprintf("Failed to find staff pages: %v", err))
		return progress, fmt.Errorf("failed to find staff pages: %w", err)
	}

	log.Printf("[OrchestratorAdapter] Found %d staff pages for %s", len(staffPages), uni.Name)

	// Try to extract at least one professor as validation.
	// Per-page extraction errors are logged and skipped, not fatal.
	var sampleFound int
	for _, pageURL := range staffPages {
		if sampleFound > 0 {
			break // We just need to validate one works
		}

		staffMembers, err := a.crawler.extractStaffFromPage(ctx, pageURL, uni)
		if err != nil {
			log.Printf("[OrchestratorAdapter] Error extracting from %s: %v", pageURL, err)
			continue
		}

		// Count professors found (stop at the first one on this page)
		for _, staff := range staffMembers {
			if staff.IsProfessor {
				sampleFound++
				log.Printf("[OrchestratorAdapter] Found sample professor: %s %s",
					stringValue(staff.FirstName), staff.LastName)
				break
			}
		}

		// Even non-professors validate the crawler works
		if sampleFound == 0 && len(staffMembers) > 0 {
			sampleFound = 1
			log.Printf("[OrchestratorAdapter] Found sample staff member (not professor): %s %s",
				stringValue(staffMembers[0].FirstName), staffMembers[0].LastName)
		}
	}

	// ItemsFound reports the number of crawlable pages, not the people extracted.
	progress.ItemsFound = len(staffPages)
	now := time.Now()
	progress.CompletedAt = &now

	if sampleFound == 0 && len(staffPages) > 0 {
		// Pages found but no staff extracted - still consider it successful
		log.Printf("[OrchestratorAdapter] Discovery completed: %d pages found, extraction may need tuning", len(staffPages))
	} else if sampleFound == 0 {
		// No pages at all: discovery genuinely failed.
		progress.Errors = append(progress.Errors, "No staff pages found")
		return progress, fmt.Errorf("no staff pages found for %s", uni.Name)
	}

	log.Printf("[OrchestratorAdapter] Discovery completed for %s: %d pages found", uni.Name, len(staffPages))
	return progress, nil
}
|
||||
|
||||
// CrawlProfessors crawls all professors at a university
|
||||
// This is Phase 2: Focus on finding professors specifically
|
||||
func (a *OrchestratorAdapter) CrawlProfessors(ctx context.Context, universityID uuid.UUID) (*orchestrator.CrawlProgress, error) {
|
||||
start := time.Now()
|
||||
progress := &orchestrator.CrawlProgress{
|
||||
Phase: orchestrator.PhaseProfessors,
|
||||
StartedAt: start,
|
||||
}
|
||||
|
||||
log.Printf("[OrchestratorAdapter] Professors phase for university %s", universityID)
|
||||
|
||||
// Get university
|
||||
uni, err := a.repo.GetUniversityByID(ctx, universityID)
|
||||
if err != nil || uni == nil {
|
||||
progress.Errors = append(progress.Errors, fmt.Sprintf("Failed to get university: %v", err))
|
||||
return progress, fmt.Errorf("failed to get university: %w", err)
|
||||
}
|
||||
|
||||
// Perform full crawl
|
||||
result, err := a.crawler.CrawlUniversity(ctx, uni)
|
||||
if err != nil {
|
||||
progress.Errors = append(progress.Errors, fmt.Sprintf("Crawl failed: %v", err))
|
||||
return progress, err
|
||||
}
|
||||
|
||||
// Count professors specifically
|
||||
professorCount := 0
|
||||
staffList, err := a.repo.SearchStaff(ctx, database.StaffSearchParams{
|
||||
UniversityID: &universityID,
|
||||
IsProfessor: boolPtr(true),
|
||||
Limit: 10000,
|
||||
})
|
||||
if err == nil {
|
||||
professorCount = staffList.Total
|
||||
}
|
||||
|
||||
progress.ItemsFound = professorCount
|
||||
progress.ItemsProcessed = result.StaffFound
|
||||
progress.Errors = result.Errors
|
||||
now := time.Now()
|
||||
progress.CompletedAt = &now
|
||||
|
||||
log.Printf("[OrchestratorAdapter] Professors phase completed for %s: %d professors found", uni.Name, professorCount)
|
||||
return progress, nil
|
||||
}
|
||||
|
||||
// CrawlAllStaff crawls all staff members at a university
|
||||
// This is Phase 3: Get all staff (already done in Phase 2, but we verify/extend)
|
||||
func (a *OrchestratorAdapter) CrawlAllStaff(ctx context.Context, universityID uuid.UUID) (*orchestrator.CrawlProgress, error) {
|
||||
start := time.Now()
|
||||
progress := &orchestrator.CrawlProgress{
|
||||
Phase: orchestrator.PhaseAllStaff,
|
||||
StartedAt: start,
|
||||
}
|
||||
|
||||
log.Printf("[OrchestratorAdapter] All Staff phase for university %s", universityID)
|
||||
|
||||
// Get university
|
||||
uni, err := a.repo.GetUniversityByID(ctx, universityID)
|
||||
if err != nil || uni == nil {
|
||||
progress.Errors = append(progress.Errors, fmt.Sprintf("Failed to get university: %v", err))
|
||||
return progress, fmt.Errorf("failed to get university: %w", err)
|
||||
}
|
||||
|
||||
// Run another crawl pass to catch any missed staff
|
||||
result, err := a.crawler.CrawlUniversity(ctx, uni)
|
||||
if err != nil {
|
||||
progress.Errors = result.Errors
|
||||
// Don't fail completely - we may have some staff already
|
||||
log.Printf("[OrchestratorAdapter] All Staff crawl had errors: %v", err)
|
||||
}
|
||||
|
||||
// Get total staff count
|
||||
staffCount := 0
|
||||
staffList, err := a.repo.SearchStaff(ctx, database.StaffSearchParams{
|
||||
UniversityID: &universityID,
|
||||
Limit: 1, // Just need count
|
||||
})
|
||||
if err == nil {
|
||||
staffCount = staffList.Total
|
||||
}
|
||||
|
||||
progress.ItemsFound = staffCount
|
||||
if result != nil {
|
||||
progress.ItemsProcessed = result.StaffFound
|
||||
progress.Errors = result.Errors
|
||||
}
|
||||
now := time.Now()
|
||||
progress.CompletedAt = &now
|
||||
|
||||
log.Printf("[OrchestratorAdapter] All Staff phase completed for %s: %d total staff", uni.Name, staffCount)
|
||||
return progress, nil
|
||||
}
|
||||
|
||||
// Helper functions
|
||||
|
||||
// stringValue dereferences s, treating a nil pointer as the empty string.
func stringValue(s *string) string {
	if s != nil {
		return *s
	}
	return ""
}
|
||||
|
||||
// boolPtr returns a pointer to a freshly allocated copy of b.
func boolPtr(b bool) *bool {
	v := b
	return &v
}
|
||||
342
edu-search-service/internal/staff/patterns.go
Normal file
342
edu-search-service/internal/staff/patterns.go
Normal file
@@ -0,0 +1,342 @@
|
||||
package staff
|
||||
|
||||
import (
	"regexp"
	"sort"
	"strings"
)
|
||||
|
||||
// UniversityPatterns contains URL patterns for specific universities.
// The map is keyed by domain as registered in registerKnownPatterns;
// GetConfig normalizes lookups to lowercase without a "www." prefix.
type UniversityPatterns struct {
	patterns map[string]UniversityConfig
}
|
||||
|
||||
// UniversityConfig contains crawling configuration for a specific university.
// Selector fields hold CSS selectors (several alternatives may be combined
// with commas); zero-value fields mean "no site-specific rule".
type UniversityConfig struct {
	StaffListURLs    []string       // URLs to staff listing pages
	StaffLinkPattern *regexp.Regexp // Pattern to identify staff profile links (nil when unknown)
	NameSelector     string         // CSS selector for person name
	PositionSelector string         // CSS selector for position
	EmailSelector    string         // CSS selector for email
	PhotoSelector    string         // CSS selector for photo
	Extractors       []string       // List of extractor types to use
}
|
||||
|
||||
// NewUniversityPatterns creates a new pattern registry with known patterns
|
||||
func NewUniversityPatterns() *UniversityPatterns {
|
||||
p := &UniversityPatterns{
|
||||
patterns: make(map[string]UniversityConfig),
|
||||
}
|
||||
|
||||
// Register known university patterns
|
||||
p.registerKnownPatterns()
|
||||
|
||||
return p
|
||||
}
|
||||
|
||||
// GetConfig returns the configuration for a university domain
|
||||
func (p *UniversityPatterns) GetConfig(domain string) *UniversityConfig {
|
||||
// Normalize domain
|
||||
domain = strings.ToLower(domain)
|
||||
domain = strings.TrimPrefix(domain, "www.")
|
||||
|
||||
if config, ok := p.patterns[domain]; ok {
|
||||
return &config
|
||||
}
|
||||
|
||||
// Try partial match
|
||||
for key, config := range p.patterns {
|
||||
if strings.Contains(domain, key) || strings.Contains(key, domain) {
|
||||
return &config
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// registerKnownPatterns registers patterns for known German universities.
// Each entry maps a bare domain to site-specific entry URLs and CSS selectors.
// NOTE(review): the URLs and selectors below are hand-curated and not verified
// here against the live sites — confirm before relying on a specific entry.
func (p *UniversityPatterns) registerKnownPatterns() {
	// KIT - Karlsruher Institut für Technologie
	p.patterns["kit.edu"] = UniversityConfig{
		StaffListURLs: []string{
			"https://www.kit.edu/kit/fakultaeten.php",
		},
		StaffLinkPattern: regexp.MustCompile(`/personen/\d+`),
		NameSelector:     ".person-name, h1.title",
		PositionSelector: ".person-position, .position",
		EmailSelector:    "a[href^='mailto:']",
		PhotoSelector:    ".person-image img, .portrait img",
	}

	// TUM - Technische Universität München
	p.patterns["tum.de"] = UniversityConfig{
		StaffListURLs: []string{
			"https://www.tum.de/die-tum/fakultaeten",
		},
		StaffLinkPattern: regexp.MustCompile(`/person/\w+`),
		NameSelector:     ".person-name, h1",
		PositionSelector: ".person-title, .function",
		EmailSelector:    "a[href^='mailto:']",
		PhotoSelector:    ".person-photo img",
	}

	// LMU - Ludwig-Maximilians-Universität München
	p.patterns["lmu.de"] = UniversityConfig{
		StaffListURLs: []string{
			"https://www.lmu.de/de/die-lmu/struktur/fakultaeten-einrichtungen-zentren-und-weitere-institutionen/",
		},
		NameSelector:     ".person h2, .staff-name",
		PositionSelector: ".person-position, .staff-position",
		EmailSelector:    "a[href^='mailto:']",
	}

	// RWTH Aachen
	p.patterns["rwth-aachen.de"] = UniversityConfig{
		StaffListURLs: []string{
			"https://www.rwth-aachen.de/cms/root/Die-RWTH/Fakultaeten/~ep/Fakultaeten-und-Einrichtungen/",
		},
		NameSelector:     ".person-name, h3.title",
		PositionSelector: ".person-function, .position",
		EmailSelector:    "a[href^='mailto:']",
	}

	// TU Berlin (site lives on the tu.berlin domain)
	p.patterns["tu-berlin.de"] = UniversityConfig{
		StaffListURLs: []string{
			"https://www.tu.berlin/ueber-die-tu-berlin/organisation/fakultaeten-und-einrichtungen",
		},
		NameSelector:     ".person-name, h2",
		PositionSelector: ".position, .function",
		EmailSelector:    "a[href^='mailto:']",
	}

	// FU Berlin
	p.patterns["fu-berlin.de"] = UniversityConfig{
		StaffListURLs: []string{
			"https://www.fu-berlin.de/einrichtungen/fachbereiche/",
		},
		NameSelector:     ".person-fullname, h2",
		PositionSelector: ".person-position",
		EmailSelector:    "a[href^='mailto:']",
	}

	// HU Berlin
	p.patterns["hu-berlin.de"] = UniversityConfig{
		StaffListURLs: []string{
			"https://www.hu-berlin.de/de/einrichtungen-organisation/fakultaeten-und-institute",
		},
		NameSelector:     ".person h2, .name",
		PositionSelector: ".function, .position",
		EmailSelector:    "a[href^='mailto:']",
	}

	// Universität Freiburg
	p.patterns["uni-freiburg.de"] = UniversityConfig{
		StaffListURLs: []string{
			"https://uni-freiburg.de/universitaet/fakultaeten/",
		},
		NameSelector:     ".person-name, h2",
		PositionSelector: ".person-position, .function",
		EmailSelector:    "a[href^='mailto:']",
	}

	// Universität Heidelberg
	p.patterns["uni-heidelberg.de"] = UniversityConfig{
		StaffListURLs: []string{
			"https://www.uni-heidelberg.de/de/fakultaeten",
		},
		NameSelector:     ".person-fullname, h2",
		PositionSelector: ".person-position",
		EmailSelector:    "a[href^='mailto:']",
	}

	// TU Dresden
	p.patterns["tu-dresden.de"] = UniversityConfig{
		StaffListURLs: []string{
			"https://tu-dresden.de/tu-dresden/organisation/bereiche-und-fakultaeten",
		},
		NameSelector:     ".person-name, h2.name",
		PositionSelector: ".person-function, .funktion",
		EmailSelector:    "a[href^='mailto:']",
	}

	// Universität Leipzig
	p.patterns["uni-leipzig.de"] = UniversityConfig{
		StaffListURLs: []string{
			"https://www.uni-leipzig.de/universitaet/struktur/fakultaeten",
		},
		NameSelector:     ".person h2, .name",
		PositionSelector: ".position, .funktion",
		EmailSelector:    "a[href^='mailto:']",
	}

	// Universität Köln
	p.patterns["uni-koeln.de"] = UniversityConfig{
		StaffListURLs: []string{
			"https://www.uni-koeln.de/",
		},
		NameSelector:     ".person-name, h2",
		PositionSelector: ".person-position, .function",
		EmailSelector:    "a[href^='mailto:']",
	}

	// Universität Bonn
	p.patterns["uni-bonn.de"] = UniversityConfig{
		StaffListURLs: []string{
			"https://www.uni-bonn.de/de/universitaet/fakultaeten",
		},
		NameSelector:     ".person-name, h2",
		PositionSelector: ".person-position",
		EmailSelector:    "a[href^='mailto:']",
	}

	// Universität Münster
	p.patterns["uni-muenster.de"] = UniversityConfig{
		StaffListURLs: []string{
			"https://www.uni-muenster.de/de/fakultaeten.html",
		},
		NameSelector:     ".person-name, h2",
		PositionSelector: ".person-function",
		EmailSelector:    "a[href^='mailto:']",
	}

	// Universität Hamburg
	p.patterns["uni-hamburg.de"] = UniversityConfig{
		StaffListURLs: []string{
			"https://www.uni-hamburg.de/einrichtungen/fakultaeten.html",
		},
		NameSelector:     ".person-name, h2",
		PositionSelector: ".position",
		EmailSelector:    "a[href^='mailto:']",
	}

	// Universität Göttingen
	p.patterns["uni-goettingen.de"] = UniversityConfig{
		StaffListURLs: []string{
			"https://www.uni-goettingen.de/de/fakultaeten/27952.html",
		},
		NameSelector:     ".person-name, h2",
		PositionSelector: ".person-position",
		EmailSelector:    "a[href^='mailto:']",
	}

	// TU Darmstadt
	p.patterns["tu-darmstadt.de"] = UniversityConfig{
		StaffListURLs: []string{
			"https://www.tu-darmstadt.de/universitaet/fachbereiche/index.de.jsp",
		},
		NameSelector:     ".person-name, h2",
		PositionSelector: ".person-position, .funktion",
		EmailSelector:    "a[href^='mailto:']",
	}
}
|
||||
|
||||
// CommonStaffPagePaths returns common URL paths under which university sites
// publish their staff listings (German and English variants).
func CommonStaffPagePaths() []string {
	paths := []string{"/personen", "/team", "/mitarbeiter", "/mitarbeitende"}
	paths = append(paths, "/staff", "/people", "/ueber-uns/team", "/about/team")
	paths = append(paths, "/fakultaet/personen", "/institut/mitarbeiter", "/lehrstuhl/team")
	paths = append(paths, "/personal", "/beschaeftigte", "/dozenten", "/professoren")
	return paths
}
|
||||
|
||||
// CommonPersonSelectors returns common CSS selectors that mark a single
// person/staff entry on a listing page, including microformat markers.
func CommonPersonSelectors() []string {
	selectors := []string{".person", ".person-card", ".staff-member", ".team-member"}
	selectors = append(selectors, ".mitarbeiter", ".employee", ".vcard", ".h-card")
	selectors = append(selectors, "[itemtype='http://schema.org/Person']")
	selectors = append(selectors, ".person-entry", ".staff-entry", ".profile-card")
	return selectors
}
|
||||
|
||||
// TitlePrefixes returns common German academic title prefixes.
// NOTE(review): within each title family, longer variants precede shorter
// ones (e.g. "Prof. Dr. Dr." before "Prof. Dr.") — this looks intentional so
// that greedy prefix matching strips the longest title first; preserve the
// ordering when editing.
func TitlePrefixes() []string {
	return []string{
		"Prof. Dr. Dr. h.c. mult.",
		"Prof. Dr. Dr. h.c.",
		"Prof. Dr. Dr.",
		"Prof. Dr.-Ing.",
		"Prof. Dr. rer. nat.",
		"Prof. Dr. phil.",
		"Prof. Dr. jur.",
		"Prof. Dr. med.",
		"Prof. Dr.",
		"Prof.",
		"PD Dr.",
		"apl. Prof. Dr.",
		"Jun.-Prof. Dr.",
		"Dr.-Ing.",
		"Dr. rer. nat.",
		"Dr. phil.",
		"Dr. jur.",
		"Dr. med.",
		"Dr.",
		"Dipl.-Ing.",
		"Dipl.-Inf.",
		"Dipl.-Phys.",
		"Dipl.-Math.",
		"Dipl.-Kfm.",
		"M.Sc.",
		"M.A.",
		"M.Eng.",
		"B.Sc.",
		"B.A.",
	}
}
|
||||
|
||||
// PositionKeywords returns keywords that indicate staff positions.
// The list mixes German masculine/feminine forms plus a few English terms,
// grouped by role category.
func PositionKeywords() []string {
	return []string{
		// Professors
		"Professor", "Professorin",
		"Ordinarius",
		"Lehrstuhlinhaber", "Lehrstuhlinhaberin",
		"Dekan", "Dekanin",
		"Rektor", "Rektorin",

		// Research staff
		"Wissenschaftlicher Mitarbeiter", "Wissenschaftliche Mitarbeiterin",
		"Akademischer Rat", "Akademische Rätin",
		"Postdoktorand", "Postdoktorandin",
		"Doktorand", "Doktorandin",
		"Promovend", "Promovendin",
		"Forscher", "Forscherin",
		"Researcher",

		// Teaching
		"Dozent", "Dozentin",
		"Lektor", "Lektorin",
		"Lehrbeauftragter", "Lehrbeauftragte",

		// Administrative
		"Sekretär", "Sekretärin",
		"Geschäftsführer", "Geschäftsführerin",
		"Verwaltungsleiter", "Verwaltungsleiterin",
		"Referent", "Referentin",

		// Students
		"Studentische Hilfskraft",
		"Wissenschaftliche Hilfskraft",
		"Tutor", "Tutorin",
	}
}
|
||||
78
edu-search-service/internal/staff/publication_adapter.go
Normal file
78
edu-search-service/internal/staff/publication_adapter.go
Normal file
@@ -0,0 +1,78 @@
|
||||
// Package staff provides university staff and publication crawling functionality
|
||||
package staff
|
||||
|
||||
import (
|
||||
"context"
|
||||
"log"
|
||||
"time"
|
||||
|
||||
"github.com/google/uuid"
|
||||
|
||||
"github.com/breakpilot/edu-search-service/internal/database"
|
||||
"github.com/breakpilot/edu-search-service/internal/orchestrator"
|
||||
)
|
||||
|
||||
// PublicationOrchestratorAdapter adapts publication crawling to the orchestrator interface.
// Note: This is a stub for now - publication crawling is a future feature;
// the current implementation only counts publications already in the database.
type PublicationOrchestratorAdapter struct {
	repo *database.Repository // used to enumerate staff and their stored publications
}
|
||||
|
||||
// NewPublicationOrchestratorAdapter creates a new publication crawler adapter
|
||||
func NewPublicationOrchestratorAdapter(repo *database.Repository) *PublicationOrchestratorAdapter {
|
||||
return &PublicationOrchestratorAdapter{
|
||||
repo: repo,
|
||||
}
|
||||
}
|
||||
|
||||
// CrawlPublicationsForUniversity crawls publications for all staff at a university.
// This is Phase 4: Publication discovery (future implementation).
//
// Currently a stub: it does not fetch anything externally. It lists the
// university's staff, counts publications already stored for them, and
// reports that count in ItemsFound (ItemsProcessed carries the staff total).
func (a *PublicationOrchestratorAdapter) CrawlPublicationsForUniversity(ctx context.Context, universityID uuid.UUID) (*orchestrator.CrawlProgress, error) {
	start := time.Now()
	progress := &orchestrator.CrawlProgress{
		Phase:     orchestrator.PhasePublications,
		StartedAt: start,
	}

	log.Printf("[PublicationAdapter] Publications phase for university %s", universityID)

	// Get staff members for this university
	staffList, err := a.repo.SearchStaff(ctx, database.StaffSearchParams{
		UniversityID: &universityID,
		Limit:        10000,
	})
	if err != nil {
		progress.Errors = append(progress.Errors, err.Error())
		return progress, err
	}

	log.Printf("[PublicationAdapter] Found %d staff members for publication crawling", staffList.Total)

	// TODO: Implement actual publication crawling
	// - For each staff member with ORCID/Google Scholar ID:
	//   - Fetch publications from ORCID API
	//   - Fetch publications from Google Scholar
	//   - Match and deduplicate
	//   - Store in database
	//
	// For now, we mark this phase as complete (no-op)

	pubCount := 0

	// Count existing publications for this university.
	// Per-staff query errors are deliberately ignored (best-effort count).
	// NOTE(review): this issues one query per staff member (N+1); a single
	// aggregate count query would be preferable once this phase is real.
	for _, staff := range staffList.Staff {
		pubs, err := a.repo.GetStaffPublications(ctx, staff.ID)
		if err == nil {
			pubCount += len(pubs)
		}
	}

	progress.ItemsFound = pubCount
	progress.ItemsProcessed = staffList.Total
	now := time.Now()
	progress.CompletedAt = &now

	log.Printf("[PublicationAdapter] Publications phase completed for university %s: %d existing publications found", universityID, pubCount)

	return progress, nil
}
|
||||
1402
edu-search-service/internal/staff/staff_crawler.go
Normal file
1402
edu-search-service/internal/staff/staff_crawler.go
Normal file
File diff suppressed because it is too large
Load Diff
348
edu-search-service/internal/staff/staff_crawler_test.go
Normal file
348
edu-search-service/internal/staff/staff_crawler_test.go
Normal file
@@ -0,0 +1,348 @@
|
||||
package staff
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/breakpilot/edu-search-service/internal/database"
|
||||
)
|
||||
|
||||
func TestParseName_FullName_WithTitle(t *testing.T) {
|
||||
crawler := &StaffCrawler{}
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
fullName string
|
||||
expectedFirst string
|
||||
expectedLast string
|
||||
expectedTitle bool
|
||||
}{
|
||||
{
|
||||
name: "Prof. Dr. with first and last name",
|
||||
fullName: "Prof. Dr. Hans Müller",
|
||||
expectedFirst: "Hans",
|
||||
expectedLast: "Müller",
|
||||
expectedTitle: true,
|
||||
},
|
||||
{
|
||||
name: "Dr. with first and last name",
|
||||
fullName: "Dr. Maria Schmidt",
|
||||
expectedFirst: "Maria",
|
||||
expectedLast: "Schmidt",
|
||||
expectedTitle: true,
|
||||
},
|
||||
{
|
||||
name: "Simple name without title",
|
||||
fullName: "Thomas Weber",
|
||||
expectedFirst: "Thomas",
|
||||
expectedLast: "Weber",
|
||||
expectedTitle: false,
|
||||
},
|
||||
{
|
||||
name: "Multiple first names",
|
||||
fullName: "Prof. Dr. Hans-Peter Meier",
|
||||
expectedFirst: "Hans-Peter",
|
||||
expectedLast: "Meier",
|
||||
expectedTitle: true,
|
||||
},
|
||||
{
|
||||
name: "Single name",
|
||||
fullName: "Müller",
|
||||
expectedFirst: "",
|
||||
expectedLast: "Müller",
|
||||
expectedTitle: false,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
person := &database.UniversityStaff{}
|
||||
crawler.parseName(tt.fullName, person)
|
||||
|
||||
firstName := ""
|
||||
if person.FirstName != nil {
|
||||
firstName = *person.FirstName
|
||||
}
|
||||
|
||||
if firstName != tt.expectedFirst {
|
||||
t.Errorf("First name: expected %q, got %q", tt.expectedFirst, firstName)
|
||||
}
|
||||
if person.LastName != tt.expectedLast {
|
||||
t.Errorf("Last name: expected %q, got %q", tt.expectedLast, person.LastName)
|
||||
}
|
||||
hasTitle := person.Title != nil && *person.Title != ""
|
||||
if hasTitle != tt.expectedTitle {
|
||||
t.Errorf("Has title: expected %v, got %v", tt.expectedTitle, hasTitle)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestClassifyPosition_Professor(t *testing.T) {
|
||||
crawler := &StaffCrawler{}
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
position string
|
||||
expected string
|
||||
}{
|
||||
{"Full Professor", "Professor für Informatik", "professor"},
|
||||
{"Prof abbreviation", "Prof. Dr. Müller", "professor"},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := crawler.classifyPosition(tt.position)
|
||||
if result == nil {
|
||||
t.Errorf("Expected %q, got nil for position %q", tt.expected, tt.position)
|
||||
return
|
||||
}
|
||||
if *result != tt.expected {
|
||||
t.Errorf("Expected %q, got %q for position %q", tt.expected, *result, tt.position)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestClassifyPosition_Postdoc(t *testing.T) {
|
||||
crawler := &StaffCrawler{}
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
position string
|
||||
expected string
|
||||
}{
|
||||
{"Postdoc", "Postdoc in Machine Learning", "postdoc"},
|
||||
{"Post-Doc hyphenated", "Post-Doc", "postdoc"},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := crawler.classifyPosition(tt.position)
|
||||
if result == nil {
|
||||
t.Errorf("Expected %q, got nil for position %q", tt.expected, tt.position)
|
||||
return
|
||||
}
|
||||
if *result != tt.expected {
|
||||
t.Errorf("Expected %q, got %q for position %q", tt.expected, *result, tt.position)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestClassifyPosition_PhDStudent(t *testing.T) {
|
||||
crawler := &StaffCrawler{}
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
position string
|
||||
expected string
|
||||
}{
|
||||
{"Doktorand", "Doktorand", "phd_student"},
|
||||
{"PhD Student", "PhD Student", "phd_student"},
|
||||
{"Promovend", "Promovend", "phd_student"},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := crawler.classifyPosition(tt.position)
|
||||
if result == nil {
|
||||
t.Errorf("Expected %q, got nil for position %q", tt.expected, tt.position)
|
||||
return
|
||||
}
|
||||
if *result != tt.expected {
|
||||
t.Errorf("Expected %q, got %q for position %q", tt.expected, *result, tt.position)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestClassifyPosition_Admin(t *testing.T) {
|
||||
crawler := &StaffCrawler{}
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
position string
|
||||
expected string
|
||||
}{
|
||||
{"Sekretariat", "Sekretärin", "admin"},
|
||||
{"Verwaltung", "Verwaltung", "admin"},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := crawler.classifyPosition(tt.position)
|
||||
if result == nil {
|
||||
t.Errorf("Expected %q, got nil for position %q", tt.expected, tt.position)
|
||||
return
|
||||
}
|
||||
if *result != tt.expected {
|
||||
t.Errorf("Expected %q, got %q for position %q", tt.expected, *result, tt.position)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestClassifyPosition_Researcher(t *testing.T) {
|
||||
crawler := &StaffCrawler{}
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
position string
|
||||
expected string
|
||||
}{
|
||||
{"Wissenschaftlicher Mitarbeiter", "Wissenschaftlicher Mitarbeiter", "researcher"},
|
||||
{"Researcher", "Senior Researcher", "researcher"},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := crawler.classifyPosition(tt.position)
|
||||
if result == nil {
|
||||
t.Errorf("Expected %q, got nil for position %q", tt.expected, tt.position)
|
||||
return
|
||||
}
|
||||
if *result != tt.expected {
|
||||
t.Errorf("Expected %q, got %q for position %q", tt.expected, *result, tt.position)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestClassifyPosition_Student(t *testing.T) {
|
||||
crawler := &StaffCrawler{}
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
position string
|
||||
expected string
|
||||
}{
|
||||
{"Studentische Hilfskraft", "Studentische Hilfskraft", "student"},
|
||||
{"HiWi", "Student (HiWi)", "student"},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := crawler.classifyPosition(tt.position)
|
||||
if result == nil {
|
||||
t.Errorf("Expected %q, got nil for position %q", tt.expected, tt.position)
|
||||
return
|
||||
}
|
||||
if *result != tt.expected {
|
||||
t.Errorf("Expected %q, got %q for position %q", tt.expected, *result, tt.position)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestIsProfessor_True(t *testing.T) {
|
||||
crawler := &StaffCrawler{}
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
position string
|
||||
}{
|
||||
{"Professor keyword", "Professor für Mathematik"},
|
||||
{"Prof. abbreviation", "Prof. Dr. Müller"},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := crawler.isProfessor(tt.position)
|
||||
if !result {
|
||||
t.Errorf("Expected true for position=%q", tt.position)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestIsProfessor_False(t *testing.T) {
|
||||
crawler := &StaffCrawler{}
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
position string
|
||||
}{
|
||||
{"Dr. only", "Dr. Wissenschaftlicher Mitarbeiter"},
|
||||
{"Doktorand", "Doktorand"},
|
||||
{"Technical staff", "Laboringenieur"},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := crawler.isProfessor(tt.position)
|
||||
if result {
|
||||
t.Errorf("Expected false for position=%q", tt.position)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestLooksLikePosition_True(t *testing.T) {
|
||||
crawler := &StaffCrawler{}
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
text string
|
||||
}{
|
||||
{"Professor", "Professor für Informatik"},
|
||||
{"Wissenschaftlicher Mitarbeiter", "Wissenschaftlicher Mitarbeiter"},
|
||||
{"Doktorand", "Doktorand"},
|
||||
{"Sekretär", "Sekretärin"},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := crawler.looksLikePosition(tt.text)
|
||||
if !result {
|
||||
t.Errorf("Expected true for text=%q", tt.text)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestLooksLikePosition_False(t *testing.T) {
|
||||
crawler := &StaffCrawler{}
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
text string
|
||||
}{
|
||||
{"Name", "Hans Müller"},
|
||||
{"Email", "test@example.com"},
|
||||
{"Random text", "Room 123"},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := crawler.looksLikePosition(tt.text)
|
||||
if result {
|
||||
t.Errorf("Expected false for text=%q", tt.text)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveURL(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
baseURL string
|
||||
href string
|
||||
expected string
|
||||
}{
|
||||
{"Absolute URL", "https://example.com", "https://other.com/page", "https://other.com/page"},
|
||||
{"Relative path", "https://example.com/team", "/person/123", "https://example.com/person/123"},
|
||||
{"Relative no slash", "https://example.com/team/", "member", "https://example.com/team/member"},
|
||||
{"Empty href", "https://example.com", "", ""},
|
||||
{"Root relative", "https://example.com/a/b/c", "/root", "https://example.com/root"},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := resolveURL(tt.baseURL, tt.href)
|
||||
if result != tt.expected {
|
||||
t.Errorf("resolveURL(%q, %q) = %q, expected %q",
|
||||
tt.baseURL, tt.href, result, tt.expected)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user