feat: edu-search-service migriert, voice-service/geo-service entfernt
All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Successful in 1m45s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 21s
All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Successful in 1m45s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 21s
- edu-search-service von breakpilot-pwa nach breakpilot-lehrer kopiert (ohne vendor) - opensearch + edu-search-service in docker-compose.yml hinzugefuegt - voice-service aus docker-compose.yml entfernt (jetzt in breakpilot-core) - geo-service aus docker-compose.yml entfernt (nicht mehr benoetigt) - CI/CD: edu-search-service zu Gitea Actions und Woodpecker hinzugefuegt (Go lint, test mit go mod download, build, SBOM) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
217
edu-search-service/internal/staff/orchestrator_adapter.go
Normal file
217
edu-search-service/internal/staff/orchestrator_adapter.go
Normal file
@@ -0,0 +1,217 @@
|
||||
// Package staff provides university staff crawling functionality
|
||||
package staff
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"log"
|
||||
"time"
|
||||
|
||||
"github.com/google/uuid"
|
||||
|
||||
"github.com/breakpilot/edu-search-service/internal/database"
|
||||
"github.com/breakpilot/edu-search-service/internal/orchestrator"
|
||||
)
|
||||
|
||||
// OrchestratorAdapter adapts the StaffCrawler to the orchestrator.StaffCrawlerInterface
// This bridges the gap between the generic StaffCrawler and the multi-phase orchestrator:
// each phase method runs the crawler and reports results as a *orchestrator.CrawlProgress.
type OrchestratorAdapter struct {
	crawler *StaffCrawler        // performs page discovery and staff extraction
	repo    *database.Repository // looks up universities and counts stored staff
}
|
||||
|
||||
// NewOrchestratorAdapter creates a new adapter that connects StaffCrawler to the orchestrator
|
||||
func NewOrchestratorAdapter(crawler *StaffCrawler, repo *database.Repository) *OrchestratorAdapter {
|
||||
return &OrchestratorAdapter{
|
||||
crawler: crawler,
|
||||
repo: repo,
|
||||
}
|
||||
}
|
||||
|
||||
// DiscoverSampleProfessor finds at least one professor to validate crawling works for this university.
// This is Phase 1: Quick validation that the university website is crawlable.
//
// It loads the university, discovers candidate staff pages, and tries to extract
// at least one person from them. The returned CrawlProgress always carries the
// phase, start/completion timestamps, and any accumulated error strings, even
// when a non-nil error is also returned.
func (a *OrchestratorAdapter) DiscoverSampleProfessor(ctx context.Context, universityID uuid.UUID) (*orchestrator.CrawlProgress, error) {
	start := time.Now()
	progress := &orchestrator.CrawlProgress{
		Phase:     orchestrator.PhaseDiscovery,
		StartedAt: start,
	}

	log.Printf("[OrchestratorAdapter] Discovery phase for university %s", universityID)

	// Get university from database
	uni, err := a.repo.GetUniversityByID(ctx, universityID)
	if err != nil {
		progress.Errors = append(progress.Errors, fmt.Sprintf("Failed to get university: %v", err))
		return progress, fmt.Errorf("failed to get university: %w", err)
	}

	// Repository signals "not found" with a nil university and nil error.
	if uni == nil {
		progress.Errors = append(progress.Errors, "University not found")
		return progress, fmt.Errorf("university not found: %s", universityID)
	}

	log.Printf("[OrchestratorAdapter] Discovering staff pages for %s (%s)", uni.Name, uni.URL)

	// Use the crawler to find staff pages (discovery phase)
	staffPages, err := a.crawler.findStaffPages(ctx, uni)
	if err != nil {
		progress.Errors = append(progress.Errors, fmt.Sprintf("Failed to find staff pages: %v", err))
		return progress, fmt.Errorf("failed to find staff pages: %w", err)
	}

	log.Printf("[OrchestratorAdapter] Found %d staff pages for %s", len(staffPages), uni.Name)

	// Try to extract at least one professor as validation.
	// Per-page extraction errors are logged and skipped, not fatal.
	var sampleFound int
	for _, pageURL := range staffPages {
		if sampleFound > 0 {
			break // We just need to validate one works
		}

		staffMembers, err := a.crawler.extractStaffFromPage(ctx, pageURL, uni)
		if err != nil {
			log.Printf("[OrchestratorAdapter] Error extracting from %s: %v", pageURL, err)
			continue
		}

		// Count professors found (stop at the first one on this page)
		for _, staff := range staffMembers {
			if staff.IsProfessor {
				sampleFound++
				log.Printf("[OrchestratorAdapter] Found sample professor: %s %s",
					stringValue(staff.FirstName), staff.LastName)
				break
			}
		}

		// Even non-professors validate the crawler works
		if sampleFound == 0 && len(staffMembers) > 0 {
			sampleFound = 1
			log.Printf("[OrchestratorAdapter] Found sample staff member (not professor): %s %s",
				stringValue(staffMembers[0].FirstName), staffMembers[0].LastName)
		}
	}

	// ItemsFound reports the number of crawlable pages, not the people extracted.
	progress.ItemsFound = len(staffPages)
	now := time.Now()
	progress.CompletedAt = &now

	if sampleFound == 0 && len(staffPages) > 0 {
		// Pages found but no staff extracted - still consider it successful
		log.Printf("[OrchestratorAdapter] Discovery completed: %d pages found, extraction may need tuning", len(staffPages))
	} else if sampleFound == 0 {
		// No pages at all: discovery genuinely failed.
		progress.Errors = append(progress.Errors, "No staff pages found")
		return progress, fmt.Errorf("no staff pages found for %s", uni.Name)
	}

	log.Printf("[OrchestratorAdapter] Discovery completed for %s: %d pages found", uni.Name, len(staffPages))
	return progress, nil
}
|
||||
|
||||
// CrawlProfessors crawls all professors at a university
|
||||
// This is Phase 2: Focus on finding professors specifically
|
||||
func (a *OrchestratorAdapter) CrawlProfessors(ctx context.Context, universityID uuid.UUID) (*orchestrator.CrawlProgress, error) {
|
||||
start := time.Now()
|
||||
progress := &orchestrator.CrawlProgress{
|
||||
Phase: orchestrator.PhaseProfessors,
|
||||
StartedAt: start,
|
||||
}
|
||||
|
||||
log.Printf("[OrchestratorAdapter] Professors phase for university %s", universityID)
|
||||
|
||||
// Get university
|
||||
uni, err := a.repo.GetUniversityByID(ctx, universityID)
|
||||
if err != nil || uni == nil {
|
||||
progress.Errors = append(progress.Errors, fmt.Sprintf("Failed to get university: %v", err))
|
||||
return progress, fmt.Errorf("failed to get university: %w", err)
|
||||
}
|
||||
|
||||
// Perform full crawl
|
||||
result, err := a.crawler.CrawlUniversity(ctx, uni)
|
||||
if err != nil {
|
||||
progress.Errors = append(progress.Errors, fmt.Sprintf("Crawl failed: %v", err))
|
||||
return progress, err
|
||||
}
|
||||
|
||||
// Count professors specifically
|
||||
professorCount := 0
|
||||
staffList, err := a.repo.SearchStaff(ctx, database.StaffSearchParams{
|
||||
UniversityID: &universityID,
|
||||
IsProfessor: boolPtr(true),
|
||||
Limit: 10000,
|
||||
})
|
||||
if err == nil {
|
||||
professorCount = staffList.Total
|
||||
}
|
||||
|
||||
progress.ItemsFound = professorCount
|
||||
progress.ItemsProcessed = result.StaffFound
|
||||
progress.Errors = result.Errors
|
||||
now := time.Now()
|
||||
progress.CompletedAt = &now
|
||||
|
||||
log.Printf("[OrchestratorAdapter] Professors phase completed for %s: %d professors found", uni.Name, professorCount)
|
||||
return progress, nil
|
||||
}
|
||||
|
||||
// CrawlAllStaff crawls all staff members at a university
|
||||
// This is Phase 3: Get all staff (already done in Phase 2, but we verify/extend)
|
||||
func (a *OrchestratorAdapter) CrawlAllStaff(ctx context.Context, universityID uuid.UUID) (*orchestrator.CrawlProgress, error) {
|
||||
start := time.Now()
|
||||
progress := &orchestrator.CrawlProgress{
|
||||
Phase: orchestrator.PhaseAllStaff,
|
||||
StartedAt: start,
|
||||
}
|
||||
|
||||
log.Printf("[OrchestratorAdapter] All Staff phase for university %s", universityID)
|
||||
|
||||
// Get university
|
||||
uni, err := a.repo.GetUniversityByID(ctx, universityID)
|
||||
if err != nil || uni == nil {
|
||||
progress.Errors = append(progress.Errors, fmt.Sprintf("Failed to get university: %v", err))
|
||||
return progress, fmt.Errorf("failed to get university: %w", err)
|
||||
}
|
||||
|
||||
// Run another crawl pass to catch any missed staff
|
||||
result, err := a.crawler.CrawlUniversity(ctx, uni)
|
||||
if err != nil {
|
||||
progress.Errors = result.Errors
|
||||
// Don't fail completely - we may have some staff already
|
||||
log.Printf("[OrchestratorAdapter] All Staff crawl had errors: %v", err)
|
||||
}
|
||||
|
||||
// Get total staff count
|
||||
staffCount := 0
|
||||
staffList, err := a.repo.SearchStaff(ctx, database.StaffSearchParams{
|
||||
UniversityID: &universityID,
|
||||
Limit: 1, // Just need count
|
||||
})
|
||||
if err == nil {
|
||||
staffCount = staffList.Total
|
||||
}
|
||||
|
||||
progress.ItemsFound = staffCount
|
||||
if result != nil {
|
||||
progress.ItemsProcessed = result.StaffFound
|
||||
progress.Errors = result.Errors
|
||||
}
|
||||
now := time.Now()
|
||||
progress.CompletedAt = &now
|
||||
|
||||
log.Printf("[OrchestratorAdapter] All Staff phase completed for %s: %d total staff", uni.Name, staffCount)
|
||||
return progress, nil
|
||||
}
|
||||
|
||||
// Helper functions
|
||||
|
||||
// stringValue dereferences s, treating a nil pointer as the empty string.
func stringValue(s *string) string {
	if s != nil {
		return *s
	}
	return ""
}
|
||||
|
||||
// boolPtr returns a pointer to a freshly allocated copy of b.
func boolPtr(b bool) *bool {
	v := b
	return &v
}
|
||||
342
edu-search-service/internal/staff/patterns.go
Normal file
342
edu-search-service/internal/staff/patterns.go
Normal file
@@ -0,0 +1,342 @@
|
||||
package staff
|
||||
|
||||
import (
	"regexp"
	"sort"
	"strings"
)
|
||||
|
||||
// UniversityPatterns contains URL patterns for specific universities.
// The map is keyed by domain as registered in registerKnownPatterns;
// GetConfig normalizes lookups to lowercase without a "www." prefix.
type UniversityPatterns struct {
	patterns map[string]UniversityConfig
}
|
||||
|
||||
// UniversityConfig contains crawling configuration for a specific university.
// Selector fields hold CSS selectors (several alternatives may be combined
// with commas); zero-value fields mean "no site-specific rule".
type UniversityConfig struct {
	StaffListURLs    []string       // URLs to staff listing pages
	StaffLinkPattern *regexp.Regexp // Pattern to identify staff profile links (nil when unknown)
	NameSelector     string         // CSS selector for person name
	PositionSelector string         // CSS selector for position
	EmailSelector    string         // CSS selector for email
	PhotoSelector    string         // CSS selector for photo
	Extractors       []string       // List of extractor types to use
}
|
||||
|
||||
// NewUniversityPatterns creates a new pattern registry with known patterns
|
||||
func NewUniversityPatterns() *UniversityPatterns {
|
||||
p := &UniversityPatterns{
|
||||
patterns: make(map[string]UniversityConfig),
|
||||
}
|
||||
|
||||
// Register known university patterns
|
||||
p.registerKnownPatterns()
|
||||
|
||||
return p
|
||||
}
|
||||
|
||||
// GetConfig returns the configuration for a university domain
|
||||
func (p *UniversityPatterns) GetConfig(domain string) *UniversityConfig {
|
||||
// Normalize domain
|
||||
domain = strings.ToLower(domain)
|
||||
domain = strings.TrimPrefix(domain, "www.")
|
||||
|
||||
if config, ok := p.patterns[domain]; ok {
|
||||
return &config
|
||||
}
|
||||
|
||||
// Try partial match
|
||||
for key, config := range p.patterns {
|
||||
if strings.Contains(domain, key) || strings.Contains(key, domain) {
|
||||
return &config
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// registerKnownPatterns registers patterns for known German universities.
// Each entry maps a bare domain to site-specific entry URLs and CSS selectors.
// NOTE(review): the URLs and selectors below are hand-curated and not verified
// here against the live sites — confirm before relying on a specific entry.
func (p *UniversityPatterns) registerKnownPatterns() {
	// KIT - Karlsruher Institut für Technologie
	p.patterns["kit.edu"] = UniversityConfig{
		StaffListURLs: []string{
			"https://www.kit.edu/kit/fakultaeten.php",
		},
		StaffLinkPattern: regexp.MustCompile(`/personen/\d+`),
		NameSelector:     ".person-name, h1.title",
		PositionSelector: ".person-position, .position",
		EmailSelector:    "a[href^='mailto:']",
		PhotoSelector:    ".person-image img, .portrait img",
	}

	// TUM - Technische Universität München
	p.patterns["tum.de"] = UniversityConfig{
		StaffListURLs: []string{
			"https://www.tum.de/die-tum/fakultaeten",
		},
		StaffLinkPattern: regexp.MustCompile(`/person/\w+`),
		NameSelector:     ".person-name, h1",
		PositionSelector: ".person-title, .function",
		EmailSelector:    "a[href^='mailto:']",
		PhotoSelector:    ".person-photo img",
	}

	// LMU - Ludwig-Maximilians-Universität München
	p.patterns["lmu.de"] = UniversityConfig{
		StaffListURLs: []string{
			"https://www.lmu.de/de/die-lmu/struktur/fakultaeten-einrichtungen-zentren-und-weitere-institutionen/",
		},
		NameSelector:     ".person h2, .staff-name",
		PositionSelector: ".person-position, .staff-position",
		EmailSelector:    "a[href^='mailto:']",
	}

	// RWTH Aachen
	p.patterns["rwth-aachen.de"] = UniversityConfig{
		StaffListURLs: []string{
			"https://www.rwth-aachen.de/cms/root/Die-RWTH/Fakultaeten/~ep/Fakultaeten-und-Einrichtungen/",
		},
		NameSelector:     ".person-name, h3.title",
		PositionSelector: ".person-function, .position",
		EmailSelector:    "a[href^='mailto:']",
	}

	// TU Berlin (site lives on the tu.berlin domain)
	p.patterns["tu-berlin.de"] = UniversityConfig{
		StaffListURLs: []string{
			"https://www.tu.berlin/ueber-die-tu-berlin/organisation/fakultaeten-und-einrichtungen",
		},
		NameSelector:     ".person-name, h2",
		PositionSelector: ".position, .function",
		EmailSelector:    "a[href^='mailto:']",
	}

	// FU Berlin
	p.patterns["fu-berlin.de"] = UniversityConfig{
		StaffListURLs: []string{
			"https://www.fu-berlin.de/einrichtungen/fachbereiche/",
		},
		NameSelector:     ".person-fullname, h2",
		PositionSelector: ".person-position",
		EmailSelector:    "a[href^='mailto:']",
	}

	// HU Berlin
	p.patterns["hu-berlin.de"] = UniversityConfig{
		StaffListURLs: []string{
			"https://www.hu-berlin.de/de/einrichtungen-organisation/fakultaeten-und-institute",
		},
		NameSelector:     ".person h2, .name",
		PositionSelector: ".function, .position",
		EmailSelector:    "a[href^='mailto:']",
	}

	// Universität Freiburg
	p.patterns["uni-freiburg.de"] = UniversityConfig{
		StaffListURLs: []string{
			"https://uni-freiburg.de/universitaet/fakultaeten/",
		},
		NameSelector:     ".person-name, h2",
		PositionSelector: ".person-position, .function",
		EmailSelector:    "a[href^='mailto:']",
	}

	// Universität Heidelberg
	p.patterns["uni-heidelberg.de"] = UniversityConfig{
		StaffListURLs: []string{
			"https://www.uni-heidelberg.de/de/fakultaeten",
		},
		NameSelector:     ".person-fullname, h2",
		PositionSelector: ".person-position",
		EmailSelector:    "a[href^='mailto:']",
	}

	// TU Dresden
	p.patterns["tu-dresden.de"] = UniversityConfig{
		StaffListURLs: []string{
			"https://tu-dresden.de/tu-dresden/organisation/bereiche-und-fakultaeten",
		},
		NameSelector:     ".person-name, h2.name",
		PositionSelector: ".person-function, .funktion",
		EmailSelector:    "a[href^='mailto:']",
	}

	// Universität Leipzig
	p.patterns["uni-leipzig.de"] = UniversityConfig{
		StaffListURLs: []string{
			"https://www.uni-leipzig.de/universitaet/struktur/fakultaeten",
		},
		NameSelector:     ".person h2, .name",
		PositionSelector: ".position, .funktion",
		EmailSelector:    "a[href^='mailto:']",
	}

	// Universität Köln
	p.patterns["uni-koeln.de"] = UniversityConfig{
		StaffListURLs: []string{
			"https://www.uni-koeln.de/",
		},
		NameSelector:     ".person-name, h2",
		PositionSelector: ".person-position, .function",
		EmailSelector:    "a[href^='mailto:']",
	}

	// Universität Bonn
	p.patterns["uni-bonn.de"] = UniversityConfig{
		StaffListURLs: []string{
			"https://www.uni-bonn.de/de/universitaet/fakultaeten",
		},
		NameSelector:     ".person-name, h2",
		PositionSelector: ".person-position",
		EmailSelector:    "a[href^='mailto:']",
	}

	// Universität Münster
	p.patterns["uni-muenster.de"] = UniversityConfig{
		StaffListURLs: []string{
			"https://www.uni-muenster.de/de/fakultaeten.html",
		},
		NameSelector:     ".person-name, h2",
		PositionSelector: ".person-function",
		EmailSelector:    "a[href^='mailto:']",
	}

	// Universität Hamburg
	p.patterns["uni-hamburg.de"] = UniversityConfig{
		StaffListURLs: []string{
			"https://www.uni-hamburg.de/einrichtungen/fakultaeten.html",
		},
		NameSelector:     ".person-name, h2",
		PositionSelector: ".position",
		EmailSelector:    "a[href^='mailto:']",
	}

	// Universität Göttingen
	p.patterns["uni-goettingen.de"] = UniversityConfig{
		StaffListURLs: []string{
			"https://www.uni-goettingen.de/de/fakultaeten/27952.html",
		},
		NameSelector:     ".person-name, h2",
		PositionSelector: ".person-position",
		EmailSelector:    "a[href^='mailto:']",
	}

	// TU Darmstadt
	p.patterns["tu-darmstadt.de"] = UniversityConfig{
		StaffListURLs: []string{
			"https://www.tu-darmstadt.de/universitaet/fachbereiche/index.de.jsp",
		},
		NameSelector:     ".person-name, h2",
		PositionSelector: ".person-position, .funktion",
		EmailSelector:    "a[href^='mailto:']",
	}
}
|
||||
|
||||
// CommonStaffPagePaths returns common URL paths under which university sites
// publish their staff listings (German and English variants).
func CommonStaffPagePaths() []string {
	paths := []string{"/personen", "/team", "/mitarbeiter", "/mitarbeitende"}
	paths = append(paths, "/staff", "/people", "/ueber-uns/team", "/about/team")
	paths = append(paths, "/fakultaet/personen", "/institut/mitarbeiter", "/lehrstuhl/team")
	paths = append(paths, "/personal", "/beschaeftigte", "/dozenten", "/professoren")
	return paths
}
|
||||
|
||||
// CommonPersonSelectors returns common CSS selectors that mark a single
// person/staff entry on a listing page, including microformat markers.
func CommonPersonSelectors() []string {
	selectors := []string{".person", ".person-card", ".staff-member", ".team-member"}
	selectors = append(selectors, ".mitarbeiter", ".employee", ".vcard", ".h-card")
	selectors = append(selectors, "[itemtype='http://schema.org/Person']")
	selectors = append(selectors, ".person-entry", ".staff-entry", ".profile-card")
	return selectors
}
|
||||
|
||||
// TitlePrefixes returns common German academic title prefixes.
// NOTE(review): within each title family, longer variants precede shorter
// ones (e.g. "Prof. Dr. Dr." before "Prof. Dr.") — this looks intentional so
// that greedy prefix matching strips the longest title first; preserve the
// ordering when editing.
func TitlePrefixes() []string {
	return []string{
		"Prof. Dr. Dr. h.c. mult.",
		"Prof. Dr. Dr. h.c.",
		"Prof. Dr. Dr.",
		"Prof. Dr.-Ing.",
		"Prof. Dr. rer. nat.",
		"Prof. Dr. phil.",
		"Prof. Dr. jur.",
		"Prof. Dr. med.",
		"Prof. Dr.",
		"Prof.",
		"PD Dr.",
		"apl. Prof. Dr.",
		"Jun.-Prof. Dr.",
		"Dr.-Ing.",
		"Dr. rer. nat.",
		"Dr. phil.",
		"Dr. jur.",
		"Dr. med.",
		"Dr.",
		"Dipl.-Ing.",
		"Dipl.-Inf.",
		"Dipl.-Phys.",
		"Dipl.-Math.",
		"Dipl.-Kfm.",
		"M.Sc.",
		"M.A.",
		"M.Eng.",
		"B.Sc.",
		"B.A.",
	}
}
|
||||
|
||||
// PositionKeywords returns keywords that indicate staff positions.
// The list mixes German masculine/feminine forms plus a few English terms,
// grouped by role category.
func PositionKeywords() []string {
	return []string{
		// Professors
		"Professor", "Professorin",
		"Ordinarius",
		"Lehrstuhlinhaber", "Lehrstuhlinhaberin",
		"Dekan", "Dekanin",
		"Rektor", "Rektorin",

		// Research staff
		"Wissenschaftlicher Mitarbeiter", "Wissenschaftliche Mitarbeiterin",
		"Akademischer Rat", "Akademische Rätin",
		"Postdoktorand", "Postdoktorandin",
		"Doktorand", "Doktorandin",
		"Promovend", "Promovendin",
		"Forscher", "Forscherin",
		"Researcher",

		// Teaching
		"Dozent", "Dozentin",
		"Lektor", "Lektorin",
		"Lehrbeauftragter", "Lehrbeauftragte",

		// Administrative
		"Sekretär", "Sekretärin",
		"Geschäftsführer", "Geschäftsführerin",
		"Verwaltungsleiter", "Verwaltungsleiterin",
		"Referent", "Referentin",

		// Students
		"Studentische Hilfskraft",
		"Wissenschaftliche Hilfskraft",
		"Tutor", "Tutorin",
	}
}
|
||||
78
edu-search-service/internal/staff/publication_adapter.go
Normal file
78
edu-search-service/internal/staff/publication_adapter.go
Normal file
@@ -0,0 +1,78 @@
|
||||
// Package staff provides university staff and publication crawling functionality
|
||||
package staff
|
||||
|
||||
import (
|
||||
"context"
|
||||
"log"
|
||||
"time"
|
||||
|
||||
"github.com/google/uuid"
|
||||
|
||||
"github.com/breakpilot/edu-search-service/internal/database"
|
||||
"github.com/breakpilot/edu-search-service/internal/orchestrator"
|
||||
)
|
||||
|
||||
// PublicationOrchestratorAdapter adapts publication crawling to the orchestrator interface.
// Note: This is a stub for now - publication crawling is a future feature;
// the current implementation only counts publications already in the database.
type PublicationOrchestratorAdapter struct {
	repo *database.Repository // used to enumerate staff and their stored publications
}
|
||||
|
||||
// NewPublicationOrchestratorAdapter creates a new publication crawler adapter
|
||||
func NewPublicationOrchestratorAdapter(repo *database.Repository) *PublicationOrchestratorAdapter {
|
||||
return &PublicationOrchestratorAdapter{
|
||||
repo: repo,
|
||||
}
|
||||
}
|
||||
|
||||
// CrawlPublicationsForUniversity crawls publications for all staff at a university.
// This is Phase 4: Publication discovery (future implementation).
//
// Currently a stub: it does not fetch anything externally. It lists the
// university's staff, counts publications already stored for them, and
// reports that count in ItemsFound (ItemsProcessed carries the staff total).
func (a *PublicationOrchestratorAdapter) CrawlPublicationsForUniversity(ctx context.Context, universityID uuid.UUID) (*orchestrator.CrawlProgress, error) {
	start := time.Now()
	progress := &orchestrator.CrawlProgress{
		Phase:     orchestrator.PhasePublications,
		StartedAt: start,
	}

	log.Printf("[PublicationAdapter] Publications phase for university %s", universityID)

	// Get staff members for this university
	staffList, err := a.repo.SearchStaff(ctx, database.StaffSearchParams{
		UniversityID: &universityID,
		Limit:        10000,
	})
	if err != nil {
		progress.Errors = append(progress.Errors, err.Error())
		return progress, err
	}

	log.Printf("[PublicationAdapter] Found %d staff members for publication crawling", staffList.Total)

	// TODO: Implement actual publication crawling
	// - For each staff member with ORCID/Google Scholar ID:
	//   - Fetch publications from ORCID API
	//   - Fetch publications from Google Scholar
	//   - Match and deduplicate
	//   - Store in database
	//
	// For now, we mark this phase as complete (no-op)

	pubCount := 0

	// Count existing publications for this university.
	// Per-staff query errors are deliberately ignored (best-effort count).
	// NOTE(review): this issues one query per staff member (N+1); a single
	// aggregate count query would be preferable once this phase is real.
	for _, staff := range staffList.Staff {
		pubs, err := a.repo.GetStaffPublications(ctx, staff.ID)
		if err == nil {
			pubCount += len(pubs)
		}
	}

	progress.ItemsFound = pubCount
	progress.ItemsProcessed = staffList.Total
	now := time.Now()
	progress.CompletedAt = &now

	log.Printf("[PublicationAdapter] Publications phase completed for university %s: %d existing publications found", universityID, pubCount)

	return progress, nil
}
|
||||
1402
edu-search-service/internal/staff/staff_crawler.go
Normal file
1402
edu-search-service/internal/staff/staff_crawler.go
Normal file
File diff suppressed because it is too large
Load Diff
348
edu-search-service/internal/staff/staff_crawler_test.go
Normal file
348
edu-search-service/internal/staff/staff_crawler_test.go
Normal file
@@ -0,0 +1,348 @@
|
||||
package staff
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/breakpilot/edu-search-service/internal/database"
|
||||
)
|
||||
|
||||
func TestParseName_FullName_WithTitle(t *testing.T) {
|
||||
crawler := &StaffCrawler{}
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
fullName string
|
||||
expectedFirst string
|
||||
expectedLast string
|
||||
expectedTitle bool
|
||||
}{
|
||||
{
|
||||
name: "Prof. Dr. with first and last name",
|
||||
fullName: "Prof. Dr. Hans Müller",
|
||||
expectedFirst: "Hans",
|
||||
expectedLast: "Müller",
|
||||
expectedTitle: true,
|
||||
},
|
||||
{
|
||||
name: "Dr. with first and last name",
|
||||
fullName: "Dr. Maria Schmidt",
|
||||
expectedFirst: "Maria",
|
||||
expectedLast: "Schmidt",
|
||||
expectedTitle: true,
|
||||
},
|
||||
{
|
||||
name: "Simple name without title",
|
||||
fullName: "Thomas Weber",
|
||||
expectedFirst: "Thomas",
|
||||
expectedLast: "Weber",
|
||||
expectedTitle: false,
|
||||
},
|
||||
{
|
||||
name: "Multiple first names",
|
||||
fullName: "Prof. Dr. Hans-Peter Meier",
|
||||
expectedFirst: "Hans-Peter",
|
||||
expectedLast: "Meier",
|
||||
expectedTitle: true,
|
||||
},
|
||||
{
|
||||
name: "Single name",
|
||||
fullName: "Müller",
|
||||
expectedFirst: "",
|
||||
expectedLast: "Müller",
|
||||
expectedTitle: false,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
person := &database.UniversityStaff{}
|
||||
crawler.parseName(tt.fullName, person)
|
||||
|
||||
firstName := ""
|
||||
if person.FirstName != nil {
|
||||
firstName = *person.FirstName
|
||||
}
|
||||
|
||||
if firstName != tt.expectedFirst {
|
||||
t.Errorf("First name: expected %q, got %q", tt.expectedFirst, firstName)
|
||||
}
|
||||
if person.LastName != tt.expectedLast {
|
||||
t.Errorf("Last name: expected %q, got %q", tt.expectedLast, person.LastName)
|
||||
}
|
||||
hasTitle := person.Title != nil && *person.Title != ""
|
||||
if hasTitle != tt.expectedTitle {
|
||||
t.Errorf("Has title: expected %v, got %v", tt.expectedTitle, hasTitle)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestClassifyPosition_Professor(t *testing.T) {
|
||||
crawler := &StaffCrawler{}
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
position string
|
||||
expected string
|
||||
}{
|
||||
{"Full Professor", "Professor für Informatik", "professor"},
|
||||
{"Prof abbreviation", "Prof. Dr. Müller", "professor"},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := crawler.classifyPosition(tt.position)
|
||||
if result == nil {
|
||||
t.Errorf("Expected %q, got nil for position %q", tt.expected, tt.position)
|
||||
return
|
||||
}
|
||||
if *result != tt.expected {
|
||||
t.Errorf("Expected %q, got %q for position %q", tt.expected, *result, tt.position)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestClassifyPosition_Postdoc(t *testing.T) {
|
||||
crawler := &StaffCrawler{}
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
position string
|
||||
expected string
|
||||
}{
|
||||
{"Postdoc", "Postdoc in Machine Learning", "postdoc"},
|
||||
{"Post-Doc hyphenated", "Post-Doc", "postdoc"},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := crawler.classifyPosition(tt.position)
|
||||
if result == nil {
|
||||
t.Errorf("Expected %q, got nil for position %q", tt.expected, tt.position)
|
||||
return
|
||||
}
|
||||
if *result != tt.expected {
|
||||
t.Errorf("Expected %q, got %q for position %q", tt.expected, *result, tt.position)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestClassifyPosition_PhDStudent(t *testing.T) {
|
||||
crawler := &StaffCrawler{}
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
position string
|
||||
expected string
|
||||
}{
|
||||
{"Doktorand", "Doktorand", "phd_student"},
|
||||
{"PhD Student", "PhD Student", "phd_student"},
|
||||
{"Promovend", "Promovend", "phd_student"},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := crawler.classifyPosition(tt.position)
|
||||
if result == nil {
|
||||
t.Errorf("Expected %q, got nil for position %q", tt.expected, tt.position)
|
||||
return
|
||||
}
|
||||
if *result != tt.expected {
|
||||
t.Errorf("Expected %q, got %q for position %q", tt.expected, *result, tt.position)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestClassifyPosition_Admin(t *testing.T) {
|
||||
crawler := &StaffCrawler{}
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
position string
|
||||
expected string
|
||||
}{
|
||||
{"Sekretariat", "Sekretärin", "admin"},
|
||||
{"Verwaltung", "Verwaltung", "admin"},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := crawler.classifyPosition(tt.position)
|
||||
if result == nil {
|
||||
t.Errorf("Expected %q, got nil for position %q", tt.expected, tt.position)
|
||||
return
|
||||
}
|
||||
if *result != tt.expected {
|
||||
t.Errorf("Expected %q, got %q for position %q", tt.expected, *result, tt.position)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestClassifyPosition_Researcher(t *testing.T) {
|
||||
crawler := &StaffCrawler{}
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
position string
|
||||
expected string
|
||||
}{
|
||||
{"Wissenschaftlicher Mitarbeiter", "Wissenschaftlicher Mitarbeiter", "researcher"},
|
||||
{"Researcher", "Senior Researcher", "researcher"},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := crawler.classifyPosition(tt.position)
|
||||
if result == nil {
|
||||
t.Errorf("Expected %q, got nil for position %q", tt.expected, tt.position)
|
||||
return
|
||||
}
|
||||
if *result != tt.expected {
|
||||
t.Errorf("Expected %q, got %q for position %q", tt.expected, *result, tt.position)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestClassifyPosition_Student(t *testing.T) {
|
||||
crawler := &StaffCrawler{}
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
position string
|
||||
expected string
|
||||
}{
|
||||
{"Studentische Hilfskraft", "Studentische Hilfskraft", "student"},
|
||||
{"HiWi", "Student (HiWi)", "student"},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := crawler.classifyPosition(tt.position)
|
||||
if result == nil {
|
||||
t.Errorf("Expected %q, got nil for position %q", tt.expected, tt.position)
|
||||
return
|
||||
}
|
||||
if *result != tt.expected {
|
||||
t.Errorf("Expected %q, got %q for position %q", tt.expected, *result, tt.position)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestIsProfessor_True(t *testing.T) {
|
||||
crawler := &StaffCrawler{}
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
position string
|
||||
}{
|
||||
{"Professor keyword", "Professor für Mathematik"},
|
||||
{"Prof. abbreviation", "Prof. Dr. Müller"},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := crawler.isProfessor(tt.position)
|
||||
if !result {
|
||||
t.Errorf("Expected true for position=%q", tt.position)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestIsProfessor_False(t *testing.T) {
|
||||
crawler := &StaffCrawler{}
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
position string
|
||||
}{
|
||||
{"Dr. only", "Dr. Wissenschaftlicher Mitarbeiter"},
|
||||
{"Doktorand", "Doktorand"},
|
||||
{"Technical staff", "Laboringenieur"},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := crawler.isProfessor(tt.position)
|
||||
if result {
|
||||
t.Errorf("Expected false for position=%q", tt.position)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestLooksLikePosition_True(t *testing.T) {
|
||||
crawler := &StaffCrawler{}
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
text string
|
||||
}{
|
||||
{"Professor", "Professor für Informatik"},
|
||||
{"Wissenschaftlicher Mitarbeiter", "Wissenschaftlicher Mitarbeiter"},
|
||||
{"Doktorand", "Doktorand"},
|
||||
{"Sekretär", "Sekretärin"},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := crawler.looksLikePosition(tt.text)
|
||||
if !result {
|
||||
t.Errorf("Expected true for text=%q", tt.text)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestLooksLikePosition_False(t *testing.T) {
|
||||
crawler := &StaffCrawler{}
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
text string
|
||||
}{
|
||||
{"Name", "Hans Müller"},
|
||||
{"Email", "test@example.com"},
|
||||
{"Random text", "Room 123"},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := crawler.looksLikePosition(tt.text)
|
||||
if result {
|
||||
t.Errorf("Expected false for text=%q", tt.text)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveURL(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
baseURL string
|
||||
href string
|
||||
expected string
|
||||
}{
|
||||
{"Absolute URL", "https://example.com", "https://other.com/page", "https://other.com/page"},
|
||||
{"Relative path", "https://example.com/team", "/person/123", "https://example.com/person/123"},
|
||||
{"Relative no slash", "https://example.com/team/", "member", "https://example.com/team/member"},
|
||||
{"Empty href", "https://example.com", "", ""},
|
||||
{"Root relative", "https://example.com/a/b/c", "/root", "https://example.com/root"},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := resolveURL(tt.baseURL, tt.href)
|
||||
if result != tt.expected {
|
||||
t.Errorf("resolveURL(%q, %q) = %q, expected %q",
|
||||
tt.baseURL, tt.href, result, tt.expected)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user