Files
breakpilot-lehrer/edu-search-service/internal/staff/patterns.go
Benjamin Boenisch 414e0f5ec0
All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Successful in 1m45s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 21s
feat: edu-search-service migriert, voice-service/geo-service entfernt
- edu-search-service von breakpilot-pwa nach breakpilot-lehrer kopiert (ohne vendor)
- opensearch + edu-search-service in docker-compose.yml hinzugefuegt
- voice-service aus docker-compose.yml entfernt (jetzt in breakpilot-core)
- geo-service aus docker-compose.yml entfernt (nicht mehr benoetigt)
- CI/CD: edu-search-service zu Gitea Actions und Woodpecker hinzugefuegt
  (Go lint, test mit go mod download, build, SBOM)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-15 18:36:38 +01:00

343 lines
9.2 KiB
Go

package staff
import (
"regexp"
"strings"
)
// UniversityPatterns is a registry of per-university crawling configurations.
// Build one with NewUniversityPatterns and look entries up with GetConfig.
type UniversityPatterns struct {
// patterns maps a domain key such as "kit.edu" to that university's config.
patterns map[string]UniversityConfig
}
// UniversityConfig contains crawling configuration for a specific university.
//
// Consumers should expect zero values: the configs registered in this file
// only set StaffLinkPattern and PhotoSelector for some universities and never
// set Extractors.
type UniversityConfig struct {
StaffListURLs []string // Entry-point URLs from which staff listings are reached
StaffLinkPattern *regexp.Regexp // Pattern to identify staff profile links; nil when the config defines none
NameSelector string // CSS selector for the person's name (may be a comma-separated selector list)
PositionSelector string // CSS selector for the person's position (may be a comma-separated selector list)
EmailSelector string // CSS selector for the e-mail link
PhotoSelector string // CSS selector for the photo; empty when the config defines none
Extractors []string // Extractor types to use; valid values are not visible in this file (TODO confirm against the consumer)
}
// NewUniversityPatterns returns a registry that has already been filled with
// the built-in university configurations.
func NewUniversityPatterns() *UniversityPatterns {
	registry := new(UniversityPatterns)
	registry.patterns = map[string]UniversityConfig{}

	// Populate the freshly created map with the known universities.
	registry.registerKnownPatterns()
	return registry
}
// GetConfig returns the crawling configuration registered for domain, or nil
// when no registered key matches.
//
// The domain is lower-cased and a leading "www." is removed before lookup.
// Candidates are then tried in this order:
//
//  1. exact key match;
//  2. domain is a subdomain of a key ("informatik.kit.edu" -> "kit.edu");
//  3. domain contains a key as a substring;
//  4. a key contains domain as a substring.
//
// Steps 3 and 4 are the historical loose match and are kept for backward
// compatibility. Within a step the longest key wins and ties are broken by
// lexical order, so the result never depends on map iteration order. An empty
// domain never matches.
//
// The returned pointer refers to a copy of the stored struct, but slice and
// regexp fields still share their underlying data with the registry.
func (p *UniversityPatterns) GetConfig(domain string) *UniversityConfig {
	// Normalize domain.
	domain = strings.TrimPrefix(strings.ToLower(domain), "www.")
	if domain == "" {
		// strings.Contains(key, "") is true for every key, so without this
		// guard an empty domain would select an arbitrary configuration.
		return nil
	}

	if config, ok := p.patterns[domain]; ok {
		return &config
	}

	// Match quality, from weakest to strongest.
	const (
		noMatch = iota
		keyContainsDomain
		domainContainsKey
		subdomainOfKey
	)

	bestKey, bestRank := "", noMatch
	for key := range p.patterns {
		rank := noMatch
		switch {
		case strings.HasSuffix(domain, "."+key):
			rank = subdomainOfKey
		case strings.Contains(domain, key):
			rank = domainContainsKey
		case strings.Contains(key, domain):
			rank = keyContainsDomain
		}
		if rank == noMatch {
			continue
		}

		// Prefer the stronger match; among equals the longer (more specific)
		// key, then the lexically smaller one, to make the choice stable.
		better := rank > bestRank ||
			(rank == bestRank && (len(key) > len(bestKey) ||
				(len(key) == len(bestKey) && key < bestKey)))
		if better {
			bestKey, bestRank = key, rank
		}
	}

	if bestRank == noMatch {
		return nil
	}
	config := p.patterns[bestKey]
	return &config
}
// registerKnownPatterns stores the built-in configurations for a fixed set of
// German universities in p.patterns, overwriting any entry that already exists
// under the same key.
//
// Keys are lower-case domains without a "www." prefix. TU Berlin is registered
// under both "tu-berlin.de" and "tu.berlin" because its staff list URL is
// served from the latter host; with only the former key, a lookup for
// "www.tu.berlin" would find nothing.
//
// p.patterns must be non-nil (NewUniversityPatterns takes care of that).
func (p *UniversityPatterns) registerKnownPatterns() {
	// All built-in configurations use the same e-mail selector.
	const mailto = "a[href^='mailto:']"

	// A map literal makes accidental duplicate keys a compile-time error.
	known := map[string]UniversityConfig{
		// KIT - Karlsruhe Institute of Technology
		"kit.edu": {
			StaffListURLs: []string{
				"https://www.kit.edu/kit/fakultaeten.php",
			},
			StaffLinkPattern: regexp.MustCompile(`/personen/\d+`),
			NameSelector:     ".person-name, h1.title",
			PositionSelector: ".person-position, .position",
			EmailSelector:    mailto,
			PhotoSelector:    ".person-image img, .portrait img",
		},
		// TUM - Technical University of Munich
		"tum.de": {
			StaffListURLs: []string{
				"https://www.tum.de/die-tum/fakultaeten",
			},
			StaffLinkPattern: regexp.MustCompile(`/person/\w+`),
			NameSelector:     ".person-name, h1",
			PositionSelector: ".person-title, .function",
			EmailSelector:    mailto,
			PhotoSelector:    ".person-photo img",
		},
		// LMU - Ludwig Maximilian University of Munich
		"lmu.de": {
			StaffListURLs: []string{
				"https://www.lmu.de/de/die-lmu/struktur/fakultaeten-einrichtungen-zentren-und-weitere-institutionen/",
			},
			NameSelector:     ".person h2, .staff-name",
			PositionSelector: ".person-position, .staff-position",
			EmailSelector:    mailto,
		},
		// RWTH Aachen
		"rwth-aachen.de": {
			StaffListURLs: []string{
				"https://www.rwth-aachen.de/cms/root/Die-RWTH/Fakultaeten/~ep/Fakultaeten-und-Einrichtungen/",
			},
			NameSelector:     ".person-name, h3.title",
			PositionSelector: ".person-function, .position",
			EmailSelector:    mailto,
		},
		// TU Berlin (legacy domain; the "tu.berlin" alias is added below)
		"tu-berlin.de": {
			StaffListURLs: []string{
				"https://www.tu.berlin/ueber-die-tu-berlin/organisation/fakultaeten-und-einrichtungen",
			},
			NameSelector:     ".person-name, h2",
			PositionSelector: ".position, .function",
			EmailSelector:    mailto,
		},
		// FU Berlin
		"fu-berlin.de": {
			StaffListURLs: []string{
				"https://www.fu-berlin.de/einrichtungen/fachbereiche/",
			},
			NameSelector:     ".person-fullname, h2",
			PositionSelector: ".person-position",
			EmailSelector:    mailto,
		},
		// HU Berlin
		"hu-berlin.de": {
			StaffListURLs: []string{
				"https://www.hu-berlin.de/de/einrichtungen-organisation/fakultaeten-und-institute",
			},
			NameSelector:     ".person h2, .name",
			PositionSelector: ".function, .position",
			EmailSelector:    mailto,
		},
		// University of Freiburg
		"uni-freiburg.de": {
			StaffListURLs: []string{
				"https://uni-freiburg.de/universitaet/fakultaeten/",
			},
			NameSelector:     ".person-name, h2",
			PositionSelector: ".person-position, .function",
			EmailSelector:    mailto,
		},
		// Heidelberg University
		"uni-heidelberg.de": {
			StaffListURLs: []string{
				"https://www.uni-heidelberg.de/de/fakultaeten",
			},
			NameSelector:     ".person-fullname, h2",
			PositionSelector: ".person-position",
			EmailSelector:    mailto,
		},
		// TU Dresden
		"tu-dresden.de": {
			StaffListURLs: []string{
				"https://tu-dresden.de/tu-dresden/organisation/bereiche-und-fakultaeten",
			},
			NameSelector:     ".person-name, h2.name",
			PositionSelector: ".person-function, .funktion",
			EmailSelector:    mailto,
		},
		// Leipzig University
		"uni-leipzig.de": {
			StaffListURLs: []string{
				"https://www.uni-leipzig.de/universitaet/struktur/fakultaeten",
			},
			NameSelector:     ".person h2, .name",
			PositionSelector: ".position, .funktion",
			EmailSelector:    mailto,
		},
		// University of Cologne
		"uni-koeln.de": {
			StaffListURLs: []string{
				"https://www.uni-koeln.de/",
			},
			NameSelector:     ".person-name, h2",
			PositionSelector: ".person-position, .function",
			EmailSelector:    mailto,
		},
		// University of Bonn
		"uni-bonn.de": {
			StaffListURLs: []string{
				"https://www.uni-bonn.de/de/universitaet/fakultaeten",
			},
			NameSelector:     ".person-name, h2",
			PositionSelector: ".person-position",
			EmailSelector:    mailto,
		},
		// University of Münster
		"uni-muenster.de": {
			StaffListURLs: []string{
				"https://www.uni-muenster.de/de/fakultaeten.html",
			},
			NameSelector:     ".person-name, h2",
			PositionSelector: ".person-function",
			EmailSelector:    mailto,
		},
		// University of Hamburg
		"uni-hamburg.de": {
			StaffListURLs: []string{
				"https://www.uni-hamburg.de/einrichtungen/fakultaeten.html",
			},
			NameSelector:     ".person-name, h2",
			PositionSelector: ".position",
			EmailSelector:    mailto,
		},
		// University of Göttingen
		"uni-goettingen.de": {
			StaffListURLs: []string{
				"https://www.uni-goettingen.de/de/fakultaeten/27952.html",
			},
			NameSelector:     ".person-name, h2",
			PositionSelector: ".person-position",
			EmailSelector:    mailto,
		},
		// TU Darmstadt
		"tu-darmstadt.de": {
			StaffListURLs: []string{
				"https://www.tu-darmstadt.de/universitaet/fachbereiche/index.de.jsp",
			},
			NameSelector:     ".person-name, h2",
			PositionSelector: ".person-position, .funktion",
			EmailSelector:    mailto,
		},
	}

	// The TU Berlin staff list lives on www.tu.berlin, which the legacy key
	// alone cannot match, so register the same configuration under both hosts.
	known["tu.berlin"] = known["tu-berlin.de"]

	for domain, config := range known {
		p.patterns[domain] = config
	}
}
// CommonStaffPagePaths lists URL paths under which staff listings are commonly
// published. Every call returns a new slice that the caller may modify.
func CommonStaffPagePaths() []string {
	paths := [...]string{
		// Generic German and English listing pages
		"/personen",
		"/team",
		"/mitarbeiter",
		"/mitarbeitende",
		"/staff",
		"/people",
		// Team pages nested below an "about" section
		"/ueber-uns/team",
		"/about/team",
		// Listings nested below an organisational unit
		"/fakultaet/personen",
		"/institut/mitarbeiter",
		"/lehrstuhl/team",
		// Further German variants
		"/personal",
		"/beschaeftigte",
		"/dozenten",
		"/professoren",
	}
	return paths[:]
}
// CommonPersonSelectors lists CSS selectors that commonly mark a single person
// element on a page. Every call returns a new slice that the caller may modify.
func CommonPersonSelectors() []string {
	selectors := [...]string{
		// Class names used for person/staff cards
		".person",
		".person-card",
		".staff-member",
		".team-member",
		".mitarbeiter",
		".employee",
		// Microformat and schema.org markup
		".vcard",
		".h-card",
		"[itemtype='http://schema.org/Person']",
		// Entry/profile style class names
		".person-entry",
		".staff-entry",
		".profile-card",
	}
	return selectors[:]
}
// TitlePrefixes lists common German academic title prefixes.
//
// The order is significant and must be kept: any entry that is a textual
// prefix of another entry (for example "Prof." or "Dr.") appears after the
// longer one. Callers scanning the list in order presumably rely on this to
// find the longest title first (verify against the caller).
// Every call returns a new slice.
func TitlePrefixes() []string {
	professorial := []string{
		"Prof. Dr. Dr. h.c. mult.",
		"Prof. Dr. Dr. h.c.",
		"Prof. Dr. Dr.",
		"Prof. Dr.-Ing.",
		"Prof. Dr. rer. nat.",
		"Prof. Dr. phil.",
		"Prof. Dr. jur.",
		"Prof. Dr. med.",
		"Prof. Dr.",
		"Prof.",
		"PD Dr.",
		"apl. Prof. Dr.",
		"Jun.-Prof. Dr.",
	}
	doctoral := []string{
		"Dr.-Ing.",
		"Dr. rer. nat.",
		"Dr. phil.",
		"Dr. jur.",
		"Dr. med.",
		"Dr.",
	}
	diploma := []string{
		"Dipl.-Ing.",
		"Dipl.-Inf.",
		"Dipl.-Phys.",
		"Dipl.-Math.",
		"Dipl.-Kfm.",
	}
	degrees := []string{
		"M.Sc.",
		"M.A.",
		"M.Eng.",
		"B.Sc.",
		"B.A.",
	}

	total := len(professorial) + len(doctoral) + len(diploma) + len(degrees)
	prefixes := make([]string, 0, total)
	for _, group := range [][]string{professorial, doctoral, diploma, degrees} {
		prefixes = append(prefixes, group...)
	}
	return prefixes
}
// PositionKeywords lists keywords that indicate a staff position, grouped by
// role category and mostly given in both masculine and feminine German forms.
// Every call returns a new slice.
func PositionKeywords() []string {
	professors := []string{
		"Professor", "Professorin",
		"Ordinarius",
		"Lehrstuhlinhaber", "Lehrstuhlinhaberin",
		"Dekan", "Dekanin",
		"Rektor", "Rektorin",
	}
	researchStaff := []string{
		"Wissenschaftlicher Mitarbeiter", "Wissenschaftliche Mitarbeiterin",
		"Akademischer Rat", "Akademische Rätin",
		"Postdoktorand", "Postdoktorandin",
		"Doktorand", "Doktorandin",
		"Promovend", "Promovendin",
		"Forscher", "Forscherin",
		"Researcher",
	}
	teaching := []string{
		"Dozent", "Dozentin",
		"Lektor", "Lektorin",
		"Lehrbeauftragter", "Lehrbeauftragte",
	}
	administrative := []string{
		"Sekretär", "Sekretärin",
		"Geschäftsführer", "Geschäftsführerin",
		"Verwaltungsleiter", "Verwaltungsleiterin",
		"Referent", "Referentin",
	}
	students := []string{
		"Studentische Hilfskraft",
		"Wissenschaftliche Hilfskraft",
		"Tutor", "Tutorin",
	}

	groups := [][]string{professors, researchStaff, teaching, administrative, students}
	total := 0
	for _, group := range groups {
		total += len(group)
	}

	// Concatenate in category order so the resulting sequence stays stable.
	keywords := make([]string, 0, total)
	for _, group := range groups {
		keywords = append(keywords, group...)
	}
	return keywords
}