fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.
This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).
Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
342
edu-search-service/internal/staff/patterns.go
Normal file
342
edu-search-service/internal/staff/patterns.go
Normal file
@@ -0,0 +1,342 @@
|
||||
package staff
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// UniversityPatterns contains URL patterns for specific universities
|
||||
type UniversityPatterns struct {
|
||||
patterns map[string]UniversityConfig
|
||||
}
|
||||
|
||||
// UniversityConfig holds the crawling configuration for one university:
// where its staff listings are found and which CSS selectors pick out the
// individual fields of a person.
type UniversityConfig struct {
	// StaffListURLs are the URLs of staff listing pages.
	StaffListURLs []string

	// StaffLinkPattern identifies links that point to staff profile pages.
	StaffLinkPattern *regexp.Regexp

	// NameSelector is the CSS selector for a person's name.
	NameSelector string

	// PositionSelector is the CSS selector for a person's position.
	PositionSelector string

	// EmailSelector is the CSS selector for a person's e-mail address.
	EmailSelector string

	// PhotoSelector is the CSS selector for a person's photo.
	PhotoSelector string

	// Extractors lists the extractor types to use.
	Extractors []string
}
|
||||
|
||||
// NewUniversityPatterns creates a new pattern registry with known patterns
|
||||
func NewUniversityPatterns() *UniversityPatterns {
|
||||
p := &UniversityPatterns{
|
||||
patterns: make(map[string]UniversityConfig),
|
||||
}
|
||||
|
||||
// Register known university patterns
|
||||
p.registerKnownPatterns()
|
||||
|
||||
return p
|
||||
}
|
||||
|
||||
// GetConfig returns the configuration for a university domain
|
||||
func (p *UniversityPatterns) GetConfig(domain string) *UniversityConfig {
|
||||
// Normalize domain
|
||||
domain = strings.ToLower(domain)
|
||||
domain = strings.TrimPrefix(domain, "www.")
|
||||
|
||||
if config, ok := p.patterns[domain]; ok {
|
||||
return &config
|
||||
}
|
||||
|
||||
// Try partial match
|
||||
for key, config := range p.patterns {
|
||||
if strings.Contains(domain, key) || strings.Contains(key, domain) {
|
||||
return &config
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// registerKnownPatterns registers patterns for known German universities
|
||||
func (p *UniversityPatterns) registerKnownPatterns() {
|
||||
// KIT - Karlsruher Institut für Technologie
|
||||
p.patterns["kit.edu"] = UniversityConfig{
|
||||
StaffListURLs: []string{
|
||||
"https://www.kit.edu/kit/fakultaeten.php",
|
||||
},
|
||||
StaffLinkPattern: regexp.MustCompile(`/personen/\d+`),
|
||||
NameSelector: ".person-name, h1.title",
|
||||
PositionSelector: ".person-position, .position",
|
||||
EmailSelector: "a[href^='mailto:']",
|
||||
PhotoSelector: ".person-image img, .portrait img",
|
||||
}
|
||||
|
||||
// TUM - Technische Universität München
|
||||
p.patterns["tum.de"] = UniversityConfig{
|
||||
StaffListURLs: []string{
|
||||
"https://www.tum.de/die-tum/fakultaeten",
|
||||
},
|
||||
StaffLinkPattern: regexp.MustCompile(`/person/\w+`),
|
||||
NameSelector: ".person-name, h1",
|
||||
PositionSelector: ".person-title, .function",
|
||||
EmailSelector: "a[href^='mailto:']",
|
||||
PhotoSelector: ".person-photo img",
|
||||
}
|
||||
|
||||
// LMU - Ludwig-Maximilians-Universität München
|
||||
p.patterns["lmu.de"] = UniversityConfig{
|
||||
StaffListURLs: []string{
|
||||
"https://www.lmu.de/de/die-lmu/struktur/fakultaeten-einrichtungen-zentren-und-weitere-institutionen/",
|
||||
},
|
||||
NameSelector: ".person h2, .staff-name",
|
||||
PositionSelector: ".person-position, .staff-position",
|
||||
EmailSelector: "a[href^='mailto:']",
|
||||
}
|
||||
|
||||
// RWTH Aachen
|
||||
p.patterns["rwth-aachen.de"] = UniversityConfig{
|
||||
StaffListURLs: []string{
|
||||
"https://www.rwth-aachen.de/cms/root/Die-RWTH/Fakultaeten/~ep/Fakultaeten-und-Einrichtungen/",
|
||||
},
|
||||
NameSelector: ".person-name, h3.title",
|
||||
PositionSelector: ".person-function, .position",
|
||||
EmailSelector: "a[href^='mailto:']",
|
||||
}
|
||||
|
||||
// TU Berlin
|
||||
p.patterns["tu-berlin.de"] = UniversityConfig{
|
||||
StaffListURLs: []string{
|
||||
"https://www.tu.berlin/ueber-die-tu-berlin/organisation/fakultaeten-und-einrichtungen",
|
||||
},
|
||||
NameSelector: ".person-name, h2",
|
||||
PositionSelector: ".position, .function",
|
||||
EmailSelector: "a[href^='mailto:']",
|
||||
}
|
||||
|
||||
// FU Berlin
|
||||
p.patterns["fu-berlin.de"] = UniversityConfig{
|
||||
StaffListURLs: []string{
|
||||
"https://www.fu-berlin.de/einrichtungen/fachbereiche/",
|
||||
},
|
||||
NameSelector: ".person-fullname, h2",
|
||||
PositionSelector: ".person-position",
|
||||
EmailSelector: "a[href^='mailto:']",
|
||||
}
|
||||
|
||||
// HU Berlin
|
||||
p.patterns["hu-berlin.de"] = UniversityConfig{
|
||||
StaffListURLs: []string{
|
||||
"https://www.hu-berlin.de/de/einrichtungen-organisation/fakultaeten-und-institute",
|
||||
},
|
||||
NameSelector: ".person h2, .name",
|
||||
PositionSelector: ".function, .position",
|
||||
EmailSelector: "a[href^='mailto:']",
|
||||
}
|
||||
|
||||
// Universität Freiburg
|
||||
p.patterns["uni-freiburg.de"] = UniversityConfig{
|
||||
StaffListURLs: []string{
|
||||
"https://uni-freiburg.de/universitaet/fakultaeten/",
|
||||
},
|
||||
NameSelector: ".person-name, h2",
|
||||
PositionSelector: ".person-position, .function",
|
||||
EmailSelector: "a[href^='mailto:']",
|
||||
}
|
||||
|
||||
// Universität Heidelberg
|
||||
p.patterns["uni-heidelberg.de"] = UniversityConfig{
|
||||
StaffListURLs: []string{
|
||||
"https://www.uni-heidelberg.de/de/fakultaeten",
|
||||
},
|
||||
NameSelector: ".person-fullname, h2",
|
||||
PositionSelector: ".person-position",
|
||||
EmailSelector: "a[href^='mailto:']",
|
||||
}
|
||||
|
||||
// TU Dresden
|
||||
p.patterns["tu-dresden.de"] = UniversityConfig{
|
||||
StaffListURLs: []string{
|
||||
"https://tu-dresden.de/tu-dresden/organisation/bereiche-und-fakultaeten",
|
||||
},
|
||||
NameSelector: ".person-name, h2.name",
|
||||
PositionSelector: ".person-function, .funktion",
|
||||
EmailSelector: "a[href^='mailto:']",
|
||||
}
|
||||
|
||||
// Universität Leipzig
|
||||
p.patterns["uni-leipzig.de"] = UniversityConfig{
|
||||
StaffListURLs: []string{
|
||||
"https://www.uni-leipzig.de/universitaet/struktur/fakultaeten",
|
||||
},
|
||||
NameSelector: ".person h2, .name",
|
||||
PositionSelector: ".position, .funktion",
|
||||
EmailSelector: "a[href^='mailto:']",
|
||||
}
|
||||
|
||||
// Universität Köln
|
||||
p.patterns["uni-koeln.de"] = UniversityConfig{
|
||||
StaffListURLs: []string{
|
||||
"https://www.uni-koeln.de/",
|
||||
},
|
||||
NameSelector: ".person-name, h2",
|
||||
PositionSelector: ".person-position, .function",
|
||||
EmailSelector: "a[href^='mailto:']",
|
||||
}
|
||||
|
||||
// Universität Bonn
|
||||
p.patterns["uni-bonn.de"] = UniversityConfig{
|
||||
StaffListURLs: []string{
|
||||
"https://www.uni-bonn.de/de/universitaet/fakultaeten",
|
||||
},
|
||||
NameSelector: ".person-name, h2",
|
||||
PositionSelector: ".person-position",
|
||||
EmailSelector: "a[href^='mailto:']",
|
||||
}
|
||||
|
||||
// Universität Münster
|
||||
p.patterns["uni-muenster.de"] = UniversityConfig{
|
||||
StaffListURLs: []string{
|
||||
"https://www.uni-muenster.de/de/fakultaeten.html",
|
||||
},
|
||||
NameSelector: ".person-name, h2",
|
||||
PositionSelector: ".person-function",
|
||||
EmailSelector: "a[href^='mailto:']",
|
||||
}
|
||||
|
||||
// Universität Hamburg
|
||||
p.patterns["uni-hamburg.de"] = UniversityConfig{
|
||||
StaffListURLs: []string{
|
||||
"https://www.uni-hamburg.de/einrichtungen/fakultaeten.html",
|
||||
},
|
||||
NameSelector: ".person-name, h2",
|
||||
PositionSelector: ".position",
|
||||
EmailSelector: "a[href^='mailto:']",
|
||||
}
|
||||
|
||||
// Universität Göttingen
|
||||
p.patterns["uni-goettingen.de"] = UniversityConfig{
|
||||
StaffListURLs: []string{
|
||||
"https://www.uni-goettingen.de/de/fakultaeten/27952.html",
|
||||
},
|
||||
NameSelector: ".person-name, h2",
|
||||
PositionSelector: ".person-position",
|
||||
EmailSelector: "a[href^='mailto:']",
|
||||
}
|
||||
|
||||
// TU Darmstadt
|
||||
p.patterns["tu-darmstadt.de"] = UniversityConfig{
|
||||
StaffListURLs: []string{
|
||||
"https://www.tu-darmstadt.de/universitaet/fachbereiche/index.de.jsp",
|
||||
},
|
||||
NameSelector: ".person-name, h2",
|
||||
PositionSelector: ".person-position, .funktion",
|
||||
EmailSelector: "a[href^='mailto:']",
|
||||
}
|
||||
}
|
||||
|
||||
// CommonStaffPagePaths lists URL paths under which staff listings are commonly
// found. Each call returns a newly built slice.
func CommonStaffPagePaths() []string {
	paths := []string{
		"/personen", "/team", "/mitarbeiter", "/mitarbeitende",
		"/staff", "/people",
		"/ueber-uns/team", "/about/team",
		"/fakultaet/personen", "/institut/mitarbeiter", "/lehrstuhl/team",
		"/personal", "/beschaeftigte", "/dozenten", "/professoren",
	}
	return paths
}
|
||||
|
||||
// CommonPersonSelectors lists CSS selectors that commonly mark the element
// describing a single person. Each call returns a newly built slice.
func CommonPersonSelectors() []string {
	selectors := []string{
		// Class names seen on staff pages.
		".person", ".person-card",
		".staff-member", ".team-member",
		".mitarbeiter", ".employee",
		// vCard / h-card classes and schema.org Person markup.
		".vcard", ".h-card",
		"[itemtype='http://schema.org/Person']",
		// Further entry/card class names.
		".person-entry", ".staff-entry", ".profile-card",
	}
	return selectors
}
|
||||
|
||||
// TitlePrefixes lists common German academic title prefixes.
//
// Within the professor and doctor families the more specific forms come before
// the generic ones ("Prof. Dr." before "Prof.", "Dr.-Ing." before "Dr.").
// NOTE(review): presumably this lets first-match prefix stripping pick the
// most specific title - confirm against the callers.
func TitlePrefixes() []string {
	professorTitles := []string{
		"Prof. Dr. Dr. h.c. mult.",
		"Prof. Dr. Dr. h.c.",
		"Prof. Dr. Dr.",
		"Prof. Dr.-Ing.",
		"Prof. Dr. rer. nat.",
		"Prof. Dr. phil.",
		"Prof. Dr. jur.",
		"Prof. Dr. med.",
		"Prof. Dr.",
		"Prof.",
		"PD Dr.",
		"apl. Prof. Dr.",
		"Jun.-Prof. Dr.",
	}
	doctorTitles := []string{
		"Dr.-Ing.", "Dr. rer. nat.", "Dr. phil.", "Dr. jur.", "Dr. med.", "Dr.",
	}
	diplomaTitles := []string{
		"Dipl.-Ing.", "Dipl.-Inf.", "Dipl.-Phys.", "Dipl.-Math.", "Dipl.-Kfm.",
	}
	degreeTitles := []string{
		"M.Sc.", "M.A.", "M.Eng.", "B.Sc.", "B.A.",
	}

	// Concatenate the groups in their original order.
	total := len(professorTitles) + len(doctorTitles) + len(diplomaTitles) + len(degreeTitles)
	titles := make([]string, 0, total)
	titles = append(titles, professorTitles...)
	titles = append(titles, doctorTitles...)
	titles = append(titles, diplomaTitles...)
	titles = append(titles, degreeTitles...)
	return titles
}
|
||||
|
||||
// PositionKeywords lists keywords that indicate a staff position, grouped as
// professors, research staff, teaching, administrative and student roles.
// Most roles appear in both their masculine and feminine German forms.
func PositionKeywords() []string {
	groups := [][]string{
		// Professors
		{
			"Professor", "Professorin",
			"Ordinarius",
			"Lehrstuhlinhaber", "Lehrstuhlinhaberin",
			"Dekan", "Dekanin",
			"Rektor", "Rektorin",
		},
		// Research staff
		{
			"Wissenschaftlicher Mitarbeiter", "Wissenschaftliche Mitarbeiterin",
			"Akademischer Rat", "Akademische Rätin",
			"Postdoktorand", "Postdoktorandin",
			"Doktorand", "Doktorandin",
			"Promovend", "Promovendin",
			"Forscher", "Forscherin",
			"Researcher",
		},
		// Teaching
		{
			"Dozent", "Dozentin",
			"Lektor", "Lektorin",
			"Lehrbeauftragter", "Lehrbeauftragte",
		},
		// Administrative
		{
			"Sekretär", "Sekretärin",
			"Geschäftsführer", "Geschäftsführerin",
			"Verwaltungsleiter", "Verwaltungsleiterin",
			"Referent", "Referentin",
		},
		// Students
		{
			"Studentische Hilfskraft",
			"Wissenschaftliche Hilfskraft",
			"Tutor", "Tutorin",
		},
	}

	// Flatten the groups in order into one exactly-sized slice.
	total := 0
	for _, group := range groups {
		total += len(group)
	}
	keywords := make([]string, 0, total)
	for _, group := range groups {
		keywords = append(keywords, group...)
	}
	return keywords
}
|
||||
Reference in New Issue
Block a user