package staff import ( "regexp" "strings" ) // UniversityPatterns contains URL patterns for specific universities type UniversityPatterns struct { patterns map[string]UniversityConfig } // UniversityConfig contains crawling configuration for a specific university type UniversityConfig struct { StaffListURLs []string // URLs to staff listing pages StaffLinkPattern *regexp.Regexp // Pattern to identify staff profile links NameSelector string // CSS selector for person name PositionSelector string // CSS selector for position EmailSelector string // CSS selector for email PhotoSelector string // CSS selector for photo Extractors []string // List of extractor types to use } // NewUniversityPatterns creates a new pattern registry with known patterns func NewUniversityPatterns() *UniversityPatterns { p := &UniversityPatterns{ patterns: make(map[string]UniversityConfig), } // Register known university patterns p.registerKnownPatterns() return p } // GetConfig returns the configuration for a university domain func (p *UniversityPatterns) GetConfig(domain string) *UniversityConfig { // Normalize domain domain = strings.ToLower(domain) domain = strings.TrimPrefix(domain, "www.") if config, ok := p.patterns[domain]; ok { return &config } // Try partial match for key, config := range p.patterns { if strings.Contains(domain, key) || strings.Contains(key, domain) { return &config } } return nil } // registerKnownPatterns registers patterns for known German universities func (p *UniversityPatterns) registerKnownPatterns() { // KIT - Karlsruher Institut für Technologie p.patterns["kit.edu"] = UniversityConfig{ StaffListURLs: []string{ "https://www.kit.edu/kit/fakultaeten.php", }, StaffLinkPattern: regexp.MustCompile(`/personen/\d+`), NameSelector: ".person-name, h1.title", PositionSelector: ".person-position, .position", EmailSelector: "a[href^='mailto:']", PhotoSelector: ".person-image img, .portrait img", } // TUM - Technische Universität München p.patterns["tum.de"] = UniversityConfig{ StaffListURLs: []string{ "https://www.tum.de/die-tum/fakultaeten", }, StaffLinkPattern: regexp.MustCompile(`/person/\w+`), NameSelector: ".person-name, h1", PositionSelector: ".person-title, .function", EmailSelector: "a[href^='mailto:']", PhotoSelector: ".person-photo img", } // LMU - Ludwig-Maximilians-Universität München p.patterns["lmu.de"] = UniversityConfig{ StaffListURLs: []string{ "https://www.lmu.de/de/die-lmu/struktur/fakultaeten-einrichtungen-zentren-und-weitere-institutionen/", }, NameSelector: ".person h2, .staff-name", PositionSelector: ".person-position, .staff-position", EmailSelector: "a[href^='mailto:']", } // RWTH Aachen p.patterns["rwth-aachen.de"] = UniversityConfig{ StaffListURLs: []string{ "https://www.rwth-aachen.de/cms/root/Die-RWTH/Fakultaeten/~ep/Fakultaeten-und-Einrichtungen/", }, NameSelector: ".person-name, h3.title", PositionSelector: ".person-function, .position", EmailSelector: "a[href^='mailto:']", } // TU Berlin p.patterns["tu-berlin.de"] = UniversityConfig{ StaffListURLs: []string{ "https://www.tu.berlin/ueber-die-tu-berlin/organisation/fakultaeten-und-einrichtungen", }, NameSelector: ".person-name, h2", PositionSelector: ".position, .function", EmailSelector: "a[href^='mailto:']", } // FU Berlin p.patterns["fu-berlin.de"] = UniversityConfig{ StaffListURLs: []string{ "https://www.fu-berlin.de/einrichtungen/fachbereiche/", }, NameSelector: ".person-fullname, h2", PositionSelector: ".person-position", EmailSelector: "a[href^='mailto:']", } // HU Berlin p.patterns["hu-berlin.de"] = UniversityConfig{ StaffListURLs: []string{ "https://www.hu-berlin.de/de/einrichtungen-organisation/fakultaeten-und-institute", }, NameSelector: ".person h2, .name", PositionSelector: ".function, .position", EmailSelector: "a[href^='mailto:']", } // Universität Freiburg p.patterns["uni-freiburg.de"] = UniversityConfig{ StaffListURLs: []string{ "https://uni-freiburg.de/universitaet/fakultaeten/", }, NameSelector: ".person-name, h2", PositionSelector: ".person-position, .function", EmailSelector: "a[href^='mailto:']", } // Universität Heidelberg p.patterns["uni-heidelberg.de"] = UniversityConfig{ StaffListURLs: []string{ "https://www.uni-heidelberg.de/de/fakultaeten", }, NameSelector: ".person-fullname, h2", PositionSelector: ".person-position", EmailSelector: "a[href^='mailto:']", } // TU Dresden p.patterns["tu-dresden.de"] = UniversityConfig{ StaffListURLs: []string{ "https://tu-dresden.de/tu-dresden/organisation/bereiche-und-fakultaeten", }, NameSelector: ".person-name, h2.name", PositionSelector: ".person-function, .funktion", EmailSelector: "a[href^='mailto:']", } // Universität Leipzig p.patterns["uni-leipzig.de"] = UniversityConfig{ StaffListURLs: []string{ "https://www.uni-leipzig.de/universitaet/struktur/fakultaeten", }, NameSelector: ".person h2, .name", PositionSelector: ".position, .funktion", EmailSelector: "a[href^='mailto:']", } // Universität Köln p.patterns["uni-koeln.de"] = UniversityConfig{ StaffListURLs: []string{ "https://www.uni-koeln.de/", }, NameSelector: ".person-name, h2", PositionSelector: ".person-position, .function", EmailSelector: "a[href^='mailto:']", } // Universität Bonn p.patterns["uni-bonn.de"] = UniversityConfig{ StaffListURLs: []string{ "https://www.uni-bonn.de/de/universitaet/fakultaeten", }, NameSelector: ".person-name, h2", PositionSelector: ".person-position", EmailSelector: "a[href^='mailto:']", } // Universität Münster p.patterns["uni-muenster.de"] = UniversityConfig{ StaffListURLs: []string{ "https://www.uni-muenster.de/de/fakultaeten.html", }, NameSelector: ".person-name, h2", PositionSelector: ".person-function", EmailSelector: "a[href^='mailto:']", } // Universität Hamburg p.patterns["uni-hamburg.de"] = UniversityConfig{ StaffListURLs: []string{ "https://www.uni-hamburg.de/einrichtungen/fakultaeten.html", }, NameSelector: ".person-name, h2", PositionSelector: ".position", EmailSelector: "a[href^='mailto:']", } // Universität Göttingen p.patterns["uni-goettingen.de"] = UniversityConfig{ StaffListURLs: []string{ "https://www.uni-goettingen.de/de/fakultaeten/27952.html", }, NameSelector: ".person-name, h2", PositionSelector: ".person-position", EmailSelector: "a[href^='mailto:']", } // TU Darmstadt p.patterns["tu-darmstadt.de"] = UniversityConfig{ StaffListURLs: []string{ "https://www.tu-darmstadt.de/universitaet/fachbereiche/index.de.jsp", }, NameSelector: ".person-name, h2", PositionSelector: ".person-position, .funktion", EmailSelector: "a[href^='mailto:']", } } // CommonStaffPagePaths returns common paths where staff listings are found func CommonStaffPagePaths() []string { return []string{ "/personen", "/team", "/mitarbeiter", "/mitarbeitende", "/staff", "/people", "/ueber-uns/team", "/about/team", "/fakultaet/personen", "/institut/mitarbeiter", "/lehrstuhl/team", "/personal", "/beschaeftigte", "/dozenten", "/professoren", } } // CommonPersonSelectors returns common CSS selectors for person elements func CommonPersonSelectors() []string { return []string{ ".person", ".person-card", ".staff-member", ".team-member", ".mitarbeiter", ".employee", ".vcard", ".h-card", "[itemtype='http://schema.org/Person']", ".person-entry", ".staff-entry", ".profile-card", } } // TitlePrefixes returns common German academic title prefixes func TitlePrefixes() []string { return []string{ "Prof. Dr. Dr. h.c. mult.", "Prof. Dr. Dr. h.c.", "Prof. Dr. Dr.", "Prof. Dr.-Ing.", "Prof. Dr. rer. nat.", "Prof. Dr. phil.", "Prof. Dr. jur.", "Prof. Dr. med.", "Prof. Dr.", "Prof.", "PD Dr.", "apl. Prof. Dr.", "Jun.-Prof. Dr.", "Dr.-Ing.", "Dr. rer. nat.", "Dr. phil.", "Dr. jur.", "Dr. med.", "Dr.", "Dipl.-Ing.", "Dipl.-Inf.", "Dipl.-Phys.", "Dipl.-Math.", "Dipl.-Kfm.", "M.Sc.", "M.A.", "M.Eng.", "B.Sc.", "B.A.", } } // PositionKeywords returns keywords that indicate staff positions func PositionKeywords() []string { return []string{ // Professors "Professor", "Professorin", "Ordinarius", "Lehrstuhlinhaber", "Lehrstuhlinhaberin", "Dekan", "Dekanin", "Rektor", "Rektorin", // Research staff "Wissenschaftlicher Mitarbeiter", "Wissenschaftliche Mitarbeiterin", "Akademischer Rat", "Akademische Rätin", "Postdoktorand", "Postdoktorandin", "Doktorand", "Doktorandin", "Promovend", "Promovendin", "Forscher", "Forscherin", "Researcher", // Teaching "Dozent", "Dozentin", "Lektor", "Lektorin", "Lehrbeauftragter", "Lehrbeauftragte", // Administrative "Sekretär", "Sekretärin", "Geschäftsführer", "Geschäftsführerin", "Verwaltungsleiter", "Verwaltungsleiterin", "Referent", "Referentin", // Students "Studentische Hilfskraft", "Wissenschaftliche Hilfskraft", "Tutor", "Tutorin", } }