Fix: Remove broken getKlausurApiUrl and clean up empty lines
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
sed replacement left orphaned hostname references in story page and empty lines in getApiBase functions. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
247
edu-search-service/internal/staff/staff_crawler_discovery.go
Normal file
247
edu-search-service/internal/staff/staff_crawler_discovery.go
Normal file
@@ -0,0 +1,247 @@
|
||||
package staff
|
||||
|
||||
import (
	"bytes"
	"context"
	"log"
	"net/http"
	"sort"
	"strings"

	"github.com/PuerkitoBio/goquery"

	"github.com/breakpilot/edu-search-service/internal/database"
)
|
||||
|
||||
// findStaffPages discovers staff listing pages on a university website
|
||||
func (c *StaffCrawler) findStaffPages(ctx context.Context, uni *database.University) ([]string, error) {
|
||||
var pages []string
|
||||
|
||||
// Use custom pattern if available
|
||||
if uni.StaffPagePattern != nil && *uni.StaffPagePattern != "" {
|
||||
pages = append(pages, *uni.StaffPagePattern)
|
||||
return pages, nil
|
||||
}
|
||||
|
||||
// Try common patterns
|
||||
baseURL := strings.TrimSuffix(uni.URL, "/")
|
||||
commonPaths := []string{
|
||||
"/personen",
|
||||
"/team",
|
||||
"/mitarbeiter",
|
||||
"/mitarbeitende",
|
||||
"/staff",
|
||||
"/people",
|
||||
"/ueber-uns/team",
|
||||
"/about/team",
|
||||
"/fakultaet/personen",
|
||||
"/institute",
|
||||
}
|
||||
|
||||
for _, path := range commonPaths {
|
||||
testURL := baseURL + path
|
||||
exists, err := c.checkPageExists(ctx, testURL)
|
||||
if err == nil && exists {
|
||||
pages = append(pages, testURL)
|
||||
}
|
||||
}
|
||||
|
||||
// Also try to find staff links on the main page
|
||||
mainPageLinks, err := c.findStaffLinksOnPage(ctx, baseURL)
|
||||
if err == nil {
|
||||
pages = append(pages, mainPageLinks...)
|
||||
}
|
||||
|
||||
// UOL-specific: Find department/personen pages through navigation
|
||||
// Check for both uol.de and uni-oldenburg.de (they are the same university)
|
||||
if strings.Contains(baseURL, "uol.de") || strings.Contains(baseURL, "uni-oldenburg.de") {
|
||||
log.Printf("[UOL] Detected Uni Oldenburg, using UOL-specific crawler for %s", baseURL)
|
||||
uolPages, err := c.findUOLDepartmentPages(ctx, baseURL)
|
||||
if err == nil {
|
||||
log.Printf("[UOL] Found %d department pages", len(uolPages))
|
||||
pages = append(pages, uolPages...)
|
||||
} else {
|
||||
log.Printf("[UOL] Error finding department pages: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Deduplicate
|
||||
seen := make(map[string]bool)
|
||||
var unique []string
|
||||
for _, p := range pages {
|
||||
if !seen[p] {
|
||||
seen[p] = true
|
||||
unique = append(unique, p)
|
||||
}
|
||||
}
|
||||
|
||||
return unique, nil
|
||||
}
|
||||
|
||||
// findUOLDepartmentPages finds department person pages for Uni Oldenburg
|
||||
func (c *StaffCrawler) findUOLDepartmentPages(ctx context.Context, baseURL string) ([]string, error) {
|
||||
var pages []string
|
||||
|
||||
// UOL uses both uol.de and uni-oldenburg.de domains
|
||||
// Departments have /personen or /team subpages
|
||||
|
||||
// Helper to check if URL is UOL-related
|
||||
isUOLURL := func(url string) bool {
|
||||
lower := strings.ToLower(url)
|
||||
return strings.Contains(lower, "uol.de") || strings.Contains(lower, "uni-oldenburg.de")
|
||||
}
|
||||
|
||||
// First try to find department links from known starting points
|
||||
startPages := []string{
|
||||
"https://uol.de/informatik/department/abteilungen-und-einrichtungen",
|
||||
"https://uol.de/fk2",
|
||||
"https://uol.de/fk1",
|
||||
"https://uol.de/fk3",
|
||||
"https://uol.de/fk4",
|
||||
"https://uol.de/fk5",
|
||||
"https://uol.de/fk6",
|
||||
baseURL,
|
||||
}
|
||||
|
||||
deptPaths := make(map[string]bool)
|
||||
|
||||
for _, startURL := range startPages {
|
||||
log.Printf("[UOL] Scanning start page: %s", startURL)
|
||||
body, err := c.fetchPage(ctx, startURL)
|
||||
if err != nil {
|
||||
log.Printf("[UOL] Error fetching %s: %v", startURL, err)
|
||||
continue
|
||||
}
|
||||
|
||||
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
// Find links to department pages (they typically have /personen subpages)
|
||||
doc.Find("a[href]").Each(func(i int, s *goquery.Selection) {
|
||||
href, exists := s.Attr("href")
|
||||
if !exists {
|
||||
return
|
||||
}
|
||||
|
||||
// Look for department-like paths
|
||||
hrefLower := strings.ToLower(href)
|
||||
isDeptPath := isUOLURL(href) &&
|
||||
!strings.Contains(hrefLower, "/studium") &&
|
||||
!strings.Contains(hrefLower, "/forschung") &&
|
||||
!strings.Contains(hrefLower, "/aktuelles") &&
|
||||
!strings.Contains(hrefLower, "/kontakt")
|
||||
|
||||
if isDeptPath {
|
||||
fullURL := resolveURL(startURL, href)
|
||||
if fullURL != "" && isUOLURL(fullURL) {
|
||||
// Add personen page for this department
|
||||
personenURL := strings.TrimSuffix(fullURL, "/") + "/personen"
|
||||
deptPaths[personenURL] = true
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
// Also look for direct /personen or /team links
|
||||
doc.Find("a[href*='/personen'], a[href*='/team']").Each(func(i int, s *goquery.Selection) {
|
||||
href, exists := s.Attr("href")
|
||||
if exists {
|
||||
fullURL := resolveURL(startURL, href)
|
||||
if fullURL != "" && isUOLURL(fullURL) {
|
||||
deptPaths[fullURL] = true
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// Add well-known department personen pages directly (these exist for sure)
|
||||
knownDepts := []string{
|
||||
"https://uol.de/socps/personen",
|
||||
"https://uol.de/vlba/team",
|
||||
"https://uol.de/informatik/department",
|
||||
"https://uol.de/se/team",
|
||||
"https://uol.de/ei/personen",
|
||||
"https://uol.de/is/team",
|
||||
"https://uol.de/paedagogik/personen",
|
||||
"https://uol.de/psychologie/personen",
|
||||
"https://uol.de/germanistik/personen",
|
||||
"https://uol.de/physik/personen",
|
||||
"https://uol.de/chemie/personen",
|
||||
"https://uol.de/biologie/personen",
|
||||
"https://uol.de/mathe/personen",
|
||||
}
|
||||
for _, dept := range knownDepts {
|
||||
deptPaths[dept] = true
|
||||
}
|
||||
|
||||
log.Printf("[UOL] Checking %d potential department pages", len(deptPaths))
|
||||
|
||||
// Verify which pages actually exist
|
||||
for path := range deptPaths {
|
||||
exists, err := c.checkPageExists(ctx, path)
|
||||
if err == nil && exists {
|
||||
log.Printf("[UOL] Found valid page: %s", path)
|
||||
pages = append(pages, path)
|
||||
}
|
||||
}
|
||||
|
||||
log.Printf("[UOL] Found %d valid department/personen pages", len(pages))
|
||||
return pages, nil
|
||||
}
|
||||
|
||||
// checkPageExists checks if a URL returns a 200 status
|
||||
func (c *StaffCrawler) checkPageExists(ctx context.Context, urlStr string) (bool, error) {
|
||||
c.waitForRateLimit(urlStr)
|
||||
|
||||
req, err := http.NewRequestWithContext(ctx, "HEAD", urlStr, nil)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
req.Header.Set("User-Agent", c.userAgent)
|
||||
|
||||
resp, err := c.client.Do(req)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
return resp.StatusCode == http.StatusOK, nil
|
||||
}
|
||||
|
||||
// findStaffLinksOnPage finds links to staff pages on a given page
|
||||
func (c *StaffCrawler) findStaffLinksOnPage(ctx context.Context, pageURL string) ([]string, error) {
|
||||
body, err := c.fetchPage(ctx, pageURL)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var links []string
|
||||
staffKeywords := []string{"team", "personen", "mitarbeiter", "staff", "people", "dozent", "professor"}
|
||||
|
||||
doc.Find("a[href]").Each(func(i int, s *goquery.Selection) {
|
||||
href, exists := s.Attr("href")
|
||||
if !exists {
|
||||
return
|
||||
}
|
||||
|
||||
text := strings.ToLower(s.Text())
|
||||
hrefLower := strings.ToLower(href)
|
||||
|
||||
for _, keyword := range staffKeywords {
|
||||
if strings.Contains(text, keyword) || strings.Contains(hrefLower, keyword) {
|
||||
fullURL := resolveURL(pageURL, href)
|
||||
if fullURL != "" {
|
||||
links = append(links, fullURL)
|
||||
}
|
||||
break
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
return links, nil
|
||||
}
|
||||
Reference in New Issue
Block a user