feat: edu-search-service migriert, voice-service/geo-service entfernt
All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Successful in 1m45s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 21s

- edu-search-service von breakpilot-pwa nach breakpilot-lehrer kopiert (ohne vendor)
- opensearch + edu-search-service in docker-compose.yml hinzugefuegt
- voice-service aus docker-compose.yml entfernt (jetzt in breakpilot-core)
- geo-service aus docker-compose.yml entfernt (nicht mehr benoetigt)
- CI/CD: edu-search-service zu Gitea Actions und Woodpecker hinzugefuegt
  (Go lint, test mit go mod download, build, SBOM)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Boenisch
2026-02-15 18:36:38 +01:00
parent d4e1d6bab6
commit 414e0f5ec0
73 changed files with 23938 additions and 92 deletions

View File

@@ -0,0 +1,217 @@
// Package staff provides university staff crawling functionality
package staff
import (
"context"
"fmt"
"log"
"time"
"github.com/google/uuid"
"github.com/breakpilot/edu-search-service/internal/database"
"github.com/breakpilot/edu-search-service/internal/orchestrator"
)
// OrchestratorAdapter adapts the StaffCrawler to the orchestrator.StaffCrawlerInterface
// This bridges the gap between the generic StaffCrawler and the multi-phase orchestrator
type OrchestratorAdapter struct {
	crawler *StaffCrawler        // performs page discovery and staff extraction for a university
	repo    *database.Repository // persistence layer: university lookup and staff search/counts
}
// NewOrchestratorAdapter wires a StaffCrawler and a repository into an adapter
// that the multi-phase orchestrator can drive.
func NewOrchestratorAdapter(crawler *StaffCrawler, repo *database.Repository) *OrchestratorAdapter {
	adapter := &OrchestratorAdapter{crawler: crawler, repo: repo}
	return adapter
}
// DiscoverSampleProfessor finds at least one professor to validate crawling works for this university.
// This is Phase 1: a quick check that the university website is crawlable at all.
func (a *OrchestratorAdapter) DiscoverSampleProfessor(ctx context.Context, universityID uuid.UUID) (*orchestrator.CrawlProgress, error) {
	progress := &orchestrator.CrawlProgress{
		Phase:     orchestrator.PhaseDiscovery,
		StartedAt: time.Now(),
	}
	log.Printf("[OrchestratorAdapter] Discovery phase for university %s", universityID)

	// Load the university record; without it there is nothing to crawl.
	uni, err := a.repo.GetUniversityByID(ctx, universityID)
	if err != nil {
		progress.Errors = append(progress.Errors, fmt.Sprintf("Failed to get university: %v", err))
		return progress, fmt.Errorf("failed to get university: %w", err)
	}
	if uni == nil {
		progress.Errors = append(progress.Errors, "University not found")
		return progress, fmt.Errorf("university not found: %s", universityID)
	}

	log.Printf("[OrchestratorAdapter] Discovering staff pages for %s (%s)", uni.Name, uni.URL)

	// Discovery step: locate candidate staff listing pages on the site.
	staffPages, err := a.crawler.findStaffPages(ctx, uni)
	if err != nil {
		progress.Errors = append(progress.Errors, fmt.Sprintf("Failed to find staff pages: %v", err))
		return progress, fmt.Errorf("failed to find staff pages: %w", err)
	}
	log.Printf("[OrchestratorAdapter] Found %d staff pages for %s", len(staffPages), uni.Name)

	// Validation step: walk the discovered pages until one yields at least
	// a single person, preferring a professor.
	sampleFound := 0
	for _, page := range staffPages {
		members, extractErr := a.crawler.extractStaffFromPage(ctx, page, uni)
		if extractErr != nil {
			log.Printf("[OrchestratorAdapter] Error extracting from %s: %v", page, extractErr)
			continue
		}
		for _, m := range members {
			if m.IsProfessor {
				sampleFound++
				log.Printf("[OrchestratorAdapter] Found sample professor: %s %s",
					stringValue(m.FirstName), m.LastName)
				break
			}
		}
		// A page with any extractable staff proves the crawler works here,
		// even when none of those people is a professor.
		if sampleFound == 0 && len(members) > 0 {
			sampleFound = 1
			log.Printf("[OrchestratorAdapter] Found sample staff member (not professor): %s %s",
				stringValue(members[0].FirstName), members[0].LastName)
		}
		if sampleFound > 0 {
			break // one validated page is enough
		}
	}

	progress.ItemsFound = len(staffPages) // number of crawlable pages discovered
	now := time.Now()
	progress.CompletedAt = &now

	switch {
	case sampleFound == 0 && len(staffPages) > 0:
		// Pages were found but nothing could be extracted — still treat
		// discovery as successful; extraction rules may just need tuning.
		log.Printf("[OrchestratorAdapter] Discovery completed: %d pages found, extraction may need tuning", len(staffPages))
	case sampleFound == 0:
		progress.Errors = append(progress.Errors, "No staff pages found")
		return progress, fmt.Errorf("no staff pages found for %s", uni.Name)
	}

	log.Printf("[OrchestratorAdapter] Discovery completed for %s: %d pages found", uni.Name, len(staffPages))
	return progress, nil
}
// CrawlProfessors crawls all professors at a university.
// This is Phase 2: focus on finding professors specifically.
//
// It runs a full crawl via the underlying StaffCrawler, then queries the
// repository to report how many professors are stored for the university.
func (a *OrchestratorAdapter) CrawlProfessors(ctx context.Context, universityID uuid.UUID) (*orchestrator.CrawlProgress, error) {
	progress := &orchestrator.CrawlProgress{
		Phase:     orchestrator.PhaseProfessors,
		StartedAt: time.Now(),
	}
	log.Printf("[OrchestratorAdapter] Professors phase for university %s", universityID)

	// Distinguish a lookup failure from a missing record. The previous
	// combined check (err != nil || uni == nil) wrapped a nil error with %w
	// when the university was simply absent, yielding a misleading
	// "failed to get university: %!w(<nil>)".
	uni, err := a.repo.GetUniversityByID(ctx, universityID)
	if err != nil {
		progress.Errors = append(progress.Errors, fmt.Sprintf("Failed to get university: %v", err))
		return progress, fmt.Errorf("failed to get university: %w", err)
	}
	if uni == nil {
		progress.Errors = append(progress.Errors, "University not found")
		return progress, fmt.Errorf("university not found: %s", universityID)
	}

	// Perform the full crawl.
	result, err := a.crawler.CrawlUniversity(ctx, uni)
	if err != nil {
		progress.Errors = append(progress.Errors, fmt.Sprintf("Crawl failed: %v", err))
		return progress, err
	}

	// Count professors specifically. A count failure is non-fatal: the crawl
	// itself succeeded, so we just report zero professors found.
	professorCount := 0
	staffList, err := a.repo.SearchStaff(ctx, database.StaffSearchParams{
		UniversityID: &universityID,
		IsProfessor:  boolPtr(true),
		Limit:        10000,
	})
	if err == nil {
		professorCount = staffList.Total
	}

	progress.ItemsFound = professorCount
	progress.ItemsProcessed = result.StaffFound
	progress.Errors = result.Errors
	now := time.Now()
	progress.CompletedAt = &now

	log.Printf("[OrchestratorAdapter] Professors phase completed for %s: %d professors found", uni.Name, professorCount)
	return progress, nil
}
// CrawlAllStaff crawls all staff members at a university.
// This is Phase 3: get all staff (largely done in Phase 2, but a second
// crawl pass verifies and extends the stored data).
func (a *OrchestratorAdapter) CrawlAllStaff(ctx context.Context, universityID uuid.UUID) (*orchestrator.CrawlProgress, error) {
	progress := &orchestrator.CrawlProgress{
		Phase:     orchestrator.PhaseAllStaff,
		StartedAt: time.Now(),
	}
	log.Printf("[OrchestratorAdapter] All Staff phase for university %s", universityID)

	// Distinguish a lookup failure from a missing record so a nil error is
	// never wrapped with %w.
	uni, err := a.repo.GetUniversityByID(ctx, universityID)
	if err != nil {
		progress.Errors = append(progress.Errors, fmt.Sprintf("Failed to get university: %v", err))
		return progress, fmt.Errorf("failed to get university: %w", err)
	}
	if uni == nil {
		progress.Errors = append(progress.Errors, "University not found")
		return progress, fmt.Errorf("university not found: %s", universityID)
	}

	// Run another crawl pass to catch any missed staff. A failing pass is
	// deliberately non-fatal — staff from earlier phases may already be
	// stored. The old code read result.Errors unconditionally here, which
	// panicked with a nil dereference when CrawlUniversity returned (nil, err);
	// result is only dereferenced inside the nil-guard below.
	result, err := a.crawler.CrawlUniversity(ctx, uni)
	if err != nil {
		progress.Errors = append(progress.Errors, fmt.Sprintf("Crawl failed: %v", err))
		log.Printf("[OrchestratorAdapter] All Staff crawl had errors: %v", err)
	}

	// Get the total staff count; Limit is 1 because only Total is needed.
	staffCount := 0
	staffList, err := a.repo.SearchStaff(ctx, database.StaffSearchParams{
		UniversityID: &universityID,
		Limit:        1,
	})
	if err == nil {
		staffCount = staffList.Total
	}

	progress.ItemsFound = staffCount
	if result != nil {
		progress.ItemsProcessed = result.StaffFound
		// Append rather than assign so the "Crawl failed" entry above is kept.
		progress.Errors = append(progress.Errors, result.Errors...)
	}
	now := time.Now()
	progress.CompletedAt = &now

	log.Printf("[OrchestratorAdapter] All Staff phase completed for %s: %d total staff", uni.Name, staffCount)
	return progress, nil
}
// Helper functions

// stringValue dereferences s, treating a nil pointer as the empty string.
func stringValue(s *string) string {
	if s != nil {
		return *s
	}
	return ""
}
// boolPtr returns a pointer to a copy of b (used for optional filter fields).
func boolPtr(b bool) *bool {
	v := b
	return &v
}