feat: edu-search-service migriert, voice-service/geo-service entfernt
All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Successful in 1m45s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 21s
All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Successful in 1m45s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 21s
- edu-search-service von breakpilot-pwa nach breakpilot-lehrer kopiert (ohne vendor)
- opensearch + edu-search-service in docker-compose.yml hinzugefuegt
- voice-service aus docker-compose.yml entfernt (jetzt in breakpilot-core)
- geo-service aus docker-compose.yml entfernt (nicht mehr benoetigt)
- CI/CD: edu-search-service zu Gitea Actions und Woodpecker hinzugefuegt (Go lint, test mit go mod download, build, SBOM)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
217
edu-search-service/internal/staff/orchestrator_adapter.go
Normal file
217
edu-search-service/internal/staff/orchestrator_adapter.go
Normal file
@@ -0,0 +1,217 @@
|
||||
// Package staff provides university staff crawling functionality
|
||||
package staff
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"log"
|
||||
"time"
|
||||
|
||||
"github.com/google/uuid"
|
||||
|
||||
"github.com/breakpilot/edu-search-service/internal/database"
|
||||
"github.com/breakpilot/edu-search-service/internal/orchestrator"
|
||||
)
|
||||
|
||||
// OrchestratorAdapter adapts the StaffCrawler to the orchestrator.StaffCrawlerInterface
|
||||
// This bridges the gap between the generic StaffCrawler and the multi-phase orchestrator
|
||||
type OrchestratorAdapter struct {
|
||||
crawler *StaffCrawler
|
||||
repo *database.Repository
|
||||
}
|
||||
|
||||
// NewOrchestratorAdapter creates a new adapter that connects StaffCrawler to the orchestrator
|
||||
func NewOrchestratorAdapter(crawler *StaffCrawler, repo *database.Repository) *OrchestratorAdapter {
|
||||
return &OrchestratorAdapter{
|
||||
crawler: crawler,
|
||||
repo: repo,
|
||||
}
|
||||
}
|
||||
|
||||
// DiscoverSampleProfessor finds at least one professor to validate crawling works for this university
|
||||
// This is Phase 1: Quick validation that the university website is crawlable
|
||||
func (a *OrchestratorAdapter) DiscoverSampleProfessor(ctx context.Context, universityID uuid.UUID) (*orchestrator.CrawlProgress, error) {
|
||||
start := time.Now()
|
||||
progress := &orchestrator.CrawlProgress{
|
||||
Phase: orchestrator.PhaseDiscovery,
|
||||
StartedAt: start,
|
||||
}
|
||||
|
||||
log.Printf("[OrchestratorAdapter] Discovery phase for university %s", universityID)
|
||||
|
||||
// Get university from database
|
||||
uni, err := a.repo.GetUniversityByID(ctx, universityID)
|
||||
if err != nil {
|
||||
progress.Errors = append(progress.Errors, fmt.Sprintf("Failed to get university: %v", err))
|
||||
return progress, fmt.Errorf("failed to get university: %w", err)
|
||||
}
|
||||
|
||||
if uni == nil {
|
||||
progress.Errors = append(progress.Errors, "University not found")
|
||||
return progress, fmt.Errorf("university not found: %s", universityID)
|
||||
}
|
||||
|
||||
log.Printf("[OrchestratorAdapter] Discovering staff pages for %s (%s)", uni.Name, uni.URL)
|
||||
|
||||
// Use the crawler to find staff pages (discovery phase)
|
||||
staffPages, err := a.crawler.findStaffPages(ctx, uni)
|
||||
if err != nil {
|
||||
progress.Errors = append(progress.Errors, fmt.Sprintf("Failed to find staff pages: %v", err))
|
||||
return progress, fmt.Errorf("failed to find staff pages: %w", err)
|
||||
}
|
||||
|
||||
log.Printf("[OrchestratorAdapter] Found %d staff pages for %s", len(staffPages), uni.Name)
|
||||
|
||||
// Try to extract at least one professor as validation
|
||||
var sampleFound int
|
||||
for _, pageURL := range staffPages {
|
||||
if sampleFound > 0 {
|
||||
break // We just need to validate one works
|
||||
}
|
||||
|
||||
staffMembers, err := a.crawler.extractStaffFromPage(ctx, pageURL, uni)
|
||||
if err != nil {
|
||||
log.Printf("[OrchestratorAdapter] Error extracting from %s: %v", pageURL, err)
|
||||
continue
|
||||
}
|
||||
|
||||
// Count professors found
|
||||
for _, staff := range staffMembers {
|
||||
if staff.IsProfessor {
|
||||
sampleFound++
|
||||
log.Printf("[OrchestratorAdapter] Found sample professor: %s %s",
|
||||
stringValue(staff.FirstName), staff.LastName)
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
// Even non-professors validate the crawler works
|
||||
if sampleFound == 0 && len(staffMembers) > 0 {
|
||||
sampleFound = 1
|
||||
log.Printf("[OrchestratorAdapter] Found sample staff member (not professor): %s %s",
|
||||
stringValue(staffMembers[0].FirstName), staffMembers[0].LastName)
|
||||
}
|
||||
}
|
||||
|
||||
progress.ItemsFound = len(staffPages) // Number of crawlable pages found
|
||||
now := time.Now()
|
||||
progress.CompletedAt = &now
|
||||
|
||||
if sampleFound == 0 && len(staffPages) > 0 {
|
||||
// Pages found but no staff extracted - still consider it successful
|
||||
log.Printf("[OrchestratorAdapter] Discovery completed: %d pages found, extraction may need tuning", len(staffPages))
|
||||
} else if sampleFound == 0 {
|
||||
progress.Errors = append(progress.Errors, "No staff pages found")
|
||||
return progress, fmt.Errorf("no staff pages found for %s", uni.Name)
|
||||
}
|
||||
|
||||
log.Printf("[OrchestratorAdapter] Discovery completed for %s: %d pages found", uni.Name, len(staffPages))
|
||||
return progress, nil
|
||||
}
|
||||
|
||||
// CrawlProfessors crawls all professors at a university
|
||||
// This is Phase 2: Focus on finding professors specifically
|
||||
func (a *OrchestratorAdapter) CrawlProfessors(ctx context.Context, universityID uuid.UUID) (*orchestrator.CrawlProgress, error) {
|
||||
start := time.Now()
|
||||
progress := &orchestrator.CrawlProgress{
|
||||
Phase: orchestrator.PhaseProfessors,
|
||||
StartedAt: start,
|
||||
}
|
||||
|
||||
log.Printf("[OrchestratorAdapter] Professors phase for university %s", universityID)
|
||||
|
||||
// Get university
|
||||
uni, err := a.repo.GetUniversityByID(ctx, universityID)
|
||||
if err != nil || uni == nil {
|
||||
progress.Errors = append(progress.Errors, fmt.Sprintf("Failed to get university: %v", err))
|
||||
return progress, fmt.Errorf("failed to get university: %w", err)
|
||||
}
|
||||
|
||||
// Perform full crawl
|
||||
result, err := a.crawler.CrawlUniversity(ctx, uni)
|
||||
if err != nil {
|
||||
progress.Errors = append(progress.Errors, fmt.Sprintf("Crawl failed: %v", err))
|
||||
return progress, err
|
||||
}
|
||||
|
||||
// Count professors specifically
|
||||
professorCount := 0
|
||||
staffList, err := a.repo.SearchStaff(ctx, database.StaffSearchParams{
|
||||
UniversityID: &universityID,
|
||||
IsProfessor: boolPtr(true),
|
||||
Limit: 10000,
|
||||
})
|
||||
if err == nil {
|
||||
professorCount = staffList.Total
|
||||
}
|
||||
|
||||
progress.ItemsFound = professorCount
|
||||
progress.ItemsProcessed = result.StaffFound
|
||||
progress.Errors = result.Errors
|
||||
now := time.Now()
|
||||
progress.CompletedAt = &now
|
||||
|
||||
log.Printf("[OrchestratorAdapter] Professors phase completed for %s: %d professors found", uni.Name, professorCount)
|
||||
return progress, nil
|
||||
}
|
||||
|
||||
// CrawlAllStaff crawls all staff members at a university
|
||||
// This is Phase 3: Get all staff (already done in Phase 2, but we verify/extend)
|
||||
func (a *OrchestratorAdapter) CrawlAllStaff(ctx context.Context, universityID uuid.UUID) (*orchestrator.CrawlProgress, error) {
|
||||
start := time.Now()
|
||||
progress := &orchestrator.CrawlProgress{
|
||||
Phase: orchestrator.PhaseAllStaff,
|
||||
StartedAt: start,
|
||||
}
|
||||
|
||||
log.Printf("[OrchestratorAdapter] All Staff phase for university %s", universityID)
|
||||
|
||||
// Get university
|
||||
uni, err := a.repo.GetUniversityByID(ctx, universityID)
|
||||
if err != nil || uni == nil {
|
||||
progress.Errors = append(progress.Errors, fmt.Sprintf("Failed to get university: %v", err))
|
||||
return progress, fmt.Errorf("failed to get university: %w", err)
|
||||
}
|
||||
|
||||
// Run another crawl pass to catch any missed staff
|
||||
result, err := a.crawler.CrawlUniversity(ctx, uni)
|
||||
if err != nil {
|
||||
progress.Errors = result.Errors
|
||||
// Don't fail completely - we may have some staff already
|
||||
log.Printf("[OrchestratorAdapter] All Staff crawl had errors: %v", err)
|
||||
}
|
||||
|
||||
// Get total staff count
|
||||
staffCount := 0
|
||||
staffList, err := a.repo.SearchStaff(ctx, database.StaffSearchParams{
|
||||
UniversityID: &universityID,
|
||||
Limit: 1, // Just need count
|
||||
})
|
||||
if err == nil {
|
||||
staffCount = staffList.Total
|
||||
}
|
||||
|
||||
progress.ItemsFound = staffCount
|
||||
if result != nil {
|
||||
progress.ItemsProcessed = result.StaffFound
|
||||
progress.Errors = result.Errors
|
||||
}
|
||||
now := time.Now()
|
||||
progress.CompletedAt = &now
|
||||
|
||||
log.Printf("[OrchestratorAdapter] All Staff phase completed for %s: %d total staff", uni.Name, staffCount)
|
||||
return progress, nil
|
||||
}
|
||||
|
||||
// Helper functions
|
||||
|
||||
// stringValue dereferences s, yielding the empty string for a nil pointer.
func stringValue(s *string) string {
	if s != nil {
		return *s
	}
	return ""
}
|
||||
|
||||
// boolPtr returns a pointer to a fresh copy of b, for filling optional
// *bool fields.
func boolPtr(b bool) *bool {
	value := b
	return &value
}
|
||||
Reference in New Issue
Block a user