feat: edu-search-service migriert, voice-service/geo-service entfernt
All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Successful in 1m45s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 21s
All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Successful in 1m45s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 21s
- edu-search-service von breakpilot-pwa nach breakpilot-lehrer kopiert (ohne vendor) - opensearch + edu-search-service in docker-compose.yml hinzugefuegt - voice-service aus docker-compose.yml entfernt (jetzt in breakpilot-core) - geo-service aus docker-compose.yml entfernt (nicht mehr benoetigt) - CI/CD: edu-search-service zu Gitea Actions und Woodpecker hinzugefuegt (Go lint, test mit go mod download, build, SBOM) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
424
edu-search-service/internal/orchestrator/audiences.go
Normal file
424
edu-search-service/internal/orchestrator/audiences.go
Normal file
@@ -0,0 +1,424 @@
|
||||
// Package orchestrator implements multi-phase university crawling with queue management
|
||||
package orchestrator
|
||||
|
||||
import (
	"context"
	"encoding/json"
	"fmt"
	"strings"
	"time"

	"github.com/google/uuid"
)
|
||||
|
||||
// Audience represents a target audience filter configuration.
// An audience is a saved, named set of filters over university staff;
// the matching members are computed on demand from Filters, while
// MemberCount caches the last computed size.
type Audience struct {
	ID          uuid.UUID       `json:"id"`
	Name        string          `json:"name"`
	Description string          `json:"description,omitempty"`
	Filters     AudienceFilters `json:"filters"`
	// MemberCount is a cached count; refreshed via UpdateAudienceCount,
	// with the refresh time recorded in LastCountUpdate.
	MemberCount     int        `json:"member_count"`
	LastCountUpdate *time.Time `json:"last_count_update,omitempty"`
	CreatedBy       string     `json:"created_by,omitempty"`
	// IsActive is false for soft-deleted audiences (see DeleteAudience).
	IsActive  bool      `json:"is_active"`
	CreatedAt time.Time `json:"created_at"`
	UpdatedAt time.Time `json:"updated_at"`
}
|
||||
|
||||
// AudienceFilters defines the filter criteria for an audience.
// All populated fields are combined with AND; empty/nil fields apply
// no restriction. Slice fields match if the staff member's value is
// any of the listed values (SQL "= ANY").
type AudienceFilters struct {
	PositionTypes []string    `json:"position_types,omitempty"` // professor, researcher, lecturer
	SubjectAreas  []uuid.UUID `json:"subject_areas,omitempty"`  // Subject area UUIDs
	States        []string    `json:"states,omitempty"`         // BW, BY, etc.
	UniTypes      []string    `json:"uni_types,omitempty"`      // UNI, PH, HAW
	Universities  []uuid.UUID `json:"universities,omitempty"`   // University UUIDs
	// HasEmail/IsActive: nil means "don't filter". NOTE(review): the query
	// builder only applies these when the value is true — a false value is
	// currently ignored rather than negated; confirm this is intended.
	HasEmail *bool    `json:"has_email,omitempty"`
	IsActive *bool    `json:"is_active,omitempty"`
	Keywords []string `json:"keywords,omitempty"` // Keywords in name/research
}
|
||||
|
||||
// AudienceExport tracks exports of audience data.
// One row is recorded per export for audit purposes (who exported
// which audience, when, how many records, and why).
type AudienceExport struct {
	ID         uuid.UUID `json:"id"`
	AudienceID uuid.UUID `json:"audience_id"`
	ExportType string    `json:"export_type"` // csv, json, email_list
	RecordCount int      `json:"record_count"`
	// FilePath is where the export artifact was written, if any.
	FilePath   string    `json:"file_path,omitempty"`
	ExportedBy string    `json:"exported_by,omitempty"`
	// Purpose is a free-text justification for the export.
	Purpose   string    `json:"purpose,omitempty"`
	CreatedAt time.Time `json:"created_at"`
}
|
||||
|
||||
// AudienceMember represents a staff member in an audience preview.
// It is a flattened, read-only projection of university_staff joined
// with university/department/subject-area names; optional columns are
// returned as empty strings rather than NULLs.
type AudienceMember struct {
	ID          uuid.UUID `json:"id"`
	Name        string    `json:"name"` // title + first + last name
	Email       string    `json:"email,omitempty"`
	Position    string    `json:"position,omitempty"`
	University  string    `json:"university"`
	Department  string    `json:"department,omitempty"`
	SubjectArea string    `json:"subject_area,omitempty"`
	// PublicationCount is computed per row via a correlated subquery.
	PublicationCount int `json:"publication_count"`
}
|
||||
|
||||
// AudienceRepository extends Repository with audience operations.
// Implemented by PostgresRepository below.
type AudienceRepository interface {
	// Audience CRUD. DeleteAudience is a soft delete (is_active = false).
	CreateAudience(ctx context.Context, audience *Audience) error
	GetAudience(ctx context.Context, id uuid.UUID) (*Audience, error)
	ListAudiences(ctx context.Context, activeOnly bool) ([]Audience, error)
	UpdateAudience(ctx context.Context, audience *Audience) error
	DeleteAudience(ctx context.Context, id uuid.UUID) error

	// Audience members. GetAudienceMembers returns a page of members plus
	// the total count; UpdateAudienceCount refreshes the cached count and
	// returns the new value.
	GetAudienceMembers(ctx context.Context, id uuid.UUID, limit, offset int) ([]AudienceMember, int, error)
	UpdateAudienceCount(ctx context.Context, id uuid.UUID) (int, error)

	// Exports (audit records of audience data exports).
	CreateExport(ctx context.Context, export *AudienceExport) error
	ListExports(ctx context.Context, audienceID uuid.UUID) ([]AudienceExport, error)
}
|
||||
|
||||
// ============================================================================
|
||||
// POSTGRES IMPLEMENTATION
|
||||
// ============================================================================
|
||||
|
||||
// CreateAudience creates a new audience
|
||||
func (r *PostgresRepository) CreateAudience(ctx context.Context, audience *Audience) error {
|
||||
filtersJSON, err := json.Marshal(audience.Filters)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to marshal filters: %w", err)
|
||||
}
|
||||
|
||||
query := `
|
||||
INSERT INTO audiences (name, description, filters, created_by, is_active)
|
||||
VALUES ($1, $2, $3, $4, $5)
|
||||
RETURNING id, member_count, created_at, updated_at
|
||||
`
|
||||
|
||||
return r.pool.QueryRow(ctx, query,
|
||||
audience.Name,
|
||||
audience.Description,
|
||||
filtersJSON,
|
||||
audience.CreatedBy,
|
||||
audience.IsActive,
|
||||
).Scan(&audience.ID, &audience.MemberCount, &audience.CreatedAt, &audience.UpdatedAt)
|
||||
}
|
||||
|
||||
// GetAudience retrieves an audience by ID
|
||||
func (r *PostgresRepository) GetAudience(ctx context.Context, id uuid.UUID) (*Audience, error) {
|
||||
query := `
|
||||
SELECT id, name, description, filters, member_count, last_count_update,
|
||||
created_by, is_active, created_at, updated_at
|
||||
FROM audiences
|
||||
WHERE id = $1
|
||||
`
|
||||
|
||||
var audience Audience
|
||||
var filtersJSON []byte
|
||||
|
||||
err := r.pool.QueryRow(ctx, query, id).Scan(
|
||||
&audience.ID, &audience.Name, &audience.Description, &filtersJSON,
|
||||
&audience.MemberCount, &audience.LastCountUpdate,
|
||||
&audience.CreatedBy, &audience.IsActive,
|
||||
&audience.CreatedAt, &audience.UpdatedAt,
|
||||
)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if err := json.Unmarshal(filtersJSON, &audience.Filters); err != nil {
|
||||
return nil, fmt.Errorf("failed to unmarshal filters: %w", err)
|
||||
}
|
||||
|
||||
return &audience, nil
|
||||
}
|
||||
|
||||
// ListAudiences lists all audiences
|
||||
func (r *PostgresRepository) ListAudiences(ctx context.Context, activeOnly bool) ([]Audience, error) {
|
||||
query := `
|
||||
SELECT id, name, description, filters, member_count, last_count_update,
|
||||
created_by, is_active, created_at, updated_at
|
||||
FROM audiences
|
||||
`
|
||||
if activeOnly {
|
||||
query += ` WHERE is_active = TRUE`
|
||||
}
|
||||
query += ` ORDER BY created_at DESC`
|
||||
|
||||
rows, err := r.pool.Query(ctx, query)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to query audiences: %w", err)
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
var audiences []Audience
|
||||
for rows.Next() {
|
||||
var audience Audience
|
||||
var filtersJSON []byte
|
||||
|
||||
if err := rows.Scan(
|
||||
&audience.ID, &audience.Name, &audience.Description, &filtersJSON,
|
||||
&audience.MemberCount, &audience.LastCountUpdate,
|
||||
&audience.CreatedBy, &audience.IsActive,
|
||||
&audience.CreatedAt, &audience.UpdatedAt,
|
||||
); err != nil {
|
||||
return nil, fmt.Errorf("failed to scan audience: %w", err)
|
||||
}
|
||||
|
||||
if err := json.Unmarshal(filtersJSON, &audience.Filters); err != nil {
|
||||
return nil, fmt.Errorf("failed to unmarshal filters: %w", err)
|
||||
}
|
||||
|
||||
audiences = append(audiences, audience)
|
||||
}
|
||||
|
||||
return audiences, rows.Err()
|
||||
}
|
||||
|
||||
// UpdateAudience updates an existing audience
|
||||
func (r *PostgresRepository) UpdateAudience(ctx context.Context, audience *Audience) error {
|
||||
filtersJSON, err := json.Marshal(audience.Filters)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to marshal filters: %w", err)
|
||||
}
|
||||
|
||||
query := `
|
||||
UPDATE audiences
|
||||
SET name = $2, description = $3, filters = $4, is_active = $5, updated_at = NOW()
|
||||
WHERE id = $1
|
||||
RETURNING updated_at
|
||||
`
|
||||
|
||||
return r.pool.QueryRow(ctx, query,
|
||||
audience.ID,
|
||||
audience.Name,
|
||||
audience.Description,
|
||||
filtersJSON,
|
||||
audience.IsActive,
|
||||
).Scan(&audience.UpdatedAt)
|
||||
}
|
||||
|
||||
// DeleteAudience soft-deletes an audience (sets is_active = false)
|
||||
func (r *PostgresRepository) DeleteAudience(ctx context.Context, id uuid.UUID) error {
|
||||
query := `UPDATE audiences SET is_active = FALSE, updated_at = NOW() WHERE id = $1`
|
||||
_, err := r.pool.Exec(ctx, query, id)
|
||||
return err
|
||||
}
|
||||
|
||||
// GetAudienceMembers retrieves members matching the audience filters.
//
// It loads the audience to obtain its filters, then runs two queries
// built by buildAudienceMemberQuery: a COUNT(*) for the total and a
// paged SELECT for the member rows. Returns the page of members, the
// total match count, and any error.
//
// NOTE(review): the count and the page are two separate queries, not a
// single transaction/snapshot, so the total can drift from the rows if
// staff data changes between them — acceptable for a preview UI.
func (r *PostgresRepository) GetAudienceMembers(ctx context.Context, id uuid.UUID, limit, offset int) ([]AudienceMember, int, error) {
	// First get the audience filters
	audience, err := r.GetAudience(ctx, id)
	if err != nil {
		return nil, 0, fmt.Errorf("failed to get audience: %w", err)
	}

	// Build dynamic query based on filters (countOnly toggles projection).
	query, args := r.buildAudienceMemberQuery(audience.Filters, limit, offset, false)
	countQuery, countArgs := r.buildAudienceMemberQuery(audience.Filters, 0, 0, true)

	// Get total count
	var totalCount int
	if err := r.pool.QueryRow(ctx, countQuery, countArgs...).Scan(&totalCount); err != nil {
		return nil, 0, fmt.Errorf("failed to count members: %w", err)
	}

	// Get members
	rows, err := r.pool.Query(ctx, query, args...)
	if err != nil {
		return nil, 0, fmt.Errorf("failed to query members: %w", err)
	}
	defer rows.Close()

	var members []AudienceMember
	for rows.Next() {
		var m AudienceMember
		// Column order must match the SELECT list in buildAudienceMemberQuery.
		if err := rows.Scan(
			&m.ID, &m.Name, &m.Email, &m.Position,
			&m.University, &m.Department, &m.SubjectArea, &m.PublicationCount,
		); err != nil {
			return nil, 0, fmt.Errorf("failed to scan member: %w", err)
		}
		members = append(members, m)
	}

	return members, totalCount, rows.Err()
}
|
||||
|
||||
// buildAudienceMemberQuery constructs a SQL query for audience members
|
||||
func (r *PostgresRepository) buildAudienceMemberQuery(filters AudienceFilters, limit, offset int, countOnly bool) (string, []interface{}) {
|
||||
var args []interface{}
|
||||
argNum := 1
|
||||
|
||||
var selectClause string
|
||||
if countOnly {
|
||||
selectClause = "SELECT COUNT(*)"
|
||||
} else {
|
||||
selectClause = `
|
||||
SELECT
|
||||
s.id,
|
||||
COALESCE(s.title || ' ', '') || s.first_name || ' ' || s.last_name as name,
|
||||
COALESCE(s.email, '') as email,
|
||||
COALESCE(s.position_type, '') as position,
|
||||
u.name as university,
|
||||
COALESCE(d.name, '') as department,
|
||||
COALESCE(sa.name, '') as subject_area,
|
||||
(SELECT COUNT(*) FROM staff_publications sp WHERE sp.staff_id = s.id) as publication_count
|
||||
`
|
||||
}
|
||||
|
||||
query := selectClause + `
|
||||
FROM university_staff s
|
||||
JOIN universities u ON s.university_id = u.id
|
||||
LEFT JOIN departments d ON s.department_id = d.id
|
||||
LEFT JOIN subject_areas sa ON s.subject_area_id = sa.id
|
||||
WHERE 1=1
|
||||
`
|
||||
|
||||
// Position types filter
|
||||
if len(filters.PositionTypes) > 0 {
|
||||
query += fmt.Sprintf(" AND s.position_type = ANY($%d)", argNum)
|
||||
args = append(args, filters.PositionTypes)
|
||||
argNum++
|
||||
}
|
||||
|
||||
// Subject areas filter
|
||||
if len(filters.SubjectAreas) > 0 {
|
||||
query += fmt.Sprintf(" AND s.subject_area_id = ANY($%d)", argNum)
|
||||
args = append(args, filters.SubjectAreas)
|
||||
argNum++
|
||||
}
|
||||
|
||||
// States filter
|
||||
if len(filters.States) > 0 {
|
||||
query += fmt.Sprintf(" AND u.state = ANY($%d)", argNum)
|
||||
args = append(args, filters.States)
|
||||
argNum++
|
||||
}
|
||||
|
||||
// Uni types filter
|
||||
if len(filters.UniTypes) > 0 {
|
||||
query += fmt.Sprintf(" AND u.uni_type = ANY($%d)", argNum)
|
||||
args = append(args, filters.UniTypes)
|
||||
argNum++
|
||||
}
|
||||
|
||||
// Universities filter
|
||||
if len(filters.Universities) > 0 {
|
||||
query += fmt.Sprintf(" AND s.university_id = ANY($%d)", argNum)
|
||||
args = append(args, filters.Universities)
|
||||
argNum++
|
||||
}
|
||||
|
||||
// Has email filter
|
||||
if filters.HasEmail != nil && *filters.HasEmail {
|
||||
query += " AND s.email IS NOT NULL AND s.email != ''"
|
||||
}
|
||||
|
||||
// Is active filter
|
||||
if filters.IsActive != nil && *filters.IsActive {
|
||||
query += " AND s.is_active = TRUE"
|
||||
}
|
||||
|
||||
// Keywords filter (search in name and research_areas)
|
||||
if len(filters.Keywords) > 0 {
|
||||
for _, keyword := range filters.Keywords {
|
||||
query += fmt.Sprintf(" AND (s.first_name ILIKE $%d OR s.last_name ILIKE $%d OR s.research_areas ILIKE $%d)", argNum, argNum, argNum)
|
||||
args = append(args, "%"+keyword+"%")
|
||||
argNum++
|
||||
}
|
||||
}
|
||||
|
||||
if !countOnly {
|
||||
query += " ORDER BY s.last_name, s.first_name"
|
||||
|
||||
if limit > 0 {
|
||||
query += fmt.Sprintf(" LIMIT $%d", argNum)
|
||||
args = append(args, limit)
|
||||
argNum++
|
||||
}
|
||||
|
||||
if offset > 0 {
|
||||
query += fmt.Sprintf(" OFFSET $%d", argNum)
|
||||
args = append(args, offset)
|
||||
}
|
||||
}
|
||||
|
||||
return query, args
|
||||
}
|
||||
|
||||
// UpdateAudienceCount updates the cached member count for an audience
|
||||
func (r *PostgresRepository) UpdateAudienceCount(ctx context.Context, id uuid.UUID) (int, error) {
|
||||
// Get the audience filters
|
||||
audience, err := r.GetAudience(ctx, id)
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("failed to get audience: %w", err)
|
||||
}
|
||||
|
||||
// Count members
|
||||
countQuery, countArgs := r.buildAudienceMemberQuery(audience.Filters, 0, 0, true)
|
||||
var count int
|
||||
if err := r.pool.QueryRow(ctx, countQuery, countArgs...).Scan(&count); err != nil {
|
||||
return 0, fmt.Errorf("failed to count members: %w", err)
|
||||
}
|
||||
|
||||
// Update the cached count
|
||||
updateQuery := `
|
||||
UPDATE audiences
|
||||
SET member_count = $2, last_count_update = NOW(), updated_at = NOW()
|
||||
WHERE id = $1
|
||||
`
|
||||
if _, err := r.pool.Exec(ctx, updateQuery, id, count); err != nil {
|
||||
return 0, fmt.Errorf("failed to update count: %w", err)
|
||||
}
|
||||
|
||||
return count, nil
|
||||
}
|
||||
|
||||
// CreateExport creates a new export record
|
||||
func (r *PostgresRepository) CreateExport(ctx context.Context, export *AudienceExport) error {
|
||||
query := `
|
||||
INSERT INTO audience_exports (audience_id, export_type, record_count, file_path, exported_by, purpose)
|
||||
VALUES ($1, $2, $3, $4, $5, $6)
|
||||
RETURNING id, created_at
|
||||
`
|
||||
|
||||
return r.pool.QueryRow(ctx, query,
|
||||
export.AudienceID,
|
||||
export.ExportType,
|
||||
export.RecordCount,
|
||||
export.FilePath,
|
||||
export.ExportedBy,
|
||||
export.Purpose,
|
||||
).Scan(&export.ID, &export.CreatedAt)
|
||||
}
|
||||
|
||||
// ListExports lists exports for an audience
|
||||
func (r *PostgresRepository) ListExports(ctx context.Context, audienceID uuid.UUID) ([]AudienceExport, error) {
|
||||
query := `
|
||||
SELECT id, audience_id, export_type, record_count, file_path, exported_by, purpose, created_at
|
||||
FROM audience_exports
|
||||
WHERE audience_id = $1
|
||||
ORDER BY created_at DESC
|
||||
`
|
||||
|
||||
rows, err := r.pool.Query(ctx, query, audienceID)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to query exports: %w", err)
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
var exports []AudienceExport
|
||||
for rows.Next() {
|
||||
var e AudienceExport
|
||||
if err := rows.Scan(
|
||||
&e.ID, &e.AudienceID, &e.ExportType, &e.RecordCount,
|
||||
&e.FilePath, &e.ExportedBy, &e.Purpose, &e.CreatedAt,
|
||||
); err != nil {
|
||||
return nil, fmt.Errorf("failed to scan export: %w", err)
|
||||
}
|
||||
exports = append(exports, e)
|
||||
}
|
||||
|
||||
return exports, rows.Err()
|
||||
}
|
||||
407
edu-search-service/internal/orchestrator/orchestrator.go
Normal file
407
edu-search-service/internal/orchestrator/orchestrator.go
Normal file
@@ -0,0 +1,407 @@
|
||||
// Package orchestrator implements multi-phase university crawling with queue management
|
||||
package orchestrator
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"log"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/google/uuid"
|
||||
)
|
||||
|
||||
// CrawlPhase represents a phase in the crawl process.
// A university progresses through the phases in order:
// pending → discovery → professors → all_staff → publications → completed,
// with failed/paused as out-of-band states (see processNextInQueue).
type CrawlPhase string

const (
	PhasePending      CrawlPhase = "pending"
	PhaseDiscovery    CrawlPhase = "discovery"    // Find sample professor to validate crawling works
	PhaseProfessors   CrawlPhase = "professors"   // Crawl all professors
	PhaseAllStaff     CrawlPhase = "all_staff"    // Crawl all staff members
	PhasePublications CrawlPhase = "publications" // Crawl publications for all staff
	PhaseCompleted    CrawlPhase = "completed"    // Terminal: all phases done
	PhaseFailed       CrawlPhase = "failed"       // Terminal: max retries exhausted
	PhasePaused       CrawlPhase = "paused"       // Skipped by the loop until resumed
)
|
||||
|
||||
// CrawlQueueItem represents a university in the crawl queue, including
// its per-phase completion flags/timestamps, result counters, and
// retry bookkeeping. It mirrors one row of the crawl queue table
// joined with the university's name/short name.
type CrawlQueueItem struct {
	ID              uuid.UUID `json:"id"`
	UniversityID    uuid.UUID `json:"university_id"`
	UniversityName  string    `json:"university_name"`
	UniversityShort string    `json:"university_short"`
	// QueuePosition is nil once the item leaves the active queue
	// (set to nil on completion; see completeUniversity).
	QueuePosition *int       `json:"queue_position"`
	Priority      int        `json:"priority"`
	CurrentPhase  CrawlPhase `json:"current_phase"`

	// Per-phase completion flags and timestamps, set as each phase finishes.
	DiscoveryCompleted      bool       `json:"discovery_completed"`
	DiscoveryCompletedAt    *time.Time `json:"discovery_completed_at,omitempty"`
	ProfessorsCompleted     bool       `json:"professors_completed"`
	ProfessorsCompletedAt   *time.Time `json:"professors_completed_at,omitempty"`
	AllStaffCompleted       bool       `json:"all_staff_completed"`
	AllStaffCompletedAt     *time.Time `json:"all_staff_completed_at,omitempty"`
	PublicationsCompleted   bool       `json:"publications_completed"`
	PublicationsCompletedAt *time.Time `json:"publications_completed_at,omitempty"`

	// Items-found counters reported by each phase's crawler.
	DiscoveryCount    int `json:"discovery_count"`
	ProfessorsCount   int `json:"professors_count"`
	StaffCount        int `json:"staff_count"`
	PublicationsCount int `json:"publications_count"`

	// Retry state: after MaxRetries failures the item moves to PhaseFailed.
	RetryCount int    `json:"retry_count"`
	MaxRetries int    `json:"max_retries"`
	LastError  string `json:"last_error,omitempty"`

	StartedAt       *time.Time `json:"started_at,omitempty"`
	CompletedAt     *time.Time `json:"completed_at,omitempty"`
	ProgressPercent int        `json:"progress_percent"`
	CreatedAt       time.Time  `json:"created_at"`
	UpdatedAt       time.Time  `json:"updated_at"`
}
|
||||
|
||||
// CrawlProgress represents progress for a single phase, as reported by
// the staff/publication crawlers. ItemsFound is used as the phase's
// result count when the phase completes (see runPhase).
type CrawlProgress struct {
	Phase          CrawlPhase `json:"phase"`
	ItemsFound     int        `json:"items_found"`
	ItemsProcessed int        `json:"items_processed"`
	Errors         []string   `json:"errors,omitempty"`
	StartedAt      time.Time  `json:"started_at"`
	CompletedAt    *time.Time `json:"completed_at,omitempty"`
}

// OrchestratorStatus represents the current state of the orchestrator,
// as assembled by Status(): in-memory runtime state plus best-effort
// queue statistics from the repository.
type OrchestratorStatus struct {
	IsRunning bool `json:"is_running"`
	// CurrentUniversity/CurrentPhase reflect the item being processed
	// right now; CurrentPhase falls back to PhasePending when idle.
	CurrentUniversity *CrawlQueueItem `json:"current_university,omitempty"`
	CurrentPhase      CrawlPhase      `json:"current_phase"`
	QueueLength       int             `json:"queue_length"`
	CompletedToday    int             `json:"completed_today"`
	TotalProcessed    int             `json:"total_processed"`
	LastActivity      *time.Time      `json:"last_activity,omitempty"`
}
|
||||
|
||||
// StaffCrawlerInterface defines what the staff crawler must implement.
// Each method runs one crawl phase to completion and reports progress.
type StaffCrawlerInterface interface {
	// DiscoverSampleProfessor finds at least one professor to validate crawling works
	DiscoverSampleProfessor(ctx context.Context, universityID uuid.UUID) (*CrawlProgress, error)
	// CrawlProfessors crawls all professors at a university
	CrawlProfessors(ctx context.Context, universityID uuid.UUID) (*CrawlProgress, error)
	// CrawlAllStaff crawls all staff members at a university
	CrawlAllStaff(ctx context.Context, universityID uuid.UUID) (*CrawlProgress, error)
}

// PublicationCrawlerInterface defines what the publication crawler must implement.
type PublicationCrawlerInterface interface {
	// CrawlPublicationsForUniversity crawls publications for all staff at a university
	CrawlPublicationsForUniversity(ctx context.Context, universityID uuid.UUID) (*CrawlProgress, error)
}

// Repository defines database operations for the orchestrator.
// Implemented by PostgresRepository (see repository.go).
type Repository interface {
	// Queue operations. GetNextInQueue returns (nil, nil) when the
	// queue is empty (processNextInQueue relies on that contract).
	GetQueueItems(ctx context.Context) ([]CrawlQueueItem, error)
	GetNextInQueue(ctx context.Context) (*CrawlQueueItem, error)
	AddToQueue(ctx context.Context, universityID uuid.UUID, priority int, initiatedBy string) (*CrawlQueueItem, error)
	RemoveFromQueue(ctx context.Context, universityID uuid.UUID) error
	UpdateQueueItem(ctx context.Context, item *CrawlQueueItem) error
	PauseQueueItem(ctx context.Context, universityID uuid.UUID) error
	ResumeQueueItem(ctx context.Context, universityID uuid.UUID) error

	// Phase updates: persist a phase's completion count or failure state.
	CompletePhase(ctx context.Context, universityID uuid.UUID, phase CrawlPhase, count int) error
	FailPhase(ctx context.Context, universityID uuid.UUID, phase CrawlPhase, err string) error

	// Stats (used for best-effort status reporting).
	GetCompletedTodayCount(ctx context.Context) (int, error)
	GetTotalProcessedCount(ctx context.Context) (int, error)
}
|
||||
|
||||
// Orchestrator manages the multi-phase crawl process. It runs a single
// background loop (runLoop) that polls the queue and advances one
// university at a time through the crawl phases.
type Orchestrator struct {
	repo         Repository
	staffCrawler StaffCrawlerInterface
	pubCrawler   PublicationCrawlerInterface

	// Runtime state, guarded by mu. stopChan is (re)created on each
	// Start and closed by Stop to terminate runLoop.
	mu           sync.RWMutex
	isRunning    bool
	stopChan     chan struct{}
	currentItem  *CrawlQueueItem
	lastActivity time.Time

	// Configuration (set in NewOrchestrator; not mutated afterwards).
	phaseCooldown time.Duration // Wait time between phases
	retryCooldown time.Duration // Wait time after failure before retry
	maxConcurrent int           // Max concurrent crawls (always 1 for now)
}
|
||||
|
||||
// NewOrchestrator creates a new orchestrator instance
|
||||
func NewOrchestrator(repo Repository, staffCrawler StaffCrawlerInterface, pubCrawler PublicationCrawlerInterface) *Orchestrator {
|
||||
return &Orchestrator{
|
||||
repo: repo,
|
||||
staffCrawler: staffCrawler,
|
||||
pubCrawler: pubCrawler,
|
||||
phaseCooldown: 5 * time.Second, // Small pause between phases
|
||||
retryCooldown: 30 * time.Second, // Wait before retry after failure
|
||||
maxConcurrent: 1, // Sequential processing
|
||||
}
|
||||
}
|
||||
|
||||
// Start begins the orchestrator loop
|
||||
func (o *Orchestrator) Start() error {
|
||||
o.mu.Lock()
|
||||
if o.isRunning {
|
||||
o.mu.Unlock()
|
||||
return fmt.Errorf("orchestrator already running")
|
||||
}
|
||||
o.isRunning = true
|
||||
o.stopChan = make(chan struct{})
|
||||
o.mu.Unlock()
|
||||
|
||||
log.Println("[Orchestrator] Starting crawl orchestration loop")
|
||||
|
||||
go o.runLoop()
|
||||
return nil
|
||||
}
|
||||
|
||||
// Stop gracefully stops the orchestrator
|
||||
func (o *Orchestrator) Stop() error {
|
||||
o.mu.Lock()
|
||||
if !o.isRunning {
|
||||
o.mu.Unlock()
|
||||
return fmt.Errorf("orchestrator not running")
|
||||
}
|
||||
close(o.stopChan)
|
||||
o.isRunning = false
|
||||
o.mu.Unlock()
|
||||
|
||||
log.Println("[Orchestrator] Stopped")
|
||||
return nil
|
||||
}
|
||||
|
||||
// Status returns the current orchestrator status: in-memory runtime
// state (running flag, current item, last activity) combined with
// queue statistics fetched from the repository.
//
// Statistics are best-effort: repository errors here are deliberately
// ignored and the corresponding fields are left at zero, so a DB
// hiccup never makes the status endpoint fail. Always returns a nil
// error in the current implementation.
func (o *Orchestrator) Status(ctx context.Context) (*OrchestratorStatus, error) {
	o.mu.RLock()
	defer o.mu.RUnlock()

	status := &OrchestratorStatus{
		IsRunning:    o.isRunning,
		CurrentPhase: PhasePending, // default when no item is being processed
	}

	if o.currentItem != nil {
		status.CurrentUniversity = o.currentItem
		status.CurrentPhase = o.currentItem.CurrentPhase
	}

	if !o.lastActivity.IsZero() {
		status.LastActivity = &o.lastActivity
	}

	// Get queue stats from DB (best-effort; see doc comment).
	items, err := o.repo.GetQueueItems(ctx)
	if err == nil {
		status.QueueLength = len(items)
	}

	completedToday, _ := o.repo.GetCompletedTodayCount(ctx)
	status.CompletedToday = completedToday

	totalProcessed, _ := o.repo.GetTotalProcessedCount(ctx)
	status.TotalProcessed = totalProcessed

	return status, nil
}
|
||||
|
||||
// AddUniversity adds a university to the crawl queue
|
||||
func (o *Orchestrator) AddUniversity(ctx context.Context, universityID uuid.UUID, priority int, initiatedBy string) (*CrawlQueueItem, error) {
|
||||
item, err := o.repo.AddToQueue(ctx, universityID, priority, initiatedBy)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to add to queue: %w", err)
|
||||
}
|
||||
|
||||
log.Printf("[Orchestrator] Added university %s to queue with priority %d", universityID, priority)
|
||||
return item, nil
|
||||
}
|
||||
|
||||
// RemoveUniversity removes a university from the queue.
// Thin delegation to the repository; no in-memory state is touched.
func (o *Orchestrator) RemoveUniversity(ctx context.Context, universityID uuid.UUID) error {
	return o.repo.RemoveFromQueue(ctx, universityID)
}

// PauseUniversity pauses crawling for a university. A paused item is
// skipped by processNextInQueue until it is resumed.
func (o *Orchestrator) PauseUniversity(ctx context.Context, universityID uuid.UUID) error {
	return o.repo.PauseQueueItem(ctx, universityID)
}

// ResumeUniversity resumes crawling for a paused university.
func (o *Orchestrator) ResumeUniversity(ctx context.Context, universityID uuid.UUID) error {
	return o.repo.ResumeQueueItem(ctx, universityID)
}

// GetQueue returns all items in the queue.
func (o *Orchestrator) GetQueue(ctx context.Context) ([]CrawlQueueItem, error) {
	return o.repo.GetQueueItems(ctx)
}
|
||||
|
||||
// runLoop is the main orchestration loop, started by Start as a
// goroutine. It polls the queue on a fixed 10-second tick and exits
// when Stop closes stopChan. Because processNextInQueue runs inline
// (and may sleep in cooldowns), a long-running phase delays both the
// next tick's work and shutdown.
func (o *Orchestrator) runLoop() {
	ticker := time.NewTicker(10 * time.Second) // Check queue every 10 seconds
	defer ticker.Stop()

	for {
		select {
		case <-o.stopChan:
			return
		case <-ticker.C:
			o.processNextInQueue()
		}
	}
}
|
||||
|
||||
// processNextInQueue processes the next university in the queue.
//
// It fetches the head of the queue, publishes it as the current item
// (cleared again via defer when the call returns), and runs exactly
// one phase step: each invocation either starts the item's current
// phase or, if that phase is already flagged complete, advances to the
// next one. Full progression therefore takes several ticks per
// university.
//
// NOTE(review): a fresh context.Background() is used, so in-flight
// phase work is not cancelled when Stop is called — confirm that is
// acceptable. Items in PhaseCompleted/PhaseFailed have no switch case
// and fall through untouched; presumably GetNextInQueue never returns
// them — verify against the repository implementation.
func (o *Orchestrator) processNextInQueue() {
	ctx := context.Background()

	// Get next item in queue
	item, err := o.repo.GetNextInQueue(ctx)
	if err != nil {
		log.Printf("[Orchestrator] Error getting next item: %v", err)
		return
	}

	if item == nil {
		// No items to process
		return
	}

	// Check if paused
	if item.CurrentPhase == PhasePaused {
		return
	}

	// Set current item (published for Status under the lock)
	o.mu.Lock()
	o.currentItem = item
	o.lastActivity = time.Now()
	o.mu.Unlock()

	// Clear the current item on every exit path.
	defer func() {
		o.mu.Lock()
		o.currentItem = nil
		o.mu.Unlock()
	}()

	log.Printf("[Orchestrator] Processing university: %s (Phase: %s)", item.UniversityName, item.CurrentPhase)

	// Process based on current phase: re-run the phase if its completion
	// flag is not yet set, otherwise step into the next phase.
	switch item.CurrentPhase {
	case PhasePending:
		o.runPhase(ctx, item, PhaseDiscovery)
	case PhaseDiscovery:
		if item.DiscoveryCompleted {
			o.runPhase(ctx, item, PhaseProfessors)
		} else {
			o.runPhase(ctx, item, PhaseDiscovery)
		}
	case PhaseProfessors:
		if item.ProfessorsCompleted {
			o.runPhase(ctx, item, PhaseAllStaff)
		} else {
			o.runPhase(ctx, item, PhaseProfessors)
		}
	case PhaseAllStaff:
		if item.AllStaffCompleted {
			o.runPhase(ctx, item, PhasePublications)
		} else {
			o.runPhase(ctx, item, PhaseAllStaff)
		}
	case PhasePublications:
		if item.PublicationsCompleted {
			o.completeUniversity(ctx, item)
		} else {
			o.runPhase(ctx, item, PhasePublications)
		}
	}
}
|
||||
|
||||
// runPhase executes a specific crawl phase for a queue item.
//
// Flow: persist the new current phase, dispatch to the matching
// crawler, then either record the failure (handlePhaseFailure) or
// persist the phase's completion with the items-found count and sleep
// for phaseCooldown.
//
// NOTE(review): the cooldown sleep runs on the single orchestration
// goroutine and does not observe the stop signal, so Stop can be
// delayed by up to phaseCooldown (or retryCooldown on failure).
func (o *Orchestrator) runPhase(ctx context.Context, item *CrawlQueueItem, phase CrawlPhase) {
	log.Printf("[Orchestrator] Running phase %s for %s", phase, item.UniversityName)

	// Update current phase; abort the phase if persistence fails so the
	// DB and the crawl activity never disagree about the active phase.
	item.CurrentPhase = phase
	if err := o.repo.UpdateQueueItem(ctx, item); err != nil {
		log.Printf("[Orchestrator] Failed to update phase: %v", err)
		return
	}

	var progress *CrawlProgress
	var err error

	// Execute phase via the appropriate crawler.
	switch phase {
	case PhaseDiscovery:
		progress, err = o.staffCrawler.DiscoverSampleProfessor(ctx, item.UniversityID)
	case PhaseProfessors:
		progress, err = o.staffCrawler.CrawlProfessors(ctx, item.UniversityID)
	case PhaseAllStaff:
		progress, err = o.staffCrawler.CrawlAllStaff(ctx, item.UniversityID)
	case PhasePublications:
		progress, err = o.pubCrawler.CrawlPublicationsForUniversity(ctx, item.UniversityID)
	}

	// Handle result
	if err != nil {
		log.Printf("[Orchestrator] Phase %s failed: %v", phase, err)
		o.handlePhaseFailure(ctx, item, phase, err)
		return
	}

	// Mark phase complete; a nil progress is treated as zero items found.
	count := 0
	if progress != nil {
		count = progress.ItemsFound
	}

	// Persistence failure is logged but not retried here.
	if err := o.repo.CompletePhase(ctx, item.UniversityID, phase, count); err != nil {
		log.Printf("[Orchestrator] Failed to complete phase: %v", err)
	}

	log.Printf("[Orchestrator] Phase %s completed for %s (found: %d)", phase, item.UniversityName, count)

	// Wait before next phase
	time.Sleep(o.phaseCooldown)
}
|
||||
|
||||
// handlePhaseFailure handles a phase failure: it bumps the in-memory
// retry counter, records the error, persists the failure via
// Repository.FailPhase, and sleeps for retryCooldown before the loop
// can pick the item up again.
//
// NOTE(review): RetryCount/LastError/CurrentPhase are mutated only on
// the local copy here — presumably FailPhase persists the equivalent
// state in the DB; confirm, otherwise retry accounting is lost between
// ticks. The cooldown sleep blocks the orchestration goroutine and
// ignores the stop signal.
func (o *Orchestrator) handlePhaseFailure(ctx context.Context, item *CrawlQueueItem, phase CrawlPhase, err error) {
	item.RetryCount++
	item.LastError = err.Error()

	if item.RetryCount >= item.MaxRetries {
		// Max retries reached, mark as failed
		item.CurrentPhase = PhaseFailed
		log.Printf("[Orchestrator] University %s failed after %d retries", item.UniversityName, item.RetryCount)
	}

	if updateErr := o.repo.FailPhase(ctx, item.UniversityID, phase, err.Error()); updateErr != nil {
		log.Printf("[Orchestrator] Failed to update failure status: %v", updateErr)
	}

	// Wait before potential retry
	time.Sleep(o.retryCooldown)
}
|
||||
|
||||
// completeUniversity marks a university as fully crawled
|
||||
func (o *Orchestrator) completeUniversity(ctx context.Context, item *CrawlQueueItem) {
|
||||
now := time.Now()
|
||||
item.CurrentPhase = PhaseCompleted
|
||||
item.CompletedAt = &now
|
||||
item.QueuePosition = nil // Remove from active queue
|
||||
|
||||
if err := o.repo.UpdateQueueItem(ctx, item); err != nil {
|
||||
log.Printf("[Orchestrator] Failed to complete university: %v", err)
|
||||
return
|
||||
}
|
||||
|
||||
log.Printf("[Orchestrator] University %s completed! Professors: %d, Staff: %d, Publications: %d",
|
||||
item.UniversityName, item.ProfessorsCount, item.StaffCount, item.PublicationsCount)
|
||||
}
|
||||
316
edu-search-service/internal/orchestrator/repository.go
Normal file
316
edu-search-service/internal/orchestrator/repository.go
Normal file
@@ -0,0 +1,316 @@
|
||||
// Package orchestrator implements multi-phase university crawling with queue management
|
||||
package orchestrator
|
||||
|
||||
import (
	"context"
	"errors"
	"fmt"
	"time"

	"github.com/google/uuid"
	"github.com/jackc/pgx/v5"
	"github.com/jackc/pgx/v5/pgxpool"
)
|
||||
|
||||
// PostgresRepository implements the Repository interface using PostgreSQL
type PostgresRepository struct {
	// pool is the shared pgx connection pool; every query in this
	// repository runs through it.
	pool *pgxpool.Pool
}
|
||||
|
||||
// NewPostgresRepository creates a new PostgresRepository
|
||||
func NewPostgresRepository(pool *pgxpool.Pool) *PostgresRepository {
|
||||
return &PostgresRepository{pool: pool}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// QUEUE OPERATIONS
|
||||
// ============================================================================
|
||||
|
||||
// GetQueueItems retrieves all items in the crawl queue
//
// Each row joins crawl_queue with universities to pick up the display
// name, and derives a coarse progress percentage from the current phase
// (pending=0, discovery=10, professors=30, all_staff=60, publications=90,
// completed=100). Results are ordered by queue position (NULLs last,
// i.e. inactive items at the end), then by priority descending.
func (r *PostgresRepository) GetQueueItems(ctx context.Context) ([]CrawlQueueItem, error) {
	query := `
		SELECT
			cq.id, cq.university_id, u.name, COALESCE(u.short_name, ''),
			cq.queue_position, cq.priority, cq.current_phase,
			cq.discovery_completed, cq.discovery_completed_at,
			cq.professors_completed, cq.professors_completed_at,
			cq.all_staff_completed, cq.all_staff_completed_at,
			cq.publications_completed, cq.publications_completed_at,
			cq.discovery_count, cq.professors_count, cq.staff_count, cq.publications_count,
			cq.retry_count, cq.max_retries, COALESCE(cq.last_error, ''),
			cq.started_at, cq.completed_at,
			CASE
				WHEN cq.current_phase = 'pending' THEN 0
				WHEN cq.current_phase = 'discovery' THEN 10
				WHEN cq.current_phase = 'professors' THEN 30
				WHEN cq.current_phase = 'all_staff' THEN 60
				WHEN cq.current_phase = 'publications' THEN 90
				WHEN cq.current_phase = 'completed' THEN 100
				ELSE 0
			END as progress_percent,
			cq.created_at, cq.updated_at
		FROM crawl_queue cq
		JOIN universities u ON cq.university_id = u.id
		ORDER BY cq.queue_position NULLS LAST, cq.priority DESC
	`

	rows, err := r.pool.Query(ctx, query)
	if err != nil {
		return nil, fmt.Errorf("failed to query queue items: %w", err)
	}
	defer rows.Close()

	var items []CrawlQueueItem
	for rows.Next() {
		var item CrawlQueueItem
		// Phase is scanned as a raw string and converted to CrawlPhase
		// below; the Scan order must match the SELECT column order exactly.
		var phase string
		if err := rows.Scan(
			&item.ID, &item.UniversityID, &item.UniversityName, &item.UniversityShort,
			&item.QueuePosition, &item.Priority, &phase,
			&item.DiscoveryCompleted, &item.DiscoveryCompletedAt,
			&item.ProfessorsCompleted, &item.ProfessorsCompletedAt,
			&item.AllStaffCompleted, &item.AllStaffCompletedAt,
			&item.PublicationsCompleted, &item.PublicationsCompletedAt,
			&item.DiscoveryCount, &item.ProfessorsCount, &item.StaffCount, &item.PublicationsCount,
			&item.RetryCount, &item.MaxRetries, &item.LastError,
			&item.StartedAt, &item.CompletedAt,
			&item.ProgressPercent,
			&item.CreatedAt, &item.UpdatedAt,
		); err != nil {
			return nil, fmt.Errorf("failed to scan queue item: %w", err)
		}
		item.CurrentPhase = CrawlPhase(phase)
		items = append(items, item)
	}

	// rows.Err surfaces any iteration error that Next() swallowed.
	return items, rows.Err()
}
|
||||
|
||||
// GetNextInQueue retrieves the next item to process
|
||||
func (r *PostgresRepository) GetNextInQueue(ctx context.Context) (*CrawlQueueItem, error) {
|
||||
query := `
|
||||
SELECT
|
||||
cq.id, cq.university_id, u.name, COALESCE(u.short_name, ''),
|
||||
cq.queue_position, cq.priority, cq.current_phase,
|
||||
cq.discovery_completed, cq.discovery_completed_at,
|
||||
cq.professors_completed, cq.professors_completed_at,
|
||||
cq.all_staff_completed, cq.all_staff_completed_at,
|
||||
cq.publications_completed, cq.publications_completed_at,
|
||||
cq.discovery_count, cq.professors_count, cq.staff_count, cq.publications_count,
|
||||
cq.retry_count, cq.max_retries, COALESCE(cq.last_error, ''),
|
||||
cq.started_at, cq.completed_at,
|
||||
cq.created_at, cq.updated_at
|
||||
FROM crawl_queue cq
|
||||
JOIN universities u ON cq.university_id = u.id
|
||||
WHERE cq.current_phase NOT IN ('completed', 'failed', 'paused')
|
||||
AND cq.queue_position IS NOT NULL
|
||||
ORDER BY cq.queue_position ASC, cq.priority DESC
|
||||
LIMIT 1
|
||||
`
|
||||
|
||||
var item CrawlQueueItem
|
||||
var phase string
|
||||
err := r.pool.QueryRow(ctx, query).Scan(
|
||||
&item.ID, &item.UniversityID, &item.UniversityName, &item.UniversityShort,
|
||||
&item.QueuePosition, &item.Priority, &phase,
|
||||
&item.DiscoveryCompleted, &item.DiscoveryCompletedAt,
|
||||
&item.ProfessorsCompleted, &item.ProfessorsCompletedAt,
|
||||
&item.AllStaffCompleted, &item.AllStaffCompletedAt,
|
||||
&item.PublicationsCompleted, &item.PublicationsCompletedAt,
|
||||
&item.DiscoveryCount, &item.ProfessorsCount, &item.StaffCount, &item.PublicationsCount,
|
||||
&item.RetryCount, &item.MaxRetries, &item.LastError,
|
||||
&item.StartedAt, &item.CompletedAt,
|
||||
&item.CreatedAt, &item.UpdatedAt,
|
||||
)
|
||||
|
||||
if err == pgx.ErrNoRows {
|
||||
return nil, nil
|
||||
}
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get next queue item: %w", err)
|
||||
}
|
||||
|
||||
item.CurrentPhase = CrawlPhase(phase)
|
||||
return &item, nil
|
||||
}
|
||||
|
||||
// AddToQueue adds a university to the crawl queue
|
||||
func (r *PostgresRepository) AddToQueue(ctx context.Context, universityID uuid.UUID, priority int, initiatedBy string) (*CrawlQueueItem, error) {
|
||||
// Get next queue position
|
||||
var nextPosition int
|
||||
err := r.pool.QueryRow(ctx, `SELECT COALESCE(MAX(queue_position), 0) + 1 FROM crawl_queue WHERE queue_position IS NOT NULL`).Scan(&nextPosition)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get next queue position: %w", err)
|
||||
}
|
||||
|
||||
query := `
|
||||
INSERT INTO crawl_queue (university_id, queue_position, priority, initiated_by)
|
||||
VALUES ($1, $2, $3, $4)
|
||||
ON CONFLICT (university_id) DO UPDATE SET
|
||||
queue_position = EXCLUDED.queue_position,
|
||||
priority = EXCLUDED.priority,
|
||||
current_phase = 'pending',
|
||||
retry_count = 0,
|
||||
last_error = NULL,
|
||||
updated_at = NOW()
|
||||
RETURNING id, created_at, updated_at
|
||||
`
|
||||
|
||||
item := &CrawlQueueItem{
|
||||
UniversityID: universityID,
|
||||
QueuePosition: &nextPosition,
|
||||
Priority: priority,
|
||||
CurrentPhase: PhasePending,
|
||||
MaxRetries: 3,
|
||||
}
|
||||
|
||||
err = r.pool.QueryRow(ctx, query, universityID, nextPosition, priority, initiatedBy).Scan(
|
||||
&item.ID, &item.CreatedAt, &item.UpdatedAt,
|
||||
)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to add to queue: %w", err)
|
||||
}
|
||||
|
||||
// Get university name
|
||||
r.pool.QueryRow(ctx, `SELECT name, short_name FROM universities WHERE id = $1`, universityID).Scan(
|
||||
&item.UniversityName, &item.UniversityShort,
|
||||
)
|
||||
|
||||
return item, nil
|
||||
}
|
||||
|
||||
// RemoveFromQueue removes a university from the queue
|
||||
func (r *PostgresRepository) RemoveFromQueue(ctx context.Context, universityID uuid.UUID) error {
|
||||
_, err := r.pool.Exec(ctx, `DELETE FROM crawl_queue WHERE university_id = $1`, universityID)
|
||||
return err
|
||||
}
|
||||
|
||||
// UpdateQueueItem updates a queue item
//
// Persists every mutable field of the item keyed by university_id. The
// positional arguments in Exec must stay in lockstep with the $2..$20
// placeholders below. Note that max_retries is NOT part of this UPDATE —
// presumably it is fixed at insert time; verify before relying on
// item.MaxRetries changes being saved.
func (r *PostgresRepository) UpdateQueueItem(ctx context.Context, item *CrawlQueueItem) error {
	query := `
		UPDATE crawl_queue SET
			queue_position = $2,
			priority = $3,
			current_phase = $4,
			discovery_completed = $5,
			discovery_completed_at = $6,
			professors_completed = $7,
			professors_completed_at = $8,
			all_staff_completed = $9,
			all_staff_completed_at = $10,
			publications_completed = $11,
			publications_completed_at = $12,
			discovery_count = $13,
			professors_count = $14,
			staff_count = $15,
			publications_count = $16,
			retry_count = $17,
			last_error = $18,
			started_at = $19,
			completed_at = $20,
			updated_at = NOW()
		WHERE university_id = $1
	`

	_, err := r.pool.Exec(ctx, query,
		item.UniversityID,
		item.QueuePosition, item.Priority, string(item.CurrentPhase),
		item.DiscoveryCompleted, item.DiscoveryCompletedAt,
		item.ProfessorsCompleted, item.ProfessorsCompletedAt,
		item.AllStaffCompleted, item.AllStaffCompletedAt,
		item.PublicationsCompleted, item.PublicationsCompletedAt,
		item.DiscoveryCount, item.ProfessorsCount, item.StaffCount, item.PublicationsCount,
		item.RetryCount, item.LastError,
		item.StartedAt, item.CompletedAt,
	)
	return err
}
|
||||
|
||||
// PauseQueueItem pauses a crawl
|
||||
func (r *PostgresRepository) PauseQueueItem(ctx context.Context, universityID uuid.UUID) error {
|
||||
_, err := r.pool.Exec(ctx, `UPDATE crawl_queue SET current_phase = 'paused', updated_at = NOW() WHERE university_id = $1`, universityID)
|
||||
return err
|
||||
}
|
||||
|
||||
// ResumeQueueItem resumes a paused crawl
//
// The new phase is derived from the per-phase completion flags: the first
// phase whose flag is still false becomes the current phase. Only rows
// currently in the 'paused' phase are touched.
//
// NOTE(review): when all four flags are true the ELSE branch resets the
// item to 'pending', which would re-crawl a fully completed university —
// confirm this is intended rather than 'completed'.
func (r *PostgresRepository) ResumeQueueItem(ctx context.Context, universityID uuid.UUID) error {
	// Determine what phase to resume from
	query := `
		UPDATE crawl_queue SET
			current_phase = CASE
				WHEN NOT discovery_completed THEN 'discovery'
				WHEN NOT professors_completed THEN 'professors'
				WHEN NOT all_staff_completed THEN 'all_staff'
				WHEN NOT publications_completed THEN 'publications'
				ELSE 'pending'
			END,
			updated_at = NOW()
		WHERE university_id = $1 AND current_phase = 'paused'
	`
	_, err := r.pool.Exec(ctx, query, universityID)
	return err
}
|
||||
|
||||
// ============================================================================
|
||||
// PHASE UPDATES
|
||||
// ============================================================================
|
||||
|
||||
// CompletePhase marks a phase as completed
|
||||
func (r *PostgresRepository) CompletePhase(ctx context.Context, universityID uuid.UUID, phase CrawlPhase, count int) error {
|
||||
now := time.Now()
|
||||
var query string
|
||||
|
||||
switch phase {
|
||||
case PhaseDiscovery:
|
||||
query = `UPDATE crawl_queue SET discovery_completed = true, discovery_completed_at = $2, discovery_count = $3, updated_at = NOW() WHERE university_id = $1`
|
||||
case PhaseProfessors:
|
||||
query = `UPDATE crawl_queue SET professors_completed = true, professors_completed_at = $2, professors_count = $3, updated_at = NOW() WHERE university_id = $1`
|
||||
case PhaseAllStaff:
|
||||
query = `UPDATE crawl_queue SET all_staff_completed = true, all_staff_completed_at = $2, staff_count = $3, updated_at = NOW() WHERE university_id = $1`
|
||||
case PhasePublications:
|
||||
query = `UPDATE crawl_queue SET publications_completed = true, publications_completed_at = $2, publications_count = $3, updated_at = NOW() WHERE university_id = $1`
|
||||
default:
|
||||
return fmt.Errorf("unknown phase: %s", phase)
|
||||
}
|
||||
|
||||
_, err := r.pool.Exec(ctx, query, universityID, now, count)
|
||||
return err
|
||||
}
|
||||
|
||||
// FailPhase records a phase failure
|
||||
func (r *PostgresRepository) FailPhase(ctx context.Context, universityID uuid.UUID, phase CrawlPhase, errMsg string) error {
|
||||
query := `
|
||||
UPDATE crawl_queue SET
|
||||
retry_count = retry_count + 1,
|
||||
last_error = $2,
|
||||
current_phase = CASE
|
||||
WHEN retry_count + 1 >= max_retries THEN 'failed'
|
||||
ELSE current_phase
|
||||
END,
|
||||
updated_at = NOW()
|
||||
WHERE university_id = $1
|
||||
`
|
||||
_, err := r.pool.Exec(ctx, query, universityID, errMsg)
|
||||
return err
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// STATS
|
||||
// ============================================================================
|
||||
|
||||
// GetCompletedTodayCount returns the number of universities completed today
|
||||
func (r *PostgresRepository) GetCompletedTodayCount(ctx context.Context) (int, error) {
|
||||
var count int
|
||||
err := r.pool.QueryRow(ctx, `
|
||||
SELECT COUNT(*) FROM crawl_queue
|
||||
WHERE current_phase = 'completed'
|
||||
AND completed_at >= CURRENT_DATE
|
||||
`).Scan(&count)
|
||||
return count, err
|
||||
}
|
||||
|
||||
// GetTotalProcessedCount returns the total number of processed universities
|
||||
func (r *PostgresRepository) GetTotalProcessedCount(ctx context.Context) (int, error) {
|
||||
var count int
|
||||
err := r.pool.QueryRow(ctx, `SELECT COUNT(*) FROM crawl_queue WHERE current_phase = 'completed'`).Scan(&count)
|
||||
return count, err
|
||||
}
|
||||
Reference in New Issue
Block a user