package handlers import ( "net/http" "time" "github.com/gin-gonic/gin" "github.com/google/uuid" "github.com/breakpilot/edu-search-service/internal/database" ) // AIExtractionHandlers handles AI-based profile extraction endpoints // These endpoints are designed for vast.ai or similar AI services to: // 1. Get profile URLs that need extraction // 2. Submit extracted data back type AIExtractionHandlers struct { repo *database.Repository } // NewAIExtractionHandlers creates new AI extraction handlers func NewAIExtractionHandlers(repo *database.Repository) *AIExtractionHandlers { return &AIExtractionHandlers{repo: repo} } // ProfileExtractionTask represents a profile URL to be processed by AI type ProfileExtractionTask struct { StaffID uuid.UUID `json:"staff_id"` ProfileURL string `json:"profile_url"` UniversityID uuid.UUID `json:"university_id"` UniversityURL string `json:"university_url,omitempty"` FullName string `json:"full_name,omitempty"` CurrentData struct { Email string `json:"email,omitempty"` Phone string `json:"phone,omitempty"` Office string `json:"office,omitempty"` Position string `json:"position,omitempty"` Department string `json:"department,omitempty"` } `json:"current_data"` } // GetPendingProfiles returns staff profiles that need AI extraction // GET /api/v1/ai/extraction/pending?limit=10&university_id=... func (h *AIExtractionHandlers) GetPendingProfiles(c *gin.Context) { limit := parseIntDefault(c.Query("limit"), 10) if limit > 100 { limit = 100 } var universityID *uuid.UUID if uniIDStr := c.Query("university_id"); uniIDStr != "" { id, err := uuid.Parse(uniIDStr) if err == nil { universityID = &id } } // Get staff that have profile URLs but missing key data params := database.StaffSearchParams{ UniversityID: universityID, Limit: limit * 2, // Get more to filter } result, err := h.repo.SearchStaff(c.Request.Context(), params) if err != nil { c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) return } // Filter to only include profiles that need extraction var tasks []ProfileExtractionTask for _, staff := range result.Staff { // Skip if no profile URL if staff.ProfileURL == nil || *staff.ProfileURL == "" { continue } // Include if missing email or other important data needsExtraction := staff.Email == nil || *staff.Email == "" if needsExtraction { task := ProfileExtractionTask{ StaffID: staff.ID, ProfileURL: *staff.ProfileURL, UniversityID: staff.UniversityID, } if staff.FullName != nil { task.FullName = *staff.FullName } if staff.Email != nil { task.CurrentData.Email = *staff.Email } if staff.Phone != nil { task.CurrentData.Phone = *staff.Phone } if staff.Office != nil { task.CurrentData.Office = *staff.Office } if staff.Position != nil { task.CurrentData.Position = *staff.Position } if staff.DepartmentName != nil { task.CurrentData.Department = *staff.DepartmentName } tasks = append(tasks, task) if len(tasks) >= limit { break } } } c.JSON(http.StatusOK, gin.H{ "tasks": tasks, "total": len(tasks), }) } // ExtractedProfileData represents data extracted by AI from a profile page type ExtractedProfileData struct { StaffID uuid.UUID `json:"staff_id" binding:"required"` // Contact info Email string `json:"email,omitempty"` Phone string `json:"phone,omitempty"` Office string `json:"office,omitempty"` // Professional info Position string `json:"position,omitempty"` PositionType string `json:"position_type,omitempty"` // professor, researcher, phd_student, staff AcademicTitle string `json:"academic_title,omitempty"` IsProfessor *bool `json:"is_professor,omitempty"` DepartmentName string `json:"department_name,omitempty"` // Hierarchy SupervisorName string `json:"supervisor_name,omitempty"` TeamRole string `json:"team_role,omitempty"` // leitung, mitarbeiter, sekretariat, hiwi, doktorand // Research ResearchInterests []string `json:"research_interests,omitempty"` ResearchSummary string `json:"research_summary,omitempty"` // Teaching (Lehrveranstaltungen) TeachingTopics []string `json:"teaching_topics,omitempty"` // External profiles ORCID string `json:"orcid,omitempty"` GoogleScholarID string `json:"google_scholar_id,omitempty"` ResearchgateURL string `json:"researchgate_url,omitempty"` LinkedInURL string `json:"linkedin_url,omitempty"` PersonalWebsite string `json:"personal_website,omitempty"` PhotoURL string `json:"photo_url,omitempty"` // Institute/Department links discovered InstituteURL string `json:"institute_url,omitempty"` InstituteName string `json:"institute_name,omitempty"` // Confidence score (0-1) Confidence float64 `json:"confidence,omitempty"` } // SubmitExtractedData saves AI-extracted profile data // POST /api/v1/ai/extraction/submit func (h *AIExtractionHandlers) SubmitExtractedData(c *gin.Context) { var data ExtractedProfileData if err := c.ShouldBindJSON(&data); err != nil { c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid request: " + err.Error()}) return } // Get existing staff record staff, err := h.repo.GetStaff(c.Request.Context(), data.StaffID) if err != nil { c.JSON(http.StatusNotFound, gin.H{"error": "Staff not found"}) return } // Update fields if provided and not empty updated := false if data.Email != "" && (staff.Email == nil || *staff.Email == "") { staff.Email = &data.Email updated = true } if data.Phone != "" && (staff.Phone == nil || *staff.Phone == "") { staff.Phone = &data.Phone updated = true } if data.Office != "" && (staff.Office == nil || *staff.Office == "") { staff.Office = &data.Office updated = true } if data.Position != "" && (staff.Position == nil || *staff.Position == "") { staff.Position = &data.Position updated = true } if data.PositionType != "" && (staff.PositionType == nil || *staff.PositionType == "") { staff.PositionType = &data.PositionType updated = true } if data.AcademicTitle != "" && (staff.AcademicTitle == nil || *staff.AcademicTitle == "") { staff.AcademicTitle = &data.AcademicTitle updated = true } if data.IsProfessor != nil { staff.IsProfessor = *data.IsProfessor updated = true } if data.TeamRole != "" && (staff.TeamRole == nil || *staff.TeamRole == "") { staff.TeamRole = &data.TeamRole updated = true } if len(data.ResearchInterests) > 0 && len(staff.ResearchInterests) == 0 { staff.ResearchInterests = data.ResearchInterests updated = true } if data.ResearchSummary != "" && (staff.ResearchSummary == nil || *staff.ResearchSummary == "") { staff.ResearchSummary = &data.ResearchSummary updated = true } if data.ORCID != "" && (staff.ORCID == nil || *staff.ORCID == "") { staff.ORCID = &data.ORCID updated = true } if data.GoogleScholarID != "" && (staff.GoogleScholarID == nil || *staff.GoogleScholarID == "") { staff.GoogleScholarID = &data.GoogleScholarID updated = true } if data.ResearchgateURL != "" && (staff.ResearchgateURL == nil || *staff.ResearchgateURL == "") { staff.ResearchgateURL = &data.ResearchgateURL updated = true } if data.LinkedInURL != "" && (staff.LinkedInURL == nil || *staff.LinkedInURL == "") { staff.LinkedInURL = &data.LinkedInURL updated = true } if data.PersonalWebsite != "" && (staff.PersonalWebsite == nil || *staff.PersonalWebsite == "") { staff.PersonalWebsite = &data.PersonalWebsite updated = true } if data.PhotoURL != "" && (staff.PhotoURL == nil || *staff.PhotoURL == "") { staff.PhotoURL = &data.PhotoURL updated = true } // Try to resolve supervisor by name if data.SupervisorName != "" && staff.SupervisorID == nil { // Search for supervisor in same university supervisorParams := database.StaffSearchParams{ Query: data.SupervisorName, UniversityID: &staff.UniversityID, Limit: 1, } result, err := h.repo.SearchStaff(c.Request.Context(), supervisorParams) if err == nil && len(result.Staff) > 0 { staff.SupervisorID = &result.Staff[0].ID updated = true } } // Update last verified timestamp now := time.Now() staff.LastVerified = &now if updated { err = h.repo.CreateStaff(c.Request.Context(), staff) if err != nil { c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to update: " + err.Error()}) return } } c.JSON(http.StatusOK, gin.H{ "status": "success", "updated": updated, "staff_id": staff.ID, }) } // SubmitBatchExtractedData saves multiple AI-extracted profile data items // POST /api/v1/ai/extraction/submit-batch func (h *AIExtractionHandlers) SubmitBatchExtractedData(c *gin.Context) { var batch struct { Items []ExtractedProfileData `json:"items" binding:"required"` } if err := c.ShouldBindJSON(&batch); err != nil { c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid request: " + err.Error()}) return } results := make([]gin.H, 0, len(batch.Items)) successCount := 0 errorCount := 0 for _, item := range batch.Items { // Get existing staff record staff, err := h.repo.GetStaff(c.Request.Context(), item.StaffID) if err != nil { results = append(results, gin.H{ "staff_id": item.StaffID, "status": "error", "error": "Staff not found", }) errorCount++ continue } // Apply updates (same logic as single submit) updated := false if item.Email != "" && (staff.Email == nil || *staff.Email == "") { staff.Email = &item.Email updated = true } if item.Phone != "" && (staff.Phone == nil || *staff.Phone == "") { staff.Phone = &item.Phone updated = true } if item.Office != "" && (staff.Office == nil || *staff.Office == "") { staff.Office = &item.Office updated = true } if item.Position != "" && (staff.Position == nil || *staff.Position == "") { staff.Position = &item.Position updated = true } if item.PositionType != "" && (staff.PositionType == nil || *staff.PositionType == "") { staff.PositionType = &item.PositionType updated = true } if item.TeamRole != "" && (staff.TeamRole == nil || *staff.TeamRole == "") { staff.TeamRole = &item.TeamRole updated = true } if len(item.ResearchInterests) > 0 && len(staff.ResearchInterests) == 0 { staff.ResearchInterests = item.ResearchInterests updated = true } if item.ORCID != "" && (staff.ORCID == nil || *staff.ORCID == "") { staff.ORCID = &item.ORCID updated = true } // Update last verified now := time.Now() staff.LastVerified = &now if updated { err = h.repo.CreateStaff(c.Request.Context(), staff) if err != nil { results = append(results, gin.H{ "staff_id": item.StaffID, "status": "error", "error": err.Error(), }) errorCount++ continue } } results = append(results, gin.H{ "staff_id": item.StaffID, "status": "success", "updated": updated, }) successCount++ } c.JSON(http.StatusOK, gin.H{ "results": results, "success_count": successCount, "error_count": errorCount, "total": len(batch.Items), }) } // InstituteHierarchyTask represents an institute page to crawl for hierarchy type InstituteHierarchyTask struct { InstituteURL string `json:"institute_url"` InstituteName string `json:"institute_name,omitempty"` UniversityID uuid.UUID `json:"university_id"` } // GetInstitutePages returns institute pages that need hierarchy crawling // GET /api/v1/ai/extraction/institutes?university_id=... func (h *AIExtractionHandlers) GetInstitutePages(c *gin.Context) { var universityID *uuid.UUID if uniIDStr := c.Query("university_id"); uniIDStr != "" { id, err := uuid.Parse(uniIDStr) if err == nil { universityID = &id } } // Get unique institute/department URLs from staff profiles params := database.StaffSearchParams{ UniversityID: universityID, Limit: 1000, } result, err := h.repo.SearchStaff(c.Request.Context(), params) if err != nil { c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) return } // Collect unique source URLs (these are typically department pages) urlSet := make(map[string]bool) var tasks []InstituteHierarchyTask for _, staff := range result.Staff { if staff.SourceURL != nil && *staff.SourceURL != "" { url := *staff.SourceURL if !urlSet[url] { urlSet[url] = true tasks = append(tasks, InstituteHierarchyTask{ InstituteURL: url, UniversityID: staff.UniversityID, }) } } } c.JSON(http.StatusOK, gin.H{ "institutes": tasks, "total": len(tasks), }) } // InstituteHierarchyData represents hierarchy data extracted from an institute page type InstituteHierarchyData struct { InstituteURL string `json:"institute_url" binding:"required"` UniversityID uuid.UUID `json:"university_id" binding:"required"` InstituteName string `json:"institute_name,omitempty"` // Leadership LeaderName string `json:"leader_name,omitempty"` LeaderTitle string `json:"leader_title,omitempty"` // e.g., "Professor", "Lehrstuhlinhaber" // Staff organization StaffGroups []struct { Role string `json:"role"` // e.g., "Leitung", "Wissenschaftliche Mitarbeiter", "Sekretariat" Members []string `json:"members"` // Names of people in this group } `json:"staff_groups,omitempty"` // Teaching info (Lehrveranstaltungen) TeachingCourses []struct { Title string `json:"title"` Teacher string `json:"teacher,omitempty"` } `json:"teaching_courses,omitempty"` } // SubmitInstituteHierarchy saves hierarchy data from an institute page // POST /api/v1/ai/extraction/institutes/submit func (h *AIExtractionHandlers) SubmitInstituteHierarchy(c *gin.Context) { var data InstituteHierarchyData if err := c.ShouldBindJSON(&data); err != nil { c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid request: " + err.Error()}) return } // Find or create department dept := &database.Department{ UniversityID: data.UniversityID, Name: data.InstituteName, } if data.InstituteURL != "" { dept.URL = &data.InstituteURL } err := h.repo.CreateDepartment(c.Request.Context(), dept) if err != nil { c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to create department: " + err.Error()}) return } // Find leader and set as supervisor for all staff in this institute var leaderID *uuid.UUID if data.LeaderName != "" { // Search for leader leaderParams := database.StaffSearchParams{ Query: data.LeaderName, UniversityID: &data.UniversityID, Limit: 1, } result, err := h.repo.SearchStaff(c.Request.Context(), leaderParams) if err == nil && len(result.Staff) > 0 { leaderID = &result.Staff[0].ID // Update leader with department and role leader := &result.Staff[0] leader.DepartmentID = &dept.ID roleLeitung := "leitung" leader.TeamRole = &roleLeitung leader.IsProfessor = true if data.LeaderTitle != "" { leader.AcademicTitle = &data.LeaderTitle } h.repo.CreateStaff(c.Request.Context(), leader) } } // Process staff groups updatedCount := 0 for _, group := range data.StaffGroups { for _, memberName := range group.Members { // Find staff member memberParams := database.StaffSearchParams{ Query: memberName, UniversityID: &data.UniversityID, Limit: 1, } result, err := h.repo.SearchStaff(c.Request.Context(), memberParams) if err != nil || len(result.Staff) == 0 { continue } member := &result.Staff[0] member.DepartmentID = &dept.ID member.TeamRole = &group.Role // Set supervisor if leader was found and this is not the leader if leaderID != nil && member.ID != *leaderID { member.SupervisorID = leaderID } h.repo.CreateStaff(c.Request.Context(), member) updatedCount++ } } c.JSON(http.StatusOK, gin.H{ "status": "success", "department_id": dept.ID, "leader_id": leaderID, "members_updated": updatedCount, }) } // RegisterAIExtractionRoutes registers AI extraction routes func (h *AIExtractionHandlers) RegisterRoutes(r *gin.RouterGroup) { ai := r.Group("/ai/extraction") // Profile extraction endpoints ai.GET("/pending", h.GetPendingProfiles) ai.POST("/submit", h.SubmitExtractedData) ai.POST("/submit-batch", h.SubmitBatchExtractedData) // Institute hierarchy endpoints ai.GET("/institutes", h.GetInstitutePages) ai.POST("/institutes/submit", h.SubmitInstituteHierarchy) }