breakpilot-lehrer/edu-search-service/internal/api/handlers/ai_extraction_handlers.go

package handlers

import (
	"net/http"
	"time"

	"github.com/gin-gonic/gin"
	"github.com/google/uuid"

	"github.com/breakpilot/edu-search-service/internal/database"
)

// AIExtractionHandlers handles AI-based profile extraction endpoints.
// These endpoints are designed for vast.ai or similar AI services to:
//  1. Get profile URLs that need extraction
//  2. Submit extracted data back
type AIExtractionHandlers struct {
	// repo is the database repository used to look up and store staff records.
	repo *database.Repository
}

// NewAIExtractionHandlers builds an AIExtractionHandlers value whose
// endpoints operate on the given repository.
func NewAIExtractionHandlers(repo *database.Repository) *AIExtractionHandlers {
	handlers := new(AIExtractionHandlers)
	handlers.repo = repo
	return handlers
}

// ProfileExtractionTask represents a profile URL to be processed by AI.
// It is the element type of the "tasks" array returned by GetPendingProfiles.
type ProfileExtractionTask struct {
	// StaffID is the ID of the staff record the profile belongs to.
	StaffID uuid.UUID `json:"staff_id"`
	// ProfileURL is the profile page to extract data from.
	ProfileURL string `json:"profile_url"`
	// UniversityID is the ID of the university the staff record belongs to.
	UniversityID uuid.UUID `json:"university_id"`
	// UniversityURL is optional and omitted from the JSON when empty.
	// NOTE(review): GetPendingProfiles in this file never populates it.
	UniversityURL string `json:"university_url,omitempty"`
	// FullName is the staff member's name, when one is stored.
	FullName string `json:"full_name,omitempty"`
	// CurrentData carries the values already stored for the staff record.
	// The object itself is always emitted; its empty members are omitted.
	CurrentData struct {
		Email      string `json:"email,omitempty"`
		Phone      string `json:"phone,omitempty"`
		Office     string `json:"office,omitempty"`
		Position   string `json:"position,omitempty"`
		Department string `json:"department,omitempty"`
	} `json:"current_data"`
}

// GetPendingProfiles returns staff profiles that still need AI extraction.
// GET /api/v1/ai/extraction/pending?limit=10&university_id=...
//
// Query parameters:
//   - limit: maximum number of tasks to return. Values below 1 fall back to
//     the default of 10; values above 100 are capped at 100.
//   - university_id: optional UUID restricting the search to one university.
//     A malformed value is rejected with 400 Bad Request instead of being
//     silently ignored.
//
// A staff record becomes a task when it has a non-empty profile URL and no
// e-mail address. The response is {"tasks": [...], "total": n}; "tasks" is
// always a JSON array, never null. A repository error yields 500.
func (h *AIExtractionHandlers) GetPendingProfiles(c *gin.Context) {
	const (
		defaultLimit = 10
		maxLimit     = 100
	)

	limit := parseIntDefault(c.Query("limit"), defaultLimit)
	switch {
	case limit < 1:
		// A zero or negative limit would make the loop below stop after
		// the first task; treat it as "not specified".
		limit = defaultLimit
	case limit > maxLimit:
		limit = maxLimit
	}

	var universityID *uuid.UUID
	if raw := c.Query("university_id"); raw != "" {
		id, err := uuid.Parse(raw)
		if err != nil {
			// Dropping the filter would return profiles of every
			// university, which is not what the caller asked for.
			c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid university_id: " + err.Error()})
			return
		}
		universityID = &id
	}

	// Fetch twice the requested amount because records are filtered below.
	params := database.StaffSearchParams{
		UniversityID: universityID,
		Limit:        limit * 2,
	}

	result, err := h.repo.SearchStaff(c.Request.Context(), params)
	if err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
		return
	}

	// Allocate a non-nil slice so an empty result is encoded as [] and not
	// as null.
	tasks := make([]ProfileExtractionTask, 0, limit)
	for _, staff := range result.Staff {
		// Without a profile URL there is nothing to extract from.
		if staff.ProfileURL == nil || *staff.ProfileURL == "" {
			continue
		}
		// Only records that are still missing an e-mail address are queued.
		if staff.Email != nil && *staff.Email != "" {
			continue
		}

		task := ProfileExtractionTask{
			StaffID:      staff.ID,
			ProfileURL:   *staff.ProfileURL,
			UniversityID: staff.UniversityID,
		}

		if staff.FullName != nil {
			task.FullName = *staff.FullName
		}
		// CurrentData.Email stays empty: a task is only created when the
		// record has no e-mail address.
		if staff.Phone != nil {
			task.CurrentData.Phone = *staff.Phone
		}
		if staff.Office != nil {
			task.CurrentData.Office = *staff.Office
		}
		if staff.Position != nil {
			task.CurrentData.Position = *staff.Position
		}
		if staff.DepartmentName != nil {
			task.CurrentData.Department = *staff.DepartmentName
		}

		tasks = append(tasks, task)
		if len(tasks) >= limit {
			break
		}
	}

	c.JSON(http.StatusOK, gin.H{
		"tasks": tasks,
		"total": len(tasks),
	})
}

// ExtractedProfileData represents data extracted by AI from a profile page.
// It is the JSON request body of SubmitExtractedData. Only staff_id is
// required; every other field may be left out.
type ExtractedProfileData struct {
	// StaffID identifies the staff record the extracted data belongs to.
	StaffID uuid.UUID `json:"staff_id" binding:"required"`

	// Contact info
	Email  string `json:"email,omitempty"`
	Phone  string `json:"phone,omitempty"`
	Office string `json:"office,omitempty"`

	// Professional info
	Position      string `json:"position,omitempty"`
	PositionType  string `json:"position_type,omitempty"` // professor, researcher, phd_student, staff
	AcademicTitle string `json:"academic_title,omitempty"`
	// IsProfessor is a pointer so that an absent value can be told apart
	// from an explicit false.
	IsProfessor    *bool  `json:"is_professor,omitempty"`
	DepartmentName string `json:"department_name,omitempty"`

	// Hierarchy
	SupervisorName string `json:"supervisor_name,omitempty"`
	TeamRole       string `json:"team_role,omitempty"` // leitung, mitarbeiter, sekretariat, hiwi, doktorand

	// Research
	ResearchInterests []string `json:"research_interests,omitempty"`
	ResearchSummary   string   `json:"research_summary,omitempty"`

	// Teaching (courses taught)
	TeachingTopics []string `json:"teaching_topics,omitempty"`

	// External profiles
	ORCID           string `json:"orcid,omitempty"`
	GoogleScholarID string `json:"google_scholar_id,omitempty"`
	ResearchgateURL string `json:"researchgate_url,omitempty"`
	LinkedInURL     string `json:"linkedin_url,omitempty"`
	PersonalWebsite string `json:"personal_website,omitempty"`
	PhotoURL        string `json:"photo_url,omitempty"`

	// Institute/Department links discovered
	InstituteURL  string `json:"institute_url,omitempty"`
	InstituteName string `json:"institute_name,omitempty"`

	// Confidence score (0-1). No binding rule enforces the range.
	Confidence float64 `json:"confidence,omitempty"`
}

// SubmitExtractedData saves AI-extracted profile data.
// POST /api/v1/ai/extraction/submit
//
// The JSON body is bound to ExtractedProfileData. Extracted values only fill
// fields that are currently empty on the staff record; values that are
// already stored are never overwritten. The exception is is_professor, which
// is applied whenever it is present and differs from the stored flag.
//
// When supervisor_name is given and the record has no supervisor yet, the
// name is searched within the same university and the first hit (if it is a
// different person) becomes the supervisor. A failed or empty search is
// ignored on purpose: supervisor resolution is best effort.
//
// department_name, teaching_topics, institute_url, institute_name and
// confidence are accepted but not stored by this handler.
//
// The record's LastVerified timestamp is refreshed and persisted on every
// successful submission, even when no field was filled.
//
// Responses:
//   - 200 {"status": "success", "updated": bool, "staff_id": id}, where
//     "updated" reports whether any profile field changed.
//   - 400 when the body cannot be bound.
//   - 404 when the staff record cannot be loaded.
//   - 500 when saving fails.
func (h *AIExtractionHandlers) SubmitExtractedData(c *gin.Context) {
	var data ExtractedProfileData
	if err := c.ShouldBindJSON(&data); err != nil {
		c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid request: " + err.Error()})
		return
	}

	ctx := c.Request.Context()

	// Get existing staff record
	staff, err := h.repo.GetStaff(ctx, data.StaffID)
	if err != nil {
		c.JSON(http.StatusNotFound, gin.H{"error": "Staff not found"})
		return
	}

	updated := false

	// fillIfEmpty stores val in *dst only when val is non-empty and the
	// current value is nil or empty, and records that a field changed.
	fillIfEmpty := func(dst **string, val string) {
		if val == "" || (*dst != nil && **dst != "") {
			return
		}
		*dst = &val
		updated = true
	}

	fillIfEmpty(&staff.Email, data.Email)
	fillIfEmpty(&staff.Phone, data.Phone)
	fillIfEmpty(&staff.Office, data.Office)
	fillIfEmpty(&staff.Position, data.Position)
	fillIfEmpty(&staff.PositionType, data.PositionType)
	fillIfEmpty(&staff.AcademicTitle, data.AcademicTitle)
	fillIfEmpty(&staff.TeamRole, data.TeamRole)
	fillIfEmpty(&staff.ResearchSummary, data.ResearchSummary)
	fillIfEmpty(&staff.ORCID, data.ORCID)
	fillIfEmpty(&staff.GoogleScholarID, data.GoogleScholarID)
	fillIfEmpty(&staff.ResearchgateURL, data.ResearchgateURL)
	fillIfEmpty(&staff.LinkedInURL, data.LinkedInURL)
	fillIfEmpty(&staff.PersonalWebsite, data.PersonalWebsite)
	fillIfEmpty(&staff.PhotoURL, data.PhotoURL)

	// Only count the flag as an update when it actually changes.
	if data.IsProfessor != nil && staff.IsProfessor != *data.IsProfessor {
		staff.IsProfessor = *data.IsProfessor
		updated = true
	}
	if len(data.ResearchInterests) > 0 && len(staff.ResearchInterests) == 0 {
		staff.ResearchInterests = data.ResearchInterests
		updated = true
	}

	// Try to resolve supervisor by name
	if data.SupervisorName != "" && staff.SupervisorID == nil {
		// Search for supervisor in same university
		supervisorParams := database.StaffSearchParams{
			Query:        data.SupervisorName,
			UniversityID: &staff.UniversityID,
			Limit:        1,
		}
		// Best effort: a search error simply leaves the supervisor unset.
		result, searchErr := h.repo.SearchStaff(ctx, supervisorParams)
		// Never make a staff member their own supervisor.
		if searchErr == nil && len(result.Staff) > 0 && result.Staff[0].ID != staff.ID {
			// Copy the ID so the record does not keep a pointer into the
			// search result slice.
			supervisorID := result.Staff[0].ID
			staff.SupervisorID = &supervisorID
			updated = true
		}
	}

	// Update last verified timestamp
	now := time.Now()
	staff.LastVerified = &now

	// Always persist: even when no field was filled, the refreshed
	// LastVerified timestamp has to be stored.
	// NOTE(review): CreateStaff is used to write an existing record, so it
	// presumably behaves as an upsert - confirm against the repository.
	if err = h.repo.CreateStaff(ctx, staff); err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to update: " + err.Error()})
		return
	}

	c.JSON(http.StatusOK, gin.H{
		"status":   "success",
		"updated":  updated,
		"staff_id": staff.ID,
	})
}

// RegisterRoutes mounts the AI extraction endpoints on r, all under the
// "/ai/extraction" prefix.
func (h *AIExtractionHandlers) RegisterRoutes(r *gin.RouterGroup) {
	extraction := r.Group("/ai/extraction")
	{
		// Staff profile extraction: fetch pending work, submit results.
		extraction.GET("/pending", h.GetPendingProfiles)
		extraction.POST("/submit", h.SubmitExtractedData)
		extraction.POST("/submit-batch", h.SubmitBatchExtractedData)

		// Institute hierarchy: fetch institute pages, submit the hierarchy.
		extraction.GET("/institutes", h.GetInstitutePages)
		extraction.POST("/institutes/submit", h.SubmitInstituteHierarchy)
	}
}