Files
breakpilot-lehrer/edu-search-service/internal/api/handlers/ai_extraction_handlers.go
Benjamin Admin 9ba420fa91
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
Fix: Remove broken getKlausurApiUrl and clean up empty lines
sed replacement left orphaned hostname references in story page
and empty lines in getApiBase functions.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-24 16:02:04 +02:00

294 lines
8.9 KiB
Go

package handlers
import (
"net/http"
"time"
"github.com/gin-gonic/gin"
"github.com/google/uuid"
"github.com/breakpilot/edu-search-service/internal/database"
)
// AIExtractionHandlers groups the HTTP handlers for AI-based profile
// extraction. The endpoints are meant to be consumed by an external AI
// service (vast.ai or similar), which
//
//  1. fetches the profile URLs that still need extraction, and
//  2. posts the extracted data back.
type AIExtractionHandlers struct {
	// repo is the database repository the handlers read from and write to.
	repo *database.Repository
}
// NewAIExtractionHandlers returns an AIExtractionHandlers that uses repo
// for all of its database access.
func NewAIExtractionHandlers(repo *database.Repository) *AIExtractionHandlers {
	handlers := new(AIExtractionHandlers)
	handlers.repo = repo
	return handlers
}
// ProfileExtractionTask is one unit of work handed to the AI service: a staff
// profile URL to process, plus the data already stored for that person.
type ProfileExtractionTask struct {
	// Identity of the staff record and where its profile page lives.
	StaffID       uuid.UUID `json:"staff_id"`
	ProfileURL    string    `json:"profile_url"`
	UniversityID  uuid.UUID `json:"university_id"`
	UniversityURL string    `json:"university_url,omitempty"`
	FullName      string    `json:"full_name,omitempty"`

	// CurrentData carries the values currently on record for the staff
	// member. The object itself is always serialized; its empty members are
	// omitted.
	CurrentData struct {
		Email      string `json:"email,omitempty"`
		Phone      string `json:"phone,omitempty"`
		Office     string `json:"office,omitempty"`
		Position   string `json:"position,omitempty"`
		Department string `json:"department,omitempty"`
	} `json:"current_data"`
}
// GetPendingProfiles returns staff profiles that still need AI extraction.
// GET /api/v1/ai/extraction/pending?limit=10&university_id=...
//
// Query parameters:
//   - limit: maximum number of tasks to return. Non-positive values fall back
//     to 10; values above 100 are capped at 100.
//   - university_id: optional UUID restricting the search to one university.
//     A value that is not a valid UUID is rejected with 400 Bad Request
//     instead of being silently ignored (which would widen the query to all
//     universities).
//
// A staff record qualifies when it has a non-empty profile URL and no email.
// The response body is {"tasks": [...], "total": n}; "tasks" is always a JSON
// array, never null.
func (h *AIExtractionHandlers) GetPendingProfiles(c *gin.Context) {
	const (
		defaultLimit = 10
		maxLimit     = 100
	)

	limit := parseIntDefault(c.Query("limit"), defaultLimit)
	switch {
	case limit < 1:
		limit = defaultLimit
	case limit > maxLimit:
		limit = maxLimit
	}

	var universityID *uuid.UUID
	if raw := c.Query("university_id"); raw != "" {
		id, err := uuid.Parse(raw)
		if err != nil {
			c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid university_id: " + err.Error()})
			return
		}
		universityID = &id
	}

	// Fetch twice the requested amount because records are filtered after
	// the query and some of them will be dropped.
	params := database.StaffSearchParams{
		UniversityID: universityID,
		Limit:        limit * 2,
	}
	result, err := h.repo.SearchStaff(c.Request.Context(), params)
	if err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
		return
	}

	// deref maps a nil string pointer to the empty string.
	deref := func(s *string) string {
		if s == nil {
			return ""
		}
		return *s
	}

	// Start from an empty, non-nil slice so an empty result encodes as []
	// rather than null.
	tasks := make([]ProfileExtractionTask, 0, limit)
	for _, staff := range result.Staff {
		// Without a profile URL there is nothing to extract from.
		if staff.ProfileURL == nil || *staff.ProfileURL == "" {
			continue
		}
		// A record with an email on file does not need extraction.
		if staff.Email != nil && *staff.Email != "" {
			continue
		}

		task := ProfileExtractionTask{
			StaffID:      staff.ID,
			ProfileURL:   *staff.ProfileURL,
			UniversityID: staff.UniversityID,
			FullName:     deref(staff.FullName),
		}
		task.CurrentData.Email = deref(staff.Email)
		task.CurrentData.Phone = deref(staff.Phone)
		task.CurrentData.Office = deref(staff.Office)
		task.CurrentData.Position = deref(staff.Position)
		task.CurrentData.Department = deref(staff.DepartmentName)

		tasks = append(tasks, task)
		if len(tasks) >= limit {
			break
		}
	}

	c.JSON(http.StatusOK, gin.H{
		"tasks": tasks,
		"total": len(tasks),
	})
}
// ExtractedProfileData is the payload the AI service submits after it has
// processed a profile page. Only StaffID is mandatory; every other field is
// optional.
type ExtractedProfileData struct {
	// StaffID identifies the staff record the extracted data belongs to.
	StaffID uuid.UUID `json:"staff_id" binding:"required"`

	// Contact details.
	Email  string `json:"email,omitempty"`
	Phone  string `json:"phone,omitempty"`
	Office string `json:"office,omitempty"`

	// Professional details.
	Position       string `json:"position,omitempty"`
	PositionType   string `json:"position_type,omitempty"` // professor, researcher, phd_student, staff
	AcademicTitle  string `json:"academic_title,omitempty"`
	IsProfessor    *bool  `json:"is_professor,omitempty"`
	DepartmentName string `json:"department_name,omitempty"`

	// Position within the hierarchy.
	SupervisorName string `json:"supervisor_name,omitempty"`
	TeamRole       string `json:"team_role,omitempty"` // leitung, mitarbeiter, sekretariat, hiwi, doktorand

	// Research.
	ResearchInterests []string `json:"research_interests,omitempty"`
	ResearchSummary   string   `json:"research_summary,omitempty"`

	// Teaching (courses taught).
	TeachingTopics []string `json:"teaching_topics,omitempty"`

	// Profiles and pages hosted elsewhere.
	ORCID           string `json:"orcid,omitempty"`
	GoogleScholarID string `json:"google_scholar_id,omitempty"`
	ResearchgateURL string `json:"researchgate_url,omitempty"`
	LinkedInURL     string `json:"linkedin_url,omitempty"`
	PersonalWebsite string `json:"personal_website,omitempty"`
	PhotoURL        string `json:"photo_url,omitempty"`

	// Institute or department links discovered on the page.
	InstituteURL  string `json:"institute_url,omitempty"`
	InstituteName string `json:"institute_name,omitempty"`

	// Confidence is the extraction confidence score (0-1).
	Confidence float64 `json:"confidence,omitempty"`
}
// SubmitExtractedData saves AI-extracted profile data.
// POST /api/v1/ai/extraction/submit
//
// The request body is an ExtractedProfileData JSON object. Extracted values
// only fill gaps: a string field is written when the submitted value is
// non-empty and the stored value is nil or empty, and research interests are
// written only when none are stored. IsProfessor is the exception and
// overrides the stored flag whenever it is supplied with a different value.
//
// If a supervisor name is supplied and no supervisor is linked yet, the
// handler searches the same university for that name and links the first hit,
// unless that hit is the staff member's own record.
//
// The record is always written back so that the refreshed LastVerified
// timestamp is persisted even when no other field changed.
//
// Responses:
//   - 400 when the body cannot be bound to ExtractedProfileData.
//   - 404 when the staff record cannot be loaded.
//   - 500 when the record cannot be saved.
//   - 200 with {"status", "updated", "staff_id"}; "updated" reports whether
//     any profile field (other than LastVerified) changed.
func (h *AIExtractionHandlers) SubmitExtractedData(c *gin.Context) {
	var data ExtractedProfileData
	if err := c.ShouldBindJSON(&data); err != nil {
		c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid request: " + err.Error()})
		return
	}

	ctx := c.Request.Context()

	// Load the existing staff record.
	staff, err := h.repo.GetStaff(ctx, data.StaffID)
	if err != nil {
		c.JSON(http.StatusNotFound, gin.H{"error": "Staff not found"})
		return
	}

	updated := false

	// fill points *dst at src when src holds a value and *dst is nil or
	// empty, and records that the staff record changed.
	fill := func(dst **string, src *string) {
		if *src == "" {
			return
		}
		if *dst != nil && **dst != "" {
			return
		}
		*dst = src
		updated = true
	}

	fill(&staff.Email, &data.Email)
	fill(&staff.Phone, &data.Phone)
	fill(&staff.Office, &data.Office)
	fill(&staff.Position, &data.Position)
	fill(&staff.PositionType, &data.PositionType)
	fill(&staff.AcademicTitle, &data.AcademicTitle)
	fill(&staff.TeamRole, &data.TeamRole)
	fill(&staff.ResearchSummary, &data.ResearchSummary)
	fill(&staff.ORCID, &data.ORCID)
	fill(&staff.GoogleScholarID, &data.GoogleScholarID)
	fill(&staff.ResearchgateURL, &data.ResearchgateURL)
	fill(&staff.LinkedInURL, &data.LinkedInURL)
	fill(&staff.PersonalWebsite, &data.PersonalWebsite)
	fill(&staff.PhotoURL, &data.PhotoURL)

	// An explicit is_professor value overrides the stored flag; only count
	// it as an update when the flag actually flips.
	if data.IsProfessor != nil && staff.IsProfessor != *data.IsProfessor {
		staff.IsProfessor = *data.IsProfessor
		updated = true
	}

	if len(data.ResearchInterests) > 0 && len(staff.ResearchInterests) == 0 {
		staff.ResearchInterests = data.ResearchInterests
		updated = true
	}

	// Try to resolve the supervisor by name within the same university.
	// This is best effort: a failed or empty search leaves the link unset.
	if data.SupervisorName != "" && staff.SupervisorID == nil {
		supervisorParams := database.StaffSearchParams{
			Query:        data.SupervisorName,
			UniversityID: &staff.UniversityID,
			Limit:        1,
		}
		result, searchErr := h.repo.SearchStaff(ctx, supervisorParams)
		if searchErr == nil && len(result.Staff) > 0 {
			// Never link a person as their own supervisor.
			if supervisorID := result.Staff[0].ID; supervisorID != staff.ID {
				staff.SupervisorID = &supervisorID
				updated = true
			}
		}
	}

	// Refresh the last-verified timestamp and persist unconditionally so the
	// timestamp is not lost when no other field changed.
	now := time.Now()
	staff.LastVerified = &now

	if err := h.repo.CreateStaff(ctx, staff); err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to update: " + err.Error()})
		return
	}

	c.JSON(http.StatusOK, gin.H{
		"status":   "success",
		"updated":  updated,
		"staff_id": staff.ID,
	})
}
// RegisterRoutes mounts the AI extraction endpoints on r under the
// "/ai/extraction" prefix.
func (h *AIExtractionHandlers) RegisterRoutes(r *gin.RouterGroup) {
	extraction := r.Group("/ai/extraction")
	{
		// Profile extraction: fetch pending work, submit results.
		extraction.GET("/pending", h.GetPendingProfiles)
		extraction.POST("/submit", h.SubmitExtractedData)
		extraction.POST("/submit-batch", h.SubmitBatchExtractedData)

		// Institute hierarchy: fetch institute pages, submit the hierarchy.
		extraction.GET("/institutes", h.GetInstitutePages)
		extraction.POST("/institutes/submit", h.SubmitInstituteHierarchy)
	}
}