All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Successful in 1m45s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 21s
- edu-search-service von breakpilot-pwa nach breakpilot-lehrer kopiert (ohne vendor) - opensearch + edu-search-service in docker-compose.yml hinzugefuegt - voice-service aus docker-compose.yml entfernt (jetzt in breakpilot-core) - geo-service aus docker-compose.yml entfernt (nicht mehr benoetigt) - CI/CD: edu-search-service zu Gitea Actions und Woodpecker hinzugefuegt (Go lint, test mit go mod download, build, SBOM) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
555 lines
16 KiB
Go
555 lines
16 KiB
Go
package handlers
|
|
|
|
import (
|
|
"net/http"
|
|
"time"
|
|
|
|
"github.com/gin-gonic/gin"
|
|
"github.com/google/uuid"
|
|
|
|
"github.com/breakpilot/edu-search-service/internal/database"
|
|
)
|
|
|
|
// AIExtractionHandlers handles AI-based profile extraction endpoints.
//
// These endpoints are designed for vast.ai or similar AI services to:
//  1. Get profile URLs that need extraction
//  2. Submit extracted data back
type AIExtractionHandlers struct {
	// repo is the repository every handler uses to read and write staff and
	// department records.
	repo *database.Repository
}
|
|
|
|
// NewAIExtractionHandlers creates new AI extraction handlers
|
|
func NewAIExtractionHandlers(repo *database.Repository) *AIExtractionHandlers {
|
|
return &AIExtractionHandlers{repo: repo}
|
|
}
|
|
|
|
// ProfileExtractionTask represents a profile URL to be processed by AI.
// It is the element type of the "tasks" array returned by GetPendingProfiles.
type ProfileExtractionTask struct {
	// StaffID identifies the staff record; the AI service echoes it back
	// when submitting extracted data.
	StaffID uuid.UUID `json:"staff_id"`
	// ProfileURL is the page the AI service should fetch and analyse.
	ProfileURL string `json:"profile_url"`
	// UniversityID is the university the staff record belongs to.
	UniversityID uuid.UUID `json:"university_id"`
	// UniversityURL is optional and omitted from the JSON when empty.
	UniversityURL string `json:"university_url,omitempty"`
	// FullName is the staff member's name as currently stored, if any.
	FullName string `json:"full_name,omitempty"`
	// CurrentData carries the values already stored for the staff record so
	// the extractor can see what is known; empty fields are omitted.
	CurrentData struct {
		Email      string `json:"email,omitempty"`
		Phone      string `json:"phone,omitempty"`
		Office     string `json:"office,omitempty"`
		Position   string `json:"position,omitempty"`
		Department string `json:"department,omitempty"`
	} `json:"current_data"`
}
|
|
|
|
// GetPendingProfiles returns staff profiles that need AI extraction
|
|
// GET /api/v1/ai/extraction/pending?limit=10&university_id=...
|
|
func (h *AIExtractionHandlers) GetPendingProfiles(c *gin.Context) {
|
|
limit := parseIntDefault(c.Query("limit"), 10)
|
|
if limit > 100 {
|
|
limit = 100
|
|
}
|
|
|
|
var universityID *uuid.UUID
|
|
if uniIDStr := c.Query("university_id"); uniIDStr != "" {
|
|
id, err := uuid.Parse(uniIDStr)
|
|
if err == nil {
|
|
universityID = &id
|
|
}
|
|
}
|
|
|
|
// Get staff that have profile URLs but missing key data
|
|
params := database.StaffSearchParams{
|
|
UniversityID: universityID,
|
|
Limit: limit * 2, // Get more to filter
|
|
}
|
|
|
|
result, err := h.repo.SearchStaff(c.Request.Context(), params)
|
|
if err != nil {
|
|
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
|
return
|
|
}
|
|
|
|
// Filter to only include profiles that need extraction
|
|
var tasks []ProfileExtractionTask
|
|
for _, staff := range result.Staff {
|
|
// Skip if no profile URL
|
|
if staff.ProfileURL == nil || *staff.ProfileURL == "" {
|
|
continue
|
|
}
|
|
|
|
// Include if missing email or other important data
|
|
needsExtraction := staff.Email == nil || *staff.Email == ""
|
|
|
|
if needsExtraction {
|
|
task := ProfileExtractionTask{
|
|
StaffID: staff.ID,
|
|
ProfileURL: *staff.ProfileURL,
|
|
UniversityID: staff.UniversityID,
|
|
}
|
|
|
|
if staff.FullName != nil {
|
|
task.FullName = *staff.FullName
|
|
}
|
|
if staff.Email != nil {
|
|
task.CurrentData.Email = *staff.Email
|
|
}
|
|
if staff.Phone != nil {
|
|
task.CurrentData.Phone = *staff.Phone
|
|
}
|
|
if staff.Office != nil {
|
|
task.CurrentData.Office = *staff.Office
|
|
}
|
|
if staff.Position != nil {
|
|
task.CurrentData.Position = *staff.Position
|
|
}
|
|
if staff.DepartmentName != nil {
|
|
task.CurrentData.Department = *staff.DepartmentName
|
|
}
|
|
|
|
tasks = append(tasks, task)
|
|
if len(tasks) >= limit {
|
|
break
|
|
}
|
|
}
|
|
}
|
|
|
|
c.JSON(http.StatusOK, gin.H{
|
|
"tasks": tasks,
|
|
"total": len(tasks),
|
|
})
|
|
}
|
|
|
|
// ExtractedProfileData represents data extracted by AI from a profile page.
// It is the request body of SubmitExtractedData and the element type of the
// "items" array accepted by SubmitBatchExtractedData. Only StaffID is
// required; every other field is optional.
//
// NOTE(review): DepartmentName, TeachingTopics, InstituteURL, InstituteName
// and Confidence are accepted here but are not read by the submit handlers
// visible in this file — confirm whether they are consumed elsewhere.
type ExtractedProfileData struct {
	// StaffID identifies the staff record the extracted data belongs to.
	StaffID uuid.UUID `json:"staff_id" binding:"required"`

	// Contact info
	Email  string `json:"email,omitempty"`
	Phone  string `json:"phone,omitempty"`
	Office string `json:"office,omitempty"`

	// Professional info
	Position      string `json:"position,omitempty"`
	PositionType  string `json:"position_type,omitempty"` // professor, researcher, phd_student, staff
	AcademicTitle string `json:"academic_title,omitempty"`
	// IsProfessor is a pointer so that "not provided" (nil) can be told
	// apart from an explicit false.
	IsProfessor    *bool  `json:"is_professor,omitempty"`
	DepartmentName string `json:"department_name,omitempty"`

	// Hierarchy
	SupervisorName string `json:"supervisor_name,omitempty"`
	TeamRole       string `json:"team_role,omitempty"` // leitung, mitarbeiter, sekretariat, hiwi, doktorand

	// Research
	ResearchInterests []string `json:"research_interests,omitempty"`
	ResearchSummary   string   `json:"research_summary,omitempty"`

	// Teaching (courses, "Lehrveranstaltungen")
	TeachingTopics []string `json:"teaching_topics,omitempty"`

	// External profiles
	ORCID           string `json:"orcid,omitempty"`
	GoogleScholarID string `json:"google_scholar_id,omitempty"`
	ResearchgateURL string `json:"researchgate_url,omitempty"`
	LinkedInURL     string `json:"linkedin_url,omitempty"`
	PersonalWebsite string `json:"personal_website,omitempty"`
	PhotoURL        string `json:"photo_url,omitempty"`

	// Institute/Department links discovered
	InstituteURL  string `json:"institute_url,omitempty"`
	InstituteName string `json:"institute_name,omitempty"`

	// Confidence score (0-1)
	Confidence float64 `json:"confidence,omitempty"`
}
|
|
|
|
// SubmitExtractedData saves AI-extracted profile data
|
|
// POST /api/v1/ai/extraction/submit
|
|
func (h *AIExtractionHandlers) SubmitExtractedData(c *gin.Context) {
|
|
var data ExtractedProfileData
|
|
if err := c.ShouldBindJSON(&data); err != nil {
|
|
c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid request: " + err.Error()})
|
|
return
|
|
}
|
|
|
|
// Get existing staff record
|
|
staff, err := h.repo.GetStaff(c.Request.Context(), data.StaffID)
|
|
if err != nil {
|
|
c.JSON(http.StatusNotFound, gin.H{"error": "Staff not found"})
|
|
return
|
|
}
|
|
|
|
// Update fields if provided and not empty
|
|
updated := false
|
|
|
|
if data.Email != "" && (staff.Email == nil || *staff.Email == "") {
|
|
staff.Email = &data.Email
|
|
updated = true
|
|
}
|
|
if data.Phone != "" && (staff.Phone == nil || *staff.Phone == "") {
|
|
staff.Phone = &data.Phone
|
|
updated = true
|
|
}
|
|
if data.Office != "" && (staff.Office == nil || *staff.Office == "") {
|
|
staff.Office = &data.Office
|
|
updated = true
|
|
}
|
|
if data.Position != "" && (staff.Position == nil || *staff.Position == "") {
|
|
staff.Position = &data.Position
|
|
updated = true
|
|
}
|
|
if data.PositionType != "" && (staff.PositionType == nil || *staff.PositionType == "") {
|
|
staff.PositionType = &data.PositionType
|
|
updated = true
|
|
}
|
|
if data.AcademicTitle != "" && (staff.AcademicTitle == nil || *staff.AcademicTitle == "") {
|
|
staff.AcademicTitle = &data.AcademicTitle
|
|
updated = true
|
|
}
|
|
if data.IsProfessor != nil {
|
|
staff.IsProfessor = *data.IsProfessor
|
|
updated = true
|
|
}
|
|
if data.TeamRole != "" && (staff.TeamRole == nil || *staff.TeamRole == "") {
|
|
staff.TeamRole = &data.TeamRole
|
|
updated = true
|
|
}
|
|
if len(data.ResearchInterests) > 0 && len(staff.ResearchInterests) == 0 {
|
|
staff.ResearchInterests = data.ResearchInterests
|
|
updated = true
|
|
}
|
|
if data.ResearchSummary != "" && (staff.ResearchSummary == nil || *staff.ResearchSummary == "") {
|
|
staff.ResearchSummary = &data.ResearchSummary
|
|
updated = true
|
|
}
|
|
if data.ORCID != "" && (staff.ORCID == nil || *staff.ORCID == "") {
|
|
staff.ORCID = &data.ORCID
|
|
updated = true
|
|
}
|
|
if data.GoogleScholarID != "" && (staff.GoogleScholarID == nil || *staff.GoogleScholarID == "") {
|
|
staff.GoogleScholarID = &data.GoogleScholarID
|
|
updated = true
|
|
}
|
|
if data.ResearchgateURL != "" && (staff.ResearchgateURL == nil || *staff.ResearchgateURL == "") {
|
|
staff.ResearchgateURL = &data.ResearchgateURL
|
|
updated = true
|
|
}
|
|
if data.LinkedInURL != "" && (staff.LinkedInURL == nil || *staff.LinkedInURL == "") {
|
|
staff.LinkedInURL = &data.LinkedInURL
|
|
updated = true
|
|
}
|
|
if data.PersonalWebsite != "" && (staff.PersonalWebsite == nil || *staff.PersonalWebsite == "") {
|
|
staff.PersonalWebsite = &data.PersonalWebsite
|
|
updated = true
|
|
}
|
|
if data.PhotoURL != "" && (staff.PhotoURL == nil || *staff.PhotoURL == "") {
|
|
staff.PhotoURL = &data.PhotoURL
|
|
updated = true
|
|
}
|
|
|
|
// Try to resolve supervisor by name
|
|
if data.SupervisorName != "" && staff.SupervisorID == nil {
|
|
// Search for supervisor in same university
|
|
supervisorParams := database.StaffSearchParams{
|
|
Query: data.SupervisorName,
|
|
UniversityID: &staff.UniversityID,
|
|
Limit: 1,
|
|
}
|
|
result, err := h.repo.SearchStaff(c.Request.Context(), supervisorParams)
|
|
if err == nil && len(result.Staff) > 0 {
|
|
staff.SupervisorID = &result.Staff[0].ID
|
|
updated = true
|
|
}
|
|
}
|
|
|
|
// Update last verified timestamp
|
|
now := time.Now()
|
|
staff.LastVerified = &now
|
|
|
|
if updated {
|
|
err = h.repo.CreateStaff(c.Request.Context(), staff)
|
|
if err != nil {
|
|
c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to update: " + err.Error()})
|
|
return
|
|
}
|
|
}
|
|
|
|
c.JSON(http.StatusOK, gin.H{
|
|
"status": "success",
|
|
"updated": updated,
|
|
"staff_id": staff.ID,
|
|
})
|
|
}
|
|
|
|
// SubmitBatchExtractedData saves multiple AI-extracted profile data items
|
|
// POST /api/v1/ai/extraction/submit-batch
|
|
func (h *AIExtractionHandlers) SubmitBatchExtractedData(c *gin.Context) {
|
|
var batch struct {
|
|
Items []ExtractedProfileData `json:"items" binding:"required"`
|
|
}
|
|
|
|
if err := c.ShouldBindJSON(&batch); err != nil {
|
|
c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid request: " + err.Error()})
|
|
return
|
|
}
|
|
|
|
results := make([]gin.H, 0, len(batch.Items))
|
|
successCount := 0
|
|
errorCount := 0
|
|
|
|
for _, item := range batch.Items {
|
|
// Get existing staff record
|
|
staff, err := h.repo.GetStaff(c.Request.Context(), item.StaffID)
|
|
if err != nil {
|
|
results = append(results, gin.H{
|
|
"staff_id": item.StaffID,
|
|
"status": "error",
|
|
"error": "Staff not found",
|
|
})
|
|
errorCount++
|
|
continue
|
|
}
|
|
|
|
// Apply updates (same logic as single submit)
|
|
updated := false
|
|
|
|
if item.Email != "" && (staff.Email == nil || *staff.Email == "") {
|
|
staff.Email = &item.Email
|
|
updated = true
|
|
}
|
|
if item.Phone != "" && (staff.Phone == nil || *staff.Phone == "") {
|
|
staff.Phone = &item.Phone
|
|
updated = true
|
|
}
|
|
if item.Office != "" && (staff.Office == nil || *staff.Office == "") {
|
|
staff.Office = &item.Office
|
|
updated = true
|
|
}
|
|
if item.Position != "" && (staff.Position == nil || *staff.Position == "") {
|
|
staff.Position = &item.Position
|
|
updated = true
|
|
}
|
|
if item.PositionType != "" && (staff.PositionType == nil || *staff.PositionType == "") {
|
|
staff.PositionType = &item.PositionType
|
|
updated = true
|
|
}
|
|
if item.TeamRole != "" && (staff.TeamRole == nil || *staff.TeamRole == "") {
|
|
staff.TeamRole = &item.TeamRole
|
|
updated = true
|
|
}
|
|
if len(item.ResearchInterests) > 0 && len(staff.ResearchInterests) == 0 {
|
|
staff.ResearchInterests = item.ResearchInterests
|
|
updated = true
|
|
}
|
|
if item.ORCID != "" && (staff.ORCID == nil || *staff.ORCID == "") {
|
|
staff.ORCID = &item.ORCID
|
|
updated = true
|
|
}
|
|
|
|
// Update last verified
|
|
now := time.Now()
|
|
staff.LastVerified = &now
|
|
|
|
if updated {
|
|
err = h.repo.CreateStaff(c.Request.Context(), staff)
|
|
if err != nil {
|
|
results = append(results, gin.H{
|
|
"staff_id": item.StaffID,
|
|
"status": "error",
|
|
"error": err.Error(),
|
|
})
|
|
errorCount++
|
|
continue
|
|
}
|
|
}
|
|
|
|
results = append(results, gin.H{
|
|
"staff_id": item.StaffID,
|
|
"status": "success",
|
|
"updated": updated,
|
|
})
|
|
successCount++
|
|
}
|
|
|
|
c.JSON(http.StatusOK, gin.H{
|
|
"results": results,
|
|
"success_count": successCount,
|
|
"error_count": errorCount,
|
|
"total": len(batch.Items),
|
|
})
|
|
}
|
|
|
|
// InstituteHierarchyTask represents an institute page to crawl for hierarchy.
// It is the element type of the "institutes" array returned by
// GetInstitutePages.
type InstituteHierarchyTask struct {
	// InstituteURL is the page to crawl.
	InstituteURL string `json:"institute_url"`
	// InstituteName is optional and omitted from the JSON when empty.
	InstituteName string `json:"institute_name,omitempty"`
	// UniversityID is the university the page belongs to.
	UniversityID uuid.UUID `json:"university_id"`
}
|
|
|
|
// GetInstitutePages returns institute pages that need hierarchy crawling
|
|
// GET /api/v1/ai/extraction/institutes?university_id=...
|
|
func (h *AIExtractionHandlers) GetInstitutePages(c *gin.Context) {
|
|
var universityID *uuid.UUID
|
|
if uniIDStr := c.Query("university_id"); uniIDStr != "" {
|
|
id, err := uuid.Parse(uniIDStr)
|
|
if err == nil {
|
|
universityID = &id
|
|
}
|
|
}
|
|
|
|
// Get unique institute/department URLs from staff profiles
|
|
params := database.StaffSearchParams{
|
|
UniversityID: universityID,
|
|
Limit: 1000,
|
|
}
|
|
|
|
result, err := h.repo.SearchStaff(c.Request.Context(), params)
|
|
if err != nil {
|
|
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
|
return
|
|
}
|
|
|
|
// Collect unique source URLs (these are typically department pages)
|
|
urlSet := make(map[string]bool)
|
|
var tasks []InstituteHierarchyTask
|
|
|
|
for _, staff := range result.Staff {
|
|
if staff.SourceURL != nil && *staff.SourceURL != "" {
|
|
url := *staff.SourceURL
|
|
if !urlSet[url] {
|
|
urlSet[url] = true
|
|
tasks = append(tasks, InstituteHierarchyTask{
|
|
InstituteURL: url,
|
|
UniversityID: staff.UniversityID,
|
|
})
|
|
}
|
|
}
|
|
}
|
|
|
|
c.JSON(http.StatusOK, gin.H{
|
|
"institutes": tasks,
|
|
"total": len(tasks),
|
|
})
|
|
}
|
|
|
|
// InstituteHierarchyData represents hierarchy data extracted from an
// institute page. It is the request body of SubmitInstituteHierarchy;
// InstituteURL and UniversityID are required.
type InstituteHierarchyData struct {
	InstituteURL string    `json:"institute_url" binding:"required"`
	UniversityID uuid.UUID `json:"university_id" binding:"required"`
	// InstituteName is used as the department name on submit. It is optional
	// in the request, so it may be empty.
	InstituteName string `json:"institute_name,omitempty"`

	// Leadership
	LeaderName  string `json:"leader_name,omitempty"`
	LeaderTitle string `json:"leader_title,omitempty"` // e.g., "Professor", "Lehrstuhlinhaber"

	// Staff organization
	StaffGroups []struct {
		Role    string   `json:"role"`    // e.g., "Leitung", "Wissenschaftliche Mitarbeiter", "Sekretariat"
		Members []string `json:"members"` // Names of people in this group
	} `json:"staff_groups,omitempty"`

	// Teaching info (courses, "Lehrveranstaltungen")
	// NOTE(review): not read by the submit handler visible in this file.
	TeachingCourses []struct {
		Title   string `json:"title"`
		Teacher string `json:"teacher,omitempty"`
	} `json:"teaching_courses,omitempty"`
}
|
|
|
|
// SubmitInstituteHierarchy saves hierarchy data from an institute page
|
|
// POST /api/v1/ai/extraction/institutes/submit
|
|
func (h *AIExtractionHandlers) SubmitInstituteHierarchy(c *gin.Context) {
|
|
var data InstituteHierarchyData
|
|
if err := c.ShouldBindJSON(&data); err != nil {
|
|
c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid request: " + err.Error()})
|
|
return
|
|
}
|
|
|
|
// Find or create department
|
|
dept := &database.Department{
|
|
UniversityID: data.UniversityID,
|
|
Name: data.InstituteName,
|
|
}
|
|
if data.InstituteURL != "" {
|
|
dept.URL = &data.InstituteURL
|
|
}
|
|
|
|
err := h.repo.CreateDepartment(c.Request.Context(), dept)
|
|
if err != nil {
|
|
c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to create department: " + err.Error()})
|
|
return
|
|
}
|
|
|
|
// Find leader and set as supervisor for all staff in this institute
|
|
var leaderID *uuid.UUID
|
|
if data.LeaderName != "" {
|
|
// Search for leader
|
|
leaderParams := database.StaffSearchParams{
|
|
Query: data.LeaderName,
|
|
UniversityID: &data.UniversityID,
|
|
Limit: 1,
|
|
}
|
|
result, err := h.repo.SearchStaff(c.Request.Context(), leaderParams)
|
|
if err == nil && len(result.Staff) > 0 {
|
|
leaderID = &result.Staff[0].ID
|
|
|
|
// Update leader with department and role
|
|
leader := &result.Staff[0]
|
|
leader.DepartmentID = &dept.ID
|
|
roleLeitung := "leitung"
|
|
leader.TeamRole = &roleLeitung
|
|
leader.IsProfessor = true
|
|
if data.LeaderTitle != "" {
|
|
leader.AcademicTitle = &data.LeaderTitle
|
|
}
|
|
h.repo.CreateStaff(c.Request.Context(), leader)
|
|
}
|
|
}
|
|
|
|
// Process staff groups
|
|
updatedCount := 0
|
|
for _, group := range data.StaffGroups {
|
|
for _, memberName := range group.Members {
|
|
// Find staff member
|
|
memberParams := database.StaffSearchParams{
|
|
Query: memberName,
|
|
UniversityID: &data.UniversityID,
|
|
Limit: 1,
|
|
}
|
|
result, err := h.repo.SearchStaff(c.Request.Context(), memberParams)
|
|
if err != nil || len(result.Staff) == 0 {
|
|
continue
|
|
}
|
|
|
|
member := &result.Staff[0]
|
|
member.DepartmentID = &dept.ID
|
|
member.TeamRole = &group.Role
|
|
|
|
// Set supervisor if leader was found and this is not the leader
|
|
if leaderID != nil && member.ID != *leaderID {
|
|
member.SupervisorID = leaderID
|
|
}
|
|
|
|
h.repo.CreateStaff(c.Request.Context(), member)
|
|
updatedCount++
|
|
}
|
|
}
|
|
|
|
c.JSON(http.StatusOK, gin.H{
|
|
"status": "success",
|
|
"department_id": dept.ID,
|
|
"leader_id": leaderID,
|
|
"members_updated": updatedCount,
|
|
})
|
|
}
|
|
|
|
// RegisterAIExtractionRoutes registers AI extraction routes
|
|
func (h *AIExtractionHandlers) RegisterRoutes(r *gin.RouterGroup) {
|
|
ai := r.Group("/ai/extraction")
|
|
|
|
// Profile extraction endpoints
|
|
ai.GET("/pending", h.GetPendingProfiles)
|
|
ai.POST("/submit", h.SubmitExtractedData)
|
|
ai.POST("/submit-batch", h.SubmitBatchExtractedData)
|
|
|
|
// Institute hierarchy endpoints
|
|
ai.GET("/institutes", h.GetInstitutePages)
|
|
ai.POST("/institutes/submit", h.SubmitInstituteHierarchy)
|
|
}
|