Fix: Remove broken getKlausurApiUrl and clean up empty lines
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
sed replacement left orphaned hostname references in story page and empty lines in getApiBase functions. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -278,267 +278,6 @@ func (h *AIExtractionHandlers) SubmitExtractedData(c *gin.Context) {
|
||||
})
|
||||
}
|
||||
|
||||
// SubmitBatchExtractedData saves multiple AI-extracted profile data items
|
||||
// POST /api/v1/ai/extraction/submit-batch
|
||||
func (h *AIExtractionHandlers) SubmitBatchExtractedData(c *gin.Context) {
|
||||
var batch struct {
|
||||
Items []ExtractedProfileData `json:"items" binding:"required"`
|
||||
}
|
||||
|
||||
if err := c.ShouldBindJSON(&batch); err != nil {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid request: " + err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
results := make([]gin.H, 0, len(batch.Items))
|
||||
successCount := 0
|
||||
errorCount := 0
|
||||
|
||||
for _, item := range batch.Items {
|
||||
// Get existing staff record
|
||||
staff, err := h.repo.GetStaff(c.Request.Context(), item.StaffID)
|
||||
if err != nil {
|
||||
results = append(results, gin.H{
|
||||
"staff_id": item.StaffID,
|
||||
"status": "error",
|
||||
"error": "Staff not found",
|
||||
})
|
||||
errorCount++
|
||||
continue
|
||||
}
|
||||
|
||||
// Apply updates (same logic as single submit)
|
||||
updated := false
|
||||
|
||||
if item.Email != "" && (staff.Email == nil || *staff.Email == "") {
|
||||
staff.Email = &item.Email
|
||||
updated = true
|
||||
}
|
||||
if item.Phone != "" && (staff.Phone == nil || *staff.Phone == "") {
|
||||
staff.Phone = &item.Phone
|
||||
updated = true
|
||||
}
|
||||
if item.Office != "" && (staff.Office == nil || *staff.Office == "") {
|
||||
staff.Office = &item.Office
|
||||
updated = true
|
||||
}
|
||||
if item.Position != "" && (staff.Position == nil || *staff.Position == "") {
|
||||
staff.Position = &item.Position
|
||||
updated = true
|
||||
}
|
||||
if item.PositionType != "" && (staff.PositionType == nil || *staff.PositionType == "") {
|
||||
staff.PositionType = &item.PositionType
|
||||
updated = true
|
||||
}
|
||||
if item.TeamRole != "" && (staff.TeamRole == nil || *staff.TeamRole == "") {
|
||||
staff.TeamRole = &item.TeamRole
|
||||
updated = true
|
||||
}
|
||||
if len(item.ResearchInterests) > 0 && len(staff.ResearchInterests) == 0 {
|
||||
staff.ResearchInterests = item.ResearchInterests
|
||||
updated = true
|
||||
}
|
||||
if item.ORCID != "" && (staff.ORCID == nil || *staff.ORCID == "") {
|
||||
staff.ORCID = &item.ORCID
|
||||
updated = true
|
||||
}
|
||||
|
||||
// Update last verified
|
||||
now := time.Now()
|
||||
staff.LastVerified = &now
|
||||
|
||||
if updated {
|
||||
err = h.repo.CreateStaff(c.Request.Context(), staff)
|
||||
if err != nil {
|
||||
results = append(results, gin.H{
|
||||
"staff_id": item.StaffID,
|
||||
"status": "error",
|
||||
"error": err.Error(),
|
||||
})
|
||||
errorCount++
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
results = append(results, gin.H{
|
||||
"staff_id": item.StaffID,
|
||||
"status": "success",
|
||||
"updated": updated,
|
||||
})
|
||||
successCount++
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, gin.H{
|
||||
"results": results,
|
||||
"success_count": successCount,
|
||||
"error_count": errorCount,
|
||||
"total": len(batch.Items),
|
||||
})
|
||||
}
|
||||
|
||||
// InstituteHierarchyTask represents an institute page to crawl for hierarchy
|
||||
type InstituteHierarchyTask struct {
|
||||
InstituteURL string `json:"institute_url"`
|
||||
InstituteName string `json:"institute_name,omitempty"`
|
||||
UniversityID uuid.UUID `json:"university_id"`
|
||||
}
|
||||
|
||||
// GetInstitutePages returns institute pages that need hierarchy crawling
|
||||
// GET /api/v1/ai/extraction/institutes?university_id=...
|
||||
func (h *AIExtractionHandlers) GetInstitutePages(c *gin.Context) {
|
||||
var universityID *uuid.UUID
|
||||
if uniIDStr := c.Query("university_id"); uniIDStr != "" {
|
||||
id, err := uuid.Parse(uniIDStr)
|
||||
if err == nil {
|
||||
universityID = &id
|
||||
}
|
||||
}
|
||||
|
||||
// Get unique institute/department URLs from staff profiles
|
||||
params := database.StaffSearchParams{
|
||||
UniversityID: universityID,
|
||||
Limit: 1000,
|
||||
}
|
||||
|
||||
result, err := h.repo.SearchStaff(c.Request.Context(), params)
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
// Collect unique source URLs (these are typically department pages)
|
||||
urlSet := make(map[string]bool)
|
||||
var tasks []InstituteHierarchyTask
|
||||
|
||||
for _, staff := range result.Staff {
|
||||
if staff.SourceURL != nil && *staff.SourceURL != "" {
|
||||
url := *staff.SourceURL
|
||||
if !urlSet[url] {
|
||||
urlSet[url] = true
|
||||
tasks = append(tasks, InstituteHierarchyTask{
|
||||
InstituteURL: url,
|
||||
UniversityID: staff.UniversityID,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, gin.H{
|
||||
"institutes": tasks,
|
||||
"total": len(tasks),
|
||||
})
|
||||
}
|
||||
|
||||
// InstituteHierarchyData represents hierarchy data extracted from an institute page
|
||||
type InstituteHierarchyData struct {
|
||||
InstituteURL string `json:"institute_url" binding:"required"`
|
||||
UniversityID uuid.UUID `json:"university_id" binding:"required"`
|
||||
InstituteName string `json:"institute_name,omitempty"`
|
||||
|
||||
// Leadership
|
||||
LeaderName string `json:"leader_name,omitempty"`
|
||||
LeaderTitle string `json:"leader_title,omitempty"` // e.g., "Professor", "Lehrstuhlinhaber"
|
||||
|
||||
// Staff organization
|
||||
StaffGroups []struct {
|
||||
Role string `json:"role"` // e.g., "Leitung", "Wissenschaftliche Mitarbeiter", "Sekretariat"
|
||||
Members []string `json:"members"` // Names of people in this group
|
||||
} `json:"staff_groups,omitempty"`
|
||||
|
||||
// Teaching info (Lehrveranstaltungen)
|
||||
TeachingCourses []struct {
|
||||
Title string `json:"title"`
|
||||
Teacher string `json:"teacher,omitempty"`
|
||||
} `json:"teaching_courses,omitempty"`
|
||||
}
|
||||
|
||||
// SubmitInstituteHierarchy saves hierarchy data from an institute page
|
||||
// POST /api/v1/ai/extraction/institutes/submit
|
||||
func (h *AIExtractionHandlers) SubmitInstituteHierarchy(c *gin.Context) {
|
||||
var data InstituteHierarchyData
|
||||
if err := c.ShouldBindJSON(&data); err != nil {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid request: " + err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
// Find or create department
|
||||
dept := &database.Department{
|
||||
UniversityID: data.UniversityID,
|
||||
Name: data.InstituteName,
|
||||
}
|
||||
if data.InstituteURL != "" {
|
||||
dept.URL = &data.InstituteURL
|
||||
}
|
||||
|
||||
err := h.repo.CreateDepartment(c.Request.Context(), dept)
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to create department: " + err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
// Find leader and set as supervisor for all staff in this institute
|
||||
var leaderID *uuid.UUID
|
||||
if data.LeaderName != "" {
|
||||
// Search for leader
|
||||
leaderParams := database.StaffSearchParams{
|
||||
Query: data.LeaderName,
|
||||
UniversityID: &data.UniversityID,
|
||||
Limit: 1,
|
||||
}
|
||||
result, err := h.repo.SearchStaff(c.Request.Context(), leaderParams)
|
||||
if err == nil && len(result.Staff) > 0 {
|
||||
leaderID = &result.Staff[0].ID
|
||||
|
||||
// Update leader with department and role
|
||||
leader := &result.Staff[0]
|
||||
leader.DepartmentID = &dept.ID
|
||||
roleLeitung := "leitung"
|
||||
leader.TeamRole = &roleLeitung
|
||||
leader.IsProfessor = true
|
||||
if data.LeaderTitle != "" {
|
||||
leader.AcademicTitle = &data.LeaderTitle
|
||||
}
|
||||
h.repo.CreateStaff(c.Request.Context(), leader)
|
||||
}
|
||||
}
|
||||
|
||||
// Process staff groups
|
||||
updatedCount := 0
|
||||
for _, group := range data.StaffGroups {
|
||||
for _, memberName := range group.Members {
|
||||
// Find staff member
|
||||
memberParams := database.StaffSearchParams{
|
||||
Query: memberName,
|
||||
UniversityID: &data.UniversityID,
|
||||
Limit: 1,
|
||||
}
|
||||
result, err := h.repo.SearchStaff(c.Request.Context(), memberParams)
|
||||
if err != nil || len(result.Staff) == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
member := &result.Staff[0]
|
||||
member.DepartmentID = &dept.ID
|
||||
member.TeamRole = &group.Role
|
||||
|
||||
// Set supervisor if leader was found and this is not the leader
|
||||
if leaderID != nil && member.ID != *leaderID {
|
||||
member.SupervisorID = leaderID
|
||||
}
|
||||
|
||||
h.repo.CreateStaff(c.Request.Context(), member)
|
||||
updatedCount++
|
||||
}
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, gin.H{
|
||||
"status": "success",
|
||||
"department_id": dept.ID,
|
||||
"leader_id": leaderID,
|
||||
"members_updated": updatedCount,
|
||||
})
|
||||
}
|
||||
|
||||
// RegisterAIExtractionRoutes registers AI extraction routes
|
||||
func (h *AIExtractionHandlers) RegisterRoutes(r *gin.RouterGroup) {
|
||||
ai := r.Group("/ai/extraction")
|
||||
|
||||
@@ -0,0 +1,272 @@
|
||||
package handlers
|
||||
|
||||
import (
|
||||
"net/http"
|
||||
"time"
|
||||
|
||||
"github.com/gin-gonic/gin"
|
||||
"github.com/google/uuid"
|
||||
|
||||
"github.com/breakpilot/edu-search-service/internal/database"
|
||||
)
|
||||
|
||||
// SubmitBatchExtractedData saves multiple AI-extracted profile data items
|
||||
// POST /api/v1/ai/extraction/submit-batch
|
||||
func (h *AIExtractionHandlers) SubmitBatchExtractedData(c *gin.Context) {
|
||||
var batch struct {
|
||||
Items []ExtractedProfileData `json:"items" binding:"required"`
|
||||
}
|
||||
|
||||
if err := c.ShouldBindJSON(&batch); err != nil {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid request: " + err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
results := make([]gin.H, 0, len(batch.Items))
|
||||
successCount := 0
|
||||
errorCount := 0
|
||||
|
||||
for _, item := range batch.Items {
|
||||
// Get existing staff record
|
||||
staff, err := h.repo.GetStaff(c.Request.Context(), item.StaffID)
|
||||
if err != nil {
|
||||
results = append(results, gin.H{
|
||||
"staff_id": item.StaffID,
|
||||
"status": "error",
|
||||
"error": "Staff not found",
|
||||
})
|
||||
errorCount++
|
||||
continue
|
||||
}
|
||||
|
||||
// Apply updates (same logic as single submit)
|
||||
updated := false
|
||||
|
||||
if item.Email != "" && (staff.Email == nil || *staff.Email == "") {
|
||||
staff.Email = &item.Email
|
||||
updated = true
|
||||
}
|
||||
if item.Phone != "" && (staff.Phone == nil || *staff.Phone == "") {
|
||||
staff.Phone = &item.Phone
|
||||
updated = true
|
||||
}
|
||||
if item.Office != "" && (staff.Office == nil || *staff.Office == "") {
|
||||
staff.Office = &item.Office
|
||||
updated = true
|
||||
}
|
||||
if item.Position != "" && (staff.Position == nil || *staff.Position == "") {
|
||||
staff.Position = &item.Position
|
||||
updated = true
|
||||
}
|
||||
if item.PositionType != "" && (staff.PositionType == nil || *staff.PositionType == "") {
|
||||
staff.PositionType = &item.PositionType
|
||||
updated = true
|
||||
}
|
||||
if item.TeamRole != "" && (staff.TeamRole == nil || *staff.TeamRole == "") {
|
||||
staff.TeamRole = &item.TeamRole
|
||||
updated = true
|
||||
}
|
||||
if len(item.ResearchInterests) > 0 && len(staff.ResearchInterests) == 0 {
|
||||
staff.ResearchInterests = item.ResearchInterests
|
||||
updated = true
|
||||
}
|
||||
if item.ORCID != "" && (staff.ORCID == nil || *staff.ORCID == "") {
|
||||
staff.ORCID = &item.ORCID
|
||||
updated = true
|
||||
}
|
||||
|
||||
// Update last verified
|
||||
now := time.Now()
|
||||
staff.LastVerified = &now
|
||||
|
||||
if updated {
|
||||
err = h.repo.CreateStaff(c.Request.Context(), staff)
|
||||
if err != nil {
|
||||
results = append(results, gin.H{
|
||||
"staff_id": item.StaffID,
|
||||
"status": "error",
|
||||
"error": err.Error(),
|
||||
})
|
||||
errorCount++
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
results = append(results, gin.H{
|
||||
"staff_id": item.StaffID,
|
||||
"status": "success",
|
||||
"updated": updated,
|
||||
})
|
||||
successCount++
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, gin.H{
|
||||
"results": results,
|
||||
"success_count": successCount,
|
||||
"error_count": errorCount,
|
||||
"total": len(batch.Items),
|
||||
})
|
||||
}
|
||||
|
||||
// InstituteHierarchyTask represents an institute page to crawl for hierarchy
|
||||
type InstituteHierarchyTask struct {
|
||||
InstituteURL string `json:"institute_url"`
|
||||
InstituteName string `json:"institute_name,omitempty"`
|
||||
UniversityID uuid.UUID `json:"university_id"`
|
||||
}
|
||||
|
||||
// GetInstitutePages returns institute pages that need hierarchy crawling
|
||||
// GET /api/v1/ai/extraction/institutes?university_id=...
|
||||
func (h *AIExtractionHandlers) GetInstitutePages(c *gin.Context) {
|
||||
var universityID *uuid.UUID
|
||||
if uniIDStr := c.Query("university_id"); uniIDStr != "" {
|
||||
id, err := uuid.Parse(uniIDStr)
|
||||
if err == nil {
|
||||
universityID = &id
|
||||
}
|
||||
}
|
||||
|
||||
// Get unique institute/department URLs from staff profiles
|
||||
params := database.StaffSearchParams{
|
||||
UniversityID: universityID,
|
||||
Limit: 1000,
|
||||
}
|
||||
|
||||
result, err := h.repo.SearchStaff(c.Request.Context(), params)
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
// Collect unique source URLs (these are typically department pages)
|
||||
urlSet := make(map[string]bool)
|
||||
var tasks []InstituteHierarchyTask
|
||||
|
||||
for _, staff := range result.Staff {
|
||||
if staff.SourceURL != nil && *staff.SourceURL != "" {
|
||||
url := *staff.SourceURL
|
||||
if !urlSet[url] {
|
||||
urlSet[url] = true
|
||||
tasks = append(tasks, InstituteHierarchyTask{
|
||||
InstituteURL: url,
|
||||
UniversityID: staff.UniversityID,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, gin.H{
|
||||
"institutes": tasks,
|
||||
"total": len(tasks),
|
||||
})
|
||||
}
|
||||
|
||||
// InstituteHierarchyData represents hierarchy data extracted from an institute page
|
||||
type InstituteHierarchyData struct {
|
||||
InstituteURL string `json:"institute_url" binding:"required"`
|
||||
UniversityID uuid.UUID `json:"university_id" binding:"required"`
|
||||
InstituteName string `json:"institute_name,omitempty"`
|
||||
|
||||
// Leadership
|
||||
LeaderName string `json:"leader_name,omitempty"`
|
||||
LeaderTitle string `json:"leader_title,omitempty"` // e.g., "Professor", "Lehrstuhlinhaber"
|
||||
|
||||
// Staff organization
|
||||
StaffGroups []struct {
|
||||
Role string `json:"role"` // e.g., "Leitung", "Wissenschaftliche Mitarbeiter", "Sekretariat"
|
||||
Members []string `json:"members"` // Names of people in this group
|
||||
} `json:"staff_groups,omitempty"`
|
||||
|
||||
// Teaching info (Lehrveranstaltungen)
|
||||
TeachingCourses []struct {
|
||||
Title string `json:"title"`
|
||||
Teacher string `json:"teacher,omitempty"`
|
||||
} `json:"teaching_courses,omitempty"`
|
||||
}
|
||||
|
||||
// SubmitInstituteHierarchy saves hierarchy data from an institute page
|
||||
// POST /api/v1/ai/extraction/institutes/submit
|
||||
func (h *AIExtractionHandlers) SubmitInstituteHierarchy(c *gin.Context) {
|
||||
var data InstituteHierarchyData
|
||||
if err := c.ShouldBindJSON(&data); err != nil {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid request: " + err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
// Find or create department
|
||||
dept := &database.Department{
|
||||
UniversityID: data.UniversityID,
|
||||
Name: data.InstituteName,
|
||||
}
|
||||
if data.InstituteURL != "" {
|
||||
dept.URL = &data.InstituteURL
|
||||
}
|
||||
|
||||
err := h.repo.CreateDepartment(c.Request.Context(), dept)
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to create department: " + err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
// Find leader and set as supervisor for all staff in this institute
|
||||
var leaderID *uuid.UUID
|
||||
if data.LeaderName != "" {
|
||||
// Search for leader
|
||||
leaderParams := database.StaffSearchParams{
|
||||
Query: data.LeaderName,
|
||||
UniversityID: &data.UniversityID,
|
||||
Limit: 1,
|
||||
}
|
||||
result, err := h.repo.SearchStaff(c.Request.Context(), leaderParams)
|
||||
if err == nil && len(result.Staff) > 0 {
|
||||
leaderID = &result.Staff[0].ID
|
||||
|
||||
// Update leader with department and role
|
||||
leader := &result.Staff[0]
|
||||
leader.DepartmentID = &dept.ID
|
||||
roleLeitung := "leitung"
|
||||
leader.TeamRole = &roleLeitung
|
||||
leader.IsProfessor = true
|
||||
if data.LeaderTitle != "" {
|
||||
leader.AcademicTitle = &data.LeaderTitle
|
||||
}
|
||||
h.repo.CreateStaff(c.Request.Context(), leader)
|
||||
}
|
||||
}
|
||||
|
||||
// Process staff groups
|
||||
updatedCount := 0
|
||||
for _, group := range data.StaffGroups {
|
||||
for _, memberName := range group.Members {
|
||||
// Find staff member
|
||||
memberParams := database.StaffSearchParams{
|
||||
Query: memberName,
|
||||
UniversityID: &data.UniversityID,
|
||||
Limit: 1,
|
||||
}
|
||||
result, err := h.repo.SearchStaff(c.Request.Context(), memberParams)
|
||||
if err != nil || len(result.Staff) == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
member := &result.Staff[0]
|
||||
member.DepartmentID = &dept.ID
|
||||
member.TeamRole = &group.Role
|
||||
|
||||
// Set supervisor if leader was found and this is not the leader
|
||||
if leaderID != nil && member.ID != *leaderID {
|
||||
member.SupervisorID = leaderID
|
||||
}
|
||||
|
||||
h.repo.CreateStaff(c.Request.Context(), member)
|
||||
updatedCount++
|
||||
}
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, gin.H{
|
||||
"status": "success",
|
||||
"department_id": dept.ID,
|
||||
"leader_id": leaderID,
|
||||
"members_updated": updatedCount,
|
||||
})
|
||||
}
|
||||
@@ -2,7 +2,6 @@ package handlers
|
||||
|
||||
import (
|
||||
"net/http"
|
||||
"time"
|
||||
|
||||
"github.com/breakpilot/edu-search-service/internal/policy"
|
||||
"github.com/gin-gonic/gin"
|
||||
@@ -349,289 +348,6 @@ func (h *PolicyHandler) UpdateOperationPermission(c *gin.Context) {
|
||||
c.JSON(http.StatusOK, op)
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// PII RULES
|
||||
// =============================================================================
|
||||
|
||||
// ListPIIRules returns all PII detection rules.
|
||||
func (h *PolicyHandler) ListPIIRules(c *gin.Context) {
|
||||
activeOnly := c.Query("active_only") == "true"
|
||||
|
||||
rules, err := h.store.ListPIIRules(c.Request.Context(), activeOnly)
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to list PII rules", "details": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, gin.H{
|
||||
"rules": rules,
|
||||
"total": len(rules),
|
||||
})
|
||||
}
|
||||
|
||||
// GetPIIRule returns a single PII rule by ID.
|
||||
func (h *PolicyHandler) GetPIIRule(c *gin.Context) {
|
||||
id, err := uuid.Parse(c.Param("id"))
|
||||
if err != nil {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid PII rule ID"})
|
||||
return
|
||||
}
|
||||
|
||||
rule, err := h.store.GetPIIRule(c.Request.Context(), id)
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to get PII rule", "details": err.Error()})
|
||||
return
|
||||
}
|
||||
if rule == nil {
|
||||
c.JSON(http.StatusNotFound, gin.H{"error": "PII rule not found"})
|
||||
return
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, rule)
|
||||
}
|
||||
|
||||
// CreatePIIRule creates a new PII detection rule.
|
||||
func (h *PolicyHandler) CreatePIIRule(c *gin.Context) {
|
||||
var req policy.CreatePIIRuleRequest
|
||||
if err := c.ShouldBindJSON(&req); err != nil {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid request body", "details": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
rule, err := h.store.CreatePIIRule(c.Request.Context(), &req)
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to create PII rule", "details": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
// Log audit
|
||||
userEmail := getUserEmail(c)
|
||||
h.enforcer.LogChange(c.Request.Context(), policy.AuditActionCreate, policy.AuditEntityPIIRule, &rule.ID, nil, rule, userEmail)
|
||||
|
||||
c.JSON(http.StatusCreated, rule)
|
||||
}
|
||||
|
||||
// UpdatePIIRule updates an existing PII rule.
|
||||
func (h *PolicyHandler) UpdatePIIRule(c *gin.Context) {
|
||||
id, err := uuid.Parse(c.Param("id"))
|
||||
if err != nil {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid PII rule ID"})
|
||||
return
|
||||
}
|
||||
|
||||
// Get old value for audit
|
||||
oldRule, err := h.store.GetPIIRule(c.Request.Context(), id)
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to get PII rule", "details": err.Error()})
|
||||
return
|
||||
}
|
||||
if oldRule == nil {
|
||||
c.JSON(http.StatusNotFound, gin.H{"error": "PII rule not found"})
|
||||
return
|
||||
}
|
||||
|
||||
var req policy.UpdatePIIRuleRequest
|
||||
if err := c.ShouldBindJSON(&req); err != nil {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid request body", "details": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
rule, err := h.store.UpdatePIIRule(c.Request.Context(), id, &req)
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to update PII rule", "details": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
// Log audit
|
||||
userEmail := getUserEmail(c)
|
||||
h.enforcer.LogChange(c.Request.Context(), policy.AuditActionUpdate, policy.AuditEntityPIIRule, &rule.ID, oldRule, rule, userEmail)
|
||||
|
||||
c.JSON(http.StatusOK, rule)
|
||||
}
|
||||
|
||||
// DeletePIIRule deletes a PII rule.
|
||||
func (h *PolicyHandler) DeletePIIRule(c *gin.Context) {
|
||||
id, err := uuid.Parse(c.Param("id"))
|
||||
if err != nil {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid PII rule ID"})
|
||||
return
|
||||
}
|
||||
|
||||
// Get rule for audit before deletion
|
||||
rule, err := h.store.GetPIIRule(c.Request.Context(), id)
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to get PII rule", "details": err.Error()})
|
||||
return
|
||||
}
|
||||
if rule == nil {
|
||||
c.JSON(http.StatusNotFound, gin.H{"error": "PII rule not found"})
|
||||
return
|
||||
}
|
||||
|
||||
if err := h.store.DeletePIIRule(c.Request.Context(), id); err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to delete PII rule", "details": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
// Log audit
|
||||
userEmail := getUserEmail(c)
|
||||
h.enforcer.LogChange(c.Request.Context(), policy.AuditActionDelete, policy.AuditEntityPIIRule, &id, rule, nil, userEmail)
|
||||
|
||||
c.JSON(http.StatusOK, gin.H{"deleted": true, "id": id})
|
||||
}
|
||||
|
||||
// TestPIIRules tests PII detection against sample text.
|
||||
func (h *PolicyHandler) TestPIIRules(c *gin.Context) {
|
||||
var req policy.PIITestRequest
|
||||
if err := c.ShouldBindJSON(&req); err != nil {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid request body", "details": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
response, err := h.enforcer.DetectPII(c.Request.Context(), req.Text)
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to test PII detection", "details": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, response)
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// AUDIT & COMPLIANCE
|
||||
// =============================================================================
|
||||
|
||||
// ListAuditLogs returns audit log entries.
|
||||
func (h *PolicyHandler) ListAuditLogs(c *gin.Context) {
|
||||
var filter policy.AuditLogFilter
|
||||
if err := c.ShouldBindQuery(&filter); err != nil {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid query parameters", "details": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
// Set defaults
|
||||
if filter.Limit <= 0 || filter.Limit > 500 {
|
||||
filter.Limit = 100
|
||||
}
|
||||
|
||||
logs, total, err := h.store.ListAuditLogs(c.Request.Context(), &filter)
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to list audit logs", "details": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, gin.H{
|
||||
"logs": logs,
|
||||
"total": total,
|
||||
"limit": filter.Limit,
|
||||
"offset": filter.Offset,
|
||||
})
|
||||
}
|
||||
|
||||
// ListBlockedContent returns blocked content log entries.
|
||||
func (h *PolicyHandler) ListBlockedContent(c *gin.Context) {
|
||||
var filter policy.BlockedContentFilter
|
||||
if err := c.ShouldBindQuery(&filter); err != nil {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid query parameters", "details": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
// Set defaults
|
||||
if filter.Limit <= 0 || filter.Limit > 500 {
|
||||
filter.Limit = 100
|
||||
}
|
||||
|
||||
logs, total, err := h.store.ListBlockedContent(c.Request.Context(), &filter)
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to list blocked content", "details": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, gin.H{
|
||||
"blocked": logs,
|
||||
"total": total,
|
||||
"limit": filter.Limit,
|
||||
"offset": filter.Offset,
|
||||
})
|
||||
}
|
||||
|
||||
// CheckCompliance performs a compliance check for a URL.
|
||||
func (h *PolicyHandler) CheckCompliance(c *gin.Context) {
|
||||
var req policy.CheckComplianceRequest
|
||||
if err := c.ShouldBindJSON(&req); err != nil {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid request body", "details": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
response, err := h.enforcer.CheckCompliance(c.Request.Context(), &req)
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to check compliance", "details": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, response)
|
||||
}
|
||||
|
||||
// GetPolicyStats returns aggregated statistics.
|
||||
func (h *PolicyHandler) GetPolicyStats(c *gin.Context) {
|
||||
stats, err := h.store.GetStats(c.Request.Context())
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to get stats", "details": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, stats)
|
||||
}
|
||||
|
||||
// GenerateComplianceReport generates an audit report.
|
||||
func (h *PolicyHandler) GenerateComplianceReport(c *gin.Context) {
|
||||
var auditFilter policy.AuditLogFilter
|
||||
var blockedFilter policy.BlockedContentFilter
|
||||
|
||||
// Parse date filters
|
||||
fromStr := c.Query("from")
|
||||
toStr := c.Query("to")
|
||||
|
||||
if fromStr != "" {
|
||||
from, err := time.Parse("2006-01-02", fromStr)
|
||||
if err == nil {
|
||||
auditFilter.FromDate = &from
|
||||
blockedFilter.FromDate = &from
|
||||
}
|
||||
}
|
||||
|
||||
if toStr != "" {
|
||||
to, err := time.Parse("2006-01-02", toStr)
|
||||
if err == nil {
|
||||
// Add 1 day to include the end date
|
||||
to = to.Add(24 * time.Hour)
|
||||
auditFilter.ToDate = &to
|
||||
blockedFilter.ToDate = &to
|
||||
}
|
||||
}
|
||||
|
||||
// No limit for report
|
||||
auditFilter.Limit = 10000
|
||||
blockedFilter.Limit = 10000
|
||||
|
||||
auditor := policy.NewAuditor(h.store)
|
||||
report, err := auditor.GenerateAuditReport(c.Request.Context(), &auditFilter, &blockedFilter)
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to generate report", "details": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
// Set filename for download
|
||||
format := c.Query("format")
|
||||
if format == "download" {
|
||||
filename := "compliance-report-" + time.Now().Format("2006-01-02") + ".json"
|
||||
c.Header("Content-Disposition", "attachment; filename="+filename)
|
||||
c.Header("Content-Type", "application/json")
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, report)
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// HELPERS
|
||||
// =============================================================================
|
||||
|
||||
@@ -0,0 +1,293 @@
|
||||
package handlers
|
||||
|
||||
import (
|
||||
"net/http"
|
||||
"time"
|
||||
|
||||
"github.com/breakpilot/edu-search-service/internal/policy"
|
||||
"github.com/gin-gonic/gin"
|
||||
"github.com/google/uuid"
|
||||
)
|
||||
|
||||
// =============================================================================
|
||||
// PII RULES
|
||||
// =============================================================================
|
||||
|
||||
// ListPIIRules returns all PII detection rules.
|
||||
func (h *PolicyHandler) ListPIIRules(c *gin.Context) {
|
||||
activeOnly := c.Query("active_only") == "true"
|
||||
|
||||
rules, err := h.store.ListPIIRules(c.Request.Context(), activeOnly)
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to list PII rules", "details": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, gin.H{
|
||||
"rules": rules,
|
||||
"total": len(rules),
|
||||
})
|
||||
}
|
||||
|
||||
// GetPIIRule returns a single PII rule by ID.
|
||||
func (h *PolicyHandler) GetPIIRule(c *gin.Context) {
|
||||
id, err := uuid.Parse(c.Param("id"))
|
||||
if err != nil {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid PII rule ID"})
|
||||
return
|
||||
}
|
||||
|
||||
rule, err := h.store.GetPIIRule(c.Request.Context(), id)
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to get PII rule", "details": err.Error()})
|
||||
return
|
||||
}
|
||||
if rule == nil {
|
||||
c.JSON(http.StatusNotFound, gin.H{"error": "PII rule not found"})
|
||||
return
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, rule)
|
||||
}
|
||||
|
||||
// CreatePIIRule creates a new PII detection rule.
|
||||
func (h *PolicyHandler) CreatePIIRule(c *gin.Context) {
|
||||
var req policy.CreatePIIRuleRequest
|
||||
if err := c.ShouldBindJSON(&req); err != nil {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid request body", "details": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
rule, err := h.store.CreatePIIRule(c.Request.Context(), &req)
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to create PII rule", "details": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
// Log audit
|
||||
userEmail := getUserEmail(c)
|
||||
h.enforcer.LogChange(c.Request.Context(), policy.AuditActionCreate, policy.AuditEntityPIIRule, &rule.ID, nil, rule, userEmail)
|
||||
|
||||
c.JSON(http.StatusCreated, rule)
|
||||
}
|
||||
|
||||
// UpdatePIIRule updates an existing PII rule.
|
||||
func (h *PolicyHandler) UpdatePIIRule(c *gin.Context) {
|
||||
id, err := uuid.Parse(c.Param("id"))
|
||||
if err != nil {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid PII rule ID"})
|
||||
return
|
||||
}
|
||||
|
||||
// Get old value for audit
|
||||
oldRule, err := h.store.GetPIIRule(c.Request.Context(), id)
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to get PII rule", "details": err.Error()})
|
||||
return
|
||||
}
|
||||
if oldRule == nil {
|
||||
c.JSON(http.StatusNotFound, gin.H{"error": "PII rule not found"})
|
||||
return
|
||||
}
|
||||
|
||||
var req policy.UpdatePIIRuleRequest
|
||||
if err := c.ShouldBindJSON(&req); err != nil {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid request body", "details": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
rule, err := h.store.UpdatePIIRule(c.Request.Context(), id, &req)
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to update PII rule", "details": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
// Log audit
|
||||
userEmail := getUserEmail(c)
|
||||
h.enforcer.LogChange(c.Request.Context(), policy.AuditActionUpdate, policy.AuditEntityPIIRule, &rule.ID, oldRule, rule, userEmail)
|
||||
|
||||
c.JSON(http.StatusOK, rule)
|
||||
}
|
||||
|
||||
// DeletePIIRule deletes a PII rule.
|
||||
func (h *PolicyHandler) DeletePIIRule(c *gin.Context) {
|
||||
id, err := uuid.Parse(c.Param("id"))
|
||||
if err != nil {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid PII rule ID"})
|
||||
return
|
||||
}
|
||||
|
||||
// Get rule for audit before deletion
|
||||
rule, err := h.store.GetPIIRule(c.Request.Context(), id)
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to get PII rule", "details": err.Error()})
|
||||
return
|
||||
}
|
||||
if rule == nil {
|
||||
c.JSON(http.StatusNotFound, gin.H{"error": "PII rule not found"})
|
||||
return
|
||||
}
|
||||
|
||||
if err := h.store.DeletePIIRule(c.Request.Context(), id); err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to delete PII rule", "details": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
// Log audit
|
||||
userEmail := getUserEmail(c)
|
||||
h.enforcer.LogChange(c.Request.Context(), policy.AuditActionDelete, policy.AuditEntityPIIRule, &id, rule, nil, userEmail)
|
||||
|
||||
c.JSON(http.StatusOK, gin.H{"deleted": true, "id": id})
|
||||
}
|
||||
|
||||
// TestPIIRules tests PII detection against sample text.
|
||||
func (h *PolicyHandler) TestPIIRules(c *gin.Context) {
|
||||
var req policy.PIITestRequest
|
||||
if err := c.ShouldBindJSON(&req); err != nil {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid request body", "details": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
response, err := h.enforcer.DetectPII(c.Request.Context(), req.Text)
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to test PII detection", "details": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, response)
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// AUDIT & COMPLIANCE
|
||||
// =============================================================================
|
||||
|
||||
// ListAuditLogs returns audit log entries.
|
||||
func (h *PolicyHandler) ListAuditLogs(c *gin.Context) {
|
||||
var filter policy.AuditLogFilter
|
||||
if err := c.ShouldBindQuery(&filter); err != nil {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid query parameters", "details": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
// Set defaults
|
||||
if filter.Limit <= 0 || filter.Limit > 500 {
|
||||
filter.Limit = 100
|
||||
}
|
||||
|
||||
logs, total, err := h.store.ListAuditLogs(c.Request.Context(), &filter)
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to list audit logs", "details": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, gin.H{
|
||||
"logs": logs,
|
||||
"total": total,
|
||||
"limit": filter.Limit,
|
||||
"offset": filter.Offset,
|
||||
})
|
||||
}
|
||||
|
||||
// ListBlockedContent returns blocked content log entries.
|
||||
func (h *PolicyHandler) ListBlockedContent(c *gin.Context) {
|
||||
var filter policy.BlockedContentFilter
|
||||
if err := c.ShouldBindQuery(&filter); err != nil {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid query parameters", "details": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
// Set defaults
|
||||
if filter.Limit <= 0 || filter.Limit > 500 {
|
||||
filter.Limit = 100
|
||||
}
|
||||
|
||||
logs, total, err := h.store.ListBlockedContent(c.Request.Context(), &filter)
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to list blocked content", "details": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, gin.H{
|
||||
"blocked": logs,
|
||||
"total": total,
|
||||
"limit": filter.Limit,
|
||||
"offset": filter.Offset,
|
||||
})
|
||||
}
|
||||
|
||||
// CheckCompliance performs a compliance check for a URL.
|
||||
func (h *PolicyHandler) CheckCompliance(c *gin.Context) {
|
||||
var req policy.CheckComplianceRequest
|
||||
if err := c.ShouldBindJSON(&req); err != nil {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid request body", "details": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
response, err := h.enforcer.CheckCompliance(c.Request.Context(), &req)
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to check compliance", "details": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, response)
|
||||
}
|
||||
|
||||
// GetPolicyStats returns aggregated statistics.
|
||||
func (h *PolicyHandler) GetPolicyStats(c *gin.Context) {
|
||||
stats, err := h.store.GetStats(c.Request.Context())
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to get stats", "details": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, stats)
|
||||
}
|
||||
|
||||
// GenerateComplianceReport generates an audit report.
|
||||
func (h *PolicyHandler) GenerateComplianceReport(c *gin.Context) {
|
||||
var auditFilter policy.AuditLogFilter
|
||||
var blockedFilter policy.BlockedContentFilter
|
||||
|
||||
// Parse date filters
|
||||
fromStr := c.Query("from")
|
||||
toStr := c.Query("to")
|
||||
|
||||
if fromStr != "" {
|
||||
from, err := time.Parse("2006-01-02", fromStr)
|
||||
if err == nil {
|
||||
auditFilter.FromDate = &from
|
||||
blockedFilter.FromDate = &from
|
||||
}
|
||||
}
|
||||
|
||||
if toStr != "" {
|
||||
to, err := time.Parse("2006-01-02", toStr)
|
||||
if err == nil {
|
||||
// Add 1 day to include the end date
|
||||
to = to.Add(24 * time.Hour)
|
||||
auditFilter.ToDate = &to
|
||||
blockedFilter.ToDate = &to
|
||||
}
|
||||
}
|
||||
|
||||
// No limit for report
|
||||
auditFilter.Limit = 10000
|
||||
blockedFilter.Limit = 10000
|
||||
|
||||
auditor := policy.NewAuditor(h.store)
|
||||
report, err := auditor.GenerateAuditReport(c.Request.Context(), &auditFilter, &blockedFilter)
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to generate report", "details": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
// Set filename for download
|
||||
format := c.Query("format")
|
||||
if format == "download" {
|
||||
filename := "compliance-report-" + time.Now().Format("2006-01-02") + ".json"
|
||||
c.Header("Content-Disposition", "attachment; filename="+filename)
|
||||
c.Header("Content-Type", "application/json")
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, report)
|
||||
}
|
||||
@@ -2,8 +2,6 @@ package database
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"github.com/google/uuid"
|
||||
"github.com/jackc/pgx/v5"
|
||||
@@ -145,395 +143,6 @@ func (r *Repository) GetDepartmentByName(ctx context.Context, uniID uuid.UUID, n
|
||||
return d, nil
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// STAFF
|
||||
// ============================================================================
|
||||
|
||||
// CreateStaff creates or updates a staff member
//
// Upserts on (university_id, first_name, last_name, department_id), with a
// zero UUID standing in for a NULL department so the conflict target also
// works for staff without a department. On conflict, identity/position
// fields are overwritten, while contact and profile fields use COALESCE so
// a re-crawl that is missing a value never erases data captured earlier.
// The generated id and crawled_at/created_at/updated_at timestamps are
// scanned back into s.
func (r *Repository) CreateStaff(ctx context.Context, s *UniversityStaff) error {
	query := `
		INSERT INTO university_staff (
			university_id, department_id, first_name, last_name, full_name,
			title, academic_title, position, position_type, is_professor,
			email, phone, office, profile_url, photo_url,
			orcid, google_scholar_id, researchgate_url, linkedin_url, personal_website,
			research_interests, research_summary, supervisor_id, team_role, source_url
		) VALUES (
			$1, $2, $3, $4, $5, $6, $7, $8, $9, $10,
			$11, $12, $13, $14, $15, $16, $17, $18, $19, $20,
			$21, $22, $23, $24, $25
		)
		ON CONFLICT (university_id, first_name, last_name, COALESCE(department_id, '00000000-0000-0000-0000-000000000000'::uuid))
		DO UPDATE SET
			full_name = EXCLUDED.full_name,
			title = EXCLUDED.title,
			academic_title = EXCLUDED.academic_title,
			position = EXCLUDED.position,
			position_type = EXCLUDED.position_type,
			is_professor = EXCLUDED.is_professor,
			email = COALESCE(EXCLUDED.email, university_staff.email),
			phone = COALESCE(EXCLUDED.phone, university_staff.phone),
			office = COALESCE(EXCLUDED.office, university_staff.office),
			profile_url = COALESCE(EXCLUDED.profile_url, university_staff.profile_url),
			photo_url = COALESCE(EXCLUDED.photo_url, university_staff.photo_url),
			orcid = COALESCE(EXCLUDED.orcid, university_staff.orcid),
			google_scholar_id = COALESCE(EXCLUDED.google_scholar_id, university_staff.google_scholar_id),
			researchgate_url = COALESCE(EXCLUDED.researchgate_url, university_staff.researchgate_url),
			linkedin_url = COALESCE(EXCLUDED.linkedin_url, university_staff.linkedin_url),
			personal_website = COALESCE(EXCLUDED.personal_website, university_staff.personal_website),
			research_interests = COALESCE(EXCLUDED.research_interests, university_staff.research_interests),
			research_summary = COALESCE(EXCLUDED.research_summary, university_staff.research_summary),
			supervisor_id = COALESCE(EXCLUDED.supervisor_id, university_staff.supervisor_id),
			team_role = COALESCE(EXCLUDED.team_role, university_staff.team_role),
			source_url = COALESCE(EXCLUDED.source_url, university_staff.source_url),
			crawled_at = NOW(),
			updated_at = NOW()
		RETURNING id, crawled_at, created_at, updated_at
	`
	// Argument order must match the 25 placeholders above exactly.
	return r.db.Pool.QueryRow(ctx, query,
		s.UniversityID, s.DepartmentID, s.FirstName, s.LastName, s.FullName,
		s.Title, s.AcademicTitle, s.Position, s.PositionType, s.IsProfessor,
		s.Email, s.Phone, s.Office, s.ProfileURL, s.PhotoURL,
		s.ORCID, s.GoogleScholarID, s.ResearchgateURL, s.LinkedInURL, s.PersonalWebsite,
		s.ResearchInterests, s.ResearchSummary, s.SupervisorID, s.TeamRole, s.SourceURL,
	).Scan(&s.ID, &s.CrawledAt, &s.CreatedAt, &s.UpdatedAt)
}
|
||||
|
||||
// GetStaff retrieves a staff member by ID
//
// Reads from the v_staff_full view, so this Scan destination list is
// positionally coupled to the view's column order — changing the view
// requires updating the list below in lockstep. The nil destinations
// discard view columns with no corresponding field on UniversityStaff
// (pgx ignores nil scan targets). When the id does not exist, the driver's
// no-rows error is returned unchanged — NOTE(review): presumably callers
// check for pgx.ErrNoRows; confirm against call sites.
func (r *Repository) GetStaff(ctx context.Context, id uuid.UUID) (*UniversityStaff, error) {
	query := `SELECT * FROM v_staff_full WHERE id = $1`

	s := &UniversityStaff{}
	err := r.db.Pool.QueryRow(ctx, query, id).Scan(
		&s.ID, &s.UniversityID, &s.DepartmentID, &s.FirstName, &s.LastName, &s.FullName,
		&s.Title, &s.AcademicTitle, &s.Position, &s.PositionType, &s.IsProfessor,
		&s.Email, &s.Phone, &s.Office, &s.ProfileURL, &s.PhotoURL,
		&s.ORCID, &s.GoogleScholarID, &s.ResearchgateURL, &s.LinkedInURL, &s.PersonalWebsite,
		&s.ResearchInterests, &s.ResearchSummary, &s.CrawledAt, &s.LastVerified, &s.IsActive, &s.SourceURL,
		&s.CreatedAt, &s.UpdatedAt, &s.UniversityName, &s.UniversityShort, nil, nil,
		&s.DepartmentName, nil, &s.PublicationCount,
	)
	if err != nil {
		return nil, err
	}
	return s, nil
}
|
||||
|
||||
// SearchStaff searches for staff members
//
// The WHERE clause is assembled from whichever filters in params are set;
// argNum keeps the $N placeholders aligned with the args slice. A COUNT(*)
// over the same joins and filters yields the total before one page is
// fetched, ordered professors-first then by last name. Limit defaults to
// 20 and is capped at 100; a negative offset is clamped to 0.
func (r *Repository) SearchStaff(ctx context.Context, params StaffSearchParams) (*StaffSearchResult, error) {
	// Build query dynamically
	var conditions []string
	var args []interface{}
	argNum := 1

	baseQuery := `
		SELECT s.id, s.university_id, s.department_id, s.first_name, s.last_name, s.full_name,
			s.title, s.academic_title, s.position, s.position_type, s.is_professor,
			s.email, s.profile_url, s.photo_url, s.orcid,
			s.research_interests, s.crawled_at, s.is_active,
			u.name as university_name, u.short_name as university_short, u.state as university_state,
			d.name as department_name,
			(SELECT COUNT(*) FROM staff_publications sp WHERE sp.staff_id = s.id) as publication_count
		FROM university_staff s
		JOIN universities u ON s.university_id = u.id
		LEFT JOIN departments d ON s.department_id = d.id
	`

	// Free-text search: German full-text match on name + research summary,
	// with ILIKE substring fallbacks. One argument is referenced by three
	// placeholders carrying the same number.
	if params.Query != "" {
		conditions = append(conditions, fmt.Sprintf(
			`(to_tsvector('german', COALESCE(s.full_name, '') || ' ' || COALESCE(s.research_summary, '')) @@ plainto_tsquery('german', $%d)
			OR s.full_name ILIKE '%%' || $%d || '%%'
			OR s.last_name ILIKE '%%' || $%d || '%%')`,
			argNum, argNum, argNum))
		args = append(args, params.Query)
		argNum++
	}

	// Exact-match filters: each appends one condition and one argument.
	if params.UniversityID != nil {
		conditions = append(conditions, fmt.Sprintf("s.university_id = $%d", argNum))
		args = append(args, *params.UniversityID)
		argNum++
	}

	if params.DepartmentID != nil {
		conditions = append(conditions, fmt.Sprintf("s.department_id = $%d", argNum))
		args = append(args, *params.DepartmentID)
		argNum++
	}

	if params.State != nil {
		conditions = append(conditions, fmt.Sprintf("u.state = $%d", argNum))
		args = append(args, *params.State)
		argNum++
	}

	if params.UniType != nil {
		conditions = append(conditions, fmt.Sprintf("u.uni_type = $%d", argNum))
		args = append(args, *params.UniType)
		argNum++
	}

	if params.PositionType != nil {
		conditions = append(conditions, fmt.Sprintf("s.position_type = $%d", argNum))
		args = append(args, *params.PositionType)
		argNum++
	}

	if params.IsProfessor != nil {
		conditions = append(conditions, fmt.Sprintf("s.is_professor = $%d", argNum))
		args = append(args, *params.IsProfessor)
		argNum++
	}

	// Build WHERE clause
	whereClause := ""
	if len(conditions) > 0 {
		whereClause = "WHERE " + strings.Join(conditions, " AND ")
	}

	// Count total — same joins and filters as the page query so totals agree.
	countQuery := fmt.Sprintf("SELECT COUNT(*) FROM university_staff s JOIN universities u ON s.university_id = u.id LEFT JOIN departments d ON s.department_id = d.id %s", whereClause)
	var total int
	if err := r.db.Pool.QueryRow(ctx, countQuery, args...).Scan(&total); err != nil {
		return nil, err
	}

	// Apply pagination
	limit := params.Limit
	if limit <= 0 {
		limit = 20
	}
	if limit > 100 {
		limit = 100
	}

	offset := params.Offset
	if offset < 0 {
		offset = 0
	}

	// Full query with pagination
	fullQuery := fmt.Sprintf("%s %s ORDER BY s.is_professor DESC, s.last_name ASC LIMIT %d OFFSET %d",
		baseQuery, whereClause, limit, offset)

	rows, err := r.db.Pool.Query(ctx, fullQuery, args...)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	var staff []UniversityStaff
	for rows.Next() {
		var s UniversityStaff
		// university_state is selected but discarded into a local here.
		var uniState *string
		if err := rows.Scan(
			&s.ID, &s.UniversityID, &s.DepartmentID, &s.FirstName, &s.LastName, &s.FullName,
			&s.Title, &s.AcademicTitle, &s.Position, &s.PositionType, &s.IsProfessor,
			&s.Email, &s.ProfileURL, &s.PhotoURL, &s.ORCID,
			&s.ResearchInterests, &s.CrawledAt, &s.IsActive,
			&s.UniversityName, &s.UniversityShort, &uniState,
			&s.DepartmentName, &s.PublicationCount,
		); err != nil {
			return nil, err
		}
		staff = append(staff, s)
	}

	return &StaffSearchResult{
		Staff:  staff,
		Total:  total,
		Limit:  limit,
		Offset: offset,
		Query:  params.Query,
	}, rows.Err()
}
|
||||
|
||||
// ============================================================================
|
||||
// PUBLICATIONS
|
||||
// ============================================================================
|
||||
|
||||
// CreatePublication creates or updates a publication
//
// Upserts on doi via a partial unique index (only non-NULL DOIs conflict),
// refreshing the mutable fields on conflict. If the insert still trips a
// different duplicate constraint — e.g. a publication without a DOI — the
// fallback looks up the existing row by title+year and adopts its id.
// NOTE(review): the fallback populates only p.ID; the timestamp fields are
// left at their zero values — confirm callers tolerate that.
func (r *Repository) CreatePublication(ctx context.Context, p *Publication) error {
	query := `
		INSERT INTO publications (
			title, title_en, abstract, abstract_en, year, month,
			pub_type, venue, venue_short, publisher,
			doi, isbn, issn, arxiv_id, pubmed_id,
			url, pdf_url, citation_count, keywords, topics, source, raw_data
		) VALUES (
			$1, $2, $3, $4, $5, $6, $7, $8, $9, $10,
			$11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22
		)
		ON CONFLICT (doi) WHERE doi IS NOT NULL DO UPDATE SET
			title = EXCLUDED.title,
			abstract = EXCLUDED.abstract,
			year = EXCLUDED.year,
			venue = EXCLUDED.venue,
			citation_count = EXCLUDED.citation_count,
			updated_at = NOW()
		RETURNING id, crawled_at, created_at, updated_at
	`

	// Handle potential duplicate without DOI
	err := r.db.Pool.QueryRow(ctx, query,
		p.Title, p.TitleEN, p.Abstract, p.AbstractEN, p.Year, p.Month,
		p.PubType, p.Venue, p.VenueShort, p.Publisher,
		p.DOI, p.ISBN, p.ISSN, p.ArxivID, p.PubmedID,
		p.URL, p.PDFURL, p.CitationCount, p.Keywords, p.Topics, p.Source, p.RawData,
	).Scan(&p.ID, &p.CrawledAt, &p.CreatedAt, &p.UpdatedAt)

	// Error-string matching is fragile but the pg error code is not
	// surfaced here; "duplicate" covers unique-violation messages.
	if err != nil && strings.Contains(err.Error(), "duplicate") {
		// Try to find existing publication by title and year
		findQuery := `SELECT id FROM publications WHERE title = $1 AND year = $2`
		err = r.db.Pool.QueryRow(ctx, findQuery, p.Title, p.Year).Scan(&p.ID)
	}

	return err
}
|
||||
|
||||
// LinkStaffPublication creates a link between staff and publication
|
||||
func (r *Repository) LinkStaffPublication(ctx context.Context, sp *StaffPublication) error {
|
||||
query := `
|
||||
INSERT INTO staff_publications (staff_id, publication_id, author_position, is_corresponding)
|
||||
VALUES ($1, $2, $3, $4)
|
||||
ON CONFLICT (staff_id, publication_id) DO UPDATE SET
|
||||
author_position = EXCLUDED.author_position,
|
||||
is_corresponding = EXCLUDED.is_corresponding
|
||||
`
|
||||
_, err := r.db.Pool.Exec(ctx, query,
|
||||
sp.StaffID, sp.PublicationID, sp.AuthorPosition, sp.IsCorresponding,
|
||||
)
|
||||
return err
|
||||
}
|
||||
|
||||
// GetStaffPublications retrieves all publications for a staff member
|
||||
func (r *Repository) GetStaffPublications(ctx context.Context, staffID uuid.UUID) ([]Publication, error) {
|
||||
query := `
|
||||
SELECT p.id, p.title, p.abstract, p.year, p.pub_type, p.venue, p.doi, p.url, p.citation_count
|
||||
FROM publications p
|
||||
JOIN staff_publications sp ON p.id = sp.publication_id
|
||||
WHERE sp.staff_id = $1
|
||||
ORDER BY p.year DESC NULLS LAST, p.title
|
||||
`
|
||||
|
||||
rows, err := r.db.Pool.Query(ctx, query, staffID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
var pubs []Publication
|
||||
for rows.Next() {
|
||||
var p Publication
|
||||
if err := rows.Scan(
|
||||
&p.ID, &p.Title, &p.Abstract, &p.Year, &p.PubType, &p.Venue, &p.DOI, &p.URL, &p.CitationCount,
|
||||
); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
pubs = append(pubs, p)
|
||||
}
|
||||
return pubs, rows.Err()
|
||||
}
|
||||
|
||||
// SearchPublications searches for publications
|
||||
func (r *Repository) SearchPublications(ctx context.Context, params PublicationSearchParams) (*PublicationSearchResult, error) {
|
||||
var conditions []string
|
||||
var args []interface{}
|
||||
argNum := 1
|
||||
|
||||
if params.Query != "" {
|
||||
conditions = append(conditions, fmt.Sprintf(
|
||||
`to_tsvector('german', COALESCE(title, '') || ' ' || COALESCE(abstract, '')) @@ plainto_tsquery('german', $%d)`,
|
||||
argNum))
|
||||
args = append(args, params.Query)
|
||||
argNum++
|
||||
}
|
||||
|
||||
if params.StaffID != nil {
|
||||
conditions = append(conditions, fmt.Sprintf(
|
||||
`id IN (SELECT publication_id FROM staff_publications WHERE staff_id = $%d)`,
|
||||
argNum))
|
||||
args = append(args, *params.StaffID)
|
||||
argNum++
|
||||
}
|
||||
|
||||
if params.Year != nil {
|
||||
conditions = append(conditions, fmt.Sprintf("year = $%d", argNum))
|
||||
args = append(args, *params.Year)
|
||||
argNum++
|
||||
}
|
||||
|
||||
if params.YearFrom != nil {
|
||||
conditions = append(conditions, fmt.Sprintf("year >= $%d", argNum))
|
||||
args = append(args, *params.YearFrom)
|
||||
argNum++
|
||||
}
|
||||
|
||||
if params.YearTo != nil {
|
||||
conditions = append(conditions, fmt.Sprintf("year <= $%d", argNum))
|
||||
args = append(args, *params.YearTo)
|
||||
argNum++
|
||||
}
|
||||
|
||||
if params.PubType != nil {
|
||||
conditions = append(conditions, fmt.Sprintf("pub_type = $%d", argNum))
|
||||
args = append(args, *params.PubType)
|
||||
argNum++
|
||||
}
|
||||
|
||||
whereClause := ""
|
||||
if len(conditions) > 0 {
|
||||
whereClause = "WHERE " + strings.Join(conditions, " AND ")
|
||||
}
|
||||
|
||||
// Count
|
||||
countQuery := fmt.Sprintf("SELECT COUNT(*) FROM publications %s", whereClause)
|
||||
var total int
|
||||
if err := r.db.Pool.QueryRow(ctx, countQuery, args...).Scan(&total); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Pagination
|
||||
limit := params.Limit
|
||||
if limit <= 0 {
|
||||
limit = 20
|
||||
}
|
||||
offset := params.Offset
|
||||
|
||||
// Query
|
||||
query := fmt.Sprintf(`
|
||||
SELECT id, title, abstract, year, pub_type, venue, doi, url, citation_count, keywords
|
||||
FROM publications %s
|
||||
ORDER BY year DESC NULLS LAST, citation_count DESC
|
||||
LIMIT %d OFFSET %d
|
||||
`, whereClause, limit, offset)
|
||||
|
||||
rows, err := r.db.Pool.Query(ctx, query, args...)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
var pubs []Publication
|
||||
for rows.Next() {
|
||||
var p Publication
|
||||
if err := rows.Scan(
|
||||
&p.ID, &p.Title, &p.Abstract, &p.Year, &p.PubType, &p.Venue, &p.DOI, &p.URL, &p.CitationCount, &p.Keywords,
|
||||
); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
pubs = append(pubs, p)
|
||||
}
|
||||
|
||||
return &PublicationSearchResult{
|
||||
Publications: pubs,
|
||||
Total: total,
|
||||
Limit: limit,
|
||||
Offset: offset,
|
||||
Query: params.Query,
|
||||
}, rows.Err()
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// CRAWL STATUS
|
||||
// ============================================================================
|
||||
|
||||
398
edu-search-service/internal/database/repository_staff.go
Normal file
398
edu-search-service/internal/database/repository_staff.go
Normal file
@@ -0,0 +1,398 @@
|
||||
package database
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"github.com/google/uuid"
|
||||
)
|
||||
|
||||
// ============================================================================
|
||||
// STAFF
|
||||
// ============================================================================
|
||||
|
||||
// CreateStaff creates or updates a staff member
//
// Upserts on (university_id, first_name, last_name, department_id), with a
// zero UUID standing in for a NULL department so the conflict target also
// works for staff without a department. On conflict, identity/position
// fields are overwritten, while contact and profile fields use COALESCE so
// a re-crawl that is missing a value never erases data captured earlier.
// The generated id and crawled_at/created_at/updated_at timestamps are
// scanned back into s.
func (r *Repository) CreateStaff(ctx context.Context, s *UniversityStaff) error {
	query := `
		INSERT INTO university_staff (
			university_id, department_id, first_name, last_name, full_name,
			title, academic_title, position, position_type, is_professor,
			email, phone, office, profile_url, photo_url,
			orcid, google_scholar_id, researchgate_url, linkedin_url, personal_website,
			research_interests, research_summary, supervisor_id, team_role, source_url
		) VALUES (
			$1, $2, $3, $4, $5, $6, $7, $8, $9, $10,
			$11, $12, $13, $14, $15, $16, $17, $18, $19, $20,
			$21, $22, $23, $24, $25
		)
		ON CONFLICT (university_id, first_name, last_name, COALESCE(department_id, '00000000-0000-0000-0000-000000000000'::uuid))
		DO UPDATE SET
			full_name = EXCLUDED.full_name,
			title = EXCLUDED.title,
			academic_title = EXCLUDED.academic_title,
			position = EXCLUDED.position,
			position_type = EXCLUDED.position_type,
			is_professor = EXCLUDED.is_professor,
			email = COALESCE(EXCLUDED.email, university_staff.email),
			phone = COALESCE(EXCLUDED.phone, university_staff.phone),
			office = COALESCE(EXCLUDED.office, university_staff.office),
			profile_url = COALESCE(EXCLUDED.profile_url, university_staff.profile_url),
			photo_url = COALESCE(EXCLUDED.photo_url, university_staff.photo_url),
			orcid = COALESCE(EXCLUDED.orcid, university_staff.orcid),
			google_scholar_id = COALESCE(EXCLUDED.google_scholar_id, university_staff.google_scholar_id),
			researchgate_url = COALESCE(EXCLUDED.researchgate_url, university_staff.researchgate_url),
			linkedin_url = COALESCE(EXCLUDED.linkedin_url, university_staff.linkedin_url),
			personal_website = COALESCE(EXCLUDED.personal_website, university_staff.personal_website),
			research_interests = COALESCE(EXCLUDED.research_interests, university_staff.research_interests),
			research_summary = COALESCE(EXCLUDED.research_summary, university_staff.research_summary),
			supervisor_id = COALESCE(EXCLUDED.supervisor_id, university_staff.supervisor_id),
			team_role = COALESCE(EXCLUDED.team_role, university_staff.team_role),
			source_url = COALESCE(EXCLUDED.source_url, university_staff.source_url),
			crawled_at = NOW(),
			updated_at = NOW()
		RETURNING id, crawled_at, created_at, updated_at
	`
	// Argument order must match the 25 placeholders above exactly.
	return r.db.Pool.QueryRow(ctx, query,
		s.UniversityID, s.DepartmentID, s.FirstName, s.LastName, s.FullName,
		s.Title, s.AcademicTitle, s.Position, s.PositionType, s.IsProfessor,
		s.Email, s.Phone, s.Office, s.ProfileURL, s.PhotoURL,
		s.ORCID, s.GoogleScholarID, s.ResearchgateURL, s.LinkedInURL, s.PersonalWebsite,
		s.ResearchInterests, s.ResearchSummary, s.SupervisorID, s.TeamRole, s.SourceURL,
	).Scan(&s.ID, &s.CrawledAt, &s.CreatedAt, &s.UpdatedAt)
}
|
||||
|
||||
// GetStaff retrieves a staff member by ID
//
// Reads from the v_staff_full view, so this Scan destination list is
// positionally coupled to the view's column order — changing the view
// requires updating the list below in lockstep. The nil destinations
// discard view columns with no corresponding field on UniversityStaff
// (pgx ignores nil scan targets). When the id does not exist, the driver's
// no-rows error is returned unchanged — NOTE(review): presumably callers
// check for pgx.ErrNoRows; confirm against call sites.
func (r *Repository) GetStaff(ctx context.Context, id uuid.UUID) (*UniversityStaff, error) {
	query := `SELECT * FROM v_staff_full WHERE id = $1`

	s := &UniversityStaff{}
	err := r.db.Pool.QueryRow(ctx, query, id).Scan(
		&s.ID, &s.UniversityID, &s.DepartmentID, &s.FirstName, &s.LastName, &s.FullName,
		&s.Title, &s.AcademicTitle, &s.Position, &s.PositionType, &s.IsProfessor,
		&s.Email, &s.Phone, &s.Office, &s.ProfileURL, &s.PhotoURL,
		&s.ORCID, &s.GoogleScholarID, &s.ResearchgateURL, &s.LinkedInURL, &s.PersonalWebsite,
		&s.ResearchInterests, &s.ResearchSummary, &s.CrawledAt, &s.LastVerified, &s.IsActive, &s.SourceURL,
		&s.CreatedAt, &s.UpdatedAt, &s.UniversityName, &s.UniversityShort, nil, nil,
		&s.DepartmentName, nil, &s.PublicationCount,
	)
	if err != nil {
		return nil, err
	}
	return s, nil
}
|
||||
|
||||
// SearchStaff searches for staff members
//
// The WHERE clause is assembled from whichever filters in params are set;
// argNum keeps the $N placeholders aligned with the args slice. A COUNT(*)
// over the same joins and filters yields the total before one page is
// fetched, ordered professors-first then by last name. Limit defaults to
// 20 and is capped at 100; a negative offset is clamped to 0.
func (r *Repository) SearchStaff(ctx context.Context, params StaffSearchParams) (*StaffSearchResult, error) {
	// Build query dynamically
	var conditions []string
	var args []interface{}
	argNum := 1

	baseQuery := `
		SELECT s.id, s.university_id, s.department_id, s.first_name, s.last_name, s.full_name,
			s.title, s.academic_title, s.position, s.position_type, s.is_professor,
			s.email, s.profile_url, s.photo_url, s.orcid,
			s.research_interests, s.crawled_at, s.is_active,
			u.name as university_name, u.short_name as university_short, u.state as university_state,
			d.name as department_name,
			(SELECT COUNT(*) FROM staff_publications sp WHERE sp.staff_id = s.id) as publication_count
		FROM university_staff s
		JOIN universities u ON s.university_id = u.id
		LEFT JOIN departments d ON s.department_id = d.id
	`

	// Free-text search: German full-text match on name + research summary,
	// with ILIKE substring fallbacks. One argument is referenced by three
	// placeholders carrying the same number.
	if params.Query != "" {
		conditions = append(conditions, fmt.Sprintf(
			`(to_tsvector('german', COALESCE(s.full_name, '') || ' ' || COALESCE(s.research_summary, '')) @@ plainto_tsquery('german', $%d)
			OR s.full_name ILIKE '%%' || $%d || '%%'
			OR s.last_name ILIKE '%%' || $%d || '%%')`,
			argNum, argNum, argNum))
		args = append(args, params.Query)
		argNum++
	}

	// Exact-match filters: each appends one condition and one argument.
	if params.UniversityID != nil {
		conditions = append(conditions, fmt.Sprintf("s.university_id = $%d", argNum))
		args = append(args, *params.UniversityID)
		argNum++
	}

	if params.DepartmentID != nil {
		conditions = append(conditions, fmt.Sprintf("s.department_id = $%d", argNum))
		args = append(args, *params.DepartmentID)
		argNum++
	}

	if params.State != nil {
		conditions = append(conditions, fmt.Sprintf("u.state = $%d", argNum))
		args = append(args, *params.State)
		argNum++
	}

	if params.UniType != nil {
		conditions = append(conditions, fmt.Sprintf("u.uni_type = $%d", argNum))
		args = append(args, *params.UniType)
		argNum++
	}

	if params.PositionType != nil {
		conditions = append(conditions, fmt.Sprintf("s.position_type = $%d", argNum))
		args = append(args, *params.PositionType)
		argNum++
	}

	if params.IsProfessor != nil {
		conditions = append(conditions, fmt.Sprintf("s.is_professor = $%d", argNum))
		args = append(args, *params.IsProfessor)
		argNum++
	}

	// Build WHERE clause
	whereClause := ""
	if len(conditions) > 0 {
		whereClause = "WHERE " + strings.Join(conditions, " AND ")
	}

	// Count total — same joins and filters as the page query so totals agree.
	countQuery := fmt.Sprintf("SELECT COUNT(*) FROM university_staff s JOIN universities u ON s.university_id = u.id LEFT JOIN departments d ON s.department_id = d.id %s", whereClause)
	var total int
	if err := r.db.Pool.QueryRow(ctx, countQuery, args...).Scan(&total); err != nil {
		return nil, err
	}

	// Apply pagination
	limit := params.Limit
	if limit <= 0 {
		limit = 20
	}
	if limit > 100 {
		limit = 100
	}

	offset := params.Offset
	if offset < 0 {
		offset = 0
	}

	// Full query with pagination
	fullQuery := fmt.Sprintf("%s %s ORDER BY s.is_professor DESC, s.last_name ASC LIMIT %d OFFSET %d",
		baseQuery, whereClause, limit, offset)

	rows, err := r.db.Pool.Query(ctx, fullQuery, args...)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	var staff []UniversityStaff
	for rows.Next() {
		var s UniversityStaff
		// university_state is selected but discarded into a local here.
		var uniState *string
		if err := rows.Scan(
			&s.ID, &s.UniversityID, &s.DepartmentID, &s.FirstName, &s.LastName, &s.FullName,
			&s.Title, &s.AcademicTitle, &s.Position, &s.PositionType, &s.IsProfessor,
			&s.Email, &s.ProfileURL, &s.PhotoURL, &s.ORCID,
			&s.ResearchInterests, &s.CrawledAt, &s.IsActive,
			&s.UniversityName, &s.UniversityShort, &uniState,
			&s.DepartmentName, &s.PublicationCount,
		); err != nil {
			return nil, err
		}
		staff = append(staff, s)
	}

	return &StaffSearchResult{
		Staff:  staff,
		Total:  total,
		Limit:  limit,
		Offset: offset,
		Query:  params.Query,
	}, rows.Err()
}
|
||||
|
||||
// ============================================================================
|
||||
// PUBLICATIONS
|
||||
// ============================================================================
|
||||
|
||||
// CreatePublication creates or updates a publication
|
||||
func (r *Repository) CreatePublication(ctx context.Context, p *Publication) error {
|
||||
query := `
|
||||
INSERT INTO publications (
|
||||
title, title_en, abstract, abstract_en, year, month,
|
||||
pub_type, venue, venue_short, publisher,
|
||||
doi, isbn, issn, arxiv_id, pubmed_id,
|
||||
url, pdf_url, citation_count, keywords, topics, source, raw_data
|
||||
) VALUES (
|
||||
$1, $2, $3, $4, $5, $6, $7, $8, $9, $10,
|
||||
$11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22
|
||||
)
|
||||
ON CONFLICT (doi) WHERE doi IS NOT NULL DO UPDATE SET
|
||||
title = EXCLUDED.title,
|
||||
abstract = EXCLUDED.abstract,
|
||||
year = EXCLUDED.year,
|
||||
venue = EXCLUDED.venue,
|
||||
citation_count = EXCLUDED.citation_count,
|
||||
updated_at = NOW()
|
||||
RETURNING id, crawled_at, created_at, updated_at
|
||||
`
|
||||
|
||||
// Handle potential duplicate without DOI
|
||||
err := r.db.Pool.QueryRow(ctx, query,
|
||||
p.Title, p.TitleEN, p.Abstract, p.AbstractEN, p.Year, p.Month,
|
||||
p.PubType, p.Venue, p.VenueShort, p.Publisher,
|
||||
p.DOI, p.ISBN, p.ISSN, p.ArxivID, p.PubmedID,
|
||||
p.URL, p.PDFURL, p.CitationCount, p.Keywords, p.Topics, p.Source, p.RawData,
|
||||
).Scan(&p.ID, &p.CrawledAt, &p.CreatedAt, &p.UpdatedAt)
|
||||
|
||||
if err != nil && strings.Contains(err.Error(), "duplicate") {
|
||||
// Try to find existing publication by title and year
|
||||
findQuery := `SELECT id FROM publications WHERE title = $1 AND year = $2`
|
||||
err = r.db.Pool.QueryRow(ctx, findQuery, p.Title, p.Year).Scan(&p.ID)
|
||||
}
|
||||
|
||||
return err
|
||||
}
|
||||
|
||||
// LinkStaffPublication creates a link between staff and publication
|
||||
func (r *Repository) LinkStaffPublication(ctx context.Context, sp *StaffPublication) error {
|
||||
query := `
|
||||
INSERT INTO staff_publications (staff_id, publication_id, author_position, is_corresponding)
|
||||
VALUES ($1, $2, $3, $4)
|
||||
ON CONFLICT (staff_id, publication_id) DO UPDATE SET
|
||||
author_position = EXCLUDED.author_position,
|
||||
is_corresponding = EXCLUDED.is_corresponding
|
||||
`
|
||||
_, err := r.db.Pool.Exec(ctx, query,
|
||||
sp.StaffID, sp.PublicationID, sp.AuthorPosition, sp.IsCorresponding,
|
||||
)
|
||||
return err
|
||||
}
|
||||
|
||||
// GetStaffPublications retrieves all publications for a staff member
|
||||
func (r *Repository) GetStaffPublications(ctx context.Context, staffID uuid.UUID) ([]Publication, error) {
|
||||
query := `
|
||||
SELECT p.id, p.title, p.abstract, p.year, p.pub_type, p.venue, p.doi, p.url, p.citation_count
|
||||
FROM publications p
|
||||
JOIN staff_publications sp ON p.id = sp.publication_id
|
||||
WHERE sp.staff_id = $1
|
||||
ORDER BY p.year DESC NULLS LAST, p.title
|
||||
`
|
||||
|
||||
rows, err := r.db.Pool.Query(ctx, query, staffID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
var pubs []Publication
|
||||
for rows.Next() {
|
||||
var p Publication
|
||||
if err := rows.Scan(
|
||||
&p.ID, &p.Title, &p.Abstract, &p.Year, &p.PubType, &p.Venue, &p.DOI, &p.URL, &p.CitationCount,
|
||||
); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
pubs = append(pubs, p)
|
||||
}
|
||||
return pubs, rows.Err()
|
||||
}
|
||||
|
||||
// SearchPublications searches for publications
|
||||
func (r *Repository) SearchPublications(ctx context.Context, params PublicationSearchParams) (*PublicationSearchResult, error) {
|
||||
var conditions []string
|
||||
var args []interface{}
|
||||
argNum := 1
|
||||
|
||||
if params.Query != "" {
|
||||
conditions = append(conditions, fmt.Sprintf(
|
||||
`to_tsvector('german', COALESCE(title, '') || ' ' || COALESCE(abstract, '')) @@ plainto_tsquery('german', $%d)`,
|
||||
argNum))
|
||||
args = append(args, params.Query)
|
||||
argNum++
|
||||
}
|
||||
|
||||
if params.StaffID != nil {
|
||||
conditions = append(conditions, fmt.Sprintf(
|
||||
`id IN (SELECT publication_id FROM staff_publications WHERE staff_id = $%d)`,
|
||||
argNum))
|
||||
args = append(args, *params.StaffID)
|
||||
argNum++
|
||||
}
|
||||
|
||||
if params.Year != nil {
|
||||
conditions = append(conditions, fmt.Sprintf("year = $%d", argNum))
|
||||
args = append(args, *params.Year)
|
||||
argNum++
|
||||
}
|
||||
|
||||
if params.YearFrom != nil {
|
||||
conditions = append(conditions, fmt.Sprintf("year >= $%d", argNum))
|
||||
args = append(args, *params.YearFrom)
|
||||
argNum++
|
||||
}
|
||||
|
||||
if params.YearTo != nil {
|
||||
conditions = append(conditions, fmt.Sprintf("year <= $%d", argNum))
|
||||
args = append(args, *params.YearTo)
|
||||
argNum++
|
||||
}
|
||||
|
||||
if params.PubType != nil {
|
||||
conditions = append(conditions, fmt.Sprintf("pub_type = $%d", argNum))
|
||||
args = append(args, *params.PubType)
|
||||
argNum++
|
||||
}
|
||||
|
||||
whereClause := ""
|
||||
if len(conditions) > 0 {
|
||||
whereClause = "WHERE " + strings.Join(conditions, " AND ")
|
||||
}
|
||||
|
||||
// Count
|
||||
countQuery := fmt.Sprintf("SELECT COUNT(*) FROM publications %s", whereClause)
|
||||
var total int
|
||||
if err := r.db.Pool.QueryRow(ctx, countQuery, args...).Scan(&total); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Pagination
|
||||
limit := params.Limit
|
||||
if limit <= 0 {
|
||||
limit = 20
|
||||
}
|
||||
offset := params.Offset
|
||||
|
||||
// Query
|
||||
query := fmt.Sprintf(`
|
||||
SELECT id, title, abstract, year, pub_type, venue, doi, url, citation_count, keywords
|
||||
FROM publications %s
|
||||
ORDER BY year DESC NULLS LAST, citation_count DESC
|
||||
LIMIT %d OFFSET %d
|
||||
`, whereClause, limit, offset)
|
||||
|
||||
rows, err := r.db.Pool.Query(ctx, query, args...)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
var pubs []Publication
|
||||
for rows.Next() {
|
||||
var p Publication
|
||||
if err := rows.Scan(
|
||||
&p.ID, &p.Title, &p.Abstract, &p.Year, &p.PubType, &p.Venue, &p.DOI, &p.URL, &p.CitationCount, &p.Keywords,
|
||||
); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
pubs = append(pubs, p)
|
||||
}
|
||||
|
||||
return &PublicationSearchResult{
|
||||
Publications: pubs,
|
||||
Total: total,
|
||||
Limit: limit,
|
||||
Offset: offset,
|
||||
Query: params.Query,
|
||||
}, rows.Err()
|
||||
}
|
||||
@@ -2,7 +2,6 @@ package policy
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
@@ -205,413 +204,6 @@ func (s *Store) DeletePolicy(ctx context.Context, id uuid.UUID) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// ALLOWED SOURCES
|
||||
// =============================================================================
|
||||
|
||||
// CreateSource creates a new allowed source.
|
||||
func (s *Store) CreateSource(ctx context.Context, req *CreateAllowedSourceRequest) (*AllowedSource, error) {
|
||||
trustBoost := 0.5
|
||||
if req.TrustBoost != nil {
|
||||
trustBoost = *req.TrustBoost
|
||||
}
|
||||
|
||||
source := &AllowedSource{
|
||||
ID: uuid.New(),
|
||||
PolicyID: req.PolicyID,
|
||||
Domain: req.Domain,
|
||||
Name: req.Name,
|
||||
Description: req.Description,
|
||||
License: req.License,
|
||||
LegalBasis: req.LegalBasis,
|
||||
CitationTemplate: req.CitationTemplate,
|
||||
TrustBoost: trustBoost,
|
||||
IsActive: true,
|
||||
CreatedAt: time.Now(),
|
||||
UpdatedAt: time.Now(),
|
||||
}
|
||||
|
||||
query := `
|
||||
INSERT INTO allowed_sources (id, policy_id, domain, name, description, license,
|
||||
legal_basis, citation_template, trust_boost, is_active,
|
||||
created_at, updated_at)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12)
|
||||
RETURNING id`
|
||||
|
||||
err := s.pool.QueryRow(ctx, query,
|
||||
source.ID, source.PolicyID, source.Domain, source.Name, source.Description,
|
||||
source.License, source.LegalBasis, source.CitationTemplate, source.TrustBoost,
|
||||
source.IsActive, source.CreatedAt, source.UpdatedAt,
|
||||
).Scan(&source.ID)
|
||||
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create source: %w", err)
|
||||
}
|
||||
|
||||
// Create default operation permissions
|
||||
err = s.createDefaultOperations(ctx, source.ID)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create default operations: %w", err)
|
||||
}
|
||||
|
||||
return source, nil
|
||||
}
|
||||
|
||||
// createDefaultOperations creates default operation permissions for a source.
|
||||
func (s *Store) createDefaultOperations(ctx context.Context, sourceID uuid.UUID) error {
|
||||
defaults := []struct {
|
||||
op Operation
|
||||
allowed bool
|
||||
citation bool
|
||||
}{
|
||||
{OperationLookup, true, true},
|
||||
{OperationRAG, true, true},
|
||||
{OperationTraining, false, false}, // VERBOTEN by default
|
||||
{OperationExport, true, true},
|
||||
}
|
||||
|
||||
for _, d := range defaults {
|
||||
query := `
|
||||
INSERT INTO operation_permissions (id, source_id, operation, is_allowed, requires_citation, created_at, updated_at)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7)`
|
||||
_, err := s.pool.Exec(ctx, query, uuid.New(), sourceID, d.op, d.allowed, d.citation, time.Now(), time.Now())
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// GetSource retrieves a source by ID.
|
||||
func (s *Store) GetSource(ctx context.Context, id uuid.UUID) (*AllowedSource, error) {
|
||||
query := `
|
||||
SELECT als.id, als.policy_id, als.domain, als.name, als.description, als.license,
|
||||
als.legal_basis, als.citation_template, als.trust_boost, als.is_active,
|
||||
als.created_at, als.updated_at, sp.name as policy_name
|
||||
FROM allowed_sources als
|
||||
JOIN source_policies sp ON als.policy_id = sp.id
|
||||
WHERE als.id = $1`
|
||||
|
||||
source := &AllowedSource{}
|
||||
err := s.pool.QueryRow(ctx, query, id).Scan(
|
||||
&source.ID, &source.PolicyID, &source.Domain, &source.Name, &source.Description,
|
||||
&source.License, &source.LegalBasis, &source.CitationTemplate, &source.TrustBoost,
|
||||
&source.IsActive, &source.CreatedAt, &source.UpdatedAt, &source.PolicyName,
|
||||
)
|
||||
|
||||
if err == pgx.ErrNoRows {
|
||||
return nil, nil
|
||||
}
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get source: %w", err)
|
||||
}
|
||||
|
||||
// Load operations
|
||||
ops, err := s.GetOperationsBySourceID(ctx, source.ID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
source.Operations = ops
|
||||
|
||||
return source, nil
|
||||
}
|
||||
|
||||
// GetSourceByDomain retrieves a source by domain with optional bundesland filter.
|
||||
func (s *Store) GetSourceByDomain(ctx context.Context, domain string, bundesland *Bundesland) (*AllowedSource, error) {
|
||||
query := `
|
||||
SELECT als.id, als.policy_id, als.domain, als.name, als.description, als.license,
|
||||
als.legal_basis, als.citation_template, als.trust_boost, als.is_active,
|
||||
als.created_at, als.updated_at
|
||||
FROM allowed_sources als
|
||||
JOIN source_policies sp ON als.policy_id = sp.id
|
||||
WHERE als.is_active = true
|
||||
AND sp.is_active = true
|
||||
AND (als.domain = $1 OR $1 LIKE '%.' || als.domain)
|
||||
AND (sp.bundesland IS NULL OR sp.bundesland = $2)
|
||||
LIMIT 1`
|
||||
|
||||
source := &AllowedSource{}
|
||||
err := s.pool.QueryRow(ctx, query, domain, bundesland).Scan(
|
||||
&source.ID, &source.PolicyID, &source.Domain, &source.Name, &source.Description,
|
||||
&source.License, &source.LegalBasis, &source.CitationTemplate, &source.TrustBoost,
|
||||
&source.IsActive, &source.CreatedAt, &source.UpdatedAt,
|
||||
)
|
||||
|
||||
if err == pgx.ErrNoRows {
|
||||
return nil, nil
|
||||
}
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get source by domain: %w", err)
|
||||
}
|
||||
|
||||
// Load operations
|
||||
ops, err := s.GetOperationsBySourceID(ctx, source.ID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
source.Operations = ops
|
||||
|
||||
return source, nil
|
||||
}
|
||||
|
||||
// ListSources retrieves sources with optional filters.
|
||||
func (s *Store) ListSources(ctx context.Context, filter *SourceListFilter) ([]AllowedSource, int, error) {
|
||||
baseQuery := `FROM allowed_sources als JOIN source_policies sp ON als.policy_id = sp.id WHERE 1=1`
|
||||
args := []interface{}{}
|
||||
argCount := 0
|
||||
|
||||
if filter.PolicyID != nil {
|
||||
argCount++
|
||||
baseQuery += fmt.Sprintf(" AND als.policy_id = $%d", argCount)
|
||||
args = append(args, *filter.PolicyID)
|
||||
}
|
||||
|
||||
if filter.Domain != nil {
|
||||
argCount++
|
||||
baseQuery += fmt.Sprintf(" AND als.domain ILIKE $%d", argCount)
|
||||
args = append(args, "%"+*filter.Domain+"%")
|
||||
}
|
||||
|
||||
if filter.License != nil {
|
||||
argCount++
|
||||
baseQuery += fmt.Sprintf(" AND als.license = $%d", argCount)
|
||||
args = append(args, *filter.License)
|
||||
}
|
||||
|
||||
if filter.IsActive != nil {
|
||||
argCount++
|
||||
baseQuery += fmt.Sprintf(" AND als.is_active = $%d", argCount)
|
||||
args = append(args, *filter.IsActive)
|
||||
}
|
||||
|
||||
// Count query
|
||||
var total int
|
||||
countQuery := "SELECT COUNT(*) " + baseQuery
|
||||
err := s.pool.QueryRow(ctx, countQuery, args...).Scan(&total)
|
||||
if err != nil {
|
||||
return nil, 0, fmt.Errorf("failed to count sources: %w", err)
|
||||
}
|
||||
|
||||
// Data query
|
||||
dataQuery := `SELECT als.id, als.policy_id, als.domain, als.name, als.description, als.license,
|
||||
als.legal_basis, als.citation_template, als.trust_boost, als.is_active,
|
||||
als.created_at, als.updated_at, sp.name as policy_name ` + baseQuery +
|
||||
` ORDER BY als.created_at DESC`
|
||||
|
||||
if filter.Limit > 0 {
|
||||
argCount++
|
||||
dataQuery += fmt.Sprintf(" LIMIT $%d", argCount)
|
||||
args = append(args, filter.Limit)
|
||||
}
|
||||
if filter.Offset > 0 {
|
||||
argCount++
|
||||
dataQuery += fmt.Sprintf(" OFFSET $%d", argCount)
|
||||
args = append(args, filter.Offset)
|
||||
}
|
||||
|
||||
rows, err := s.pool.Query(ctx, dataQuery, args...)
|
||||
if err != nil {
|
||||
return nil, 0, fmt.Errorf("failed to list sources: %w", err)
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
sources := []AllowedSource{}
|
||||
for rows.Next() {
|
||||
var src AllowedSource
|
||||
err := rows.Scan(
|
||||
&src.ID, &src.PolicyID, &src.Domain, &src.Name, &src.Description,
|
||||
&src.License, &src.LegalBasis, &src.CitationTemplate, &src.TrustBoost,
|
||||
&src.IsActive, &src.CreatedAt, &src.UpdatedAt, &src.PolicyName,
|
||||
)
|
||||
if err != nil {
|
||||
return nil, 0, fmt.Errorf("failed to scan source: %w", err)
|
||||
}
|
||||
sources = append(sources, src)
|
||||
}
|
||||
|
||||
return sources, total, nil
|
||||
}
|
||||
|
||||
// UpdateSource updates an existing source.
|
||||
func (s *Store) UpdateSource(ctx context.Context, id uuid.UUID, req *UpdateAllowedSourceRequest) (*AllowedSource, error) {
|
||||
source, err := s.GetSource(ctx, id)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if source == nil {
|
||||
return nil, fmt.Errorf("source not found")
|
||||
}
|
||||
|
||||
if req.Domain != nil {
|
||||
source.Domain = *req.Domain
|
||||
}
|
||||
if req.Name != nil {
|
||||
source.Name = *req.Name
|
||||
}
|
||||
if req.Description != nil {
|
||||
source.Description = req.Description
|
||||
}
|
||||
if req.License != nil {
|
||||
source.License = *req.License
|
||||
}
|
||||
if req.LegalBasis != nil {
|
||||
source.LegalBasis = req.LegalBasis
|
||||
}
|
||||
if req.CitationTemplate != nil {
|
||||
source.CitationTemplate = req.CitationTemplate
|
||||
}
|
||||
if req.TrustBoost != nil {
|
||||
source.TrustBoost = *req.TrustBoost
|
||||
}
|
||||
if req.IsActive != nil {
|
||||
source.IsActive = *req.IsActive
|
||||
}
|
||||
source.UpdatedAt = time.Now()
|
||||
|
||||
query := `
|
||||
UPDATE allowed_sources
|
||||
SET domain = $2, name = $3, description = $4, license = $5, legal_basis = $6,
|
||||
citation_template = $7, trust_boost = $8, is_active = $9, updated_at = $10
|
||||
WHERE id = $1`
|
||||
|
||||
_, err = s.pool.Exec(ctx, query,
|
||||
id, source.Domain, source.Name, source.Description, source.License,
|
||||
source.LegalBasis, source.CitationTemplate, source.TrustBoost,
|
||||
source.IsActive, source.UpdatedAt,
|
||||
)
|
||||
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to update source: %w", err)
|
||||
}
|
||||
|
||||
return source, nil
|
||||
}
|
||||
|
||||
// DeleteSource deletes a source by ID.
|
||||
func (s *Store) DeleteSource(ctx context.Context, id uuid.UUID) error {
|
||||
query := `DELETE FROM allowed_sources WHERE id = $1`
|
||||
_, err := s.pool.Exec(ctx, query, id)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to delete source: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// OPERATION PERMISSIONS
|
||||
// =============================================================================
|
||||
|
||||
// GetOperationsBySourceID retrieves all operation permissions for a source.
|
||||
func (s *Store) GetOperationsBySourceID(ctx context.Context, sourceID uuid.UUID) ([]OperationPermission, error) {
|
||||
query := `
|
||||
SELECT id, source_id, operation, is_allowed, requires_citation, notes, created_at, updated_at
|
||||
FROM operation_permissions
|
||||
WHERE source_id = $1
|
||||
ORDER BY operation`
|
||||
|
||||
rows, err := s.pool.Query(ctx, query, sourceID)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get operations: %w", err)
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
ops := []OperationPermission{}
|
||||
for rows.Next() {
|
||||
var op OperationPermission
|
||||
err := rows.Scan(
|
||||
&op.ID, &op.SourceID, &op.Operation, &op.IsAllowed,
|
||||
&op.RequiresCitation, &op.Notes, &op.CreatedAt, &op.UpdatedAt,
|
||||
)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to scan operation: %w", err)
|
||||
}
|
||||
ops = append(ops, op)
|
||||
}
|
||||
|
||||
return ops, nil
|
||||
}
|
||||
|
||||
// UpdateOperationPermission updates an operation permission.
|
||||
func (s *Store) UpdateOperationPermission(ctx context.Context, id uuid.UUID, req *UpdateOperationPermissionRequest) (*OperationPermission, error) {
|
||||
query := `SELECT id, source_id, operation, is_allowed, requires_citation, notes, created_at, updated_at
|
||||
FROM operation_permissions WHERE id = $1`
|
||||
|
||||
op := &OperationPermission{}
|
||||
err := s.pool.QueryRow(ctx, query, id).Scan(
|
||||
&op.ID, &op.SourceID, &op.Operation, &op.IsAllowed,
|
||||
&op.RequiresCitation, &op.Notes, &op.CreatedAt, &op.UpdatedAt,
|
||||
)
|
||||
if err == pgx.ErrNoRows {
|
||||
return nil, fmt.Errorf("operation permission not found")
|
||||
}
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get operation: %w", err)
|
||||
}
|
||||
|
||||
if req.IsAllowed != nil {
|
||||
op.IsAllowed = *req.IsAllowed
|
||||
}
|
||||
if req.RequiresCitation != nil {
|
||||
op.RequiresCitation = *req.RequiresCitation
|
||||
}
|
||||
if req.Notes != nil {
|
||||
op.Notes = req.Notes
|
||||
}
|
||||
op.UpdatedAt = time.Now()
|
||||
|
||||
updateQuery := `
|
||||
UPDATE operation_permissions
|
||||
SET is_allowed = $2, requires_citation = $3, notes = $4, updated_at = $5
|
||||
WHERE id = $1`
|
||||
|
||||
_, err = s.pool.Exec(ctx, updateQuery, id, op.IsAllowed, op.RequiresCitation, op.Notes, op.UpdatedAt)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to update operation: %w", err)
|
||||
}
|
||||
|
||||
return op, nil
|
||||
}
|
||||
|
||||
// GetOperationsMatrix retrieves all operation permissions grouped by source.
|
||||
func (s *Store) GetOperationsMatrix(ctx context.Context) ([]AllowedSource, error) {
|
||||
query := `
|
||||
SELECT als.id, als.domain, als.name, als.license, als.is_active,
|
||||
sp.name as policy_name, sp.bundesland
|
||||
FROM allowed_sources als
|
||||
JOIN source_policies sp ON als.policy_id = sp.id
|
||||
WHERE als.is_active = true AND sp.is_active = true
|
||||
ORDER BY sp.bundesland NULLS FIRST, als.name`
|
||||
|
||||
rows, err := s.pool.Query(ctx, query)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get operations matrix: %w", err)
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
sources := []AllowedSource{}
|
||||
for rows.Next() {
|
||||
var src AllowedSource
|
||||
var bundesland *Bundesland
|
||||
err := rows.Scan(
|
||||
&src.ID, &src.Domain, &src.Name, &src.License, &src.IsActive,
|
||||
&src.PolicyName, &bundesland,
|
||||
)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to scan source: %w", err)
|
||||
}
|
||||
|
||||
// Load operations for each source
|
||||
ops, err := s.GetOperationsBySourceID(ctx, src.ID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
src.Operations = ops
|
||||
sources = append(sources, src)
|
||||
}
|
||||
|
||||
return sources, nil
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// PII RULES
|
||||
// =============================================================================
|
||||
@@ -765,404 +357,3 @@ func (s *Store) DeletePIIRule(ctx context.Context, id uuid.UUID) error {
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// AUDIT LOG
|
||||
// =============================================================================
|
||||
|
||||
// CreateAuditLog creates a new audit log entry.
|
||||
func (s *Store) CreateAuditLog(ctx context.Context, entry *PolicyAuditLog) error {
|
||||
entry.ID = uuid.New()
|
||||
entry.CreatedAt = time.Now()
|
||||
|
||||
query := `
|
||||
INSERT INTO policy_audit_log (id, action, entity_type, entity_id, old_value, new_value,
|
||||
user_id, user_email, ip_address, user_agent, created_at)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11)`
|
||||
|
||||
_, err := s.pool.Exec(ctx, query,
|
||||
entry.ID, entry.Action, entry.EntityType, entry.EntityID,
|
||||
entry.OldValue, entry.NewValue, entry.UserID, entry.UserEmail,
|
||||
entry.IPAddress, entry.UserAgent, entry.CreatedAt,
|
||||
)
|
||||
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create audit log: %w", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// ListAuditLogs retrieves audit logs with filters.
|
||||
func (s *Store) ListAuditLogs(ctx context.Context, filter *AuditLogFilter) ([]PolicyAuditLog, int, error) {
|
||||
baseQuery := `FROM policy_audit_log WHERE 1=1`
|
||||
args := []interface{}{}
|
||||
argCount := 0
|
||||
|
||||
if filter.EntityType != nil {
|
||||
argCount++
|
||||
baseQuery += fmt.Sprintf(" AND entity_type = $%d", argCount)
|
||||
args = append(args, *filter.EntityType)
|
||||
}
|
||||
|
||||
if filter.EntityID != nil {
|
||||
argCount++
|
||||
baseQuery += fmt.Sprintf(" AND entity_id = $%d", argCount)
|
||||
args = append(args, *filter.EntityID)
|
||||
}
|
||||
|
||||
if filter.Action != nil {
|
||||
argCount++
|
||||
baseQuery += fmt.Sprintf(" AND action = $%d", argCount)
|
||||
args = append(args, *filter.Action)
|
||||
}
|
||||
|
||||
if filter.UserEmail != nil {
|
||||
argCount++
|
||||
baseQuery += fmt.Sprintf(" AND user_email ILIKE $%d", argCount)
|
||||
args = append(args, "%"+*filter.UserEmail+"%")
|
||||
}
|
||||
|
||||
if filter.FromDate != nil {
|
||||
argCount++
|
||||
baseQuery += fmt.Sprintf(" AND created_at >= $%d", argCount)
|
||||
args = append(args, *filter.FromDate)
|
||||
}
|
||||
|
||||
if filter.ToDate != nil {
|
||||
argCount++
|
||||
baseQuery += fmt.Sprintf(" AND created_at <= $%d", argCount)
|
||||
args = append(args, *filter.ToDate)
|
||||
}
|
||||
|
||||
// Count query
|
||||
var total int
|
||||
countQuery := "SELECT COUNT(*) " + baseQuery
|
||||
err := s.pool.QueryRow(ctx, countQuery, args...).Scan(&total)
|
||||
if err != nil {
|
||||
return nil, 0, fmt.Errorf("failed to count audit logs: %w", err)
|
||||
}
|
||||
|
||||
// Data query
|
||||
dataQuery := `SELECT id, action, entity_type, entity_id, old_value, new_value,
|
||||
user_id, user_email, ip_address, user_agent, created_at ` + baseQuery +
|
||||
` ORDER BY created_at DESC`
|
||||
|
||||
if filter.Limit > 0 {
|
||||
argCount++
|
||||
dataQuery += fmt.Sprintf(" LIMIT $%d", argCount)
|
||||
args = append(args, filter.Limit)
|
||||
}
|
||||
if filter.Offset > 0 {
|
||||
argCount++
|
||||
dataQuery += fmt.Sprintf(" OFFSET $%d", argCount)
|
||||
args = append(args, filter.Offset)
|
||||
}
|
||||
|
||||
rows, err := s.pool.Query(ctx, dataQuery, args...)
|
||||
if err != nil {
|
||||
return nil, 0, fmt.Errorf("failed to list audit logs: %w", err)
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
logs := []PolicyAuditLog{}
|
||||
for rows.Next() {
|
||||
var l PolicyAuditLog
|
||||
err := rows.Scan(
|
||||
&l.ID, &l.Action, &l.EntityType, &l.EntityID, &l.OldValue, &l.NewValue,
|
||||
&l.UserID, &l.UserEmail, &l.IPAddress, &l.UserAgent, &l.CreatedAt,
|
||||
)
|
||||
if err != nil {
|
||||
return nil, 0, fmt.Errorf("failed to scan audit log: %w", err)
|
||||
}
|
||||
logs = append(logs, l)
|
||||
}
|
||||
|
||||
return logs, total, nil
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// BLOCKED CONTENT LOG
|
||||
// =============================================================================
|
||||
|
||||
// CreateBlockedContentLog creates a new blocked content log entry.
|
||||
func (s *Store) CreateBlockedContentLog(ctx context.Context, entry *BlockedContentLog) error {
|
||||
entry.ID = uuid.New()
|
||||
entry.CreatedAt = time.Now()
|
||||
|
||||
query := `
|
||||
INSERT INTO blocked_content_log (id, url, domain, block_reason, matched_rule_id, details, created_at)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7)`
|
||||
|
||||
_, err := s.pool.Exec(ctx, query,
|
||||
entry.ID, entry.URL, entry.Domain, entry.BlockReason,
|
||||
entry.MatchedRuleID, entry.Details, entry.CreatedAt,
|
||||
)
|
||||
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create blocked content log: %w", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// ListBlockedContent retrieves blocked content logs with filters.
|
||||
func (s *Store) ListBlockedContent(ctx context.Context, filter *BlockedContentFilter) ([]BlockedContentLog, int, error) {
|
||||
baseQuery := `FROM blocked_content_log WHERE 1=1`
|
||||
args := []interface{}{}
|
||||
argCount := 0
|
||||
|
||||
if filter.Domain != nil {
|
||||
argCount++
|
||||
baseQuery += fmt.Sprintf(" AND domain ILIKE $%d", argCount)
|
||||
args = append(args, "%"+*filter.Domain+"%")
|
||||
}
|
||||
|
||||
if filter.BlockReason != nil {
|
||||
argCount++
|
||||
baseQuery += fmt.Sprintf(" AND block_reason = $%d", argCount)
|
||||
args = append(args, *filter.BlockReason)
|
||||
}
|
||||
|
||||
if filter.FromDate != nil {
|
||||
argCount++
|
||||
baseQuery += fmt.Sprintf(" AND created_at >= $%d", argCount)
|
||||
args = append(args, *filter.FromDate)
|
||||
}
|
||||
|
||||
if filter.ToDate != nil {
|
||||
argCount++
|
||||
baseQuery += fmt.Sprintf(" AND created_at <= $%d", argCount)
|
||||
args = append(args, *filter.ToDate)
|
||||
}
|
||||
|
||||
// Count query
|
||||
var total int
|
||||
countQuery := "SELECT COUNT(*) " + baseQuery
|
||||
err := s.pool.QueryRow(ctx, countQuery, args...).Scan(&total)
|
||||
if err != nil {
|
||||
return nil, 0, fmt.Errorf("failed to count blocked content: %w", err)
|
||||
}
|
||||
|
||||
// Data query
|
||||
dataQuery := `SELECT id, url, domain, block_reason, matched_rule_id, details, created_at ` + baseQuery +
|
||||
` ORDER BY created_at DESC`
|
||||
|
||||
if filter.Limit > 0 {
|
||||
argCount++
|
||||
dataQuery += fmt.Sprintf(" LIMIT $%d", argCount)
|
||||
args = append(args, filter.Limit)
|
||||
}
|
||||
if filter.Offset > 0 {
|
||||
argCount++
|
||||
dataQuery += fmt.Sprintf(" OFFSET $%d", argCount)
|
||||
args = append(args, filter.Offset)
|
||||
}
|
||||
|
||||
rows, err := s.pool.Query(ctx, dataQuery, args...)
|
||||
if err != nil {
|
||||
return nil, 0, fmt.Errorf("failed to list blocked content: %w", err)
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
logs := []BlockedContentLog{}
|
||||
for rows.Next() {
|
||||
var l BlockedContentLog
|
||||
err := rows.Scan(
|
||||
&l.ID, &l.URL, &l.Domain, &l.BlockReason,
|
||||
&l.MatchedRuleID, &l.Details, &l.CreatedAt,
|
||||
)
|
||||
if err != nil {
|
||||
return nil, 0, fmt.Errorf("failed to scan blocked content: %w", err)
|
||||
}
|
||||
logs = append(logs, l)
|
||||
}
|
||||
|
||||
return logs, total, nil
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// STATISTICS
|
||||
// =============================================================================
|
||||
|
||||
// GetStats retrieves aggregated statistics for the policy system.
|
||||
func (s *Store) GetStats(ctx context.Context) (*PolicyStats, error) {
|
||||
stats := &PolicyStats{
|
||||
SourcesByLicense: make(map[string]int),
|
||||
BlocksByReason: make(map[string]int),
|
||||
}
|
||||
|
||||
// Active policies
|
||||
err := s.pool.QueryRow(ctx, `SELECT COUNT(*) FROM source_policies WHERE is_active = true`).Scan(&stats.ActivePolicies)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to count active policies: %w", err)
|
||||
}
|
||||
|
||||
// Total sources
|
||||
err = s.pool.QueryRow(ctx, `SELECT COUNT(*) FROM allowed_sources`).Scan(&stats.TotalSources)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to count total sources: %w", err)
|
||||
}
|
||||
|
||||
// Active sources
|
||||
err = s.pool.QueryRow(ctx, `SELECT COUNT(*) FROM allowed_sources WHERE is_active = true`).Scan(&stats.ActiveSources)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to count active sources: %w", err)
|
||||
}
|
||||
|
||||
// Blocked today
|
||||
err = s.pool.QueryRow(ctx, `SELECT COUNT(*) FROM blocked_content_log WHERE created_at >= CURRENT_DATE`).Scan(&stats.BlockedToday)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to count blocked today: %w", err)
|
||||
}
|
||||
|
||||
// Blocked total
|
||||
err = s.pool.QueryRow(ctx, `SELECT COUNT(*) FROM blocked_content_log`).Scan(&stats.BlockedTotal)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to count blocked total: %w", err)
|
||||
}
|
||||
|
||||
// Active PII rules
|
||||
err = s.pool.QueryRow(ctx, `SELECT COUNT(*) FROM pii_rules WHERE is_active = true`).Scan(&stats.PIIRulesActive)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to count active PII rules: %w", err)
|
||||
}
|
||||
|
||||
// Sources by license
|
||||
rows, err := s.pool.Query(ctx, `SELECT license, COUNT(*) FROM allowed_sources GROUP BY license`)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to count sources by license: %w", err)
|
||||
}
|
||||
defer rows.Close()
|
||||
for rows.Next() {
|
||||
var license string
|
||||
var count int
|
||||
if err := rows.Scan(&license, &count); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stats.SourcesByLicense[license] = count
|
||||
}
|
||||
|
||||
// Blocks by reason
|
||||
rows, err = s.pool.Query(ctx, `SELECT block_reason, COUNT(*) FROM blocked_content_log GROUP BY block_reason`)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to count blocks by reason: %w", err)
|
||||
}
|
||||
defer rows.Close()
|
||||
for rows.Next() {
|
||||
var reason string
|
||||
var count int
|
||||
if err := rows.Scan(&reason, &count); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stats.BlocksByReason[reason] = count
|
||||
}
|
||||
|
||||
// Compliance score (simplified: active sources / total sources)
|
||||
if stats.TotalSources > 0 {
|
||||
stats.ComplianceScore = float64(stats.ActiveSources) / float64(stats.TotalSources) * 100
|
||||
}
|
||||
|
||||
return stats, nil
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// YAML LOADER
|
||||
// =============================================================================
|
||||
|
||||
// LoadFromYAML loads initial policy data from YAML configuration.
|
||||
func (s *Store) LoadFromYAML(ctx context.Context, config *BundeslaenderConfig) error {
|
||||
// Load federal policy
|
||||
if config.Federal.Name != "" {
|
||||
err := s.loadPolicy(ctx, nil, &config.Federal, &config.DefaultOperations)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to load federal policy: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Load Bundesland policies
|
||||
for code, policyConfig := range config.Bundeslaender {
|
||||
if code == "federal" || code == "default_operations" || code == "pii_rules" {
|
||||
continue
|
||||
}
|
||||
bl := Bundesland(code)
|
||||
err := s.loadPolicy(ctx, &bl, &policyConfig, &config.DefaultOperations)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to load policy for %s: %w", code, err)
|
||||
}
|
||||
}
|
||||
|
||||
// Load PII rules
|
||||
for _, ruleConfig := range config.PIIRules {
|
||||
err := s.loadPIIRule(ctx, &ruleConfig)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to load PII rule %s: %w", ruleConfig.Name, err)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *Store) loadPolicy(ctx context.Context, bundesland *Bundesland, config *PolicyConfig, ops *OperationsConfig) error {
|
||||
// Create policy
|
||||
policy, err := s.CreatePolicy(ctx, &CreateSourcePolicyRequest{
|
||||
Name: config.Name,
|
||||
Bundesland: bundesland,
|
||||
})
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Create sources
|
||||
for _, srcConfig := range config.Sources {
|
||||
trustBoost := 0.5
|
||||
if srcConfig.TrustBoost > 0 {
|
||||
trustBoost = srcConfig.TrustBoost
|
||||
}
|
||||
|
||||
var legalBasis, citation *string
|
||||
if srcConfig.LegalBasis != "" {
|
||||
legalBasis = &srcConfig.LegalBasis
|
||||
}
|
||||
if srcConfig.CitationTemplate != "" {
|
||||
citation = &srcConfig.CitationTemplate
|
||||
}
|
||||
|
||||
_, err := s.CreateSource(ctx, &CreateAllowedSourceRequest{
|
||||
PolicyID: policy.ID,
|
||||
Domain: srcConfig.Domain,
|
||||
Name: srcConfig.Name,
|
||||
License: License(srcConfig.License),
|
||||
LegalBasis: legalBasis,
|
||||
CitationTemplate: citation,
|
||||
TrustBoost: &trustBoost,
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create source %s: %w", srcConfig.Domain, err)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *Store) loadPIIRule(ctx context.Context, config *PIIRuleConfig) error {
|
||||
severity := PIISeverityBlock
|
||||
if config.Severity != "" {
|
||||
severity = PIISeverity(config.Severity)
|
||||
}
|
||||
|
||||
_, err := s.CreatePIIRule(ctx, &CreatePIIRuleRequest{
|
||||
Name: config.Name,
|
||||
RuleType: PIIRuleType(config.Type),
|
||||
Pattern: config.Pattern,
|
||||
Severity: severity,
|
||||
})
|
||||
|
||||
return err
|
||||
}
|
||||
|
||||
// ToJSON converts an entity to JSON for audit logging.
|
||||
func ToJSON(v interface{}) json.RawMessage {
|
||||
data, _ := json.Marshal(v)
|
||||
return data
|
||||
}
|
||||
|
||||
411
edu-search-service/internal/policy/store_audit.go
Normal file
411
edu-search-service/internal/policy/store_audit.go
Normal file
@@ -0,0 +1,411 @@
|
||||
package policy
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/google/uuid"
|
||||
)
|
||||
|
||||
// =============================================================================
|
||||
// AUDIT LOG
|
||||
// =============================================================================
|
||||
|
||||
// CreateAuditLog creates a new audit log entry.
|
||||
func (s *Store) CreateAuditLog(ctx context.Context, entry *PolicyAuditLog) error {
|
||||
entry.ID = uuid.New()
|
||||
entry.CreatedAt = time.Now()
|
||||
|
||||
query := `
|
||||
INSERT INTO policy_audit_log (id, action, entity_type, entity_id, old_value, new_value,
|
||||
user_id, user_email, ip_address, user_agent, created_at)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11)`
|
||||
|
||||
_, err := s.pool.Exec(ctx, query,
|
||||
entry.ID, entry.Action, entry.EntityType, entry.EntityID,
|
||||
entry.OldValue, entry.NewValue, entry.UserID, entry.UserEmail,
|
||||
entry.IPAddress, entry.UserAgent, entry.CreatedAt,
|
||||
)
|
||||
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create audit log: %w", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// ListAuditLogs retrieves audit logs with filters.
|
||||
func (s *Store) ListAuditLogs(ctx context.Context, filter *AuditLogFilter) ([]PolicyAuditLog, int, error) {
|
||||
baseQuery := `FROM policy_audit_log WHERE 1=1`
|
||||
args := []interface{}{}
|
||||
argCount := 0
|
||||
|
||||
if filter.EntityType != nil {
|
||||
argCount++
|
||||
baseQuery += fmt.Sprintf(" AND entity_type = $%d", argCount)
|
||||
args = append(args, *filter.EntityType)
|
||||
}
|
||||
|
||||
if filter.EntityID != nil {
|
||||
argCount++
|
||||
baseQuery += fmt.Sprintf(" AND entity_id = $%d", argCount)
|
||||
args = append(args, *filter.EntityID)
|
||||
}
|
||||
|
||||
if filter.Action != nil {
|
||||
argCount++
|
||||
baseQuery += fmt.Sprintf(" AND action = $%d", argCount)
|
||||
args = append(args, *filter.Action)
|
||||
}
|
||||
|
||||
if filter.UserEmail != nil {
|
||||
argCount++
|
||||
baseQuery += fmt.Sprintf(" AND user_email ILIKE $%d", argCount)
|
||||
args = append(args, "%"+*filter.UserEmail+"%")
|
||||
}
|
||||
|
||||
if filter.FromDate != nil {
|
||||
argCount++
|
||||
baseQuery += fmt.Sprintf(" AND created_at >= $%d", argCount)
|
||||
args = append(args, *filter.FromDate)
|
||||
}
|
||||
|
||||
if filter.ToDate != nil {
|
||||
argCount++
|
||||
baseQuery += fmt.Sprintf(" AND created_at <= $%d", argCount)
|
||||
args = append(args, *filter.ToDate)
|
||||
}
|
||||
|
||||
// Count query
|
||||
var total int
|
||||
countQuery := "SELECT COUNT(*) " + baseQuery
|
||||
err := s.pool.QueryRow(ctx, countQuery, args...).Scan(&total)
|
||||
if err != nil {
|
||||
return nil, 0, fmt.Errorf("failed to count audit logs: %w", err)
|
||||
}
|
||||
|
||||
// Data query
|
||||
dataQuery := `SELECT id, action, entity_type, entity_id, old_value, new_value,
|
||||
user_id, user_email, ip_address, user_agent, created_at ` + baseQuery +
|
||||
` ORDER BY created_at DESC`
|
||||
|
||||
if filter.Limit > 0 {
|
||||
argCount++
|
||||
dataQuery += fmt.Sprintf(" LIMIT $%d", argCount)
|
||||
args = append(args, filter.Limit)
|
||||
}
|
||||
if filter.Offset > 0 {
|
||||
argCount++
|
||||
dataQuery += fmt.Sprintf(" OFFSET $%d", argCount)
|
||||
args = append(args, filter.Offset)
|
||||
}
|
||||
|
||||
rows, err := s.pool.Query(ctx, dataQuery, args...)
|
||||
if err != nil {
|
||||
return nil, 0, fmt.Errorf("failed to list audit logs: %w", err)
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
logs := []PolicyAuditLog{}
|
||||
for rows.Next() {
|
||||
var l PolicyAuditLog
|
||||
err := rows.Scan(
|
||||
&l.ID, &l.Action, &l.EntityType, &l.EntityID, &l.OldValue, &l.NewValue,
|
||||
&l.UserID, &l.UserEmail, &l.IPAddress, &l.UserAgent, &l.CreatedAt,
|
||||
)
|
||||
if err != nil {
|
||||
return nil, 0, fmt.Errorf("failed to scan audit log: %w", err)
|
||||
}
|
||||
logs = append(logs, l)
|
||||
}
|
||||
|
||||
return logs, total, nil
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// BLOCKED CONTENT LOG
|
||||
// =============================================================================
|
||||
|
||||
// CreateBlockedContentLog creates a new blocked content log entry.
|
||||
func (s *Store) CreateBlockedContentLog(ctx context.Context, entry *BlockedContentLog) error {
|
||||
entry.ID = uuid.New()
|
||||
entry.CreatedAt = time.Now()
|
||||
|
||||
query := `
|
||||
INSERT INTO blocked_content_log (id, url, domain, block_reason, matched_rule_id, details, created_at)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7)`
|
||||
|
||||
_, err := s.pool.Exec(ctx, query,
|
||||
entry.ID, entry.URL, entry.Domain, entry.BlockReason,
|
||||
entry.MatchedRuleID, entry.Details, entry.CreatedAt,
|
||||
)
|
||||
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create blocked content log: %w", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// ListBlockedContent retrieves blocked content logs with filters.
|
||||
func (s *Store) ListBlockedContent(ctx context.Context, filter *BlockedContentFilter) ([]BlockedContentLog, int, error) {
|
||||
baseQuery := `FROM blocked_content_log WHERE 1=1`
|
||||
args := []interface{}{}
|
||||
argCount := 0
|
||||
|
||||
if filter.Domain != nil {
|
||||
argCount++
|
||||
baseQuery += fmt.Sprintf(" AND domain ILIKE $%d", argCount)
|
||||
args = append(args, "%"+*filter.Domain+"%")
|
||||
}
|
||||
|
||||
if filter.BlockReason != nil {
|
||||
argCount++
|
||||
baseQuery += fmt.Sprintf(" AND block_reason = $%d", argCount)
|
||||
args = append(args, *filter.BlockReason)
|
||||
}
|
||||
|
||||
if filter.FromDate != nil {
|
||||
argCount++
|
||||
baseQuery += fmt.Sprintf(" AND created_at >= $%d", argCount)
|
||||
args = append(args, *filter.FromDate)
|
||||
}
|
||||
|
||||
if filter.ToDate != nil {
|
||||
argCount++
|
||||
baseQuery += fmt.Sprintf(" AND created_at <= $%d", argCount)
|
||||
args = append(args, *filter.ToDate)
|
||||
}
|
||||
|
||||
// Count query
|
||||
var total int
|
||||
countQuery := "SELECT COUNT(*) " + baseQuery
|
||||
err := s.pool.QueryRow(ctx, countQuery, args...).Scan(&total)
|
||||
if err != nil {
|
||||
return nil, 0, fmt.Errorf("failed to count blocked content: %w", err)
|
||||
}
|
||||
|
||||
// Data query
|
||||
dataQuery := `SELECT id, url, domain, block_reason, matched_rule_id, details, created_at ` + baseQuery +
|
||||
` ORDER BY created_at DESC`
|
||||
|
||||
if filter.Limit > 0 {
|
||||
argCount++
|
||||
dataQuery += fmt.Sprintf(" LIMIT $%d", argCount)
|
||||
args = append(args, filter.Limit)
|
||||
}
|
||||
if filter.Offset > 0 {
|
||||
argCount++
|
||||
dataQuery += fmt.Sprintf(" OFFSET $%d", argCount)
|
||||
args = append(args, filter.Offset)
|
||||
}
|
||||
|
||||
rows, err := s.pool.Query(ctx, dataQuery, args...)
|
||||
if err != nil {
|
||||
return nil, 0, fmt.Errorf("failed to list blocked content: %w", err)
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
logs := []BlockedContentLog{}
|
||||
for rows.Next() {
|
||||
var l BlockedContentLog
|
||||
err := rows.Scan(
|
||||
&l.ID, &l.URL, &l.Domain, &l.BlockReason,
|
||||
&l.MatchedRuleID, &l.Details, &l.CreatedAt,
|
||||
)
|
||||
if err != nil {
|
||||
return nil, 0, fmt.Errorf("failed to scan blocked content: %w", err)
|
||||
}
|
||||
logs = append(logs, l)
|
||||
}
|
||||
|
||||
return logs, total, nil
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// STATISTICS
|
||||
// =============================================================================
|
||||
|
||||
// GetStats retrieves aggregated statistics for the policy system.
|
||||
func (s *Store) GetStats(ctx context.Context) (*PolicyStats, error) {
|
||||
stats := &PolicyStats{
|
||||
SourcesByLicense: make(map[string]int),
|
||||
BlocksByReason: make(map[string]int),
|
||||
}
|
||||
|
||||
// Active policies
|
||||
err := s.pool.QueryRow(ctx, `SELECT COUNT(*) FROM source_policies WHERE is_active = true`).Scan(&stats.ActivePolicies)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to count active policies: %w", err)
|
||||
}
|
||||
|
||||
// Total sources
|
||||
err = s.pool.QueryRow(ctx, `SELECT COUNT(*) FROM allowed_sources`).Scan(&stats.TotalSources)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to count total sources: %w", err)
|
||||
}
|
||||
|
||||
// Active sources
|
||||
err = s.pool.QueryRow(ctx, `SELECT COUNT(*) FROM allowed_sources WHERE is_active = true`).Scan(&stats.ActiveSources)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to count active sources: %w", err)
|
||||
}
|
||||
|
||||
// Blocked today
|
||||
err = s.pool.QueryRow(ctx, `SELECT COUNT(*) FROM blocked_content_log WHERE created_at >= CURRENT_DATE`).Scan(&stats.BlockedToday)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to count blocked today: %w", err)
|
||||
}
|
||||
|
||||
// Blocked total
|
||||
err = s.pool.QueryRow(ctx, `SELECT COUNT(*) FROM blocked_content_log`).Scan(&stats.BlockedTotal)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to count blocked total: %w", err)
|
||||
}
|
||||
|
||||
// Active PII rules
|
||||
err = s.pool.QueryRow(ctx, `SELECT COUNT(*) FROM pii_rules WHERE is_active = true`).Scan(&stats.PIIRulesActive)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to count active PII rules: %w", err)
|
||||
}
|
||||
|
||||
// Sources by license
|
||||
rows, err := s.pool.Query(ctx, `SELECT license, COUNT(*) FROM allowed_sources GROUP BY license`)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to count sources by license: %w", err)
|
||||
}
|
||||
defer rows.Close()
|
||||
for rows.Next() {
|
||||
var license string
|
||||
var count int
|
||||
if err := rows.Scan(&license, &count); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stats.SourcesByLicense[license] = count
|
||||
}
|
||||
|
||||
// Blocks by reason
|
||||
rows, err = s.pool.Query(ctx, `SELECT block_reason, COUNT(*) FROM blocked_content_log GROUP BY block_reason`)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to count blocks by reason: %w", err)
|
||||
}
|
||||
defer rows.Close()
|
||||
for rows.Next() {
|
||||
var reason string
|
||||
var count int
|
||||
if err := rows.Scan(&reason, &count); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stats.BlocksByReason[reason] = count
|
||||
}
|
||||
|
||||
// Compliance score (simplified: active sources / total sources)
|
||||
if stats.TotalSources > 0 {
|
||||
stats.ComplianceScore = float64(stats.ActiveSources) / float64(stats.TotalSources) * 100
|
||||
}
|
||||
|
||||
return stats, nil
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// YAML LOADER
|
||||
// =============================================================================
|
||||
|
||||
// LoadFromYAML loads initial policy data from YAML configuration.
|
||||
func (s *Store) LoadFromYAML(ctx context.Context, config *BundeslaenderConfig) error {
|
||||
// Load federal policy
|
||||
if config.Federal.Name != "" {
|
||||
err := s.loadPolicy(ctx, nil, &config.Federal, &config.DefaultOperations)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to load federal policy: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Load Bundesland policies
|
||||
for code, policyConfig := range config.Bundeslaender {
|
||||
if code == "federal" || code == "default_operations" || code == "pii_rules" {
|
||||
continue
|
||||
}
|
||||
bl := Bundesland(code)
|
||||
err := s.loadPolicy(ctx, &bl, &policyConfig, &config.DefaultOperations)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to load policy for %s: %w", code, err)
|
||||
}
|
||||
}
|
||||
|
||||
// Load PII rules
|
||||
for _, ruleConfig := range config.PIIRules {
|
||||
err := s.loadPIIRule(ctx, &ruleConfig)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to load PII rule %s: %w", ruleConfig.Name, err)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *Store) loadPolicy(ctx context.Context, bundesland *Bundesland, config *PolicyConfig, ops *OperationsConfig) error {
|
||||
// Create policy
|
||||
policy, err := s.CreatePolicy(ctx, &CreateSourcePolicyRequest{
|
||||
Name: config.Name,
|
||||
Bundesland: bundesland,
|
||||
})
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Create sources
|
||||
for _, srcConfig := range config.Sources {
|
||||
trustBoost := 0.5
|
||||
if srcConfig.TrustBoost > 0 {
|
||||
trustBoost = srcConfig.TrustBoost
|
||||
}
|
||||
|
||||
var legalBasis, citation *string
|
||||
if srcConfig.LegalBasis != "" {
|
||||
legalBasis = &srcConfig.LegalBasis
|
||||
}
|
||||
if srcConfig.CitationTemplate != "" {
|
||||
citation = &srcConfig.CitationTemplate
|
||||
}
|
||||
|
||||
_, err := s.CreateSource(ctx, &CreateAllowedSourceRequest{
|
||||
PolicyID: policy.ID,
|
||||
Domain: srcConfig.Domain,
|
||||
Name: srcConfig.Name,
|
||||
License: License(srcConfig.License),
|
||||
LegalBasis: legalBasis,
|
||||
CitationTemplate: citation,
|
||||
TrustBoost: &trustBoost,
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create source %s: %w", srcConfig.Domain, err)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *Store) loadPIIRule(ctx context.Context, config *PIIRuleConfig) error {
|
||||
severity := PIISeverityBlock
|
||||
if config.Severity != "" {
|
||||
severity = PIISeverity(config.Severity)
|
||||
}
|
||||
|
||||
_, err := s.CreatePIIRule(ctx, &CreatePIIRuleRequest{
|
||||
Name: config.Name,
|
||||
RuleType: PIIRuleType(config.Type),
|
||||
Pattern: config.Pattern,
|
||||
Severity: severity,
|
||||
})
|
||||
|
||||
return err
|
||||
}
|
||||
|
||||
// ToJSON converts an entity to JSON for audit logging.
|
||||
func ToJSON(v interface{}) json.RawMessage {
|
||||
data, _ := json.Marshal(v)
|
||||
return data
|
||||
}
|
||||
417
edu-search-service/internal/policy/store_sources.go
Normal file
417
edu-search-service/internal/policy/store_sources.go
Normal file
@@ -0,0 +1,417 @@
|
||||
package policy
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/google/uuid"
|
||||
"github.com/jackc/pgx/v5"
|
||||
)
|
||||
|
||||
// =============================================================================
|
||||
// ALLOWED SOURCES
|
||||
// =============================================================================
|
||||
|
||||
// CreateSource creates a new allowed source.
|
||||
func (s *Store) CreateSource(ctx context.Context, req *CreateAllowedSourceRequest) (*AllowedSource, error) {
|
||||
trustBoost := 0.5
|
||||
if req.TrustBoost != nil {
|
||||
trustBoost = *req.TrustBoost
|
||||
}
|
||||
|
||||
source := &AllowedSource{
|
||||
ID: uuid.New(),
|
||||
PolicyID: req.PolicyID,
|
||||
Domain: req.Domain,
|
||||
Name: req.Name,
|
||||
Description: req.Description,
|
||||
License: req.License,
|
||||
LegalBasis: req.LegalBasis,
|
||||
CitationTemplate: req.CitationTemplate,
|
||||
TrustBoost: trustBoost,
|
||||
IsActive: true,
|
||||
CreatedAt: time.Now(),
|
||||
UpdatedAt: time.Now(),
|
||||
}
|
||||
|
||||
query := `
|
||||
INSERT INTO allowed_sources (id, policy_id, domain, name, description, license,
|
||||
legal_basis, citation_template, trust_boost, is_active,
|
||||
created_at, updated_at)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12)
|
||||
RETURNING id`
|
||||
|
||||
err := s.pool.QueryRow(ctx, query,
|
||||
source.ID, source.PolicyID, source.Domain, source.Name, source.Description,
|
||||
source.License, source.LegalBasis, source.CitationTemplate, source.TrustBoost,
|
||||
source.IsActive, source.CreatedAt, source.UpdatedAt,
|
||||
).Scan(&source.ID)
|
||||
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create source: %w", err)
|
||||
}
|
||||
|
||||
// Create default operation permissions
|
||||
err = s.createDefaultOperations(ctx, source.ID)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create default operations: %w", err)
|
||||
}
|
||||
|
||||
return source, nil
|
||||
}
|
||||
|
||||
// createDefaultOperations creates default operation permissions for a source.
|
||||
func (s *Store) createDefaultOperations(ctx context.Context, sourceID uuid.UUID) error {
|
||||
defaults := []struct {
|
||||
op Operation
|
||||
allowed bool
|
||||
citation bool
|
||||
}{
|
||||
{OperationLookup, true, true},
|
||||
{OperationRAG, true, true},
|
||||
{OperationTraining, false, false}, // VERBOTEN by default
|
||||
{OperationExport, true, true},
|
||||
}
|
||||
|
||||
for _, d := range defaults {
|
||||
query := `
|
||||
INSERT INTO operation_permissions (id, source_id, operation, is_allowed, requires_citation, created_at, updated_at)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7)`
|
||||
_, err := s.pool.Exec(ctx, query, uuid.New(), sourceID, d.op, d.allowed, d.citation, time.Now(), time.Now())
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// GetSource retrieves a source by ID.
|
||||
func (s *Store) GetSource(ctx context.Context, id uuid.UUID) (*AllowedSource, error) {
|
||||
query := `
|
||||
SELECT als.id, als.policy_id, als.domain, als.name, als.description, als.license,
|
||||
als.legal_basis, als.citation_template, als.trust_boost, als.is_active,
|
||||
als.created_at, als.updated_at, sp.name as policy_name
|
||||
FROM allowed_sources als
|
||||
JOIN source_policies sp ON als.policy_id = sp.id
|
||||
WHERE als.id = $1`
|
||||
|
||||
source := &AllowedSource{}
|
||||
err := s.pool.QueryRow(ctx, query, id).Scan(
|
||||
&source.ID, &source.PolicyID, &source.Domain, &source.Name, &source.Description,
|
||||
&source.License, &source.LegalBasis, &source.CitationTemplate, &source.TrustBoost,
|
||||
&source.IsActive, &source.CreatedAt, &source.UpdatedAt, &source.PolicyName,
|
||||
)
|
||||
|
||||
if err == pgx.ErrNoRows {
|
||||
return nil, nil
|
||||
}
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get source: %w", err)
|
||||
}
|
||||
|
||||
// Load operations
|
||||
ops, err := s.GetOperationsBySourceID(ctx, source.ID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
source.Operations = ops
|
||||
|
||||
return source, nil
|
||||
}
|
||||
|
||||
// GetSourceByDomain retrieves a source by domain with optional bundesland filter.
|
||||
func (s *Store) GetSourceByDomain(ctx context.Context, domain string, bundesland *Bundesland) (*AllowedSource, error) {
|
||||
query := `
|
||||
SELECT als.id, als.policy_id, als.domain, als.name, als.description, als.license,
|
||||
als.legal_basis, als.citation_template, als.trust_boost, als.is_active,
|
||||
als.created_at, als.updated_at
|
||||
FROM allowed_sources als
|
||||
JOIN source_policies sp ON als.policy_id = sp.id
|
||||
WHERE als.is_active = true
|
||||
AND sp.is_active = true
|
||||
AND (als.domain = $1 OR $1 LIKE '%.' || als.domain)
|
||||
AND (sp.bundesland IS NULL OR sp.bundesland = $2)
|
||||
LIMIT 1`
|
||||
|
||||
source := &AllowedSource{}
|
||||
err := s.pool.QueryRow(ctx, query, domain, bundesland).Scan(
|
||||
&source.ID, &source.PolicyID, &source.Domain, &source.Name, &source.Description,
|
||||
&source.License, &source.LegalBasis, &source.CitationTemplate, &source.TrustBoost,
|
||||
&source.IsActive, &source.CreatedAt, &source.UpdatedAt,
|
||||
)
|
||||
|
||||
if err == pgx.ErrNoRows {
|
||||
return nil, nil
|
||||
}
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get source by domain: %w", err)
|
||||
}
|
||||
|
||||
// Load operations
|
||||
ops, err := s.GetOperationsBySourceID(ctx, source.ID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
source.Operations = ops
|
||||
|
||||
return source, nil
|
||||
}
|
||||
|
||||
// ListSources retrieves sources with optional filters.
|
||||
func (s *Store) ListSources(ctx context.Context, filter *SourceListFilter) ([]AllowedSource, int, error) {
|
||||
baseQuery := `FROM allowed_sources als JOIN source_policies sp ON als.policy_id = sp.id WHERE 1=1`
|
||||
args := []interface{}{}
|
||||
argCount := 0
|
||||
|
||||
if filter.PolicyID != nil {
|
||||
argCount++
|
||||
baseQuery += fmt.Sprintf(" AND als.policy_id = $%d", argCount)
|
||||
args = append(args, *filter.PolicyID)
|
||||
}
|
||||
|
||||
if filter.Domain != nil {
|
||||
argCount++
|
||||
baseQuery += fmt.Sprintf(" AND als.domain ILIKE $%d", argCount)
|
||||
args = append(args, "%"+*filter.Domain+"%")
|
||||
}
|
||||
|
||||
if filter.License != nil {
|
||||
argCount++
|
||||
baseQuery += fmt.Sprintf(" AND als.license = $%d", argCount)
|
||||
args = append(args, *filter.License)
|
||||
}
|
||||
|
||||
if filter.IsActive != nil {
|
||||
argCount++
|
||||
baseQuery += fmt.Sprintf(" AND als.is_active = $%d", argCount)
|
||||
args = append(args, *filter.IsActive)
|
||||
}
|
||||
|
||||
// Count query
|
||||
var total int
|
||||
countQuery := "SELECT COUNT(*) " + baseQuery
|
||||
err := s.pool.QueryRow(ctx, countQuery, args...).Scan(&total)
|
||||
if err != nil {
|
||||
return nil, 0, fmt.Errorf("failed to count sources: %w", err)
|
||||
}
|
||||
|
||||
// Data query
|
||||
dataQuery := `SELECT als.id, als.policy_id, als.domain, als.name, als.description, als.license,
|
||||
als.legal_basis, als.citation_template, als.trust_boost, als.is_active,
|
||||
als.created_at, als.updated_at, sp.name as policy_name ` + baseQuery +
|
||||
` ORDER BY als.created_at DESC`
|
||||
|
||||
if filter.Limit > 0 {
|
||||
argCount++
|
||||
dataQuery += fmt.Sprintf(" LIMIT $%d", argCount)
|
||||
args = append(args, filter.Limit)
|
||||
}
|
||||
if filter.Offset > 0 {
|
||||
argCount++
|
||||
dataQuery += fmt.Sprintf(" OFFSET $%d", argCount)
|
||||
args = append(args, filter.Offset)
|
||||
}
|
||||
|
||||
rows, err := s.pool.Query(ctx, dataQuery, args...)
|
||||
if err != nil {
|
||||
return nil, 0, fmt.Errorf("failed to list sources: %w", err)
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
sources := []AllowedSource{}
|
||||
for rows.Next() {
|
||||
var src AllowedSource
|
||||
err := rows.Scan(
|
||||
&src.ID, &src.PolicyID, &src.Domain, &src.Name, &src.Description,
|
||||
&src.License, &src.LegalBasis, &src.CitationTemplate, &src.TrustBoost,
|
||||
&src.IsActive, &src.CreatedAt, &src.UpdatedAt, &src.PolicyName,
|
||||
)
|
||||
if err != nil {
|
||||
return nil, 0, fmt.Errorf("failed to scan source: %w", err)
|
||||
}
|
||||
sources = append(sources, src)
|
||||
}
|
||||
|
||||
return sources, total, nil
|
||||
}
|
||||
|
||||
// UpdateSource updates an existing source.
|
||||
func (s *Store) UpdateSource(ctx context.Context, id uuid.UUID, req *UpdateAllowedSourceRequest) (*AllowedSource, error) {
|
||||
source, err := s.GetSource(ctx, id)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if source == nil {
|
||||
return nil, fmt.Errorf("source not found")
|
||||
}
|
||||
|
||||
if req.Domain != nil {
|
||||
source.Domain = *req.Domain
|
||||
}
|
||||
if req.Name != nil {
|
||||
source.Name = *req.Name
|
||||
}
|
||||
if req.Description != nil {
|
||||
source.Description = req.Description
|
||||
}
|
||||
if req.License != nil {
|
||||
source.License = *req.License
|
||||
}
|
||||
if req.LegalBasis != nil {
|
||||
source.LegalBasis = req.LegalBasis
|
||||
}
|
||||
if req.CitationTemplate != nil {
|
||||
source.CitationTemplate = req.CitationTemplate
|
||||
}
|
||||
if req.TrustBoost != nil {
|
||||
source.TrustBoost = *req.TrustBoost
|
||||
}
|
||||
if req.IsActive != nil {
|
||||
source.IsActive = *req.IsActive
|
||||
}
|
||||
source.UpdatedAt = time.Now()
|
||||
|
||||
query := `
|
||||
UPDATE allowed_sources
|
||||
SET domain = $2, name = $3, description = $4, license = $5, legal_basis = $6,
|
||||
citation_template = $7, trust_boost = $8, is_active = $9, updated_at = $10
|
||||
WHERE id = $1`
|
||||
|
||||
_, err = s.pool.Exec(ctx, query,
|
||||
id, source.Domain, source.Name, source.Description, source.License,
|
||||
source.LegalBasis, source.CitationTemplate, source.TrustBoost,
|
||||
source.IsActive, source.UpdatedAt,
|
||||
)
|
||||
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to update source: %w", err)
|
||||
}
|
||||
|
||||
return source, nil
|
||||
}
|
||||
|
||||
// DeleteSource deletes a source by ID.
|
||||
func (s *Store) DeleteSource(ctx context.Context, id uuid.UUID) error {
|
||||
query := `DELETE FROM allowed_sources WHERE id = $1`
|
||||
_, err := s.pool.Exec(ctx, query, id)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to delete source: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// OPERATION PERMISSIONS
|
||||
// =============================================================================
|
||||
|
||||
// GetOperationsBySourceID retrieves all operation permissions for a source.
|
||||
func (s *Store) GetOperationsBySourceID(ctx context.Context, sourceID uuid.UUID) ([]OperationPermission, error) {
|
||||
query := `
|
||||
SELECT id, source_id, operation, is_allowed, requires_citation, notes, created_at, updated_at
|
||||
FROM operation_permissions
|
||||
WHERE source_id = $1
|
||||
ORDER BY operation`
|
||||
|
||||
rows, err := s.pool.Query(ctx, query, sourceID)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get operations: %w", err)
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
ops := []OperationPermission{}
|
||||
for rows.Next() {
|
||||
var op OperationPermission
|
||||
err := rows.Scan(
|
||||
&op.ID, &op.SourceID, &op.Operation, &op.IsAllowed,
|
||||
&op.RequiresCitation, &op.Notes, &op.CreatedAt, &op.UpdatedAt,
|
||||
)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to scan operation: %w", err)
|
||||
}
|
||||
ops = append(ops, op)
|
||||
}
|
||||
|
||||
return ops, nil
|
||||
}
|
||||
|
||||
// UpdateOperationPermission updates an operation permission.
|
||||
func (s *Store) UpdateOperationPermission(ctx context.Context, id uuid.UUID, req *UpdateOperationPermissionRequest) (*OperationPermission, error) {
|
||||
query := `SELECT id, source_id, operation, is_allowed, requires_citation, notes, created_at, updated_at
|
||||
FROM operation_permissions WHERE id = $1`
|
||||
|
||||
op := &OperationPermission{}
|
||||
err := s.pool.QueryRow(ctx, query, id).Scan(
|
||||
&op.ID, &op.SourceID, &op.Operation, &op.IsAllowed,
|
||||
&op.RequiresCitation, &op.Notes, &op.CreatedAt, &op.UpdatedAt,
|
||||
)
|
||||
if err == pgx.ErrNoRows {
|
||||
return nil, fmt.Errorf("operation permission not found")
|
||||
}
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get operation: %w", err)
|
||||
}
|
||||
|
||||
if req.IsAllowed != nil {
|
||||
op.IsAllowed = *req.IsAllowed
|
||||
}
|
||||
if req.RequiresCitation != nil {
|
||||
op.RequiresCitation = *req.RequiresCitation
|
||||
}
|
||||
if req.Notes != nil {
|
||||
op.Notes = req.Notes
|
||||
}
|
||||
op.UpdatedAt = time.Now()
|
||||
|
||||
updateQuery := `
|
||||
UPDATE operation_permissions
|
||||
SET is_allowed = $2, requires_citation = $3, notes = $4, updated_at = $5
|
||||
WHERE id = $1`
|
||||
|
||||
_, err = s.pool.Exec(ctx, updateQuery, id, op.IsAllowed, op.RequiresCitation, op.Notes, op.UpdatedAt)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to update operation: %w", err)
|
||||
}
|
||||
|
||||
return op, nil
|
||||
}
|
||||
|
||||
// GetOperationsMatrix retrieves all operation permissions grouped by source.
|
||||
func (s *Store) GetOperationsMatrix(ctx context.Context) ([]AllowedSource, error) {
|
||||
query := `
|
||||
SELECT als.id, als.domain, als.name, als.license, als.is_active,
|
||||
sp.name as policy_name, sp.bundesland
|
||||
FROM allowed_sources als
|
||||
JOIN source_policies sp ON als.policy_id = sp.id
|
||||
WHERE als.is_active = true AND sp.is_active = true
|
||||
ORDER BY sp.bundesland NULLS FIRST, als.name`
|
||||
|
||||
rows, err := s.pool.Query(ctx, query)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get operations matrix: %w", err)
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
sources := []AllowedSource{}
|
||||
for rows.Next() {
|
||||
var src AllowedSource
|
||||
var bundesland *Bundesland
|
||||
err := rows.Scan(
|
||||
&src.ID, &src.Domain, &src.Name, &src.License, &src.IsActive,
|
||||
&src.PolicyName, &bundesland,
|
||||
)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to scan source: %w", err)
|
||||
}
|
||||
|
||||
// Load operations for each source
|
||||
ops, err := s.GetOperationsBySourceID(ctx, src.ID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
src.Operations = ops
|
||||
sources = append(sources, src)
|
||||
}
|
||||
|
||||
return sources, nil
|
||||
}
|
||||
@@ -214,355 +214,6 @@ func (s *Service) Search(ctx context.Context, req *SearchRequest) (*SearchRespon
|
||||
}, nil
|
||||
}
|
||||
|
||||
// buildQuery constructs the OpenSearch query
|
||||
func (s *Service) buildQuery(req *SearchRequest) map[string]interface{} {
|
||||
// Main query
|
||||
must := []map[string]interface{}{}
|
||||
filter := []map[string]interface{}{}
|
||||
|
||||
// Text search
|
||||
if req.Query != "" {
|
||||
must = append(must, map[string]interface{}{
|
||||
"multi_match": map[string]interface{}{
|
||||
"query": req.Query,
|
||||
"fields": []string{"title^3", "content_text"},
|
||||
"type": "best_fields",
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
// Filters
|
||||
if len(req.Filters.Language) > 0 {
|
||||
filter = append(filter, map[string]interface{}{
|
||||
"terms": map[string]interface{}{"language": req.Filters.Language},
|
||||
})
|
||||
}
|
||||
|
||||
if len(req.Filters.CountryHint) > 0 {
|
||||
filter = append(filter, map[string]interface{}{
|
||||
"terms": map[string]interface{}{"country_hint": req.Filters.CountryHint},
|
||||
})
|
||||
}
|
||||
|
||||
if len(req.Filters.SourceCategory) > 0 {
|
||||
filter = append(filter, map[string]interface{}{
|
||||
"terms": map[string]interface{}{"source_category": req.Filters.SourceCategory},
|
||||
})
|
||||
}
|
||||
|
||||
if len(req.Filters.DocType) > 0 {
|
||||
filter = append(filter, map[string]interface{}{
|
||||
"terms": map[string]interface{}{"doc_type": req.Filters.DocType},
|
||||
})
|
||||
}
|
||||
|
||||
if len(req.Filters.SchoolLevel) > 0 {
|
||||
filter = append(filter, map[string]interface{}{
|
||||
"terms": map[string]interface{}{"school_level": req.Filters.SchoolLevel},
|
||||
})
|
||||
}
|
||||
|
||||
if len(req.Filters.Subjects) > 0 {
|
||||
filter = append(filter, map[string]interface{}{
|
||||
"terms": map[string]interface{}{"subjects": req.Filters.Subjects},
|
||||
})
|
||||
}
|
||||
|
||||
if len(req.Filters.State) > 0 {
|
||||
filter = append(filter, map[string]interface{}{
|
||||
"terms": map[string]interface{}{"state": req.Filters.State},
|
||||
})
|
||||
}
|
||||
|
||||
if req.Filters.MinTrustScore > 0 {
|
||||
filter = append(filter, map[string]interface{}{
|
||||
"range": map[string]interface{}{
|
||||
"trust_score": map[string]interface{}{"gte": req.Filters.MinTrustScore},
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
if req.Filters.DateFrom != "" {
|
||||
filter = append(filter, map[string]interface{}{
|
||||
"range": map[string]interface{}{
|
||||
"fetch_time": map[string]interface{}{"gte": req.Filters.DateFrom},
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
// Build bool query
|
||||
boolQuery := map[string]interface{}{}
|
||||
if len(must) > 0 {
|
||||
boolQuery["must"] = must
|
||||
}
|
||||
if len(filter) > 0 {
|
||||
boolQuery["filter"] = filter
|
||||
}
|
||||
|
||||
// Construct full query
|
||||
query := map[string]interface{}{
|
||||
"query": map[string]interface{}{
|
||||
"bool": boolQuery,
|
||||
},
|
||||
"from": req.Offset,
|
||||
"size": req.Limit,
|
||||
"_source": []string{
|
||||
"doc_id", "title", "url", "domain", "language",
|
||||
"doc_type", "school_level", "subjects",
|
||||
"trust_score", "quality_score", "snippet_text",
|
||||
},
|
||||
}
|
||||
|
||||
// Add highlighting if requested
|
||||
if req.Include.Highlights {
|
||||
query["highlight"] = map[string]interface{}{
|
||||
"fields": map[string]interface{}{
|
||||
"title": map[string]interface{}{},
|
||||
"content_text": map[string]interface{}{"fragment_size": 150, "number_of_fragments": 3},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// Add function score for trust/quality boosting
|
||||
query["query"] = map[string]interface{}{
|
||||
"function_score": map[string]interface{}{
|
||||
"query": query["query"],
|
||||
"functions": []map[string]interface{}{
|
||||
{
|
||||
"field_value_factor": map[string]interface{}{
|
||||
"field": "trust_score",
|
||||
"factor": 1.5,
|
||||
"modifier": "sqrt",
|
||||
"missing": 0.5,
|
||||
},
|
||||
},
|
||||
{
|
||||
"field_value_factor": map[string]interface{}{
|
||||
"field": "quality_score",
|
||||
"factor": 1.0,
|
||||
"modifier": "sqrt",
|
||||
"missing": 0.5,
|
||||
},
|
||||
},
|
||||
},
|
||||
"score_mode": "multiply",
|
||||
"boost_mode": "multiply",
|
||||
},
|
||||
}
|
||||
|
||||
return query
|
||||
}
|
||||
|
||||
// buildSemanticQuery constructs a pure vector search query using k-NN
|
||||
func (s *Service) buildSemanticQuery(req *SearchRequest, embedding []float32) map[string]interface{} {
|
||||
filter := s.buildFilters(req)
|
||||
|
||||
// k-NN query for semantic search
|
||||
knnQuery := map[string]interface{}{
|
||||
"content_embedding": map[string]interface{}{
|
||||
"vector": embedding,
|
||||
"k": req.Limit + req.Offset, // Get enough results for pagination
|
||||
},
|
||||
}
|
||||
|
||||
// Add filter if present
|
||||
if len(filter) > 0 {
|
||||
knnQuery["content_embedding"].(map[string]interface{})["filter"] = map[string]interface{}{
|
||||
"bool": map[string]interface{}{
|
||||
"filter": filter,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
query := map[string]interface{}{
|
||||
"knn": knnQuery,
|
||||
"from": req.Offset,
|
||||
"size": req.Limit,
|
||||
"_source": []string{
|
||||
"doc_id", "title", "url", "domain", "language",
|
||||
"doc_type", "school_level", "subjects",
|
||||
"trust_score", "quality_score", "snippet_text",
|
||||
},
|
||||
}
|
||||
|
||||
// Add highlighting if requested
|
||||
if req.Include.Highlights {
|
||||
query["highlight"] = map[string]interface{}{
|
||||
"fields": map[string]interface{}{
|
||||
"title": map[string]interface{}{},
|
||||
"content_text": map[string]interface{}{"fragment_size": 150, "number_of_fragments": 3},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
return query
|
||||
}
|
||||
|
||||
// buildHybridQuery constructs a combined BM25 + vector search query
|
||||
func (s *Service) buildHybridQuery(req *SearchRequest, embedding []float32) map[string]interface{} {
|
||||
filter := s.buildFilters(req)
|
||||
|
||||
// Build the bool query for BM25
|
||||
must := []map[string]interface{}{}
|
||||
if req.Query != "" {
|
||||
must = append(must, map[string]interface{}{
|
||||
"multi_match": map[string]interface{}{
|
||||
"query": req.Query,
|
||||
"fields": []string{"title^3", "content_text"},
|
||||
"type": "best_fields",
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
boolQuery := map[string]interface{}{}
|
||||
if len(must) > 0 {
|
||||
boolQuery["must"] = must
|
||||
}
|
||||
if len(filter) > 0 {
|
||||
boolQuery["filter"] = filter
|
||||
}
|
||||
|
||||
// Convert embedding to []interface{} for JSON
|
||||
embeddingInterface := make([]interface{}, len(embedding))
|
||||
for i, v := range embedding {
|
||||
embeddingInterface[i] = v
|
||||
}
|
||||
|
||||
// Hybrid query using script_score to combine BM25 and cosine similarity
|
||||
// This is a simpler approach than OpenSearch's neural search plugin
|
||||
query := map[string]interface{}{
|
||||
"query": map[string]interface{}{
|
||||
"script_score": map[string]interface{}{
|
||||
"query": map[string]interface{}{
|
||||
"bool": boolQuery,
|
||||
},
|
||||
"script": map[string]interface{}{
|
||||
"source": "cosineSimilarity(params.query_vector, 'content_embedding') + 1.0 + _score * 0.5",
|
||||
"params": map[string]interface{}{
|
||||
"query_vector": embeddingInterface,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
"from": req.Offset,
|
||||
"size": req.Limit,
|
||||
"_source": []string{
|
||||
"doc_id", "title", "url", "domain", "language",
|
||||
"doc_type", "school_level", "subjects",
|
||||
"trust_score", "quality_score", "snippet_text",
|
||||
},
|
||||
}
|
||||
|
||||
// Add highlighting if requested
|
||||
if req.Include.Highlights {
|
||||
query["highlight"] = map[string]interface{}{
|
||||
"fields": map[string]interface{}{
|
||||
"title": map[string]interface{}{},
|
||||
"content_text": map[string]interface{}{"fragment_size": 150, "number_of_fragments": 3},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
return query
|
||||
}
|
||||
|
||||
// buildFilters constructs the filter array for queries
|
||||
func (s *Service) buildFilters(req *SearchRequest) []map[string]interface{} {
|
||||
filter := []map[string]interface{}{}
|
||||
|
||||
if len(req.Filters.Language) > 0 {
|
||||
filter = append(filter, map[string]interface{}{
|
||||
"terms": map[string]interface{}{"language": req.Filters.Language},
|
||||
})
|
||||
}
|
||||
|
||||
if len(req.Filters.CountryHint) > 0 {
|
||||
filter = append(filter, map[string]interface{}{
|
||||
"terms": map[string]interface{}{"country_hint": req.Filters.CountryHint},
|
||||
})
|
||||
}
|
||||
|
||||
if len(req.Filters.SourceCategory) > 0 {
|
||||
filter = append(filter, map[string]interface{}{
|
||||
"terms": map[string]interface{}{"source_category": req.Filters.SourceCategory},
|
||||
})
|
||||
}
|
||||
|
||||
if len(req.Filters.DocType) > 0 {
|
||||
filter = append(filter, map[string]interface{}{
|
||||
"terms": map[string]interface{}{"doc_type": req.Filters.DocType},
|
||||
})
|
||||
}
|
||||
|
||||
if len(req.Filters.SchoolLevel) > 0 {
|
||||
filter = append(filter, map[string]interface{}{
|
||||
"terms": map[string]interface{}{"school_level": req.Filters.SchoolLevel},
|
||||
})
|
||||
}
|
||||
|
||||
if len(req.Filters.Subjects) > 0 {
|
||||
filter = append(filter, map[string]interface{}{
|
||||
"terms": map[string]interface{}{"subjects": req.Filters.Subjects},
|
||||
})
|
||||
}
|
||||
|
||||
if len(req.Filters.State) > 0 {
|
||||
filter = append(filter, map[string]interface{}{
|
||||
"terms": map[string]interface{}{"state": req.Filters.State},
|
||||
})
|
||||
}
|
||||
|
||||
if req.Filters.MinTrustScore > 0 {
|
||||
filter = append(filter, map[string]interface{}{
|
||||
"range": map[string]interface{}{
|
||||
"trust_score": map[string]interface{}{"gte": req.Filters.MinTrustScore},
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
if req.Filters.DateFrom != "" {
|
||||
filter = append(filter, map[string]interface{}{
|
||||
"range": map[string]interface{}{
|
||||
"fetch_time": map[string]interface{}{"gte": req.Filters.DateFrom},
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
return filter
|
||||
}
|
||||
|
||||
// hitToResult converts an OpenSearch hit to SearchResult
|
||||
func (s *Service) hitToResult(source map[string]interface{}, score float64, highlight map[string][]string, include SearchInclude) SearchResult {
|
||||
result := SearchResult{
|
||||
DocID: getString(source, "doc_id"),
|
||||
Title: getString(source, "title"),
|
||||
URL: getString(source, "url"),
|
||||
Domain: getString(source, "domain"),
|
||||
Language: getString(source, "language"),
|
||||
DocType: getString(source, "doc_type"),
|
||||
SchoolLevel: getString(source, "school_level"),
|
||||
Subjects: getStringArray(source, "subjects"),
|
||||
Scores: Scores{
|
||||
BM25: score,
|
||||
Trust: getFloat(source, "trust_score"),
|
||||
Quality: getFloat(source, "quality_score"),
|
||||
Final: score, // MVP: final = BM25 * trust * quality (via function_score)
|
||||
},
|
||||
}
|
||||
|
||||
if include.Snippets {
|
||||
result.Snippet = getString(source, "snippet_text")
|
||||
}
|
||||
|
||||
if include.Highlights && highlight != nil {
|
||||
if h, ok := highlight["content_text"]; ok {
|
||||
result.Highlights = h
|
||||
}
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
// Helper functions
|
||||
func getString(m map[string]interface{}, key string) string {
|
||||
if v, ok := m[key].(string); ok {
|
||||
|
||||
350
edu-search-service/internal/search/search_query.go
Normal file
350
edu-search-service/internal/search/search_query.go
Normal file
@@ -0,0 +1,350 @@
|
||||
package search
|
||||
|
||||
// buildQuery constructs the OpenSearch query
|
||||
func (s *Service) buildQuery(req *SearchRequest) map[string]interface{} {
|
||||
// Main query
|
||||
must := []map[string]interface{}{}
|
||||
filter := []map[string]interface{}{}
|
||||
|
||||
// Text search
|
||||
if req.Query != "" {
|
||||
must = append(must, map[string]interface{}{
|
||||
"multi_match": map[string]interface{}{
|
||||
"query": req.Query,
|
||||
"fields": []string{"title^3", "content_text"},
|
||||
"type": "best_fields",
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
// Filters
|
||||
if len(req.Filters.Language) > 0 {
|
||||
filter = append(filter, map[string]interface{}{
|
||||
"terms": map[string]interface{}{"language": req.Filters.Language},
|
||||
})
|
||||
}
|
||||
|
||||
if len(req.Filters.CountryHint) > 0 {
|
||||
filter = append(filter, map[string]interface{}{
|
||||
"terms": map[string]interface{}{"country_hint": req.Filters.CountryHint},
|
||||
})
|
||||
}
|
||||
|
||||
if len(req.Filters.SourceCategory) > 0 {
|
||||
filter = append(filter, map[string]interface{}{
|
||||
"terms": map[string]interface{}{"source_category": req.Filters.SourceCategory},
|
||||
})
|
||||
}
|
||||
|
||||
if len(req.Filters.DocType) > 0 {
|
||||
filter = append(filter, map[string]interface{}{
|
||||
"terms": map[string]interface{}{"doc_type": req.Filters.DocType},
|
||||
})
|
||||
}
|
||||
|
||||
if len(req.Filters.SchoolLevel) > 0 {
|
||||
filter = append(filter, map[string]interface{}{
|
||||
"terms": map[string]interface{}{"school_level": req.Filters.SchoolLevel},
|
||||
})
|
||||
}
|
||||
|
||||
if len(req.Filters.Subjects) > 0 {
|
||||
filter = append(filter, map[string]interface{}{
|
||||
"terms": map[string]interface{}{"subjects": req.Filters.Subjects},
|
||||
})
|
||||
}
|
||||
|
||||
if len(req.Filters.State) > 0 {
|
||||
filter = append(filter, map[string]interface{}{
|
||||
"terms": map[string]interface{}{"state": req.Filters.State},
|
||||
})
|
||||
}
|
||||
|
||||
if req.Filters.MinTrustScore > 0 {
|
||||
filter = append(filter, map[string]interface{}{
|
||||
"range": map[string]interface{}{
|
||||
"trust_score": map[string]interface{}{"gte": req.Filters.MinTrustScore},
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
if req.Filters.DateFrom != "" {
|
||||
filter = append(filter, map[string]interface{}{
|
||||
"range": map[string]interface{}{
|
||||
"fetch_time": map[string]interface{}{"gte": req.Filters.DateFrom},
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
// Build bool query
|
||||
boolQuery := map[string]interface{}{}
|
||||
if len(must) > 0 {
|
||||
boolQuery["must"] = must
|
||||
}
|
||||
if len(filter) > 0 {
|
||||
boolQuery["filter"] = filter
|
||||
}
|
||||
|
||||
// Construct full query
|
||||
query := map[string]interface{}{
|
||||
"query": map[string]interface{}{
|
||||
"bool": boolQuery,
|
||||
},
|
||||
"from": req.Offset,
|
||||
"size": req.Limit,
|
||||
"_source": []string{
|
||||
"doc_id", "title", "url", "domain", "language",
|
||||
"doc_type", "school_level", "subjects",
|
||||
"trust_score", "quality_score", "snippet_text",
|
||||
},
|
||||
}
|
||||
|
||||
// Add highlighting if requested
|
||||
if req.Include.Highlights {
|
||||
query["highlight"] = map[string]interface{}{
|
||||
"fields": map[string]interface{}{
|
||||
"title": map[string]interface{}{},
|
||||
"content_text": map[string]interface{}{"fragment_size": 150, "number_of_fragments": 3},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// Add function score for trust/quality boosting
|
||||
query["query"] = map[string]interface{}{
|
||||
"function_score": map[string]interface{}{
|
||||
"query": query["query"],
|
||||
"functions": []map[string]interface{}{
|
||||
{
|
||||
"field_value_factor": map[string]interface{}{
|
||||
"field": "trust_score",
|
||||
"factor": 1.5,
|
||||
"modifier": "sqrt",
|
||||
"missing": 0.5,
|
||||
},
|
||||
},
|
||||
{
|
||||
"field_value_factor": map[string]interface{}{
|
||||
"field": "quality_score",
|
||||
"factor": 1.0,
|
||||
"modifier": "sqrt",
|
||||
"missing": 0.5,
|
||||
},
|
||||
},
|
||||
},
|
||||
"score_mode": "multiply",
|
||||
"boost_mode": "multiply",
|
||||
},
|
||||
}
|
||||
|
||||
return query
|
||||
}
|
||||
|
||||
// buildSemanticQuery constructs a pure vector search query using k-NN
|
||||
func (s *Service) buildSemanticQuery(req *SearchRequest, embedding []float32) map[string]interface{} {
|
||||
filter := s.buildFilters(req)
|
||||
|
||||
// k-NN query for semantic search
|
||||
knnQuery := map[string]interface{}{
|
||||
"content_embedding": map[string]interface{}{
|
||||
"vector": embedding,
|
||||
"k": req.Limit + req.Offset, // Get enough results for pagination
|
||||
},
|
||||
}
|
||||
|
||||
// Add filter if present
|
||||
if len(filter) > 0 {
|
||||
knnQuery["content_embedding"].(map[string]interface{})["filter"] = map[string]interface{}{
|
||||
"bool": map[string]interface{}{
|
||||
"filter": filter,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
query := map[string]interface{}{
|
||||
"knn": knnQuery,
|
||||
"from": req.Offset,
|
||||
"size": req.Limit,
|
||||
"_source": []string{
|
||||
"doc_id", "title", "url", "domain", "language",
|
||||
"doc_type", "school_level", "subjects",
|
||||
"trust_score", "quality_score", "snippet_text",
|
||||
},
|
||||
}
|
||||
|
||||
// Add highlighting if requested
|
||||
if req.Include.Highlights {
|
||||
query["highlight"] = map[string]interface{}{
|
||||
"fields": map[string]interface{}{
|
||||
"title": map[string]interface{}{},
|
||||
"content_text": map[string]interface{}{"fragment_size": 150, "number_of_fragments": 3},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
return query
|
||||
}
|
||||
|
||||
// buildHybridQuery constructs a combined BM25 + vector search query
|
||||
func (s *Service) buildHybridQuery(req *SearchRequest, embedding []float32) map[string]interface{} {
|
||||
filter := s.buildFilters(req)
|
||||
|
||||
// Build the bool query for BM25
|
||||
must := []map[string]interface{}{}
|
||||
if req.Query != "" {
|
||||
must = append(must, map[string]interface{}{
|
||||
"multi_match": map[string]interface{}{
|
||||
"query": req.Query,
|
||||
"fields": []string{"title^3", "content_text"},
|
||||
"type": "best_fields",
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
boolQuery := map[string]interface{}{}
|
||||
if len(must) > 0 {
|
||||
boolQuery["must"] = must
|
||||
}
|
||||
if len(filter) > 0 {
|
||||
boolQuery["filter"] = filter
|
||||
}
|
||||
|
||||
// Convert embedding to []interface{} for JSON
|
||||
embeddingInterface := make([]interface{}, len(embedding))
|
||||
for i, v := range embedding {
|
||||
embeddingInterface[i] = v
|
||||
}
|
||||
|
||||
// Hybrid query using script_score to combine BM25 and cosine similarity
|
||||
// This is a simpler approach than OpenSearch's neural search plugin
|
||||
query := map[string]interface{}{
|
||||
"query": map[string]interface{}{
|
||||
"script_score": map[string]interface{}{
|
||||
"query": map[string]interface{}{
|
||||
"bool": boolQuery,
|
||||
},
|
||||
"script": map[string]interface{}{
|
||||
"source": "cosineSimilarity(params.query_vector, 'content_embedding') + 1.0 + _score * 0.5",
|
||||
"params": map[string]interface{}{
|
||||
"query_vector": embeddingInterface,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
"from": req.Offset,
|
||||
"size": req.Limit,
|
||||
"_source": []string{
|
||||
"doc_id", "title", "url", "domain", "language",
|
||||
"doc_type", "school_level", "subjects",
|
||||
"trust_score", "quality_score", "snippet_text",
|
||||
},
|
||||
}
|
||||
|
||||
// Add highlighting if requested
|
||||
if req.Include.Highlights {
|
||||
query["highlight"] = map[string]interface{}{
|
||||
"fields": map[string]interface{}{
|
||||
"title": map[string]interface{}{},
|
||||
"content_text": map[string]interface{}{"fragment_size": 150, "number_of_fragments": 3},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
return query
|
||||
}
|
||||
|
||||
// buildFilters constructs the filter array for queries
|
||||
func (s *Service) buildFilters(req *SearchRequest) []map[string]interface{} {
|
||||
filter := []map[string]interface{}{}
|
||||
|
||||
if len(req.Filters.Language) > 0 {
|
||||
filter = append(filter, map[string]interface{}{
|
||||
"terms": map[string]interface{}{"language": req.Filters.Language},
|
||||
})
|
||||
}
|
||||
|
||||
if len(req.Filters.CountryHint) > 0 {
|
||||
filter = append(filter, map[string]interface{}{
|
||||
"terms": map[string]interface{}{"country_hint": req.Filters.CountryHint},
|
||||
})
|
||||
}
|
||||
|
||||
if len(req.Filters.SourceCategory) > 0 {
|
||||
filter = append(filter, map[string]interface{}{
|
||||
"terms": map[string]interface{}{"source_category": req.Filters.SourceCategory},
|
||||
})
|
||||
}
|
||||
|
||||
if len(req.Filters.DocType) > 0 {
|
||||
filter = append(filter, map[string]interface{}{
|
||||
"terms": map[string]interface{}{"doc_type": req.Filters.DocType},
|
||||
})
|
||||
}
|
||||
|
||||
if len(req.Filters.SchoolLevel) > 0 {
|
||||
filter = append(filter, map[string]interface{}{
|
||||
"terms": map[string]interface{}{"school_level": req.Filters.SchoolLevel},
|
||||
})
|
||||
}
|
||||
|
||||
if len(req.Filters.Subjects) > 0 {
|
||||
filter = append(filter, map[string]interface{}{
|
||||
"terms": map[string]interface{}{"subjects": req.Filters.Subjects},
|
||||
})
|
||||
}
|
||||
|
||||
if len(req.Filters.State) > 0 {
|
||||
filter = append(filter, map[string]interface{}{
|
||||
"terms": map[string]interface{}{"state": req.Filters.State},
|
||||
})
|
||||
}
|
||||
|
||||
if req.Filters.MinTrustScore > 0 {
|
||||
filter = append(filter, map[string]interface{}{
|
||||
"range": map[string]interface{}{
|
||||
"trust_score": map[string]interface{}{"gte": req.Filters.MinTrustScore},
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
if req.Filters.DateFrom != "" {
|
||||
filter = append(filter, map[string]interface{}{
|
||||
"range": map[string]interface{}{
|
||||
"fetch_time": map[string]interface{}{"gte": req.Filters.DateFrom},
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
return filter
|
||||
}
|
||||
|
||||
// hitToResult converts an OpenSearch hit to SearchResult
|
||||
func (s *Service) hitToResult(source map[string]interface{}, score float64, highlight map[string][]string, include SearchInclude) SearchResult {
|
||||
result := SearchResult{
|
||||
DocID: getString(source, "doc_id"),
|
||||
Title: getString(source, "title"),
|
||||
URL: getString(source, "url"),
|
||||
Domain: getString(source, "domain"),
|
||||
Language: getString(source, "language"),
|
||||
DocType: getString(source, "doc_type"),
|
||||
SchoolLevel: getString(source, "school_level"),
|
||||
Subjects: getStringArray(source, "subjects"),
|
||||
Scores: Scores{
|
||||
BM25: score,
|
||||
Trust: getFloat(source, "trust_score"),
|
||||
Quality: getFloat(source, "quality_score"),
|
||||
Final: score, // MVP: final = BM25 * trust * quality (via function_score)
|
||||
},
|
||||
}
|
||||
|
||||
if include.Snippets {
|
||||
result.Snippet = getString(source, "snippet_text")
|
||||
}
|
||||
|
||||
if include.Highlights && highlight != nil {
|
||||
if h, ok := highlight["content_text"]; ok {
|
||||
result.Highlights = h
|
||||
}
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
247
edu-search-service/internal/staff/staff_crawler_discovery.go
Normal file
247
edu-search-service/internal/staff/staff_crawler_discovery.go
Normal file
@@ -0,0 +1,247 @@
|
||||
package staff
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"log"
|
||||
"net/http"
|
||||
"strings"
|
||||
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
|
||||
"github.com/breakpilot/edu-search-service/internal/database"
|
||||
)
|
||||
|
||||
// findStaffPages discovers staff listing pages on a university website
|
||||
func (c *StaffCrawler) findStaffPages(ctx context.Context, uni *database.University) ([]string, error) {
|
||||
var pages []string
|
||||
|
||||
// Use custom pattern if available
|
||||
if uni.StaffPagePattern != nil && *uni.StaffPagePattern != "" {
|
||||
pages = append(pages, *uni.StaffPagePattern)
|
||||
return pages, nil
|
||||
}
|
||||
|
||||
// Try common patterns
|
||||
baseURL := strings.TrimSuffix(uni.URL, "/")
|
||||
commonPaths := []string{
|
||||
"/personen",
|
||||
"/team",
|
||||
"/mitarbeiter",
|
||||
"/mitarbeitende",
|
||||
"/staff",
|
||||
"/people",
|
||||
"/ueber-uns/team",
|
||||
"/about/team",
|
||||
"/fakultaet/personen",
|
||||
"/institute",
|
||||
}
|
||||
|
||||
for _, path := range commonPaths {
|
||||
testURL := baseURL + path
|
||||
exists, err := c.checkPageExists(ctx, testURL)
|
||||
if err == nil && exists {
|
||||
pages = append(pages, testURL)
|
||||
}
|
||||
}
|
||||
|
||||
// Also try to find staff links on the main page
|
||||
mainPageLinks, err := c.findStaffLinksOnPage(ctx, baseURL)
|
||||
if err == nil {
|
||||
pages = append(pages, mainPageLinks...)
|
||||
}
|
||||
|
||||
// UOL-specific: Find department/personen pages through navigation
|
||||
// Check for both uol.de and uni-oldenburg.de (they are the same university)
|
||||
if strings.Contains(baseURL, "uol.de") || strings.Contains(baseURL, "uni-oldenburg.de") {
|
||||
log.Printf("[UOL] Detected Uni Oldenburg, using UOL-specific crawler for %s", baseURL)
|
||||
uolPages, err := c.findUOLDepartmentPages(ctx, baseURL)
|
||||
if err == nil {
|
||||
log.Printf("[UOL] Found %d department pages", len(uolPages))
|
||||
pages = append(pages, uolPages...)
|
||||
} else {
|
||||
log.Printf("[UOL] Error finding department pages: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Deduplicate
|
||||
seen := make(map[string]bool)
|
||||
var unique []string
|
||||
for _, p := range pages {
|
||||
if !seen[p] {
|
||||
seen[p] = true
|
||||
unique = append(unique, p)
|
||||
}
|
||||
}
|
||||
|
||||
return unique, nil
|
||||
}
|
||||
|
||||
// findUOLDepartmentPages finds department person pages for Uni Oldenburg
|
||||
func (c *StaffCrawler) findUOLDepartmentPages(ctx context.Context, baseURL string) ([]string, error) {
|
||||
var pages []string
|
||||
|
||||
// UOL uses both uol.de and uni-oldenburg.de domains
|
||||
// Departments have /personen or /team subpages
|
||||
|
||||
// Helper to check if URL is UOL-related
|
||||
isUOLURL := func(url string) bool {
|
||||
lower := strings.ToLower(url)
|
||||
return strings.Contains(lower, "uol.de") || strings.Contains(lower, "uni-oldenburg.de")
|
||||
}
|
||||
|
||||
// First try to find department links from known starting points
|
||||
startPages := []string{
|
||||
"https://uol.de/informatik/department/abteilungen-und-einrichtungen",
|
||||
"https://uol.de/fk2",
|
||||
"https://uol.de/fk1",
|
||||
"https://uol.de/fk3",
|
||||
"https://uol.de/fk4",
|
||||
"https://uol.de/fk5",
|
||||
"https://uol.de/fk6",
|
||||
baseURL,
|
||||
}
|
||||
|
||||
deptPaths := make(map[string]bool)
|
||||
|
||||
for _, startURL := range startPages {
|
||||
log.Printf("[UOL] Scanning start page: %s", startURL)
|
||||
body, err := c.fetchPage(ctx, startURL)
|
||||
if err != nil {
|
||||
log.Printf("[UOL] Error fetching %s: %v", startURL, err)
|
||||
continue
|
||||
}
|
||||
|
||||
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
// Find links to department pages (they typically have /personen subpages)
|
||||
doc.Find("a[href]").Each(func(i int, s *goquery.Selection) {
|
||||
href, exists := s.Attr("href")
|
||||
if !exists {
|
||||
return
|
||||
}
|
||||
|
||||
// Look for department-like paths
|
||||
hrefLower := strings.ToLower(href)
|
||||
isDeptPath := isUOLURL(href) &&
|
||||
!strings.Contains(hrefLower, "/studium") &&
|
||||
!strings.Contains(hrefLower, "/forschung") &&
|
||||
!strings.Contains(hrefLower, "/aktuelles") &&
|
||||
!strings.Contains(hrefLower, "/kontakt")
|
||||
|
||||
if isDeptPath {
|
||||
fullURL := resolveURL(startURL, href)
|
||||
if fullURL != "" && isUOLURL(fullURL) {
|
||||
// Add personen page for this department
|
||||
personenURL := strings.TrimSuffix(fullURL, "/") + "/personen"
|
||||
deptPaths[personenURL] = true
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
// Also look for direct /personen or /team links
|
||||
doc.Find("a[href*='/personen'], a[href*='/team']").Each(func(i int, s *goquery.Selection) {
|
||||
href, exists := s.Attr("href")
|
||||
if exists {
|
||||
fullURL := resolveURL(startURL, href)
|
||||
if fullURL != "" && isUOLURL(fullURL) {
|
||||
deptPaths[fullURL] = true
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// Add well-known department personen pages directly (these exist for sure)
|
||||
knownDepts := []string{
|
||||
"https://uol.de/socps/personen",
|
||||
"https://uol.de/vlba/team",
|
||||
"https://uol.de/informatik/department",
|
||||
"https://uol.de/se/team",
|
||||
"https://uol.de/ei/personen",
|
||||
"https://uol.de/is/team",
|
||||
"https://uol.de/paedagogik/personen",
|
||||
"https://uol.de/psychologie/personen",
|
||||
"https://uol.de/germanistik/personen",
|
||||
"https://uol.de/physik/personen",
|
||||
"https://uol.de/chemie/personen",
|
||||
"https://uol.de/biologie/personen",
|
||||
"https://uol.de/mathe/personen",
|
||||
}
|
||||
for _, dept := range knownDepts {
|
||||
deptPaths[dept] = true
|
||||
}
|
||||
|
||||
log.Printf("[UOL] Checking %d potential department pages", len(deptPaths))
|
||||
|
||||
// Verify which pages actually exist
|
||||
for path := range deptPaths {
|
||||
exists, err := c.checkPageExists(ctx, path)
|
||||
if err == nil && exists {
|
||||
log.Printf("[UOL] Found valid page: %s", path)
|
||||
pages = append(pages, path)
|
||||
}
|
||||
}
|
||||
|
||||
log.Printf("[UOL] Found %d valid department/personen pages", len(pages))
|
||||
return pages, nil
|
||||
}
|
||||
|
||||
// checkPageExists checks if a URL returns a 200 status
|
||||
func (c *StaffCrawler) checkPageExists(ctx context.Context, urlStr string) (bool, error) {
|
||||
c.waitForRateLimit(urlStr)
|
||||
|
||||
req, err := http.NewRequestWithContext(ctx, "HEAD", urlStr, nil)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
req.Header.Set("User-Agent", c.userAgent)
|
||||
|
||||
resp, err := c.client.Do(req)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
return resp.StatusCode == http.StatusOK, nil
|
||||
}
|
||||
|
||||
// findStaffLinksOnPage finds links to staff pages on a given page
|
||||
func (c *StaffCrawler) findStaffLinksOnPage(ctx context.Context, pageURL string) ([]string, error) {
|
||||
body, err := c.fetchPage(ctx, pageURL)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var links []string
|
||||
staffKeywords := []string{"team", "personen", "mitarbeiter", "staff", "people", "dozent", "professor"}
|
||||
|
||||
doc.Find("a[href]").Each(func(i int, s *goquery.Selection) {
|
||||
href, exists := s.Attr("href")
|
||||
if !exists {
|
||||
return
|
||||
}
|
||||
|
||||
text := strings.ToLower(s.Text())
|
||||
hrefLower := strings.ToLower(href)
|
||||
|
||||
for _, keyword := range staffKeywords {
|
||||
if strings.Contains(text, keyword) || strings.Contains(hrefLower, keyword) {
|
||||
fullURL := resolveURL(pageURL, href)
|
||||
if fullURL != "" {
|
||||
links = append(links, fullURL)
|
||||
}
|
||||
break
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
return links, nil
|
||||
}
|
||||
364
edu-search-service/internal/staff/staff_crawler_enrich.go
Normal file
364
edu-search-service/internal/staff/staff_crawler_enrich.go
Normal file
@@ -0,0 +1,364 @@
|
||||
package staff
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"fmt"
|
||||
"log"
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
|
||||
"github.com/breakpilot/edu-search-service/internal/database"
|
||||
)
|
||||
|
||||
// EnrichStaffProfiles fetches individual profile pages and extracts detailed info
|
||||
// like email, phone, office, research interests, and publication links
|
||||
func (c *StaffCrawler) EnrichStaffProfiles(ctx context.Context, uni *database.University) (int, error) {
|
||||
// Get all staff for this university that have profile URLs
|
||||
staffList, err := c.repo.SearchStaff(ctx, database.StaffSearchParams{
|
||||
UniversityID: &uni.ID,
|
||||
Limit: 10000,
|
||||
})
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("failed to search staff: %w", err)
|
||||
}
|
||||
|
||||
log.Printf("[Profile Enrichment] Starting enrichment for %d staff members at %s", staffList.Total, uni.Name)
|
||||
|
||||
enriched := 0
|
||||
for _, staff := range staffList.Staff {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return enriched, ctx.Err()
|
||||
default:
|
||||
}
|
||||
|
||||
// Skip if no profile URL
|
||||
if staff.ProfileURL == nil || *staff.ProfileURL == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
// Skip if already has email (already enriched)
|
||||
if staff.Email != nil && *staff.Email != "" {
|
||||
continue
|
||||
}
|
||||
|
||||
// Fetch and extract profile details
|
||||
details, err := c.extractProfileDetails(ctx, *staff.ProfileURL)
|
||||
if err != nil {
|
||||
log.Printf("[Profile Enrichment] Error fetching %s: %v", *staff.ProfileURL, err)
|
||||
continue
|
||||
}
|
||||
|
||||
// Update staff record with new details
|
||||
updated := false
|
||||
if details.Email != "" && staff.Email == nil {
|
||||
staff.Email = &details.Email
|
||||
updated = true
|
||||
}
|
||||
if details.Phone != "" && staff.Phone == nil {
|
||||
staff.Phone = &details.Phone
|
||||
updated = true
|
||||
}
|
||||
if details.Office != "" && staff.Office == nil {
|
||||
staff.Office = &details.Office
|
||||
updated = true
|
||||
}
|
||||
if details.ORCID != "" && staff.ORCID == nil {
|
||||
staff.ORCID = &details.ORCID
|
||||
updated = true
|
||||
}
|
||||
if details.GoogleScholarID != "" && staff.GoogleScholarID == nil {
|
||||
staff.GoogleScholarID = &details.GoogleScholarID
|
||||
updated = true
|
||||
}
|
||||
if details.ResearchgateURL != "" && staff.ResearchgateURL == nil {
|
||||
staff.ResearchgateURL = &details.ResearchgateURL
|
||||
updated = true
|
||||
}
|
||||
if details.LinkedInURL != "" && staff.LinkedInURL == nil {
|
||||
staff.LinkedInURL = &details.LinkedInURL
|
||||
updated = true
|
||||
}
|
||||
if details.PersonalWebsite != "" && staff.PersonalWebsite == nil {
|
||||
staff.PersonalWebsite = &details.PersonalWebsite
|
||||
updated = true
|
||||
}
|
||||
if len(details.ResearchInterests) > 0 && len(staff.ResearchInterests) == 0 {
|
||||
staff.ResearchInterests = details.ResearchInterests
|
||||
updated = true
|
||||
}
|
||||
if details.PhotoURL != "" && staff.PhotoURL == nil {
|
||||
staff.PhotoURL = &details.PhotoURL
|
||||
updated = true
|
||||
}
|
||||
|
||||
if updated {
|
||||
err = c.repo.CreateStaff(ctx, &staff)
|
||||
if err != nil {
|
||||
log.Printf("[Profile Enrichment] Error updating %s: %v", staff.LastName, err)
|
||||
continue
|
||||
}
|
||||
enriched++
|
||||
log.Printf("[Profile Enrichment] Enriched: %s %s (email=%v)", stringValue(staff.FirstName), staff.LastName, details.Email != "")
|
||||
}
|
||||
}
|
||||
|
||||
log.Printf("[Profile Enrichment] Completed: enriched %d of %d staff members", enriched, staffList.Total)
|
||||
return enriched, nil
|
||||
}
|
||||
|
||||
// ProfileDetails contains extracted details from a profile page.
// All fields are zero-valued when the corresponding information was not
// found; extraction is best-effort and first-match-wins per field.
type ProfileDetails struct {
	Email             string   // contact email, usually taken from a mailto: link
	Phone             string   // phone number, usually taken from a tel: link
	Office            string   // room/office designation from a "Raum"/"Büro" label
	ORCID             string   // bare ORCID identifier (dddd-dddd-dddd-dddX)
	GoogleScholarID   string   // the `user=` ID from a scholar.google citations URL
	ResearchgateURL   string   // full researchgate.net profile URL as linked
	LinkedInURL       string   // full linkedin.com profile URL as linked
	PersonalWebsite   string   // non-university homepage, if a likely link was found
	ResearchInterests []string // list items found near research/interest headings
	PhotoURL          string   // absolute URL of a likely profile photo
}
|
||||
|
||||
// extractProfileDetails extracts contact info from an individual profile page.
// It fetches profileURL, then runs a sequence of extraction passes over the
// parsed document. Each field is first-match-wins: once set, later passes
// leave it untouched. The dt/dd pass runs first because it is the most
// reliable source on UOL pages; generic fallbacks follow.
func (c *StaffCrawler) extractProfileDetails(ctx context.Context, profileURL string) (*ProfileDetails, error) {
	body, err := c.fetchPage(ctx, profileURL)
	if err != nil {
		return nil, err
	}

	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
	if err != nil {
		return nil, err
	}

	details := &ProfileDetails{}

	// UOL-specific: Look for definition list pattern (dt/dd pairs).
	// This is the most reliable way to get contact info on UOL pages.
	// The <dt> text is the label ("Email", "Telefon", "Raum", ...) and the
	// immediately following <dd> sibling carries the value.
	doc.Find("dt").Each(func(i int, dt *goquery.Selection) {
		label := strings.TrimSpace(strings.ToLower(dt.Text()))
		dd := dt.Next()
		// Only honor a dt that is directly followed by a dd.
		if dd.Length() == 0 || goquery.NodeName(dd) != "dd" {
			return
		}
		value := strings.TrimSpace(dd.Text())

		switch {
		case strings.Contains(label, "email") || strings.Contains(label, "e-mail"):
			if details.Email == "" {
				// Get email from mailto link if present (first mailto wins).
				dd.Find("a[href^='mailto:']").Each(func(j int, a *goquery.Selection) {
					if details.Email != "" {
						return
					}
					href, _ := a.Attr("href")
					email := strings.TrimPrefix(href, "mailto:")
					// Drop ?subject=... query params appended to the address.
					email = strings.Split(email, "?")[0]
					if strings.Contains(email, "@") {
						details.Email = strings.TrimSpace(email)
					}
				})
				// Fallback: extract from the dd's plain text.
				// NOTE(review): MustCompile inside a per-element closure recompiles
				// on every match — hoist to package scope if this becomes hot.
				if details.Email == "" && strings.Contains(value, "@") {
					emailPattern := regexp.MustCompile(`[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-z]{2,10}`)
					if match := emailPattern.FindString(value); match != "" {
						details.Email = match
					}
				}
			}
		case strings.Contains(label, "telefon") || strings.Contains(label, "phone") || strings.Contains(label, "tel"):
			if details.Phone == "" {
				// Get phone from tel: link if present (first tel: wins).
				dd.Find("a[href^='tel:']").Each(func(j int, a *goquery.Selection) {
					if details.Phone != "" {
						return
					}
					href, _ := a.Attr("href")
					phone := strings.TrimPrefix(href, "tel:")
					// Require a minimally plausible length to reject fragments.
					if len(phone) >= 8 {
						details.Phone = phone
					}
				})
				// Fallback: extract a number-looking run from the dd text.
				if details.Phone == "" {
					phonePattern := regexp.MustCompile(`\+?[\d\s\-/()]{8,20}`)
					if match := phonePattern.FindString(value); match != "" {
						details.Phone = strings.TrimSpace(match)
					}
				}
			}
		case strings.Contains(label, "raum") || strings.Contains(label, "büro") || strings.Contains(label, "office"):
			if details.Office == "" {
				details.Office = value
			}
		}
	})

	// Fallback: Extract email from mailto links anywhere on the page
	// if not found via dt/dd.
	if details.Email == "" {
		doc.Find("a[href^='mailto:']").Each(func(i int, s *goquery.Selection) {
			if details.Email != "" {
				return
			}
			href, _ := s.Attr("href")
			email := strings.TrimPrefix(href, "mailto:")
			email = strings.Split(email, "?")[0]
			// Only accept personal email addresses (not generic like info@, sekretariat@),
			// since page-wide mailto links often point at the department inbox.
			if strings.Contains(email, "@") {
				emailLower := strings.ToLower(email)
				isGeneric := strings.HasPrefix(emailLower, "info@") ||
					strings.HasPrefix(emailLower, "sekretariat@") ||
					strings.HasPrefix(emailLower, "kontakt@") ||
					strings.HasPrefix(emailLower, "office@") ||
					strings.HasPrefix(emailLower, "fachschaft@")
				if !isGeneric {
					details.Email = strings.TrimSpace(email)
				}
			}
		})
	}

	// Fallback: Extract phone from any tel: link if not found via dt/dd.
	if details.Phone == "" {
		doc.Find("a[href^='tel:']").Each(func(i int, s *goquery.Selection) {
			if details.Phone != "" {
				return
			}
			href, _ := s.Attr("href")
			phone := strings.TrimPrefix(href, "tel:")
			if len(phone) >= 8 {
				details.Phone = phone
			}
		})
	}

	// Extract ORCID: first orcid.org link containing a well-formed ID.
	doc.Find("a[href*='orcid.org']").Each(func(i int, s *goquery.Selection) {
		if details.ORCID != "" {
			return
		}
		href, _ := s.Attr("href")
		orcidPattern := regexp.MustCompile(`\d{4}-\d{4}-\d{4}-\d{3}[\dX]`)
		if match := orcidPattern.FindString(href); match != "" {
			details.ORCID = match
		}
	})

	// Extract Google Scholar ID (first matching link wins).
	doc.Find("a[href*='scholar.google']").Each(func(i int, s *goquery.Selection) {
		if details.GoogleScholarID != "" {
			return
		}
		href, _ := s.Attr("href")
		// Extract user ID from URL like scholar.google.com/citations?user=XXXXX
		if strings.Contains(href, "user=") {
			parts := strings.Split(href, "user=")
			if len(parts) > 1 {
				// Trim trailing query parameters after the ID.
				userID := strings.Split(parts[1], "&")[0]
				details.GoogleScholarID = userID
			}
		}
	})

	// Extract ResearchGate URL (stored verbatim, first link wins).
	doc.Find("a[href*='researchgate.net']").Each(func(i int, s *goquery.Selection) {
		if details.ResearchgateURL != "" {
			return
		}
		href, _ := s.Attr("href")
		if strings.Contains(href, "researchgate.net") {
			details.ResearchgateURL = href
		}
	})

	// Extract LinkedIn URL (stored verbatim, first link wins).
	doc.Find("a[href*='linkedin.com']").Each(func(i int, s *goquery.Selection) {
		if details.LinkedInURL != "" {
			return
		}
		href, _ := s.Attr("href")
		if strings.Contains(href, "linkedin.com") {
			details.LinkedInURL = href
		}
	})

	// Extract personal website: any absolute link that is neither a
	// university page nor a known social/research platform, whose anchor
	// text hints at a homepage.
	doc.Find("a[href^='http']").Each(func(i int, s *goquery.Selection) {
		if details.PersonalWebsite != "" {
			return
		}
		href, _ := s.Attr("href")
		text := strings.ToLower(s.Text())

		// Skip university links, social media, etc.
		if strings.Contains(href, "uni-oldenburg.de") || strings.Contains(href, "uol.de") ||
			strings.Contains(href, "linkedin") || strings.Contains(href, "researchgate") ||
			strings.Contains(href, "orcid.org") || strings.Contains(href, "scholar.google") ||
			strings.Contains(href, "twitter") || strings.Contains(href, "facebook") {
			return
		}

		// Look for personal website indicators in the anchor text.
		if strings.Contains(text, "homepage") || strings.Contains(text, "website") ||
			strings.Contains(text, "personal") || strings.Contains(text, "www") {
			details.PersonalWebsite = href
		}
	})

	// Extract photo URL: first <img> that passes the icon/logo filter and
	// whose alt text or class hints at a portrait.
	doc.Find("img").Each(func(i int, s *goquery.Selection) {
		if details.PhotoURL != "" {
			return
		}
		src, exists := s.Attr("src")
		if !exists {
			return
		}

		// Skip icons, logos, etc.
		srcLower := strings.ToLower(src)
		if strings.Contains(srcLower, "icon") || strings.Contains(srcLower, "logo") ||
			strings.Contains(srcLower, "placeholder") || strings.Contains(srcLower, "default") {
			return
		}

		// Look for images that might be profile photos.
		alt, _ := s.Attr("alt")
		altLower := strings.ToLower(alt)
		classes, _ := s.Attr("class")
		classesLower := strings.ToLower(classes)

		if strings.Contains(altLower, "foto") || strings.Contains(altLower, "photo") ||
			strings.Contains(altLower, "portrait") || strings.Contains(altLower, "bild") ||
			strings.Contains(classesLower, "photo") || strings.Contains(classesLower, "portrait") ||
			strings.Contains(classesLower, "profile") {
			details.PhotoURL = resolveURL(profileURL, src)
		}
	})

	// Extract research interests/areas.
	// Look for sections about research, forschung, schwerpunkte.
	// NOTE(review): Find("*") visits every element and Text() on ancestors
	// contains descendant text, so broad containers can match too — the
	// first element whose parent holds plausible <li> items wins.
	doc.Find("*").Each(func(i int, s *goquery.Selection) {
		if len(details.ResearchInterests) > 0 {
			return
		}
		text := strings.ToLower(s.Text())
		if strings.Contains(text, "forschung") || strings.Contains(text, "research") ||
			strings.Contains(text, "schwerpunkt") || strings.Contains(text, "interest") {
			// Check if parent has a list of items.
			s.Parent().Find("li").Each(func(j int, li *goquery.Selection) {
				interest := strings.TrimSpace(li.Text())
				// Length bounds reject nav fragments and whole paragraphs.
				if len(interest) > 3 && len(interest) < 200 {
					details.ResearchInterests = append(details.ResearchInterests, interest)
				}
			})
		}
	})

	return details, nil
}
|
||||
495
edu-search-service/internal/staff/staff_crawler_extract.go
Normal file
495
edu-search-service/internal/staff/staff_crawler_extract.go
Normal file
@@ -0,0 +1,495 @@
|
||||
package staff
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"log"
|
||||
"strings"
|
||||
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
|
||||
"github.com/breakpilot/edu-search-service/internal/database"
|
||||
)
|
||||
|
||||
// extractStaffFromPage extracts staff information from a staff listing page
|
||||
func (c *StaffCrawler) extractStaffFromPage(ctx context.Context, pageURL string, uni *database.University) ([]*database.UniversityStaff, error) {
|
||||
body, err := c.fetchPage(ctx, pageURL)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var staff []*database.UniversityStaff
|
||||
|
||||
// Try different extraction strategies
|
||||
extractors := []func(*goquery.Document, string) []*database.UniversityStaff{
|
||||
c.extractFromUOLPatterns, // UOL-specific patterns first
|
||||
c.extractFromPersonCards,
|
||||
c.extractFromTable,
|
||||
c.extractFromList,
|
||||
c.extractFromVCard,
|
||||
}
|
||||
|
||||
for _, extractor := range extractors {
|
||||
extracted := extractor(doc, pageURL)
|
||||
if len(extracted) > 0 {
|
||||
staff = append(staff, extracted...)
|
||||
}
|
||||
}
|
||||
|
||||
return staff, nil
|
||||
}
|
||||
|
||||
// extractFromUOLPatterns extracts staff using Uni Oldenburg specific patterns
|
||||
// UOL uses: nav#left-nav for person lists, p.mit-icon.person for person links,
|
||||
// and /suche/person?username=XXX for person API
|
||||
// Also captures hierarchy from section headers (Leitung, Mitarbeiter, etc.)
|
||||
func (c *StaffCrawler) extractFromUOLPatterns(doc *goquery.Document, baseURL string) []*database.UniversityStaff {
|
||||
var staff []*database.UniversityStaff
|
||||
seen := make(map[string]bool)
|
||||
|
||||
// Extract department name from page title or breadcrumb
|
||||
deptName := ""
|
||||
doc.Find("h1").First().Each(func(i int, s *goquery.Selection) {
|
||||
deptName = strings.TrimSpace(s.Text())
|
||||
})
|
||||
|
||||
// Pattern 5 (NEW): Parse content with hierarchy headers
|
||||
// UOL pages have structure like:
|
||||
// #### Leitung
|
||||
// <ul><li><a href="...">Prof. Dr. Name</a></li></ul>
|
||||
// #### Wissenschaftliche Mitarbeiterinnen und Mitarbeiter
|
||||
// <ul><li><a href="...">M. Sc. Name</a></li></ul>
|
||||
currentRole := ""
|
||||
var leaderName string // Track the department head for supervisor assignment
|
||||
|
||||
// Walk through content area looking for headers and lists
|
||||
doc.Find("#content h4, #content h3, #content ul li a, .inhalt h4, .inhalt h3, .inhalt ul li a").Each(func(i int, s *goquery.Selection) {
|
||||
tagName := goquery.NodeName(s)
|
||||
|
||||
// Check if this is a section header
|
||||
if tagName == "h3" || tagName == "h4" {
|
||||
headerText := strings.ToLower(strings.TrimSpace(s.Text()))
|
||||
if strings.Contains(headerText, "leitung") {
|
||||
currentRole = "leitung"
|
||||
} else if strings.Contains(headerText, "sekretariat") {
|
||||
currentRole = "sekretariat"
|
||||
} else if strings.Contains(headerText, "wissenschaftlich") || strings.Contains(headerText, "mitarbeiter") {
|
||||
currentRole = "mitarbeiter"
|
||||
} else if strings.Contains(headerText, "doktorand") || strings.Contains(headerText, "promovierend") {
|
||||
currentRole = "doktorand"
|
||||
} else if strings.Contains(headerText, "technisch") {
|
||||
currentRole = "technisch"
|
||||
} else if strings.Contains(headerText, "extern") {
|
||||
currentRole = "extern"
|
||||
} else if strings.Contains(headerText, "student") || strings.Contains(headerText, "hilfskr") || strings.Contains(headerText, "hiwi") {
|
||||
currentRole = "hiwi"
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// Process person links under current header
|
||||
if tagName == "a" {
|
||||
href, exists := s.Attr("href")
|
||||
if !exists {
|
||||
return
|
||||
}
|
||||
|
||||
// Check if this looks like a person page link
|
||||
if !strings.Contains(href, "/personen/") && !strings.Contains(href, "suche/person") {
|
||||
return
|
||||
}
|
||||
|
||||
name := strings.TrimSpace(s.Text())
|
||||
if name == "" || seen[name] || !c.looksLikeName(name) {
|
||||
return
|
||||
}
|
||||
seen[name] = true
|
||||
|
||||
person := &database.UniversityStaff{}
|
||||
person.FullName = &name
|
||||
c.parseName(name, person)
|
||||
|
||||
if person.LastName != "" {
|
||||
fullURL := resolveURL(baseURL, href)
|
||||
person.ProfileURL = &fullURL
|
||||
|
||||
// Set team role based on current section
|
||||
if currentRole != "" {
|
||||
person.TeamRole = ¤tRole
|
||||
}
|
||||
|
||||
// Track leader for supervisor assignment
|
||||
if currentRole == "leitung" && leaderName == "" {
|
||||
leaderName = name
|
||||
person.IsProfessor = true
|
||||
posType := "professor"
|
||||
person.PositionType = &posType
|
||||
}
|
||||
|
||||
staff = append(staff, person)
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
// Pattern 1: nav#left-nav ul li a - side navigation with person links
|
||||
// Format: /abteilung/personen/prof-dr-name or /abteilung/personen/m-sc-name
|
||||
doc.Find("nav#left-nav ul li a, #left-navi li a").Each(func(i int, s *goquery.Selection) {
|
||||
href, exists := s.Attr("href")
|
||||
if !exists {
|
||||
return
|
||||
}
|
||||
|
||||
// Check if this looks like a person page link
|
||||
if !strings.Contains(href, "/personen/") {
|
||||
return
|
||||
}
|
||||
|
||||
name := strings.TrimSpace(s.Text())
|
||||
if name == "" || seen[name] {
|
||||
return
|
||||
}
|
||||
seen[name] = true
|
||||
|
||||
person := &database.UniversityStaff{}
|
||||
person.FullName = &name
|
||||
c.parseName(name, person)
|
||||
|
||||
if person.LastName != "" {
|
||||
fullURL := resolveURL(baseURL, href)
|
||||
person.ProfileURL = &fullURL
|
||||
staff = append(staff, person)
|
||||
}
|
||||
})
|
||||
|
||||
// Pattern 2: p.mit-icon.person a - inline person references
|
||||
// Format: <p class="mit-icon person"><a href="/suche/person/USERNAME">Prof. Dr. Name</a></p>
|
||||
// OR: <p class="mit-icon person"><a href="/abteilung/personen/prof-dr-name">Prof. Dr. Name</a></p>
|
||||
doc.Find("p.mit-icon.person a, .mit-icon.person a").Each(func(i int, s *goquery.Selection) {
|
||||
name := strings.TrimSpace(s.Text())
|
||||
if name == "" || seen[name] {
|
||||
return
|
||||
}
|
||||
seen[name] = true
|
||||
|
||||
person := &database.UniversityStaff{}
|
||||
person.FullName = &name
|
||||
c.parseName(name, person)
|
||||
|
||||
if person.LastName != "" {
|
||||
href, exists := s.Attr("href")
|
||||
if exists {
|
||||
fullURL := resolveURL(baseURL, href)
|
||||
person.ProfileURL = &fullURL
|
||||
}
|
||||
staff = append(staff, person)
|
||||
}
|
||||
})
|
||||
|
||||
// Pattern 3: Links to /suche/person?username=XXX
|
||||
doc.Find("a[href*='suche/person']").Each(func(i int, s *goquery.Selection) {
|
||||
name := strings.TrimSpace(s.Text())
|
||||
// Skip non-person text like "Internetkoordinator"
|
||||
if name == "" || seen[name] || !c.looksLikeName(name) {
|
||||
return
|
||||
}
|
||||
seen[name] = true
|
||||
|
||||
person := &database.UniversityStaff{}
|
||||
person.FullName = &name
|
||||
c.parseName(name, person)
|
||||
|
||||
if person.LastName != "" {
|
||||
href, exists := s.Attr("href")
|
||||
if exists {
|
||||
fullURL := resolveURL(baseURL, href)
|
||||
person.ProfileURL = &fullURL
|
||||
}
|
||||
staff = append(staff, person)
|
||||
}
|
||||
})
|
||||
|
||||
// Pattern 4: Breadcrumb navigation sublinks with person names
|
||||
// Format: <ul class="sublinks"><li><a href="/dept/personen/name">Prof. Dr. Name</a></li>
|
||||
doc.Find(".sublinks li a, nav#navizeile .sublinks li a").Each(func(i int, s *goquery.Selection) {
|
||||
href, exists := s.Attr("href")
|
||||
if !exists || !strings.Contains(href, "/personen/") {
|
||||
return
|
||||
}
|
||||
|
||||
name := strings.TrimSpace(s.Text())
|
||||
if name == "" || seen[name] {
|
||||
return
|
||||
}
|
||||
seen[name] = true
|
||||
|
||||
person := &database.UniversityStaff{}
|
||||
person.FullName = &name
|
||||
c.parseName(name, person)
|
||||
|
||||
if person.LastName != "" {
|
||||
fullURL := resolveURL(baseURL, href)
|
||||
person.ProfileURL = &fullURL
|
||||
staff = append(staff, person)
|
||||
}
|
||||
})
|
||||
|
||||
if len(staff) > 0 {
|
||||
log.Printf("[UOL Extractor] Found %d staff members using UOL patterns (dept: %s)", len(staff), deptName)
|
||||
}
|
||||
|
||||
return staff
|
||||
}
|
||||
|
||||
// extractFromPersonCards extracts staff from card-style layouts
|
||||
func (c *StaffCrawler) extractFromPersonCards(doc *goquery.Document, baseURL string) []*database.UniversityStaff {
|
||||
var staff []*database.UniversityStaff
|
||||
|
||||
// Common card selectors
|
||||
cardSelectors := []string{
|
||||
".person-card",
|
||||
".staff-card",
|
||||
".team-member",
|
||||
".mitarbeiter",
|
||||
".person",
|
||||
".employee",
|
||||
"[itemtype='http://schema.org/Person']",
|
||||
".vcard",
|
||||
}
|
||||
|
||||
for _, selector := range cardSelectors {
|
||||
doc.Find(selector).Each(func(i int, s *goquery.Selection) {
|
||||
person := c.extractPersonFromElement(s, baseURL)
|
||||
if person != nil && person.LastName != "" {
|
||||
staff = append(staff, person)
|
||||
}
|
||||
})
|
||||
|
||||
if len(staff) > 0 {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
return staff
|
||||
}
|
||||
|
||||
// extractFromTable extracts staff from table layouts
|
||||
func (c *StaffCrawler) extractFromTable(doc *goquery.Document, baseURL string) []*database.UniversityStaff {
|
||||
var staff []*database.UniversityStaff
|
||||
|
||||
doc.Find("table").Each(func(i int, table *goquery.Selection) {
|
||||
// Check if this looks like a staff table
|
||||
headerText := strings.ToLower(table.Find("th").Text())
|
||||
if !strings.Contains(headerText, "name") && !strings.Contains(headerText, "person") {
|
||||
return
|
||||
}
|
||||
|
||||
table.Find("tr").Each(func(j int, row *goquery.Selection) {
|
||||
if row.Find("th").Length() > 0 {
|
||||
return // Skip header row
|
||||
}
|
||||
|
||||
cells := row.Find("td")
|
||||
if cells.Length() < 2 {
|
||||
return
|
||||
}
|
||||
|
||||
person := &database.UniversityStaff{}
|
||||
|
||||
// First cell usually contains name
|
||||
nameCell := cells.First()
|
||||
name := strings.TrimSpace(nameCell.Text())
|
||||
person.FullName = &name
|
||||
c.parseName(name, person)
|
||||
|
||||
// Look for email
|
||||
row.Find("a[href^='mailto:']").Each(func(k int, a *goquery.Selection) {
|
||||
href, _ := a.Attr("href")
|
||||
email := strings.TrimPrefix(href, "mailto:")
|
||||
person.Email = &email
|
||||
})
|
||||
|
||||
// Look for profile link
|
||||
nameCell.Find("a[href]").Each(func(k int, a *goquery.Selection) {
|
||||
href, exists := a.Attr("href")
|
||||
if exists && !strings.HasPrefix(href, "mailto:") {
|
||||
fullURL := resolveURL(baseURL, href)
|
||||
person.ProfileURL = &fullURL
|
||||
}
|
||||
})
|
||||
|
||||
// Extract position from other cells
|
||||
cells.Each(func(k int, cell *goquery.Selection) {
|
||||
text := strings.TrimSpace(cell.Text())
|
||||
if c.looksLikePosition(text) {
|
||||
person.Position = &text
|
||||
person.PositionType = c.classifyPosition(text)
|
||||
person.IsProfessor = c.isProfessor(text)
|
||||
}
|
||||
})
|
||||
|
||||
if person.LastName != "" {
|
||||
staff = append(staff, person)
|
||||
}
|
||||
})
|
||||
})
|
||||
|
||||
return staff
|
||||
}
|
||||
|
||||
// extractFromList extracts staff from list layouts
|
||||
func (c *StaffCrawler) extractFromList(doc *goquery.Document, baseURL string) []*database.UniversityStaff {
|
||||
var staff []*database.UniversityStaff
|
||||
|
||||
listSelectors := []string{"ul.staff", "ul.team", "ul.mitarbeiter", ".staff-list li", ".team-list li"}
|
||||
|
||||
for _, selector := range listSelectors {
|
||||
doc.Find(selector).Each(func(i int, li *goquery.Selection) {
|
||||
person := c.extractPersonFromElement(li, baseURL)
|
||||
if person != nil && person.LastName != "" {
|
||||
staff = append(staff, person)
|
||||
}
|
||||
})
|
||||
|
||||
if len(staff) > 0 {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
return staff
|
||||
}
|
||||
|
||||
// extractFromVCard extracts staff from vCard/hCard microformats
|
||||
func (c *StaffCrawler) extractFromVCard(doc *goquery.Document, baseURL string) []*database.UniversityStaff {
|
||||
var staff []*database.UniversityStaff
|
||||
|
||||
doc.Find(".vcard, .h-card").Each(func(i int, s *goquery.Selection) {
|
||||
person := &database.UniversityStaff{}
|
||||
|
||||
// Name
|
||||
fn := s.Find(".fn, .p-name").Text()
|
||||
if fn != "" {
|
||||
person.FullName = &fn
|
||||
c.parseName(fn, person)
|
||||
}
|
||||
|
||||
// Email
|
||||
email := s.Find(".email, .u-email").Text()
|
||||
if email == "" {
|
||||
s.Find("a[href^='mailto:']").Each(func(j int, a *goquery.Selection) {
|
||||
href, _ := a.Attr("href")
|
||||
email = strings.TrimPrefix(href, "mailto:")
|
||||
})
|
||||
}
|
||||
if email != "" {
|
||||
person.Email = &email
|
||||
}
|
||||
|
||||
// Title/Position
|
||||
title := s.Find(".title, .p-job-title, .role").Text()
|
||||
if title != "" {
|
||||
person.Position = &title
|
||||
person.PositionType = c.classifyPosition(title)
|
||||
person.IsProfessor = c.isProfessor(title)
|
||||
}
|
||||
|
||||
// Photo
|
||||
s.Find(".photo, .u-photo, img").Each(func(j int, img *goquery.Selection) {
|
||||
src, exists := img.Attr("src")
|
||||
if exists {
|
||||
fullURL := resolveURL(baseURL, src)
|
||||
person.PhotoURL = &fullURL
|
||||
}
|
||||
})
|
||||
|
||||
// Profile URL
|
||||
s.Find("a[href].url, a[href].u-url").Each(func(j int, a *goquery.Selection) {
|
||||
href, exists := a.Attr("href")
|
||||
if exists {
|
||||
fullURL := resolveURL(baseURL, href)
|
||||
person.ProfileURL = &fullURL
|
||||
}
|
||||
})
|
||||
|
||||
if person.LastName != "" {
|
||||
staff = append(staff, person)
|
||||
}
|
||||
})
|
||||
|
||||
return staff
|
||||
}
|
||||
|
||||
// extractPersonFromElement extracts a person from a generic HTML element.
// The name and position use first-match-wins over their selector lists,
// the email and photo use last-match-wins over all matching descendants,
// and the profile URL keeps the first non-mailto/tel link. The returned
// person may have an empty LastName; callers filter on that.
func (c *StaffCrawler) extractPersonFromElement(s *goquery.Selection, baseURL string) *database.UniversityStaff {
	person := &database.UniversityStaff{}

	// Try to find name: first selector whose text is short enough and does
	// not itself look like a job title wins.
	nameSelectors := []string{".name", ".person-name", "h2", "h3", "h4", ".title", "strong", "b"}
	for _, sel := range nameSelectors {
		name := strings.TrimSpace(s.Find(sel).First().Text())
		if name != "" && len(name) < 100 && !c.looksLikePosition(name) {
			person.FullName = &name
			c.parseName(name, person)
			break
		}
	}

	// If no name found, try the first line of the element's whole text.
	if person.LastName == "" {
		text := strings.TrimSpace(s.Text())
		lines := strings.Split(text, "\n")
		if len(lines) > 0 {
			firstLine := strings.TrimSpace(lines[0])
			if len(firstLine) > 0 && len(firstLine) < 100 {
				person.FullName = &firstLine
				c.parseName(firstLine, person)
			}
		}
	}

	// Extract email from mailto links (last matching link wins).
	s.Find("a[href^='mailto:']").Each(func(i int, a *goquery.Selection) {
		href, _ := a.Attr("href")
		email := strings.TrimPrefix(href, "mailto:")
		email = strings.Split(email, "?")[0] // Remove query params
		person.Email = &email
	})

	// Extract position: first selector whose text passes looksLikePosition.
	positionSelectors := []string{".position", ".role", ".job-title", ".funktion", "small", ".subtitle"}
	for _, sel := range positionSelectors {
		pos := strings.TrimSpace(s.Find(sel).First().Text())
		if pos != "" && c.looksLikePosition(pos) {
			person.Position = &pos
			person.PositionType = c.classifyPosition(pos)
			person.IsProfessor = c.isProfessor(pos)
			break
		}
	}

	// Extract photo, skipping obvious placeholders/icons (last match wins).
	s.Find("img").Each(func(i int, img *goquery.Selection) {
		src, exists := img.Attr("src")
		if exists && !strings.Contains(src, "placeholder") && !strings.Contains(src, "icon") {
			fullURL := resolveURL(baseURL, src)
			person.PhotoURL = &fullURL
		}
	})

	// Extract profile link: the nil guard keeps the FIRST non-mailto,
	// non-tel link found in the element.
	s.Find("a[href]").Each(func(i int, a *goquery.Selection) {
		href, exists := a.Attr("href")
		if exists && !strings.HasPrefix(href, "mailto:") && !strings.HasPrefix(href, "tel:") {
			fullURL := resolveURL(baseURL, href)
			if person.ProfileURL == nil {
				person.ProfileURL = &fullURL
			}
		}
	})

	return person
}
|
||||
Reference in New Issue
Block a user