Fix: Remove broken getKlausurApiUrl and clean up empty lines
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s

A sed replacement left orphaned hostname references in the story page
and empty lines in the getApiBase functions.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-24 16:02:04 +02:00
parent b07f802c24
commit 9ba420fa91
150 changed files with 30231 additions and 32053 deletions

View File

@@ -278,267 +278,6 @@ func (h *AIExtractionHandlers) SubmitExtractedData(c *gin.Context) {
})
}
// SubmitBatchExtractedData saves multiple AI-extracted profile data items.
// Each item is processed independently: failures are reported per item and do
// not abort the batch. The response is always HTTP 200 with per-item results
// plus aggregate success/error counts.
// POST /api/v1/ai/extraction/submit-batch
func (h *AIExtractionHandlers) SubmitBatchExtractedData(c *gin.Context) {
	var batch struct {
		Items []ExtractedProfileData `json:"items" binding:"required"`
	}
	if err := c.ShouldBindJSON(&batch); err != nil {
		c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid request: " + err.Error()})
		return
	}
	results := make([]gin.H, 0, len(batch.Items))
	successCount := 0
	errorCount := 0
	for _, item := range batch.Items {
		// Look up the existing staff record the extracted data refers to.
		staff, err := h.repo.GetStaff(c.Request.Context(), item.StaffID)
		if err != nil {
			results = append(results, gin.H{
				"staff_id": item.StaffID,
				"status":   "error",
				"error":    "Staff not found",
			})
			errorCount++
			continue
		}
		// Merge policy (same as the single-item submit endpoint): an extracted
		// value is applied only when it is non-empty AND the stored field is
		// still unset. fillIfEmpty stores a copy, so the record never aliases
		// the loop variable's memory (a pre-Go-1.22 capture hazard with the
		// original `staff.X = &item.X` pattern).
		updated := fillIfEmpty(&staff.Email, item.Email)
		updated = fillIfEmpty(&staff.Phone, item.Phone) || updated
		updated = fillIfEmpty(&staff.Office, item.Office) || updated
		updated = fillIfEmpty(&staff.Position, item.Position) || updated
		updated = fillIfEmpty(&staff.PositionType, item.PositionType) || updated
		updated = fillIfEmpty(&staff.TeamRole, item.TeamRole) || updated
		updated = fillIfEmpty(&staff.ORCID, item.ORCID) || updated
		if len(item.ResearchInterests) > 0 && len(staff.ResearchInterests) == 0 {
			staff.ResearchInterests = item.ResearchInterests
			updated = true
		}
		if updated {
			// Stamp the verification time only when we actually persist; the
			// previous code mutated LastVerified even for records it never saved.
			now := time.Now()
			staff.LastVerified = &now
			if err := h.repo.CreateStaff(c.Request.Context(), staff); err != nil {
				results = append(results, gin.H{
					"staff_id": item.StaffID,
					"status":   "error",
					"error":    err.Error(),
				})
				errorCount++
				continue
			}
		}
		results = append(results, gin.H{
			"staff_id": item.StaffID,
			"status":   "success",
			"updated":  updated,
		})
		successCount++
	}
	c.JSON(http.StatusOK, gin.H{
		"results":       results,
		"success_count": successCount,
		"error_count":   errorCount,
		"total":         len(batch.Items),
	})
}

// fillIfEmpty sets *dst to a copy of val when val is non-empty and the
// destination is currently unset (nil or empty string). It reports whether
// the destination was written.
func fillIfEmpty(dst **string, val string) bool {
	if val == "" || (*dst != nil && **dst != "") {
		return false
	}
	v := val
	*dst = &v
	return true
}
// InstituteHierarchyTask represents an institute page to crawl for hierarchy
// extraction. Tasks are produced by GetInstitutePages from the distinct
// source URLs recorded on staff profiles.
type InstituteHierarchyTask struct {
	InstituteURL  string    `json:"institute_url"`            // page to crawl (typically a department/institute URL)
	InstituteName string    `json:"institute_name,omitempty"` // optional display name, when known
	UniversityID  uuid.UUID `json:"university_id"`            // owning university
}
// GetInstitutePages returns institute/department pages that need hierarchy
// crawling. Candidates are the distinct source URLs recorded on staff
// profiles, optionally restricted to a single university.
// GET /api/v1/ai/extraction/institutes?university_id=...
func (h *AIExtractionHandlers) GetInstitutePages(c *gin.Context) {
	var universityID *uuid.UUID
	if uniIDStr := c.Query("university_id"); uniIDStr != "" {
		id, err := uuid.Parse(uniIDStr)
		if err != nil {
			// A malformed ID used to be silently ignored, which made the
			// endpoint return institutes for ALL universities. Fail loudly.
			c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid university_id"})
			return
		}
		universityID = &id
	}
	// Pull staff profiles; their source URLs are typically department pages.
	params := database.StaffSearchParams{
		UniversityID: universityID,
		Limit:        1000, // practical cap on how many profiles we scan per request
	}
	result, err := h.repo.SearchStaff(c.Request.Context(), params)
	if err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
		return
	}
	// Deduplicate source URLs while preserving first-seen order.
	urlSet := make(map[string]bool)
	var tasks []InstituteHierarchyTask
	for _, staff := range result.Staff {
		if staff.SourceURL != nil && *staff.SourceURL != "" {
			url := *staff.SourceURL
			if !urlSet[url] {
				urlSet[url] = true
				tasks = append(tasks, InstituteHierarchyTask{
					InstituteURL: url,
					UniversityID: staff.UniversityID,
				})
			}
		}
	}
	c.JSON(http.StatusOK, gin.H{
		"institutes": tasks,
		"total":      len(tasks),
	})
}
// InstituteHierarchyData represents hierarchy data extracted from an
// institute page and submitted by the AI extraction pipeline.
type InstituteHierarchyData struct {
	InstituteURL string    `json:"institute_url" binding:"required"` // page the data was extracted from
	UniversityID uuid.UUID `json:"university_id" binding:"required"` // owning university
	InstituteName string   `json:"institute_name,omitempty"`         // department/institute display name

	// Leadership
	LeaderName  string `json:"leader_name,omitempty"`
	LeaderTitle string `json:"leader_title,omitempty"` // e.g., "Professor", "Lehrstuhlinhaber"

	// Staff organization: members grouped by their role on the page.
	StaffGroups []struct {
		Role    string   `json:"role"`    // e.g., "Leitung", "Wissenschaftliche Mitarbeiter", "Sekretariat"
		Members []string `json:"members"` // Names of people in this group
	} `json:"staff_groups,omitempty"`

	// Teaching info (Lehrveranstaltungen)
	TeachingCourses []struct {
		Title   string `json:"title"`
		Teacher string `json:"teacher,omitempty"`
	} `json:"teaching_courses,omitempty"`
}
// SubmitInstituteHierarchy saves hierarchy data extracted from an institute
// page: it creates (or upserts) the department, attaches the leader and the
// listed staff-group members to it, and wires up supervisor links.
// POST /api/v1/ai/extraction/institutes/submit
func (h *AIExtractionHandlers) SubmitInstituteHierarchy(c *gin.Context) {
	var data InstituteHierarchyData
	if err := c.ShouldBindJSON(&data); err != nil {
		c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid request: " + err.Error()})
		return
	}
	// Find or create the department this hierarchy belongs to.
	dept := &database.Department{
		UniversityID: data.UniversityID,
		Name:         data.InstituteName,
	}
	if data.InstituteURL != "" {
		dept.URL = &data.InstituteURL
	}
	if err := h.repo.CreateDepartment(c.Request.Context(), dept); err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to create department: " + err.Error()})
		return
	}
	// Resolve the leader by best-effort name search and mark them as head of
	// the department. Leader lookup failure is non-fatal by design.
	var leaderID *uuid.UUID
	if data.LeaderName != "" {
		leaderParams := database.StaffSearchParams{
			Query:        data.LeaderName,
			UniversityID: &data.UniversityID,
			Limit:        1,
		}
		result, err := h.repo.SearchStaff(c.Request.Context(), leaderParams)
		if err == nil && len(result.Staff) > 0 {
			leader := &result.Staff[0]
			leaderID = &leader.ID
			leader.DepartmentID = &dept.ID
			roleLeitung := "leitung"
			leader.TeamRole = &roleLeitung
			leader.IsProfessor = true
			if data.LeaderTitle != "" {
				leader.AcademicTitle = &data.LeaderTitle
			}
			// The old code dropped this error; if the save failed we must not
			// hand out a leader ID that was never linked to the department.
			if err := h.repo.CreateStaff(c.Request.Context(), leader); err != nil {
				leaderID = nil
			}
		}
	}
	// Attach each staff-group member to the department and, where possible,
	// record the leader as their supervisor.
	updatedCount := 0
	for _, group := range data.StaffGroups {
		// Copy the role so member.TeamRole never aliases the loop variable
		// (pre-Go-1.22 the original &group.Role pointed at reused memory).
		role := group.Role
		for _, memberName := range group.Members {
			memberParams := database.StaffSearchParams{
				Query:        memberName,
				UniversityID: &data.UniversityID,
				Limit:        1,
			}
			result, err := h.repo.SearchStaff(c.Request.Context(), memberParams)
			if err != nil || len(result.Staff) == 0 {
				continue // unknown names are skipped, not treated as errors
			}
			member := &result.Staff[0]
			member.DepartmentID = &dept.ID
			member.TeamRole = &role
			if leaderID != nil && member.ID != *leaderID {
				member.SupervisorID = leaderID
			}
			// Count only members whose update actually persisted; the old
			// code ignored this error and over-reported members_updated.
			if err := h.repo.CreateStaff(c.Request.Context(), member); err != nil {
				continue
			}
			updatedCount++
		}
	}
	c.JSON(http.StatusOK, gin.H{
		"status":          "success",
		"department_id":   dept.ID,
		"leader_id":       leaderID,
		"members_updated": updatedCount,
	})
}
// RegisterAIExtractionRoutes registers AI extraction routes
func (h *AIExtractionHandlers) RegisterRoutes(r *gin.RouterGroup) {
ai := r.Group("/ai/extraction")

View File

@@ -0,0 +1,272 @@
package handlers
import (
"net/http"
"time"
"github.com/gin-gonic/gin"
"github.com/google/uuid"
"github.com/breakpilot/edu-search-service/internal/database"
)
// SubmitBatchExtractedData saves multiple AI-extracted profile data items.
// Each item is processed independently: failures are reported per item and do
// not abort the batch. The response is always HTTP 200 with per-item results
// plus aggregate success/error counts.
// POST /api/v1/ai/extraction/submit-batch
func (h *AIExtractionHandlers) SubmitBatchExtractedData(c *gin.Context) {
	var batch struct {
		Items []ExtractedProfileData `json:"items" binding:"required"`
	}
	if err := c.ShouldBindJSON(&batch); err != nil {
		c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid request: " + err.Error()})
		return
	}
	results := make([]gin.H, 0, len(batch.Items))
	successCount := 0
	errorCount := 0
	for _, item := range batch.Items {
		// Shadow the loop variable so the &item.X pointers stored below never
		// alias reused loop memory (a pre-Go-1.22 capture hazard).
		item := item
		// Look up the existing staff record the extracted data refers to.
		staff, err := h.repo.GetStaff(c.Request.Context(), item.StaffID)
		if err != nil {
			results = append(results, gin.H{
				"staff_id": item.StaffID,
				"status":   "error",
				"error":    "Staff not found",
			})
			errorCount++
			continue
		}
		// Merge policy (same as the single-item submit endpoint): an extracted
		// value is applied only when it is non-empty and the stored field is
		// still unset.
		updated := false
		if item.Email != "" && (staff.Email == nil || *staff.Email == "") {
			staff.Email = &item.Email
			updated = true
		}
		if item.Phone != "" && (staff.Phone == nil || *staff.Phone == "") {
			staff.Phone = &item.Phone
			updated = true
		}
		if item.Office != "" && (staff.Office == nil || *staff.Office == "") {
			staff.Office = &item.Office
			updated = true
		}
		if item.Position != "" && (staff.Position == nil || *staff.Position == "") {
			staff.Position = &item.Position
			updated = true
		}
		if item.PositionType != "" && (staff.PositionType == nil || *staff.PositionType == "") {
			staff.PositionType = &item.PositionType
			updated = true
		}
		if item.TeamRole != "" && (staff.TeamRole == nil || *staff.TeamRole == "") {
			staff.TeamRole = &item.TeamRole
			updated = true
		}
		if len(item.ResearchInterests) > 0 && len(staff.ResearchInterests) == 0 {
			staff.ResearchInterests = item.ResearchInterests
			updated = true
		}
		if item.ORCID != "" && (staff.ORCID == nil || *staff.ORCID == "") {
			staff.ORCID = &item.ORCID
			updated = true
		}
		if updated {
			// Stamp the verification time only when we actually persist; the
			// previous code mutated LastVerified even for records it never saved.
			now := time.Now()
			staff.LastVerified = &now
			if err := h.repo.CreateStaff(c.Request.Context(), staff); err != nil {
				results = append(results, gin.H{
					"staff_id": item.StaffID,
					"status":   "error",
					"error":    err.Error(),
				})
				errorCount++
				continue
			}
		}
		results = append(results, gin.H{
			"staff_id": item.StaffID,
			"status":   "success",
			"updated":  updated,
		})
		successCount++
	}
	c.JSON(http.StatusOK, gin.H{
		"results":       results,
		"success_count": successCount,
		"error_count":   errorCount,
		"total":         len(batch.Items),
	})
}
// InstituteHierarchyTask represents an institute page to crawl for hierarchy
// extraction. Tasks are produced by GetInstitutePages from the distinct
// source URLs recorded on staff profiles.
type InstituteHierarchyTask struct {
	InstituteURL  string    `json:"institute_url"`            // page to crawl (typically a department/institute URL)
	InstituteName string    `json:"institute_name,omitempty"` // optional display name, when known
	UniversityID  uuid.UUID `json:"university_id"`            // owning university
}
// GetInstitutePages returns institute/department pages that need hierarchy
// crawling. Candidates are the distinct source URLs recorded on staff
// profiles, optionally restricted to a single university.
// GET /api/v1/ai/extraction/institutes?university_id=...
func (h *AIExtractionHandlers) GetInstitutePages(c *gin.Context) {
	var universityID *uuid.UUID
	if uniIDStr := c.Query("university_id"); uniIDStr != "" {
		id, err := uuid.Parse(uniIDStr)
		if err != nil {
			// A malformed ID used to be silently ignored, which made the
			// endpoint return institutes for ALL universities. Fail loudly.
			c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid university_id"})
			return
		}
		universityID = &id
	}
	// Pull staff profiles; their source URLs are typically department pages.
	params := database.StaffSearchParams{
		UniversityID: universityID,
		Limit:        1000, // practical cap on how many profiles we scan per request
	}
	result, err := h.repo.SearchStaff(c.Request.Context(), params)
	if err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
		return
	}
	// Deduplicate source URLs while preserving first-seen order.
	urlSet := make(map[string]bool)
	var tasks []InstituteHierarchyTask
	for _, staff := range result.Staff {
		if staff.SourceURL != nil && *staff.SourceURL != "" {
			url := *staff.SourceURL
			if !urlSet[url] {
				urlSet[url] = true
				tasks = append(tasks, InstituteHierarchyTask{
					InstituteURL: url,
					UniversityID: staff.UniversityID,
				})
			}
		}
	}
	c.JSON(http.StatusOK, gin.H{
		"institutes": tasks,
		"total":      len(tasks),
	})
}
// InstituteHierarchyData represents hierarchy data extracted from an
// institute page and submitted by the AI extraction pipeline.
type InstituteHierarchyData struct {
	InstituteURL string    `json:"institute_url" binding:"required"` // page the data was extracted from
	UniversityID uuid.UUID `json:"university_id" binding:"required"` // owning university
	InstituteName string   `json:"institute_name,omitempty"`         // department/institute display name

	// Leadership
	LeaderName  string `json:"leader_name,omitempty"`
	LeaderTitle string `json:"leader_title,omitempty"` // e.g., "Professor", "Lehrstuhlinhaber"

	// Staff organization: members grouped by their role on the page.
	StaffGroups []struct {
		Role    string   `json:"role"`    // e.g., "Leitung", "Wissenschaftliche Mitarbeiter", "Sekretariat"
		Members []string `json:"members"` // Names of people in this group
	} `json:"staff_groups,omitempty"`

	// Teaching info (Lehrveranstaltungen)
	TeachingCourses []struct {
		Title   string `json:"title"`
		Teacher string `json:"teacher,omitempty"`
	} `json:"teaching_courses,omitempty"`
}
// SubmitInstituteHierarchy saves hierarchy data extracted from an institute
// page: it creates (or upserts) the department, attaches the leader and the
// listed staff-group members to it, and wires up supervisor links.
// POST /api/v1/ai/extraction/institutes/submit
func (h *AIExtractionHandlers) SubmitInstituteHierarchy(c *gin.Context) {
	var data InstituteHierarchyData
	if err := c.ShouldBindJSON(&data); err != nil {
		c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid request: " + err.Error()})
		return
	}
	// Find or create the department this hierarchy belongs to.
	dept := &database.Department{
		UniversityID: data.UniversityID,
		Name:         data.InstituteName,
	}
	if data.InstituteURL != "" {
		dept.URL = &data.InstituteURL
	}
	if err := h.repo.CreateDepartment(c.Request.Context(), dept); err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to create department: " + err.Error()})
		return
	}
	// Resolve the leader by best-effort name search and mark them as head of
	// the department. Leader lookup failure is non-fatal by design.
	var leaderID *uuid.UUID
	if data.LeaderName != "" {
		leaderParams := database.StaffSearchParams{
			Query:        data.LeaderName,
			UniversityID: &data.UniversityID,
			Limit:        1,
		}
		result, err := h.repo.SearchStaff(c.Request.Context(), leaderParams)
		if err == nil && len(result.Staff) > 0 {
			leader := &result.Staff[0]
			leaderID = &leader.ID
			leader.DepartmentID = &dept.ID
			roleLeitung := "leitung"
			leader.TeamRole = &roleLeitung
			leader.IsProfessor = true
			if data.LeaderTitle != "" {
				leader.AcademicTitle = &data.LeaderTitle
			}
			// The old code dropped this error; if the save failed we must not
			// hand out a leader ID that was never linked to the department.
			if err := h.repo.CreateStaff(c.Request.Context(), leader); err != nil {
				leaderID = nil
			}
		}
	}
	// Attach each staff-group member to the department and, where possible,
	// record the leader as their supervisor.
	updatedCount := 0
	for _, group := range data.StaffGroups {
		// Copy the role so member.TeamRole never aliases the loop variable
		// (pre-Go-1.22 the original &group.Role pointed at reused memory).
		role := group.Role
		for _, memberName := range group.Members {
			memberParams := database.StaffSearchParams{
				Query:        memberName,
				UniversityID: &data.UniversityID,
				Limit:        1,
			}
			result, err := h.repo.SearchStaff(c.Request.Context(), memberParams)
			if err != nil || len(result.Staff) == 0 {
				continue // unknown names are skipped, not treated as errors
			}
			member := &result.Staff[0]
			member.DepartmentID = &dept.ID
			member.TeamRole = &role
			if leaderID != nil && member.ID != *leaderID {
				member.SupervisorID = leaderID
			}
			// Count only members whose update actually persisted; the old
			// code ignored this error and over-reported members_updated.
			if err := h.repo.CreateStaff(c.Request.Context(), member); err != nil {
				continue
			}
			updatedCount++
		}
	}
	c.JSON(http.StatusOK, gin.H{
		"status":          "success",
		"department_id":   dept.ID,
		"leader_id":       leaderID,
		"members_updated": updatedCount,
	})
}

View File

@@ -2,7 +2,6 @@ package handlers
import (
"net/http"
"time"
"github.com/breakpilot/edu-search-service/internal/policy"
"github.com/gin-gonic/gin"
@@ -349,289 +348,6 @@ func (h *PolicyHandler) UpdateOperationPermission(c *gin.Context) {
c.JSON(http.StatusOK, op)
}
// =============================================================================
// PII RULES
// =============================================================================
// ListPIIRules returns every PII detection rule; pass active_only=true to
// restrict the result to rules that are currently enabled.
func (h *PolicyHandler) ListPIIRules(c *gin.Context) {
	onlyActive := c.Query("active_only") == "true"
	rules, err := h.store.ListPIIRules(c.Request.Context(), onlyActive)
	if err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to list PII rules", "details": err.Error()})
		return
	}
	c.JSON(http.StatusOK, gin.H{"rules": rules, "total": len(rules)})
}
// GetPIIRule returns a single PII rule identified by the :id path parameter.
func (h *PolicyHandler) GetPIIRule(c *gin.Context) {
	ruleID, err := uuid.Parse(c.Param("id"))
	if err != nil {
		c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid PII rule ID"})
		return
	}
	rule, err := h.store.GetPIIRule(c.Request.Context(), ruleID)
	switch {
	case err != nil:
		c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to get PII rule", "details": err.Error()})
	case rule == nil:
		// A nil rule with no error means the ID was well-formed but unknown.
		c.JSON(http.StatusNotFound, gin.H{"error": "PII rule not found"})
	default:
		c.JSON(http.StatusOK, rule)
	}
}
// CreatePIIRule creates a new PII detection rule and records an audit entry.
func (h *PolicyHandler) CreatePIIRule(c *gin.Context) {
	var payload policy.CreatePIIRuleRequest
	if err := c.ShouldBindJSON(&payload); err != nil {
		c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid request body", "details": err.Error()})
		return
	}
	created, err := h.store.CreatePIIRule(c.Request.Context(), &payload)
	if err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to create PII rule", "details": err.Error()})
		return
	}
	// Audit trail: who created the rule and its initial state (no old value).
	actor := getUserEmail(c)
	h.enforcer.LogChange(c.Request.Context(), policy.AuditActionCreate, policy.AuditEntityPIIRule, &created.ID, nil, created, actor)
	c.JSON(http.StatusCreated, created)
}
// UpdatePIIRule updates an existing PII rule.
// Flow: parse the ID, load the current rule (both for the 404 check and as
// the "old value" in the audit diff), bind the patch body, apply the update,
// then write an audit entry with old and new state.
func (h *PolicyHandler) UpdatePIIRule(c *gin.Context) {
	id, err := uuid.Parse(c.Param("id"))
	if err != nil {
		c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid PII rule ID"})
		return
	}
	// Get old value for audit (and to distinguish 404 from update failure).
	oldRule, err := h.store.GetPIIRule(c.Request.Context(), id)
	if err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to get PII rule", "details": err.Error()})
		return
	}
	if oldRule == nil {
		c.JSON(http.StatusNotFound, gin.H{"error": "PII rule not found"})
		return
	}
	var req policy.UpdatePIIRuleRequest
	if err := c.ShouldBindJSON(&req); err != nil {
		c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid request body", "details": err.Error()})
		return
	}
	rule, err := h.store.UpdatePIIRule(c.Request.Context(), id, &req)
	if err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to update PII rule", "details": err.Error()})
		return
	}
	// Log audit with the before/after pair captured above.
	userEmail := getUserEmail(c)
	h.enforcer.LogChange(c.Request.Context(), policy.AuditActionUpdate, policy.AuditEntityPIIRule, &rule.ID, oldRule, rule, userEmail)
	c.JSON(http.StatusOK, rule)
}
// DeletePIIRule deletes a PII rule.
// The rule is fetched first so its final state can be recorded in the audit
// log (and so a missing rule yields 404 rather than a blind delete).
func (h *PolicyHandler) DeletePIIRule(c *gin.Context) {
	id, err := uuid.Parse(c.Param("id"))
	if err != nil {
		c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid PII rule ID"})
		return
	}
	// Get rule for audit before deletion
	rule, err := h.store.GetPIIRule(c.Request.Context(), id)
	if err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to get PII rule", "details": err.Error()})
		return
	}
	if rule == nil {
		c.JSON(http.StatusNotFound, gin.H{"error": "PII rule not found"})
		return
	}
	if err := h.store.DeletePIIRule(c.Request.Context(), id); err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to delete PII rule", "details": err.Error()})
		return
	}
	// Log audit: old value is the deleted rule, new value is nil.
	userEmail := getUserEmail(c)
	h.enforcer.LogChange(c.Request.Context(), policy.AuditActionDelete, policy.AuditEntityPIIRule, &id, rule, nil, userEmail)
	c.JSON(http.StatusOK, gin.H{"deleted": true, "id": id})
}
// TestPIIRules runs PII detection over caller-supplied sample text so rules
// can be validated without crawling real content.
func (h *PolicyHandler) TestPIIRules(c *gin.Context) {
	var payload policy.PIITestRequest
	if err := c.ShouldBindJSON(&payload); err != nil {
		c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid request body", "details": err.Error()})
		return
	}
	detection, err := h.enforcer.DetectPII(c.Request.Context(), payload.Text)
	if err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to test PII detection", "details": err.Error()})
		return
	}
	c.JSON(http.StatusOK, detection)
}
// =============================================================================
// AUDIT & COMPLIANCE
// =============================================================================
// ListAuditLogs returns audit log entries matching the query-string filter.
func (h *PolicyHandler) ListAuditLogs(c *gin.Context) {
	var filter policy.AuditLogFilter
	if err := c.ShouldBindQuery(&filter); err != nil {
		c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid query parameters", "details": err.Error()})
		return
	}
	// Fall back to a sane page size when the limit is unset or out of range.
	if filter.Limit <= 0 || filter.Limit > 500 {
		filter.Limit = 100
	}
	entries, total, err := h.store.ListAuditLogs(c.Request.Context(), &filter)
	if err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to list audit logs", "details": err.Error()})
		return
	}
	c.JSON(http.StatusOK, gin.H{"logs": entries, "total": total, "limit": filter.Limit, "offset": filter.Offset})
}
// ListBlockedContent returns blocked-content log entries matching the
// query-string filter.
func (h *PolicyHandler) ListBlockedContent(c *gin.Context) {
	var filter policy.BlockedContentFilter
	if err := c.ShouldBindQuery(&filter); err != nil {
		c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid query parameters", "details": err.Error()})
		return
	}
	// Fall back to a sane page size when the limit is unset or out of range.
	if filter.Limit <= 0 || filter.Limit > 500 {
		filter.Limit = 100
	}
	entries, total, err := h.store.ListBlockedContent(c.Request.Context(), &filter)
	if err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to list blocked content", "details": err.Error()})
		return
	}
	c.JSON(http.StatusOK, gin.H{"blocked": entries, "total": total, "limit": filter.Limit, "offset": filter.Offset})
}
// CheckCompliance performs a compliance check for a URL described in the
// request body and returns the enforcer's verdict.
func (h *PolicyHandler) CheckCompliance(c *gin.Context) {
	var payload policy.CheckComplianceRequest
	if err := c.ShouldBindJSON(&payload); err != nil {
		c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid request body", "details": err.Error()})
		return
	}
	verdict, err := h.enforcer.CheckCompliance(c.Request.Context(), &payload)
	if err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to check compliance", "details": err.Error()})
		return
	}
	c.JSON(http.StatusOK, verdict)
}
// GetPolicyStats returns aggregated policy statistics from the store.
func (h *PolicyHandler) GetPolicyStats(c *gin.Context) {
	snapshot, err := h.store.GetStats(c.Request.Context())
	if err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to get stats", "details": err.Error()})
		return
	}
	c.JSON(http.StatusOK, snapshot)
}
// GenerateComplianceReport generates an audit report over audit-log and
// blocked-content entries, optionally bounded by from/to query parameters
// (YYYY-MM-DD; time.Parse with no zone yields UTC midnight).
// With format=download, headers are set so browsers save the JSON as a file.
func (h *PolicyHandler) GenerateComplianceReport(c *gin.Context) {
	var auditFilter policy.AuditLogFilter
	var blockedFilter policy.BlockedContentFilter
	// Parse date filters
	fromStr := c.Query("from")
	toStr := c.Query("to")
	if fromStr != "" {
		from, err := time.Parse("2006-01-02", fromStr)
		if err == nil { // NOTE(review): malformed dates are silently ignored — confirm intended
			auditFilter.FromDate = &from
			blockedFilter.FromDate = &from
		}
	}
	if toStr != "" {
		to, err := time.Parse("2006-01-02", toStr)
		if err == nil {
			// Add 1 day to include the end date (filters are exclusive upper bounds)
			to = to.Add(24 * time.Hour)
			auditFilter.ToDate = &to
			blockedFilter.ToDate = &to
		}
	}
	// No limit for report — pull everything in range (capped at 10000 rows).
	auditFilter.Limit = 10000
	blockedFilter.Limit = 10000
	auditor := policy.NewAuditor(h.store)
	report, err := auditor.GenerateAuditReport(c.Request.Context(), &auditFilter, &blockedFilter)
	if err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to generate report", "details": err.Error()})
		return
	}
	// Set filename for download; headers must be written before c.JSON renders.
	format := c.Query("format")
	if format == "download" {
		filename := "compliance-report-" + time.Now().Format("2006-01-02") + ".json"
		c.Header("Content-Disposition", "attachment; filename="+filename)
		c.Header("Content-Type", "application/json")
	}
	c.JSON(http.StatusOK, report)
}
// =============================================================================
// HELPERS
// =============================================================================

View File

@@ -0,0 +1,293 @@
package handlers
import (
"net/http"
"time"
"github.com/breakpilot/edu-search-service/internal/policy"
"github.com/gin-gonic/gin"
"github.com/google/uuid"
)
// =============================================================================
// PII RULES
// =============================================================================
// ListPIIRules returns every PII detection rule; pass active_only=true to
// restrict the result to rules that are currently enabled.
func (h *PolicyHandler) ListPIIRules(c *gin.Context) {
	onlyActive := c.Query("active_only") == "true"
	rules, err := h.store.ListPIIRules(c.Request.Context(), onlyActive)
	if err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to list PII rules", "details": err.Error()})
		return
	}
	c.JSON(http.StatusOK, gin.H{"rules": rules, "total": len(rules)})
}
// GetPIIRule returns a single PII rule identified by the :id path parameter.
func (h *PolicyHandler) GetPIIRule(c *gin.Context) {
	ruleID, err := uuid.Parse(c.Param("id"))
	if err != nil {
		c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid PII rule ID"})
		return
	}
	rule, err := h.store.GetPIIRule(c.Request.Context(), ruleID)
	switch {
	case err != nil:
		c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to get PII rule", "details": err.Error()})
	case rule == nil:
		// A nil rule with no error means the ID was well-formed but unknown.
		c.JSON(http.StatusNotFound, gin.H{"error": "PII rule not found"})
	default:
		c.JSON(http.StatusOK, rule)
	}
}
// CreatePIIRule creates a new PII detection rule and records an audit entry.
func (h *PolicyHandler) CreatePIIRule(c *gin.Context) {
	var payload policy.CreatePIIRuleRequest
	if err := c.ShouldBindJSON(&payload); err != nil {
		c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid request body", "details": err.Error()})
		return
	}
	created, err := h.store.CreatePIIRule(c.Request.Context(), &payload)
	if err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to create PII rule", "details": err.Error()})
		return
	}
	// Audit trail: who created the rule and its initial state (no old value).
	actor := getUserEmail(c)
	h.enforcer.LogChange(c.Request.Context(), policy.AuditActionCreate, policy.AuditEntityPIIRule, &created.ID, nil, created, actor)
	c.JSON(http.StatusCreated, created)
}
// UpdatePIIRule updates an existing PII rule.
// Flow: parse the ID, load the current rule (both for the 404 check and as
// the "old value" in the audit diff), bind the patch body, apply the update,
// then write an audit entry with old and new state.
func (h *PolicyHandler) UpdatePIIRule(c *gin.Context) {
	id, err := uuid.Parse(c.Param("id"))
	if err != nil {
		c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid PII rule ID"})
		return
	}
	// Get old value for audit (and to distinguish 404 from update failure).
	oldRule, err := h.store.GetPIIRule(c.Request.Context(), id)
	if err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to get PII rule", "details": err.Error()})
		return
	}
	if oldRule == nil {
		c.JSON(http.StatusNotFound, gin.H{"error": "PII rule not found"})
		return
	}
	var req policy.UpdatePIIRuleRequest
	if err := c.ShouldBindJSON(&req); err != nil {
		c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid request body", "details": err.Error()})
		return
	}
	rule, err := h.store.UpdatePIIRule(c.Request.Context(), id, &req)
	if err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to update PII rule", "details": err.Error()})
		return
	}
	// Log audit with the before/after pair captured above.
	userEmail := getUserEmail(c)
	h.enforcer.LogChange(c.Request.Context(), policy.AuditActionUpdate, policy.AuditEntityPIIRule, &rule.ID, oldRule, rule, userEmail)
	c.JSON(http.StatusOK, rule)
}
// DeletePIIRule deletes a PII rule.
// The rule is fetched first so its final state can be recorded in the audit
// log (and so a missing rule yields 404 rather than a blind delete).
func (h *PolicyHandler) DeletePIIRule(c *gin.Context) {
	id, err := uuid.Parse(c.Param("id"))
	if err != nil {
		c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid PII rule ID"})
		return
	}
	// Get rule for audit before deletion
	rule, err := h.store.GetPIIRule(c.Request.Context(), id)
	if err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to get PII rule", "details": err.Error()})
		return
	}
	if rule == nil {
		c.JSON(http.StatusNotFound, gin.H{"error": "PII rule not found"})
		return
	}
	if err := h.store.DeletePIIRule(c.Request.Context(), id); err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to delete PII rule", "details": err.Error()})
		return
	}
	// Log audit: old value is the deleted rule, new value is nil.
	userEmail := getUserEmail(c)
	h.enforcer.LogChange(c.Request.Context(), policy.AuditActionDelete, policy.AuditEntityPIIRule, &id, rule, nil, userEmail)
	c.JSON(http.StatusOK, gin.H{"deleted": true, "id": id})
}
// TestPIIRules runs PII detection over caller-supplied sample text so rules
// can be validated without crawling real content.
func (h *PolicyHandler) TestPIIRules(c *gin.Context) {
	var payload policy.PIITestRequest
	if err := c.ShouldBindJSON(&payload); err != nil {
		c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid request body", "details": err.Error()})
		return
	}
	detection, err := h.enforcer.DetectPII(c.Request.Context(), payload.Text)
	if err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to test PII detection", "details": err.Error()})
		return
	}
	c.JSON(http.StatusOK, detection)
}
// =============================================================================
// AUDIT & COMPLIANCE
// =============================================================================
// ListAuditLogs returns audit log entries matching the query-string filter.
func (h *PolicyHandler) ListAuditLogs(c *gin.Context) {
	var filter policy.AuditLogFilter
	if err := c.ShouldBindQuery(&filter); err != nil {
		c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid query parameters", "details": err.Error()})
		return
	}
	// Fall back to a sane page size when the limit is unset or out of range.
	if filter.Limit <= 0 || filter.Limit > 500 {
		filter.Limit = 100
	}
	entries, total, err := h.store.ListAuditLogs(c.Request.Context(), &filter)
	if err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to list audit logs", "details": err.Error()})
		return
	}
	c.JSON(http.StatusOK, gin.H{"logs": entries, "total": total, "limit": filter.Limit, "offset": filter.Offset})
}
// ListBlockedContent returns blocked-content log entries matching the
// query-string filter.
func (h *PolicyHandler) ListBlockedContent(c *gin.Context) {
	var filter policy.BlockedContentFilter
	if err := c.ShouldBindQuery(&filter); err != nil {
		c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid query parameters", "details": err.Error()})
		return
	}
	// Fall back to a sane page size when the limit is unset or out of range.
	if filter.Limit <= 0 || filter.Limit > 500 {
		filter.Limit = 100
	}
	entries, total, err := h.store.ListBlockedContent(c.Request.Context(), &filter)
	if err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to list blocked content", "details": err.Error()})
		return
	}
	c.JSON(http.StatusOK, gin.H{"blocked": entries, "total": total, "limit": filter.Limit, "offset": filter.Offset})
}
// CheckCompliance performs a compliance check for a URL described in the
// request body and returns the enforcer's verdict.
func (h *PolicyHandler) CheckCompliance(c *gin.Context) {
	var payload policy.CheckComplianceRequest
	if err := c.ShouldBindJSON(&payload); err != nil {
		c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid request body", "details": err.Error()})
		return
	}
	verdict, err := h.enforcer.CheckCompliance(c.Request.Context(), &payload)
	if err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to check compliance", "details": err.Error()})
		return
	}
	c.JSON(http.StatusOK, verdict)
}
// GetPolicyStats returns aggregated policy statistics from the store.
func (h *PolicyHandler) GetPolicyStats(c *gin.Context) {
	snapshot, err := h.store.GetStats(c.Request.Context())
	if err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to get stats", "details": err.Error()})
		return
	}
	c.JSON(http.StatusOK, snapshot)
}
// GenerateComplianceReport generates an audit report over audit-log and
// blocked-content entries, optionally bounded by from/to query parameters
// (YYYY-MM-DD; time.Parse with no zone yields UTC midnight).
// With format=download, headers are set so browsers save the JSON as a file.
func (h *PolicyHandler) GenerateComplianceReport(c *gin.Context) {
	var auditFilter policy.AuditLogFilter
	var blockedFilter policy.BlockedContentFilter
	// Parse date filters
	fromStr := c.Query("from")
	toStr := c.Query("to")
	if fromStr != "" {
		from, err := time.Parse("2006-01-02", fromStr)
		if err == nil { // NOTE(review): malformed dates are silently ignored — confirm intended
			auditFilter.FromDate = &from
			blockedFilter.FromDate = &from
		}
	}
	if toStr != "" {
		to, err := time.Parse("2006-01-02", toStr)
		if err == nil {
			// Add 1 day to include the end date (filters are exclusive upper bounds)
			to = to.Add(24 * time.Hour)
			auditFilter.ToDate = &to
			blockedFilter.ToDate = &to
		}
	}
	// No limit for report — pull everything in range (capped at 10000 rows).
	auditFilter.Limit = 10000
	blockedFilter.Limit = 10000
	auditor := policy.NewAuditor(h.store)
	report, err := auditor.GenerateAuditReport(c.Request.Context(), &auditFilter, &blockedFilter)
	if err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to generate report", "details": err.Error()})
		return
	}
	// Set filename for download; headers must be written before c.JSON renders.
	format := c.Query("format")
	if format == "download" {
		filename := "compliance-report-" + time.Now().Format("2006-01-02") + ".json"
		c.Header("Content-Disposition", "attachment; filename="+filename)
		c.Header("Content-Type", "application/json")
	}
	c.JSON(http.StatusOK, report)
}

View File

@@ -2,8 +2,6 @@ package database
import (
"context"
"fmt"
"strings"
"github.com/google/uuid"
"github.com/jackc/pgx/v5"
@@ -145,395 +143,6 @@ func (r *Repository) GetDepartmentByName(ctx context.Context, uniID uuid.UUID, n
return d, nil
}
// ============================================================================
// STAFF
// ============================================================================
// CreateStaff creates or updates a staff member.
//
// The statement is an upsert keyed on (university_id, first_name,
// last_name, department_id), where a zero UUID stands in for a NULL
// department so department-less rows still conflict. On conflict,
// identity/position fields are overwritten, while contact and profile
// fields use COALESCE(EXCLUDED.x, existing) so a NULL in a fresh crawl
// never erases previously stored data; crawled_at/updated_at are
// refreshed. The generated id and timestamps are scanned back into s.
func (r *Repository) CreateStaff(ctx context.Context, s *UniversityStaff) error {
	query := `
	INSERT INTO university_staff (
		university_id, department_id, first_name, last_name, full_name,
		title, academic_title, position, position_type, is_professor,
		email, phone, office, profile_url, photo_url,
		orcid, google_scholar_id, researchgate_url, linkedin_url, personal_website,
		research_interests, research_summary, supervisor_id, team_role, source_url
	) VALUES (
		$1, $2, $3, $4, $5, $6, $7, $8, $9, $10,
		$11, $12, $13, $14, $15, $16, $17, $18, $19, $20,
		$21, $22, $23, $24, $25
	)
	ON CONFLICT (university_id, first_name, last_name, COALESCE(department_id, '00000000-0000-0000-0000-000000000000'::uuid))
	DO UPDATE SET
		full_name = EXCLUDED.full_name,
		title = EXCLUDED.title,
		academic_title = EXCLUDED.academic_title,
		position = EXCLUDED.position,
		position_type = EXCLUDED.position_type,
		is_professor = EXCLUDED.is_professor,
		email = COALESCE(EXCLUDED.email, university_staff.email),
		phone = COALESCE(EXCLUDED.phone, university_staff.phone),
		office = COALESCE(EXCLUDED.office, university_staff.office),
		profile_url = COALESCE(EXCLUDED.profile_url, university_staff.profile_url),
		photo_url = COALESCE(EXCLUDED.photo_url, university_staff.photo_url),
		orcid = COALESCE(EXCLUDED.orcid, university_staff.orcid),
		google_scholar_id = COALESCE(EXCLUDED.google_scholar_id, university_staff.google_scholar_id),
		researchgate_url = COALESCE(EXCLUDED.researchgate_url, university_staff.researchgate_url),
		linkedin_url = COALESCE(EXCLUDED.linkedin_url, university_staff.linkedin_url),
		personal_website = COALESCE(EXCLUDED.personal_website, university_staff.personal_website),
		research_interests = COALESCE(EXCLUDED.research_interests, university_staff.research_interests),
		research_summary = COALESCE(EXCLUDED.research_summary, university_staff.research_summary),
		supervisor_id = COALESCE(EXCLUDED.supervisor_id, university_staff.supervisor_id),
		team_role = COALESCE(EXCLUDED.team_role, university_staff.team_role),
		source_url = COALESCE(EXCLUDED.source_url, university_staff.source_url),
		crawled_at = NOW(),
		updated_at = NOW()
	RETURNING id, crawled_at, created_at, updated_at
	`
	// Argument order must match the $1..$25 placeholders above.
	return r.db.Pool.QueryRow(ctx, query,
		s.UniversityID, s.DepartmentID, s.FirstName, s.LastName, s.FullName,
		s.Title, s.AcademicTitle, s.Position, s.PositionType, s.IsProfessor,
		s.Email, s.Phone, s.Office, s.ProfileURL, s.PhotoURL,
		s.ORCID, s.GoogleScholarID, s.ResearchgateURL, s.LinkedInURL, s.PersonalWebsite,
		s.ResearchInterests, s.ResearchSummary, s.SupervisorID, s.TeamRole, s.SourceURL,
	).Scan(&s.ID, &s.CrawledAt, &s.CreatedAt, &s.UpdatedAt)
}
// GetStaff retrieves a staff member by ID.
//
// Reads from the v_staff_full view with SELECT *, so the Scan destination
// list must match the view's column order exactly. The nil destinations
// discard view columns the struct does not carry (pgx accepts nil scan
// targets). NOTE(review): keep this Scan in sync whenever v_staff_full
// changes — a column added or reordered in the view silently shifts every
// destination after it.
func (r *Repository) GetStaff(ctx context.Context, id uuid.UUID) (*UniversityStaff, error) {
	query := `SELECT * FROM v_staff_full WHERE id = $1`
	s := &UniversityStaff{}
	err := r.db.Pool.QueryRow(ctx, query, id).Scan(
		&s.ID, &s.UniversityID, &s.DepartmentID, &s.FirstName, &s.LastName, &s.FullName,
		&s.Title, &s.AcademicTitle, &s.Position, &s.PositionType, &s.IsProfessor,
		&s.Email, &s.Phone, &s.Office, &s.ProfileURL, &s.PhotoURL,
		&s.ORCID, &s.GoogleScholarID, &s.ResearchgateURL, &s.LinkedInURL, &s.PersonalWebsite,
		&s.ResearchInterests, &s.ResearchSummary, &s.CrawledAt, &s.LastVerified, &s.IsActive, &s.SourceURL,
		&s.CreatedAt, &s.UpdatedAt, &s.UniversityName, &s.UniversityShort, nil, nil,
		&s.DepartmentName, nil, &s.PublicationCount,
	)
	if err != nil {
		return nil, err
	}
	return s, nil
}
// SearchStaff searches for staff members.
//
// All filters in params are optional and AND-combined: a free-text query
// (German full-text search over full_name/research_summary plus ILIKE
// fallbacks on the name columns), university, department, state,
// university type, position type and professor flag. The total match
// count is computed with a separate COUNT(*) query, then one page is
// returned (default 20 rows, capped at 100), ordered professors-first and
// alphabetically by last name.
func (r *Repository) SearchStaff(ctx context.Context, params StaffSearchParams) (*StaffSearchResult, error) {
	// Build query dynamically; all user-supplied values go through bind
	// parameters, only the numbered placeholders are formatted in.
	var conditions []string
	var args []interface{}
	argNum := 1
	baseQuery := `
	SELECT s.id, s.university_id, s.department_id, s.first_name, s.last_name, s.full_name,
		s.title, s.academic_title, s.position, s.position_type, s.is_professor,
		s.email, s.profile_url, s.photo_url, s.orcid,
		s.research_interests, s.crawled_at, s.is_active,
		u.name as university_name, u.short_name as university_short, u.state as university_state,
		d.name as department_name,
		(SELECT COUNT(*) FROM staff_publications sp WHERE sp.staff_id = s.id) as publication_count
	FROM university_staff s
	JOIN universities u ON s.university_id = u.id
	LEFT JOIN departments d ON s.department_id = d.id
	`
	if params.Query != "" {
		// The same placeholder number appears three times on purpose: one
		// bound value feeds the tsquery and both ILIKE fallbacks.
		conditions = append(conditions, fmt.Sprintf(
			`(to_tsvector('german', COALESCE(s.full_name, '') || ' ' || COALESCE(s.research_summary, '')) @@ plainto_tsquery('german', $%d)
			OR s.full_name ILIKE '%%' || $%d || '%%'
			OR s.last_name ILIKE '%%' || $%d || '%%')`,
			argNum, argNum, argNum))
		args = append(args, params.Query)
		argNum++
	}
	if params.UniversityID != nil {
		conditions = append(conditions, fmt.Sprintf("s.university_id = $%d", argNum))
		args = append(args, *params.UniversityID)
		argNum++
	}
	if params.DepartmentID != nil {
		conditions = append(conditions, fmt.Sprintf("s.department_id = $%d", argNum))
		args = append(args, *params.DepartmentID)
		argNum++
	}
	if params.State != nil {
		conditions = append(conditions, fmt.Sprintf("u.state = $%d", argNum))
		args = append(args, *params.State)
		argNum++
	}
	if params.UniType != nil {
		conditions = append(conditions, fmt.Sprintf("u.uni_type = $%d", argNum))
		args = append(args, *params.UniType)
		argNum++
	}
	if params.PositionType != nil {
		conditions = append(conditions, fmt.Sprintf("s.position_type = $%d", argNum))
		args = append(args, *params.PositionType)
		argNum++
	}
	if params.IsProfessor != nil {
		conditions = append(conditions, fmt.Sprintf("s.is_professor = $%d", argNum))
		args = append(args, *params.IsProfessor)
		argNum++
	}
	// Build WHERE clause
	whereClause := ""
	if len(conditions) > 0 {
		whereClause = "WHERE " + strings.Join(conditions, " AND ")
	}
	// Count total matches before applying LIMIT/OFFSET.
	countQuery := fmt.Sprintf("SELECT COUNT(*) FROM university_staff s JOIN universities u ON s.university_id = u.id LEFT JOIN departments d ON s.department_id = d.id %s", whereClause)
	var total int
	if err := r.db.Pool.QueryRow(ctx, countQuery, args...).Scan(&total); err != nil {
		return nil, err
	}
	// Clamp pagination: default page size 20, hard cap 100, offset >= 0.
	limit := params.Limit
	if limit <= 0 {
		limit = 20
	}
	if limit > 100 {
		limit = 100
	}
	offset := params.Offset
	if offset < 0 {
		offset = 0
	}
	// Full query with pagination; limit/offset are validated ints, safe to
	// interpolate directly.
	fullQuery := fmt.Sprintf("%s %s ORDER BY s.is_professor DESC, s.last_name ASC LIMIT %d OFFSET %d",
		baseQuery, whereClause, limit, offset)
	rows, err := r.db.Pool.Query(ctx, fullQuery, args...)
	if err != nil {
		return nil, err
	}
	defer rows.Close()
	var staff []UniversityStaff
	for rows.Next() {
		var s UniversityStaff
		// university_state is selected but not retained; the struct has no
		// destination for it here, so it is scanned into a throwaway.
		var uniState *string
		if err := rows.Scan(
			&s.ID, &s.UniversityID, &s.DepartmentID, &s.FirstName, &s.LastName, &s.FullName,
			&s.Title, &s.AcademicTitle, &s.Position, &s.PositionType, &s.IsProfessor,
			&s.Email, &s.ProfileURL, &s.PhotoURL, &s.ORCID,
			&s.ResearchInterests, &s.CrawledAt, &s.IsActive,
			&s.UniversityName, &s.UniversityShort, &uniState,
			&s.DepartmentName, &s.PublicationCount,
		); err != nil {
			return nil, err
		}
		staff = append(staff, s)
	}
	return &StaffSearchResult{
		Staff:  staff,
		Total:  total,
		Limit:  limit,
		Offset: offset,
		Query:  params.Query,
	}, rows.Err()
}
// ============================================================================
// PUBLICATIONS
// ============================================================================
// CreatePublication creates or updates a publication.
//
// Upserts on DOI via a partial unique index (ON CONFLICT (doi) WHERE doi
// IS NOT NULL): when the DOI already exists, the mutable fields (title,
// abstract, year, venue, citation_count) are refreshed and updated_at is
// bumped. The generated id and timestamps are scanned back into p.
//
// NOTE(review): the DOI-less fallback detects a collision by
// substring-matching the error text for "duplicate" and then resolves the
// existing row by (title, year) so p.ID is still populated; matching the
// PostgreSQL unique-violation code (23505) via pgconn.PgError would be
// more robust — confirm before relying on this path.
func (r *Repository) CreatePublication(ctx context.Context, p *Publication) error {
	query := `
	INSERT INTO publications (
		title, title_en, abstract, abstract_en, year, month,
		pub_type, venue, venue_short, publisher,
		doi, isbn, issn, arxiv_id, pubmed_id,
		url, pdf_url, citation_count, keywords, topics, source, raw_data
	) VALUES (
		$1, $2, $3, $4, $5, $6, $7, $8, $9, $10,
		$11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22
	)
	ON CONFLICT (doi) WHERE doi IS NOT NULL DO UPDATE SET
		title = EXCLUDED.title,
		abstract = EXCLUDED.abstract,
		year = EXCLUDED.year,
		venue = EXCLUDED.venue,
		citation_count = EXCLUDED.citation_count,
		updated_at = NOW()
	RETURNING id, crawled_at, created_at, updated_at
	`
	// Handle potential duplicate without DOI
	err := r.db.Pool.QueryRow(ctx, query,
		p.Title, p.TitleEN, p.Abstract, p.AbstractEN, p.Year, p.Month,
		p.PubType, p.Venue, p.VenueShort, p.Publisher,
		p.DOI, p.ISBN, p.ISSN, p.ArxivID, p.PubmedID,
		p.URL, p.PDFURL, p.CitationCount, p.Keywords, p.Topics, p.Source, p.RawData,
	).Scan(&p.ID, &p.CrawledAt, &p.CreatedAt, &p.UpdatedAt)
	if err != nil && strings.Contains(err.Error(), "duplicate") {
		// Try to find existing publication by title and year
		findQuery := `SELECT id FROM publications WHERE title = $1 AND year = $2`
		err = r.db.Pool.QueryRow(ctx, findQuery, p.Title, p.Year).Scan(&p.ID)
	}
	return err
}
// LinkStaffPublication creates a link between staff and publication.
//
// Idempotent: re-linking the same (staff_id, publication_id) pair updates
// the author position and corresponding-author flag instead of failing.
func (r *Repository) LinkStaffPublication(ctx context.Context, sp *StaffPublication) error {
	query := `
	INSERT INTO staff_publications (staff_id, publication_id, author_position, is_corresponding)
	VALUES ($1, $2, $3, $4)
	ON CONFLICT (staff_id, publication_id) DO UPDATE SET
		author_position = EXCLUDED.author_position,
		is_corresponding = EXCLUDED.is_corresponding
	`
	_, err := r.db.Pool.Exec(ctx, query,
		sp.StaffID, sp.PublicationID, sp.AuthorPosition, sp.IsCorresponding,
	)
	return err
}
// GetStaffPublications retrieves all publications for a staff member,
// newest first (publications without a year sort last, then by title).
func (r *Repository) GetStaffPublications(ctx context.Context, staffID uuid.UUID) ([]Publication, error) {
	const query = `
	SELECT p.id, p.title, p.abstract, p.year, p.pub_type, p.venue, p.doi, p.url, p.citation_count
	FROM publications p
	JOIN staff_publications sp ON p.id = sp.publication_id
	WHERE sp.staff_id = $1
	ORDER BY p.year DESC NULLS LAST, p.title
	`
	rows, err := r.db.Pool.Query(ctx, query, staffID)
	if err != nil {
		return nil, err
	}
	defer rows.Close()
	var result []Publication
	for rows.Next() {
		var pub Publication
		scanErr := rows.Scan(
			&pub.ID, &pub.Title, &pub.Abstract, &pub.Year, &pub.PubType,
			&pub.Venue, &pub.DOI, &pub.URL, &pub.CitationCount,
		)
		if scanErr != nil {
			return nil, scanErr
		}
		result = append(result, pub)
	}
	return result, rows.Err()
}
// SearchPublications searches for publications.
//
// Optional, AND-combined filters: a German full-text query over
// title/abstract, membership in a staff member's publication list, an
// exact year, a year range, and a publication type. The total match count
// is computed with a separate COUNT(*) query, then one page is returned
// ordered by year (newest first, NULL years last) and citation count.
//
// Pagination is clamped the same way as SearchStaff: default page size
// 20, hard cap 100, offset >= 0. The clamp also fixes a real defect —
// a negative params.Offset used to be interpolated as "OFFSET -n",
// which PostgreSQL rejects with a syntax error.
func (r *Repository) SearchPublications(ctx context.Context, params PublicationSearchParams) (*PublicationSearchResult, error) {
	// User-supplied values go through bind parameters; only the numbered
	// placeholders themselves are formatted into the SQL.
	var conditions []string
	var args []interface{}
	argNum := 1
	if params.Query != "" {
		conditions = append(conditions, fmt.Sprintf(
			`to_tsvector('german', COALESCE(title, '') || ' ' || COALESCE(abstract, '')) @@ plainto_tsquery('german', $%d)`,
			argNum))
		args = append(args, params.Query)
		argNum++
	}
	if params.StaffID != nil {
		conditions = append(conditions, fmt.Sprintf(
			`id IN (SELECT publication_id FROM staff_publications WHERE staff_id = $%d)`,
			argNum))
		args = append(args, *params.StaffID)
		argNum++
	}
	if params.Year != nil {
		conditions = append(conditions, fmt.Sprintf("year = $%d", argNum))
		args = append(args, *params.Year)
		argNum++
	}
	if params.YearFrom != nil {
		conditions = append(conditions, fmt.Sprintf("year >= $%d", argNum))
		args = append(args, *params.YearFrom)
		argNum++
	}
	if params.YearTo != nil {
		conditions = append(conditions, fmt.Sprintf("year <= $%d", argNum))
		args = append(args, *params.YearTo)
		argNum++
	}
	if params.PubType != nil {
		conditions = append(conditions, fmt.Sprintf("pub_type = $%d", argNum))
		args = append(args, *params.PubType)
		argNum++
	}
	whereClause := ""
	if len(conditions) > 0 {
		whereClause = "WHERE " + strings.Join(conditions, " AND ")
	}
	// Count total matches before applying LIMIT/OFFSET.
	countQuery := fmt.Sprintf("SELECT COUNT(*) FROM publications %s", whereClause)
	var total int
	if err := r.db.Pool.QueryRow(ctx, countQuery, args...).Scan(&total); err != nil {
		return nil, err
	}
	// Clamp pagination (consistent with SearchStaff).
	limit := params.Limit
	if limit <= 0 {
		limit = 20
	}
	if limit > 100 {
		limit = 100
	}
	offset := params.Offset
	if offset < 0 {
		offset = 0
	}
	// Page query; limit/offset are validated ints, safe to interpolate.
	query := fmt.Sprintf(`
	SELECT id, title, abstract, year, pub_type, venue, doi, url, citation_count, keywords
	FROM publications %s
	ORDER BY year DESC NULLS LAST, citation_count DESC
	LIMIT %d OFFSET %d
	`, whereClause, limit, offset)
	rows, err := r.db.Pool.Query(ctx, query, args...)
	if err != nil {
		return nil, err
	}
	defer rows.Close()
	var pubs []Publication
	for rows.Next() {
		var p Publication
		if err := rows.Scan(
			&p.ID, &p.Title, &p.Abstract, &p.Year, &p.PubType, &p.Venue, &p.DOI, &p.URL, &p.CitationCount, &p.Keywords,
		); err != nil {
			return nil, err
		}
		pubs = append(pubs, p)
	}
	return &PublicationSearchResult{
		Publications: pubs,
		Total:        total,
		Limit:        limit,
		Offset:       offset,
		Query:        params.Query,
	}, rows.Err()
}
// ============================================================================
// CRAWL STATUS
// ============================================================================

View File

@@ -0,0 +1,398 @@
package database
import (
"context"
"fmt"
"strings"
"github.com/google/uuid"
)
// ============================================================================
// STAFF
// ============================================================================
// CreateStaff creates or updates a staff member.
//
// The statement is an upsert keyed on (university_id, first_name,
// last_name, department_id), where a zero UUID stands in for a NULL
// department so department-less rows still conflict. On conflict,
// identity/position fields are overwritten, while contact and profile
// fields use COALESCE(EXCLUDED.x, existing) so a NULL in a fresh crawl
// never erases previously stored data; crawled_at/updated_at are
// refreshed. The generated id and timestamps are scanned back into s.
func (r *Repository) CreateStaff(ctx context.Context, s *UniversityStaff) error {
	query := `
	INSERT INTO university_staff (
		university_id, department_id, first_name, last_name, full_name,
		title, academic_title, position, position_type, is_professor,
		email, phone, office, profile_url, photo_url,
		orcid, google_scholar_id, researchgate_url, linkedin_url, personal_website,
		research_interests, research_summary, supervisor_id, team_role, source_url
	) VALUES (
		$1, $2, $3, $4, $5, $6, $7, $8, $9, $10,
		$11, $12, $13, $14, $15, $16, $17, $18, $19, $20,
		$21, $22, $23, $24, $25
	)
	ON CONFLICT (university_id, first_name, last_name, COALESCE(department_id, '00000000-0000-0000-0000-000000000000'::uuid))
	DO UPDATE SET
		full_name = EXCLUDED.full_name,
		title = EXCLUDED.title,
		academic_title = EXCLUDED.academic_title,
		position = EXCLUDED.position,
		position_type = EXCLUDED.position_type,
		is_professor = EXCLUDED.is_professor,
		email = COALESCE(EXCLUDED.email, university_staff.email),
		phone = COALESCE(EXCLUDED.phone, university_staff.phone),
		office = COALESCE(EXCLUDED.office, university_staff.office),
		profile_url = COALESCE(EXCLUDED.profile_url, university_staff.profile_url),
		photo_url = COALESCE(EXCLUDED.photo_url, university_staff.photo_url),
		orcid = COALESCE(EXCLUDED.orcid, university_staff.orcid),
		google_scholar_id = COALESCE(EXCLUDED.google_scholar_id, university_staff.google_scholar_id),
		researchgate_url = COALESCE(EXCLUDED.researchgate_url, university_staff.researchgate_url),
		linkedin_url = COALESCE(EXCLUDED.linkedin_url, university_staff.linkedin_url),
		personal_website = COALESCE(EXCLUDED.personal_website, university_staff.personal_website),
		research_interests = COALESCE(EXCLUDED.research_interests, university_staff.research_interests),
		research_summary = COALESCE(EXCLUDED.research_summary, university_staff.research_summary),
		supervisor_id = COALESCE(EXCLUDED.supervisor_id, university_staff.supervisor_id),
		team_role = COALESCE(EXCLUDED.team_role, university_staff.team_role),
		source_url = COALESCE(EXCLUDED.source_url, university_staff.source_url),
		crawled_at = NOW(),
		updated_at = NOW()
	RETURNING id, crawled_at, created_at, updated_at
	`
	// Argument order must match the $1..$25 placeholders above.
	return r.db.Pool.QueryRow(ctx, query,
		s.UniversityID, s.DepartmentID, s.FirstName, s.LastName, s.FullName,
		s.Title, s.AcademicTitle, s.Position, s.PositionType, s.IsProfessor,
		s.Email, s.Phone, s.Office, s.ProfileURL, s.PhotoURL,
		s.ORCID, s.GoogleScholarID, s.ResearchgateURL, s.LinkedInURL, s.PersonalWebsite,
		s.ResearchInterests, s.ResearchSummary, s.SupervisorID, s.TeamRole, s.SourceURL,
	).Scan(&s.ID, &s.CrawledAt, &s.CreatedAt, &s.UpdatedAt)
}
// GetStaff retrieves a staff member by ID.
//
// Reads from the v_staff_full view with SELECT *, so the Scan destination
// list must match the view's column order exactly. The nil destinations
// discard view columns the struct does not carry (pgx accepts nil scan
// targets). NOTE(review): keep this Scan in sync whenever v_staff_full
// changes — a column added or reordered in the view silently shifts every
// destination after it.
func (r *Repository) GetStaff(ctx context.Context, id uuid.UUID) (*UniversityStaff, error) {
	query := `SELECT * FROM v_staff_full WHERE id = $1`
	s := &UniversityStaff{}
	err := r.db.Pool.QueryRow(ctx, query, id).Scan(
		&s.ID, &s.UniversityID, &s.DepartmentID, &s.FirstName, &s.LastName, &s.FullName,
		&s.Title, &s.AcademicTitle, &s.Position, &s.PositionType, &s.IsProfessor,
		&s.Email, &s.Phone, &s.Office, &s.ProfileURL, &s.PhotoURL,
		&s.ORCID, &s.GoogleScholarID, &s.ResearchgateURL, &s.LinkedInURL, &s.PersonalWebsite,
		&s.ResearchInterests, &s.ResearchSummary, &s.CrawledAt, &s.LastVerified, &s.IsActive, &s.SourceURL,
		&s.CreatedAt, &s.UpdatedAt, &s.UniversityName, &s.UniversityShort, nil, nil,
		&s.DepartmentName, nil, &s.PublicationCount,
	)
	if err != nil {
		return nil, err
	}
	return s, nil
}
// SearchStaff searches for staff members.
//
// All filters in params are optional and AND-combined: a free-text query
// (German full-text search over full_name/research_summary plus ILIKE
// fallbacks on the name columns), university, department, state,
// university type, position type and professor flag. The total match
// count is computed with a separate COUNT(*) query, then one page is
// returned (default 20 rows, capped at 100), ordered professors-first and
// alphabetically by last name.
func (r *Repository) SearchStaff(ctx context.Context, params StaffSearchParams) (*StaffSearchResult, error) {
	// Build query dynamically; all user-supplied values go through bind
	// parameters, only the numbered placeholders are formatted in.
	var conditions []string
	var args []interface{}
	argNum := 1
	baseQuery := `
	SELECT s.id, s.university_id, s.department_id, s.first_name, s.last_name, s.full_name,
		s.title, s.academic_title, s.position, s.position_type, s.is_professor,
		s.email, s.profile_url, s.photo_url, s.orcid,
		s.research_interests, s.crawled_at, s.is_active,
		u.name as university_name, u.short_name as university_short, u.state as university_state,
		d.name as department_name,
		(SELECT COUNT(*) FROM staff_publications sp WHERE sp.staff_id = s.id) as publication_count
	FROM university_staff s
	JOIN universities u ON s.university_id = u.id
	LEFT JOIN departments d ON s.department_id = d.id
	`
	if params.Query != "" {
		// The same placeholder number appears three times on purpose: one
		// bound value feeds the tsquery and both ILIKE fallbacks.
		conditions = append(conditions, fmt.Sprintf(
			`(to_tsvector('german', COALESCE(s.full_name, '') || ' ' || COALESCE(s.research_summary, '')) @@ plainto_tsquery('german', $%d)
			OR s.full_name ILIKE '%%' || $%d || '%%'
			OR s.last_name ILIKE '%%' || $%d || '%%')`,
			argNum, argNum, argNum))
		args = append(args, params.Query)
		argNum++
	}
	if params.UniversityID != nil {
		conditions = append(conditions, fmt.Sprintf("s.university_id = $%d", argNum))
		args = append(args, *params.UniversityID)
		argNum++
	}
	if params.DepartmentID != nil {
		conditions = append(conditions, fmt.Sprintf("s.department_id = $%d", argNum))
		args = append(args, *params.DepartmentID)
		argNum++
	}
	if params.State != nil {
		conditions = append(conditions, fmt.Sprintf("u.state = $%d", argNum))
		args = append(args, *params.State)
		argNum++
	}
	if params.UniType != nil {
		conditions = append(conditions, fmt.Sprintf("u.uni_type = $%d", argNum))
		args = append(args, *params.UniType)
		argNum++
	}
	if params.PositionType != nil {
		conditions = append(conditions, fmt.Sprintf("s.position_type = $%d", argNum))
		args = append(args, *params.PositionType)
		argNum++
	}
	if params.IsProfessor != nil {
		conditions = append(conditions, fmt.Sprintf("s.is_professor = $%d", argNum))
		args = append(args, *params.IsProfessor)
		argNum++
	}
	// Build WHERE clause
	whereClause := ""
	if len(conditions) > 0 {
		whereClause = "WHERE " + strings.Join(conditions, " AND ")
	}
	// Count total matches before applying LIMIT/OFFSET.
	countQuery := fmt.Sprintf("SELECT COUNT(*) FROM university_staff s JOIN universities u ON s.university_id = u.id LEFT JOIN departments d ON s.department_id = d.id %s", whereClause)
	var total int
	if err := r.db.Pool.QueryRow(ctx, countQuery, args...).Scan(&total); err != nil {
		return nil, err
	}
	// Clamp pagination: default page size 20, hard cap 100, offset >= 0.
	limit := params.Limit
	if limit <= 0 {
		limit = 20
	}
	if limit > 100 {
		limit = 100
	}
	offset := params.Offset
	if offset < 0 {
		offset = 0
	}
	// Full query with pagination; limit/offset are validated ints, safe to
	// interpolate directly.
	fullQuery := fmt.Sprintf("%s %s ORDER BY s.is_professor DESC, s.last_name ASC LIMIT %d OFFSET %d",
		baseQuery, whereClause, limit, offset)
	rows, err := r.db.Pool.Query(ctx, fullQuery, args...)
	if err != nil {
		return nil, err
	}
	defer rows.Close()
	var staff []UniversityStaff
	for rows.Next() {
		var s UniversityStaff
		// university_state is selected but not retained; the struct has no
		// destination for it here, so it is scanned into a throwaway.
		var uniState *string
		if err := rows.Scan(
			&s.ID, &s.UniversityID, &s.DepartmentID, &s.FirstName, &s.LastName, &s.FullName,
			&s.Title, &s.AcademicTitle, &s.Position, &s.PositionType, &s.IsProfessor,
			&s.Email, &s.ProfileURL, &s.PhotoURL, &s.ORCID,
			&s.ResearchInterests, &s.CrawledAt, &s.IsActive,
			&s.UniversityName, &s.UniversityShort, &uniState,
			&s.DepartmentName, &s.PublicationCount,
		); err != nil {
			return nil, err
		}
		staff = append(staff, s)
	}
	return &StaffSearchResult{
		Staff:  staff,
		Total:  total,
		Limit:  limit,
		Offset: offset,
		Query:  params.Query,
	}, rows.Err()
}
// ============================================================================
// PUBLICATIONS
// ============================================================================
// CreatePublication creates or updates a publication.
//
// Upserts on DOI via a partial unique index (ON CONFLICT (doi) WHERE doi
// IS NOT NULL): when the DOI already exists, the mutable fields (title,
// abstract, year, venue, citation_count) are refreshed and updated_at is
// bumped. The generated id and timestamps are scanned back into p.
//
// NOTE(review): the DOI-less fallback detects a collision by
// substring-matching the error text for "duplicate" and then resolves the
// existing row by (title, year) so p.ID is still populated; matching the
// PostgreSQL unique-violation code (23505) via pgconn.PgError would be
// more robust — confirm before relying on this path.
func (r *Repository) CreatePublication(ctx context.Context, p *Publication) error {
	query := `
	INSERT INTO publications (
		title, title_en, abstract, abstract_en, year, month,
		pub_type, venue, venue_short, publisher,
		doi, isbn, issn, arxiv_id, pubmed_id,
		url, pdf_url, citation_count, keywords, topics, source, raw_data
	) VALUES (
		$1, $2, $3, $4, $5, $6, $7, $8, $9, $10,
		$11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22
	)
	ON CONFLICT (doi) WHERE doi IS NOT NULL DO UPDATE SET
		title = EXCLUDED.title,
		abstract = EXCLUDED.abstract,
		year = EXCLUDED.year,
		venue = EXCLUDED.venue,
		citation_count = EXCLUDED.citation_count,
		updated_at = NOW()
	RETURNING id, crawled_at, created_at, updated_at
	`
	// Handle potential duplicate without DOI
	err := r.db.Pool.QueryRow(ctx, query,
		p.Title, p.TitleEN, p.Abstract, p.AbstractEN, p.Year, p.Month,
		p.PubType, p.Venue, p.VenueShort, p.Publisher,
		p.DOI, p.ISBN, p.ISSN, p.ArxivID, p.PubmedID,
		p.URL, p.PDFURL, p.CitationCount, p.Keywords, p.Topics, p.Source, p.RawData,
	).Scan(&p.ID, &p.CrawledAt, &p.CreatedAt, &p.UpdatedAt)
	if err != nil && strings.Contains(err.Error(), "duplicate") {
		// Try to find existing publication by title and year
		findQuery := `SELECT id FROM publications WHERE title = $1 AND year = $2`
		err = r.db.Pool.QueryRow(ctx, findQuery, p.Title, p.Year).Scan(&p.ID)
	}
	return err
}
// LinkStaffPublication creates a link between staff and publication.
//
// Idempotent: re-linking the same (staff_id, publication_id) pair updates
// the author position and corresponding-author flag instead of failing.
func (r *Repository) LinkStaffPublication(ctx context.Context, sp *StaffPublication) error {
	query := `
	INSERT INTO staff_publications (staff_id, publication_id, author_position, is_corresponding)
	VALUES ($1, $2, $3, $4)
	ON CONFLICT (staff_id, publication_id) DO UPDATE SET
		author_position = EXCLUDED.author_position,
		is_corresponding = EXCLUDED.is_corresponding
	`
	_, err := r.db.Pool.Exec(ctx, query,
		sp.StaffID, sp.PublicationID, sp.AuthorPosition, sp.IsCorresponding,
	)
	return err
}
// GetStaffPublications retrieves all publications for a staff member,
// newest first (publications without a year sort last, then by title).
func (r *Repository) GetStaffPublications(ctx context.Context, staffID uuid.UUID) ([]Publication, error) {
	const query = `
	SELECT p.id, p.title, p.abstract, p.year, p.pub_type, p.venue, p.doi, p.url, p.citation_count
	FROM publications p
	JOIN staff_publications sp ON p.id = sp.publication_id
	WHERE sp.staff_id = $1
	ORDER BY p.year DESC NULLS LAST, p.title
	`
	rows, err := r.db.Pool.Query(ctx, query, staffID)
	if err != nil {
		return nil, err
	}
	defer rows.Close()
	var result []Publication
	for rows.Next() {
		var pub Publication
		scanErr := rows.Scan(
			&pub.ID, &pub.Title, &pub.Abstract, &pub.Year, &pub.PubType,
			&pub.Venue, &pub.DOI, &pub.URL, &pub.CitationCount,
		)
		if scanErr != nil {
			return nil, scanErr
		}
		result = append(result, pub)
	}
	return result, rows.Err()
}
// SearchPublications searches for publications.
//
// Optional, AND-combined filters: a German full-text query over
// title/abstract, membership in a staff member's publication list, an
// exact year, a year range, and a publication type. The total match count
// is computed with a separate COUNT(*) query, then one page is returned
// ordered by year (newest first, NULL years last) and citation count.
//
// Pagination is clamped the same way as SearchStaff: default page size
// 20, hard cap 100, offset >= 0. The clamp also fixes a real defect —
// a negative params.Offset used to be interpolated as "OFFSET -n",
// which PostgreSQL rejects with a syntax error.
func (r *Repository) SearchPublications(ctx context.Context, params PublicationSearchParams) (*PublicationSearchResult, error) {
	// User-supplied values go through bind parameters; only the numbered
	// placeholders themselves are formatted into the SQL.
	var conditions []string
	var args []interface{}
	argNum := 1
	if params.Query != "" {
		conditions = append(conditions, fmt.Sprintf(
			`to_tsvector('german', COALESCE(title, '') || ' ' || COALESCE(abstract, '')) @@ plainto_tsquery('german', $%d)`,
			argNum))
		args = append(args, params.Query)
		argNum++
	}
	if params.StaffID != nil {
		conditions = append(conditions, fmt.Sprintf(
			`id IN (SELECT publication_id FROM staff_publications WHERE staff_id = $%d)`,
			argNum))
		args = append(args, *params.StaffID)
		argNum++
	}
	if params.Year != nil {
		conditions = append(conditions, fmt.Sprintf("year = $%d", argNum))
		args = append(args, *params.Year)
		argNum++
	}
	if params.YearFrom != nil {
		conditions = append(conditions, fmt.Sprintf("year >= $%d", argNum))
		args = append(args, *params.YearFrom)
		argNum++
	}
	if params.YearTo != nil {
		conditions = append(conditions, fmt.Sprintf("year <= $%d", argNum))
		args = append(args, *params.YearTo)
		argNum++
	}
	if params.PubType != nil {
		conditions = append(conditions, fmt.Sprintf("pub_type = $%d", argNum))
		args = append(args, *params.PubType)
		argNum++
	}
	whereClause := ""
	if len(conditions) > 0 {
		whereClause = "WHERE " + strings.Join(conditions, " AND ")
	}
	// Count total matches before applying LIMIT/OFFSET.
	countQuery := fmt.Sprintf("SELECT COUNT(*) FROM publications %s", whereClause)
	var total int
	if err := r.db.Pool.QueryRow(ctx, countQuery, args...).Scan(&total); err != nil {
		return nil, err
	}
	// Clamp pagination (consistent with SearchStaff).
	limit := params.Limit
	if limit <= 0 {
		limit = 20
	}
	if limit > 100 {
		limit = 100
	}
	offset := params.Offset
	if offset < 0 {
		offset = 0
	}
	// Page query; limit/offset are validated ints, safe to interpolate.
	query := fmt.Sprintf(`
	SELECT id, title, abstract, year, pub_type, venue, doi, url, citation_count, keywords
	FROM publications %s
	ORDER BY year DESC NULLS LAST, citation_count DESC
	LIMIT %d OFFSET %d
	`, whereClause, limit, offset)
	rows, err := r.db.Pool.Query(ctx, query, args...)
	if err != nil {
		return nil, err
	}
	defer rows.Close()
	var pubs []Publication
	for rows.Next() {
		var p Publication
		if err := rows.Scan(
			&p.ID, &p.Title, &p.Abstract, &p.Year, &p.PubType, &p.Venue, &p.DOI, &p.URL, &p.CitationCount, &p.Keywords,
		); err != nil {
			return nil, err
		}
		pubs = append(pubs, p)
	}
	return &PublicationSearchResult{
		Publications: pubs,
		Total:        total,
		Limit:        limit,
		Offset:       offset,
		Query:        params.Query,
	}, rows.Err()
}

View File

@@ -2,7 +2,6 @@ package policy
import (
"context"
"encoding/json"
"fmt"
"time"
@@ -205,413 +204,6 @@ func (s *Store) DeletePolicy(ctx context.Context, id uuid.UUID) error {
return nil
}
// =============================================================================
// ALLOWED SOURCES
// =============================================================================
// CreateSource creates a new allowed source.
//
// TrustBoost defaults to 0.5 when the request does not supply one. The
// row is inserted as active, and a set of default operation permissions
// (lookup/RAG/export allowed with citation required, training forbidden)
// is created for it afterwards.
//
// NOTE(review): the INSERT and createDefaultOperations run as separate
// statements on the pool, not inside one transaction — a failure in the
// second step leaves a source row without any operation permissions.
func (s *Store) CreateSource(ctx context.Context, req *CreateAllowedSourceRequest) (*AllowedSource, error) {
	trustBoost := 0.5
	if req.TrustBoost != nil {
		trustBoost = *req.TrustBoost
	}
	source := &AllowedSource{
		ID:               uuid.New(),
		PolicyID:         req.PolicyID,
		Domain:           req.Domain,
		Name:             req.Name,
		Description:      req.Description,
		License:          req.License,
		LegalBasis:       req.LegalBasis,
		CitationTemplate: req.CitationTemplate,
		TrustBoost:       trustBoost,
		IsActive:         true,
		CreatedAt:        time.Now(),
		UpdatedAt:        time.Now(),
	}
	query := `
	INSERT INTO allowed_sources (id, policy_id, domain, name, description, license,
		legal_basis, citation_template, trust_boost, is_active,
		created_at, updated_at)
	VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12)
	RETURNING id`
	err := s.pool.QueryRow(ctx, query,
		source.ID, source.PolicyID, source.Domain, source.Name, source.Description,
		source.License, source.LegalBasis, source.CitationTemplate, source.TrustBoost,
		source.IsActive, source.CreatedAt, source.UpdatedAt,
	).Scan(&source.ID)
	if err != nil {
		return nil, fmt.Errorf("failed to create source: %w", err)
	}
	// Create default operation permissions
	err = s.createDefaultOperations(ctx, source.ID)
	if err != nil {
		return nil, fmt.Errorf("failed to create default operations: %w", err)
	}
	return source, nil
}
// createDefaultOperations seeds the default operation permissions for a
// newly created source: lookup, RAG and export are allowed (with citation
// required), while training is forbidden by default.
func (s *Store) createDefaultOperations(ctx context.Context, sourceID uuid.UUID) error {
	const insertSQL = `
		INSERT INTO operation_permissions (id, source_id, operation, is_allowed, requires_citation, created_at, updated_at)
		VALUES ($1, $2, $3, $4, $5, $6, $7)`
	type defaultPerm struct {
		op       Operation
		allowed  bool
		citation bool
	}
	perms := []defaultPerm{
		{OperationLookup, true, true},
		{OperationRAG, true, true},
		{OperationTraining, false, false}, // VERBOTEN by default
		{OperationExport, true, true},
	}
	for _, perm := range perms {
		_, err := s.pool.Exec(ctx, insertSQL, uuid.New(), sourceID, perm.op, perm.allowed, perm.citation, time.Now(), time.Now())
		if err != nil {
			return err
		}
	}
	return nil
}
// GetSource retrieves a source by ID.
//
// Returns (nil, nil) when no source exists — callers must check for a nil
// result rather than an error. The joined policy name and the source's
// operation permissions are loaded onto the result.
func (s *Store) GetSource(ctx context.Context, id uuid.UUID) (*AllowedSource, error) {
	query := `
	SELECT als.id, als.policy_id, als.domain, als.name, als.description, als.license,
		als.legal_basis, als.citation_template, als.trust_boost, als.is_active,
		als.created_at, als.updated_at, sp.name as policy_name
	FROM allowed_sources als
	JOIN source_policies sp ON als.policy_id = sp.id
	WHERE als.id = $1`
	source := &AllowedSource{}
	err := s.pool.QueryRow(ctx, query, id).Scan(
		&source.ID, &source.PolicyID, &source.Domain, &source.Name, &source.Description,
		&source.License, &source.LegalBasis, &source.CitationTemplate, &source.TrustBoost,
		&source.IsActive, &source.CreatedAt, &source.UpdatedAt, &source.PolicyName,
	)
	// Not-found is reported as (nil, nil), not as an error.
	if err == pgx.ErrNoRows {
		return nil, nil
	}
	if err != nil {
		return nil, fmt.Errorf("failed to get source: %w", err)
	}
	// Load operations
	ops, err := s.GetOperationsBySourceID(ctx, source.ID)
	if err != nil {
		return nil, err
	}
	source.Operations = ops
	return source, nil
}
// GetSourceByDomain retrieves a source by domain with optional bundesland filter.
//
// Matches either the exact domain or any subdomain of a registered one
// ($1 LIKE '%.' || als.domain). Only active sources under active policies
// are considered, and a policy with NULL bundesland matches regardless of
// the supplied bundesland. Returns (nil, nil) when nothing matches; on a
// hit, the source's operation permissions are loaded as well.
func (s *Store) GetSourceByDomain(ctx context.Context, domain string, bundesland *Bundesland) (*AllowedSource, error) {
	query := `
	SELECT als.id, als.policy_id, als.domain, als.name, als.description, als.license,
		als.legal_basis, als.citation_template, als.trust_boost, als.is_active,
		als.created_at, als.updated_at
	FROM allowed_sources als
	JOIN source_policies sp ON als.policy_id = sp.id
	WHERE als.is_active = true
		AND sp.is_active = true
		AND (als.domain = $1 OR $1 LIKE '%.' || als.domain)
		AND (sp.bundesland IS NULL OR sp.bundesland = $2)
	LIMIT 1`
	source := &AllowedSource{}
	err := s.pool.QueryRow(ctx, query, domain, bundesland).Scan(
		&source.ID, &source.PolicyID, &source.Domain, &source.Name, &source.Description,
		&source.License, &source.LegalBasis, &source.CitationTemplate, &source.TrustBoost,
		&source.IsActive, &source.CreatedAt, &source.UpdatedAt,
	)
	// Not-found is reported as (nil, nil), not as an error.
	if err == pgx.ErrNoRows {
		return nil, nil
	}
	if err != nil {
		return nil, fmt.Errorf("failed to get source by domain: %w", err)
	}
	// Load operations
	ops, err := s.GetOperationsBySourceID(ctx, source.ID)
	if err != nil {
		return nil, err
	}
	source.Operations = ops
	return source, nil
}
// ListSources retrieves sources with optional filters.
//
// Filters (policy ID, domain substring, license, active flag) are
// AND-combined onto a shared WHERE clause used by both the COUNT query
// and the data query. Returns the page of sources (newest first), the
// total match count, and an error. Limit/Offset of 0 mean "no LIMIT" /
// "no OFFSET" respectively.
func (s *Store) ListSources(ctx context.Context, filter *SourceListFilter) ([]AllowedSource, int, error) {
	// "WHERE 1=1" lets every filter append uniformly with " AND ...".
	baseQuery := `FROM allowed_sources als JOIN source_policies sp ON als.policy_id = sp.id WHERE 1=1`
	args := []interface{}{}
	argCount := 0
	if filter.PolicyID != nil {
		argCount++
		baseQuery += fmt.Sprintf(" AND als.policy_id = $%d", argCount)
		args = append(args, *filter.PolicyID)
	}
	if filter.Domain != nil {
		argCount++
		baseQuery += fmt.Sprintf(" AND als.domain ILIKE $%d", argCount)
		args = append(args, "%"+*filter.Domain+"%")
	}
	if filter.License != nil {
		argCount++
		baseQuery += fmt.Sprintf(" AND als.license = $%d", argCount)
		args = append(args, *filter.License)
	}
	if filter.IsActive != nil {
		argCount++
		baseQuery += fmt.Sprintf(" AND als.is_active = $%d", argCount)
		args = append(args, *filter.IsActive)
	}
	// Count query
	var total int
	countQuery := "SELECT COUNT(*) " + baseQuery
	err := s.pool.QueryRow(ctx, countQuery, args...).Scan(&total)
	if err != nil {
		return nil, 0, fmt.Errorf("failed to count sources: %w", err)
	}
	// Data query; LIMIT/OFFSET continue the same placeholder numbering.
	dataQuery := `SELECT als.id, als.policy_id, als.domain, als.name, als.description, als.license,
		als.legal_basis, als.citation_template, als.trust_boost, als.is_active,
		als.created_at, als.updated_at, sp.name as policy_name ` + baseQuery +
		` ORDER BY als.created_at DESC`
	if filter.Limit > 0 {
		argCount++
		dataQuery += fmt.Sprintf(" LIMIT $%d", argCount)
		args = append(args, filter.Limit)
	}
	if filter.Offset > 0 {
		argCount++
		dataQuery += fmt.Sprintf(" OFFSET $%d", argCount)
		args = append(args, filter.Offset)
	}
	rows, err := s.pool.Query(ctx, dataQuery, args...)
	if err != nil {
		return nil, 0, fmt.Errorf("failed to list sources: %w", err)
	}
	defer rows.Close()
	sources := []AllowedSource{}
	for rows.Next() {
		var src AllowedSource
		err := rows.Scan(
			&src.ID, &src.PolicyID, &src.Domain, &src.Name, &src.Description,
			&src.License, &src.LegalBasis, &src.CitationTemplate, &src.TrustBoost,
			&src.IsActive, &src.CreatedAt, &src.UpdatedAt, &src.PolicyName,
		)
		if err != nil {
			return nil, 0, fmt.Errorf("failed to scan source: %w", err)
		}
		sources = append(sources, src)
	}
	return sources, total, nil
}
// UpdateSource updates an existing source.
//
// Only the non-nil fields of req are applied; everything else keeps its
// current value. Returns the updated source, or an error if the ID does
// not exist.
func (s *Store) UpdateSource(ctx context.Context, id uuid.UUID, req *UpdateAllowedSourceRequest) (*AllowedSource, error) {
	src, err := s.GetSource(ctx, id)
	if err != nil {
		return nil, err
	}
	if src == nil {
		return nil, fmt.Errorf("source not found")
	}
	// nil request fields mean "leave unchanged".
	if req.Domain != nil {
		src.Domain = *req.Domain
	}
	if req.Name != nil {
		src.Name = *req.Name
	}
	if req.Description != nil {
		src.Description = req.Description
	}
	if req.License != nil {
		src.License = *req.License
	}
	if req.LegalBasis != nil {
		src.LegalBasis = req.LegalBasis
	}
	if req.CitationTemplate != nil {
		src.CitationTemplate = req.CitationTemplate
	}
	if req.TrustBoost != nil {
		src.TrustBoost = *req.TrustBoost
	}
	if req.IsActive != nil {
		src.IsActive = *req.IsActive
	}
	src.UpdatedAt = time.Now()
	const query = `
	UPDATE allowed_sources
	SET domain = $2, name = $3, description = $4, license = $5, legal_basis = $6,
	citation_template = $7, trust_boost = $8, is_active = $9, updated_at = $10
	WHERE id = $1`
	if _, err := s.pool.Exec(ctx, query,
		id, src.Domain, src.Name, src.Description, src.License,
		src.LegalBasis, src.CitationTemplate, src.TrustBoost,
		src.IsActive, src.UpdatedAt,
	); err != nil {
		return nil, fmt.Errorf("failed to update source: %w", err)
	}
	return src, nil
}
// DeleteSource deletes a source by ID.
// Deleting a non-existent ID is not an error (the DELETE affects zero rows).
func (s *Store) DeleteSource(ctx context.Context, id uuid.UUID) error {
	if _, err := s.pool.Exec(ctx, `DELETE FROM allowed_sources WHERE id = $1`, id); err != nil {
		return fmt.Errorf("failed to delete source: %w", err)
	}
	return nil
}
// =============================================================================
// OPERATION PERMISSIONS
// =============================================================================
// GetOperationsBySourceID retrieves all operation permissions for a source.
// Results are ordered by operation name; an unknown source yields an empty slice.
func (s *Store) GetOperationsBySourceID(ctx context.Context, sourceID uuid.UUID) ([]OperationPermission, error) {
	query := `
	SELECT id, source_id, operation, is_allowed, requires_citation, notes, created_at, updated_at
	FROM operation_permissions
	WHERE source_id = $1
	ORDER BY operation`
	rows, err := s.pool.Query(ctx, query, sourceID)
	if err != nil {
		return nil, fmt.Errorf("failed to get operations: %w", err)
	}
	defer rows.Close()
	ops := []OperationPermission{}
	for rows.Next() {
		var op OperationPermission
		err := rows.Scan(
			&op.ID, &op.SourceID, &op.Operation, &op.IsAllowed,
			&op.RequiresCitation, &op.Notes, &op.CreatedAt, &op.UpdatedAt,
		)
		if err != nil {
			return nil, fmt.Errorf("failed to scan operation: %w", err)
		}
		ops = append(ops, op)
	}
	// pgx stops iteration silently on error; surface it explicitly.
	if err := rows.Err(); err != nil {
		return nil, fmt.Errorf("failed to iterate operations: %w", err)
	}
	return ops, nil
}
// UpdateOperationPermission updates an operation permission.
//
// Performs a read-modify-write: the current row is loaded, the non-nil
// fields of req are applied on top, and the merged row is written back.
// Returns the updated permission, or an error if the ID does not exist.
func (s *Store) UpdateOperationPermission(ctx context.Context, id uuid.UUID, req *UpdateOperationPermissionRequest) (*OperationPermission, error) {
	query := `SELECT id, source_id, operation, is_allowed, requires_citation, notes, created_at, updated_at
	FROM operation_permissions WHERE id = $1`
	op := &OperationPermission{}
	err := s.pool.QueryRow(ctx, query, id).Scan(
		&op.ID, &op.SourceID, &op.Operation, &op.IsAllowed,
		&op.RequiresCitation, &op.Notes, &op.CreatedAt, &op.UpdatedAt,
	)
	// NOTE(review): pgx v5 recommends errors.Is(err, pgx.ErrNoRows); the
	// plain comparison only matches the unwrapped sentinel — confirm this
	// is sufficient for QueryRow here.
	if err == pgx.ErrNoRows {
		return nil, fmt.Errorf("operation permission not found")
	}
	if err != nil {
		return nil, fmt.Errorf("failed to get operation: %w", err)
	}
	// nil request fields mean "keep the current value".
	if req.IsAllowed != nil {
		op.IsAllowed = *req.IsAllowed
	}
	if req.RequiresCitation != nil {
		op.RequiresCitation = *req.RequiresCitation
	}
	if req.Notes != nil {
		op.Notes = req.Notes
	}
	op.UpdatedAt = time.Now()
	updateQuery := `
	UPDATE operation_permissions
	SET is_allowed = $2, requires_citation = $3, notes = $4, updated_at = $5
	WHERE id = $1`
	_, err = s.pool.Exec(ctx, updateQuery, id, op.IsAllowed, op.RequiresCitation, op.Notes, op.UpdatedAt)
	if err != nil {
		return nil, fmt.Errorf("failed to update operation: %w", err)
	}
	return op, nil
}
// GetOperationsMatrix retrieves all operation permissions grouped by source.
//
// Only active sources under active policies are included, ordered by
// Bundesland (NULLs, i.e. federal, first) and then source name. Each
// returned source has its Operations slice populated.
func (s *Store) GetOperationsMatrix(ctx context.Context) ([]AllowedSource, error) {
	query := `
	SELECT als.id, als.domain, als.name, als.license, als.is_active,
	sp.name as policy_name, sp.bundesland
	FROM allowed_sources als
	JOIN source_policies sp ON als.policy_id = sp.id
	WHERE als.is_active = true AND sp.is_active = true
	ORDER BY sp.bundesland NULLS FIRST, als.name`
	rows, err := s.pool.Query(ctx, query)
	if err != nil {
		return nil, fmt.Errorf("failed to get operations matrix: %w", err)
	}
	defer rows.Close()
	sources := []AllowedSource{}
	for rows.Next() {
		var src AllowedSource
		// bundesland is scanned but discarded; presumably AllowedSource
		// has no field for it — TODO confirm.
		var bundesland *Bundesland
		err := rows.Scan(
			&src.ID, &src.Domain, &src.Name, &src.License, &src.IsActive,
			&src.PolicyName, &bundesland,
		)
		if err != nil {
			return nil, fmt.Errorf("failed to scan source: %w", err)
		}
		// Load operations for each source (N+1 queries; acceptable for
		// admin-sized data sets).
		ops, err := s.GetOperationsBySourceID(ctx, src.ID)
		if err != nil {
			return nil, err
		}
		src.Operations = ops
		sources = append(sources, src)
	}
	// pgx stops iteration silently on error; surface it explicitly.
	if err := rows.Err(); err != nil {
		return nil, fmt.Errorf("failed to iterate operations matrix: %w", err)
	}
	return sources, nil
}
// =============================================================================
// PII RULES
// =============================================================================
@@ -765,404 +357,3 @@ func (s *Store) DeletePIIRule(ctx context.Context, id uuid.UUID) error {
}
return nil
}
// =============================================================================
// AUDIT LOG
// =============================================================================
// CreateAuditLog creates a new audit log entry.
// The entry's ID and CreatedAt are assigned here, overwriting any
// caller-provided values; the remaining fields are stored as given.
func (s *Store) CreateAuditLog(ctx context.Context, entry *PolicyAuditLog) error {
	entry.ID = uuid.New()
	entry.CreatedAt = time.Now()
	const query = `
	INSERT INTO policy_audit_log (id, action, entity_type, entity_id, old_value, new_value,
	user_id, user_email, ip_address, user_agent, created_at)
	VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11)`
	if _, err := s.pool.Exec(ctx, query,
		entry.ID, entry.Action, entry.EntityType, entry.EntityID,
		entry.OldValue, entry.NewValue, entry.UserID, entry.UserEmail,
		entry.IPAddress, entry.UserAgent, entry.CreatedAt,
	); err != nil {
		return fmt.Errorf("failed to create audit log: %w", err)
	}
	return nil
}
// ListAuditLogs retrieves audit logs with filters.
//
// Filter fields are ANDed together; Limit/Offset apply only when > 0.
// Returns the page of logs (newest first) plus the total count matching
// the filter.
func (s *Store) ListAuditLogs(ctx context.Context, filter *AuditLogFilter) ([]PolicyAuditLog, int, error) {
	baseQuery := `FROM policy_audit_log WHERE 1=1`
	args := []interface{}{}
	argCount := 0
	if filter.EntityType != nil {
		argCount++
		baseQuery += fmt.Sprintf(" AND entity_type = $%d", argCount)
		args = append(args, *filter.EntityType)
	}
	if filter.EntityID != nil {
		argCount++
		baseQuery += fmt.Sprintf(" AND entity_id = $%d", argCount)
		args = append(args, *filter.EntityID)
	}
	if filter.Action != nil {
		argCount++
		baseQuery += fmt.Sprintf(" AND action = $%d", argCount)
		args = append(args, *filter.Action)
	}
	if filter.UserEmail != nil {
		argCount++
		baseQuery += fmt.Sprintf(" AND user_email ILIKE $%d", argCount)
		args = append(args, "%"+*filter.UserEmail+"%")
	}
	if filter.FromDate != nil {
		argCount++
		baseQuery += fmt.Sprintf(" AND created_at >= $%d", argCount)
		args = append(args, *filter.FromDate)
	}
	if filter.ToDate != nil {
		argCount++
		baseQuery += fmt.Sprintf(" AND created_at <= $%d", argCount)
		args = append(args, *filter.ToDate)
	}
	// Count first so the total ignores LIMIT/OFFSET.
	var total int
	countQuery := "SELECT COUNT(*) " + baseQuery
	err := s.pool.QueryRow(ctx, countQuery, args...).Scan(&total)
	if err != nil {
		return nil, 0, fmt.Errorf("failed to count audit logs: %w", err)
	}
	// Data query reuses the same WHERE clause and argument slice.
	dataQuery := `SELECT id, action, entity_type, entity_id, old_value, new_value,
	user_id, user_email, ip_address, user_agent, created_at ` + baseQuery +
		` ORDER BY created_at DESC`
	if filter.Limit > 0 {
		argCount++
		dataQuery += fmt.Sprintf(" LIMIT $%d", argCount)
		args = append(args, filter.Limit)
	}
	if filter.Offset > 0 {
		argCount++
		dataQuery += fmt.Sprintf(" OFFSET $%d", argCount)
		args = append(args, filter.Offset)
	}
	rows, err := s.pool.Query(ctx, dataQuery, args...)
	if err != nil {
		return nil, 0, fmt.Errorf("failed to list audit logs: %w", err)
	}
	defer rows.Close()
	logs := []PolicyAuditLog{}
	for rows.Next() {
		var l PolicyAuditLog
		err := rows.Scan(
			&l.ID, &l.Action, &l.EntityType, &l.EntityID, &l.OldValue, &l.NewValue,
			&l.UserID, &l.UserEmail, &l.IPAddress, &l.UserAgent, &l.CreatedAt,
		)
		if err != nil {
			return nil, 0, fmt.Errorf("failed to scan audit log: %w", err)
		}
		logs = append(logs, l)
	}
	// pgx stops iteration silently on error; surface it explicitly.
	if err := rows.Err(); err != nil {
		return nil, 0, fmt.Errorf("failed to iterate audit logs: %w", err)
	}
	return logs, total, nil
}
// =============================================================================
// BLOCKED CONTENT LOG
// =============================================================================
// CreateBlockedContentLog creates a new blocked content log entry.
// ID and CreatedAt are assigned here, overwriting any caller-set values.
func (s *Store) CreateBlockedContentLog(ctx context.Context, entry *BlockedContentLog) error {
	entry.ID = uuid.New()
	entry.CreatedAt = time.Now()
	const query = `
	INSERT INTO blocked_content_log (id, url, domain, block_reason, matched_rule_id, details, created_at)
	VALUES ($1, $2, $3, $4, $5, $6, $7)`
	if _, err := s.pool.Exec(ctx, query,
		entry.ID, entry.URL, entry.Domain, entry.BlockReason,
		entry.MatchedRuleID, entry.Details, entry.CreatedAt,
	); err != nil {
		return fmt.Errorf("failed to create blocked content log: %w", err)
	}
	return nil
}
// ListBlockedContent retrieves blocked content logs with filters.
//
// Filter fields are ANDed together; Limit/Offset apply only when > 0.
// Returns the page of logs (newest first) plus the total count matching
// the filter.
func (s *Store) ListBlockedContent(ctx context.Context, filter *BlockedContentFilter) ([]BlockedContentLog, int, error) {
	baseQuery := `FROM blocked_content_log WHERE 1=1`
	args := []interface{}{}
	argCount := 0
	if filter.Domain != nil {
		argCount++
		baseQuery += fmt.Sprintf(" AND domain ILIKE $%d", argCount)
		args = append(args, "%"+*filter.Domain+"%")
	}
	if filter.BlockReason != nil {
		argCount++
		baseQuery += fmt.Sprintf(" AND block_reason = $%d", argCount)
		args = append(args, *filter.BlockReason)
	}
	if filter.FromDate != nil {
		argCount++
		baseQuery += fmt.Sprintf(" AND created_at >= $%d", argCount)
		args = append(args, *filter.FromDate)
	}
	if filter.ToDate != nil {
		argCount++
		baseQuery += fmt.Sprintf(" AND created_at <= $%d", argCount)
		args = append(args, *filter.ToDate)
	}
	// Count first so the total ignores LIMIT/OFFSET.
	var total int
	countQuery := "SELECT COUNT(*) " + baseQuery
	err := s.pool.QueryRow(ctx, countQuery, args...).Scan(&total)
	if err != nil {
		return nil, 0, fmt.Errorf("failed to count blocked content: %w", err)
	}
	// Data query reuses the same WHERE clause and argument slice.
	dataQuery := `SELECT id, url, domain, block_reason, matched_rule_id, details, created_at ` + baseQuery +
		` ORDER BY created_at DESC`
	if filter.Limit > 0 {
		argCount++
		dataQuery += fmt.Sprintf(" LIMIT $%d", argCount)
		args = append(args, filter.Limit)
	}
	if filter.Offset > 0 {
		argCount++
		dataQuery += fmt.Sprintf(" OFFSET $%d", argCount)
		args = append(args, filter.Offset)
	}
	rows, err := s.pool.Query(ctx, dataQuery, args...)
	if err != nil {
		return nil, 0, fmt.Errorf("failed to list blocked content: %w", err)
	}
	defer rows.Close()
	logs := []BlockedContentLog{}
	for rows.Next() {
		var l BlockedContentLog
		err := rows.Scan(
			&l.ID, &l.URL, &l.Domain, &l.BlockReason,
			&l.MatchedRuleID, &l.Details, &l.CreatedAt,
		)
		if err != nil {
			return nil, 0, fmt.Errorf("failed to scan blocked content: %w", err)
		}
		logs = append(logs, l)
	}
	// pgx stops iteration silently on error; surface it explicitly.
	if err := rows.Err(); err != nil {
		return nil, 0, fmt.Errorf("failed to iterate blocked content: %w", err)
	}
	return logs, total, nil
}
// =============================================================================
// STATISTICS
// =============================================================================
// GetStats retrieves aggregated statistics for the policy system.
//
// Runs a series of COUNT queries plus two GROUP BY queries and combines
// the results. ComplianceScore is the percentage of sources that are
// active (0 when there are no sources).
func (s *Store) GetStats(ctx context.Context) (*PolicyStats, error) {
	stats := &PolicyStats{
		SourcesByLicense: make(map[string]int),
		BlocksByReason:   make(map[string]int),
	}
	// Active policies
	err := s.pool.QueryRow(ctx, `SELECT COUNT(*) FROM source_policies WHERE is_active = true`).Scan(&stats.ActivePolicies)
	if err != nil {
		return nil, fmt.Errorf("failed to count active policies: %w", err)
	}
	// Total sources
	err = s.pool.QueryRow(ctx, `SELECT COUNT(*) FROM allowed_sources`).Scan(&stats.TotalSources)
	if err != nil {
		return nil, fmt.Errorf("failed to count total sources: %w", err)
	}
	// Active sources
	err = s.pool.QueryRow(ctx, `SELECT COUNT(*) FROM allowed_sources WHERE is_active = true`).Scan(&stats.ActiveSources)
	if err != nil {
		return nil, fmt.Errorf("failed to count active sources: %w", err)
	}
	// Blocked today
	err = s.pool.QueryRow(ctx, `SELECT COUNT(*) FROM blocked_content_log WHERE created_at >= CURRENT_DATE`).Scan(&stats.BlockedToday)
	if err != nil {
		return nil, fmt.Errorf("failed to count blocked today: %w", err)
	}
	// Blocked total
	err = s.pool.QueryRow(ctx, `SELECT COUNT(*) FROM blocked_content_log`).Scan(&stats.BlockedTotal)
	if err != nil {
		return nil, fmt.Errorf("failed to count blocked total: %w", err)
	}
	// Active PII rules
	err = s.pool.QueryRow(ctx, `SELECT COUNT(*) FROM pii_rules WHERE is_active = true`).Scan(&stats.PIIRulesActive)
	if err != nil {
		return nil, fmt.Errorf("failed to count active PII rules: %w", err)
	}
	// Sources by license
	rows, err := s.pool.Query(ctx, `SELECT license, COUNT(*) FROM allowed_sources GROUP BY license`)
	if err != nil {
		return nil, fmt.Errorf("failed to count sources by license: %w", err)
	}
	defer rows.Close()
	for rows.Next() {
		var license string
		var count int
		if err := rows.Scan(&license, &count); err != nil {
			return nil, err
		}
		stats.SourcesByLicense[license] = count
	}
	// pgx stops iteration silently on error; check before reusing rows.
	if err := rows.Err(); err != nil {
		return nil, fmt.Errorf("failed to iterate sources by license: %w", err)
	}
	// Blocks by reason
	rows, err = s.pool.Query(ctx, `SELECT block_reason, COUNT(*) FROM blocked_content_log GROUP BY block_reason`)
	if err != nil {
		return nil, fmt.Errorf("failed to count blocks by reason: %w", err)
	}
	defer rows.Close()
	for rows.Next() {
		var reason string
		var count int
		if err := rows.Scan(&reason, &count); err != nil {
			return nil, err
		}
		stats.BlocksByReason[reason] = count
	}
	if err := rows.Err(); err != nil {
		return nil, fmt.Errorf("failed to iterate blocks by reason: %w", err)
	}
	// Compliance score (simplified: active sources / total sources).
	if stats.TotalSources > 0 {
		stats.ComplianceScore = float64(stats.ActiveSources) / float64(stats.TotalSources) * 100
	}
	return stats, nil
}
// =============================================================================
// YAML LOADER
// =============================================================================
// LoadFromYAML loads initial policy data from YAML configuration.
//
// Creates the federal policy (when configured), one policy per
// Bundesland, and the configured PII rules. The reserved top-level keys
// "federal", "default_operations" and "pii_rules" are skipped while
// iterating the Bundesland map.
func (s *Store) LoadFromYAML(ctx context.Context, config *BundeslaenderConfig) error {
	// Load federal policy
	if config.Federal.Name != "" {
		err := s.loadPolicy(ctx, nil, &config.Federal, &config.DefaultOperations)
		if err != nil {
			return fmt.Errorf("failed to load federal policy: %w", err)
		}
	}
	// Load Bundesland policies
	for code, policyConfig := range config.Bundeslaender {
		if code == "federal" || code == "default_operations" || code == "pii_rules" {
			continue
		}
		bl := Bundesland(code)
		err := s.loadPolicy(ctx, &bl, &policyConfig, &config.DefaultOperations)
		if err != nil {
			return fmt.Errorf("failed to load policy for %s: %w", code, err)
		}
	}
	// Load PII rules
	for _, ruleConfig := range config.PIIRules {
		err := s.loadPIIRule(ctx, &ruleConfig)
		if err != nil {
			return fmt.Errorf("failed to load PII rule %s: %w", ruleConfig.Name, err)
		}
	}
	return nil
}
// loadPolicy creates one policy and its allowed sources from YAML config.
//
// TrustBoost defaults to 0.5 when the configured value is <= 0; empty
// legal-basis / citation-template strings are stored as NULL.
// NOTE(review): the ops parameter is unused here — default operation
// permissions come from CreateSource; confirm this is intentional.
func (s *Store) loadPolicy(ctx context.Context, bundesland *Bundesland, config *PolicyConfig, ops *OperationsConfig) error {
	// Create policy
	policy, err := s.CreatePolicy(ctx, &CreateSourcePolicyRequest{
		Name:       config.Name,
		Bundesland: bundesland,
	})
	if err != nil {
		return err
	}
	// Create sources
	for _, srcConfig := range config.Sources {
		trustBoost := 0.5
		if srcConfig.TrustBoost > 0 {
			trustBoost = srcConfig.TrustBoost
		}
		var legalBasis, citation *string
		if srcConfig.LegalBasis != "" {
			legalBasis = &srcConfig.LegalBasis
		}
		if srcConfig.CitationTemplate != "" {
			citation = &srcConfig.CitationTemplate
		}
		_, err := s.CreateSource(ctx, &CreateAllowedSourceRequest{
			PolicyID:         policy.ID,
			Domain:           srcConfig.Domain,
			Name:             srcConfig.Name,
			License:          License(srcConfig.License),
			LegalBasis:       legalBasis,
			CitationTemplate: citation,
			TrustBoost:       &trustBoost,
		})
		if err != nil {
			return fmt.Errorf("failed to create source %s: %w", srcConfig.Domain, err)
		}
	}
	return nil
}
// loadPIIRule creates a PII rule from YAML config.
// An empty severity in the config falls back to PIISeverityBlock.
func (s *Store) loadPIIRule(ctx context.Context, config *PIIRuleConfig) error {
	req := &CreatePIIRuleRequest{
		Name:     config.Name,
		RuleType: PIIRuleType(config.Type),
		Pattern:  config.Pattern,
		Severity: PIISeverityBlock,
	}
	if config.Severity != "" {
		req.Severity = PIISeverity(config.Severity)
	}
	_, err := s.CreatePIIRule(ctx, req)
	return err
}
// ToJSON converts an entity to JSON for audit logging.
// Marshal failures are deliberately swallowed: this is best-effort
// support for logging, and the result is nil when v cannot be marshaled.
func ToJSON(v interface{}) json.RawMessage {
	raw, err := json.Marshal(v)
	if err != nil {
		return nil
	}
	return raw
}

View File

@@ -0,0 +1,411 @@
package policy
import (
"context"
"encoding/json"
"fmt"
"time"
"github.com/google/uuid"
)
// =============================================================================
// AUDIT LOG
// =============================================================================
// CreateAuditLog creates a new audit log entry.
//
// The entry's ID and CreatedAt are assigned here, overwriting any
// caller-provided values; the remaining fields are stored as given.
func (s *Store) CreateAuditLog(ctx context.Context, entry *PolicyAuditLog) error {
	entry.ID = uuid.New()
	entry.CreatedAt = time.Now()
	query := `
	INSERT INTO policy_audit_log (id, action, entity_type, entity_id, old_value, new_value,
	user_id, user_email, ip_address, user_agent, created_at)
	VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11)`
	_, err := s.pool.Exec(ctx, query,
		entry.ID, entry.Action, entry.EntityType, entry.EntityID,
		entry.OldValue, entry.NewValue, entry.UserID, entry.UserEmail,
		entry.IPAddress, entry.UserAgent, entry.CreatedAt,
	)
	if err != nil {
		return fmt.Errorf("failed to create audit log: %w", err)
	}
	return nil
}
// ListAuditLogs retrieves audit logs with filters.
//
// Filter fields are ANDed together; Limit/Offset apply only when > 0.
// Returns the page of logs (newest first) plus the total count matching
// the filter.
func (s *Store) ListAuditLogs(ctx context.Context, filter *AuditLogFilter) ([]PolicyAuditLog, int, error) {
	baseQuery := `FROM policy_audit_log WHERE 1=1`
	args := []interface{}{}
	argCount := 0
	if filter.EntityType != nil {
		argCount++
		baseQuery += fmt.Sprintf(" AND entity_type = $%d", argCount)
		args = append(args, *filter.EntityType)
	}
	if filter.EntityID != nil {
		argCount++
		baseQuery += fmt.Sprintf(" AND entity_id = $%d", argCount)
		args = append(args, *filter.EntityID)
	}
	if filter.Action != nil {
		argCount++
		baseQuery += fmt.Sprintf(" AND action = $%d", argCount)
		args = append(args, *filter.Action)
	}
	if filter.UserEmail != nil {
		argCount++
		baseQuery += fmt.Sprintf(" AND user_email ILIKE $%d", argCount)
		args = append(args, "%"+*filter.UserEmail+"%")
	}
	if filter.FromDate != nil {
		argCount++
		baseQuery += fmt.Sprintf(" AND created_at >= $%d", argCount)
		args = append(args, *filter.FromDate)
	}
	if filter.ToDate != nil {
		argCount++
		baseQuery += fmt.Sprintf(" AND created_at <= $%d", argCount)
		args = append(args, *filter.ToDate)
	}
	// Count first so the total ignores LIMIT/OFFSET.
	var total int
	countQuery := "SELECT COUNT(*) " + baseQuery
	err := s.pool.QueryRow(ctx, countQuery, args...).Scan(&total)
	if err != nil {
		return nil, 0, fmt.Errorf("failed to count audit logs: %w", err)
	}
	// Data query reuses the same WHERE clause and argument slice.
	dataQuery := `SELECT id, action, entity_type, entity_id, old_value, new_value,
	user_id, user_email, ip_address, user_agent, created_at ` + baseQuery +
		` ORDER BY created_at DESC`
	if filter.Limit > 0 {
		argCount++
		dataQuery += fmt.Sprintf(" LIMIT $%d", argCount)
		args = append(args, filter.Limit)
	}
	if filter.Offset > 0 {
		argCount++
		dataQuery += fmt.Sprintf(" OFFSET $%d", argCount)
		args = append(args, filter.Offset)
	}
	rows, err := s.pool.Query(ctx, dataQuery, args...)
	if err != nil {
		return nil, 0, fmt.Errorf("failed to list audit logs: %w", err)
	}
	defer rows.Close()
	logs := []PolicyAuditLog{}
	for rows.Next() {
		var l PolicyAuditLog
		err := rows.Scan(
			&l.ID, &l.Action, &l.EntityType, &l.EntityID, &l.OldValue, &l.NewValue,
			&l.UserID, &l.UserEmail, &l.IPAddress, &l.UserAgent, &l.CreatedAt,
		)
		if err != nil {
			return nil, 0, fmt.Errorf("failed to scan audit log: %w", err)
		}
		logs = append(logs, l)
	}
	// pgx stops iteration silently on error; surface it explicitly.
	if err := rows.Err(); err != nil {
		return nil, 0, fmt.Errorf("failed to iterate audit logs: %w", err)
	}
	return logs, total, nil
}
// =============================================================================
// BLOCKED CONTENT LOG
// =============================================================================
// CreateBlockedContentLog creates a new blocked content log entry.
//
// The entry's ID and CreatedAt are assigned here, overwriting any
// caller-provided values; the remaining fields are stored as given.
func (s *Store) CreateBlockedContentLog(ctx context.Context, entry *BlockedContentLog) error {
	entry.ID = uuid.New()
	entry.CreatedAt = time.Now()
	query := `
	INSERT INTO blocked_content_log (id, url, domain, block_reason, matched_rule_id, details, created_at)
	VALUES ($1, $2, $3, $4, $5, $6, $7)`
	_, err := s.pool.Exec(ctx, query,
		entry.ID, entry.URL, entry.Domain, entry.BlockReason,
		entry.MatchedRuleID, entry.Details, entry.CreatedAt,
	)
	if err != nil {
		return fmt.Errorf("failed to create blocked content log: %w", err)
	}
	return nil
}
// ListBlockedContent retrieves blocked content logs with filters.
//
// Filter fields are ANDed together; Limit/Offset apply only when > 0.
// Returns the page of logs (newest first) plus the total count matching
// the filter.
func (s *Store) ListBlockedContent(ctx context.Context, filter *BlockedContentFilter) ([]BlockedContentLog, int, error) {
	baseQuery := `FROM blocked_content_log WHERE 1=1`
	args := []interface{}{}
	argCount := 0
	if filter.Domain != nil {
		argCount++
		baseQuery += fmt.Sprintf(" AND domain ILIKE $%d", argCount)
		args = append(args, "%"+*filter.Domain+"%")
	}
	if filter.BlockReason != nil {
		argCount++
		baseQuery += fmt.Sprintf(" AND block_reason = $%d", argCount)
		args = append(args, *filter.BlockReason)
	}
	if filter.FromDate != nil {
		argCount++
		baseQuery += fmt.Sprintf(" AND created_at >= $%d", argCount)
		args = append(args, *filter.FromDate)
	}
	if filter.ToDate != nil {
		argCount++
		baseQuery += fmt.Sprintf(" AND created_at <= $%d", argCount)
		args = append(args, *filter.ToDate)
	}
	// Count first so the total ignores LIMIT/OFFSET.
	var total int
	countQuery := "SELECT COUNT(*) " + baseQuery
	err := s.pool.QueryRow(ctx, countQuery, args...).Scan(&total)
	if err != nil {
		return nil, 0, fmt.Errorf("failed to count blocked content: %w", err)
	}
	// Data query reuses the same WHERE clause and argument slice.
	dataQuery := `SELECT id, url, domain, block_reason, matched_rule_id, details, created_at ` + baseQuery +
		` ORDER BY created_at DESC`
	if filter.Limit > 0 {
		argCount++
		dataQuery += fmt.Sprintf(" LIMIT $%d", argCount)
		args = append(args, filter.Limit)
	}
	if filter.Offset > 0 {
		argCount++
		dataQuery += fmt.Sprintf(" OFFSET $%d", argCount)
		args = append(args, filter.Offset)
	}
	rows, err := s.pool.Query(ctx, dataQuery, args...)
	if err != nil {
		return nil, 0, fmt.Errorf("failed to list blocked content: %w", err)
	}
	defer rows.Close()
	logs := []BlockedContentLog{}
	for rows.Next() {
		var l BlockedContentLog
		err := rows.Scan(
			&l.ID, &l.URL, &l.Domain, &l.BlockReason,
			&l.MatchedRuleID, &l.Details, &l.CreatedAt,
		)
		if err != nil {
			return nil, 0, fmt.Errorf("failed to scan blocked content: %w", err)
		}
		logs = append(logs, l)
	}
	// pgx stops iteration silently on error; surface it explicitly.
	if err := rows.Err(); err != nil {
		return nil, 0, fmt.Errorf("failed to iterate blocked content: %w", err)
	}
	return logs, total, nil
}
// =============================================================================
// STATISTICS
// =============================================================================
// GetStats retrieves aggregated statistics for the policy system.
//
// Runs a series of COUNT queries plus two GROUP BY queries and combines
// the results. ComplianceScore is the percentage of sources that are
// active (0 when there are no sources).
func (s *Store) GetStats(ctx context.Context) (*PolicyStats, error) {
	stats := &PolicyStats{
		SourcesByLicense: make(map[string]int),
		BlocksByReason:   make(map[string]int),
	}
	// Active policies
	err := s.pool.QueryRow(ctx, `SELECT COUNT(*) FROM source_policies WHERE is_active = true`).Scan(&stats.ActivePolicies)
	if err != nil {
		return nil, fmt.Errorf("failed to count active policies: %w", err)
	}
	// Total sources
	err = s.pool.QueryRow(ctx, `SELECT COUNT(*) FROM allowed_sources`).Scan(&stats.TotalSources)
	if err != nil {
		return nil, fmt.Errorf("failed to count total sources: %w", err)
	}
	// Active sources
	err = s.pool.QueryRow(ctx, `SELECT COUNT(*) FROM allowed_sources WHERE is_active = true`).Scan(&stats.ActiveSources)
	if err != nil {
		return nil, fmt.Errorf("failed to count active sources: %w", err)
	}
	// Blocked today
	err = s.pool.QueryRow(ctx, `SELECT COUNT(*) FROM blocked_content_log WHERE created_at >= CURRENT_DATE`).Scan(&stats.BlockedToday)
	if err != nil {
		return nil, fmt.Errorf("failed to count blocked today: %w", err)
	}
	// Blocked total
	err = s.pool.QueryRow(ctx, `SELECT COUNT(*) FROM blocked_content_log`).Scan(&stats.BlockedTotal)
	if err != nil {
		return nil, fmt.Errorf("failed to count blocked total: %w", err)
	}
	// Active PII rules
	err = s.pool.QueryRow(ctx, `SELECT COUNT(*) FROM pii_rules WHERE is_active = true`).Scan(&stats.PIIRulesActive)
	if err != nil {
		return nil, fmt.Errorf("failed to count active PII rules: %w", err)
	}
	// Sources by license
	rows, err := s.pool.Query(ctx, `SELECT license, COUNT(*) FROM allowed_sources GROUP BY license`)
	if err != nil {
		return nil, fmt.Errorf("failed to count sources by license: %w", err)
	}
	defer rows.Close()
	for rows.Next() {
		var license string
		var count int
		if err := rows.Scan(&license, &count); err != nil {
			return nil, err
		}
		stats.SourcesByLicense[license] = count
	}
	// pgx stops iteration silently on error; check before reusing rows.
	if err := rows.Err(); err != nil {
		return nil, fmt.Errorf("failed to iterate sources by license: %w", err)
	}
	// Blocks by reason
	rows, err = s.pool.Query(ctx, `SELECT block_reason, COUNT(*) FROM blocked_content_log GROUP BY block_reason`)
	if err != nil {
		return nil, fmt.Errorf("failed to count blocks by reason: %w", err)
	}
	defer rows.Close()
	for rows.Next() {
		var reason string
		var count int
		if err := rows.Scan(&reason, &count); err != nil {
			return nil, err
		}
		stats.BlocksByReason[reason] = count
	}
	if err := rows.Err(); err != nil {
		return nil, fmt.Errorf("failed to iterate blocks by reason: %w", err)
	}
	// Compliance score (simplified: active sources / total sources).
	if stats.TotalSources > 0 {
		stats.ComplianceScore = float64(stats.ActiveSources) / float64(stats.TotalSources) * 100
	}
	return stats, nil
}
// =============================================================================
// YAML LOADER
// =============================================================================
// LoadFromYAML loads initial policy data from YAML configuration.
//
// Creates the federal policy (when configured), one policy per
// Bundesland, and the configured PII rules. The reserved top-level keys
// "federal", "default_operations" and "pii_rules" are skipped while
// iterating the Bundesland map.
func (s *Store) LoadFromYAML(ctx context.Context, config *BundeslaenderConfig) error {
	// Load federal policy
	if config.Federal.Name != "" {
		err := s.loadPolicy(ctx, nil, &config.Federal, &config.DefaultOperations)
		if err != nil {
			return fmt.Errorf("failed to load federal policy: %w", err)
		}
	}
	// Load Bundesland policies
	for code, policyConfig := range config.Bundeslaender {
		if code == "federal" || code == "default_operations" || code == "pii_rules" {
			continue
		}
		bl := Bundesland(code)
		err := s.loadPolicy(ctx, &bl, &policyConfig, &config.DefaultOperations)
		if err != nil {
			return fmt.Errorf("failed to load policy for %s: %w", code, err)
		}
	}
	// Load PII rules
	for _, ruleConfig := range config.PIIRules {
		err := s.loadPIIRule(ctx, &ruleConfig)
		if err != nil {
			return fmt.Errorf("failed to load PII rule %s: %w", ruleConfig.Name, err)
		}
	}
	return nil
}
// loadPolicy creates one policy and its allowed sources from YAML config.
//
// TrustBoost defaults to 0.5 when the configured value is <= 0; empty
// legal-basis / citation-template strings are stored as NULL.
// NOTE(review): the ops parameter is unused here — default operation
// permissions come from CreateSource; confirm this is intentional.
func (s *Store) loadPolicy(ctx context.Context, bundesland *Bundesland, config *PolicyConfig, ops *OperationsConfig) error {
	// Create policy
	policy, err := s.CreatePolicy(ctx, &CreateSourcePolicyRequest{
		Name:       config.Name,
		Bundesland: bundesland,
	})
	if err != nil {
		return err
	}
	// Create sources
	for _, srcConfig := range config.Sources {
		trustBoost := 0.5
		if srcConfig.TrustBoost > 0 {
			trustBoost = srcConfig.TrustBoost
		}
		var legalBasis, citation *string
		if srcConfig.LegalBasis != "" {
			legalBasis = &srcConfig.LegalBasis
		}
		if srcConfig.CitationTemplate != "" {
			citation = &srcConfig.CitationTemplate
		}
		_, err := s.CreateSource(ctx, &CreateAllowedSourceRequest{
			PolicyID:         policy.ID,
			Domain:           srcConfig.Domain,
			Name:             srcConfig.Name,
			License:          License(srcConfig.License),
			LegalBasis:       legalBasis,
			CitationTemplate: citation,
			TrustBoost:       &trustBoost,
		})
		if err != nil {
			return fmt.Errorf("failed to create source %s: %w", srcConfig.Domain, err)
		}
	}
	return nil
}
// loadPIIRule creates a PII rule from YAML config.
// Severity defaults to PIISeverityBlock when the config leaves it empty.
func (s *Store) loadPIIRule(ctx context.Context, config *PIIRuleConfig) error {
	severity := PIISeverityBlock
	if config.Severity != "" {
		severity = PIISeverity(config.Severity)
	}
	_, err := s.CreatePIIRule(ctx, &CreatePIIRuleRequest{
		Name:     config.Name,
		RuleType: PIIRuleType(config.Type),
		Pattern:  config.Pattern,
		Severity: severity,
	})
	return err
}
// ToJSON converts an entity to JSON for audit logging.
// Marshal failures are deliberately swallowed: this is best-effort
// support for logging, and the result is nil when v cannot be marshaled.
func ToJSON(v interface{}) json.RawMessage {
	raw, err := json.Marshal(v)
	if err != nil {
		return nil
	}
	return raw
}

View File

@@ -0,0 +1,417 @@
package policy
import (
	"context"
	"errors"
	"fmt"
	"time"

	"github.com/google/uuid"
	"github.com/jackc/pgx/v5"
)
// =============================================================================
// ALLOWED SOURCES
// =============================================================================
// CreateSource creates a new allowed source.
//
// TrustBoost defaults to 0.5 when not supplied. The new source starts
// active, and a default set of operation permissions is created for it
// (training disabled, everything else allowed with citation).
func (s *Store) CreateSource(ctx context.Context, req *CreateAllowedSourceRequest) (*AllowedSource, error) {
	trustBoost := 0.5
	if req.TrustBoost != nil {
		trustBoost = *req.TrustBoost
	}
	now := time.Now()
	source := &AllowedSource{
		ID:               uuid.New(),
		PolicyID:         req.PolicyID,
		Domain:           req.Domain,
		Name:             req.Name,
		Description:      req.Description,
		License:          req.License,
		LegalBasis:       req.LegalBasis,
		CitationTemplate: req.CitationTemplate,
		TrustBoost:       trustBoost,
		IsActive:         true,
		CreatedAt:        now,
		UpdatedAt:        now,
	}
	query := `
	INSERT INTO allowed_sources (id, policy_id, domain, name, description, license,
	legal_basis, citation_template, trust_boost, is_active,
	created_at, updated_at)
	VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12)`
	// The ID is generated client-side, so a RETURNING clause plus an
	// extra scan round-trip is unnecessary; a plain Exec suffices.
	_, err := s.pool.Exec(ctx, query,
		source.ID, source.PolicyID, source.Domain, source.Name, source.Description,
		source.License, source.LegalBasis, source.CitationTemplate, source.TrustBoost,
		source.IsActive, source.CreatedAt, source.UpdatedAt,
	)
	if err != nil {
		return nil, fmt.Errorf("failed to create source: %w", err)
	}
	// Create default operation permissions.
	// NOTE(review): not transactional with the INSERT above; a failure
	// here leaves a source without permissions — consider wrapping in a tx.
	if err := s.createDefaultOperations(ctx, source.ID); err != nil {
		return nil, fmt.Errorf("failed to create default operations: %w", err)
	}
	return source, nil
}
// createDefaultOperations creates default operation permissions for a source.
//
// Defaults: lookup, RAG and export are allowed and require citation;
// training is explicitly forbidden (VERBOTEN) by default. One row is
// inserted per operation.
func (s *Store) createDefaultOperations(ctx context.Context, sourceID uuid.UUID) error {
	defaults := []struct {
		op       Operation
		allowed  bool
		citation bool
	}{
		{OperationLookup, true, true},
		{OperationRAG, true, true},
		{OperationTraining, false, false}, // VERBOTEN by default
		{OperationExport, true, true},
	}
	for _, d := range defaults {
		query := `
		INSERT INTO operation_permissions (id, source_id, operation, is_allowed, requires_citation, created_at, updated_at)
		VALUES ($1, $2, $3, $4, $5, $6, $7)`
		_, err := s.pool.Exec(ctx, query, uuid.New(), sourceID, d.op, d.allowed, d.citation, time.Now(), time.Now())
		if err != nil {
			return err
		}
	}
	return nil
}
// GetSource retrieves a source by ID, including its policy name and
// operation permissions. Returns (nil, nil) when no such source exists.
func (s *Store) GetSource(ctx context.Context, id uuid.UUID) (*AllowedSource, error) {
	query := `
	SELECT als.id, als.policy_id, als.domain, als.name, als.description, als.license,
	als.legal_basis, als.citation_template, als.trust_boost, als.is_active,
	als.created_at, als.updated_at, sp.name as policy_name
	FROM allowed_sources als
	JOIN source_policies sp ON als.policy_id = sp.id
	WHERE als.id = $1`
	source := &AllowedSource{}
	err := s.pool.QueryRow(ctx, query, id).Scan(
		&source.ID, &source.PolicyID, &source.Domain, &source.Name, &source.Description,
		&source.License, &source.LegalBasis, &source.CitationTemplate, &source.TrustBoost,
		&source.IsActive, &source.CreatedAt, &source.UpdatedAt, &source.PolicyName,
	)
	// errors.Is also matches wrapped errors, which a plain == misses.
	if errors.Is(err, pgx.ErrNoRows) {
		return nil, nil
	}
	if err != nil {
		return nil, fmt.Errorf("failed to get source: %w", err)
	}
	// Attach the source's operation permissions.
	ops, err := s.GetOperationsBySourceID(ctx, source.ID)
	if err != nil {
		return nil, err
	}
	source.Operations = ops
	return source, nil
}
// GetSourceByDomain retrieves a source by domain with optional bundesland filter.
//
// Matches the exact domain or any subdomain of it ($1 LIKE '%.' || domain).
// Policies with a NULL bundesland (federal) always match; otherwise the
// policy's bundesland must equal the given one. Only active sources under
// active policies are considered. Returns (nil, nil) when nothing matches.
func (s *Store) GetSourceByDomain(ctx context.Context, domain string, bundesland *Bundesland) (*AllowedSource, error) {
	query := `
	SELECT als.id, als.policy_id, als.domain, als.name, als.description, als.license,
	als.legal_basis, als.citation_template, als.trust_boost, als.is_active,
	als.created_at, als.updated_at
	FROM allowed_sources als
	JOIN source_policies sp ON als.policy_id = sp.id
	WHERE als.is_active = true
	AND sp.is_active = true
	AND (als.domain = $1 OR $1 LIKE '%.' || als.domain)
	AND (sp.bundesland IS NULL OR sp.bundesland = $2)
	LIMIT 1`
	source := &AllowedSource{}
	err := s.pool.QueryRow(ctx, query, domain, bundesland).Scan(
		&source.ID, &source.PolicyID, &source.Domain, &source.Name, &source.Description,
		&source.License, &source.LegalBasis, &source.CitationTemplate, &source.TrustBoost,
		&source.IsActive, &source.CreatedAt, &source.UpdatedAt,
	)
	// errors.Is also matches wrapped errors, which a plain == misses.
	if errors.Is(err, pgx.ErrNoRows) {
		return nil, nil
	}
	if err != nil {
		return nil, fmt.Errorf("failed to get source by domain: %w", err)
	}
	// Attach the source's operation permissions.
	ops, err := s.GetOperationsBySourceID(ctx, source.ID)
	if err != nil {
		return nil, err
	}
	source.Operations = ops
	return source, nil
}
// ListSources retrieves sources with optional filters.
//
// All provided filters (policy ID, domain substring, license, active flag)
// are combined with AND. The total count is computed over the filtered set
// before Limit/Offset are applied, so callers can paginate. Returns the page
// of sources, the total matching count, and any error.
func (s *Store) ListSources(ctx context.Context, filter *SourceListFilter) ([]AllowedSource, int, error) {
	// Shared FROM/WHERE fragment used by both the count and the data query.
	baseQuery := `FROM allowed_sources als JOIN source_policies sp ON als.policy_id = sp.id WHERE 1=1`
	args := []interface{}{}
	argCount := 0
	if filter.PolicyID != nil {
		argCount++
		baseQuery += fmt.Sprintf(" AND als.policy_id = $%d", argCount)
		args = append(args, *filter.PolicyID)
	}
	if filter.Domain != nil {
		argCount++
		baseQuery += fmt.Sprintf(" AND als.domain ILIKE $%d", argCount)
		args = append(args, "%"+*filter.Domain+"%")
	}
	if filter.License != nil {
		argCount++
		baseQuery += fmt.Sprintf(" AND als.license = $%d", argCount)
		args = append(args, *filter.License)
	}
	if filter.IsActive != nil {
		argCount++
		baseQuery += fmt.Sprintf(" AND als.is_active = $%d", argCount)
		args = append(args, *filter.IsActive)
	}
	// Count query — runs before the LIMIT/OFFSET args are appended, so it
	// sees only the filter placeholders.
	var total int
	countQuery := "SELECT COUNT(*) " + baseQuery
	err := s.pool.QueryRow(ctx, countQuery, args...).Scan(&total)
	if err != nil {
		return nil, 0, fmt.Errorf("failed to count sources: %w", err)
	}
	// Data query
	dataQuery := `SELECT als.id, als.policy_id, als.domain, als.name, als.description, als.license,
		als.legal_basis, als.citation_template, als.trust_boost, als.is_active,
		als.created_at, als.updated_at, sp.name as policy_name ` + baseQuery +
		` ORDER BY als.created_at DESC`
	if filter.Limit > 0 {
		argCount++
		dataQuery += fmt.Sprintf(" LIMIT $%d", argCount)
		args = append(args, filter.Limit)
	}
	if filter.Offset > 0 {
		argCount++
		dataQuery += fmt.Sprintf(" OFFSET $%d", argCount)
		args = append(args, filter.Offset)
	}
	rows, err := s.pool.Query(ctx, dataQuery, args...)
	if err != nil {
		return nil, 0, fmt.Errorf("failed to list sources: %w", err)
	}
	defer rows.Close()
	sources := []AllowedSource{}
	for rows.Next() {
		var src AllowedSource
		err := rows.Scan(
			&src.ID, &src.PolicyID, &src.Domain, &src.Name, &src.Description,
			&src.License, &src.LegalBasis, &src.CitationTemplate, &src.TrustBoost,
			&src.IsActive, &src.CreatedAt, &src.UpdatedAt, &src.PolicyName,
		)
		if err != nil {
			return nil, 0, fmt.Errorf("failed to scan source: %w", err)
		}
		sources = append(sources, src)
	}
	// Fix: surface errors that terminated iteration early (e.g. a dropped
	// connection); previously a truncated result set was returned as success.
	if err := rows.Err(); err != nil {
		return nil, 0, fmt.Errorf("failed to read sources: %w", err)
	}
	return sources, total, nil
}
// UpdateSource updates an existing source.
//
// Read-modify-write: the current row is loaded via GetSource, the non-nil
// fields of req are applied on top, and the full row is written back with a
// fresh updated_at. Returns an error when the source does not exist.
// NOTE(review): the read and write are not wrapped in a transaction, so two
// concurrent updates can interleave (last write wins) — confirm acceptable.
func (s *Store) UpdateSource(ctx context.Context, id uuid.UUID, req *UpdateAllowedSourceRequest) (*AllowedSource, error) {
	source, err := s.GetSource(ctx, id)
	if err != nil {
		return nil, err
	}
	if source == nil {
		return nil, fmt.Errorf("source not found")
	}
	// Partial update: only fields present (non-nil) in the request change.
	// Pointer-typed destination fields (Description, LegalBasis,
	// CitationTemplate) take the request pointer directly; value fields are
	// dereferenced.
	if req.Domain != nil {
		source.Domain = *req.Domain
	}
	if req.Name != nil {
		source.Name = *req.Name
	}
	if req.Description != nil {
		source.Description = req.Description
	}
	if req.License != nil {
		source.License = *req.License
	}
	if req.LegalBasis != nil {
		source.LegalBasis = req.LegalBasis
	}
	if req.CitationTemplate != nil {
		source.CitationTemplate = req.CitationTemplate
	}
	if req.TrustBoost != nil {
		source.TrustBoost = *req.TrustBoost
	}
	if req.IsActive != nil {
		source.IsActive = *req.IsActive
	}
	source.UpdatedAt = time.Now()
	query := `
		UPDATE allowed_sources
		SET domain = $2, name = $3, description = $4, license = $5, legal_basis = $6,
		citation_template = $7, trust_boost = $8, is_active = $9, updated_at = $10
		WHERE id = $1`
	_, err = s.pool.Exec(ctx, query,
		id, source.Domain, source.Name, source.Description, source.License,
		source.LegalBasis, source.CitationTemplate, source.TrustBoost,
		source.IsActive, source.UpdatedAt,
	)
	if err != nil {
		return nil, fmt.Errorf("failed to update source: %w", err)
	}
	// Return the merged in-memory row (Operations already loaded by GetSource).
	return source, nil
}
// DeleteSource removes the allowed source with the given ID.
// Deleting a nonexistent ID is not treated as an error.
func (s *Store) DeleteSource(ctx context.Context, id uuid.UUID) error {
	if _, err := s.pool.Exec(ctx, `DELETE FROM allowed_sources WHERE id = $1`, id); err != nil {
		return fmt.Errorf("failed to delete source: %w", err)
	}
	return nil
}
// =============================================================================
// OPERATION PERMISSIONS
// =============================================================================
// GetOperationsBySourceID retrieves all operation permissions for a source,
// ordered by operation name. Returns an empty (non-nil) slice when the
// source has no permissions.
func (s *Store) GetOperationsBySourceID(ctx context.Context, sourceID uuid.UUID) ([]OperationPermission, error) {
	query := `
		SELECT id, source_id, operation, is_allowed, requires_citation, notes, created_at, updated_at
		FROM operation_permissions
		WHERE source_id = $1
		ORDER BY operation`
	rows, err := s.pool.Query(ctx, query, sourceID)
	if err != nil {
		return nil, fmt.Errorf("failed to get operations: %w", err)
	}
	defer rows.Close()
	ops := []OperationPermission{}
	for rows.Next() {
		var op OperationPermission
		// Scan order mirrors the SELECT column order.
		err := rows.Scan(
			&op.ID, &op.SourceID, &op.Operation, &op.IsAllowed,
			&op.RequiresCitation, &op.Notes, &op.CreatedAt, &op.UpdatedAt,
		)
		if err != nil {
			return nil, fmt.Errorf("failed to scan operation: %w", err)
		}
		ops = append(ops, op)
	}
	// Fix: check for iteration errors so a partially-read result set is not
	// silently returned as complete.
	if err := rows.Err(); err != nil {
		return nil, fmt.Errorf("failed to read operations: %w", err)
	}
	return ops, nil
}
// UpdateOperationPermission updates an operation permission.
//
// Read-modify-write: the current row is loaded, non-nil request fields are
// applied on top, and the row is written back with a fresh updated_at.
// Unlike the source getters, a missing row here is an error, not (nil, nil).
func (s *Store) UpdateOperationPermission(ctx context.Context, id uuid.UUID, req *UpdateOperationPermissionRequest) (*OperationPermission, error) {
	query := `SELECT id, source_id, operation, is_allowed, requires_citation, notes, created_at, updated_at
		FROM operation_permissions WHERE id = $1`
	op := &OperationPermission{}
	// Scan order mirrors the SELECT column order.
	err := s.pool.QueryRow(ctx, query, id).Scan(
		&op.ID, &op.SourceID, &op.Operation, &op.IsAllowed,
		&op.RequiresCitation, &op.Notes, &op.CreatedAt, &op.UpdatedAt,
	)
	if err == pgx.ErrNoRows {
		return nil, fmt.Errorf("operation permission not found")
	}
	if err != nil {
		return nil, fmt.Errorf("failed to get operation: %w", err)
	}
	// Partial update: only fields present (non-nil) in the request change.
	// Notes is pointer-typed and takes the request pointer directly.
	if req.IsAllowed != nil {
		op.IsAllowed = *req.IsAllowed
	}
	if req.RequiresCitation != nil {
		op.RequiresCitation = *req.RequiresCitation
	}
	if req.Notes != nil {
		op.Notes = req.Notes
	}
	op.UpdatedAt = time.Now()
	updateQuery := `
		UPDATE operation_permissions
		SET is_allowed = $2, requires_citation = $3, notes = $4, updated_at = $5
		WHERE id = $1`
	_, err = s.pool.Exec(ctx, updateQuery, id, op.IsAllowed, op.RequiresCitation, op.Notes, op.UpdatedAt)
	if err != nil {
		return nil, fmt.Errorf("failed to update operation: %w", err)
	}
	return op, nil
}
// GetOperationsMatrix retrieves all operation permissions grouped by source.
// Only active sources under active policies are included, ordered with
// nationwide (NULL-bundesland) policies first, then by source name.
func (s *Store) GetOperationsMatrix(ctx context.Context) ([]AllowedSource, error) {
	query := `
		SELECT als.id, als.domain, als.name, als.license, als.is_active,
		sp.name as policy_name, sp.bundesland
		FROM allowed_sources als
		JOIN source_policies sp ON als.policy_id = sp.id
		WHERE als.is_active = true AND sp.is_active = true
		ORDER BY sp.bundesland NULLS FIRST, als.name`
	rows, err := s.pool.Query(ctx, query)
	if err != nil {
		return nil, fmt.Errorf("failed to get operations matrix: %w", err)
	}
	defer rows.Close()
	sources := []AllowedSource{}
	for rows.Next() {
		var src AllowedSource
		// bundesland is selected only for the ORDER BY grouping; AllowedSource
		// has no field for it here, so it is scanned and discarded.
		var bundesland *Bundesland
		if err := rows.Scan(
			&src.ID, &src.Domain, &src.Name, &src.License, &src.IsActive,
			&src.PolicyName, &bundesland,
		); err != nil {
			return nil, fmt.Errorf("failed to scan source: %w", err)
		}
		sources = append(sources, src)
	}
	// Fix: detect errors that aborted iteration instead of returning a
	// silently truncated matrix.
	if err := rows.Err(); err != nil {
		return nil, fmt.Errorf("failed to read operations matrix: %w", err)
	}
	rows.Close()
	// Fix: load operations only after the outer result set is fully consumed
	// and closed, so we do not hold one pooled connection open while issuing
	// a nested query per source (avoids pool exhaustion with small pools).
	for i := range sources {
		ops, err := s.GetOperationsBySourceID(ctx, sources[i].ID)
		if err != nil {
			return nil, err
		}
		sources[i].Operations = ops
	}
	return sources, nil
}

View File

@@ -214,355 +214,6 @@ func (s *Service) Search(ctx context.Context, req *SearchRequest) (*SearchRespon
}, nil
}
// buildQuery constructs the OpenSearch query for keyword (BM25) search:
// a multi_match over title/content_text, the shared metadata filters, and a
// function_score wrapper that multiplies relevance by sqrt(trust_score) and
// sqrt(quality_score).
func (s *Service) buildQuery(req *SearchRequest) map[string]interface{} {
	// Text search clause; skipped for an empty query so the bool query
	// degenerates to match-all plus filters.
	must := []map[string]interface{}{}
	if req.Query != "" {
		must = append(must, map[string]interface{}{
			"multi_match": map[string]interface{}{
				"query":  req.Query,
				"fields": []string{"title^3", "content_text"},
				"type":   "best_fields",
			},
		})
	}
	// Fix (consistency): delegate to the shared buildFilters helper instead
	// of duplicating the nine filter clauses inline. buildSemanticQuery and
	// buildHybridQuery already use it; the produced clauses are identical.
	filter := s.buildFilters(req)
	// Build bool query — omit empty sections so OpenSearch sees a clean body.
	boolQuery := map[string]interface{}{}
	if len(must) > 0 {
		boolQuery["must"] = must
	}
	if len(filter) > 0 {
		boolQuery["filter"] = filter
	}
	// Construct full query with pagination and a trimmed _source list.
	query := map[string]interface{}{
		"query": map[string]interface{}{
			"bool": boolQuery,
		},
		"from": req.Offset,
		"size": req.Limit,
		"_source": []string{
			"doc_id", "title", "url", "domain", "language",
			"doc_type", "school_level", "subjects",
			"trust_score", "quality_score", "snippet_text",
		},
	}
	// Add highlighting if requested.
	if req.Include.Highlights {
		query["highlight"] = map[string]interface{}{
			"fields": map[string]interface{}{
				"title":        map[string]interface{}{},
				"content_text": map[string]interface{}{"fragment_size": 150, "number_of_fragments": 3},
			},
		}
	}
	// Re-wrap the query in a function_score for trust/quality boosting:
	// final score = BM25 * sqrt(trust) * sqrt(quality), missing values
	// defaulting to 0.5.
	query["query"] = map[string]interface{}{
		"function_score": map[string]interface{}{
			"query": query["query"],
			"functions": []map[string]interface{}{
				{
					"field_value_factor": map[string]interface{}{
						"field":    "trust_score",
						"factor":   1.5,
						"modifier": "sqrt",
						"missing":  0.5,
					},
				},
				{
					"field_value_factor": map[string]interface{}{
						"field":    "quality_score",
						"factor":   1.0,
						"modifier": "sqrt",
						"missing":  0.5,
					},
				},
			},
			"score_mode": "multiply",
			"boost_mode": "multiply",
		},
	}
	return query
}
// buildSemanticQuery constructs a pure vector search query using k-NN.
// The query embedding is matched against content_embedding; metadata filters
// (when present) are pushed into the k-NN clause so they constrain the
// neighbor search itself.
func (s *Service) buildSemanticQuery(req *SearchRequest, embedding []float32) map[string]interface{} {
	inner := map[string]interface{}{
		"vector": embedding,
		"k":      req.Limit + req.Offset, // fetch enough neighbors to honor pagination
	}
	if filter := s.buildFilters(req); len(filter) > 0 {
		inner["filter"] = map[string]interface{}{
			"bool": map[string]interface{}{
				"filter": filter,
			},
		}
	}
	query := map[string]interface{}{
		"knn": map[string]interface{}{
			"content_embedding": inner,
		},
		"from": req.Offset,
		"size": req.Limit,
		"_source": []string{
			"doc_id", "title", "url", "domain", "language",
			"doc_type", "school_level", "subjects",
			"trust_score", "quality_score", "snippet_text",
		},
	}
	if req.Include.Highlights {
		query["highlight"] = map[string]interface{}{
			"fields": map[string]interface{}{
				"title":        map[string]interface{}{},
				"content_text": map[string]interface{}{"fragment_size": 150, "number_of_fragments": 3},
			},
		}
	}
	return query
}
// buildHybridQuery constructs a combined BM25 + vector search query.
// BM25 relevance and cosine similarity to the query embedding are blended in
// one script_score clause — simpler than OpenSearch's neural-search plugin.
func (s *Service) buildHybridQuery(req *SearchRequest, embedding []float32) map[string]interface{} {
	boolQuery := map[string]interface{}{}
	if req.Query != "" {
		boolQuery["must"] = []map[string]interface{}{
			{
				"multi_match": map[string]interface{}{
					"query":  req.Query,
					"fields": []string{"title^3", "content_text"},
					"type":   "best_fields",
				},
			},
		}
	}
	if filter := s.buildFilters(req); len(filter) > 0 {
		boolQuery["filter"] = filter
	}
	// The JSON encoder needs []interface{}; convert the float32 vector.
	vector := make([]interface{}, len(embedding))
	for i, component := range embedding {
		vector[i] = component
	}
	query := map[string]interface{}{
		"query": map[string]interface{}{
			"script_score": map[string]interface{}{
				"query": map[string]interface{}{
					"bool": boolQuery,
				},
				"script": map[string]interface{}{
					"source": "cosineSimilarity(params.query_vector, 'content_embedding') + 1.0 + _score * 0.5",
					"params": map[string]interface{}{
						"query_vector": vector,
					},
				},
			},
		},
		"from": req.Offset,
		"size": req.Limit,
		"_source": []string{
			"doc_id", "title", "url", "domain", "language",
			"doc_type", "school_level", "subjects",
			"trust_score", "quality_score", "snippet_text",
		},
	}
	if req.Include.Highlights {
		query["highlight"] = map[string]interface{}{
			"fields": map[string]interface{}{
				"title":        map[string]interface{}{},
				"content_text": map[string]interface{}{"fragment_size": 150, "number_of_fragments": 3},
			},
		}
	}
	return query
}
// buildFilters translates the request's metadata filters into OpenSearch
// filter clauses. Unset filters (empty slices, zero score, empty date)
// contribute no clause; the returned slice is empty — never nil-dereferenced —
// when nothing is set.
func (s *Service) buildFilters(req *SearchRequest) []map[string]interface{} {
	filter := []map[string]interface{}{}
	// terms builds an exact-match clause over a list of allowed values.
	terms := func(field string, values interface{}) map[string]interface{} {
		return map[string]interface{}{
			"terms": map[string]interface{}{field: values},
		}
	}
	// rangeGte builds a lower-bound clause.
	rangeGte := func(field string, bound interface{}) map[string]interface{} {
		return map[string]interface{}{
			"range": map[string]interface{}{
				field: map[string]interface{}{"gte": bound},
			},
		}
	}
	if len(req.Filters.Language) > 0 {
		filter = append(filter, terms("language", req.Filters.Language))
	}
	if len(req.Filters.CountryHint) > 0 {
		filter = append(filter, terms("country_hint", req.Filters.CountryHint))
	}
	if len(req.Filters.SourceCategory) > 0 {
		filter = append(filter, terms("source_category", req.Filters.SourceCategory))
	}
	if len(req.Filters.DocType) > 0 {
		filter = append(filter, terms("doc_type", req.Filters.DocType))
	}
	if len(req.Filters.SchoolLevel) > 0 {
		filter = append(filter, terms("school_level", req.Filters.SchoolLevel))
	}
	if len(req.Filters.Subjects) > 0 {
		filter = append(filter, terms("subjects", req.Filters.Subjects))
	}
	if len(req.Filters.State) > 0 {
		filter = append(filter, terms("state", req.Filters.State))
	}
	if req.Filters.MinTrustScore > 0 {
		filter = append(filter, rangeGte("trust_score", req.Filters.MinTrustScore))
	}
	if req.Filters.DateFrom != "" {
		filter = append(filter, rangeGte("fetch_time", req.Filters.DateFrom))
	}
	return filter
}
// hitToResult maps a raw OpenSearch hit's _source (plus relevance score and
// optional highlight fragments) onto a SearchResult. Snippet and highlight
// fields are populated only when requested via include.
func (s *Service) hitToResult(source map[string]interface{}, score float64, highlight map[string][]string, include SearchInclude) SearchResult {
	scores := Scores{
		BM25:    score,
		Trust:   getFloat(source, "trust_score"),
		Quality: getFloat(source, "quality_score"),
		Final:   score, // MVP: final = BM25 * trust * quality (via function_score)
	}
	res := SearchResult{
		DocID:       getString(source, "doc_id"),
		Title:       getString(source, "title"),
		URL:         getString(source, "url"),
		Domain:      getString(source, "domain"),
		Language:    getString(source, "language"),
		DocType:     getString(source, "doc_type"),
		SchoolLevel: getString(source, "school_level"),
		Subjects:    getStringArray(source, "subjects"),
		Scores:      scores,
	}
	if include.Snippets {
		res.Snippet = getString(source, "snippet_text")
	}
	if include.Highlights {
		// Indexing a nil map is safe: ok is simply false.
		if frags, ok := highlight["content_text"]; ok {
			res.Highlights = frags
		}
	}
	return res
}
// Helper functions
func getString(m map[string]interface{}, key string) string {
if v, ok := m[key].(string); ok {

View File

@@ -0,0 +1,350 @@
package search
// buildQuery constructs the OpenSearch query for keyword (BM25) search:
// a multi_match over title/content_text, the shared metadata filters, and a
// function_score wrapper that multiplies relevance by sqrt(trust_score) and
// sqrt(quality_score).
func (s *Service) buildQuery(req *SearchRequest) map[string]interface{} {
	// Text search clause; skipped for an empty query so the bool query
	// degenerates to match-all plus filters.
	must := []map[string]interface{}{}
	if req.Query != "" {
		must = append(must, map[string]interface{}{
			"multi_match": map[string]interface{}{
				"query":  req.Query,
				"fields": []string{"title^3", "content_text"},
				"type":   "best_fields",
			},
		})
	}
	// Fix (consistency): delegate to the shared buildFilters helper instead
	// of duplicating the nine filter clauses inline. buildSemanticQuery and
	// buildHybridQuery already use it; the produced clauses are identical.
	filter := s.buildFilters(req)
	// Build bool query — omit empty sections so OpenSearch sees a clean body.
	boolQuery := map[string]interface{}{}
	if len(must) > 0 {
		boolQuery["must"] = must
	}
	if len(filter) > 0 {
		boolQuery["filter"] = filter
	}
	// Construct full query with pagination and a trimmed _source list.
	query := map[string]interface{}{
		"query": map[string]interface{}{
			"bool": boolQuery,
		},
		"from": req.Offset,
		"size": req.Limit,
		"_source": []string{
			"doc_id", "title", "url", "domain", "language",
			"doc_type", "school_level", "subjects",
			"trust_score", "quality_score", "snippet_text",
		},
	}
	// Add highlighting if requested.
	if req.Include.Highlights {
		query["highlight"] = map[string]interface{}{
			"fields": map[string]interface{}{
				"title":        map[string]interface{}{},
				"content_text": map[string]interface{}{"fragment_size": 150, "number_of_fragments": 3},
			},
		}
	}
	// Re-wrap the query in a function_score for trust/quality boosting:
	// final score = BM25 * sqrt(trust) * sqrt(quality), missing values
	// defaulting to 0.5.
	query["query"] = map[string]interface{}{
		"function_score": map[string]interface{}{
			"query": query["query"],
			"functions": []map[string]interface{}{
				{
					"field_value_factor": map[string]interface{}{
						"field":    "trust_score",
						"factor":   1.5,
						"modifier": "sqrt",
						"missing":  0.5,
					},
				},
				{
					"field_value_factor": map[string]interface{}{
						"field":    "quality_score",
						"factor":   1.0,
						"modifier": "sqrt",
						"missing":  0.5,
					},
				},
			},
			"score_mode": "multiply",
			"boost_mode": "multiply",
		},
	}
	return query
}
// buildSemanticQuery constructs a pure vector search query using k-NN.
// The query embedding is matched against content_embedding; metadata filters
// (when present) are pushed into the k-NN clause so they constrain the
// neighbor search itself.
func (s *Service) buildSemanticQuery(req *SearchRequest, embedding []float32) map[string]interface{} {
	inner := map[string]interface{}{
		"vector": embedding,
		"k":      req.Limit + req.Offset, // fetch enough neighbors to honor pagination
	}
	if filter := s.buildFilters(req); len(filter) > 0 {
		inner["filter"] = map[string]interface{}{
			"bool": map[string]interface{}{
				"filter": filter,
			},
		}
	}
	query := map[string]interface{}{
		"knn": map[string]interface{}{
			"content_embedding": inner,
		},
		"from": req.Offset,
		"size": req.Limit,
		"_source": []string{
			"doc_id", "title", "url", "domain", "language",
			"doc_type", "school_level", "subjects",
			"trust_score", "quality_score", "snippet_text",
		},
	}
	if req.Include.Highlights {
		query["highlight"] = map[string]interface{}{
			"fields": map[string]interface{}{
				"title":        map[string]interface{}{},
				"content_text": map[string]interface{}{"fragment_size": 150, "number_of_fragments": 3},
			},
		}
	}
	return query
}
// buildHybridQuery constructs a combined BM25 + vector search query.
// BM25 relevance and cosine similarity to the query embedding are blended in
// one script_score clause — simpler than OpenSearch's neural-search plugin.
func (s *Service) buildHybridQuery(req *SearchRequest, embedding []float32) map[string]interface{} {
	boolQuery := map[string]interface{}{}
	if req.Query != "" {
		boolQuery["must"] = []map[string]interface{}{
			{
				"multi_match": map[string]interface{}{
					"query":  req.Query,
					"fields": []string{"title^3", "content_text"},
					"type":   "best_fields",
				},
			},
		}
	}
	if filter := s.buildFilters(req); len(filter) > 0 {
		boolQuery["filter"] = filter
	}
	// The JSON encoder needs []interface{}; convert the float32 vector.
	vector := make([]interface{}, len(embedding))
	for i, component := range embedding {
		vector[i] = component
	}
	query := map[string]interface{}{
		"query": map[string]interface{}{
			"script_score": map[string]interface{}{
				"query": map[string]interface{}{
					"bool": boolQuery,
				},
				"script": map[string]interface{}{
					"source": "cosineSimilarity(params.query_vector, 'content_embedding') + 1.0 + _score * 0.5",
					"params": map[string]interface{}{
						"query_vector": vector,
					},
				},
			},
		},
		"from": req.Offset,
		"size": req.Limit,
		"_source": []string{
			"doc_id", "title", "url", "domain", "language",
			"doc_type", "school_level", "subjects",
			"trust_score", "quality_score", "snippet_text",
		},
	}
	if req.Include.Highlights {
		query["highlight"] = map[string]interface{}{
			"fields": map[string]interface{}{
				"title":        map[string]interface{}{},
				"content_text": map[string]interface{}{"fragment_size": 150, "number_of_fragments": 3},
			},
		}
	}
	return query
}
// buildFilters translates the request's metadata filters into OpenSearch
// filter clauses. Unset filters (empty slices, zero score, empty date)
// contribute no clause; the returned slice is empty when nothing is set.
func (s *Service) buildFilters(req *SearchRequest) []map[string]interface{} {
	filter := []map[string]interface{}{}
	// terms builds an exact-match clause over a list of allowed values.
	terms := func(field string, values interface{}) map[string]interface{} {
		return map[string]interface{}{
			"terms": map[string]interface{}{field: values},
		}
	}
	// rangeGte builds a lower-bound clause.
	rangeGte := func(field string, bound interface{}) map[string]interface{} {
		return map[string]interface{}{
			"range": map[string]interface{}{
				field: map[string]interface{}{"gte": bound},
			},
		}
	}
	if len(req.Filters.Language) > 0 {
		filter = append(filter, terms("language", req.Filters.Language))
	}
	if len(req.Filters.CountryHint) > 0 {
		filter = append(filter, terms("country_hint", req.Filters.CountryHint))
	}
	if len(req.Filters.SourceCategory) > 0 {
		filter = append(filter, terms("source_category", req.Filters.SourceCategory))
	}
	if len(req.Filters.DocType) > 0 {
		filter = append(filter, terms("doc_type", req.Filters.DocType))
	}
	if len(req.Filters.SchoolLevel) > 0 {
		filter = append(filter, terms("school_level", req.Filters.SchoolLevel))
	}
	if len(req.Filters.Subjects) > 0 {
		filter = append(filter, terms("subjects", req.Filters.Subjects))
	}
	if len(req.Filters.State) > 0 {
		filter = append(filter, terms("state", req.Filters.State))
	}
	if req.Filters.MinTrustScore > 0 {
		filter = append(filter, rangeGte("trust_score", req.Filters.MinTrustScore))
	}
	if req.Filters.DateFrom != "" {
		filter = append(filter, rangeGte("fetch_time", req.Filters.DateFrom))
	}
	return filter
}
// hitToResult maps a raw OpenSearch hit's _source (plus relevance score and
// optional highlight fragments) onto a SearchResult. Snippet and highlight
// fields are populated only when requested via include.
func (s *Service) hitToResult(source map[string]interface{}, score float64, highlight map[string][]string, include SearchInclude) SearchResult {
	scores := Scores{
		BM25:    score,
		Trust:   getFloat(source, "trust_score"),
		Quality: getFloat(source, "quality_score"),
		Final:   score, // MVP: final = BM25 * trust * quality (via function_score)
	}
	res := SearchResult{
		DocID:       getString(source, "doc_id"),
		Title:       getString(source, "title"),
		URL:         getString(source, "url"),
		Domain:      getString(source, "domain"),
		Language:    getString(source, "language"),
		DocType:     getString(source, "doc_type"),
		SchoolLevel: getString(source, "school_level"),
		Subjects:    getStringArray(source, "subjects"),
		Scores:      scores,
	}
	if include.Snippets {
		res.Snippet = getString(source, "snippet_text")
	}
	if include.Highlights {
		// Indexing a nil map is safe: ok is simply false.
		if frags, ok := highlight["content_text"]; ok {
			res.Highlights = frags
		}
	}
	return res
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,247 @@
package staff
import (
"bytes"
"context"
"log"
"net/http"
"strings"
"github.com/PuerkitoBio/goquery"
"github.com/breakpilot/edu-search-service/internal/database"
)
// findStaffPages discovers staff listing pages on a university website.
//
// Discovery order: (1) a configured StaffPagePattern short-circuits all other
// strategies; (2) common German/English staff paths are probed against the
// base URL; (3) staff-looking links found on the landing page are added;
// (4) Uni-Oldenburg-specific department crawling runs for uol.de domains.
// The combined list is de-duplicated, preserving first occurrence.
func (c *StaffCrawler) findStaffPages(ctx context.Context, uni *database.University) ([]string, error) {
	var pages []string
	// Use custom pattern if available — it is treated as the single
	// authoritative staff page and nothing else is probed.
	if uni.StaffPagePattern != nil && *uni.StaffPagePattern != "" {
		pages = append(pages, *uni.StaffPagePattern)
		return pages, nil
	}
	// Try common patterns
	baseURL := strings.TrimSuffix(uni.URL, "/")
	commonPaths := []string{
		"/personen",
		"/team",
		"/mitarbeiter",
		"/mitarbeitende",
		"/staff",
		"/people",
		"/ueber-uns/team",
		"/about/team",
		"/fakultaet/personen",
		"/institute",
	}
	for _, path := range commonPaths {
		testURL := baseURL + path
		// Probe errors are deliberately ignored: an unreachable path is skipped.
		exists, err := c.checkPageExists(ctx, testURL)
		if err == nil && exists {
			pages = append(pages, testURL)
		}
	}
	// Also try to find staff links on the main page (best effort; errors ignored).
	mainPageLinks, err := c.findStaffLinksOnPage(ctx, baseURL)
	if err == nil {
		pages = append(pages, mainPageLinks...)
	}
	// UOL-specific: Find department/personen pages through navigation.
	// Check for both uol.de and uni-oldenburg.de (they are the same university)
	if strings.Contains(baseURL, "uol.de") || strings.Contains(baseURL, "uni-oldenburg.de") {
		log.Printf("[UOL] Detected Uni Oldenburg, using UOL-specific crawler for %s", baseURL)
		uolPages, err := c.findUOLDepartmentPages(ctx, baseURL)
		if err == nil {
			log.Printf("[UOL] Found %d department pages", len(uolPages))
			pages = append(pages, uolPages...)
		} else {
			log.Printf("[UOL] Error finding department pages: %v", err)
		}
	}
	// Deduplicate, keeping the order in which pages were first discovered.
	seen := make(map[string]bool)
	var unique []string
	for _, p := range pages {
		if !seen[p] {
			seen[p] = true
			unique = append(unique, p)
		}
	}
	return unique, nil
}
// findUOLDepartmentPages finds department person pages for Uni Oldenburg.
//
// Strategy: scan a set of known faculty landing pages for links that look
// like department URLs, derive a "/personen" sub-URL for each, merge in a
// hard-coded list of known department pages, then keep only the candidates
// that actually respond with HTTP 200. Always returns nil error.
func (c *StaffCrawler) findUOLDepartmentPages(ctx context.Context, baseURL string) ([]string, error) {
	var pages []string
	// UOL uses both uol.de and uni-oldenburg.de domains
	// Departments have /personen or /team subpages
	// Helper to check if URL is UOL-related (case-insensitive substring match).
	isUOLURL := func(url string) bool {
		lower := strings.ToLower(url)
		return strings.Contains(lower, "uol.de") || strings.Contains(lower, "uni-oldenburg.de")
	}
	// First try to find department links from known starting points
	startPages := []string{
		"https://uol.de/informatik/department/abteilungen-und-einrichtungen",
		"https://uol.de/fk2",
		"https://uol.de/fk1",
		"https://uol.de/fk3",
		"https://uol.de/fk4",
		"https://uol.de/fk5",
		"https://uol.de/fk6",
		baseURL,
	}
	// deptPaths acts as a set of candidate URLs (deduplicates across pages).
	deptPaths := make(map[string]bool)
	for _, startURL := range startPages {
		log.Printf("[UOL] Scanning start page: %s", startURL)
		body, err := c.fetchPage(ctx, startURL)
		if err != nil {
			log.Printf("[UOL] Error fetching %s: %v", startURL, err)
			continue
		}
		doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
		if err != nil {
			// Unparseable HTML: skip this start page silently.
			continue
		}
		// Find links to department pages (they typically have /personen subpages)
		doc.Find("a[href]").Each(func(i int, s *goquery.Selection) {
			href, exists := s.Attr("href")
			if !exists {
				return
			}
			// Look for department-like paths: absolute UOL links that are not
			// obviously non-department sections.
			// NOTE(review): relative hrefs fail isUOLURL and are skipped here —
			// confirm that department nav links are absolute on these pages.
			hrefLower := strings.ToLower(href)
			isDeptPath := isUOLURL(href) &&
				!strings.Contains(hrefLower, "/studium") &&
				!strings.Contains(hrefLower, "/forschung") &&
				!strings.Contains(hrefLower, "/aktuelles") &&
				!strings.Contains(hrefLower, "/kontakt")
			if isDeptPath {
				fullURL := resolveURL(startURL, href)
				if fullURL != "" && isUOLURL(fullURL) {
					// Add personen page for this department
					personenURL := strings.TrimSuffix(fullURL, "/") + "/personen"
					deptPaths[personenURL] = true
				}
			}
		})
		// Also look for direct /personen or /team links
		doc.Find("a[href*='/personen'], a[href*='/team']").Each(func(i int, s *goquery.Selection) {
			href, exists := s.Attr("href")
			if exists {
				fullURL := resolveURL(startURL, href)
				if fullURL != "" && isUOLURL(fullURL) {
					deptPaths[fullURL] = true
				}
			}
		})
	}
	// Add well-known department personen pages directly (these exist for sure)
	knownDepts := []string{
		"https://uol.de/socps/personen",
		"https://uol.de/vlba/team",
		"https://uol.de/informatik/department",
		"https://uol.de/se/team",
		"https://uol.de/ei/personen",
		"https://uol.de/is/team",
		"https://uol.de/paedagogik/personen",
		"https://uol.de/psychologie/personen",
		"https://uol.de/germanistik/personen",
		"https://uol.de/physik/personen",
		"https://uol.de/chemie/personen",
		"https://uol.de/biologie/personen",
		"https://uol.de/mathe/personen",
	}
	for _, dept := range knownDepts {
		deptPaths[dept] = true
	}
	log.Printf("[UOL] Checking %d potential department pages", len(deptPaths))
	// Verify which pages actually exist (map iteration order is random, so
	// the returned order is nondeterministic; callers deduplicate/merge).
	for path := range deptPaths {
		exists, err := c.checkPageExists(ctx, path)
		if err == nil && exists {
			log.Printf("[UOL] Found valid page: %s", path)
			pages = append(pages, path)
		}
	}
	log.Printf("[UOL] Found %d valid department/personen pages", len(pages))
	return pages, nil
}
// checkPageExists issues a rate-limited HEAD request and reports whether the
// URL answered with HTTP 200. Network failures are returned as errors.
func (c *StaffCrawler) checkPageExists(ctx context.Context, urlStr string) (bool, error) {
	c.waitForRateLimit(urlStr)
	head, err := http.NewRequestWithContext(ctx, http.MethodHead, urlStr, nil)
	if err != nil {
		return false, err
	}
	head.Header.Set("User-Agent", c.userAgent)
	resp, err := c.client.Do(head)
	if err != nil {
		return false, err
	}
	defer resp.Body.Close()
	return resp.StatusCode == http.StatusOK, nil
}
// findStaffLinksOnPage scans a page for anchors whose link text or href
// contains a staff-related keyword (team/personen/mitarbeiter/...) and
// returns their absolute URLs.
func (c *StaffCrawler) findStaffLinksOnPage(ctx context.Context, pageURL string) ([]string, error) {
	body, err := c.fetchPage(ctx, pageURL)
	if err != nil {
		return nil, err
	}
	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
	if err != nil {
		return nil, err
	}
	keywords := []string{"team", "personen", "mitarbeiter", "staff", "people", "dozent", "professor"}
	// looksLikeStaff reports whether either the (lowercased) anchor text or
	// href contains any staff keyword.
	looksLikeStaff := func(text, href string) bool {
		for _, kw := range keywords {
			if strings.Contains(text, kw) || strings.Contains(href, kw) {
				return true
			}
		}
		return false
	}
	var links []string
	doc.Find("a[href]").Each(func(_ int, sel *goquery.Selection) {
		href, ok := sel.Attr("href")
		if !ok {
			return
		}
		if looksLikeStaff(strings.ToLower(sel.Text()), strings.ToLower(href)) {
			if abs := resolveURL(pageURL, href); abs != "" {
				links = append(links, abs)
			}
		}
	})
	return links, nil
}

View File

@@ -0,0 +1,364 @@
package staff
import (
"bytes"
"context"
"fmt"
"log"
"regexp"
"strings"
"github.com/PuerkitoBio/goquery"
"github.com/breakpilot/edu-search-service/internal/database"
)
// EnrichStaffProfiles fetches individual profile pages and extracts detailed info
// like email, phone, office, research interests, and publication links.
//
// For every staff member of uni that has a profile URL and no email yet, the
// profile page is fetched and parsed; newly found fields are merged into the
// record, but existing non-nil fields are never overwritten. It returns the
// number of records that were persisted with at least one new field, and stops
// early (returning the count so far and ctx.Err()) when ctx is cancelled.
func (c *StaffCrawler) EnrichStaffProfiles(ctx context.Context, uni *database.University) (int, error) {
	// Get all staff for this university that have profile URLs.
	staffList, err := c.repo.SearchStaff(ctx, database.StaffSearchParams{
		UniversityID: &uni.ID,
		Limit: 10000,
	})
	if err != nil {
		return 0, fmt.Errorf("failed to search staff: %w", err)
	}
	log.Printf("[Profile Enrichment] Starting enrichment for %d staff members at %s", staffList.Total, uni.Name)
	enriched := 0
	for _, staff := range staffList.Staff {
		// Bail out promptly on cancellation between page fetches.
		select {
		case <-ctx.Done():
			return enriched, ctx.Err()
		default:
		}
		// Skip if no profile URL to fetch.
		if staff.ProfileURL == nil || *staff.ProfileURL == "" {
			continue
		}
		// Skip if already has email — treated as the marker that this
		// record was enriched in a previous run.
		if staff.Email != nil && *staff.Email != "" {
			continue
		}
		// Fetch and extract profile details; a per-profile failure is
		// logged and skipped so one bad page doesn't abort the batch.
		details, err := c.extractProfileDetails(ctx, *staff.ProfileURL)
		if err != nil {
			log.Printf("[Profile Enrichment] Error fetching %s: %v", *staff.ProfileURL, err)
			continue
		}
		// Merge extracted details into the record. Each field is copied
		// only when the extractor found a value AND the record has none,
		// so existing data always wins.
		updated := false
		if details.Email != "" && staff.Email == nil {
			staff.Email = &details.Email
			updated = true
		}
		if details.Phone != "" && staff.Phone == nil {
			staff.Phone = &details.Phone
			updated = true
		}
		if details.Office != "" && staff.Office == nil {
			staff.Office = &details.Office
			updated = true
		}
		if details.ORCID != "" && staff.ORCID == nil {
			staff.ORCID = &details.ORCID
			updated = true
		}
		if details.GoogleScholarID != "" && staff.GoogleScholarID == nil {
			staff.GoogleScholarID = &details.GoogleScholarID
			updated = true
		}
		if details.ResearchgateURL != "" && staff.ResearchgateURL == nil {
			staff.ResearchgateURL = &details.ResearchgateURL
			updated = true
		}
		if details.LinkedInURL != "" && staff.LinkedInURL == nil {
			staff.LinkedInURL = &details.LinkedInURL
			updated = true
		}
		if details.PersonalWebsite != "" && staff.PersonalWebsite == nil {
			staff.PersonalWebsite = &details.PersonalWebsite
			updated = true
		}
		if len(details.ResearchInterests) > 0 && len(staff.ResearchInterests) == 0 {
			staff.ResearchInterests = details.ResearchInterests
			updated = true
		}
		if details.PhotoURL != "" && staff.PhotoURL == nil {
			staff.PhotoURL = &details.PhotoURL
			updated = true
		}
		if updated {
			// NOTE(review): CreateStaff is used here to persist an
			// update to an existing record — presumably the repository
			// implements it as an upsert; confirm against the repo
			// implementation.
			err = c.repo.CreateStaff(ctx, &staff)
			if err != nil {
				log.Printf("[Profile Enrichment] Error updating %s: %v", staff.LastName, err)
				continue
			}
			enriched++
			log.Printf("[Profile Enrichment] Enriched: %s %s (email=%v)", stringValue(staff.FirstName), staff.LastName, details.Email != "")
		}
	}
	log.Printf("[Profile Enrichment] Completed: enriched %d of %d staff members", enriched, staffList.Total)
	return enriched, nil
}
// ProfileDetails contains extracted details from a single staff profile page.
// Zero values ("" / nil) mean the corresponding detail was not found.
type ProfileDetails struct {
	Email             string   // personal email address (generic addresses like info@ are filtered out)
	Phone             string   // phone number as found on the page
	Office            string   // room / office designation
	ORCID             string   // ORCID identifier in the form 0000-0000-0000-000X
	GoogleScholarID   string   // value of the "user" query parameter of a Google Scholar link
	ResearchgateURL   string   // full ResearchGate profile URL
	LinkedInURL       string   // full LinkedIn profile URL
	PersonalWebsite   string   // non-university homepage link, if advertised on the page
	ResearchInterests []string // research topics harvested from list items near research headings
	PhotoURL          string   // absolute URL of a likely profile photo
}
// Regular expressions used by extractProfileDetails. They are compiled once at
// package init; previously they were compiled inside per-DOM-node callbacks,
// i.e. on every matching element of every crawled page.
var (
	profileEmailRe = regexp.MustCompile(`[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-z]{2,10}`)
	profilePhoneRe = regexp.MustCompile(`\+?[\d\s\-/()]{8,20}`)
	profileORCIDRe = regexp.MustCompile(`\d{4}-\d{4}-\d{4}-\d{3}[\dX]`)
)

// extractProfileDetails extracts contact info from an individual profile page.
//
// It fetches profileURL, parses the HTML, and fills a ProfileDetails with the
// first plausible value found for each field. Extraction order per field:
// UOL-style dt/dd definition lists first, then mailto:/tel: links, then
// pattern-specific link scans (ORCID, Scholar, ResearchGate, LinkedIn),
// heuristic photo and personal-website detection, and finally research
// interests from list items near research-related headings.
func (c *StaffCrawler) extractProfileDetails(ctx context.Context, profileURL string) (*ProfileDetails, error) {
	body, err := c.fetchPage(ctx, profileURL)
	if err != nil {
		return nil, err
	}
	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
	if err != nil {
		return nil, err
	}
	details := &ProfileDetails{}
	// UOL-specific: Look for definition list pattern (dt/dd pairs).
	// This is the most reliable way to get contact info on UOL pages.
	doc.Find("dt").Each(func(i int, dt *goquery.Selection) {
		label := strings.TrimSpace(strings.ToLower(dt.Text()))
		dd := dt.Next()
		// Only honor a dt that is directly followed by its dd.
		if dd.Length() == 0 || goquery.NodeName(dd) != "dd" {
			return
		}
		value := strings.TrimSpace(dd.Text())
		switch {
		case strings.Contains(label, "email") || strings.Contains(label, "e-mail"):
			if details.Email == "" {
				// Prefer the address from a mailto link if present.
				dd.Find("a[href^='mailto:']").Each(func(j int, a *goquery.Selection) {
					if details.Email != "" {
						return
					}
					href, _ := a.Attr("href")
					email := strings.TrimPrefix(href, "mailto:")
					email = strings.Split(email, "?")[0] // drop ?subject=... params
					if strings.Contains(email, "@") {
						details.Email = strings.TrimSpace(email)
					}
				})
				// Fallback: extract from the dd's visible text.
				if details.Email == "" && strings.Contains(value, "@") {
					if match := profileEmailRe.FindString(value); match != "" {
						details.Email = match
					}
				}
			}
		case strings.Contains(label, "telefon") || strings.Contains(label, "phone") || strings.Contains(label, "tel"):
			if details.Phone == "" {
				// Prefer the number from a tel: link if present.
				dd.Find("a[href^='tel:']").Each(func(j int, a *goquery.Selection) {
					if details.Phone != "" {
						return
					}
					href, _ := a.Attr("href")
					phone := strings.TrimPrefix(href, "tel:")
					if len(phone) >= 8 { // reject fragments too short to be a number
						details.Phone = phone
					}
				})
				// Fallback: extract from the dd's visible text.
				if details.Phone == "" {
					if match := profilePhoneRe.FindString(value); match != "" {
						details.Phone = strings.TrimSpace(match)
					}
				}
			}
		case strings.Contains(label, "raum") || strings.Contains(label, "büro") || strings.Contains(label, "office"):
			if details.Office == "" {
				details.Office = value
			}
		}
	})
	// Fallback: Extract email from any mailto link if not found via dt/dd.
	if details.Email == "" {
		doc.Find("a[href^='mailto:']").Each(func(i int, s *goquery.Selection) {
			if details.Email != "" {
				return
			}
			href, _ := s.Attr("href")
			email := strings.TrimPrefix(href, "mailto:")
			email = strings.Split(email, "?")[0]
			// Only accept personal email addresses (not generic like info@, sekretariat@).
			if strings.Contains(email, "@") {
				emailLower := strings.ToLower(email)
				isGeneric := strings.HasPrefix(emailLower, "info@") ||
					strings.HasPrefix(emailLower, "sekretariat@") ||
					strings.HasPrefix(emailLower, "kontakt@") ||
					strings.HasPrefix(emailLower, "office@") ||
					strings.HasPrefix(emailLower, "fachschaft@")
				if !isGeneric {
					details.Email = strings.TrimSpace(email)
				}
			}
		})
	}
	// Fallback: Extract phone from any tel: link if not found via dt/dd.
	if details.Phone == "" {
		doc.Find("a[href^='tel:']").Each(func(i int, s *goquery.Selection) {
			if details.Phone != "" {
				return
			}
			href, _ := s.Attr("href")
			phone := strings.TrimPrefix(href, "tel:")
			if len(phone) >= 8 {
				details.Phone = phone
			}
		})
	}
	// Extract ORCID from any orcid.org link.
	doc.Find("a[href*='orcid.org']").Each(func(i int, s *goquery.Selection) {
		if details.ORCID != "" {
			return
		}
		href, _ := s.Attr("href")
		if match := profileORCIDRe.FindString(href); match != "" {
			details.ORCID = match
		}
	})
	// Extract Google Scholar ID from URLs like scholar.google.com/citations?user=XXXXX.
	doc.Find("a[href*='scholar.google']").Each(func(i int, s *goquery.Selection) {
		if details.GoogleScholarID != "" {
			return
		}
		href, _ := s.Attr("href")
		if strings.Contains(href, "user=") {
			parts := strings.Split(href, "user=")
			if len(parts) > 1 {
				userID := strings.Split(parts[1], "&")[0] // strip trailing query params
				details.GoogleScholarID = userID
			}
		}
	})
	// Extract ResearchGate URL.
	doc.Find("a[href*='researchgate.net']").Each(func(i int, s *goquery.Selection) {
		if details.ResearchgateURL != "" {
			return
		}
		href, _ := s.Attr("href")
		if strings.Contains(href, "researchgate.net") {
			details.ResearchgateURL = href
		}
	})
	// Extract LinkedIn URL.
	doc.Find("a[href*='linkedin.com']").Each(func(i int, s *goquery.Selection) {
		if details.LinkedInURL != "" {
			return
		}
		href, _ := s.Attr("href")
		if strings.Contains(href, "linkedin.com") {
			details.LinkedInURL = href
		}
	})
	// Extract personal website: an external link that is neither the
	// university's own domain nor a known social/academic platform, whose
	// link text suggests a homepage.
	doc.Find("a[href^='http']").Each(func(i int, s *goquery.Selection) {
		if details.PersonalWebsite != "" {
			return
		}
		href, _ := s.Attr("href")
		text := strings.ToLower(s.Text())
		// Skip university links, social media, academic profiles.
		if strings.Contains(href, "uni-oldenburg.de") || strings.Contains(href, "uol.de") ||
			strings.Contains(href, "linkedin") || strings.Contains(href, "researchgate") ||
			strings.Contains(href, "orcid.org") || strings.Contains(href, "scholar.google") ||
			strings.Contains(href, "twitter") || strings.Contains(href, "facebook") {
			return
		}
		// Look for personal website indicators in the link text.
		if strings.Contains(text, "homepage") || strings.Contains(text, "website") ||
			strings.Contains(text, "personal") || strings.Contains(text, "www") {
			details.PersonalWebsite = href
		}
	})
	// Extract photo URL: first image whose alt text or CSS classes suggest a
	// portrait, skipping icons/logos/placeholders.
	doc.Find("img").Each(func(i int, s *goquery.Selection) {
		if details.PhotoURL != "" {
			return
		}
		src, exists := s.Attr("src")
		if !exists {
			return
		}
		srcLower := strings.ToLower(src)
		if strings.Contains(srcLower, "icon") || strings.Contains(srcLower, "logo") ||
			strings.Contains(srcLower, "placeholder") || strings.Contains(srcLower, "default") {
			return
		}
		alt, _ := s.Attr("alt")
		altLower := strings.ToLower(alt)
		classes, _ := s.Attr("class")
		classesLower := strings.ToLower(classes)
		if strings.Contains(altLower, "foto") || strings.Contains(altLower, "photo") ||
			strings.Contains(altLower, "portrait") || strings.Contains(altLower, "bild") ||
			strings.Contains(classesLower, "photo") || strings.Contains(classesLower, "portrait") ||
			strings.Contains(classesLower, "profile") {
			details.PhotoURL = resolveURL(profileURL, src)
		}
	})
	// Extract research interests: find any element mentioning research-related
	// terms and harvest plausibly-sized list items from its parent. Stops at
	// the first element that yields results.
	doc.Find("*").Each(func(i int, s *goquery.Selection) {
		if len(details.ResearchInterests) > 0 {
			return
		}
		text := strings.ToLower(s.Text())
		if strings.Contains(text, "forschung") || strings.Contains(text, "research") ||
			strings.Contains(text, "schwerpunkt") || strings.Contains(text, "interest") {
			s.Parent().Find("li").Each(func(j int, li *goquery.Selection) {
				interest := strings.TrimSpace(li.Text())
				// Length bounds filter out empty bullets and whole paragraphs.
				if len(interest) > 3 && len(interest) < 200 {
					details.ResearchInterests = append(details.ResearchInterests, interest)
				}
			})
		}
	})
	return details, nil
}

View File

@@ -0,0 +1,495 @@
package staff
import (
"bytes"
"context"
"log"
"strings"
"github.com/PuerkitoBio/goquery"
"github.com/breakpilot/edu-search-service/internal/database"
)
// extractStaffFromPage downloads a staff listing page and runs every known
// extraction strategy against it, most-specific (UOL patterns) first. The
// results of all strategies are concatenated, so the same person may appear
// more than once if several strategies match the page.
func (c *StaffCrawler) extractStaffFromPage(ctx context.Context, pageURL string, uni *database.University) ([]*database.UniversityStaff, error) {
	body, err := c.fetchPage(ctx, pageURL)
	if err != nil {
		return nil, err
	}
	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
	if err != nil {
		return nil, err
	}
	strategies := []func(*goquery.Document, string) []*database.UniversityStaff{
		c.extractFromUOLPatterns, // UOL-specific patterns first
		c.extractFromPersonCards,
		c.extractFromTable,
		c.extractFromList,
		c.extractFromVCard,
	}
	var found []*database.UniversityStaff
	for _, extract := range strategies {
		if members := extract(doc, pageURL); len(members) > 0 {
			found = append(found, members...)
		}
	}
	return found, nil
}
// extractFromUOLPatterns extracts staff using Uni Oldenburg specific patterns.
// UOL uses: nav#left-nav for person lists, p.mit-icon.person for person links,
// and /suche/person?username=XXX for the person API.
// Also captures hierarchy from section headers (Leitung, Mitarbeiter, etc.).
//
// Five patterns run in sequence over the same document; a name-based "seen"
// set prevents the same person from being emitted by more than one pattern.
func (c *StaffCrawler) extractFromUOLPatterns(doc *goquery.Document, baseURL string) []*database.UniversityStaff {
	var staff []*database.UniversityStaff
	seen := make(map[string]bool) // keyed by raw display name; dedupes across patterns
	// Extract department name from the first page heading (used only for logging).
	deptName := ""
	doc.Find("h1").First().Each(func(i int, s *goquery.Selection) {
		deptName = strings.TrimSpace(s.Text())
	})
	// Pattern 5 (NEW): Parse content with hierarchy headers.
	// UOL pages have structure like:
	//   #### Leitung
	//   <ul><li><a href="...">Prof. Dr. Name</a></li></ul>
	//   #### Wissenschaftliche Mitarbeiterinnen und Mitarbeiter
	//   <ul><li><a href="...">M. Sc. Name</a></li></ul>
	currentRole := ""   // role of the section header most recently passed in document order
	var leaderName string // first person seen under "Leitung"; intended for supervisor assignment
	// Walk through content area looking at headers and person links in
	// document order, so each link is tagged with the preceding header's role.
	doc.Find("#content h4, #content h3, #content ul li a, .inhalt h4, .inhalt h3, .inhalt ul li a").Each(func(i int, s *goquery.Selection) {
		tagName := goquery.NodeName(s)
		// Check if this is a section header; if so, update the current role.
		if tagName == "h3" || tagName == "h4" {
			headerText := strings.ToLower(strings.TrimSpace(s.Text()))
			if strings.Contains(headerText, "leitung") {
				currentRole = "leitung"
			} else if strings.Contains(headerText, "sekretariat") {
				currentRole = "sekretariat"
			} else if strings.Contains(headerText, "wissenschaftlich") || strings.Contains(headerText, "mitarbeiter") {
				currentRole = "mitarbeiter"
			} else if strings.Contains(headerText, "doktorand") || strings.Contains(headerText, "promovierend") {
				currentRole = "doktorand"
			} else if strings.Contains(headerText, "technisch") {
				currentRole = "technisch"
			} else if strings.Contains(headerText, "extern") {
				currentRole = "extern"
			} else if strings.Contains(headerText, "student") || strings.Contains(headerText, "hilfskr") || strings.Contains(headerText, "hiwi") {
				currentRole = "hiwi"
			}
			return
		}
		// Process person links under the current header.
		if tagName == "a" {
			href, exists := s.Attr("href")
			if !exists {
				return
			}
			// Check if this looks like a person page link.
			if !strings.Contains(href, "/personen/") && !strings.Contains(href, "suche/person") {
				return
			}
			name := strings.TrimSpace(s.Text())
			if name == "" || seen[name] || !c.looksLikeName(name) {
				return
			}
			seen[name] = true
			person := &database.UniversityStaff{}
			person.FullName = &name
			c.parseName(name, person)
			if person.LastName != "" {
				fullURL := resolveURL(baseURL, href)
				person.ProfileURL = &fullURL
				// Set team role based on current section.
				if currentRole != "" {
					person.TeamRole = &currentRole
				}
				// The first person under "Leitung" is treated as the
				// department head and flagged as professor.
				if currentRole == "leitung" && leaderName == "" {
					leaderName = name
					person.IsProfessor = true
					posType := "professor"
					person.PositionType = &posType
				}
				staff = append(staff, person)
			}
		}
	})
	// Pattern 1: nav#left-nav ul li a — side navigation with person links.
	// Format: /abteilung/personen/prof-dr-name or /abteilung/personen/m-sc-name
	doc.Find("nav#left-nav ul li a, #left-navi li a").Each(func(i int, s *goquery.Selection) {
		href, exists := s.Attr("href")
		if !exists {
			return
		}
		// Check if this looks like a person page link.
		if !strings.Contains(href, "/personen/") {
			return
		}
		name := strings.TrimSpace(s.Text())
		if name == "" || seen[name] {
			return
		}
		seen[name] = true
		person := &database.UniversityStaff{}
		person.FullName = &name
		c.parseName(name, person)
		if person.LastName != "" {
			fullURL := resolveURL(baseURL, href)
			person.ProfileURL = &fullURL
			staff = append(staff, person)
		}
	})
	// Pattern 2: p.mit-icon.person a — inline person references.
	// Format: <p class="mit-icon person"><a href="/suche/person/USERNAME">Prof. Dr. Name</a></p>
	// OR: <p class="mit-icon person"><a href="/abteilung/personen/prof-dr-name">Prof. Dr. Name</a></p>
	doc.Find("p.mit-icon.person a, .mit-icon.person a").Each(func(i int, s *goquery.Selection) {
		name := strings.TrimSpace(s.Text())
		if name == "" || seen[name] {
			return
		}
		seen[name] = true
		person := &database.UniversityStaff{}
		person.FullName = &name
		c.parseName(name, person)
		if person.LastName != "" {
			href, exists := s.Attr("href")
			if exists {
				fullURL := resolveURL(baseURL, href)
				person.ProfileURL = &fullURL
			}
			staff = append(staff, person)
		}
	})
	// Pattern 3: Links to /suche/person?username=XXX.
	doc.Find("a[href*='suche/person']").Each(func(i int, s *goquery.Selection) {
		name := strings.TrimSpace(s.Text())
		// Skip non-person link text like "Internetkoordinator".
		if name == "" || seen[name] || !c.looksLikeName(name) {
			return
		}
		seen[name] = true
		person := &database.UniversityStaff{}
		person.FullName = &name
		c.parseName(name, person)
		if person.LastName != "" {
			href, exists := s.Attr("href")
			if exists {
				fullURL := resolveURL(baseURL, href)
				person.ProfileURL = &fullURL
			}
			staff = append(staff, person)
		}
	})
	// Pattern 4: Breadcrumb navigation sublinks with person names.
	// Format: <ul class="sublinks"><li><a href="/dept/personen/name">Prof. Dr. Name</a></li>
	doc.Find(".sublinks li a, nav#navizeile .sublinks li a").Each(func(i int, s *goquery.Selection) {
		href, exists := s.Attr("href")
		if !exists || !strings.Contains(href, "/personen/") {
			return
		}
		name := strings.TrimSpace(s.Text())
		if name == "" || seen[name] {
			return
		}
		seen[name] = true
		person := &database.UniversityStaff{}
		person.FullName = &name
		c.parseName(name, person)
		if person.LastName != "" {
			fullURL := resolveURL(baseURL, href)
			person.ProfileURL = &fullURL
			staff = append(staff, person)
		}
	})
	if len(staff) > 0 {
		log.Printf("[UOL Extractor] Found %d staff members using UOL patterns (dept: %s)", len(staff), deptName)
	}
	return staff
}
// extractFromPersonCards extracts staff from card-style layouts. It tries a
// list of common card CSS selectors in order and stops at the first selector
// that yields at least one person with a parsed last name.
func (c *StaffCrawler) extractFromPersonCards(doc *goquery.Document, baseURL string) []*database.UniversityStaff {
	selectors := []string{
		".person-card",
		".staff-card",
		".team-member",
		".mitarbeiter",
		".person",
		".employee",
		"[itemtype='http://schema.org/Person']",
		".vcard",
	}
	var found []*database.UniversityStaff
	for _, sel := range selectors {
		doc.Find(sel).Each(func(_ int, card *goquery.Selection) {
			if p := c.extractPersonFromElement(card, baseURL); p != nil && p.LastName != "" {
				found = append(found, p)
			}
		})
		if len(found) > 0 {
			return found
		}
	}
	return found
}
// extractFromTable extracts staff from table layouts. Only tables whose header
// cells mention "name" or "person" are considered; within those, the first
// cell of each data row is taken as the person's name, and the remaining cells
// are scanned for email, profile link, and position information.
func (c *StaffCrawler) extractFromTable(doc *goquery.Document, baseURL string) []*database.UniversityStaff {
	var staff []*database.UniversityStaff
	doc.Find("table").Each(func(i int, table *goquery.Selection) {
		// Check if this looks like a staff table by inspecting header text.
		headerText := strings.ToLower(table.Find("th").Text())
		if !strings.Contains(headerText, "name") && !strings.Contains(headerText, "person") {
			return
		}
		table.Find("tr").Each(func(j int, row *goquery.Selection) {
			if row.Find("th").Length() > 0 {
				return // Skip header row
			}
			cells := row.Find("td")
			// Need at least name + one more column to be worth parsing.
			if cells.Length() < 2 {
				return
			}
			person := &database.UniversityStaff{}
			// First cell usually contains the name.
			nameCell := cells.First()
			name := strings.TrimSpace(nameCell.Text())
			person.FullName = &name
			c.parseName(name, person)
			// Look for an email anywhere in the row (last mailto wins).
			row.Find("a[href^='mailto:']").Each(func(k int, a *goquery.Selection) {
				href, _ := a.Attr("href")
				email := strings.TrimPrefix(href, "mailto:")
				person.Email = &email
			})
			// Look for a profile link inside the name cell (last non-mailto wins).
			nameCell.Find("a[href]").Each(func(k int, a *goquery.Selection) {
				href, exists := a.Attr("href")
				if exists && !strings.HasPrefix(href, "mailto:") {
					fullURL := resolveURL(baseURL, href)
					person.ProfileURL = &fullURL
				}
			})
			// Extract position from any cell whose text looks like a job title
			// (last matching cell wins).
			cells.Each(func(k int, cell *goquery.Selection) {
				text := strings.TrimSpace(cell.Text())
				if c.looksLikePosition(text) {
					person.Position = &text
					person.PositionType = c.classifyPosition(text)
					person.IsProfessor = c.isProfessor(text)
				}
			})
			// Only keep rows where a last name could be parsed.
			if person.LastName != "" {
				staff = append(staff, person)
			}
		})
	})
	return staff
}
// extractFromList extracts staff from list layouts, trying each known list
// selector in order and stopping at the first one that yields results.
func (c *StaffCrawler) extractFromList(doc *goquery.Document, baseURL string) []*database.UniversityStaff {
	selectors := []string{"ul.staff", "ul.team", "ul.mitarbeiter", ".staff-list li", ".team-list li"}
	var found []*database.UniversityStaff
	for _, sel := range selectors {
		doc.Find(sel).Each(func(_ int, item *goquery.Selection) {
			if p := c.extractPersonFromElement(item, baseURL); p != nil && p.LastName != "" {
				found = append(found, p)
			}
		})
		if len(found) > 0 {
			return found
		}
	}
	return found
}
// extractFromVCard extracts staff from vCard/hCard microformat markup
// (.vcard / .h-card containers with .fn, .email, .title, .photo, .url
// sub-elements). Persons without a parseable last name are discarded.
func (c *StaffCrawler) extractFromVCard(doc *goquery.Document, baseURL string) []*database.UniversityStaff {
	var staff []*database.UniversityStaff
	doc.Find(".vcard, .h-card").Each(func(i int, s *goquery.Selection) {
		person := &database.UniversityStaff{}
		// Name: formatted-name property.
		fn := s.Find(".fn, .p-name").Text()
		if fn != "" {
			person.FullName = &fn
			c.parseName(fn, person)
		}
		// Email: explicit email property, falling back to a mailto link
		// (last mailto wins).
		email := s.Find(".email, .u-email").Text()
		if email == "" {
			s.Find("a[href^='mailto:']").Each(func(j int, a *goquery.Selection) {
				href, _ := a.Attr("href")
				email = strings.TrimPrefix(href, "mailto:")
			})
		}
		if email != "" {
			person.Email = &email
		}
		// Title/Position.
		title := s.Find(".title, .p-job-title, .role").Text()
		if title != "" {
			person.Position = &title
			person.PositionType = c.classifyPosition(title)
			person.IsProfessor = c.isProfessor(title)
		}
		// Photo (last matching image wins).
		s.Find(".photo, .u-photo, img").Each(func(j int, img *goquery.Selection) {
			src, exists := img.Attr("src")
			if exists {
				fullURL := resolveURL(baseURL, src)
				person.PhotoURL = &fullURL
			}
		})
		// Profile URL (last matching link wins).
		s.Find("a[href].url, a[href].u-url").Each(func(j int, a *goquery.Selection) {
			href, exists := a.Attr("href")
			if exists {
				fullURL := resolveURL(baseURL, href)
				person.ProfileURL = &fullURL
			}
		})
		if person.LastName != "" {
			staff = append(staff, person)
		}
	})
	return staff
}
// extractPersonFromElement extracts a person from a generic HTML element
// (a card, a list item, etc.) using heuristic selectors for name, email,
// position, photo, and profile link. It always returns a non-nil person;
// callers are expected to discard results with an empty LastName.
func (c *StaffCrawler) extractPersonFromElement(s *goquery.Selection, baseURL string) *database.UniversityStaff {
	person := &database.UniversityStaff{}
	// Try to find the name in common name/heading sub-elements; the first
	// short, non-position-like text wins.
	nameSelectors := []string{".name", ".person-name", "h2", "h3", "h4", ".title", "strong", "b"}
	for _, sel := range nameSelectors {
		name := strings.TrimSpace(s.Find(sel).First().Text())
		if name != "" && len(name) < 100 && !c.looksLikePosition(name) {
			person.FullName = &name
			c.parseName(name, person)
			break
		}
	}
	// If no name was found via selectors, fall back to the first text line
	// of the whole element.
	if person.LastName == "" {
		text := strings.TrimSpace(s.Text())
		lines := strings.Split(text, "\n")
		if len(lines) > 0 {
			firstLine := strings.TrimSpace(lines[0])
			if len(firstLine) > 0 && len(firstLine) < 100 {
				person.FullName = &firstLine
				c.parseName(firstLine, person)
			}
		}
	}
	// Extract email from mailto links (last one wins).
	s.Find("a[href^='mailto:']").Each(func(i int, a *goquery.Selection) {
		href, _ := a.Attr("href")
		email := strings.TrimPrefix(href, "mailto:")
		email = strings.Split(email, "?")[0] // Remove query params
		person.Email = &email
	})
	// Extract position: first sub-element whose text passes looksLikePosition.
	positionSelectors := []string{".position", ".role", ".job-title", ".funktion", "small", ".subtitle"}
	for _, sel := range positionSelectors {
		pos := strings.TrimSpace(s.Find(sel).First().Text())
		if pos != "" && c.looksLikePosition(pos) {
			person.Position = &pos
			person.PositionType = c.classifyPosition(pos)
			person.IsProfessor = c.isProfessor(pos)
			break
		}
	}
	// Extract photo, skipping placeholders and icons (last match wins).
	s.Find("img").Each(func(i int, img *goquery.Selection) {
		src, exists := img.Attr("src")
		if exists && !strings.Contains(src, "placeholder") && !strings.Contains(src, "icon") {
			fullURL := resolveURL(baseURL, src)
			person.PhotoURL = &fullURL
		}
	})
	// Extract profile link: first non-mailto, non-tel link wins (guarded by
	// the nil check, unlike photo/email above).
	s.Find("a[href]").Each(func(i int, a *goquery.Selection) {
		href, exists := a.Attr("href")
		if exists && !strings.HasPrefix(href, "mailto:") && !strings.HasPrefix(href, "tel:") {
			fullURL := resolveURL(baseURL, href)
			if person.ProfileURL == nil {
				person.ProfileURL = &fullURL
			}
		}
	})
	return person
}