feat: edu-search-service migriert, voice-service/geo-service entfernt
All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Successful in 1m45s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 21s
All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Successful in 1m45s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 21s
- edu-search-service von breakpilot-pwa nach breakpilot-lehrer kopiert (ohne vendor) - opensearch + edu-search-service in docker-compose.yml hinzugefuegt - voice-service aus docker-compose.yml entfernt (jetzt in breakpilot-core) - geo-service aus docker-compose.yml entfernt (nicht mehr benoetigt) - CI/CD: edu-search-service zu Gitea Actions und Woodpecker hinzugefuegt (Go lint, test mit go mod download, build, SBOM) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
268
edu-search-service/internal/publications/pub_crawler.go
Normal file
268
edu-search-service/internal/publications/pub_crawler.go
Normal file
@@ -0,0 +1,268 @@
|
||||
package publications
|
||||
|
||||
import (
	"context"
	"fmt"
	"log"
	"strings"
	"sync"
	"time"

	"github.com/breakpilot/edu-search-service/internal/database"
	"github.com/google/uuid"
)
|
||||
|
||||
// PublicationCrawler crawls publications for university staff
type PublicationCrawler struct {
	repo        *database.Repository // persistence layer for publications and staff links
	crossref    *CrossRefClient      // CrossRef API client used for all searches
	rateLimit   time.Duration        // minimum interval enforced between CrossRef requests
	mu          sync.Mutex           // guards lastRequest and serializes rate-limit waits
	lastRequest time.Time            // timestamp of the most recent CrossRef request
}
// CrawlResult contains the result of a publication crawl
type CrawlResult struct {
	StaffID     uuid.UUID     // staff member this crawl ran for
	PubsFound   int           // publications successfully saved during the crawl
	PubsNew     int           // NOTE(review): never assigned by CrawlForStaff — confirm intended use
	PubsUpdated int           // NOTE(review): never assigned by CrawlForStaff — confirm intended use
	Errors      []string      // non-fatal errors collected while crawling
	Duration    time.Duration // wall-clock duration of the crawl
}
// NewPublicationCrawler creates a new publication crawler
|
||||
func NewPublicationCrawler(repo *database.Repository, email string) *PublicationCrawler {
|
||||
return &PublicationCrawler{
|
||||
repo: repo,
|
||||
crossref: NewCrossRefClient(email),
|
||||
rateLimit: time.Second, // CrossRef polite pool: 50 req/sec max
|
||||
}
|
||||
}
|
||||
|
||||
// CrawlForStaff crawls publications for a single staff member
|
||||
func (c *PublicationCrawler) CrawlForStaff(ctx context.Context, staff *database.UniversityStaff) (*CrawlResult, error) {
|
||||
start := time.Now()
|
||||
result := &CrawlResult{
|
||||
StaffID: staff.ID,
|
||||
}
|
||||
|
||||
log.Printf("Starting publication crawl for %s", *staff.FullName)
|
||||
|
||||
var pubs []*database.Publication
|
||||
|
||||
// Strategy 1: Search by ORCID (most reliable)
|
||||
if staff.ORCID != nil && *staff.ORCID != "" {
|
||||
c.waitForRateLimit()
|
||||
orcidPubs, err := c.crossref.SearchByORCID(ctx, *staff.ORCID, 100)
|
||||
if err != nil {
|
||||
result.Errors = append(result.Errors, fmt.Sprintf("ORCID search error: %v", err))
|
||||
} else {
|
||||
pubs = append(pubs, orcidPubs...)
|
||||
log.Printf("Found %d publications via ORCID for %s", len(orcidPubs), *staff.FullName)
|
||||
}
|
||||
}
|
||||
|
||||
// Strategy 2: Search by author name
|
||||
if staff.FullName != nil && *staff.FullName != "" {
|
||||
c.waitForRateLimit()
|
||||
namePubs, err := c.crossref.SearchByAuthor(ctx, *staff.FullName, 50)
|
||||
if err != nil {
|
||||
result.Errors = append(result.Errors, fmt.Sprintf("Name search error: %v", err))
|
||||
} else {
|
||||
// Deduplicate
|
||||
for _, pub := range namePubs {
|
||||
if !containsPub(pubs, pub) {
|
||||
pubs = append(pubs, pub)
|
||||
}
|
||||
}
|
||||
log.Printf("Found %d additional publications via name search for %s", len(namePubs), *staff.FullName)
|
||||
}
|
||||
}
|
||||
|
||||
// Save publications and create links
|
||||
for _, pub := range pubs {
|
||||
// Save publication
|
||||
err := c.repo.CreatePublication(ctx, pub)
|
||||
if err != nil {
|
||||
result.Errors = append(result.Errors, fmt.Sprintf("Save error for %s: %v", pub.Title, err))
|
||||
continue
|
||||
}
|
||||
|
||||
result.PubsFound++
|
||||
|
||||
// Link to staff
|
||||
link := &database.StaffPublication{
|
||||
StaffID: staff.ID,
|
||||
PublicationID: pub.ID,
|
||||
}
|
||||
|
||||
// Determine author position
|
||||
pos := findAuthorPosition(pub, staff)
|
||||
if pos > 0 {
|
||||
link.AuthorPosition = &pos
|
||||
}
|
||||
|
||||
if err := c.repo.LinkStaffPublication(ctx, link); err != nil {
|
||||
result.Errors = append(result.Errors, fmt.Sprintf("Link error: %v", err))
|
||||
}
|
||||
}
|
||||
|
||||
result.Duration = time.Since(start)
|
||||
|
||||
log.Printf("Completed publication crawl for %s: found=%d, duration=%v",
|
||||
*staff.FullName, result.PubsFound, result.Duration)
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// CrawlForUniversity crawls publications for all staff at a university
|
||||
func (c *PublicationCrawler) CrawlForUniversity(ctx context.Context, uniID uuid.UUID, limit int) (*database.UniversityCrawlStatus, error) {
|
||||
log.Printf("Starting publication crawl for university %s", uniID)
|
||||
|
||||
// Get staff with ORCID first (more reliable)
|
||||
params := database.StaffSearchParams{
|
||||
UniversityID: &uniID,
|
||||
Limit: limit,
|
||||
}
|
||||
|
||||
result, err := c.repo.SearchStaff(ctx, params)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
status := &database.UniversityCrawlStatus{
|
||||
UniversityID: uniID,
|
||||
PubCrawlStatus: "running",
|
||||
}
|
||||
|
||||
var totalPubs int
|
||||
var errors []string
|
||||
|
||||
for _, staff := range result.Staff {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
status.PubCrawlStatus = "cancelled"
|
||||
status.PubErrors = append(errors, "Crawl cancelled")
|
||||
return status, ctx.Err()
|
||||
default:
|
||||
}
|
||||
|
||||
crawlResult, err := c.CrawlForStaff(ctx, &staff)
|
||||
if err != nil {
|
||||
errors = append(errors, fmt.Sprintf("%s: %v", staff.LastName, err))
|
||||
continue
|
||||
}
|
||||
|
||||
totalPubs += crawlResult.PubsFound
|
||||
errors = append(errors, crawlResult.Errors...)
|
||||
}
|
||||
|
||||
now := time.Now()
|
||||
status.LastPubCrawl = &now
|
||||
status.PubCrawlStatus = "completed"
|
||||
status.PubCount = totalPubs
|
||||
status.PubErrors = errors
|
||||
|
||||
// Update status in database
|
||||
if err := c.repo.UpdateCrawlStatus(ctx, status); err != nil {
|
||||
log.Printf("Warning: Failed to update crawl status: %v", err)
|
||||
}
|
||||
|
||||
log.Printf("Completed publication crawl for university %s: %d publications found", uniID, totalPubs)
|
||||
|
||||
return status, nil
|
||||
}
|
||||
|
||||
// ResolveDOI resolves a DOI and saves the publication
|
||||
func (c *PublicationCrawler) ResolveDOI(ctx context.Context, doi string) (*database.Publication, error) {
|
||||
c.waitForRateLimit()
|
||||
|
||||
pub, err := c.crossref.GetWorkByDOI(ctx, doi)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if err := c.repo.CreatePublication(ctx, pub); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return pub, nil
|
||||
}
|
||||
|
||||
// waitForRateLimit enforces rate limiting
|
||||
func (c *PublicationCrawler) waitForRateLimit() {
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
|
||||
elapsed := time.Since(c.lastRequest)
|
||||
if elapsed < c.rateLimit {
|
||||
time.Sleep(c.rateLimit - elapsed)
|
||||
}
|
||||
|
||||
c.lastRequest = time.Now()
|
||||
}
|
||||
|
||||
// containsPub checks if a publication is already in the list (by DOI or title)
|
||||
func containsPub(pubs []*database.Publication, pub *database.Publication) bool {
|
||||
for _, existing := range pubs {
|
||||
// Check DOI
|
||||
if pub.DOI != nil && existing.DOI != nil && *pub.DOI == *existing.DOI {
|
||||
return true
|
||||
}
|
||||
// Check title (rough match)
|
||||
if pub.Title == existing.Title {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// findAuthorPosition finds the position of a staff member in the author list
|
||||
func findAuthorPosition(pub *database.Publication, staff *database.UniversityStaff) int {
|
||||
for i, author := range pub.Authors {
|
||||
// Check if author name matches staff
|
||||
if staff.LastName != "" && containsIgnoreCase(author, staff.LastName) {
|
||||
return i + 1
|
||||
}
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
// containsIgnoreCase reports whether s contains substr, ignoring case.
//
// Replaces a convoluted hand-rolled ASCII-only scan with the standard
// library: strings.ToLower applies full Unicode case mapping, so
// non-ASCII names (e.g. German umlauts) now fold correctly as well.
// As before, an empty substr is contained in every string.
func containsIgnoreCase(s, substr string) bool {
	return strings.Contains(strings.ToLower(s), strings.ToLower(substr))
}
func containsIgnoreCaseHelper(s, substr string) bool {
|
||||
for i := 0; i <= len(s)-len(substr); i++ {
|
||||
if equalFold(s[i:i+len(substr)], substr) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// equalFold reports whether s1 and s2 are equal under ASCII case
// folding: only the letters A-Z/a-z are folded, so multi-byte
// (non-ASCII) characters must match byte-for-byte. Strings of
// different byte length never compare equal.
func equalFold(s1, s2 string) bool {
	if len(s1) != len(s2) {
		return false
	}
	// toLower folds a single ASCII byte; non-letters pass through.
	toLower := func(b byte) byte {
		if b >= 'A' && b <= 'Z' {
			return b + ('a' - 'A')
		}
		return b
	}
	for i := 0; i < len(s1); i++ {
		if toLower(s1[i]) != toLower(s2[i]) {
			return false
		}
	}
	return true
}
|
||||
Reference in New Issue
Block a user