Files
breakpilot-lehrer/edu-search-service/internal/publications/pub_crawler.go
Benjamin Boenisch 414e0f5ec0
All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Successful in 1m45s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 21s
feat: edu-search-service migriert, voice-service/geo-service entfernt
- edu-search-service von breakpilot-pwa nach breakpilot-lehrer kopiert (ohne vendor)
- opensearch + edu-search-service in docker-compose.yml hinzugefuegt
- voice-service aus docker-compose.yml entfernt (jetzt in breakpilot-core)
- geo-service aus docker-compose.yml entfernt (nicht mehr benoetigt)
- CI/CD: edu-search-service zu Gitea Actions und Woodpecker hinzugefuegt
  (Go lint, test mit go mod download, build, SBOM)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-15 18:36:38 +01:00

269 lines
6.7 KiB
Go

package publications
import (
	"context"
	"fmt"
	"log"
	"strings"
	"sync"
	"time"

	"github.com/breakpilot/edu-search-service/internal/database"
	"github.com/google/uuid"
)
// PublicationCrawler crawls publications for university staff by querying
// the CrossRef API and persisting the results through the repository.
type PublicationCrawler struct {
	repo      *database.Repository // persistence layer for publications and staff links
	crossref  *CrossRefClient      // CrossRef API client (defined elsewhere in this package)
	rateLimit time.Duration        // minimum delay enforced between consecutive CrossRef requests
	mu        sync.Mutex           // guards lastRequest; serializes rate-limit waits across goroutines
	lastRequest time.Time          // time of the most recent CrossRef request
}
// CrawlResult contains the result of a publication crawl for a single
// staff member.
type CrawlResult struct {
	StaffID     uuid.UUID     // staff member the crawl ran for
	PubsFound   int           // publications successfully persisted (see CrawlForStaff)
	PubsNew     int           // NOTE(review): never assigned by CrawlForStaff — confirm intended
	PubsUpdated int           // NOTE(review): never assigned by CrawlForStaff — confirm intended
	Errors      []string      // non-fatal errors collected while crawling
	Duration    time.Duration // wall-clock duration of the crawl
}
// NewPublicationCrawler creates a new publication crawler.
// The email is passed to the CrossRef client (polite-pool identification).
func NewPublicationCrawler(repo *database.Repository, email string) *PublicationCrawler {
	return &PublicationCrawler{
		repo:     repo,
		crossref: NewCrossRefClient(email),
		// 1 request/second — deliberately far below the CrossRef polite-pool
		// ceiling (50 req/sec) to stay well clear of throttling.
		rateLimit: time.Second,
	}
}
// CrawlForStaff crawls publications for a single staff member.
//
// It combines two CrossRef lookup strategies — ORCID (most reliable) and
// free-text author-name search — deduplicates the combined hits, persists
// each publication, and links it to the staff record. Per-publication
// failures are collected in result.Errors rather than aborting the crawl;
// the returned error is currently always nil.
func (c *PublicationCrawler) CrawlForStaff(ctx context.Context, staff *database.UniversityStaff) (*CrawlResult, error) {
	start := time.Now()
	result := &CrawlResult{
		StaffID: staff.ID,
	}

	// FullName is a pointer and may be nil; the previous code dereferenced it
	// unconditionally for logging, which panicked for staff without a name.
	fullName := ""
	if staff.FullName != nil {
		fullName = *staff.FullName
	}
	log.Printf("Starting publication crawl for %s", fullName)

	var pubs []*database.Publication

	// Strategy 1: Search by ORCID (most reliable)
	if staff.ORCID != nil && *staff.ORCID != "" {
		c.waitForRateLimit()
		orcidPubs, err := c.crossref.SearchByORCID(ctx, *staff.ORCID, 100)
		if err != nil {
			result.Errors = append(result.Errors, fmt.Sprintf("ORCID search error: %v", err))
		} else {
			pubs = append(pubs, orcidPubs...)
			log.Printf("Found %d publications via ORCID for %s", len(orcidPubs), fullName)
		}
	}

	// Strategy 2: Search by author name
	if fullName != "" {
		c.waitForRateLimit()
		namePubs, err := c.crossref.SearchByAuthor(ctx, fullName, 50)
		if err != nil {
			result.Errors = append(result.Errors, fmt.Sprintf("Name search error: %v", err))
		} else {
			// Deduplicate against the ORCID hits (matched by DOI or title).
			for _, pub := range namePubs {
				if !containsPub(pubs, pub) {
					pubs = append(pubs, pub)
				}
			}
			log.Printf("Found %d additional publications via name search for %s", len(namePubs), fullName)
		}
	}

	// Save publications and create staff links.
	for _, pub := range pubs {
		if err := c.repo.CreatePublication(ctx, pub); err != nil {
			result.Errors = append(result.Errors, fmt.Sprintf("Save error for %s: %v", pub.Title, err))
			continue
		}
		// PubsFound counts publications that were successfully persisted.
		result.PubsFound++

		link := &database.StaffPublication{
			StaffID:       staff.ID,
			PublicationID: pub.ID,
		}
		// Record the 1-based author position when the staff member can be
		// located in the author list; 0 means "not found" and is omitted.
		if pos := findAuthorPosition(pub, staff); pos > 0 {
			link.AuthorPosition = &pos
		}
		if err := c.repo.LinkStaffPublication(ctx, link); err != nil {
			result.Errors = append(result.Errors, fmt.Sprintf("Link error: %v", err))
		}
	}

	result.Duration = time.Since(start)
	log.Printf("Completed publication crawl for %s: found=%d, duration=%v",
		fullName, result.PubsFound, result.Duration)
	return result, nil
}
// CrawlForUniversity crawls publications for all staff at a university and
// records the aggregate outcome in the crawl-status table. It stops early
// (status "cancelled") when the context is cancelled.
func (c *PublicationCrawler) CrawlForUniversity(ctx context.Context, uniID uuid.UUID, limit int) (*database.UniversityCrawlStatus, error) {
	log.Printf("Starting publication crawl for university %s", uniID)

	// Fetch the staff members to crawl for this university.
	staffResult, err := c.repo.SearchStaff(ctx, database.StaffSearchParams{
		UniversityID: &uniID,
		Limit:        limit,
	})
	if err != nil {
		return nil, err
	}

	status := &database.UniversityCrawlStatus{
		UniversityID:   uniID,
		PubCrawlStatus: "running",
	}

	pubTotal := 0
	var crawlErrors []string
	for i := range staffResult.Staff {
		// Abort promptly if the caller cancelled the context.
		if ctx.Err() != nil {
			status.PubCrawlStatus = "cancelled"
			status.PubErrors = append(crawlErrors, "Crawl cancelled")
			return status, ctx.Err()
		}
		member := staffResult.Staff[i]
		res, crawlErr := c.CrawlForStaff(ctx, &member)
		if crawlErr != nil {
			crawlErrors = append(crawlErrors, fmt.Sprintf("%s: %v", member.LastName, crawlErr))
			continue
		}
		pubTotal += res.PubsFound
		crawlErrors = append(crawlErrors, res.Errors...)
	}

	now := time.Now()
	status.LastPubCrawl = &now
	status.PubCrawlStatus = "completed"
	status.PubCount = pubTotal
	status.PubErrors = crawlErrors

	// Persist the status; a failure here is logged but not fatal.
	if err := c.repo.UpdateCrawlStatus(ctx, status); err != nil {
		log.Printf("Warning: Failed to update crawl status: %v", err)
	}

	log.Printf("Completed publication crawl for university %s: %d publications found", uniID, pubTotal)
	return status, nil
}
// ResolveDOI looks up a single DOI via CrossRef and saves the resulting
// publication through the repository.
func (c *PublicationCrawler) ResolveDOI(ctx context.Context, doi string) (*database.Publication, error) {
	c.waitForRateLimit()

	publication, lookupErr := c.crossref.GetWorkByDOI(ctx, doi)
	if lookupErr != nil {
		return nil, lookupErr
	}

	if saveErr := c.repo.CreatePublication(ctx, publication); saveErr != nil {
		return nil, saveErr
	}
	return publication, nil
}
// waitForRateLimit blocks until at least c.rateLimit has elapsed since the
// previous request, then stamps the current time. The mutex is deliberately
// held across the sleep so that concurrent callers queue one at a time.
func (c *PublicationCrawler) waitForRateLimit() {
	c.mu.Lock()
	defer c.mu.Unlock()

	if wait := c.rateLimit - time.Since(c.lastRequest); wait > 0 {
		time.Sleep(wait)
	}
	c.lastRequest = time.Now()
}
// containsPub reports whether pub already appears in pubs, treating two
// entries as the same work when their DOIs match (both non-nil) or their
// titles are identical.
func containsPub(pubs []*database.Publication, pub *database.Publication) bool {
	for _, candidate := range pubs {
		sameDOI := pub.DOI != nil && candidate.DOI != nil && *pub.DOI == *candidate.DOI
		if sameDOI || pub.Title == candidate.Title {
			return true
		}
	}
	return false
}
// findAuthorPosition returns the 1-based position of the staff member in the
// publication's author list, matching case-insensitively on the staff last
// name. It returns 0 when the last name is empty or no author matches.
func findAuthorPosition(pub *database.Publication, staff *database.UniversityStaff) int {
	if staff.LastName == "" {
		return 0
	}
	for idx, authorName := range pub.Authors {
		if containsIgnoreCase(authorName, staff.LastName) {
			return idx + 1
		}
	}
	return 0
}
// containsIgnoreCase reports whether s contains substr, ignoring case.
//
// The previous hand-rolled implementation folded case byte-by-byte and only
// for ASCII letters, so names with non-ASCII characters (e.g. German umlauts:
// "MÜLLER" vs "müller") never matched. Delegating to the standard library
// gives correct Unicode case handling and is the idiomatic form.
func containsIgnoreCase(s, substr string) bool {
	return strings.Contains(strings.ToLower(s), strings.ToLower(substr))
}
// containsIgnoreCaseHelper scans s for a case-insensitive occurrence of
// substr by comparing every window of len(substr) bytes with equalFold.
func containsIgnoreCaseHelper(s, substr string) bool {
	limit := len(s) - len(substr)
	for start := 0; start <= limit; start++ {
		if equalFold(s[start:start+len(substr)], substr) {
			return true
		}
	}
	return false
}
// equalFold reports whether s1 and s2 are equal under case folding.
//
// Replaces a hand-rolled byte-wise comparison that only folded ASCII
// letters and therefore failed for non-ASCII characters (umlauts etc.).
// strings.EqualFold performs Unicode simple case folding and also handles
// strings of differing byte lengths correctly.
func equalFold(s1, s2 string) bool {
	return strings.EqualFold(s1, s2)
}