All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Successful in 1m45s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 21s
- edu-search-service von breakpilot-pwa nach breakpilot-lehrer kopiert (ohne vendor) - opensearch + edu-search-service in docker-compose.yml hinzugefuegt - voice-service aus docker-compose.yml entfernt (jetzt in breakpilot-core) - geo-service aus docker-compose.yml entfernt (nicht mehr benoetigt) - CI/CD: edu-search-service zu Gitea Actions und Woodpecker hinzugefuegt (Go lint, test mit go mod download, build, SBOM) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
269 lines
6.7 KiB
Go
269 lines
6.7 KiB
Go
package publications
|
|
|
|
import (
	"context"
	"fmt"
	"log"
	"strings"
	"sync"
	"time"

	"github.com/breakpilot/edu-search-service/internal/database"
	"github.com/google/uuid"
)
|
|
|
|
// PublicationCrawler crawls publications for university staff by querying
// CrossRef and persisting results through the repository.
type PublicationCrawler struct {
	repo *database.Repository // persistence for publications and staff links
	crossref *CrossRefClient // CrossRef API client, identified by contact email
	rateLimit time.Duration // minimum delay enforced between CrossRef requests
	mu sync.Mutex // guards lastRequest (crawler may be shared across goroutines)
	lastRequest time.Time // time of the most recent CrossRef request
}
|
|
|
|
// CrawlResult contains the result of a publication crawl for one staff member.
type CrawlResult struct {
	StaffID uuid.UUID // staff member the crawl ran for
	PubsFound int // publications successfully saved during this crawl
	PubsNew int // NOTE(review): never written by CrawlForStaff in this file — confirm intended
	PubsUpdated int // NOTE(review): never written by CrawlForStaff in this file — confirm intended
	Errors []string // non-fatal, per-item errors collected during the crawl
	Duration time.Duration // wall-clock duration of the crawl
}
|
|
|
|
// NewPublicationCrawler creates a new publication crawler
|
|
func NewPublicationCrawler(repo *database.Repository, email string) *PublicationCrawler {
|
|
return &PublicationCrawler{
|
|
repo: repo,
|
|
crossref: NewCrossRefClient(email),
|
|
rateLimit: time.Second, // CrossRef polite pool: 50 req/sec max
|
|
}
|
|
}
|
|
|
|
// CrawlForStaff crawls publications for a single staff member
|
|
func (c *PublicationCrawler) CrawlForStaff(ctx context.Context, staff *database.UniversityStaff) (*CrawlResult, error) {
|
|
start := time.Now()
|
|
result := &CrawlResult{
|
|
StaffID: staff.ID,
|
|
}
|
|
|
|
log.Printf("Starting publication crawl for %s", *staff.FullName)
|
|
|
|
var pubs []*database.Publication
|
|
|
|
// Strategy 1: Search by ORCID (most reliable)
|
|
if staff.ORCID != nil && *staff.ORCID != "" {
|
|
c.waitForRateLimit()
|
|
orcidPubs, err := c.crossref.SearchByORCID(ctx, *staff.ORCID, 100)
|
|
if err != nil {
|
|
result.Errors = append(result.Errors, fmt.Sprintf("ORCID search error: %v", err))
|
|
} else {
|
|
pubs = append(pubs, orcidPubs...)
|
|
log.Printf("Found %d publications via ORCID for %s", len(orcidPubs), *staff.FullName)
|
|
}
|
|
}
|
|
|
|
// Strategy 2: Search by author name
|
|
if staff.FullName != nil && *staff.FullName != "" {
|
|
c.waitForRateLimit()
|
|
namePubs, err := c.crossref.SearchByAuthor(ctx, *staff.FullName, 50)
|
|
if err != nil {
|
|
result.Errors = append(result.Errors, fmt.Sprintf("Name search error: %v", err))
|
|
} else {
|
|
// Deduplicate
|
|
for _, pub := range namePubs {
|
|
if !containsPub(pubs, pub) {
|
|
pubs = append(pubs, pub)
|
|
}
|
|
}
|
|
log.Printf("Found %d additional publications via name search for %s", len(namePubs), *staff.FullName)
|
|
}
|
|
}
|
|
|
|
// Save publications and create links
|
|
for _, pub := range pubs {
|
|
// Save publication
|
|
err := c.repo.CreatePublication(ctx, pub)
|
|
if err != nil {
|
|
result.Errors = append(result.Errors, fmt.Sprintf("Save error for %s: %v", pub.Title, err))
|
|
continue
|
|
}
|
|
|
|
result.PubsFound++
|
|
|
|
// Link to staff
|
|
link := &database.StaffPublication{
|
|
StaffID: staff.ID,
|
|
PublicationID: pub.ID,
|
|
}
|
|
|
|
// Determine author position
|
|
pos := findAuthorPosition(pub, staff)
|
|
if pos > 0 {
|
|
link.AuthorPosition = &pos
|
|
}
|
|
|
|
if err := c.repo.LinkStaffPublication(ctx, link); err != nil {
|
|
result.Errors = append(result.Errors, fmt.Sprintf("Link error: %v", err))
|
|
}
|
|
}
|
|
|
|
result.Duration = time.Since(start)
|
|
|
|
log.Printf("Completed publication crawl for %s: found=%d, duration=%v",
|
|
*staff.FullName, result.PubsFound, result.Duration)
|
|
|
|
return result, nil
|
|
}
|
|
|
|
// CrawlForUniversity crawls publications for all staff at a university
|
|
func (c *PublicationCrawler) CrawlForUniversity(ctx context.Context, uniID uuid.UUID, limit int) (*database.UniversityCrawlStatus, error) {
|
|
log.Printf("Starting publication crawl for university %s", uniID)
|
|
|
|
// Get staff with ORCID first (more reliable)
|
|
params := database.StaffSearchParams{
|
|
UniversityID: &uniID,
|
|
Limit: limit,
|
|
}
|
|
|
|
result, err := c.repo.SearchStaff(ctx, params)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
status := &database.UniversityCrawlStatus{
|
|
UniversityID: uniID,
|
|
PubCrawlStatus: "running",
|
|
}
|
|
|
|
var totalPubs int
|
|
var errors []string
|
|
|
|
for _, staff := range result.Staff {
|
|
select {
|
|
case <-ctx.Done():
|
|
status.PubCrawlStatus = "cancelled"
|
|
status.PubErrors = append(errors, "Crawl cancelled")
|
|
return status, ctx.Err()
|
|
default:
|
|
}
|
|
|
|
crawlResult, err := c.CrawlForStaff(ctx, &staff)
|
|
if err != nil {
|
|
errors = append(errors, fmt.Sprintf("%s: %v", staff.LastName, err))
|
|
continue
|
|
}
|
|
|
|
totalPubs += crawlResult.PubsFound
|
|
errors = append(errors, crawlResult.Errors...)
|
|
}
|
|
|
|
now := time.Now()
|
|
status.LastPubCrawl = &now
|
|
status.PubCrawlStatus = "completed"
|
|
status.PubCount = totalPubs
|
|
status.PubErrors = errors
|
|
|
|
// Update status in database
|
|
if err := c.repo.UpdateCrawlStatus(ctx, status); err != nil {
|
|
log.Printf("Warning: Failed to update crawl status: %v", err)
|
|
}
|
|
|
|
log.Printf("Completed publication crawl for university %s: %d publications found", uniID, totalPubs)
|
|
|
|
return status, nil
|
|
}
|
|
|
|
// ResolveDOI resolves a DOI and saves the publication
|
|
func (c *PublicationCrawler) ResolveDOI(ctx context.Context, doi string) (*database.Publication, error) {
|
|
c.waitForRateLimit()
|
|
|
|
pub, err := c.crossref.GetWorkByDOI(ctx, doi)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if err := c.repo.CreatePublication(ctx, pub); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return pub, nil
|
|
}
|
|
|
|
// waitForRateLimit enforces rate limiting
|
|
func (c *PublicationCrawler) waitForRateLimit() {
|
|
c.mu.Lock()
|
|
defer c.mu.Unlock()
|
|
|
|
elapsed := time.Since(c.lastRequest)
|
|
if elapsed < c.rateLimit {
|
|
time.Sleep(c.rateLimit - elapsed)
|
|
}
|
|
|
|
c.lastRequest = time.Now()
|
|
}
|
|
|
|
// containsPub checks if a publication is already in the list (by DOI or title)
|
|
func containsPub(pubs []*database.Publication, pub *database.Publication) bool {
|
|
for _, existing := range pubs {
|
|
// Check DOI
|
|
if pub.DOI != nil && existing.DOI != nil && *pub.DOI == *existing.DOI {
|
|
return true
|
|
}
|
|
// Check title (rough match)
|
|
if pub.Title == existing.Title {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// findAuthorPosition finds the position of a staff member in the author list
|
|
func findAuthorPosition(pub *database.Publication, staff *database.UniversityStaff) int {
|
|
for i, author := range pub.Authors {
|
|
// Check if author name matches staff
|
|
if staff.LastName != "" && containsIgnoreCase(author, staff.LastName) {
|
|
return i + 1
|
|
}
|
|
}
|
|
return 0
|
|
}
|
|
|
|
// containsIgnoreCase reports whether s contains substr, ignoring case.
// An empty substr always matches.
//
// Uses the standard library instead of the previous hand-rolled expression,
// which was both convoluted and ASCII-only; strings.ToLower extends the
// match to Unicode letters as well.
func containsIgnoreCase(s, substr string) bool {
	return strings.Contains(strings.ToLower(s), strings.ToLower(substr))
}
|
|
|
|
func containsIgnoreCaseHelper(s, substr string) bool {
|
|
for i := 0; i <= len(s)-len(substr); i++ {
|
|
if equalFold(s[i:i+len(substr)], substr) {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
func equalFold(s1, s2 string) bool {
|
|
if len(s1) != len(s2) {
|
|
return false
|
|
}
|
|
for i := 0; i < len(s1); i++ {
|
|
c1, c2 := s1[i], s2[i]
|
|
if c1 != c2 {
|
|
// Simple ASCII case folding
|
|
if c1 >= 'A' && c1 <= 'Z' {
|
|
c1 += 'a' - 'A'
|
|
}
|
|
if c2 >= 'A' && c2 <= 'Z' {
|
|
c2 += 'a' - 'A'
|
|
}
|
|
if c1 != c2 {
|
|
return false
|
|
}
|
|
}
|
|
}
|
|
return true
|
|
}
|