feat: edu-search-service migriert, voice-service/geo-service entfernt
All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Successful in 1m45s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 21s
All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Successful in 1m45s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 21s
- edu-search-service von breakpilot-pwa nach breakpilot-lehrer kopiert (ohne vendor) - opensearch + edu-search-service in docker-compose.yml hinzugefuegt - voice-service aus docker-compose.yml entfernt (jetzt in breakpilot-core) - geo-service aus docker-compose.yml entfernt (nicht mehr benoetigt) - CI/CD: edu-search-service zu Gitea Actions und Woodpecker hinzugefuegt (Go lint, test mit go mod download, build, SBOM) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
268
edu-search-service/internal/publications/pub_crawler.go
Normal file
268
edu-search-service/internal/publications/pub_crawler.go
Normal file
@@ -0,0 +1,268 @@
|
||||
package publications
|
||||
|
||||
import (
	"context"
	"fmt"
	"log"
	"strings"
	"sync"
	"time"

	"github.com/breakpilot/edu-search-service/internal/database"
	"github.com/google/uuid"
)
|
||||
|
||||
// PublicationCrawler crawls publications for university staff
type PublicationCrawler struct {
	repo        *database.Repository // persistence layer for publications and staff links
	crossref    *CrossRefClient      // CrossRef API client used for all searches
	rateLimit   time.Duration        // minimum interval enforced between CrossRef requests
	mu          sync.Mutex           // guards lastRequest and serializes rate-limit waits
	lastRequest time.Time            // timestamp of the most recent CrossRef request
}
// CrawlResult contains the result of a publication crawl
type CrawlResult struct {
	StaffID     uuid.UUID     // staff member this crawl ran for
	PubsFound   int           // publications successfully saved during the crawl
	PubsNew     int           // NOTE(review): never assigned by CrawlForStaff — confirm intended use
	PubsUpdated int           // NOTE(review): never assigned by CrawlForStaff — confirm intended use
	Errors      []string      // non-fatal errors collected while crawling
	Duration    time.Duration // wall-clock duration of the crawl
}
// NewPublicationCrawler creates a new publication crawler
|
||||
func NewPublicationCrawler(repo *database.Repository, email string) *PublicationCrawler {
|
||||
return &PublicationCrawler{
|
||||
repo: repo,
|
||||
crossref: NewCrossRefClient(email),
|
||||
rateLimit: time.Second, // CrossRef polite pool: 50 req/sec max
|
||||
}
|
||||
}
|
||||
|
||||
// CrawlForStaff crawls publications for a single staff member
|
||||
func (c *PublicationCrawler) CrawlForStaff(ctx context.Context, staff *database.UniversityStaff) (*CrawlResult, error) {
|
||||
start := time.Now()
|
||||
result := &CrawlResult{
|
||||
StaffID: staff.ID,
|
||||
}
|
||||
|
||||
log.Printf("Starting publication crawl for %s", *staff.FullName)
|
||||
|
||||
var pubs []*database.Publication
|
||||
|
||||
// Strategy 1: Search by ORCID (most reliable)
|
||||
if staff.ORCID != nil && *staff.ORCID != "" {
|
||||
c.waitForRateLimit()
|
||||
orcidPubs, err := c.crossref.SearchByORCID(ctx, *staff.ORCID, 100)
|
||||
if err != nil {
|
||||
result.Errors = append(result.Errors, fmt.Sprintf("ORCID search error: %v", err))
|
||||
} else {
|
||||
pubs = append(pubs, orcidPubs...)
|
||||
log.Printf("Found %d publications via ORCID for %s", len(orcidPubs), *staff.FullName)
|
||||
}
|
||||
}
|
||||
|
||||
// Strategy 2: Search by author name
|
||||
if staff.FullName != nil && *staff.FullName != "" {
|
||||
c.waitForRateLimit()
|
||||
namePubs, err := c.crossref.SearchByAuthor(ctx, *staff.FullName, 50)
|
||||
if err != nil {
|
||||
result.Errors = append(result.Errors, fmt.Sprintf("Name search error: %v", err))
|
||||
} else {
|
||||
// Deduplicate
|
||||
for _, pub := range namePubs {
|
||||
if !containsPub(pubs, pub) {
|
||||
pubs = append(pubs, pub)
|
||||
}
|
||||
}
|
||||
log.Printf("Found %d additional publications via name search for %s", len(namePubs), *staff.FullName)
|
||||
}
|
||||
}
|
||||
|
||||
// Save publications and create links
|
||||
for _, pub := range pubs {
|
||||
// Save publication
|
||||
err := c.repo.CreatePublication(ctx, pub)
|
||||
if err != nil {
|
||||
result.Errors = append(result.Errors, fmt.Sprintf("Save error for %s: %v", pub.Title, err))
|
||||
continue
|
||||
}
|
||||
|
||||
result.PubsFound++
|
||||
|
||||
// Link to staff
|
||||
link := &database.StaffPublication{
|
||||
StaffID: staff.ID,
|
||||
PublicationID: pub.ID,
|
||||
}
|
||||
|
||||
// Determine author position
|
||||
pos := findAuthorPosition(pub, staff)
|
||||
if pos > 0 {
|
||||
link.AuthorPosition = &pos
|
||||
}
|
||||
|
||||
if err := c.repo.LinkStaffPublication(ctx, link); err != nil {
|
||||
result.Errors = append(result.Errors, fmt.Sprintf("Link error: %v", err))
|
||||
}
|
||||
}
|
||||
|
||||
result.Duration = time.Since(start)
|
||||
|
||||
log.Printf("Completed publication crawl for %s: found=%d, duration=%v",
|
||||
*staff.FullName, result.PubsFound, result.Duration)
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// CrawlForUniversity crawls publications for all staff at a university
|
||||
func (c *PublicationCrawler) CrawlForUniversity(ctx context.Context, uniID uuid.UUID, limit int) (*database.UniversityCrawlStatus, error) {
|
||||
log.Printf("Starting publication crawl for university %s", uniID)
|
||||
|
||||
// Get staff with ORCID first (more reliable)
|
||||
params := database.StaffSearchParams{
|
||||
UniversityID: &uniID,
|
||||
Limit: limit,
|
||||
}
|
||||
|
||||
result, err := c.repo.SearchStaff(ctx, params)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
status := &database.UniversityCrawlStatus{
|
||||
UniversityID: uniID,
|
||||
PubCrawlStatus: "running",
|
||||
}
|
||||
|
||||
var totalPubs int
|
||||
var errors []string
|
||||
|
||||
for _, staff := range result.Staff {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
status.PubCrawlStatus = "cancelled"
|
||||
status.PubErrors = append(errors, "Crawl cancelled")
|
||||
return status, ctx.Err()
|
||||
default:
|
||||
}
|
||||
|
||||
crawlResult, err := c.CrawlForStaff(ctx, &staff)
|
||||
if err != nil {
|
||||
errors = append(errors, fmt.Sprintf("%s: %v", staff.LastName, err))
|
||||
continue
|
||||
}
|
||||
|
||||
totalPubs += crawlResult.PubsFound
|
||||
errors = append(errors, crawlResult.Errors...)
|
||||
}
|
||||
|
||||
now := time.Now()
|
||||
status.LastPubCrawl = &now
|
||||
status.PubCrawlStatus = "completed"
|
||||
status.PubCount = totalPubs
|
||||
status.PubErrors = errors
|
||||
|
||||
// Update status in database
|
||||
if err := c.repo.UpdateCrawlStatus(ctx, status); err != nil {
|
||||
log.Printf("Warning: Failed to update crawl status: %v", err)
|
||||
}
|
||||
|
||||
log.Printf("Completed publication crawl for university %s: %d publications found", uniID, totalPubs)
|
||||
|
||||
return status, nil
|
||||
}
|
||||
|
||||
// ResolveDOI resolves a DOI and saves the publication
|
||||
func (c *PublicationCrawler) ResolveDOI(ctx context.Context, doi string) (*database.Publication, error) {
|
||||
c.waitForRateLimit()
|
||||
|
||||
pub, err := c.crossref.GetWorkByDOI(ctx, doi)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if err := c.repo.CreatePublication(ctx, pub); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return pub, nil
|
||||
}
|
||||
|
||||
// waitForRateLimit enforces rate limiting
|
||||
func (c *PublicationCrawler) waitForRateLimit() {
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
|
||||
elapsed := time.Since(c.lastRequest)
|
||||
if elapsed < c.rateLimit {
|
||||
time.Sleep(c.rateLimit - elapsed)
|
||||
}
|
||||
|
||||
c.lastRequest = time.Now()
|
||||
}
|
||||
|
||||
// containsPub checks if a publication is already in the list (by DOI or title)
|
||||
func containsPub(pubs []*database.Publication, pub *database.Publication) bool {
|
||||
for _, existing := range pubs {
|
||||
// Check DOI
|
||||
if pub.DOI != nil && existing.DOI != nil && *pub.DOI == *existing.DOI {
|
||||
return true
|
||||
}
|
||||
// Check title (rough match)
|
||||
if pub.Title == existing.Title {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// findAuthorPosition finds the position of a staff member in the author list
|
||||
func findAuthorPosition(pub *database.Publication, staff *database.UniversityStaff) int {
|
||||
for i, author := range pub.Authors {
|
||||
// Check if author name matches staff
|
||||
if staff.LastName != "" && containsIgnoreCase(author, staff.LastName) {
|
||||
return i + 1
|
||||
}
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
// containsIgnoreCase reports whether s contains substr, ignoring case.
//
// Replaces a convoluted hand-rolled ASCII-only scan with the standard
// library: strings.ToLower applies full Unicode case mapping, so
// non-ASCII names (e.g. German umlauts) now fold correctly as well.
// As before, an empty substr is contained in every string.
func containsIgnoreCase(s, substr string) bool {
	return strings.Contains(strings.ToLower(s), strings.ToLower(substr))
}
func containsIgnoreCaseHelper(s, substr string) bool {
|
||||
for i := 0; i <= len(s)-len(substr); i++ {
|
||||
if equalFold(s[i:i+len(substr)], substr) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// equalFold reports whether s1 and s2 are equal under ASCII case
// folding: only the letters A-Z/a-z are folded, so multi-byte
// (non-ASCII) characters must match byte-for-byte. Strings of
// different byte length never compare equal.
func equalFold(s1, s2 string) bool {
	if len(s1) != len(s2) {
		return false
	}
	// toLower folds a single ASCII byte; non-letters pass through.
	toLower := func(b byte) byte {
		if b >= 'A' && b <= 'Z' {
			return b + ('a' - 'A')
		}
		return b
	}
	for i := 0; i < len(s1); i++ {
		if toLower(s1[i]) != toLower(s2[i]) {
			return false
		}
	}
	return true
}
|
||||
Reference in New Issue
Block a user