All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Successful in 1m45s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 21s
- edu-search-service von breakpilot-pwa nach breakpilot-lehrer kopiert (ohne vendor) - opensearch + edu-search-service in docker-compose.yml hinzugefuegt - voice-service aus docker-compose.yml entfernt (jetzt in breakpilot-core) - geo-service aus docker-compose.yml entfernt (nicht mehr benoetigt) - CI/CD: edu-search-service zu Gitea Actions und Woodpecker hinzugefuegt (Go lint, test mit go mod download, build, SBOM) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
370 lines
9.1 KiB
Go
370 lines
9.1 KiB
Go
package publications
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"io"
|
|
"net/http"
|
|
"net/url"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/breakpilot/edu-search-service/internal/database"
|
|
"github.com/google/uuid"
|
|
)
|
|
|
|
// CrossRefClient is a client for the CrossRef API
|
|
type CrossRefClient struct {
|
|
client *http.Client
|
|
baseURL string
|
|
userAgent string
|
|
email string // For polite pool access
|
|
}
|
|
|
|
// CrossRefResponse represents the top-level API response
|
|
type CrossRefResponse struct {
|
|
Status string `json:"status"`
|
|
MessageType string `json:"message-type"`
|
|
MessageVersion string `json:"message-version"`
|
|
Message CrossRefResult `json:"message"`
|
|
}
|
|
|
|
// CrossRefResult contains the actual results
|
|
type CrossRefResult struct {
|
|
TotalResults int `json:"total-results"`
|
|
Items []CrossRefWork `json:"items"`
|
|
Query *CrossRefQuery `json:"query,omitempty"`
|
|
}
|
|
|
|
// CrossRefQuery mirrors the "query" object of a search response.
type CrossRefQuery struct {
	StartIndex  int    `json:"start-index"`
	SearchTerms string `json:"search-terms"`
}
|
|
|
|
// CrossRefWork represents a single work/publication
|
|
type CrossRefWork struct {
|
|
DOI string `json:"DOI"`
|
|
Title []string `json:"title"`
|
|
ContainerTitle []string `json:"container-title"`
|
|
Publisher string `json:"publisher"`
|
|
Type string `json:"type"`
|
|
Author []CrossRefAuthor `json:"author"`
|
|
Issued CrossRefDate `json:"issued"`
|
|
PublishedPrint CrossRefDate `json:"published-print"`
|
|
Abstract string `json:"abstract"`
|
|
URL string `json:"URL"`
|
|
Link []CrossRefLink `json:"link"`
|
|
Subject []string `json:"subject"`
|
|
ISSN []string `json:"ISSN"`
|
|
ISBN []string `json:"ISBN"`
|
|
IsCitedByCount int `json:"is-referenced-by-count"`
|
|
}
|
|
|
|
// CrossRefAuthor is a single entry of a work's "author" list.
type CrossRefAuthor struct {
	Given       string `json:"given"`
	Family      string `json:"family"`
	ORCID       string `json:"ORCID"`
	Affiliation []struct {
		Name string `json:"name"`
	} `json:"affiliation"`
	// Sequence is "first" or "additional".
	Sequence string `json:"sequence"`
}
|
|
|
|
// CrossRefDate holds a CrossRef date in its "date-parts" form: a list of
// integer lists. convertToPub reads the first inner list as year, then month.
type CrossRefDate struct {
	DateParts [][]int `json:"date-parts"`
}
|
|
|
|
// CrossRefLink is one entry of a work's "link" list: a URL together with
// the content type served there.
type CrossRefLink struct {
	URL         string `json:"URL"`
	ContentType string `json:"content-type"`
}
|
|
|
|
// NewCrossRefClient creates a new CrossRef API client
|
|
func NewCrossRefClient(email string) *CrossRefClient {
|
|
return &CrossRefClient{
|
|
client: &http.Client{
|
|
Timeout: 30 * time.Second,
|
|
},
|
|
baseURL: "https://api.crossref.org",
|
|
userAgent: "BreakPilot-EduBot/1.0 (https://breakpilot.de; mailto:" + email + ")",
|
|
email: email,
|
|
}
|
|
}
|
|
|
|
// GetWorkByDOI retrieves a work by its DOI
|
|
func (c *CrossRefClient) GetWorkByDOI(ctx context.Context, doi string) (*database.Publication, error) {
|
|
// Clean DOI
|
|
doi = strings.TrimSpace(doi)
|
|
doi = strings.TrimPrefix(doi, "https://doi.org/")
|
|
doi = strings.TrimPrefix(doi, "http://doi.org/")
|
|
|
|
endpoint := fmt.Sprintf("%s/works/%s", c.baseURL, url.PathEscape(doi))
|
|
|
|
req, err := http.NewRequestWithContext(ctx, "GET", endpoint, nil)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
req.Header.Set("User-Agent", c.userAgent)
|
|
|
|
resp, err := c.client.Do(req)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode == http.StatusNotFound {
|
|
return nil, fmt.Errorf("DOI not found: %s", doi)
|
|
}
|
|
|
|
if resp.StatusCode != http.StatusOK {
|
|
return nil, fmt.Errorf("CrossRef API error: %d", resp.StatusCode)
|
|
}
|
|
|
|
body, err := io.ReadAll(resp.Body)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
var result struct {
|
|
Status string `json:"status"`
|
|
Message CrossRefWork `json:"message"`
|
|
}
|
|
|
|
if err := json.Unmarshal(body, &result); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return c.convertToPub(&result.Message), nil
|
|
}
|
|
|
|
// SearchByAuthor searches for publications by author name
|
|
func (c *CrossRefClient) SearchByAuthor(ctx context.Context, authorName string, limit int) ([]*database.Publication, error) {
|
|
if limit <= 0 {
|
|
limit = 20
|
|
}
|
|
|
|
endpoint := fmt.Sprintf("%s/works?query.author=%s&rows=%d&sort=published&order=desc",
|
|
c.baseURL, url.QueryEscape(authorName), limit)
|
|
|
|
return c.searchWorks(ctx, endpoint)
|
|
}
|
|
|
|
// SearchByAffiliation searches for publications by affiliation (university)
|
|
func (c *CrossRefClient) SearchByAffiliation(ctx context.Context, affiliation string, limit int) ([]*database.Publication, error) {
|
|
if limit <= 0 {
|
|
limit = 20
|
|
}
|
|
|
|
endpoint := fmt.Sprintf("%s/works?query.affiliation=%s&rows=%d&sort=published&order=desc",
|
|
c.baseURL, url.QueryEscape(affiliation), limit)
|
|
|
|
return c.searchWorks(ctx, endpoint)
|
|
}
|
|
|
|
// SearchByORCID searches for publications by ORCID
|
|
func (c *CrossRefClient) SearchByORCID(ctx context.Context, orcid string, limit int) ([]*database.Publication, error) {
|
|
if limit <= 0 {
|
|
limit = 100
|
|
}
|
|
|
|
// ORCID format: 0000-0000-0000-0000
|
|
orcid = strings.TrimPrefix(orcid, "https://orcid.org/")
|
|
|
|
endpoint := fmt.Sprintf("%s/works?filter=orcid:%s&rows=%d&sort=published&order=desc",
|
|
c.baseURL, url.QueryEscape(orcid), limit)
|
|
|
|
return c.searchWorks(ctx, endpoint)
|
|
}
|
|
|
|
// SearchByTitle searches for publications by title
|
|
func (c *CrossRefClient) SearchByTitle(ctx context.Context, title string, limit int) ([]*database.Publication, error) {
|
|
if limit <= 0 {
|
|
limit = 10
|
|
}
|
|
|
|
endpoint := fmt.Sprintf("%s/works?query.title=%s&rows=%d",
|
|
c.baseURL, url.QueryEscape(title), limit)
|
|
|
|
return c.searchWorks(ctx, endpoint)
|
|
}
|
|
|
|
// searchWorks performs a generic search
|
|
func (c *CrossRefClient) searchWorks(ctx context.Context, endpoint string) ([]*database.Publication, error) {
|
|
req, err := http.NewRequestWithContext(ctx, "GET", endpoint, nil)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
req.Header.Set("User-Agent", c.userAgent)
|
|
|
|
resp, err := c.client.Do(req)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode != http.StatusOK {
|
|
return nil, fmt.Errorf("CrossRef API error: %d", resp.StatusCode)
|
|
}
|
|
|
|
body, err := io.ReadAll(resp.Body)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
var result CrossRefResponse
|
|
if err := json.Unmarshal(body, &result); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
var pubs []*database.Publication
|
|
for _, work := range result.Message.Items {
|
|
pubs = append(pubs, c.convertToPub(&work))
|
|
}
|
|
|
|
return pubs, nil
|
|
}
|
|
|
|
// convertToPub converts a CrossRef work to our Publication model
|
|
func (c *CrossRefClient) convertToPub(work *CrossRefWork) *database.Publication {
|
|
pub := &database.Publication{
|
|
ID: uuid.New(),
|
|
CitationCount: work.IsCitedByCount,
|
|
CrawledAt: time.Now(),
|
|
}
|
|
|
|
// Title
|
|
if len(work.Title) > 0 {
|
|
pub.Title = work.Title[0]
|
|
}
|
|
|
|
// DOI
|
|
if work.DOI != "" {
|
|
pub.DOI = &work.DOI
|
|
}
|
|
|
|
// URL
|
|
if work.URL != "" {
|
|
pub.URL = &work.URL
|
|
}
|
|
|
|
// Abstract (clean HTML)
|
|
if work.Abstract != "" {
|
|
abstract := cleanHTML(work.Abstract)
|
|
pub.Abstract = &abstract
|
|
}
|
|
|
|
// Year
|
|
if len(work.Issued.DateParts) > 0 && len(work.Issued.DateParts[0]) > 0 {
|
|
year := work.Issued.DateParts[0][0]
|
|
pub.Year = &year
|
|
if len(work.Issued.DateParts[0]) > 1 {
|
|
month := work.Issued.DateParts[0][1]
|
|
pub.Month = &month
|
|
}
|
|
}
|
|
|
|
// Type
|
|
pubType := mapCrossRefType(work.Type)
|
|
pub.PubType = &pubType
|
|
|
|
// Venue
|
|
if len(work.ContainerTitle) > 0 {
|
|
venue := work.ContainerTitle[0]
|
|
pub.Venue = &venue
|
|
}
|
|
|
|
// Publisher
|
|
if work.Publisher != "" {
|
|
pub.Publisher = &work.Publisher
|
|
}
|
|
|
|
// ISBN
|
|
if len(work.ISBN) > 0 {
|
|
pub.ISBN = &work.ISBN[0]
|
|
}
|
|
|
|
// ISSN
|
|
if len(work.ISSN) > 0 {
|
|
pub.ISSN = &work.ISSN[0]
|
|
}
|
|
|
|
// Keywords/Subjects
|
|
if len(work.Subject) > 0 {
|
|
pub.Keywords = work.Subject
|
|
}
|
|
|
|
// PDF URL
|
|
for _, link := range work.Link {
|
|
if strings.Contains(link.ContentType, "pdf") {
|
|
pub.PDFURL = &link.URL
|
|
break
|
|
}
|
|
}
|
|
|
|
// Authors
|
|
var authors []string
|
|
for _, author := range work.Author {
|
|
name := strings.TrimSpace(author.Given + " " + author.Family)
|
|
if name != "" {
|
|
authors = append(authors, name)
|
|
}
|
|
}
|
|
pub.Authors = authors
|
|
|
|
// Source
|
|
source := "crossref"
|
|
pub.Source = &source
|
|
|
|
// Store raw data
|
|
rawData, _ := json.Marshal(work)
|
|
pub.RawData = rawData
|
|
|
|
return pub
|
|
}
|
|
|
|
// mapCrossRefType translates a CrossRef work type into the publication type
// name used by this service. Anything not listed maps to "other".
func mapCrossRefType(crType string) string {
	mapped := "other"

	switch crType {
	case "journal-article":
		mapped = "journal"
	case "proceedings-article", "conference-paper":
		mapped = "conference"
	case "book":
		mapped = "book"
	case "book-chapter":
		mapped = "book_chapter"
	case "dissertation":
		mapped = "thesis"
	case "posted-content":
		mapped = "preprint"
	}

	return mapped
}
|
|
|
|
// cleanHTML removes markup tags (HTML and JATS, with or without attributes)
// from text and collapses all runs of whitespace into single spaces.
//
// Opening tags are dropped without a replacement. Closing and self-closing
// tags are replaced by a space so that neighbouring paragraphs or titles do
// not run together, except for inline formatting tags (italic, bold, sub,
// sup, ...), which are dropped so that words such as "CO<sub>2</sub>" stay
// intact. A '<' that does not start a tag (e.g. "p < 0.05") is kept as is.
// Character entities such as "&amp;" are not decoded.
func cleanHTML(html string) string {
	var text strings.Builder
	text.Grow(len(html))

	rest := html
	for {
		lt := strings.IndexByte(rest, '<')
		if lt < 0 {
			text.WriteString(rest)
			break
		}
		text.WriteString(rest[:lt])

		after := rest[lt+1:]
		gt := strings.IndexByte(after, '>')
		if gt < 0 || !isMarkupTag(after[:gt]) {
			// Not a tag: keep the literal '<' and continue right after it.
			text.WriteByte('<')
			rest = after
			continue
		}
		if markupTagSeparatesText(after[:gt]) {
			text.WriteByte(' ')
		}
		rest = after[gt+1:]
	}

	// strings.Fields also trims leading and trailing whitespace.
	return strings.Join(strings.Fields(text.String()), " ")
}

// isMarkupTag reports whether tag, the text between '<' and '>', looks like
// an element tag: an optional '/' followed by a letter, with no nested '<'.
func isMarkupTag(tag string) bool {
	name := strings.TrimPrefix(tag, "/")
	if name == "" || strings.IndexByte(tag, '<') >= 0 {
		return false
	}
	first := name[0]
	return (first >= 'a' && first <= 'z') || (first >= 'A' && first <= 'Z')
}

// markupTagSeparatesText reports whether removing tag should leave a space
// behind: true for closing and self-closing tags of non-inline elements.
func markupTagSeparatesText(tag string) bool {
	if !strings.HasPrefix(tag, "/") && !strings.HasSuffix(tag, "/") {
		return false
	}

	// Reduce the tag to its lower-cased element name without "jats:" prefix.
	name := strings.TrimPrefix(tag, "/")
	if end := strings.IndexAny(name, " \t\r\n/"); end >= 0 {
		name = name[:end]
	}
	name = strings.TrimPrefix(strings.ToLower(name), "jats:")

	switch name {
	case "italic", "bold", "sub", "sup", "sc", "underline", "monospace",
		"i", "b", "em", "strong", "u", "span":
		return false
	}
	return true
}
|