Files
breakpilot-lehrer/edu-search-service/internal/publications/crossref_client.go
Benjamin Boenisch 414e0f5ec0
All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Successful in 1m45s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 21s
feat: edu-search-service migriert, voice-service/geo-service entfernt
- edu-search-service von breakpilot-pwa nach breakpilot-lehrer kopiert (ohne vendor)
- opensearch + edu-search-service in docker-compose.yml hinzugefuegt
- voice-service aus docker-compose.yml entfernt (jetzt in breakpilot-core)
- geo-service aus docker-compose.yml entfernt (nicht mehr benoetigt)
- CI/CD: edu-search-service zu Gitea Actions und Woodpecker hinzugefuegt
  (Go lint, test mit go mod download, build, SBOM)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-15 18:36:38 +01:00

370 lines
9.1 KiB
Go

package publications
import (
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"net/url"
"strings"
"time"
"github.com/breakpilot/edu-search-service/internal/database"
"github.com/google/uuid"
)
// CrossRefClient is a client for the CrossRef REST API.
// Use NewCrossRefClient to obtain an instance with every field populated.
type CrossRefClient struct {
// client performs the outgoing HTTP requests.
client *http.Client
// baseURL is the API root that endpoint paths such as "/works" are appended to.
baseURL string
// userAgent is sent as the User-Agent header on every request.
userAgent string
email string // Contact address, also embedded in userAgent; for polite pool access
}
// CrossRefResponse represents the top-level envelope of a CrossRef list
// response. searchWorks decodes search results into this type and reads
// only Message.Items.
type CrossRefResponse struct {
Status string `json:"status"`
MessageType string `json:"message-type"`
MessageVersion string `json:"message-version"`
// Message carries the actual result payload.
Message CrossRefResult `json:"message"`
}
// CrossRefResult contains the actual results of a list/search response.
type CrossRefResult struct {
// TotalResults is decoded from "total-results".
// NOTE(review): presumably the overall match count rather than len(Items) — confirm.
TotalResults int `json:"total-results"`
// Items holds the works returned for this request.
Items []CrossRefWork `json:"items"`
// Query is optional and stays nil when the response has no "query" object.
Query *CrossRefQuery `json:"query,omitempty"`
}
// CrossRefQuery contains the query information found in the "query"
// object of a list response.
type CrossRefQuery struct {
StartIndex int `json:"start-index"`
SearchTerms string `json:"search-terms"`
}
// CrossRefWork represents a single work/publication as returned by the
// CrossRef API. convertToPub maps it onto database.Publication and also
// stores the re-marshalled struct as the publication's raw data.
type CrossRefWork struct {
DOI string `json:"DOI"`
// Title is a list; convertToPub uses only the first entry.
Title []string `json:"title"`
// ContainerTitle is a list; its first entry becomes the publication venue.
ContainerTitle []string `json:"container-title"`
Publisher string `json:"publisher"`
// Type is the CrossRef work type (e.g. "journal-article"); see mapCrossRefType.
Type string `json:"type"`
Author []CrossRefAuthor `json:"author"`
// Issued supplies the publication year and, when present, month.
Issued CrossRefDate `json:"issued"`
// PublishedPrint is decoded but not read by convertToPub.
PublishedPrint CrossRefDate `json:"published-print"`
// Abstract may contain JATS/HTML markup; convertToPub passes it through cleanHTML.
Abstract string `json:"abstract"`
URL string `json:"URL"`
// Link is scanned for the first entry whose content type contains "pdf".
Link []CrossRefLink `json:"link"`
// Subject is copied to the publication's keywords.
Subject []string `json:"subject"`
// ISSN and ISBN are lists; only the first entry of each is used.
ISSN []string `json:"ISSN"`
ISBN []string `json:"ISBN"`
// IsCitedByCount is decoded from "is-referenced-by-count" and becomes the citation count.
IsCitedByCount int `json:"is-referenced-by-count"`
}
// CrossRefAuthor represents an author of a work. convertToPub joins Given
// and Family into a single display name; the other fields are only decoded.
type CrossRefAuthor struct {
Given string `json:"given"`
Family string `json:"family"`
ORCID string `json:"ORCID"`
// Affiliation lists the institutions attached to this author entry.
Affiliation []struct {
Name string `json:"name"`
} `json:"affiliation"`
Sequence string `json:"sequence"` // "first" or "additional"
}
// CrossRefDate represents a date in CrossRef's "date-parts" encoding.
// convertToPub reads DateParts[0][0] as the year and, if present,
// DateParts[0][1] as the month.
type CrossRefDate struct {
DateParts [][]int `json:"date-parts"`
}
// CrossRefLink represents a link to the work. convertToPub treats the
// first link whose ContentType contains "pdf" as the PDF URL.
type CrossRefLink struct {
URL string `json:"URL"`
ContentType string `json:"content-type"`
}
// NewCrossRefClient builds a CrossRef API client that talks to
// https://api.crossref.org with a 30-second HTTP timeout. The given email
// is stored on the client and embedded as a mailto: contact in the
// User-Agent string.
func NewCrossRefClient(email string) *CrossRefClient {
	httpClient := &http.Client{Timeout: 30 * time.Second}
	agent := "BreakPilot-EduBot/1.0 (https://breakpilot.de; mailto:" + email + ")"

	crossRef := new(CrossRefClient)
	crossRef.client = httpClient
	crossRef.baseURL = "https://api.crossref.org"
	crossRef.userAgent = agent
	crossRef.email = email
	return crossRef
}
// GetWorkByDOI retrieves a single work by its DOI and converts it to a
// Publication.
//
// doi may be a bare DOI or a doi.org URL; surrounding whitespace and a
// leading "https://doi.org/" or "http://doi.org/" are stripped first.
//
// It returns an error when the cleaned DOI is empty, when the request
// cannot be built or sent, when CrossRef answers 404 ("DOI not found") or
// any other non-200 status, or when the response body cannot be read or
// decoded.
func (c *CrossRefClient) GetWorkByDOI(ctx context.Context, doi string) (*database.Publication, error) {
	// Clean DOI
	doi = strings.TrimSpace(doi)
	doi = strings.TrimPrefix(doi, "https://doi.org/")
	doi = strings.TrimPrefix(doi, "http://doi.org/")
	if doi == "" {
		// An empty DOI would turn the URL into the "/works/" list endpoint,
		// whose payload decodes into an empty work instead of failing.
		return nil, fmt.Errorf("DOI must not be empty")
	}

	endpoint := fmt.Sprintf("%s/works/%s", c.baseURL, url.PathEscape(doi))
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
	if err != nil {
		return nil, fmt.Errorf("building CrossRef request for DOI %q: %w", doi, err)
	}
	req.Header.Set("User-Agent", c.userAgent)

	resp, err := c.client.Do(req)
	if err != nil {
		return nil, fmt.Errorf("requesting DOI %q from CrossRef: %w", doi, err)
	}
	defer resp.Body.Close()

	switch resp.StatusCode {
	case http.StatusOK:
		// Fall through to decoding below.
	case http.StatusNotFound:
		return nil, fmt.Errorf("DOI not found: %s", doi)
	default:
		return nil, fmt.Errorf("CrossRef API error: %d", resp.StatusCode)
	}

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("reading CrossRef response for DOI %q: %w", doi, err)
	}

	// A single-work response wraps the work directly in "message".
	var result struct {
		Status  string       `json:"status"`
		Message CrossRefWork `json:"message"`
	}
	if err := json.Unmarshal(body, &result); err != nil {
		return nil, fmt.Errorf("decoding CrossRef response for DOI %q: %w", doi, err)
	}
	return c.convertToPub(&result.Message), nil
}
// SearchByAuthor looks up publications whose author matches authorName,
// sorted by publication date, newest first. A non-positive limit falls
// back to 20 rows.
func (c *CrossRefClient) SearchByAuthor(ctx context.Context, authorName string, limit int) ([]*database.Publication, error) {
	rows := limit
	if rows < 1 {
		rows = 20
	}
	escaped := url.QueryEscape(authorName)
	return c.searchWorks(ctx, fmt.Sprintf(
		"%s/works?query.author=%s&rows=%d&sort=published&order=desc",
		c.baseURL, escaped, rows,
	))
}
// SearchByAffiliation looks up publications by author affiliation (for
// example a university name), sorted by publication date, newest first.
// A non-positive limit falls back to 20 rows.
func (c *CrossRefClient) SearchByAffiliation(ctx context.Context, affiliation string, limit int) ([]*database.Publication, error) {
	rows := limit
	if rows < 1 {
		rows = 20
	}
	escaped := url.QueryEscape(affiliation)
	return c.searchWorks(ctx, fmt.Sprintf(
		"%s/works?query.affiliation=%s&rows=%d&sort=published&order=desc",
		c.baseURL, escaped, rows,
	))
}
// SearchByORCID searches for publications attributed to an ORCID iD,
// sorted by publication date, newest first. A non-positive limit falls
// back to 100 rows.
//
// orcid may be a bare iD ("0000-0000-0000-0000") or an orcid.org URL;
// surrounding whitespace and a leading "https://orcid.org/" or
// "http://orcid.org/" are stripped, mirroring the DOI cleanup in
// GetWorkByDOI. An empty iD is rejected without contacting the API.
func (c *CrossRefClient) SearchByORCID(ctx context.Context, orcid string, limit int) ([]*database.Publication, error) {
	if limit <= 0 {
		limit = 100
	}

	// ORCID format: 0000-0000-0000-0000
	orcid = strings.TrimSpace(orcid)
	orcid = strings.TrimPrefix(orcid, "https://orcid.org/")
	orcid = strings.TrimPrefix(orcid, "http://orcid.org/")
	if orcid == "" {
		// "filter=orcid:" with no value is not a meaningful query.
		return nil, fmt.Errorf("ORCID must not be empty")
	}

	endpoint := fmt.Sprintf("%s/works?filter=orcid:%s&rows=%d&sort=published&order=desc",
		c.baseURL, url.QueryEscape(orcid), limit)
	return c.searchWorks(ctx, endpoint)
}
// SearchByTitle looks up publications whose title matches the given text.
// No explicit sort order is requested. A non-positive limit falls back to
// 10 rows.
func (c *CrossRefClient) SearchByTitle(ctx context.Context, title string, limit int) ([]*database.Publication, error) {
	rows := limit
	if rows < 1 {
		rows = 10
	}
	escaped := url.QueryEscape(title)
	return c.searchWorks(ctx, fmt.Sprintf(
		"%s/works?query.title=%s&rows=%d",
		c.baseURL, escaped, rows,
	))
}
// searchWorks performs a GET against a fully built CrossRef list endpoint
// and converts every returned item to a Publication.
//
// It returns an error when the request cannot be built or sent, when the
// API answers with a non-200 status, or when the body cannot be read or
// decoded. A successful response without items yields a nil slice.
func (c *CrossRefClient) searchWorks(ctx context.Context, endpoint string) ([]*database.Publication, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
	if err != nil {
		return nil, fmt.Errorf("building CrossRef search request: %w", err)
	}
	req.Header.Set("User-Agent", c.userAgent)

	resp, err := c.client.Do(req)
	if err != nil {
		return nil, fmt.Errorf("querying CrossRef: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("CrossRef API error: %d", resp.StatusCode)
	}

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("reading CrossRef search response: %w", err)
	}

	var result CrossRefResponse
	if err := json.Unmarshal(body, &result); err != nil {
		return nil, fmt.Errorf("decoding CrossRef search response: %w", err)
	}

	items := result.Message.Items
	var pubs []*database.Publication
	if len(items) > 0 {
		pubs = make([]*database.Publication, 0, len(items))
	}
	// Index the slice instead of ranging by value: convertToPub receives a
	// pointer, and with pre-Go-1.22 loop semantics the address of a range
	// variable is shared by every iteration, so anything that keeps
	// pointers into it would end up aliasing the last item.
	for i := range items {
		pubs = append(pubs, c.convertToPub(&items[i]))
	}
	return pubs, nil
}
// convertToPub converts a CrossRef work to our Publication model.
//
// Every call produces a Publication with a fresh UUID, CrawledAt set to
// the current time, and Source "crossref". Optional fields are set only
// when the work carries a value. String values are copied before their
// address is stored, so the result never points back into work and stays
// valid if the caller reuses or overwrites that struct.
func (c *CrossRefClient) convertToPub(work *CrossRefWork) *database.Publication {
	pub := &database.Publication{
		ID:            uuid.New(),
		CitationCount: work.IsCitedByCount,
		CrawledAt:     time.Now(),
	}

	// Title: only the first entry is used.
	if len(work.Title) > 0 {
		pub.Title = work.Title[0]
	}

	// DOI
	if work.DOI != "" {
		doi := work.DOI
		pub.DOI = &doi
	}

	// URL
	if work.URL != "" {
		workURL := work.URL
		pub.URL = &workURL
	}

	// Abstract (markup removed)
	if work.Abstract != "" {
		abstract := cleanHTML(work.Abstract)
		pub.Abstract = &abstract
	}

	// Year and month. A JSON null inside date-parts decodes to 0, which is
	// treated as "unknown" rather than stored as year or month 0.
	if len(work.Issued.DateParts) > 0 {
		parts := work.Issued.DateParts[0]
		if len(parts) > 0 && parts[0] != 0 {
			year := parts[0]
			pub.Year = &year
			if len(parts) > 1 && parts[1] != 0 {
				month := parts[1]
				pub.Month = &month
			}
		}
	}

	// Type
	pubType := mapCrossRefType(work.Type)
	pub.PubType = &pubType

	// Venue: first container title.
	if len(work.ContainerTitle) > 0 {
		venue := work.ContainerTitle[0]
		pub.Venue = &venue
	}

	// Publisher
	if work.Publisher != "" {
		publisher := work.Publisher
		pub.Publisher = &publisher
	}

	// ISBN: first entry only.
	if len(work.ISBN) > 0 {
		isbn := work.ISBN[0]
		pub.ISBN = &isbn
	}

	// ISSN: first entry only.
	if len(work.ISSN) > 0 {
		issn := work.ISSN[0]
		pub.ISSN = &issn
	}

	// Keywords/Subjects
	if len(work.Subject) > 0 {
		pub.Keywords = work.Subject
	}

	// PDF URL: first link whose content type mentions "pdf".
	for i := range work.Link {
		if strings.Contains(work.Link[i].ContentType, "pdf") {
			pdfURL := work.Link[i].URL
			pub.PDFURL = &pdfURL
			break
		}
	}

	// Authors: "Given Family", skipping entries with neither part. The
	// slice stays nil when the work lists no usable author.
	var authors []string
	for i := range work.Author {
		name := strings.TrimSpace(work.Author[i].Given + " " + work.Author[i].Family)
		if name != "" {
			authors = append(authors, name)
		}
	}
	pub.Authors = authors

	// Source
	source := "crossref"
	pub.Source = &source

	// Store raw data. Raw data is best-effort: on a marshalling error the
	// field is simply left unset, as before.
	if rawData, err := json.Marshal(work); err == nil {
		pub.RawData = rawData
	}

	return pub
}
// mapCrossRefType translates a CrossRef work type into the publication
// type used by this service. Anything unrecognised, including the empty
// string, becomes "other".
func mapCrossRefType(crType string) string {
	mapped := "other"
	switch crType {
	case "journal-article":
		mapped = "journal"
	case "proceedings-article", "conference-paper":
		mapped = "conference"
	case "book":
		mapped = "book"
	case "book-chapter":
		mapped = "book_chapter"
	case "dissertation":
		mapped = "thesis"
	case "posted-content":
		mapped = "preprint"
	}
	return mapped
}
// cleanHTML removes JATS/HTML tags from text and collapses runs of
// whitespace into single spaces.
//
// Any well-formed tag is dropped, not just a fixed list. Closing and
// self-closing tags of non-inline elements (paragraphs, titles, sections,
// ...) are replaced by a space so adjacent blocks do not run together,
// while inline formatting tags (italic, bold, sub, sup, ...) vanish
// without a gap so "H<sub>2</sub>O" becomes "H2O". A '<' that does not
// start a tag, as in "p < 0.05", is kept as text. Character entities such
// as "&amp;" are left untouched.
func cleanHTML(html string) string {
	var sb strings.Builder
	sb.Grow(len(html))

	rest := html
	for {
		open := strings.IndexByte(rest, '<')
		if open < 0 {
			sb.WriteString(rest)
			break
		}
		sb.WriteString(rest[:open])
		rest = rest[open:]

		end := markupTagEnd(rest)
		if end < 0 {
			// Not markup: keep the literal '<' and continue after it.
			sb.WriteByte('<')
			rest = rest[1:]
			continue
		}
		if markupTagBreaksText(rest[:end+1]) {
			sb.WriteByte(' ')
		}
		rest = rest[end+1:]
	}

	// Collapse whitespace; Fields also discards leading/trailing space.
	return strings.Join(strings.Fields(sb.String()), " ")
}

// markupTagEnd reports the index of the '>' that closes the tag starting
// at s[0] (which must be '<'), or -1 when s does not begin with a tag.
func markupTagEnd(s string) int {
	i := 1
	if i < len(s) && s[i] == '/' {
		i++
	}
	if i >= len(s) {
		return -1
	}
	// A tag name has to start with an ASCII letter.
	if lower := s[i] | 0x20; lower < 'a' || lower > 'z' {
		return -1
	}
	end := strings.IndexByte(s, '>')
	if end < 0 || strings.IndexByte(s[1:end], '<') >= 0 {
		return -1
	}
	return end
}

// markupTagBreaksText reports whether removing tag (including its angle
// brackets) should leave a space behind: true for closing or self-closing
// tags of non-inline elements, false otherwise.
func markupTagBreaksText(tag string) bool {
	inner := tag[1 : len(tag)-1]
	if !strings.HasPrefix(inner, "/") && !strings.HasSuffix(inner, "/") {
		return false // plain opening tag
	}

	name := strings.Trim(inner, "/")
	if cut := strings.IndexAny(name, " \t\r\n"); cut >= 0 {
		name = name[:cut] // drop attributes
	}
	name = strings.TrimPrefix(strings.ToLower(name), "jats:")

	switch name {
	case "italic", "bold", "sub", "sup", "sc", "i", "b", "em", "strong", "u":
		return false
	}
	return true
}