feat: edu-search-service migriert, voice-service/geo-service entfernt
All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Successful in 1m45s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 21s
All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Successful in 1m45s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 21s
- edu-search-service von breakpilot-pwa nach breakpilot-lehrer kopiert (ohne vendor)
- opensearch + edu-search-service in docker-compose.yml hinzugefuegt
- voice-service aus docker-compose.yml entfernt (jetzt in breakpilot-core)
- geo-service aus docker-compose.yml entfernt (nicht mehr benoetigt)
- CI/CD: edu-search-service zu Gitea Actions und Woodpecker hinzugefuegt (Go lint, test mit go mod download, build, SBOM)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
369
edu-search-service/internal/publications/crossref_client.go
Normal file
369
edu-search-service/internal/publications/crossref_client.go
Normal file
@@ -0,0 +1,369 @@
|
||||
package publications
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/breakpilot/edu-search-service/internal/database"
|
||||
"github.com/google/uuid"
|
||||
)
|
||||
|
||||
// CrossRefClient is a client for the CrossRef REST API.
//
// Use NewCrossRefClient to obtain a fully initialised value.
type CrossRefClient struct {
	// client performs every HTTP request issued by this type.
	client *http.Client
	// baseURL is the API root without a trailing slash.
	baseURL string
	// userAgent is sent as the User-Agent header on every request.
	userAgent string
	// email is the contact address used for polite pool access.
	email string
}
|
||||
|
||||
// CrossRefResponse represents the top-level API response
|
||||
type CrossRefResponse struct {
|
||||
Status string `json:"status"`
|
||||
MessageType string `json:"message-type"`
|
||||
MessageVersion string `json:"message-version"`
|
||||
Message CrossRefResult `json:"message"`
|
||||
}
|
||||
|
||||
// CrossRefResult contains the actual results
|
||||
type CrossRefResult struct {
|
||||
TotalResults int `json:"total-results"`
|
||||
Items []CrossRefWork `json:"items"`
|
||||
Query *CrossRefQuery `json:"query,omitempty"`
|
||||
}
|
||||
|
||||
// CrossRefQuery contains the query information echoed back in a
// work-list response.
type CrossRefQuery struct {
	StartIndex  int    `json:"start-index"`
	SearchTerms string `json:"search-terms"`
}
|
||||
|
||||
// CrossRefWork represents a single work/publication
|
||||
type CrossRefWork struct {
|
||||
DOI string `json:"DOI"`
|
||||
Title []string `json:"title"`
|
||||
ContainerTitle []string `json:"container-title"`
|
||||
Publisher string `json:"publisher"`
|
||||
Type string `json:"type"`
|
||||
Author []CrossRefAuthor `json:"author"`
|
||||
Issued CrossRefDate `json:"issued"`
|
||||
PublishedPrint CrossRefDate `json:"published-print"`
|
||||
Abstract string `json:"abstract"`
|
||||
URL string `json:"URL"`
|
||||
Link []CrossRefLink `json:"link"`
|
||||
Subject []string `json:"subject"`
|
||||
ISSN []string `json:"ISSN"`
|
||||
ISBN []string `json:"ISBN"`
|
||||
IsCitedByCount int `json:"is-referenced-by-count"`
|
||||
}
|
||||
|
||||
// CrossRefAuthor represents one author entry of a work.
type CrossRefAuthor struct {
	Given       string `json:"given"`
	Family      string `json:"family"`
	ORCID       string `json:"ORCID"`
	Affiliation []struct {
		Name string `json:"name"`
	} `json:"affiliation"`
	Sequence string `json:"sequence"` // "first" or "additional"
}
|
||||
|
||||
// CrossRefDate represents a date in CrossRef's "date-parts" notation.
// Each inner slice is read as [year, month, ...]; trailing parts may be absent.
type CrossRefDate struct {
	DateParts [][]int `json:"date-parts"`
}
|
||||
|
||||
// CrossRefLink represents a link to a representation of the work,
// together with the content type of that representation.
type CrossRefLink struct {
	URL         string `json:"URL"`
	ContentType string `json:"content-type"`
}
|
||||
|
||||
// NewCrossRefClient creates a new CrossRef API client
|
||||
func NewCrossRefClient(email string) *CrossRefClient {
|
||||
return &CrossRefClient{
|
||||
client: &http.Client{
|
||||
Timeout: 30 * time.Second,
|
||||
},
|
||||
baseURL: "https://api.crossref.org",
|
||||
userAgent: "BreakPilot-EduBot/1.0 (https://breakpilot.de; mailto:" + email + ")",
|
||||
email: email,
|
||||
}
|
||||
}
|
||||
|
||||
// GetWorkByDOI retrieves a work by its DOI
|
||||
func (c *CrossRefClient) GetWorkByDOI(ctx context.Context, doi string) (*database.Publication, error) {
|
||||
// Clean DOI
|
||||
doi = strings.TrimSpace(doi)
|
||||
doi = strings.TrimPrefix(doi, "https://doi.org/")
|
||||
doi = strings.TrimPrefix(doi, "http://doi.org/")
|
||||
|
||||
endpoint := fmt.Sprintf("%s/works/%s", c.baseURL, url.PathEscape(doi))
|
||||
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", endpoint, nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", c.userAgent)
|
||||
|
||||
resp, err := c.client.Do(req)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode == http.StatusNotFound {
|
||||
return nil, fmt.Errorf("DOI not found: %s", doi)
|
||||
}
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return nil, fmt.Errorf("CrossRef API error: %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var result struct {
|
||||
Status string `json:"status"`
|
||||
Message CrossRefWork `json:"message"`
|
||||
}
|
||||
|
||||
if err := json.Unmarshal(body, &result); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return c.convertToPub(&result.Message), nil
|
||||
}
|
||||
|
||||
// SearchByAuthor searches for publications by author name
|
||||
func (c *CrossRefClient) SearchByAuthor(ctx context.Context, authorName string, limit int) ([]*database.Publication, error) {
|
||||
if limit <= 0 {
|
||||
limit = 20
|
||||
}
|
||||
|
||||
endpoint := fmt.Sprintf("%s/works?query.author=%s&rows=%d&sort=published&order=desc",
|
||||
c.baseURL, url.QueryEscape(authorName), limit)
|
||||
|
||||
return c.searchWorks(ctx, endpoint)
|
||||
}
|
||||
|
||||
// SearchByAffiliation searches for publications by affiliation (university)
|
||||
func (c *CrossRefClient) SearchByAffiliation(ctx context.Context, affiliation string, limit int) ([]*database.Publication, error) {
|
||||
if limit <= 0 {
|
||||
limit = 20
|
||||
}
|
||||
|
||||
endpoint := fmt.Sprintf("%s/works?query.affiliation=%s&rows=%d&sort=published&order=desc",
|
||||
c.baseURL, url.QueryEscape(affiliation), limit)
|
||||
|
||||
return c.searchWorks(ctx, endpoint)
|
||||
}
|
||||
|
||||
// SearchByORCID searches for publications by ORCID
|
||||
func (c *CrossRefClient) SearchByORCID(ctx context.Context, orcid string, limit int) ([]*database.Publication, error) {
|
||||
if limit <= 0 {
|
||||
limit = 100
|
||||
}
|
||||
|
||||
// ORCID format: 0000-0000-0000-0000
|
||||
orcid = strings.TrimPrefix(orcid, "https://orcid.org/")
|
||||
|
||||
endpoint := fmt.Sprintf("%s/works?filter=orcid:%s&rows=%d&sort=published&order=desc",
|
||||
c.baseURL, url.QueryEscape(orcid), limit)
|
||||
|
||||
return c.searchWorks(ctx, endpoint)
|
||||
}
|
||||
|
||||
// SearchByTitle searches for publications by title
|
||||
func (c *CrossRefClient) SearchByTitle(ctx context.Context, title string, limit int) ([]*database.Publication, error) {
|
||||
if limit <= 0 {
|
||||
limit = 10
|
||||
}
|
||||
|
||||
endpoint := fmt.Sprintf("%s/works?query.title=%s&rows=%d",
|
||||
c.baseURL, url.QueryEscape(title), limit)
|
||||
|
||||
return c.searchWorks(ctx, endpoint)
|
||||
}
|
||||
|
||||
// searchWorks performs a generic search
|
||||
func (c *CrossRefClient) searchWorks(ctx context.Context, endpoint string) ([]*database.Publication, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", endpoint, nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", c.userAgent)
|
||||
|
||||
resp, err := c.client.Do(req)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return nil, fmt.Errorf("CrossRef API error: %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var result CrossRefResponse
|
||||
if err := json.Unmarshal(body, &result); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var pubs []*database.Publication
|
||||
for _, work := range result.Message.Items {
|
||||
pubs = append(pubs, c.convertToPub(&work))
|
||||
}
|
||||
|
||||
return pubs, nil
|
||||
}
|
||||
|
||||
// convertToPub converts a CrossRef work to our Publication model
|
||||
func (c *CrossRefClient) convertToPub(work *CrossRefWork) *database.Publication {
|
||||
pub := &database.Publication{
|
||||
ID: uuid.New(),
|
||||
CitationCount: work.IsCitedByCount,
|
||||
CrawledAt: time.Now(),
|
||||
}
|
||||
|
||||
// Title
|
||||
if len(work.Title) > 0 {
|
||||
pub.Title = work.Title[0]
|
||||
}
|
||||
|
||||
// DOI
|
||||
if work.DOI != "" {
|
||||
pub.DOI = &work.DOI
|
||||
}
|
||||
|
||||
// URL
|
||||
if work.URL != "" {
|
||||
pub.URL = &work.URL
|
||||
}
|
||||
|
||||
// Abstract (clean HTML)
|
||||
if work.Abstract != "" {
|
||||
abstract := cleanHTML(work.Abstract)
|
||||
pub.Abstract = &abstract
|
||||
}
|
||||
|
||||
// Year
|
||||
if len(work.Issued.DateParts) > 0 && len(work.Issued.DateParts[0]) > 0 {
|
||||
year := work.Issued.DateParts[0][0]
|
||||
pub.Year = &year
|
||||
if len(work.Issued.DateParts[0]) > 1 {
|
||||
month := work.Issued.DateParts[0][1]
|
||||
pub.Month = &month
|
||||
}
|
||||
}
|
||||
|
||||
// Type
|
||||
pubType := mapCrossRefType(work.Type)
|
||||
pub.PubType = &pubType
|
||||
|
||||
// Venue
|
||||
if len(work.ContainerTitle) > 0 {
|
||||
venue := work.ContainerTitle[0]
|
||||
pub.Venue = &venue
|
||||
}
|
||||
|
||||
// Publisher
|
||||
if work.Publisher != "" {
|
||||
pub.Publisher = &work.Publisher
|
||||
}
|
||||
|
||||
// ISBN
|
||||
if len(work.ISBN) > 0 {
|
||||
pub.ISBN = &work.ISBN[0]
|
||||
}
|
||||
|
||||
// ISSN
|
||||
if len(work.ISSN) > 0 {
|
||||
pub.ISSN = &work.ISSN[0]
|
||||
}
|
||||
|
||||
// Keywords/Subjects
|
||||
if len(work.Subject) > 0 {
|
||||
pub.Keywords = work.Subject
|
||||
}
|
||||
|
||||
// PDF URL
|
||||
for _, link := range work.Link {
|
||||
if strings.Contains(link.ContentType, "pdf") {
|
||||
pub.PDFURL = &link.URL
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
// Authors
|
||||
var authors []string
|
||||
for _, author := range work.Author {
|
||||
name := strings.TrimSpace(author.Given + " " + author.Family)
|
||||
if name != "" {
|
||||
authors = append(authors, name)
|
||||
}
|
||||
}
|
||||
pub.Authors = authors
|
||||
|
||||
// Source
|
||||
source := "crossref"
|
||||
pub.Source = &source
|
||||
|
||||
// Store raw data
|
||||
rawData, _ := json.Marshal(work)
|
||||
pub.RawData = rawData
|
||||
|
||||
return pub
|
||||
}
|
||||
|
||||
// mapCrossRefType maps a CrossRef work type to our publication type.
// Types without an explicit mapping (including the empty string) yield "other".
func mapCrossRefType(crType string) string {
	mapped := "other"

	switch crType {
	case "journal-article":
		mapped = "journal"
	case "proceedings-article", "conference-paper":
		mapped = "conference"
	case "book":
		mapped = "book"
	case "book-chapter":
		mapped = "book_chapter"
	case "dissertation":
		mapped = "thesis"
	case "posted-content":
		mapped = "preprint"
	}

	return mapped
}
|
||||
|
||||
// cleanHTML removes HTML/JATS tags from text and collapses whitespace.
//
// Every tag is stripped, not just a fixed list. Opening tags, comments and
// declarations disappear without a trace. Closing and self-closing tags are
// replaced by a space so adjacent blocks do not run together, except for
// inline formatting tags (italic, bold, sub, sup, ...), which vanish so that
// text such as "H<jats:sub>2</jats:sub>O" becomes "H2O". A '<' that does not
// start a tag (for example in "a < b") is kept as text. Character entities
// are not decoded.
func cleanHTML(html string) string {
	// isTagStart reports whether c can follow '<' at the start of markup.
	isTagStart := func(c byte) bool {
		return c == '/' || c == '!' || c == '?' ||
			(c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
	}

	var b strings.Builder
	b.Grow(len(html))

	rest := html
	for {
		open := strings.IndexByte(rest, '<')
		if open < 0 {
			b.WriteString(rest)
			break
		}
		b.WriteString(rest[:open])

		// end is the offset of '>' relative to the '<'; values below 2 mean
		// "no closing bracket" (-1) or an empty "<>" (1).
		end := strings.IndexByte(rest[open:], '>')
		if end < 2 || !isTagStart(rest[open+1]) {
			b.WriteByte('<')
			rest = rest[open+1:]
			continue
		}

		tag := rest[open+1 : open+end]
		rest = rest[open+end+1:]

		if !strings.HasPrefix(tag, "/") && !strings.HasSuffix(tag, "/") {
			// Opening tag, comment or declaration: drop without separator.
			continue
		}

		// Reduce the tag to its lower-cased local name: no slashes, no
		// attributes, no namespace prefix such as "jats:".
		name := strings.Trim(tag, "/")
		if i := strings.IndexAny(name, " \t\r\n/"); i >= 0 {
			name = name[:i]
		}
		if i := strings.LastIndexByte(name, ':'); i >= 0 {
			name = name[i+1:]
		}

		switch strings.ToLower(name) {
		case "italic", "bold", "sub", "sup", "i", "b", "em", "strong", "u", "sc", "span":
			// Inline formatting: no separator.
		default:
			b.WriteByte(' ')
		}
	}

	// Collapse whitespace; joining fields also trims both ends.
	return strings.Join(strings.Fields(b.String()), " ")
}
|
||||
Reference in New Issue
Block a user