package publications

import (
	"context"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"strings"
	"time"

	"github.com/breakpilot/edu-search-service/internal/database"
	"github.com/google/uuid"
)

// CrossRefClient is a client for the CrossRef API.
//
// It holds the HTTP client used for requests, the API base URL and the
// User-Agent string that is set on every outgoing request.
type CrossRefClient struct {
	client    *http.Client
	baseURL   string
	userAgent string
	email     string // For polite pool access
}

// CrossRefResponse represents the top-level API response.
//
// The payload of interest is carried in Message; the remaining fields are
// envelope metadata decoded from the "status", "message-type" and
// "message-version" keys.
type CrossRefResponse struct {
	Status         string         `json:"status"`
	MessageType    string         `json:"message-type"`
	MessageVersion string         `json:"message-version"`
	Message        CrossRefResult `json:"message"`
}

// CrossRefResult contains the actual results of a search request.
type CrossRefResult struct {
	TotalResults int            `json:"total-results"`
	Items        []CrossRefWork `json:"items"`
	// Query is a pointer so that it stays nil when the response has no
	// "query" object.
	Query *CrossRefQuery `json:"query,omitempty"`
}

// CrossRefQuery contains query info echoed back in a search response.
type CrossRefQuery struct {
	StartIndex  int    `json:"start-index"`
	SearchTerms string `json:"search-terms"`
}

// CrossRefWork represents a single work/publication as decoded from the
// CrossRef JSON payload.
//
// Several fields are lists in the payload (Title, ContainerTitle, ISSN,
// ISBN); convertToPub only uses the first element of each.
type CrossRefWork struct {
	DOI            string           `json:"DOI"`
	Title          []string         `json:"title"`
	ContainerTitle []string         `json:"container-title"`
	Publisher      string           `json:"publisher"`
	Type           string           `json:"type"`
	Author         []CrossRefAuthor `json:"author"`
	Issued         CrossRefDate     `json:"issued"`
	PublishedPrint CrossRefDate     `json:"published-print"`
	Abstract       string           `json:"abstract"`
	URL            string           `json:"URL"`
	Link           []CrossRefLink   `json:"link"`
	Subject        []string         `json:"subject"`
	ISSN           []string         `json:"ISSN"`
	ISBN           []string         `json:"ISBN"`
	// IsCitedByCount is decoded from the "is-referenced-by-count" key.
	IsCitedByCount int `json:"is-referenced-by-count"`
}

// CrossRefAuthor represents an author of a work.
type CrossRefAuthor struct {
	Given       string `json:"given"`
	Family      string `json:"family"`
	ORCID       string `json:"ORCID"`
	Affiliation []struct {
		Name string `json:"name"`
	} `json:"affiliation"`
	Sequence string `json:"sequence"` // "first" or "additional"
}

// CrossRefDate represents a date.
//
// DateParts is a list of integer lists; convertToPub reads the first inner
// list as [year, month, ...], where everything after the year is optional.
type CrossRefDate struct {
	DateParts [][]int `json:"date-parts"`
}
work type CrossRefLink struct { URL string `json:"URL"` ContentType string `json:"content-type"` } // NewCrossRefClient creates a new CrossRef API client func NewCrossRefClient(email string) *CrossRefClient { return &CrossRefClient{ client: &http.Client{ Timeout: 30 * time.Second, }, baseURL: "https://api.crossref.org", userAgent: "BreakPilot-EduBot/1.0 (https://breakpilot.de; mailto:" + email + ")", email: email, } } // GetWorkByDOI retrieves a work by its DOI func (c *CrossRefClient) GetWorkByDOI(ctx context.Context, doi string) (*database.Publication, error) { // Clean DOI doi = strings.TrimSpace(doi) doi = strings.TrimPrefix(doi, "https://doi.org/") doi = strings.TrimPrefix(doi, "http://doi.org/") endpoint := fmt.Sprintf("%s/works/%s", c.baseURL, url.PathEscape(doi)) req, err := http.NewRequestWithContext(ctx, "GET", endpoint, nil) if err != nil { return nil, err } req.Header.Set("User-Agent", c.userAgent) resp, err := c.client.Do(req) if err != nil { return nil, err } defer resp.Body.Close() if resp.StatusCode == http.StatusNotFound { return nil, fmt.Errorf("DOI not found: %s", doi) } if resp.StatusCode != http.StatusOK { return nil, fmt.Errorf("CrossRef API error: %d", resp.StatusCode) } body, err := io.ReadAll(resp.Body) if err != nil { return nil, err } var result struct { Status string `json:"status"` Message CrossRefWork `json:"message"` } if err := json.Unmarshal(body, &result); err != nil { return nil, err } return c.convertToPub(&result.Message), nil } // SearchByAuthor searches for publications by author name func (c *CrossRefClient) SearchByAuthor(ctx context.Context, authorName string, limit int) ([]*database.Publication, error) { if limit <= 0 { limit = 20 } endpoint := fmt.Sprintf("%s/works?query.author=%s&rows=%d&sort=published&order=desc", c.baseURL, url.QueryEscape(authorName), limit) return c.searchWorks(ctx, endpoint) } // SearchByAffiliation searches for publications by affiliation (university) func (c *CrossRefClient) 
SearchByAffiliation(ctx context.Context, affiliation string, limit int) ([]*database.Publication, error) { if limit <= 0 { limit = 20 } endpoint := fmt.Sprintf("%s/works?query.affiliation=%s&rows=%d&sort=published&order=desc", c.baseURL, url.QueryEscape(affiliation), limit) return c.searchWorks(ctx, endpoint) } // SearchByORCID searches for publications by ORCID func (c *CrossRefClient) SearchByORCID(ctx context.Context, orcid string, limit int) ([]*database.Publication, error) { if limit <= 0 { limit = 100 } // ORCID format: 0000-0000-0000-0000 orcid = strings.TrimPrefix(orcid, "https://orcid.org/") endpoint := fmt.Sprintf("%s/works?filter=orcid:%s&rows=%d&sort=published&order=desc", c.baseURL, url.QueryEscape(orcid), limit) return c.searchWorks(ctx, endpoint) } // SearchByTitle searches for publications by title func (c *CrossRefClient) SearchByTitle(ctx context.Context, title string, limit int) ([]*database.Publication, error) { if limit <= 0 { limit = 10 } endpoint := fmt.Sprintf("%s/works?query.title=%s&rows=%d", c.baseURL, url.QueryEscape(title), limit) return c.searchWorks(ctx, endpoint) } // searchWorks performs a generic search func (c *CrossRefClient) searchWorks(ctx context.Context, endpoint string) ([]*database.Publication, error) { req, err := http.NewRequestWithContext(ctx, "GET", endpoint, nil) if err != nil { return nil, err } req.Header.Set("User-Agent", c.userAgent) resp, err := c.client.Do(req) if err != nil { return nil, err } defer resp.Body.Close() if resp.StatusCode != http.StatusOK { return nil, fmt.Errorf("CrossRef API error: %d", resp.StatusCode) } body, err := io.ReadAll(resp.Body) if err != nil { return nil, err } var result CrossRefResponse if err := json.Unmarshal(body, &result); err != nil { return nil, err } var pubs []*database.Publication for _, work := range result.Message.Items { pubs = append(pubs, c.convertToPub(&work)) } return pubs, nil } // convertToPub converts a CrossRef work to our Publication model func (c 
*CrossRefClient) convertToPub(work *CrossRefWork) *database.Publication { pub := &database.Publication{ ID: uuid.New(), CitationCount: work.IsCitedByCount, CrawledAt: time.Now(), } // Title if len(work.Title) > 0 { pub.Title = work.Title[0] } // DOI if work.DOI != "" { pub.DOI = &work.DOI } // URL if work.URL != "" { pub.URL = &work.URL } // Abstract (clean HTML) if work.Abstract != "" { abstract := cleanHTML(work.Abstract) pub.Abstract = &abstract } // Year if len(work.Issued.DateParts) > 0 && len(work.Issued.DateParts[0]) > 0 { year := work.Issued.DateParts[0][0] pub.Year = &year if len(work.Issued.DateParts[0]) > 1 { month := work.Issued.DateParts[0][1] pub.Month = &month } } // Type pubType := mapCrossRefType(work.Type) pub.PubType = &pubType // Venue if len(work.ContainerTitle) > 0 { venue := work.ContainerTitle[0] pub.Venue = &venue } // Publisher if work.Publisher != "" { pub.Publisher = &work.Publisher } // ISBN if len(work.ISBN) > 0 { pub.ISBN = &work.ISBN[0] } // ISSN if len(work.ISSN) > 0 { pub.ISSN = &work.ISSN[0] } // Keywords/Subjects if len(work.Subject) > 0 { pub.Keywords = work.Subject } // PDF URL for _, link := range work.Link { if strings.Contains(link.ContentType, "pdf") { pub.PDFURL = &link.URL break } } // Authors var authors []string for _, author := range work.Author { name := strings.TrimSpace(author.Given + " " + author.Family) if name != "" { authors = append(authors, name) } } pub.Authors = authors // Source source := "crossref" pub.Source = &source // Store raw data rawData, _ := json.Marshal(work) pub.RawData = rawData return pub } // mapCrossRefType maps CrossRef types to our types func mapCrossRefType(crType string) string { switch crType { case "journal-article": return "journal" case "proceedings-article", "conference-paper": return "conference" case "book": return "book" case "book-chapter": return "book_chapter" case "dissertation": return "thesis" case "posted-content": return "preprint" default: return "other" } } // cleanHTML 
// cleanHTML removes HTML tags from text.
//
// Markup tags (including namespaced ones such as <jats:p>) and HTML comments
// are dropped. Block-level tags are replaced by a space so that adjacent
// paragraphs do not run together, while inline tags vanish without a trace.
// A '<' that does not start a tag, or a tag that is never closed, is kept as
// literal text. Finally all runs of whitespace are collapsed to single spaces
// and leading/trailing whitespace is removed.
func cleanHTML(html string) string {
	var b strings.Builder
	b.Grow(len(html))

	// Scanning bytes is safe for UTF-8 input: '<' and '>' are ASCII and never
	// occur inside a multi-byte sequence.
	for i := 0; i < len(html); {
		if html[i] == '<' {
			rest := html[i:]

			// Comments may contain '>' themselves, so look for the real terminator.
			if strings.HasPrefix(rest, "<!--") {
				if end := strings.Index(rest, "-->"); end >= 0 {
					i += end + len("-->")
					continue
				}
			}

			if len(rest) > 1 && htmlTagStart(rest[1]) {
				if end := strings.IndexByte(rest, '>'); end >= 0 {
					if htmlBlockTag(rest[1:end]) {
						b.WriteByte(' ')
					}
					i += end + 1
					continue
				}
			}
		}

		b.WriteByte(html[i])
		i++
	}

	// Collapse whitespace; Fields also discards leading and trailing blanks.
	return strings.Join(strings.Fields(b.String()), " ")
}

// htmlTagStart reports whether c can follow '<' at the start of a tag.
func htmlTagStart(c byte) bool {
	switch {
	case c >= 'a' && c <= 'z', c >= 'A' && c <= 'Z':
		return true
	case c == '/', c == '!', c == '?':
		return true
	}
	return false
}

// htmlBlockTag reports whether tag, the text between '<' and '>', names a
// block-level element whose removal should leave a word break behind.
func htmlBlockTag(tag string) bool {
	name := strings.TrimPrefix(tag, "/")
	if end := strings.IndexAny(name, " \t\r\n/"); end >= 0 {
		name = name[:end]
	}
	// Ignore a namespace prefix such as "jats:".
	if colon := strings.LastIndexByte(name, ':'); colon >= 0 {
		name = name[colon+1:]
	}

	switch strings.ToLower(name) {
	case "p", "br", "div", "li", "tr", "sec", "title",
		"h1", "h2", "h3", "h4", "h5", "h6":
		return true
	}
	return false
}