Files
breakpilot-compliance/ai-compliance-sdk/internal/iace/ntrs_harvester.go
T
Benjamin Admin d27c1b9e7d feat(iace): NTRS harvester + licence gate (FMEA P2 stage 1)
Stage 1 of the FailureKnowledge bulk loader: harvest NASA NTRS
lessons-learned with a strict public-reuse gate (NTRSUsable: public
release, not export-controlled/EAR/ITAR, not CUI, PUBLIC_USE_PERMITTED,
no third-party copyright). NTRSPDFURL prefers the PDF download for
downstream text/OCR extraction. GET /iace/failure-knowledge/ntrs runs
the live harvest and returns only the licence-clean records.

Pure parse/gate helpers are fixture-tested (usable vs ITAR / third-party
/ restricted / video-only); accepted licences also pass the FK allowlist.

Next: tuple extraction (abstract -> FailureKnowledge) + Playwright/OCR for
scanned PDFs -> bp_iace_failure_kb.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-13 00:16:41 +02:00

133 lines
4.1 KiB
Go

package iace
import (
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"net/url"
"strings"
"time"
)
// NTRS (NASA Technical Reports Server) harvester — Stage 1 of the FailureKnowledge
// bulk loader. Fetches lessons-learned / failure reports, applies a strict
// public-reuse licence gate (public release, not export-controlled, not CUI,
// public-use-permitted, no embedded third-party copyright), and exposes the
// readable metadata (title/abstract) + the PDF link for downstream extraction.
// NASA NTRS documents that pass the gate are US-Government public domain.

// ntrsSearchURL is the NTRS citations search endpoint that FetchNTRSLessons
// sends its GET request to.
const ntrsSearchURL = "https://ntrs.nasa.gov/api/citations/search"
// ntrsCopyright mirrors the "copyright" object of an NTRS record
// (see NTRSLesson.Copyright).
type ntrsCopyright struct {
// DeterminationType is the copyright determination. NTRSUsable accepts only
// PUBLIC_USE_PERMITTED and GOV_PUBLIC_USE_PERMITTED, compared case-insensitively.
DeterminationType string `json:"determinationType"`
// ContainsThirdPartyMaterial, when true, makes NTRSUsable reject the record.
ContainsThirdPartyMaterial bool `json:"containsThirdPartyMaterial"`
// LicenseType is decoded but not read by the helpers visible in this file section.
LicenseType string `json:"licenseType"`
}
// ntrsExport mirrors the "exportControl" object of an NTRS record
// (see NTRSLesson.ExportControl). The flags are decoded as strings, not
// booleans; NTRSUsable rejects a record when any of them equals "YES"
// (case-insensitive).
type ntrsExport struct {
// IsExportControl is the general export-control flag.
IsExportControl string `json:"isExportControl"`
// EAR is the flag decoded from the "ear" key.
EAR string `json:"ear"`
// ITAR is the flag decoded from the "itar" key.
ITAR string `json:"itar"`
}
// ntrsCui mirrors the "cui" object of an NTRS record (see NTRSLesson.Cui).
type ntrsCui struct {
// IsCui, when true, makes NTRSUsable reject the record.
IsCui bool `json:"isCui"`
}
// ntrsDownload is one entry of an NTRS record's "downloads" array.
type ntrsDownload struct {
// Mimetype is the media type of the download; NTRSPDFURL selects entries
// whose lower-cased Mimetype contains "pdf".
Mimetype string `json:"mimetype"`
// Links holds the download locations; only "original" is decoded.
Links struct {
// Original is the link NTRSPDFURL turns into an absolute NTRS URL.
// Entries with an empty Original are skipped.
Original string `json:"original"`
} `json:"links"`
}
// NTRSLesson is one harvested NTRS record (only the fields we use).
// It is decoded from an element of the "results" array of a search response.
type NTRSLesson struct {
// ID is the numeric record id.
ID int64 `json:"id"`
// Title and Abstract are the readable metadata of the record.
Title string `json:"title"`
Abstract string `json:"abstract"`
// Distribution must equal "PUBLIC" (case-insensitive) for NTRSUsable to
// accept the record.
Distribution string `json:"distribution"`
// IsLessonsLearned is decoded but not read by the helpers visible in this
// file section.
IsLessonsLearned bool `json:"isLessonsLearned"`
// Copyright and ExportControl feed the licence gate in NTRSUsable.
Copyright ntrsCopyright `json:"copyright"`
ExportControl ntrsExport `json:"exportControl"`
// Cui is a pointer so a missing or null "cui" object stays nil; NTRSUsable
// treats nil as "not CUI".
Cui *ntrsCui `json:"cui"`
// Downloads lists the attached files; NTRSPDFURL scans it for the first PDF.
Downloads []ntrsDownload `json:"downloads"`
}
// ntrsSearchResponse is the envelope of an NTRS search response. Only the
// "results" array is decoded; parseNTRSSearch returns it unchanged.
type ntrsSearchResponse struct {
Results []NTRSLesson `json:"results"`
}
// NTRSUsable reports whether a lesson is publicly + commercially reusable and
// returns the licence string for accepted records ("" for rejected ones).
//
// The gate is conservative and fails closed. A record is rejected when
//   - Distribution is not "PUBLIC" (case-insensitive),
//   - any export-control flag (IsExportControl, EAR, ITAR) holds a value other
//     than "NO" or the empty string, so "YES" as well as unexpected values such
//     as "TRUE" or "UNKNOWN" disqualify the record,
//   - the record carries a CUI marker with IsCui set,
//   - the copyright block reports third-party material, or
//   - the copyright determination is neither PUBLIC_USE_PERMITTED nor
//     GOV_PUBLIC_USE_PERMITTED (case-insensitive).
//
// The determination embedded in the licence string is the upper-cased value
// that was matched, so the licence text is canonical regardless of the casing
// used in the source record.
func NTRSUsable(l NTRSLesson) (bool, string) {
	if !strings.EqualFold(l.Distribution, "PUBLIC") {
		return false, ""
	}
	exportFlags := [...]string{
		l.ExportControl.IsExportControl,
		l.ExportControl.EAR,
		l.ExportControl.ITAR,
	}
	for _, flag := range exportFlags {
		if !ntrsExportFlagClear(flag) {
			return false, ""
		}
	}
	if l.Cui != nil && l.Cui.IsCui {
		return false, ""
	}
	if l.Copyright.ContainsThirdPartyMaterial {
		return false, ""
	}
	// Normalise once and reuse the normalised value in the licence string so
	// the text does not depend on the casing of the source record.
	determination := strings.ToUpper(l.Copyright.DeterminationType)
	switch determination {
	case "PUBLIC_USE_PERMITTED", "GOV_PUBLIC_USE_PERMITTED":
		return true, "Public Domain (NASA NTRS, " + determination + ")"
	}
	return false, ""
}

// ntrsExportFlagClear reports whether an export-control flag value means
// "not controlled". Only "NO" (case-insensitive, surrounding whitespace
// ignored) and the empty string (flag absent from the record) count as clear;
// every other value is treated as controlled.
func ntrsExportFlagClear(v string) bool {
	v = strings.TrimSpace(v)
	return v == "" || strings.EqualFold(v, "NO")
}
// NTRSPDFURL returns the absolute URL of the first usable PDF download, or "".
//
// A download qualifies when its lower-cased Mimetype contains "pdf" and its
// "original" link resolves to a location on the NTRS host. Links are resolved
// as URL references against the NTRS origin, so host-relative paths
// ("/api/..."), paths without a leading slash and already-absolute NTRS URLs
// all yield one well-formed https URL. Entries whose link is empty,
// unparsable or points to another host are skipped.
func NTRSPDFURL(l NTRSLesson) string {
	for _, d := range l.Downloads {
		if !strings.Contains(strings.ToLower(d.Mimetype), "pdf") {
			continue
		}
		if abs := ntrsResolveDownloadLink(d.Links.Original); abs != "" {
			return abs
		}
	}
	return ""
}

// ntrsResolveDownloadLink resolves a download link from an NTRS record against
// the NTRS origin and returns the absolute https URL, or "" when the link is
// empty, cannot be parsed, or does not stay on the NTRS host.
func ntrsResolveDownloadLink(link string) string {
	const ntrsHost = "ntrs.nasa.gov"

	link = strings.TrimSpace(link)
	if link == "" {
		return ""
	}
	ref, err := url.Parse(link)
	if err != nil {
		return ""
	}
	base := url.URL{Scheme: "https", Host: ntrsHost, Path: "/"}
	abs := base.ResolveReference(ref)
	// Never hand a foreign or non-hierarchical location to the downstream
	// PDF fetcher; the harvester only vouches for documents served by NTRS.
	if !strings.EqualFold(abs.Hostname(), ntrsHost) {
		return ""
	}
	abs.Scheme = "https"
	return abs.String()
}
// parseNTRSSearch decodes the JSON body of an NTRS /search response and
// returns the records from its "results" array. A body that is not valid JSON
// yields a nil slice together with the decoder's error.
func parseNTRSSearch(body []byte) ([]NTRSLesson, error) {
	envelope := ntrsSearchResponse{}
	if decodeErr := json.Unmarshal(body, &envelope); decodeErr != nil {
		return nil, decodeErr
	}
	lessons := envelope.Results
	return lessons, nil
}
// FetchNTRSLessons queries the NTRS search API and returns the decoded records.
// Network call; the parsing/gating helpers above are pure and unit-tested.
//
// ctx bounds the request in addition to the client's 30 s overall timeout.
// query is sent URL-escaped as the "q" parameter. limit is the page size: a
// non-positive value selects the default of 25 and anything above 100 is
// clamped to 100.
//
// An error is returned when the request cannot be built or sent, when the
// server answers with a status other than 200, when the body cannot be read or
// is larger than 8 MiB, or when the body is not valid JSON. Underlying errors
// are wrapped with %w so callers can still inspect them with errors.Is/As.
func FetchNTRSLessons(ctx context.Context, query string, limit int) ([]NTRSLesson, error) {
	const (
		defaultPageSize = 25
		maxPageSize     = 100
		maxBodyBytes    = 8 << 20
	)
	switch {
	case limit <= 0:
		limit = defaultPageSize
	case limit > maxPageSize:
		// Clamp rather than fall back to the default, so a caller asking for
		// "too many" gets the largest page instead of a smaller one.
		limit = maxPageSize
	}

	u := fmt.Sprintf("%s?q=%s&page.size=%d&highlight=false", ntrsSearchURL, url.QueryEscape(query), limit)
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil)
	if err != nil {
		return nil, fmt.Errorf("ntrs build request: %w", err)
	}
	resp, err := (&http.Client{Timeout: 30 * time.Second}).Do(req)
	if err != nil {
		return nil, fmt.Errorf("ntrs request: %w", err)
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("ntrs status %d", resp.StatusCode)
	}

	// Read one byte past the cap so an oversized body is reported as such
	// instead of being truncated into a confusing JSON syntax error.
	b, err := io.ReadAll(io.LimitReader(resp.Body, maxBodyBytes+1))
	if err != nil {
		return nil, fmt.Errorf("ntrs read body: %w", err)
	}
	if len(b) > maxBodyBytes {
		return nil, fmt.Errorf("ntrs response exceeds %d bytes", maxBodyBytes)
	}

	lessons, err := parseNTRSSearch(b)
	if err != nil {
		return nil, fmt.Errorf("ntrs parse response: %w", err)
	}
	return lessons, nil
}