feat(iace): NTRS harvester + licence gate (FMEA P2 stage 1)
Stage 1 of the FailureKnowledge bulk loader: harvest NASA NTRS lessons-learned with a strict public-reuse gate (NTRSUsable: public release, not export-controlled/EAR/ITAR, not CUI, PUBLIC_USE_PERMITTED, no third-party copyright). NTRSPDFURL prefers the PDF download for downstream text/OCR extraction. GET /iace/failure-knowledge/ntrs runs the live harvest and returns only the licence-clean records. Pure parse/gate helpers are fixture-tested (usable vs ITAR / third-party / restricted / video-only); accepted licences also pass the FK allowlist. Next: tuple extraction (abstract -> FailureKnowledge) + Playwright/OCR for scanned PDFs -> bp_iace_failure_kb. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -2,6 +2,7 @@ package handlers
|
||||
|
||||
import (
|
||||
"net/http"
|
||||
"strconv"
|
||||
|
||||
"github.com/breakpilot/ai-compliance-sdk/internal/iace"
|
||||
"github.com/gin-gonic/gin"
|
||||
@@ -27,3 +28,34 @@ func (h *IACEHandler) ListFailureKnowledge(c *gin.Context) {
|
||||
"total": len(items),
|
||||
})
|
||||
}
|
||||
|
||||
// HarvestNTRSFailures handles GET /failure-knowledge/ntrs.
|
||||
// Live-harvests NASA NTRS lessons-learned metadata and returns only the records
|
||||
// that pass the public-reuse licence gate (Stage 1 of the bulk loader). Tuple
|
||||
// extraction from the abstracts is a downstream step.
|
||||
func (h *IACEHandler) HarvestNTRSFailures(c *gin.Context) {
|
||||
q := c.DefaultQuery("q", "lessons learned failure")
|
||||
limit, _ := strconv.Atoi(c.DefaultQuery("limit", "25"))
|
||||
lessons, err := iace.FetchNTRSLessons(c.Request.Context(), q, limit)
|
||||
if err != nil {
|
||||
c.JSON(http.StatusBadGateway, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
out := []gin.H{}
|
||||
skipped := 0
|
||||
for _, l := range lessons {
|
||||
ok, lic := iace.NTRSUsable(l)
|
||||
if !ok {
|
||||
skipped++
|
||||
continue
|
||||
}
|
||||
out = append(out, gin.H{
|
||||
"id": l.ID, "title": l.Title, "abstract": l.Abstract,
|
||||
"license": lic, "pdf_url": iace.NTRSPDFURL(l), "is_lessons_learned": l.IsLessonsLearned,
|
||||
})
|
||||
}
|
||||
c.JSON(http.StatusOK, gin.H{
|
||||
"query": q, "usable": out, "usable_count": len(out),
|
||||
"skipped_non_open": skipped, "total_fetched": len(lessons),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -32,6 +32,7 @@ func registerIACERoutes(v1 *gin.RouterGroup, h *handlers.IACEHandler) {
|
||||
iaceRoutes.GET("/energy-sources", h.ListEnergySources)
|
||||
iaceRoutes.GET("/minimum-distances", h.ListMinimumDistances)
|
||||
iaceRoutes.GET("/failure-knowledge", h.ListFailureKnowledge)
|
||||
iaceRoutes.GET("/failure-knowledge/ntrs", h.HarvestNTRSFailures)
|
||||
iaceRoutes.GET("/tags", h.ListTags)
|
||||
iaceRoutes.GET("/hazard-patterns", h.ListHazardPatterns)
|
||||
|
||||
|
||||
@@ -0,0 +1,132 @@
|
||||
package iace
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// NTRS (NASA Technical Reports Server) harvester — Stage 1 of the FailureKnowledge
|
||||
// bulk loader. Fetches lessons-learned / failure reports, applies a strict
|
||||
// public-reuse licence gate (public release, not export-controlled, not CUI,
|
||||
// public-use-permitted, no embedded third-party copyright), and exposes the
|
||||
// readable metadata (title/abstract) + the PDF link for downstream extraction.
|
||||
// NASA NTRS documents that pass the gate are US-Government public domain.
|
||||
|
||||
// ntrsSearchURL is the NTRS citation search endpoint that FetchNTRSLessons queries.
const ntrsSearchURL = "https://ntrs.nasa.gov/api/citations/search"
|
||||
|
||||
// Wire types for the NTRS /search response. Only the fields the harvester
// reads are declared; encoding/json ignores everything else in a record.
type (
	// ntrsCopyright mirrors the "copyright" object of a record.
	ntrsCopyright struct {
		// DeterminationType is the copyright determination, e.g.
		// "PUBLIC_USE_PERMITTED" or "GOV_PUBLIC_USE_PERMITTED".
		DeterminationType string `json:"determinationType"`
		// ContainsThirdPartyMaterial flags embedded third-party copyright.
		ContainsThirdPartyMaterial bool `json:"containsThirdPartyMaterial"`
		// LicenseType is decoded as-is; NTRSUsable does not consult it.
		LicenseType string `json:"licenseType"`
	}

	// ntrsExport mirrors the "exportControl" object. The flags arrive as
	// strings such as "YES" / "NO", not as JSON booleans.
	ntrsExport struct {
		IsExportControl string `json:"isExportControl"`
		EAR             string `json:"ear"`
		ITAR            string `json:"itar"`
	}

	// ntrsCui mirrors the "cui" object of a record.
	ntrsCui struct {
		IsCui bool `json:"isCui"`
	}

	// ntrsDownload is one entry of a record's "downloads" array.
	ntrsDownload struct {
		Mimetype string `json:"mimetype"`
		Links    struct {
			// Original is the link to the file as delivered by NTRS.
			Original string `json:"original"`
		} `json:"links"`
	}

	// NTRSLesson is one harvested NTRS record (only the fields we use).
	NTRSLesson struct {
		ID               int64         `json:"id"`
		Title            string        `json:"title"`
		Abstract         string        `json:"abstract"`
		Distribution     string        `json:"distribution"`
		IsLessonsLearned bool          `json:"isLessonsLearned"`
		Copyright        ntrsCopyright `json:"copyright"`
		ExportControl    ntrsExport    `json:"exportControl"`
		// Cui stays nil when the record has no "cui" object (absent or null).
		Cui       *ntrsCui       `json:"cui"`
		Downloads []ntrsDownload `json:"downloads"`
	}

	// ntrsSearchResponse is the envelope of an NTRS /search response.
	ntrsSearchResponse struct {
		Results []NTRSLesson `json:"results"`
	}
)
|
||||
|
||||
// NTRSUsable reports whether a lesson is publicly + commercially reusable and
|
||||
// returns the licence string. The gate is conservative: any export-control, CUI,
|
||||
// third-party copyright, non-public distribution, or non-public-use copyright
|
||||
// determination disqualifies the record.
|
||||
func NTRSUsable(l NTRSLesson) (bool, string) {
|
||||
if !strings.EqualFold(l.Distribution, "PUBLIC") {
|
||||
return false, ""
|
||||
}
|
||||
if strings.EqualFold(l.ExportControl.IsExportControl, "YES") ||
|
||||
strings.EqualFold(l.ExportControl.EAR, "YES") ||
|
||||
strings.EqualFold(l.ExportControl.ITAR, "YES") {
|
||||
return false, ""
|
||||
}
|
||||
if l.Cui != nil && l.Cui.IsCui {
|
||||
return false, ""
|
||||
}
|
||||
if l.Copyright.ContainsThirdPartyMaterial {
|
||||
return false, ""
|
||||
}
|
||||
switch strings.ToUpper(l.Copyright.DeterminationType) {
|
||||
case "PUBLIC_USE_PERMITTED", "GOV_PUBLIC_USE_PERMITTED":
|
||||
return true, "Public Domain (NASA NTRS, " + l.Copyright.DeterminationType + ")"
|
||||
}
|
||||
return false, ""
|
||||
}
|
||||
|
||||
// NTRSPDFURL returns the absolute URL of the first PDF download, or "".
|
||||
func NTRSPDFURL(l NTRSLesson) string {
|
||||
for _, d := range l.Downloads {
|
||||
if strings.Contains(strings.ToLower(d.Mimetype), "pdf") && d.Links.Original != "" {
|
||||
return "https://ntrs.nasa.gov" + d.Links.Original
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// parseNTRSSearch parses an NTRS /search response body.
|
||||
func parseNTRSSearch(body []byte) ([]NTRSLesson, error) {
|
||||
var r ntrsSearchResponse
|
||||
if err := json.Unmarshal(body, &r); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return r.Results, nil
|
||||
}
|
||||
|
||||
// FetchNTRSLessons queries the NTRS search API. Network call; the parsing/gating
|
||||
// helpers above are pure and unit-tested.
|
||||
func FetchNTRSLessons(ctx context.Context, query string, limit int) ([]NTRSLesson, error) {
|
||||
if limit <= 0 || limit > 100 {
|
||||
limit = 25
|
||||
}
|
||||
u := fmt.Sprintf("%s?q=%s&page.size=%d&highlight=false", ntrsSearchURL, url.QueryEscape(query), limit)
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
resp, err := (&http.Client{Timeout: 30 * time.Second}).Do(req)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return nil, fmt.Errorf("ntrs status %d", resp.StatusCode)
|
||||
}
|
||||
b, err := io.ReadAll(io.LimitReader(resp.Body, 8<<20))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return parseNTRSSearch(b)
|
||||
}
|
||||
@@ -0,0 +1,66 @@
|
||||
package iace
|
||||
|
||||
import "testing"
|
||||
|
||||
// ntrsFixture mirrors the real NTRS /search response shape (copyright is an
// OBJECT). Records, in order: usable with a PDF, ITAR export-controlled,
// third-party material, restricted distribution, usable but video-only.
const ntrsFixture = `{"results":[
{"id":20205010628,"title":"Lessons Learned from Large-Scale Aerospace Structural Testing","abstract":"A bracket fractured under load.",
"distribution":"PUBLIC","isLessonsLearned":true,
"copyright":{"determinationType":"GOV_PUBLIC_USE_PERMITTED","containsThirdPartyMaterial":false,"licenseType":"NO"},
"exportControl":{"isExportControl":"NO","ear":"NO","itar":"NO"},"cui":{"isCui":false},
"downloads":[{"mimetype":"application/pdf","links":{"original":"/api/citations/20205010628/downloads/paper.pdf"}}]},
{"id":1001,"title":"ITAR controlled","abstract":"x","distribution":"PUBLIC","isLessonsLearned":true,
"copyright":{"determinationType":"GOV_PUBLIC_USE_PERMITTED","containsThirdPartyMaterial":false},
"exportControl":{"isExportControl":"YES","ear":"NO","itar":"YES"},"cui":null,
"downloads":[{"mimetype":"application/pdf","links":{"original":"/x.pdf"}}]},
{"id":1002,"title":"Third party","abstract":"x","distribution":"PUBLIC","isLessonsLearned":true,
"copyright":{"determinationType":"PUBLIC_USE_PERMITTED","containsThirdPartyMaterial":true},
"exportControl":{"isExportControl":"NO"},"cui":null,"downloads":[]},
{"id":1003,"title":"Restricted dist","abstract":"x","distribution":"RESTRICTED",
"copyright":{"determinationType":"GOV_PUBLIC_USE_PERMITTED"},"exportControl":{"isExportControl":"NO"}},
{"id":1004,"title":"Video only","abstract":"x","distribution":"PUBLIC","isLessonsLearned":true,
"copyright":{"determinationType":"PUBLIC_USE_PERMITTED","containsThirdPartyMaterial":false},
"exportControl":{"isExportControl":"NO"},"cui":{"isCui":false},
"downloads":[{"mimetype":"video/mp4","links":{"original":"/v.mp4"}}]}
]}`
|
||||
|
||||
func TestParseNTRSSearch(t *testing.T) {
|
||||
ls, err := parseNTRSSearch([]byte(ntrsFixture))
|
||||
if err != nil {
|
||||
t.Fatalf("parse: %v", err)
|
||||
}
|
||||
if len(ls) != 5 {
|
||||
t.Fatalf("expected 5 results, got %d", len(ls))
|
||||
}
|
||||
if ls[0].ID != 20205010628 || ls[0].Copyright.DeterminationType != "GOV_PUBLIC_USE_PERMITTED" {
|
||||
t.Errorf("first record parsed wrong: %+v", ls[0])
|
||||
}
|
||||
}
|
||||
|
||||
func TestNTRSUsable_Gate(t *testing.T) {
|
||||
ls, _ := parseNTRSSearch([]byte(ntrsFixture))
|
||||
want := []bool{true, false, false, false, true} // ok, ITAR, third-party, restricted, ok(video)
|
||||
for i, l := range ls {
|
||||
ok, lic := NTRSUsable(l)
|
||||
if ok != want[i] {
|
||||
t.Errorf("record %d (%q): usable=%v, want %v", l.ID, l.Title, ok, want[i])
|
||||
}
|
||||
if ok && lic == "" {
|
||||
t.Errorf("record %d usable but empty licence", l.ID)
|
||||
}
|
||||
// Every accepted record must also pass the failure-knowledge allowlist.
|
||||
if ok && !FailureKnowledgeLicenseAllowed(lic) {
|
||||
t.Errorf("record %d licence %q not allowed by FK allowlist", l.ID, lic)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestNTRSPDFURL(t *testing.T) {
|
||||
ls, _ := parseNTRSSearch([]byte(ntrsFixture))
|
||||
if got := NTRSPDFURL(ls[0]); got != "https://ntrs.nasa.gov/api/citations/20205010628/downloads/paper.pdf" {
|
||||
t.Errorf("pdf url = %q", got)
|
||||
}
|
||||
if got := NTRSPDFURL(ls[4]); got != "" { // video-only → no PDF
|
||||
t.Errorf("video-only should have no PDF url, got %q", got)
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user