From d27c1b9e7dd7e6a8ddf35d83c997f20cee46bdf1 Mon Sep 17 00:00:00 2001
From: Benjamin Admin
Date: Sat, 13 Jun 2026 00:16:41 +0200
Subject: [PATCH] feat(iace): NTRS harvester + licence gate (FMEA P2 stage 1)

Stage 1 of the FailureKnowledge bulk loader: harvest NASA NTRS
lessons-learned with a strict public-reuse gate (NTRSUsable: public
release, not export-controlled/EAR/ITAR, not CUI, PUBLIC_USE_PERMITTED,
no third-party copyright). NTRSPDFURL prefers the PDF download for
downstream text/OCR extraction.

GET /iace/failure-knowledge/ntrs runs the live harvest and returns only
the licence-clean records. Pure parse/gate helpers are fixture-tested
(usable vs ITAR / third-party / restricted / video-only); accepted
licences also pass the FK allowlist.

Next: tuple extraction (abstract -> FailureKnowledge) + Playwright/OCR
for scanned PDFs -> bp_iace_failure_kb.

Co-Authored-By: Claude Opus 4.7
---
Review notes (this section is ignored by git am):
- NTRSPDFURL no longer prefixes the NTRS host onto already-absolute links.
- FetchNTRSLessons reuses one http.Client, wraps its errors, and reports an
  oversized response instead of failing later with a JSON syntax error.
- HarvestNTRSFailures keeps the lenient "limit" handling but makes it explicit.
- Tests share a fixture helper, so a broken fixture fails instead of panicking.
- The "index" blob hashes below were not recomputed for the edited content, and
  the context whitespace in routes_iace.go is a best guess; verify both (or
  apply with --recount / --ignore-whitespace).

 .../api/handlers/iace_handler_failure.go      |  41 +++++
 ai-compliance-sdk/internal/app/routes_iace.go |   1 +
 .../internal/iace/ntrs_harvester.go           | 161 ++++++++++++++++++
 .../internal/iace/ntrs_harvester_test.go      |  87 ++++++++++
 4 files changed, 290 insertions(+)
 create mode 100644 ai-compliance-sdk/internal/iace/ntrs_harvester.go
 create mode 100644 ai-compliance-sdk/internal/iace/ntrs_harvester_test.go

diff --git a/ai-compliance-sdk/internal/api/handlers/iace_handler_failure.go b/ai-compliance-sdk/internal/api/handlers/iace_handler_failure.go
index 1ecc5b0e..35d19526 100644
--- a/ai-compliance-sdk/internal/api/handlers/iace_handler_failure.go
+++ b/ai-compliance-sdk/internal/api/handlers/iace_handler_failure.go
@@ -2,6 +2,7 @@ package handlers
 
 import (
 	"net/http"
+	"strconv"
 
 	"github.com/breakpilot/ai-compliance-sdk/internal/iace"
 	"github.com/gin-gonic/gin"
@@ -27,3 +28,43 @@ func (h *IACEHandler) ListFailureKnowledge(c *gin.Context) {
 		"total": len(items),
 	})
 }
+
+// HarvestNTRSFailures handles GET /failure-knowledge/ntrs.
+// Live-harvests NASA NTRS lessons-learned metadata and returns only the records
+// that pass the public-reuse licence gate (Stage 1 of the bulk loader). Tuple
+// extraction from the abstracts is a downstream step.
+//
+// Query parameters: q (search text, default "lessons learned failure") and
+// limit (page size, default 25). A failed upstream fetch is answered with 502.
+func (h *IACEHandler) HarvestNTRSFailures(c *gin.Context) {
+	q := c.DefaultQuery("q", "lessons learned failure")
+	// A malformed or out-of-range limit is deliberately not an error:
+	// FetchNTRSLessons replaces any value outside 1..100 with its default.
+	limit, err := strconv.Atoi(c.DefaultQuery("limit", "25"))
+	if err != nil {
+		limit = 0
+	}
+	lessons, err := iace.FetchNTRSLessons(c.Request.Context(), q, limit)
+	if err != nil {
+		c.JSON(http.StatusBadGateway, gin.H{"error": err.Error()})
+		return
+	}
+	// Allocate a non-nil slice so an empty result encodes as [] rather than null.
+	out := make([]gin.H, 0, len(lessons))
+	skipped := 0
+	for _, l := range lessons {
+		ok, lic := iace.NTRSUsable(l)
+		if !ok {
+			skipped++
+			continue
+		}
+		out = append(out, gin.H{
+			"id": l.ID, "title": l.Title, "abstract": l.Abstract,
+			"license": lic, "pdf_url": iace.NTRSPDFURL(l), "is_lessons_learned": l.IsLessonsLearned,
+		})
+	}
+	c.JSON(http.StatusOK, gin.H{
+		"query": q, "usable": out, "usable_count": len(out),
+		"skipped_non_open": skipped, "total_fetched": len(lessons),
+	})
+}
diff --git a/ai-compliance-sdk/internal/app/routes_iace.go b/ai-compliance-sdk/internal/app/routes_iace.go
index f7bb02f2..85b71e97 100644
--- a/ai-compliance-sdk/internal/app/routes_iace.go
+++ b/ai-compliance-sdk/internal/app/routes_iace.go
@@ -32,6 +32,7 @@ func registerIACERoutes(v1 *gin.RouterGroup, h *handlers.IACEHandler) {
 		iaceRoutes.GET("/energy-sources", h.ListEnergySources)
 		iaceRoutes.GET("/minimum-distances", h.ListMinimumDistances)
 		iaceRoutes.GET("/failure-knowledge", h.ListFailureKnowledge)
+		iaceRoutes.GET("/failure-knowledge/ntrs", h.HarvestNTRSFailures)
 		iaceRoutes.GET("/tags", h.ListTags)
 
 		iaceRoutes.GET("/hazard-patterns", h.ListHazardPatterns)
diff --git a/ai-compliance-sdk/internal/iace/ntrs_harvester.go b/ai-compliance-sdk/internal/iace/ntrs_harvester.go
new file mode 100644
index 00000000..13210f88
--- /dev/null
+++ b/ai-compliance-sdk/internal/iace/ntrs_harvester.go
@@ -0,0 +1,161 @@
+package iace
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"net/url"
+	"strings"
+	"time"
+)
+
+// NTRS (NASA Technical Reports Server) harvester — Stage 1 of the FailureKnowledge
+// bulk loader. Fetches lessons-learned / failure reports, applies a strict
+// public-reuse licence gate (public release, not export-controlled, not CUI,
+// public-use-permitted, no embedded third-party copyright), and exposes the
+// readable metadata (title/abstract) + the PDF link for downstream extraction.
+// NASA NTRS documents that pass the gate are US-Government public domain.
+
+const (
+	ntrsBaseURL   = "https://ntrs.nasa.gov"
+	ntrsSearchURL = ntrsBaseURL + "/api/citations/search"
+
+	// ntrsMaxBody caps how much of a search response is read into memory.
+	ntrsMaxBody = 8 << 20
+)
+
+// ntrsHTTPClient is shared by every harvest so connections can be reused.
+var ntrsHTTPClient = &http.Client{Timeout: 30 * time.Second}
+
+type ntrsCopyright struct {
+	DeterminationType          string `json:"determinationType"`
+	ContainsThirdPartyMaterial bool   `json:"containsThirdPartyMaterial"`
+	LicenseType                string `json:"licenseType"`
+}
+
+type ntrsExport struct {
+	IsExportControl string `json:"isExportControl"`
+	EAR             string `json:"ear"`
+	ITAR            string `json:"itar"`
+}
+
+type ntrsCui struct {
+	IsCui bool `json:"isCui"`
+}
+
+type ntrsDownload struct {
+	Mimetype string `json:"mimetype"`
+	Links    struct {
+		Original string `json:"original"`
+	} `json:"links"`
+}
+
+// NTRSLesson is one harvested NTRS record (only the fields we use).
+type NTRSLesson struct {
+	ID               int64          `json:"id"`
+	Title            string         `json:"title"`
+	Abstract         string         `json:"abstract"`
+	Distribution     string         `json:"distribution"`
+	IsLessonsLearned bool           `json:"isLessonsLearned"`
+	Copyright        ntrsCopyright  `json:"copyright"`
+	ExportControl    ntrsExport     `json:"exportControl"`
+	Cui              *ntrsCui       `json:"cui"`
+	Downloads        []ntrsDownload `json:"downloads"`
+}
+
+type ntrsSearchResponse struct {
+	Results []NTRSLesson `json:"results"`
+}
+
+// NTRSUsable reports whether a lesson is publicly + commercially reusable and
+// returns the licence string. The gate is conservative: any export-control, CUI,
+// third-party copyright, non-public distribution, or non-public-use copyright
+// determination disqualifies the record.
+func NTRSUsable(l NTRSLesson) (bool, string) {
+	if !strings.EqualFold(l.Distribution, "PUBLIC") {
+		return false, ""
+	}
+	if strings.EqualFold(l.ExportControl.IsExportControl, "YES") ||
+		strings.EqualFold(l.ExportControl.EAR, "YES") ||
+		strings.EqualFold(l.ExportControl.ITAR, "YES") {
+		return false, ""
+	}
+	if l.Cui != nil && l.Cui.IsCui {
+		return false, ""
+	}
+	if l.Copyright.ContainsThirdPartyMaterial {
+		return false, ""
+	}
+	switch strings.ToUpper(l.Copyright.DeterminationType) {
+	case "PUBLIC_USE_PERMITTED", "GOV_PUBLIC_USE_PERMITTED":
+		return true, "Public Domain (NASA NTRS, " + l.Copyright.DeterminationType + ")"
+	}
+	return false, ""
+}
+
+// NTRSPDFURL returns the absolute URL of the first PDF download, or "".
+// Site-relative links (the form used in the test fixture) are resolved against
+// ntrsBaseURL; a link that is already absolute is returned unchanged.
+func NTRSPDFURL(l NTRSLesson) string {
+	for _, d := range l.Downloads {
+		link := d.Links.Original
+		if link == "" || !strings.Contains(strings.ToLower(d.Mimetype), "pdf") {
+			continue
+		}
+		if strings.HasPrefix(link, "http://") || strings.HasPrefix(link, "https://") {
+			return link
+		}
+		return ntrsBaseURL + "/" + strings.TrimPrefix(link, "/")
+	}
+	return ""
+}
+
+// parseNTRSSearch parses an NTRS /search response body.
+func parseNTRSSearch(body []byte) ([]NTRSLesson, error) {
+	var r ntrsSearchResponse
+	if err := json.Unmarshal(body, &r); err != nil {
+		return nil, err
+	}
+	return r.Results, nil
+}
+
+// FetchNTRSLessons queries the NTRS search API. Network call; the parsing/gating
+// helpers above are pure and unit-tested.
+//
+// A limit outside 1..100 falls back to 25. The request honours ctx. A non-200
+// status, a body larger than ntrsMaxBody, or malformed JSON is returned as an
+// error.
+func FetchNTRSLessons(ctx context.Context, query string, limit int) ([]NTRSLesson, error) {
+	if limit <= 0 || limit > 100 {
+		limit = 25
+	}
+	u := fmt.Sprintf("%s?q=%s&page.size=%d&highlight=false", ntrsSearchURL, url.QueryEscape(query), limit)
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil)
+	if err != nil {
+		return nil, fmt.Errorf("ntrs: building request: %w", err)
+	}
+	resp, err := ntrsHTTPClient.Do(req)
+	if err != nil {
+		return nil, fmt.Errorf("ntrs: search request: %w", err)
+	}
+	defer resp.Body.Close()
+	if resp.StatusCode != http.StatusOK {
+		return nil, fmt.Errorf("ntrs status %d", resp.StatusCode)
+	}
+	// Read one byte past the cap so truncation is reported explicitly instead of
+	// surfacing later as a confusing JSON syntax error.
+	b, err := io.ReadAll(io.LimitReader(resp.Body, ntrsMaxBody+1))
+	if err != nil {
+		return nil, fmt.Errorf("ntrs: reading response: %w", err)
+	}
+	if len(b) > ntrsMaxBody {
+		return nil, fmt.Errorf("ntrs: response exceeds %d bytes", ntrsMaxBody)
+	}
+	lessons, err := parseNTRSSearch(b)
+	if err != nil {
+		return nil, fmt.Errorf("ntrs: decoding response: %w", err)
+	}
+	return lessons, nil
+}
diff --git a/ai-compliance-sdk/internal/iace/ntrs_harvester_test.go b/ai-compliance-sdk/internal/iace/ntrs_harvester_test.go
new file mode 100644
index 00000000..4677d022
--- /dev/null
+++ b/ai-compliance-sdk/internal/iace/ntrs_harvester_test.go
@@ -0,0 +1,87 @@
+package iace
+
+import "testing"
+
+// Fixture mirrors the real NTRS /search response shape (copyright is an OBJECT).
+const ntrsFixture = `{"results":[
+  {"id":20205010628,"title":"Lessons Learned from Large-Scale Aerospace Structural Testing","abstract":"A bracket fractured under load.",
+   "distribution":"PUBLIC","isLessonsLearned":true,
+   "copyright":{"determinationType":"GOV_PUBLIC_USE_PERMITTED","containsThirdPartyMaterial":false,"licenseType":"NO"},
+   "exportControl":{"isExportControl":"NO","ear":"NO","itar":"NO"},"cui":{"isCui":false},
+   "downloads":[{"mimetype":"application/pdf","links":{"original":"/api/citations/20205010628/downloads/paper.pdf"}}]},
+  {"id":1001,"title":"ITAR controlled","abstract":"x","distribution":"PUBLIC","isLessonsLearned":true,
+   "copyright":{"determinationType":"GOV_PUBLIC_USE_PERMITTED","containsThirdPartyMaterial":false},
+   "exportControl":{"isExportControl":"YES","ear":"NO","itar":"YES"},"cui":null,
+   "downloads":[{"mimetype":"application/pdf","links":{"original":"/x.pdf"}}]},
+  {"id":1002,"title":"Third party","abstract":"x","distribution":"PUBLIC","isLessonsLearned":true,
+   "copyright":{"determinationType":"PUBLIC_USE_PERMITTED","containsThirdPartyMaterial":true},
+   "exportControl":{"isExportControl":"NO"},"cui":null,"downloads":[]},
+  {"id":1003,"title":"Restricted dist","abstract":"x","distribution":"RESTRICTED",
+   "copyright":{"determinationType":"GOV_PUBLIC_USE_PERMITTED"},"exportControl":{"isExportControl":"NO"}},
+  {"id":1004,"title":"Video only","abstract":"x","distribution":"PUBLIC","isLessonsLearned":true,
+   "copyright":{"determinationType":"PUBLIC_USE_PERMITTED","containsThirdPartyMaterial":false},
+   "exportControl":{"isExportControl":"NO"},"cui":{"isCui":false},
+   "downloads":[{"mimetype":"video/mp4","links":{"original":"/v.mp4"}}]}
+]}`
+
+// mustParseNTRSFixture parses ntrsFixture and aborts the test if the fixture
+// is malformed or no longer holds the five records the tests index into.
+func mustParseNTRSFixture(t *testing.T) []NTRSLesson {
+	t.Helper()
+	ls, err := parseNTRSSearch([]byte(ntrsFixture))
+	if err != nil {
+		t.Fatalf("parse: %v", err)
+	}
+	if len(ls) != 5 {
+		t.Fatalf("expected 5 results, got %d", len(ls))
+	}
+	return ls
+}
+
+func TestParseNTRSSearch(t *testing.T) {
+	ls := mustParseNTRSFixture(t)
+	if ls[0].ID != 20205010628 || ls[0].Copyright.DeterminationType != "GOV_PUBLIC_USE_PERMITTED" {
+		t.Errorf("first record parsed wrong: %+v", ls[0])
+	}
+}
+
+func TestNTRSUsable_Gate(t *testing.T) {
+	ls := mustParseNTRSFixture(t)
+	want := []bool{true, false, false, false, true} // ok, ITAR, third-party, restricted, ok(video)
+	if len(want) != len(ls) {
+		t.Fatalf("fixture has %d records but %d expectations", len(ls), len(want))
+	}
+	for i, l := range ls {
+		ok, lic := NTRSUsable(l)
+		if ok != want[i] {
+			t.Errorf("record %d (%q): usable=%v, want %v", l.ID, l.Title, ok, want[i])
+		}
+		if ok && lic == "" {
+			t.Errorf("record %d usable but empty licence", l.ID)
+		}
+		// Every accepted record must also pass the failure-knowledge allowlist.
+		if ok && !FailureKnowledgeLicenseAllowed(lic) {
+			t.Errorf("record %d licence %q not allowed by FK allowlist", l.ID, lic)
+		}
+	}
+}
+
+func TestNTRSPDFURL(t *testing.T) {
+	ls := mustParseNTRSFixture(t)
+	if got := NTRSPDFURL(ls[0]); got != "https://ntrs.nasa.gov/api/citations/20205010628/downloads/paper.pdf" {
+		t.Errorf("pdf url = %q", got)
+	}
+	if got := NTRSPDFURL(ls[4]); got != "" { // video-only → no PDF
+		t.Errorf("video-only should have no PDF url, got %q", got)
+	}
+}
+
+func TestNTRSPDFURL_AbsoluteLink(t *testing.T) {
+	var d ntrsDownload
+	d.Mimetype = "application/pdf"
+	d.Links.Original = "https://example.org/paper.pdf"
+	l := NTRSLesson{Downloads: []ntrsDownload{d}}
+	if got := NTRSPDFURL(l); got != "https://example.org/paper.pdf" {
+		t.Errorf("absolute link was rewritten: %q", got)
+	}
+}