diff --git a/admin-compliance/app/sdk/iace/[projectId]/fmea/_components/SourceDocuments.tsx b/admin-compliance/app/sdk/iace/[projectId]/fmea/_components/SourceDocuments.tsx new file mode 100644 index 00000000..57908978 --- /dev/null +++ b/admin-compliance/app/sdk/iace/[projectId]/fmea/_components/SourceDocuments.tsx @@ -0,0 +1,77 @@ +'use client' + +import { useState } from 'react' +import { useFailureSources } from '../_hooks/useFailureSources' + +/** + * FMEA source-document register: shows EVERY document the failure-knowledge + * corpus is built from, with source + licence + link. Collapsible; "used" = + * a concrete failure was extracted and ingested, "geprüft" = checked but not a + * failure. Full provenance, always visible. + */ +export function SourceDocuments() { + const { data } = useFailureSources() + const [open, setOpen] = useState(false) + if (!data) return null + + const used = data.documents.filter((d) => d.used) + return ( +
+ + {open && ( +
+ {data.count === 0 ? ( +

Noch keine Quelldokumente extrahiert.

+ ) : ( +
+ + + + + + + + + + + {data.documents.map((d) => ( + + + + + + + ))} + +
DokumentQuelle / LizenzExtrahiertStatus
+ + {d.title || d.source} + + + {d.source}
{d.license} +
+ {d.used ? `${d.component} → ${d.failure_mode}` : '—'} + {d.used && d.confidence && ({d.confidence})} + + + {d.used ? 'verwendet' : 'geprüft'} + +
+
+ )} +

+ Alle Quellen sind kommerziell nutzbar (lizenzgeprüft) und über den Link einsehbar. Extraktion + Claude-basiert mit Quell-Zitat je Wert (verified=false bis stichprobengeprüft). +

+
+ )} +
+ ) +} diff --git a/admin-compliance/app/sdk/iace/[projectId]/fmea/_hooks/useFailureSources.ts b/admin-compliance/app/sdk/iace/[projectId]/fmea/_hooks/useFailureSources.ts new file mode 100644 index 00000000..890a71f5 --- /dev/null +++ b/admin-compliance/app/sdk/iace/[projectId]/fmea/_hooks/useFailureSources.ts @@ -0,0 +1,46 @@ +'use client' + +import { useEffect, useState } from 'react' + +export interface FailureSourceDoc { + id: number + title: string + source: string + license: string + url: string + used: boolean + component: string + failure_mode: string + confidence: string +} + +export interface FailureSources { + generated: string + model: string + count: number + documents: FailureSourceDoc[] +} + +/** + * Loads the FMEA source-document register (every document we use, with source + + * licence). Global, not per project — the auditable provenance of the + * failure-knowledge corpus. + */ +export function useFailureSources() { + const [data, setData] = useState(null) + + useEffect(() => { + let cancelled = false + fetch('/api/sdk/v1/iace/failure-knowledge/sources') + .then((r) => (r.ok ? 
r.json() : null)) + .then((j) => { + if (!cancelled) setData(j) + }) + .catch((err) => console.error('Failed to load failure sources:', err)) + return () => { + cancelled = true + } + }, []) + + return { data } +} diff --git a/admin-compliance/app/sdk/iace/[projectId]/fmea/page.tsx b/admin-compliance/app/sdk/iace/[projectId]/fmea/page.tsx index d4f2fac7..40872378 100644 --- a/admin-compliance/app/sdk/iace/[projectId]/fmea/page.tsx +++ b/admin-compliance/app/sdk/iace/[projectId]/fmea/page.tsx @@ -3,6 +3,7 @@ import { useEffect, useState } from 'react' import { useParams } from 'next/navigation' import { useFMEA, type FMEARow } from './_hooks/useFMEA' +import { SourceDocuments } from './_components/SourceDocuments' const COMP_TYPE_LABELS: Record = { mechanical: 'Mechanisch', electrical: 'Elektrisch', sensor: 'Sensor', @@ -60,6 +61,9 @@ export default function FMEAPage() { {/* Info Box */} + {/* Source-document register (provenance, always visible) */} + + {/* KI-Vorschlag + Export */}
diff --git a/ai-compliance-sdk/internal/api/handlers/iace_handler_failure.go b/ai-compliance-sdk/internal/api/handlers/iace_handler_failure.go
index 35d19526..80734aad 100644
--- a/ai-compliance-sdk/internal/api/handlers/iace_handler_failure.go
+++ b/ai-compliance-sdk/internal/api/handlers/iace_handler_failure.go
@@ -29,6 +29,15 @@ func (h *IACEHandler) ListFailureKnowledge(c *gin.Context) {
 	})
 }
 
+// ListFailureSources serves GET /failure-knowledge/sources.
+// It answers 200 with the register returned by iace.GetFailureSources(), i.e. the
+// source-document manifest (source, licence, URL and extraction result per
+// document) that the FMEA frontend displays as provenance.
+func (h *IACEHandler) ListFailureSources(c *gin.Context) {
+	register := iace.GetFailureSources()
+	c.JSON(http.StatusOK, register)
+}
+
 // HarvestNTRSFailures handles GET /failure-knowledge/ntrs.
 // Live-harvests NASA NTRS lessons-learned metadata and returns only the records
 // that pass the public-reuse licence gate (Stage 1 of the bulk loader).
Tuple diff --git a/ai-compliance-sdk/internal/app/routes_iace.go b/ai-compliance-sdk/internal/app/routes_iace.go index 85b71e97..e98da4a6 100644 --- a/ai-compliance-sdk/internal/app/routes_iace.go +++ b/ai-compliance-sdk/internal/app/routes_iace.go @@ -33,6 +33,7 @@ func registerIACERoutes(v1 *gin.RouterGroup, h *handlers.IACEHandler) { iaceRoutes.GET("/minimum-distances", h.ListMinimumDistances) iaceRoutes.GET("/failure-knowledge", h.ListFailureKnowledge) iaceRoutes.GET("/failure-knowledge/ntrs", h.HarvestNTRSFailures) + iaceRoutes.GET("/failure-knowledge/sources", h.ListFailureSources) iaceRoutes.GET("/tags", h.ListTags) iaceRoutes.GET("/hazard-patterns", h.ListHazardPatterns) diff --git a/ai-compliance-sdk/internal/iace/datasources/nasa_failure_sources.json b/ai-compliance-sdk/internal/iace/datasources/nasa_failure_sources.json new file mode 100644 index 00000000..b51100f0 --- /dev/null +++ b/ai-compliance-sdk/internal/iace/datasources/nasa_failure_sources.json @@ -0,0 +1,6 @@ +{ + "generated": "pending-first-run", + "model": "claude-haiku-4-5-20251001", + "count": 0, + "documents": [] +} diff --git a/ai-compliance-sdk/internal/iace/failure_sources.go b/ai-compliance-sdk/internal/iace/failure_sources.go new file mode 100644 index 00000000..59f59982 --- /dev/null +++ b/ai-compliance-sdk/internal/iace/failure_sources.go @@ -0,0 +1,46 @@ +package iace + +import ( + _ "embed" + "encoding/json" +) + +// Source-document register for the FMEA failure-knowledge corpus. The manifest +// is generated by the Anthropic extraction run (scripts/fmea_anthropic_extract.py) +// and committed here, so the FMEA frontend can ALWAYS show every source document +// we use — with its source + licence — straight from the deployed binary. +// Each entry is auditable: a public URL + the licence under which we use it. + +//go:embed datasources/nasa_failure_sources.json +var failureSourcesJSON []byte + +// FailureSourceDoc is one source document in the register. 
+type FailureSourceDoc struct {
+	ID          int64  `json:"id"`
+	Title       string `json:"title"`
+	Source      string `json:"source"`
+	License     string `json:"license"`
+	URL         string `json:"url"`
+	Used        bool   `json:"used"`      // true = applicable failure that was ingested into the corpus
+	Component   string `json:"component"` // extraction result; empty when the document was not used
+	FailureMode string `json:"failure_mode"`
+	Confidence  string `json:"confidence"`
+}
+
+// FailureSources is the top-level shape of the embedded register manifest.
+type FailureSources struct {
+	Generated string             `json:"generated"`
+	Model     string             `json:"model"`
+	Count     int                `json:"count"`
+	Documents []FailureSourceDoc `json:"documents"`
+}
+
+// GetFailureSources decodes the embedded manifest; Documents is never nil in the result.
+func GetFailureSources() FailureSources {
+	register := FailureSources{}
+	_ = json.Unmarshal(failureSourcesJSON, &register) // best-effort: a decode error leaves the zero value
+	if register.Documents == nil {
+		register.Documents = make([]FailureSourceDoc, 0) // serialise as [] rather than null
+	}
+	return register
+}
diff --git a/ai-compliance-sdk/internal/iace/failure_sources_test.go b/ai-compliance-sdk/internal/iace/failure_sources_test.go
new file mode 100644
index 00000000..c60e42a4
--- /dev/null
+++ b/ai-compliance-sdk/internal/iace/failure_sources_test.go
@@ -0,0 +1,20 @@
+package iace
+
+import "testing"
+
+// The embedded manifest must always parse and every document must carry a
+// source + licence (the register is the auditable provenance shown in the UI).
+func TestGetFailureSources_ParsesAndAttributed(t *testing.T) {
+	fs := GetFailureSources()
+	if fs.Documents == nil || fs.Generated == "" || fs.Count != len(fs.Documents) { // parse errors are swallowed upstream: detect them via zero values
+		t.Fatalf("manifest did not parse or is inconsistent: generated=%q count=%d documents=%d", fs.Generated, fs.Count, len(fs.Documents))
+	}
+	for _, d := range fs.Documents {
+		if d.Source == "" || d.License == "" {
+			t.Errorf("doc %d missing source/license: %+v", d.ID, d)
+		}
+		if !FailureKnowledgeLicenseAllowed(d.License) {
+			t.Errorf("doc %d carries a non-allowed licence %q", d.ID, d.License)
+		}
+	}
+}
diff --git a/ai-compliance-sdk/scripts/fmea_anthropic_extract.py b/ai-compliance-sdk/scripts/fmea_anthropic_extract.py
new file mode 100644
index 00000000..35b417a2
--- /dev/null
+++ b/ai-compliance-sdk/scripts/fmea_anthropic_extract.py
@@ -0,0 +1,245 @@
+#!/usr/bin/env python3
+"""Quote-verifiable FailureKnowledge extraction via the Anthropic API.
+
+Sends each NASA NTRS PDF DIRECTLY to Claude (native PDF + vision, incl. scanned)
+and forces a structured tool call whose schema REQUIRES a verbatim source quote
+for the key fields — every value is auditable against the document. Only
+public-reuse-licensed NTRS docs are processed. applicable=true tuples are
+ingested into bp_iace_failure_kb; EVERY processed doc is recorded in the source
+manifest (used vs checked-not-used) for the FMEA frontend register.
+
+Env: ANTHROPIC_API_KEY (req), ANTHROPIC_MODEL (default claude-haiku-4-5-20251001),
+     MAX_DOCS (default 100), RAG (default https://127.0.0.1:8097).
+Out: /tmp/fmea_harvest/anthropic.jsonl, /tmp/fmea_harvest/nasa_failure_sources.json
+"""
+import base64, json, os, ssl, subprocess, sys, time, urllib.parse, urllib.request, urllib.error
+
+API = "https://api.anthropic.com/v1/messages"
+NTRS = "https://ntrs.nasa.gov"
+RAG = os.environ.get("RAG", "https://127.0.0.1:8097")
+COLLECTION = "bp_iace_failure_kb"
+MODEL = os.environ.get("ANTHROPIC_MODEL", "claude-haiku-4-5-20251001")
+KEY = os.environ.get("ANTHROPIC_API_KEY", "")
+MAX_DOCS = int(os.environ.get("MAX_DOCS", "100"))
+WORK = "/tmp/fmea_harvest"
+OUT = f"{WORK}/anthropic.jsonl"
+MANIFEST = f"{WORK}/nasa_failure_sources.json"
+QUERIES = ["lessons learned failure", "lessons learned anomaly valve seal bearing motor",
+           "failure investigation fracture leak short corrosion", "reliability failure mechanism component"]
+os.makedirs(WORK, exist_ok=True)
+# TLS context used by get_json() for the NTRS search and the Anthropic API. Certificate
+# and hostname verification stay enabled: these requests carry the API key. The local RAG
+# service is only reached through `curl -k` below, so it does not depend on this context.
+_ctx = ssl.create_default_context()
+
+TOOL = {
+    "name": "record_failure",
+    "description": "Record the hardware failure described in the document, with verbatim source quotes.",
+    "input_schema": {"type": "object", "properties": {
+        "applicable": {"type": "boolean", "description": "true ONLY if the document describes a concrete hardware/component failure"},
+        "component": {"type": "string"}, "component_quote": {"type": "string", "description": "verbatim sentence naming the component"},
+        "failure_mode": {"type": "string"}, "failure_mode_quote": {"type": "string", "description": "verbatim sentence describing how it failed"},
+        "mechanism": {"type": "string"}, "effect": {"type": "string"},
+        "hazard": {"type": "string", "enum": ["mechanical_hazard", "electrical_hazard", "thermal_hazard", "fire_explosion", "pneumatic_hydraulic", "none"]},
+        "control": {"type": "string"}, "confidence": {"type": "string", "enum": ["high", "medium", "low"]}},
+        "required": ["applicable", "component", "failure_mode", "component_quote", "failure_mode_quote", "confidence"]},
+}
+PROMPT = ("Read this NASA engineering document and extract the single primary hardware failure. Use the "
+          "record_failure tool. Quote the SOURCE SENTENCE verbatim for component and failure_mode. If the "
+          "document does not describe a concrete component failure, call the tool with applicable=false. "
+          "Never invent values — leave a field empty rather than guess.")
+
+
+def get_json(url, data=None, headers=None, timeout=120):
+    """Fetch `url` and return the decoded JSON body.
+
+    Sends a POST when `data` (bytes) is given, otherwise a GET. TLS certificates are
+    verified via `_ctx`; HTTP error statuses raise urllib.error.HTTPError.
+    """
+    req = urllib.request.Request(url, data=data, headers=headers or {})
+    with urllib.request.urlopen(req, timeout=timeout, context=_ctx) as r:
+        return json.load(r)
+
+
+def ntrs_usable(l):
+    """Return True only if NTRS record `l` passes the public-reuse licence gate.
+
+    Requires PUBLIC distribution, no export-control flag set to YES, no CUI marking,
+    no third-party material and an explicit public-use copyright determination.
+    """
+    if str(l.get("distribution", "")).upper() != "PUBLIC":
+        return False
+    ec = l.get("exportControl") or {}
+    if any(str(ec.get(k, "")).upper() == "YES" for k in ("isExportControl", "ear", "itar")):
+        return False
+    if (l.get("cui") or {}).get("isCui"):
+        return False
+    cp = l.get("copyright") or {}
+    if cp.get("containsThirdPartyMaterial"):
+        return False
+    return str(cp.get("determinationType", "")).upper() in ("PUBLIC_USE_PERMITTED", "GOV_PUBLIC_USE_PERMITTED")
+
+
+def pdf_url(l):
+    """Return the absolute URL of the first PDF download of NTRS record `l`, or None."""
+    for d in l.get("downloads", []):
+        if "pdf" in str(d.get("mimetype", "")).lower():
+            o = (d.get("links") or {}).get("original")
+            if o:
+                return NTRS + o
+    return None
+
+
+def extract(pdf_b64):
+    """Send one base64-encoded PDF to the Messages API, forcing a record_failure tool call.
+
+    Returns (tool_input, usage); tool_input is None when the reply has no tool_use block.
+    A urllib URLError (HTTP error statuses included) or ConnectionError is retried once
+    after 3 s; the second failure propagates to the caller.
+    """
+    body = json.dumps({"model": MODEL, "max_tokens": 1024, "tools": [TOOL],
+                       "tool_choice": {"type": "tool", "name": "record_failure"},
+                       "messages": [{"role": "user", "content": [
+                           {"type": "document", "source": {"type": "base64", "media_type": "application/pdf", "data": pdf_b64}},
+                           {"type": "text", "text": PROMPT}]}]}).encode()
+    hdr = {"x-api-key": KEY, "anthropic-version": "2023-06-01", "content-type": "application/json"}
+    for attempt in (1, 2):
+        try:
+            resp = get_json(API, body, hdr, timeout=180)
+        except (urllib.error.URLError, ConnectionError):
+            if attempt == 2:
+                raise
+            time.sleep(3)
+            continue
+        usage = resp.get("usage", {})
+        for b in resp.get("content", []):
+            if b.get("type") == "tool_use":
+                return b.get("input"), usage
+        return None, usage
+    return None, {}
+
+
+def _remove_quiet(path):
+    """Best-effort delete of a temporary file; failure to delete is ignored."""
+    try:
+        os.remove(path)
+    except OSError:
+        pass
+
+
+def _fetch_pdf_b64(did, url):
+    """Download one PDF via curl and return it base64-encoded, or None if unusable.
+
+    Files smaller than 1000 bytes or larger than 30 MB are skipped. The temporary
+    download is always removed again.
+    """
+    pdf = f"{WORK}/{did}.pdf"
+    # -f makes curl fail on HTTP errors instead of saving the error page as the "PDF".
+    subprocess.run(["curl", "-sfL", "--max-time", "150", "-o", pdf, url], capture_output=True)
+    try:
+        sz = os.path.getsize(pdf) if os.path.exists(pdf) else 0
+        if sz < 1000 or sz > 30_000_000:
+            return None
+        with open(pdf, "rb") as fh:
+            return base64.b64encode(fh.read()).decode()
+    finally:
+        _remove_quiet(pdf)
+
+
+def ingest(did, title, t, lic, url):
+    """Upload one extracted failure tuple to the RAG collection as a markdown document.
+
+    Returns True when the upload response contains "chunks_count", False otherwise
+    (including when curl exceeds the 120 s subprocess timeout).
+    """
+    md = (f"# NASA Lesson {did}: {title}\n\n- Source: NASA NTRS {did}\n- License: {lic}\n- URL: {url}\n"
+          f"- verified: false (Claude-extracted, quote-checked)\n\n"
+          f"Component: {t.get('component','')}\nFailure mode: {t.get('failure_mode','')}\n"
+          f"Mechanism: {t.get('mechanism','')}\nEffect: {t.get('effect','')}\nHazard: {t.get('hazard','')}\n"
+          f"Control: {t.get('control','')}\nConfidence: {t.get('confidence','')}\n\n"
+          f"Component quote: {t.get('component_quote','')}\nFailure-mode quote: {t.get('failure_mode_quote','')}\n")
+    p = f"{WORK}/fk_{did}.md"
+    # Titles and quotes may contain non-ASCII text; do not depend on the locale encoding.
+    with open(p, "w", encoding="utf-8") as fh:
+        fh.write(md)
+    meta = json.dumps({"title": f"NASA NTRS {did}: {title}"[:120], "license": lic, "source": f"NASA NTRS {did}", "verified": "false"})
+    try:
+        r = subprocess.run(["curl", "-sk", "--max-time", "90", "-X", "POST", f"{RAG}/api/v1/documents/upload",
+                            "-F", f"file=@{p}", "-F", f"collection={COLLECTION}", "-F", "data_type=failure_kb",
+                            "-F", "use_case=iace_fmea", "-F", "year=2024", "-F", f"metadata_json={meta}"],
+                           capture_output=True, text=True, timeout=120)
+    except subprocess.TimeoutExpired:
+        return False
+    finally:
+        _remove_quiet(p)
+    return "chunks_count" in r.stdout
+
+
+def main():
+    """Harvest NTRS search results, extract one failure per usable PDF and write the manifest.
+
+    Stops after MAX_DOCS documents were processed. The manifest file is rewritten after
+    every processed document.
+    """
+    if not KEY:
+        print("ERROR: ANTHROPIC_API_KEY not set", file=sys.stderr)
+        sys.exit(1)
+    subprocess.run(["curl", "-sk", "-X", "POST", f"{RAG}/api/v1/collections", "-H", "Content-Type: application/json",
+                    "-d", json.dumps({"name": COLLECTION, "vector_size": 1024})], capture_output=True)
+    manifest, seen, processed = [], set(), 0
+    for q in QUERIES:
+        if processed >= MAX_DOCS:
+            break
+        for frm in (0, 100, 200):
+            if processed >= MAX_DOCS:
+                break
+            try:
+                res = get_json(f"{NTRS}/api/citations/search?q={urllib.parse.quote(q)}&page.size=100&page.from={frm}&highlight=false", timeout=90)
+            except Exception as e:
+                print(f"search '{q}'@{frm} error: {e}", flush=True)
+                continue
+            for rec in res.get("results", []):
+                if processed >= MAX_DOCS:
+                    break
+                did = rec.get("id")
+                if not did or did in seen:
+                    continue
+                if not ntrs_usable(rec):
+                    continue
+                url = pdf_url(rec)
+                if not url:
+                    continue
+                seen.add(did)
+                b64 = _fetch_pdf_b64(did, url)
+                if b64 is None:
+                    continue
+                try:
+                    t, usage = extract(b64)
+                except Exception as e:
+                    print(f" {did} extract error: {e}", flush=True)
+                    continue
+                if not t:
+                    continue
+                title = (rec.get("title") or "")[:110]
+                lic = f"Public Domain (NASA NTRS, {(rec.get('copyright') or {}).get('determinationType','')})"
+                used = bool(t.get("applicable"))
+                t.update({"_id": did, "_title": title, "_license": lic, "_url": url, "_model": MODEL, "_used": used})
+                with open(OUT, "a", encoding="utf-8") as fh:
+                    fh.write(json.dumps(t, ensure_ascii=False) + "\n")
+                if used and not ingest(did, title, t, lic, url):
+                    # The upload result used to be ignored; report a failed ingest instead.
+                    print(f" {did} ingest failed", flush=True)
+                manifest.append({"id": did, "title": title, "source": f"NASA NTRS {did}", "license": lic, "url": url,
+                                 "used": used, "component": t.get("component", ""), "failure_mode": t.get("failure_mode", ""),
+                                 "confidence": t.get("confidence", "")})
+                processed += 1
+                with open(MANIFEST, "w", encoding="utf-8") as fh:
+                    json.dump({"generated": "nightly", "model": MODEL, "count": len(manifest), "documents": manifest},
+                              fh, ensure_ascii=False, indent=1)
+                print(f" [{processed}] {did} used={used} {t.get('component','?')}→{t.get('failure_mode','?')} ({usage.get('input_tokens')}in)", flush=True)
+    used_n = sum(1 for m in manifest if m["used"])
+    print(f"DONE: {processed} processed, {used_n} used (applicable) -> {MANIFEST}", flush=True)
+
+
+if __name__ == "__main__":
+    main()