feat(iace): make FMEA harvest cumulative + resumable

Load the prior manifest, seed the seen-set from it and from anthropic.jsonl,
and add up to MAX_DOCS new docs per run so the register grows cumulatively
(100 -> 200 -> ...) instead of re-processing the first batch and overwriting
the register. Widen NTRS paging to page.from = 0, 100, 200, 300, 400 so that
enough fresh usable docs are found after skipping the already-seen ones.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-06-13 15:23:13 +02:00
parent d720db07dd
commit c8a1a40554
@@ -125,29 +125,44 @@ def main():
print("ERROR: ANTHROPIC_API_KEY not set", file=sys.stderr); sys.exit(1) print("ERROR: ANTHROPIC_API_KEY not set", file=sys.stderr); sys.exit(1)
subprocess.run(["curl", "-sk", "-X", "POST", f"{RAG}/api/v1/collections", "-H", "Content-Type: application/json", subprocess.run(["curl", "-sk", "-X", "POST", f"{RAG}/api/v1/collections", "-H", "Content-Type: application/json",
"-d", json.dumps({"name": COLLECTION, "vector_size": 1024})], capture_output=True) "-d", json.dumps({"name": COLLECTION, "vector_size": 1024})], capture_output=True)
manifest, seen, processed = [], set(), 0 # Resume cumulatively: keep the prior manifest, skip already-processed docs,
# and add MAX_DOCS NEW ones so the register grows (100 -> 200 -> ...).
manifest, seen = [], set()
if os.path.exists(MANIFEST):
try:
manifest = json.load(open(MANIFEST)).get("documents", [])
seen = {str(d.get("id")) for d in manifest}
except Exception:
manifest, seen = [], set()
if os.path.exists(OUT):
for line in open(OUT):
try:
seen.add(str(json.loads(line).get("_id")))
except Exception:
pass
new_count = 0
for q in QUERIES: for q in QUERIES:
if processed >= MAX_DOCS: if new_count >= MAX_DOCS:
break break
for frm in (0, 100, 200): for frm in (0, 100, 200, 300, 400):
if processed >= MAX_DOCS: if new_count >= MAX_DOCS:
break break
try: try:
res = get_json(f"{NTRS}/api/citations/search?q={urllib.parse.quote(q)}&page.size=100&page.from={frm}&highlight=false", timeout=90) res = get_json(f"{NTRS}/api/citations/search?q={urllib.parse.quote(q)}&page.size=100&page.from={frm}&highlight=false", timeout=90)
except Exception as e: except Exception as e:
print(f"search '{q}'@{frm} error: {e}", flush=True); continue print(f"search '{q}'@{frm} error: {e}", flush=True); continue
for l in res.get("results", []): for l in res.get("results", []):
if processed >= MAX_DOCS: if new_count >= MAX_DOCS:
break break
did = l.get("id") did = l.get("id")
if not did or did in seen: if not did or str(did) in seen:
continue continue
if not ntrs_usable(l): if not ntrs_usable(l):
continue continue
url = pdf_url(l) url = pdf_url(l)
if not url: if not url:
continue continue
seen.add(did) seen.add(str(did))
pdf = f"{WORK}/{did}.pdf" pdf = f"{WORK}/{did}.pdf"
subprocess.run(["curl", "-sL", "--max-time", "150", "-o", pdf, url], capture_output=True) subprocess.run(["curl", "-sL", "--max-time", "150", "-o", pdf, url], capture_output=True)
sz = os.path.getsize(pdf) if os.path.exists(pdf) else 0 sz = os.path.getsize(pdf) if os.path.exists(pdf) else 0
@@ -175,12 +190,12 @@ def main():
manifest.append({"id": did, "title": title, "source": f"NASA NTRS {did}", "license": lic, "url": url, manifest.append({"id": did, "title": title, "source": f"NASA NTRS {did}", "license": lic, "url": url,
"used": used, "component": t.get("component", ""), "failure_mode": t.get("failure_mode", ""), "used": used, "component": t.get("component", ""), "failure_mode": t.get("failure_mode", ""),
"confidence": t.get("confidence", "")}) "confidence": t.get("confidence", "")})
processed += 1 new_count += 1
json.dump({"generated": "nightly", "model": MODEL, "count": len(manifest), "documents": manifest}, json.dump({"generated": "nightly", "model": MODEL, "count": len(manifest), "documents": manifest},
open(MANIFEST, "w"), ensure_ascii=False, indent=1) open(MANIFEST, "w"), ensure_ascii=False, indent=1)
print(f" [{processed}] {did} used={used} {t.get('component','?')}{t.get('failure_mode','?')} ({usage.get('input_tokens')}in)", flush=True) print(f" [+{new_count}/{MAX_DOCS} | total {len(manifest)}] {did} used={used} {t.get('component','?')}{t.get('failure_mode','?')} ({usage.get('input_tokens')}in)", flush=True)
used_n = sum(1 for m in manifest if m["used"]) used_n = sum(1 for m in manifest if m["used"])
print(f"DONE: {processed} processed, {used_n} used (applicable) -> {MANIFEST}", flush=True) print(f"DONE: +{new_count} new this run, {len(manifest)} total, {used_n} used (applicable) -> {MANIFEST}", flush=True)
if __name__ == "__main__": if __name__ == "__main__":