From c8a1a405545e6332d1a727526c1af1a413c1534b Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Sat, 13 Jun 2026 15:23:13 +0200 Subject: [PATCH] feat(iace): make FMEA harvest cumulative + resumable Load the prior manifest, seed the seen-set from it + anthropic.jsonl, and add MAX_DOCS NEW docs per run (100->200->...) instead of re-processing the first batch and overwriting the register. Widen NTRS paging to page.from 0..400 so enough fresh usable docs are found after the skip. Co-Authored-By: Claude Opus 4.7 --- .../scripts/fmea_anthropic_extract.py | 35 +++++++++++++------ 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/ai-compliance-sdk/scripts/fmea_anthropic_extract.py b/ai-compliance-sdk/scripts/fmea_anthropic_extract.py index 35b417a2..c83cf17c 100644 --- a/ai-compliance-sdk/scripts/fmea_anthropic_extract.py +++ b/ai-compliance-sdk/scripts/fmea_anthropic_extract.py @@ -125,29 +125,44 @@ def main(): print("ERROR: ANTHROPIC_API_KEY not set", file=sys.stderr); sys.exit(1) subprocess.run(["curl", "-sk", "-X", "POST", f"{RAG}/api/v1/collections", "-H", "Content-Type: application/json", "-d", json.dumps({"name": COLLECTION, "vector_size": 1024})], capture_output=True) - manifest, seen, processed = [], set(), 0 + # Resume cumulatively: keep the prior manifest, skip already-processed docs, + # and add MAX_DOCS NEW ones so the register grows (100 -> 200 -> ...). + manifest, seen = [], set() + if os.path.exists(MANIFEST): + try: + manifest = json.load(open(MANIFEST)).get("documents", []) + seen = {str(d.get("id")) for d in manifest} + except Exception: + manifest, seen = [], set() + if os.path.exists(OUT): + for line in open(OUT): + try: + seen.add(str(json.loads(line).get("_id"))) + except Exception: + pass + new_count = 0 for q in QUERIES: - if processed >= MAX_DOCS: + if new_count >= MAX_DOCS: break - for frm in (0, 100, 200): - if processed >= MAX_DOCS: + for frm in (0, 100, 200, 300, 400): + if new_count >= MAX_DOCS: break try: res = get_json(f"{NTRS}/api/citations/search?q={urllib.parse.quote(q)}&page.size=100&page.from={frm}&highlight=false", timeout=90) except Exception as e: print(f"search '{q}'@{frm} error: {e}", flush=True); continue for l in res.get("results", []): - if processed >= MAX_DOCS: + if new_count >= MAX_DOCS: break did = l.get("id") - if not did or did in seen: + if not did or str(did) in seen: continue if not ntrs_usable(l): continue url = pdf_url(l) if not url: continue - seen.add(did) + seen.add(str(did)) pdf = f"{WORK}/{did}.pdf" subprocess.run(["curl", "-sL", "--max-time", "150", "-o", pdf, url], capture_output=True) sz = os.path.getsize(pdf) if os.path.exists(pdf) else 0 @@ -175,12 +190,12 @@ def main(): manifest.append({"id": did, "title": title, "source": f"NASA NTRS {did}", "license": lic, "url": url, "used": used, "component": t.get("component", ""), "failure_mode": t.get("failure_mode", ""), "confidence": t.get("confidence", "")}) - processed += 1 + new_count += 1 json.dump({"generated": "nightly", "model": MODEL, "count": len(manifest), "documents": manifest}, open(MANIFEST, "w"), ensure_ascii=False, indent=1) - print(f" [{processed}] {did} used={used} {t.get('component','?')}→{t.get('failure_mode','?')} ({usage.get('input_tokens')}in)", flush=True) + print(f" [+{new_count}/{MAX_DOCS} | total {len(manifest)}] {did} used={used} {t.get('component','?')}→{t.get('failure_mode','?')} ({usage.get('input_tokens')}in)", flush=True) used_n = sum(1 for m in manifest if m["used"]) - print(f"DONE: {processed} processed, {used_n} used (applicable) -> {MANIFEST}", flush=True) + print(f"DONE: +{new_count} new this run, {len(manifest)} total, {used_n} used (applicable) -> {MANIFEST}", flush=True) if __name__ == "__main__":