From c8a1a405545e6332d1a727526c1af1a413c1534b Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBook-Pro.local>
Date: Sat, 13 Jun 2026 15:23:13 +0200
Subject: [PATCH] feat(iace): make FMEA harvest cumulative + resumable

Load the prior manifest, seed the seen-set from its document ids and from the
ids already recorded in anthropic.jsonl, and add up to MAX_DOCS NEW docs per
run (100->200->...) instead of re-processing the first batch and overwriting
the register. Widen NTRS paging to page.from 0..400 so enough fresh usable
docs are found after the skip.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../scripts/fmea_anthropic_extract.py         | 35 +++++++++++++------
 1 file changed, 25 insertions(+), 10 deletions(-)

diff --git a/ai-compliance-sdk/scripts/fmea_anthropic_extract.py b/ai-compliance-sdk/scripts/fmea_anthropic_extract.py
index 35b417a2..c83cf17c 100644
--- a/ai-compliance-sdk/scripts/fmea_anthropic_extract.py
+++ b/ai-compliance-sdk/scripts/fmea_anthropic_extract.py
@@ -125,29 +125,44 @@ def main():
         print("ERROR: ANTHROPIC_API_KEY not set", file=sys.stderr); sys.exit(1)
     subprocess.run(["curl", "-sk", "-X", "POST", f"{RAG}/api/v1/collections", "-H", "Content-Type: application/json",
                     "-d", json.dumps({"name": COLLECTION, "vector_size": 1024})], capture_output=True)
-    manifest, seen, processed = [], set(), 0
+    # Resume cumulatively: keep the prior manifest, skip already-processed docs,
+    # and add MAX_DOCS NEW ones so the register grows (100 -> 200 -> ...).
+    manifest, seen = [], set()
+    if os.path.exists(MANIFEST):
+        try:
+            manifest = json.load(open(MANIFEST)).get("documents", [])
+            seen = {str(d.get("id")) for d in manifest}
+        except Exception:
+            manifest, seen = [], set()
+    if os.path.exists(OUT):
+        for line in open(OUT):
+            try:
+                seen.add(str(json.loads(line).get("_id")))
+            except Exception:
+                pass
+    new_count = 0
     for q in QUERIES:
-        if processed >= MAX_DOCS:
+        if new_count >= MAX_DOCS:
             break
-        for frm in (0, 100, 200):
-            if processed >= MAX_DOCS:
+        for frm in (0, 100, 200, 300, 400):
+            if new_count >= MAX_DOCS:
                 break
             try:
                 res = get_json(f"{NTRS}/api/citations/search?q={urllib.parse.quote(q)}&page.size=100&page.from={frm}&highlight=false", timeout=90)
             except Exception as e:
                 print(f"search '{q}'@{frm} error: {e}", flush=True); continue
             for l in res.get("results", []):
-                if processed >= MAX_DOCS:
+                if new_count >= MAX_DOCS:
                     break
                 did = l.get("id")
-                if not did or did in seen:
+                if not did or str(did) in seen:
                     continue
                 if not ntrs_usable(l):
                     continue
                 url = pdf_url(l)
                 if not url:
                     continue
-                seen.add(did)
+                seen.add(str(did))
                 pdf = f"{WORK}/{did}.pdf"
                 subprocess.run(["curl", "-sL", "--max-time", "150", "-o", pdf, url], capture_output=True)
                 sz = os.path.getsize(pdf) if os.path.exists(pdf) else 0
@@ -175,12 +190,12 @@ def main():
                 manifest.append({"id": did, "title": title, "source": f"NASA NTRS {did}", "license": lic, "url": url,
                                  "used": used, "component": t.get("component", ""), "failure_mode": t.get("failure_mode", ""),
                                  "confidence": t.get("confidence", "")})
-                processed += 1
+                new_count += 1
                 json.dump({"generated": "nightly", "model": MODEL, "count": len(manifest), "documents": manifest},
                           open(MANIFEST, "w"), ensure_ascii=False, indent=1)
-                print(f"  [{processed}] {did} used={used} {t.get('component','?')}→{t.get('failure_mode','?')} ({usage.get('input_tokens')}in)", flush=True)
+                print(f"  [+{new_count}/{MAX_DOCS} | total {len(manifest)}] {did} used={used} {t.get('component','?')}→{t.get('failure_mode','?')} ({usage.get('input_tokens')}in)", flush=True)
     used_n = sum(1 for m in manifest if m["used"])
-    print(f"DONE: {processed} processed, {used_n} used (applicable) -> {MANIFEST}", flush=True)
+    print(f"DONE: +{new_count} new this run, {len(manifest)} total, {used_n} used (applicable) -> {MANIFEST}", flush=True)
 
 
 if __name__ == "__main__":