feat(iace): make FMEA harvest cumulative + resumable
Load the prior manifest, seed the seen-set from it and from anthropic.jsonl, and add MAX_DOCS NEW docs per run (100 -> 200 -> ...) instead of re-processing the first batch and overwriting the register. Widen NTRS paging to page.from 0..400 so that enough fresh, usable docs are found after already-seen docs are skipped.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -125,29 +125,44 @@ def main():
|
|||||||
print("ERROR: ANTHROPIC_API_KEY not set", file=sys.stderr); sys.exit(1)
|
print("ERROR: ANTHROPIC_API_KEY not set", file=sys.stderr); sys.exit(1)
|
||||||
subprocess.run(["curl", "-sk", "-X", "POST", f"{RAG}/api/v1/collections", "-H", "Content-Type: application/json",
|
subprocess.run(["curl", "-sk", "-X", "POST", f"{RAG}/api/v1/collections", "-H", "Content-Type: application/json",
|
||||||
"-d", json.dumps({"name": COLLECTION, "vector_size": 1024})], capture_output=True)
|
"-d", json.dumps({"name": COLLECTION, "vector_size": 1024})], capture_output=True)
|
||||||
manifest, seen, processed = [], set(), 0
|
# Resume cumulatively: keep the prior manifest, skip already-processed docs,
|
||||||
|
# and add MAX_DOCS NEW ones so the register grows (100 -> 200 -> ...).
|
||||||
|
manifest, seen = [], set()
|
||||||
|
if os.path.exists(MANIFEST):
|
||||||
|
try:
|
||||||
|
manifest = json.load(open(MANIFEST)).get("documents", [])
|
||||||
|
seen = {str(d.get("id")) for d in manifest}
|
||||||
|
except Exception:
|
||||||
|
manifest, seen = [], set()
|
||||||
|
if os.path.exists(OUT):
|
||||||
|
for line in open(OUT):
|
||||||
|
try:
|
||||||
|
seen.add(str(json.loads(line).get("_id")))
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
new_count = 0
|
||||||
for q in QUERIES:
|
for q in QUERIES:
|
||||||
if processed >= MAX_DOCS:
|
if new_count >= MAX_DOCS:
|
||||||
break
|
break
|
||||||
for frm in (0, 100, 200):
|
for frm in (0, 100, 200, 300, 400):
|
||||||
if processed >= MAX_DOCS:
|
if new_count >= MAX_DOCS:
|
||||||
break
|
break
|
||||||
try:
|
try:
|
||||||
res = get_json(f"{NTRS}/api/citations/search?q={urllib.parse.quote(q)}&page.size=100&page.from={frm}&highlight=false", timeout=90)
|
res = get_json(f"{NTRS}/api/citations/search?q={urllib.parse.quote(q)}&page.size=100&page.from={frm}&highlight=false", timeout=90)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"search '{q}'@{frm} error: {e}", flush=True); continue
|
print(f"search '{q}'@{frm} error: {e}", flush=True); continue
|
||||||
for l in res.get("results", []):
|
for l in res.get("results", []):
|
||||||
if processed >= MAX_DOCS:
|
if new_count >= MAX_DOCS:
|
||||||
break
|
break
|
||||||
did = l.get("id")
|
did = l.get("id")
|
||||||
if not did or did in seen:
|
if not did or str(did) in seen:
|
||||||
continue
|
continue
|
||||||
if not ntrs_usable(l):
|
if not ntrs_usable(l):
|
||||||
continue
|
continue
|
||||||
url = pdf_url(l)
|
url = pdf_url(l)
|
||||||
if not url:
|
if not url:
|
||||||
continue
|
continue
|
||||||
seen.add(did)
|
seen.add(str(did))
|
||||||
pdf = f"{WORK}/{did}.pdf"
|
pdf = f"{WORK}/{did}.pdf"
|
||||||
subprocess.run(["curl", "-sL", "--max-time", "150", "-o", pdf, url], capture_output=True)
|
subprocess.run(["curl", "-sL", "--max-time", "150", "-o", pdf, url], capture_output=True)
|
||||||
sz = os.path.getsize(pdf) if os.path.exists(pdf) else 0
|
sz = os.path.getsize(pdf) if os.path.exists(pdf) else 0
|
||||||
@@ -175,12 +190,12 @@ def main():
|
|||||||
manifest.append({"id": did, "title": title, "source": f"NASA NTRS {did}", "license": lic, "url": url,
|
manifest.append({"id": did, "title": title, "source": f"NASA NTRS {did}", "license": lic, "url": url,
|
||||||
"used": used, "component": t.get("component", ""), "failure_mode": t.get("failure_mode", ""),
|
"used": used, "component": t.get("component", ""), "failure_mode": t.get("failure_mode", ""),
|
||||||
"confidence": t.get("confidence", "")})
|
"confidence": t.get("confidence", "")})
|
||||||
processed += 1
|
new_count += 1
|
||||||
json.dump({"generated": "nightly", "model": MODEL, "count": len(manifest), "documents": manifest},
|
json.dump({"generated": "nightly", "model": MODEL, "count": len(manifest), "documents": manifest},
|
||||||
open(MANIFEST, "w"), ensure_ascii=False, indent=1)
|
open(MANIFEST, "w"), ensure_ascii=False, indent=1)
|
||||||
print(f" [{processed}] {did} used={used} {t.get('component','?')}→{t.get('failure_mode','?')} ({usage.get('input_tokens')}in)", flush=True)
|
print(f" [+{new_count}/{MAX_DOCS} | total {len(manifest)}] {did} used={used} {t.get('component','?')}→{t.get('failure_mode','?')} ({usage.get('input_tokens')}in)", flush=True)
|
||||||
used_n = sum(1 for m in manifest if m["used"])
|
used_n = sum(1 for m in manifest if m["used"])
|
||||||
print(f"DONE: {processed} processed, {used_n} used (applicable) -> {MANIFEST}", flush=True)
|
print(f"DONE: +{new_count} new this run, {len(manifest)} total, {used_n} used (applicable) -> {MANIFEST}", flush=True)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
Reference in New Issue
Block a user