#!/usr/bin/env python3 """Safe re-ingestion of NIST/BSI/ENISA PDFs from MinIO. Uses upload-before-delete pattern: new chunks are created FIRST, old chunks are only deleted after successful verification. Usage: python3 control-pipeline/scripts/reingest_nist.py [--dry-run] python3 control-pipeline/scripts/reingest_nist.py --only-missing """ import argparse import json import logging import sys import time import httpx sys.path.insert(0, "control-pipeline/scripts") from reingest_d5_config import ( # noqa: E402 CHUNK_OVERLAP, CHUNK_SIZE, CHUNK_STRATEGY, DEFAULT_QDRANT_URL, DEFAULT_RAG_URL, content_type_from_filename, ) logging.basicConfig( level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s", ) logger = logging.getLogger("reingest-nist") UPLOAD_TIMEOUT = 1800.0 # 30 min for large PDFs # ------------------------------------------------------------------- # Documents to re-ingest # ------------------------------------------------------------------- # 4 documents with 0 chunks (deleted by D5, upload failed) MISSING_DOCS = [ { "object_name": "compliance/bund/compliance/2026/NIST_SP_800_53r5.pdf", "collection": "bp_compliance_datenschutz", "filename": "NIST_SP_800_53r5.pdf", "extra_metadata": { "regulation_id": "nist_sp800_53r5", "source_id": "nist", "doc_type": "controls_catalog", "guideline_name": "NIST SP 800-53 Rev. 5 Security and Privacy Controls", "license": "public_domain_us_gov", "attribution": "NIST", "source": "nist.gov", }, }, { "object_name": "compliance/bund/compliance/2026/nist_sp_800_82r3.pdf", "collection": "bp_compliance_ce", "filename": "nist_sp_800_82r3.pdf", "extra_metadata": { "regulation_id": "nist_sp_800_82r3", "regulation_name_de": "NIST SP 800-82 Rev. 3 — Guide to OT Security", "regulation_name_en": "NIST SP 800-82 Rev. 3 — Guide to OT Security", "regulation_short": "NIST SP 800-82", "category": "ot_security", "license": "public_domain_us", "source": "nist.gov", }, }, { "object_name": "compliance/bund/compliance/2026/nist_sp_800_160v1r1.pdf", "collection": "bp_compliance_ce", "filename": "nist_sp_800_160v1r1.pdf", "extra_metadata": { "regulation_id": "nist_sp_800_160v1r1", "regulation_name_de": "NIST SP 800-160 Vol. 1 Rev. 1", "regulation_name_en": "NIST SP 800-160 Vol. 1 Rev. 1", "regulation_short": "NIST SP 800-160", "category": "security_engineering", "license": "public_domain_us", "source": "nist.gov", }, }, { "object_name": "compliance/bund/compliance/2026/NIST_SP_800_207.pdf", "collection": "bp_compliance_datenschutz", "filename": "NIST_SP_800_207.pdf", "extra_metadata": { "regulation_id": "nist_sp800_207", "source_id": "nist", "doc_type": "architecture", "guideline_name": "NIST SP 800-207 Zero Trust Architecture", "license": "public_domain_us_gov", "attribution": "NIST", "source": "nist.gov", }, }, ] # Additional NIST/BSI/ENISA docs with <10% section rate (re-ingest for quality) LOW_QUALITY_DOCS = [ { "object_name": "compliance/bund/compliance/2026/nist_csf_2_0.pdf", "collection": "bp_compliance_datenschutz", "filename": "nist_csf_2_0.pdf", "extra_metadata": { "regulation_id": "nist_csf_2_0", "license": "public_domain_us", "source": "nist.gov", }, }, { "object_name": "compliance/bund/compliance/2026/nistir_8259a.pdf", "collection": "bp_compliance_datenschutz", "filename": "nistir_8259a.pdf", "extra_metadata": { "regulation_id": "nistir_8259a", "license": "public_domain_us", "source": "nist.gov", }, }, { "object_name": "compliance/bund/compliance/2026/nist_ai_rmf.pdf", "collection": "bp_compliance_datenschutz", "filename": "nist_ai_rmf.pdf", "extra_metadata": { "regulation_id": "nist_ai_rmf", "license": "public_domain_us", "source": "nist.gov", }, }, { "object_name": "compliance/bund/compliance/2026/nist_sp_800_30r1.pdf", "collection": "bp_compliance_ce", "filename": "nist_sp_800_30r1.pdf", "extra_metadata": { "regulation_id": "nist_sp_800_30r1", "license": "public_domain_us", "source": "nist.gov", }, }, { "object_name": "compliance/bund/compliance/2026/enisa_supply_chain_good_practices.pdf", "collection": "bp_compliance_ce", "filename": "enisa_supply_chain_good_practices.pdf", "extra_metadata": { "regulation_id": "enisa_supply_chain_good_practices", "license": "reuse_with_attribution", "source": "enisa.europa.eu", }, }, { "object_name": "compliance/bund/compliance/2026/enisa_ics_scada.pdf", "collection": "bp_compliance_ce", "filename": "enisa_ics_scada.pdf", "extra_metadata": { "regulation_id": "enisa_ics_scada_dependencies", "license": "reuse_with_attribution", "source": "enisa.europa.eu", }, }, { "object_name": "compliance/bund/compliance/2026/enisa_supply_chain_security.pdf", "collection": "bp_compliance_ce", "filename": "enisa_supply_chain_security.pdf", "extra_metadata": { "regulation_id": "enisa_threat_landscape_supply_chain", "license": "reuse_with_attribution", "source": "enisa.europa.eu", }, }, { "object_name": "compliance/bund/compliance/2026/cisa_secure_by_design.pdf", "collection": "bp_compliance_ce", "filename": "cisa_secure_by_design.pdf", "extra_metadata": { "regulation_id": "cisa_secure_by_design", "license": "public_domain_us", "source": "cisa.gov", }, }, { "object_name": "compliance/bund/compliance/2026/cvss_v4_0.pdf", "collection": "bp_compliance_ce", "filename": "cvss_v4_0.pdf", "extra_metadata": { "regulation_id": "cvss_v4_0", "license": "public_domain_us", "source": "first.org", }, }, ] # ------------------------------------------------------------------- # Qdrant helpers # ------------------------------------------------------------------- def count_chunks(qdrant_url: str, collection: str, object_name: str) -> int: """Count existing chunks for a document in Qdrant.""" with httpx.Client(timeout=30.0) as c: resp = c.post( f"{qdrant_url}/collections/{collection}/points/count", json={ "filter": { "must": [{ "key": "object_name", "match": {"value": object_name}, }] }, "exact": True, }, ) resp.raise_for_status() return resp.json()["result"]["count"] def get_old_document_ids( qdrant_url: str, collection: str, object_name: str, ) -> set: """Get all document_ids for existing chunks of this document.""" doc_ids = set() offset = None with httpx.Client(timeout=60.0) as c: while True: body = { "filter": { "must": [{ "key": "object_name", "match": {"value": object_name}, }] }, "limit": 100, "with_payload": ["document_id"], } if offset is not None: body["offset"] = offset resp = c.post( f"{qdrant_url}/collections/{collection}/points/scroll", json=body, ) resp.raise_for_status() data = resp.json()["result"] for pt in data["points"]: did = pt.get("payload", {}).get("document_id") if did: doc_ids.add(did) offset = data.get("next_page_offset") if offset is None: break return doc_ids def delete_by_document_ids( qdrant_url: str, collection: str, doc_ids: set, ) -> None: """Delete chunks matching specific document_ids.""" for did in doc_ids: with httpx.Client(timeout=30.0) as c: c.post( f"{qdrant_url}/collections/{collection}/points/delete", json={ "filter": { "must": [{ "key": "document_id", "match": {"value": did}, }] } }, ).raise_for_status() def check_section_rate( qdrant_url: str, collection: str, object_name: str, ) -> tuple: """Check section rate for a document's chunks. Returns (total, with_section).""" total = 0 with_section = 0 offset = None with httpx.Client(timeout=60.0) as c: while True: body = { "filter": { "must": [{ "key": "object_name", "match": {"value": object_name}, }] }, "limit": 100, "with_payload": ["section"], } if offset is not None: body["offset"] = offset resp = c.post( f"{qdrant_url}/collections/{collection}/points/scroll", json=body, ) resp.raise_for_status() data = resp.json()["result"] for pt in data["points"]: total += 1 sec = pt.get("payload", {}).get("section", "") if sec and sec.strip(): with_section += 1 offset = data.get("next_page_offset") if offset is None: break return total, with_section # ------------------------------------------------------------------- # Upload # ------------------------------------------------------------------- def download_from_minio(rag_url: str, object_name: str) -> bytes: """Download file from MinIO via RAG service presigned URL.""" with httpx.Client(timeout=60.0, verify=False) as c: resp = c.get(f"{rag_url}/api/v1/documents/download/{object_name}") resp.raise_for_status() presigned_url = resp.json()["url"] with httpx.Client(timeout=300.0, verify=False) as c: resp = c.get(presigned_url) resp.raise_for_status() return resp.content def upload_document( rag_url: str, file_bytes: bytes, filename: str, collection: str, extra_metadata: dict, ) -> dict: """Upload document to RAG service.""" ct = content_type_from_filename(filename) form_data = { "collection": collection, "data_type": "compliance", "bundesland": "bund", "use_case": "compliance", "year": "2026", "chunk_strategy": CHUNK_STRATEGY, "chunk_size": str(CHUNK_SIZE), "chunk_overlap": str(CHUNK_OVERLAP), "metadata_json": json.dumps(extra_metadata, ensure_ascii=False), } with httpx.Client(timeout=UPLOAD_TIMEOUT, verify=False) as c: resp = c.post( f"{rag_url}/api/v1/documents/upload", files={"file": (filename, file_bytes, ct)}, data=form_data, ) resp.raise_for_status() return resp.json() # ------------------------------------------------------------------- # Main processing # ------------------------------------------------------------------- def process_document( doc: dict, rag_url: str, qdrant_url: str, dry_run: bool = False, ) -> dict: """Safe re-ingest: upload first, then delete old. Returns result dict.""" obj = doc["object_name"] coll = doc["collection"] fname = doc["filename"] # 1. Check existing state old_count = count_chunks(qdrant_url, coll, obj) old_doc_ids = get_old_document_ids(qdrant_url, coll, obj) if old_count > 0 else set() logger.info(" [%s] existing: %d chunks, %d document_ids", fname, old_count, len(old_doc_ids)) if dry_run: logger.info(" [%s] DRY RUN — would download + upload + delete old", fname) return {"status": "dry_run", "old_chunks": old_count} # 2. Download from MinIO logger.info(" [%s] downloading from MinIO...", fname) file_bytes = download_from_minio(rag_url, obj) size_mb = len(file_bytes) / (1024 * 1024) logger.info(" [%s] downloaded %.1f MB", fname, size_mb) # 3. Upload FIRST (creates new chunks) logger.info(" [%s] uploading to RAG service...", fname) result = upload_document(rag_url, file_bytes, fname, coll, doc["extra_metadata"]) new_chunks = result.get("chunks_count", 0) new_doc_id = result.get("document_id", "") logger.info(" [%s] uploaded: %d new chunks (doc_id=%s)", fname, new_chunks, new_doc_id) # 4. Verify new chunks exist if new_chunks == 0: logger.error(" [%s] UPLOAD PRODUCED 0 CHUNKS — keeping old data!", fname) return {"status": "error", "error": "0 new chunks", "old_chunks": old_count} # 5. Delete old chunks (only if there were any) if old_doc_ids: logger.info(" [%s] deleting %d old document_ids...", fname, len(old_doc_ids)) delete_by_document_ids(qdrant_url, coll, old_doc_ids) logger.info(" [%s] old chunks deleted", fname) # 6. Check section rate total, with_sec = check_section_rate(qdrant_url, coll, obj) pct = (with_sec / total * 100) if total > 0 else 0 logger.info(" [%s] section rate: %d/%d (%.0f%%)", fname, with_sec, total, pct) return { "status": "ok", "old_chunks": old_count, "new_chunks": new_chunks, "new_document_id": new_doc_id, "section_rate": round(pct, 1), } def main(): parser = argparse.ArgumentParser(description="Safe NIST/BSI/ENISA re-ingestion") parser.add_argument("--dry-run", action="store_true", help="Show what would happen") parser.add_argument("--only-missing", action="store_true", help="Only re-ingest the 4 missing docs (skip low-quality)") parser.add_argument("--rag-url", default=DEFAULT_RAG_URL) parser.add_argument("--qdrant-url", default=DEFAULT_QDRANT_URL) args = parser.parse_args() docs = list(MISSING_DOCS) if not args.only_missing: docs.extend(LOW_QUALITY_DOCS) logger.info("=" * 60) logger.info("NIST/BSI/ENISA Safe Re-Ingestion") logger.info(" Documents: %d (%d missing + %d low-quality)", len(docs), len(MISSING_DOCS), 0 if args.only_missing else len(LOW_QUALITY_DOCS)) logger.info(" RAG: %s", args.rag_url) logger.info(" Qdrant: %s", args.qdrant_url) logger.info(" Dry run: %s", args.dry_run) logger.info("=" * 60) results = {} ok = 0 errors = 0 for i, doc in enumerate(docs, 1): logger.info("[%d/%d] %s → %s", i, len(docs), doc["filename"], doc["collection"]) try: r = process_document(doc, args.rag_url, args.qdrant_url, args.dry_run) results[doc["filename"]] = r if r["status"] == "ok": ok += 1 elif r["status"] == "error": errors += 1 except Exception as e: logger.error(" FAILED: %s", e) results[doc["filename"]] = {"status": "error", "error": str(e)} errors += 1 if i < len(docs): time.sleep(2) # Summary logger.info("") logger.info("=" * 60) logger.info("RESULTS") logger.info("=" * 60) for fname, r in results.items(): status = r["status"].upper() old = r.get("old_chunks", "?") new = r.get("new_chunks", "?") sec = r.get("section_rate", "?") logger.info(" %-40s %s old=%s new=%s sect=%.0f%%", fname, status, old, new, sec if isinstance(sec, float) else 0) logger.info("") logger.info("OK: %d, Errors: %d, Total: %d", ok, errors, len(docs)) if errors > 0: sys.exit(1) if __name__ == "__main__": main()