Files
breakpilot-core/control-pipeline/scripts/reingest_nist.py
Benjamin Admin 0b0eed27b0 feat(embedding): NIST PDF text normalization + safe re-ingest script
Fix broken multi-column PDF extraction for NIST/BSI/ENISA documents:
- _normalize_pdf_text(): fixes broken section numbers (1 . 1 → 1.1),
  control IDs (AC - 1 → AC-1), ligatures, soft hyphens
- pdfplumber tolerances increased (x=3,y=4) for better column handling
- 3 new regex patterns: NIST CSF 2.0, NIST enhancements, OWASP Top 10
- reingest_nist.py: safe upload-before-delete for 4 lost NIST PDFs
- reingest_d5.py: safety fix — upload first, verify, then delete old

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-03 06:42:46 +02:00
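
The _normalize_pdf_text() helper referenced in the commit message lives in the embedding pipeline, not in the script below. As a rough sketch only (the patterns here are illustrative assumptions, not the repository's actual implementation), the repairs the commit message describes could look like this:

import re

def normalize_pdf_text(text: str) -> str:
    """Illustrative sketch of the cleanup described above (not the repo code)."""
    text = text.replace("\u00ad", "")  # strip soft hyphens left by line breaks
    for lig, repl in {"ﬁ": "fi", "ﬂ": "fl", "ﬀ": "ff"}.items():
        text = text.replace(lig, repl)  # expand common PDF ligatures
    # Rejoin section numbers split by column extraction: "1 . 1" -> "1.1"
    text = re.sub(r"\b(\d+)\s*\.\s*(\d+)\b", r"\1.\2", text)
    # Rejoin control IDs split by extraction: "AC - 1" -> "AC-1"
    text = re.sub(r"\b([A-Z]{2,3})\s*-\s*(\d+)\b", r"\1-\2", text)
    return text

The pdfplumber tolerance change in the same commit refers to the x_tolerance/y_tolerance arguments of page.extract_text(), which control how far apart characters may sit and still be merged into one word or line.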

486 lines
16 KiB
Python

#!/usr/bin/env python3
"""Safe re-ingestion of NIST/BSI/ENISA PDFs from MinIO.
Uses upload-before-delete pattern: new chunks are created FIRST,
old chunks are only deleted after successful verification.
Usage:
    python3 control-pipeline/scripts/reingest_nist.py [--dry-run]
    python3 control-pipeline/scripts/reingest_nist.py --only-missing
"""
import argparse
import json
import logging
import sys
import time

import httpx

sys.path.insert(0, "control-pipeline/scripts")
from reingest_d5_config import (  # noqa: E402
    CHUNK_OVERLAP,
    CHUNK_SIZE,
    CHUNK_STRATEGY,
    DEFAULT_QDRANT_URL,
    DEFAULT_RAG_URL,
    content_type_from_filename,
)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
)
logger = logging.getLogger("reingest-nist")
UPLOAD_TIMEOUT = 1800.0 # 30 min for large PDFs
# -------------------------------------------------------------------
# Documents to re-ingest
# -------------------------------------------------------------------
# 4 documents with 0 chunks (deleted by D5, upload failed)
MISSING_DOCS = [
    {
        "object_name": "compliance/bund/compliance/2026/NIST_SP_800_53r5.pdf",
        "collection": "bp_compliance_datenschutz",
        "filename": "NIST_SP_800_53r5.pdf",
        "extra_metadata": {
            "regulation_id": "nist_sp800_53r5",
            "source_id": "nist",
            "doc_type": "controls_catalog",
            "guideline_name": "NIST SP 800-53 Rev. 5 Security and Privacy Controls",
            "license": "public_domain_us_gov",
            "attribution": "NIST",
            "source": "nist.gov",
        },
    },
    {
        "object_name": "compliance/bund/compliance/2026/nist_sp_800_82r3.pdf",
        "collection": "bp_compliance_ce",
        "filename": "nist_sp_800_82r3.pdf",
        "extra_metadata": {
            "regulation_id": "nist_sp_800_82r3",
            "regulation_name_de": "NIST SP 800-82 Rev. 3 — Guide to OT Security",
            "regulation_name_en": "NIST SP 800-82 Rev. 3 — Guide to OT Security",
            "regulation_short": "NIST SP 800-82",
            "category": "ot_security",
            "license": "public_domain_us",
            "source": "nist.gov",
        },
    },
    {
        "object_name": "compliance/bund/compliance/2026/nist_sp_800_160v1r1.pdf",
        "collection": "bp_compliance_ce",
        "filename": "nist_sp_800_160v1r1.pdf",
        "extra_metadata": {
            "regulation_id": "nist_sp_800_160v1r1",
            "regulation_name_de": "NIST SP 800-160 Vol. 1 Rev. 1",
            "regulation_name_en": "NIST SP 800-160 Vol. 1 Rev. 1",
            "regulation_short": "NIST SP 800-160",
            "category": "security_engineering",
            "license": "public_domain_us",
            "source": "nist.gov",
        },
    },
    {
        "object_name": "compliance/bund/compliance/2026/NIST_SP_800_207.pdf",
        "collection": "bp_compliance_datenschutz",
        "filename": "NIST_SP_800_207.pdf",
        "extra_metadata": {
            "regulation_id": "nist_sp800_207",
            "source_id": "nist",
            "doc_type": "architecture",
            "guideline_name": "NIST SP 800-207 Zero Trust Architecture",
            "license": "public_domain_us_gov",
            "attribution": "NIST",
            "source": "nist.gov",
        },
    },
]

# Additional NIST/BSI/ENISA docs with <10% section rate (re-ingest for quality)
LOW_QUALITY_DOCS = [
    {
        "object_name": "compliance/bund/compliance/2026/nist_csf_2_0.pdf",
        "collection": "bp_compliance_datenschutz",
        "filename": "nist_csf_2_0.pdf",
        "extra_metadata": {
            "regulation_id": "nist_csf_2_0",
            "license": "public_domain_us",
            "source": "nist.gov",
        },
    },
    {
        "object_name": "compliance/bund/compliance/2026/nistir_8259a.pdf",
        "collection": "bp_compliance_datenschutz",
        "filename": "nistir_8259a.pdf",
        "extra_metadata": {
            "regulation_id": "nistir_8259a",
            "license": "public_domain_us",
            "source": "nist.gov",
        },
    },
    {
        "object_name": "compliance/bund/compliance/2026/nist_ai_rmf.pdf",
        "collection": "bp_compliance_datenschutz",
        "filename": "nist_ai_rmf.pdf",
        "extra_metadata": {
            "regulation_id": "nist_ai_rmf",
            "license": "public_domain_us",
            "source": "nist.gov",
        },
    },
    {
        "object_name": "compliance/bund/compliance/2026/nist_sp_800_30r1.pdf",
        "collection": "bp_compliance_ce",
        "filename": "nist_sp_800_30r1.pdf",
        "extra_metadata": {
            "regulation_id": "nist_sp_800_30r1",
            "license": "public_domain_us",
            "source": "nist.gov",
        },
    },
    {
        "object_name": "compliance/bund/compliance/2026/enisa_supply_chain_good_practices.pdf",
        "collection": "bp_compliance_ce",
        "filename": "enisa_supply_chain_good_practices.pdf",
        "extra_metadata": {
            "regulation_id": "enisa_supply_chain_good_practices",
            "license": "reuse_with_attribution",
            "source": "enisa.europa.eu",
        },
    },
    {
        "object_name": "compliance/bund/compliance/2026/enisa_ics_scada.pdf",
        "collection": "bp_compliance_ce",
        "filename": "enisa_ics_scada.pdf",
        "extra_metadata": {
            "regulation_id": "enisa_ics_scada_dependencies",
            "license": "reuse_with_attribution",
            "source": "enisa.europa.eu",
        },
    },
    {
        "object_name": "compliance/bund/compliance/2026/enisa_supply_chain_security.pdf",
        "collection": "bp_compliance_ce",
        "filename": "enisa_supply_chain_security.pdf",
        "extra_metadata": {
            "regulation_id": "enisa_threat_landscape_supply_chain",
            "license": "reuse_with_attribution",
            "source": "enisa.europa.eu",
        },
    },
    {
        "object_name": "compliance/bund/compliance/2026/cisa_secure_by_design.pdf",
        "collection": "bp_compliance_ce",
        "filename": "cisa_secure_by_design.pdf",
        "extra_metadata": {
            "regulation_id": "cisa_secure_by_design",
            "license": "public_domain_us",
            "source": "cisa.gov",
        },
    },
    {
        "object_name": "compliance/bund/compliance/2026/cvss_v4_0.pdf",
        "collection": "bp_compliance_ce",
        "filename": "cvss_v4_0.pdf",
        "extra_metadata": {
            "regulation_id": "cvss_v4_0",
            "license": "public_domain_us",
            "source": "first.org",
        },
    },
]
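# To add another document for re-ingestion, append an entry with the same
# shape; the keys below are the ones this script actually reads
# ("object_name", "collection", "filename", "extra_metadata"). This entry is
# a hypothetical example, not a document from the lists above:
#
# {
#     "object_name": "compliance/bund/compliance/2026/example_guideline.pdf",
#     "collection": "bp_compliance_ce",
#     "filename": "example_guideline.pdf",
#     "extra_metadata": {
#         "regulation_id": "example_guideline",
#         "license": "public_domain_us",
#         "source": "example.org",
#     },
# },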
# -------------------------------------------------------------------
# Qdrant helpers
# -------------------------------------------------------------------
def count_chunks(qdrant_url: str, collection: str, object_name: str) -> int:
    """Count existing chunks for a document in Qdrant."""
    with httpx.Client(timeout=30.0) as c:
        resp = c.post(
            f"{qdrant_url}/collections/{collection}/points/count",
            json={
                "filter": {
                    "must": [{
                        "key": "object_name",
                        "match": {"value": object_name},
                    }]
                },
                "exact": True,
            },
        )
        resp.raise_for_status()
        return resp.json()["result"]["count"]

def get_old_document_ids(
    qdrant_url: str, collection: str, object_name: str,
) -> set:
    """Get all document_ids for existing chunks of this document."""
    doc_ids = set()
    offset = None
    with httpx.Client(timeout=60.0) as c:
        while True:
            body = {
                "filter": {
                    "must": [{
                        "key": "object_name",
                        "match": {"value": object_name},
                    }]
                },
                "limit": 100,
                "with_payload": ["document_id"],
            }
            if offset is not None:
                body["offset"] = offset
            resp = c.post(
                f"{qdrant_url}/collections/{collection}/points/scroll",
                json=body,
            )
            resp.raise_for_status()
            data = resp.json()["result"]
            for pt in data["points"]:
                did = pt.get("payload", {}).get("document_id")
                if did:
                    doc_ids.add(did)
            offset = data.get("next_page_offset")
            if offset is None:
                break
    return doc_ids

def delete_by_document_ids(
    qdrant_url: str, collection: str, doc_ids: set,
) -> None:
    """Delete chunks matching specific document_ids."""
    for did in doc_ids:
        with httpx.Client(timeout=30.0) as c:
            c.post(
                f"{qdrant_url}/collections/{collection}/points/delete",
                json={
                    "filter": {
                        "must": [{
                            "key": "document_id",
                            "match": {"value": did},
                        }]
                    }
                },
            ).raise_for_status()

def check_section_rate(
    qdrant_url: str, collection: str, object_name: str,
) -> tuple:
    """Check section rate for a document's chunks. Returns (total, with_section)."""
    total = 0
    with_section = 0
    offset = None
    with httpx.Client(timeout=60.0) as c:
        while True:
            body = {
                "filter": {
                    "must": [{
                        "key": "object_name",
                        "match": {"value": object_name},
                    }]
                },
                "limit": 100,
                "with_payload": ["section"],
            }
            if offset is not None:
                body["offset"] = offset
            resp = c.post(
                f"{qdrant_url}/collections/{collection}/points/scroll",
                json=body,
            )
            resp.raise_for_status()
            data = resp.json()["result"]
            for pt in data["points"]:
                total += 1
                sec = pt.get("payload", {}).get("section", "")
                if sec and sec.strip():
                    with_section += 1
            offset = data.get("next_page_offset")
            if offset is None:
                break
    return total, with_section

# -------------------------------------------------------------------
# Upload
# -------------------------------------------------------------------
def download_from_minio(rag_url: str, object_name: str) -> bytes:
    """Download file from MinIO via RAG service presigned URL."""
    with httpx.Client(timeout=60.0, verify=False) as c:
        resp = c.get(f"{rag_url}/api/v1/documents/download/{object_name}")
        resp.raise_for_status()
        presigned_url = resp.json()["url"]
    with httpx.Client(timeout=300.0, verify=False) as c:
        resp = c.get(presigned_url)
        resp.raise_for_status()
        return resp.content

def upload_document(
    rag_url: str,
    file_bytes: bytes,
    filename: str,
    collection: str,
    extra_metadata: dict,
) -> dict:
    """Upload document to RAG service."""
    ct = content_type_from_filename(filename)
    form_data = {
        "collection": collection,
        "data_type": "compliance",
        "bundesland": "bund",
        "use_case": "compliance",
        "year": "2026",
        "chunk_strategy": CHUNK_STRATEGY,
        "chunk_size": str(CHUNK_SIZE),
        "chunk_overlap": str(CHUNK_OVERLAP),
        "metadata_json": json.dumps(extra_metadata, ensure_ascii=False),
    }
    with httpx.Client(timeout=UPLOAD_TIMEOUT, verify=False) as c:
        resp = c.post(
            f"{rag_url}/api/v1/documents/upload",
            files={"file": (filename, file_bytes, ct)},
            data=form_data,
        )
        resp.raise_for_status()
        return resp.json()

# -------------------------------------------------------------------
# Main processing
# -------------------------------------------------------------------
def process_document(
    doc: dict,
    rag_url: str,
    qdrant_url: str,
    dry_run: bool = False,
) -> dict:
    """Safe re-ingest: upload first, then delete old. Returns result dict."""
    obj = doc["object_name"]
    coll = doc["collection"]
    fname = doc["filename"]

    # 1. Check existing state
    old_count = count_chunks(qdrant_url, coll, obj)
    old_doc_ids = get_old_document_ids(qdrant_url, coll, obj) if old_count > 0 else set()
    logger.info(" [%s] existing: %d chunks, %d document_ids",
                fname, old_count, len(old_doc_ids))
    if dry_run:
        logger.info(" [%s] DRY RUN — would download + upload + delete old", fname)
        return {"status": "dry_run", "old_chunks": old_count}

    # 2. Download from MinIO
    logger.info(" [%s] downloading from MinIO...", fname)
    file_bytes = download_from_minio(rag_url, obj)
    size_mb = len(file_bytes) / (1024 * 1024)
    logger.info(" [%s] downloaded %.1f MB", fname, size_mb)

    # 3. Upload FIRST (creates new chunks)
    logger.info(" [%s] uploading to RAG service...", fname)
    result = upload_document(rag_url, file_bytes, fname, coll, doc["extra_metadata"])
    new_chunks = result.get("chunks_count", 0)
    new_doc_id = result.get("document_id", "")
    logger.info(" [%s] uploaded: %d new chunks (doc_id=%s)", fname, new_chunks, new_doc_id)

    # 4. Verify new chunks exist
    if new_chunks == 0:
        logger.error(" [%s] UPLOAD PRODUCED 0 CHUNKS — keeping old data!", fname)
        return {"status": "error", "error": "0 new chunks", "old_chunks": old_count}

    # 5. Delete old chunks (only if there were any)
    if old_doc_ids:
        logger.info(" [%s] deleting %d old document_ids...", fname, len(old_doc_ids))
        delete_by_document_ids(qdrant_url, coll, old_doc_ids)
        logger.info(" [%s] old chunks deleted", fname)

    # 6. Check section rate
    total, with_sec = check_section_rate(qdrant_url, coll, obj)
    pct = (with_sec / total * 100) if total > 0 else 0
    logger.info(" [%s] section rate: %d/%d (%.0f%%)", fname, with_sec, total, pct)

    return {
        "status": "ok",
        "old_chunks": old_count,
        "new_chunks": new_chunks,
        "new_document_id": new_doc_id,
        "section_rate": round(pct, 1),
    }

def main():
    parser = argparse.ArgumentParser(description="Safe NIST/BSI/ENISA re-ingestion")
    parser.add_argument("--dry-run", action="store_true", help="Show what would happen")
    parser.add_argument("--only-missing", action="store_true",
                        help="Only re-ingest the 4 missing docs (skip low-quality)")
    parser.add_argument("--rag-url", default=DEFAULT_RAG_URL)
    parser.add_argument("--qdrant-url", default=DEFAULT_QDRANT_URL)
    args = parser.parse_args()

    docs = list(MISSING_DOCS)
    if not args.only_missing:
        docs.extend(LOW_QUALITY_DOCS)

    logger.info("=" * 60)
    logger.info("NIST/BSI/ENISA Safe Re-Ingestion")
    logger.info(" Documents: %d (%d missing + %d low-quality)",
                len(docs), len(MISSING_DOCS),
                0 if args.only_missing else len(LOW_QUALITY_DOCS))
    logger.info(" RAG: %s", args.rag_url)
    logger.info(" Qdrant: %s", args.qdrant_url)
    logger.info(" Dry run: %s", args.dry_run)
    logger.info("=" * 60)

    results = {}
    ok = 0
    errors = 0
    for i, doc in enumerate(docs, 1):
        logger.info("[%d/%d] %s → %s", i, len(docs), doc["filename"], doc["collection"])
        try:
            r = process_document(doc, args.rag_url, args.qdrant_url, args.dry_run)
            results[doc["filename"]] = r
            if r["status"] == "ok":
                ok += 1
            elif r["status"] == "error":
                errors += 1
        except Exception as e:
            logger.error(" FAILED: %s", e)
            results[doc["filename"]] = {"status": "error", "error": str(e)}
            errors += 1
        if i < len(docs):
            time.sleep(2)

    # Summary
    logger.info("")
    logger.info("=" * 60)
    logger.info("RESULTS")
    logger.info("=" * 60)
    for fname, r in results.items():
        status = r["status"].upper()
        old = r.get("old_chunks", "?")
        new = r.get("new_chunks", "?")
        sec = r.get("section_rate", "?")
        logger.info(" %-40s %s old=%s new=%s sect=%.0f%%",
                    fname, status, old, new, sec if isinstance(sec, float) else 0)
    logger.info("")
    logger.info("OK: %d, Errors: %d, Total: %d", ok, errors, len(docs))
    if errors > 0:
        sys.exit(1)


if __name__ == "__main__":
    main()
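
For reference, both service URLs can also be overridden on the command line; the hosts in this example are placeholders, not values from the repository configuration:

python3 control-pipeline/scripts/reingest_nist.py --dry-run --rag-url https://rag.example.internal --qdrant-url http://qdrant.example.internal:6333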