feat(embedding): NIST PDF text normalization + safe re-ingest script
Fix broken multi-column PDF extraction for NIST/BSI/ENISA documents:

- _normalize_pdf_text(): fixes broken section numbers (1 . 1 → 1.1), control IDs (AC - 1 → AC-1), ligatures, soft hyphens
- pdfplumber tolerances increased (x=3, y=4) for better column handling
- 3 new regex patterns: NIST CSF 2.0, NIST enhancements, OWASP Top 10
- reingest_nist.py: safe upload-before-delete for 4 lost NIST PDFs
- reingest_d5.py: safety fix — upload first, verify, then delete old

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
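The normalization itself lives in the embedding service and is not part of this diff. A minimal sketch of the fixes described above (the exact patterns are assumptions, illustrative only):

    import re

    def _normalize_pdf_text(text: str) -> str:
        # Drop soft hyphens left over from line breaks.
        text = text.replace("\u00ad", "")
        # Expand common typographic ligatures.
        text = text.replace("\ufb01", "fi").replace("\ufb02", "fl")
        # Rejoin section numbers split by column extraction: "1 . 1" -> "1.1"
        text = re.sub(r"(\d)\s+\.\s*(\d)", r"\1.\2", text)
        # Rejoin control IDs split across tokens: "AC - 1" -> "AC-1"
        text = re.sub(r"\b([A-Z]{2})\s+-\s+(\d+)\b", r"\1-\2", text)
        return text

    # pdfplumber side: wider tolerances keep multi-column text from
    # splitting mid-token, e.g.
    # page.extract_text(x_tolerance=3, y_tolerance=4)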
@@ -0,0 +1,485 @@
#!/usr/bin/env python3
"""Safe re-ingestion of NIST/BSI/ENISA PDFs from MinIO.

Uses upload-before-delete pattern: new chunks are created FIRST,
old chunks are only deleted after successful verification.

Usage:
    python3 control-pipeline/scripts/reingest_nist.py [--dry-run]
    python3 control-pipeline/scripts/reingest_nist.py --only-missing
"""

import argparse
import json
import logging
import sys
import time

import httpx

sys.path.insert(0, "control-pipeline/scripts")
from reingest_d5_config import (  # noqa: E402
    CHUNK_OVERLAP,
    CHUNK_SIZE,
    CHUNK_STRATEGY,
    DEFAULT_QDRANT_URL,
    DEFAULT_RAG_URL,
    content_type_from_filename,
)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
)
logger = logging.getLogger("reingest-nist")

UPLOAD_TIMEOUT = 1800.0  # 30 min for large PDFs


# -------------------------------------------------------------------
# Documents to re-ingest
# -------------------------------------------------------------------

# 4 documents with 0 chunks (deleted by D5, upload failed)
MISSING_DOCS = [
    {
        "object_name": "compliance/bund/compliance/2026/NIST_SP_800_53r5.pdf",
        "collection": "bp_compliance_datenschutz",
        "filename": "NIST_SP_800_53r5.pdf",
        "extra_metadata": {
            "regulation_id": "nist_sp800_53r5",
            "source_id": "nist",
            "doc_type": "controls_catalog",
            "guideline_name": "NIST SP 800-53 Rev. 5 Security and Privacy Controls",
            "license": "public_domain_us_gov",
            "attribution": "NIST",
            "source": "nist.gov",
        },
    },
    {
        "object_name": "compliance/bund/compliance/2026/nist_sp_800_82r3.pdf",
        "collection": "bp_compliance_ce",
        "filename": "nist_sp_800_82r3.pdf",
        "extra_metadata": {
            "regulation_id": "nist_sp_800_82r3",
            "regulation_name_de": "NIST SP 800-82 Rev. 3 — Guide to OT Security",
            "regulation_name_en": "NIST SP 800-82 Rev. 3 — Guide to OT Security",
            "regulation_short": "NIST SP 800-82",
            "category": "ot_security",
            "license": "public_domain_us",
            "source": "nist.gov",
        },
    },
    {
        "object_name": "compliance/bund/compliance/2026/nist_sp_800_160v1r1.pdf",
        "collection": "bp_compliance_ce",
        "filename": "nist_sp_800_160v1r1.pdf",
        "extra_metadata": {
            "regulation_id": "nist_sp_800_160v1r1",
            "regulation_name_de": "NIST SP 800-160 Vol. 1 Rev. 1",
            "regulation_name_en": "NIST SP 800-160 Vol. 1 Rev. 1",
            "regulation_short": "NIST SP 800-160",
            "category": "security_engineering",
            "license": "public_domain_us",
            "source": "nist.gov",
        },
    },
    {
        "object_name": "compliance/bund/compliance/2026/NIST_SP_800_207.pdf",
        "collection": "bp_compliance_datenschutz",
        "filename": "NIST_SP_800_207.pdf",
        "extra_metadata": {
            "regulation_id": "nist_sp800_207",
            "source_id": "nist",
            "doc_type": "architecture",
            "guideline_name": "NIST SP 800-207 Zero Trust Architecture",
            "license": "public_domain_us_gov",
            "attribution": "NIST",
            "source": "nist.gov",
        },
    },
]

# Additional NIST/BSI/ENISA docs with <10% section rate (re-ingest for quality)
LOW_QUALITY_DOCS = [
    {
        "object_name": "compliance/bund/compliance/2026/nist_csf_2_0.pdf",
        "collection": "bp_compliance_datenschutz",
        "filename": "nist_csf_2_0.pdf",
        "extra_metadata": {
            "regulation_id": "nist_csf_2_0",
            "license": "public_domain_us",
            "source": "nist.gov",
        },
    },
    {
        "object_name": "compliance/bund/compliance/2026/nistir_8259a.pdf",
        "collection": "bp_compliance_datenschutz",
        "filename": "nistir_8259a.pdf",
        "extra_metadata": {
            "regulation_id": "nistir_8259a",
            "license": "public_domain_us",
            "source": "nist.gov",
        },
    },
    {
        "object_name": "compliance/bund/compliance/2026/nist_ai_rmf.pdf",
        "collection": "bp_compliance_datenschutz",
        "filename": "nist_ai_rmf.pdf",
        "extra_metadata": {
            "regulation_id": "nist_ai_rmf",
            "license": "public_domain_us",
            "source": "nist.gov",
        },
    },
    {
        "object_name": "compliance/bund/compliance/2026/nist_sp_800_30r1.pdf",
        "collection": "bp_compliance_ce",
        "filename": "nist_sp_800_30r1.pdf",
        "extra_metadata": {
            "regulation_id": "nist_sp_800_30r1",
            "license": "public_domain_us",
            "source": "nist.gov",
        },
    },
    {
        "object_name": "compliance/bund/compliance/2026/enisa_supply_chain_good_practices.pdf",
        "collection": "bp_compliance_ce",
        "filename": "enisa_supply_chain_good_practices.pdf",
        "extra_metadata": {
            "regulation_id": "enisa_supply_chain_good_practices",
            "license": "reuse_with_attribution",
            "source": "enisa.europa.eu",
        },
    },
    {
        "object_name": "compliance/bund/compliance/2026/enisa_ics_scada.pdf",
        "collection": "bp_compliance_ce",
        "filename": "enisa_ics_scada.pdf",
        "extra_metadata": {
            "regulation_id": "enisa_ics_scada_dependencies",
            "license": "reuse_with_attribution",
            "source": "enisa.europa.eu",
        },
    },
    {
        "object_name": "compliance/bund/compliance/2026/enisa_supply_chain_security.pdf",
        "collection": "bp_compliance_ce",
        "filename": "enisa_supply_chain_security.pdf",
        "extra_metadata": {
            "regulation_id": "enisa_threat_landscape_supply_chain",
            "license": "reuse_with_attribution",
            "source": "enisa.europa.eu",
        },
    },
    {
        "object_name": "compliance/bund/compliance/2026/cisa_secure_by_design.pdf",
        "collection": "bp_compliance_ce",
        "filename": "cisa_secure_by_design.pdf",
        "extra_metadata": {
            "regulation_id": "cisa_secure_by_design",
            "license": "public_domain_us",
            "source": "cisa.gov",
        },
    },
    {
        "object_name": "compliance/bund/compliance/2026/cvss_v4_0.pdf",
        "collection": "bp_compliance_ce",
        "filename": "cvss_v4_0.pdf",
        "extra_metadata": {
            "regulation_id": "cvss_v4_0",
            "license": "public_domain_us",
            "source": "first.org",
        },
    },
]


# -------------------------------------------------------------------
# Qdrant helpers
# -------------------------------------------------------------------
def count_chunks(qdrant_url: str, collection: str, object_name: str) -> int:
    """Count existing chunks for a document in Qdrant."""
    with httpx.Client(timeout=30.0) as c:
        resp = c.post(
            f"{qdrant_url}/collections/{collection}/points/count",
            json={
                "filter": {
                    "must": [{
                        "key": "object_name",
                        "match": {"value": object_name},
                    }]
                },
                "exact": True,
            },
        )
        resp.raise_for_status()
        return resp.json()["result"]["count"]
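
# For reference, Qdrant's /points/count endpoint responds with a body like
# {"result": {"count": <n>}, "status": "ok", "time": ...}; only
# result["count"] is consumed above.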


def get_old_document_ids(
    qdrant_url: str, collection: str, object_name: str,
) -> set:
    """Get all document_ids for existing chunks of this document."""
    doc_ids = set()
    offset = None
    with httpx.Client(timeout=60.0) as c:
        while True:
            body = {
                "filter": {
                    "must": [{
                        "key": "object_name",
                        "match": {"value": object_name},
                    }]
                },
                "limit": 100,
                "with_payload": ["document_id"],
            }
            if offset is not None:
                body["offset"] = offset
            resp = c.post(
                f"{qdrant_url}/collections/{collection}/points/scroll",
                json=body,
            )
            resp.raise_for_status()
            data = resp.json()["result"]
            for pt in data["points"]:
                did = pt.get("payload", {}).get("document_id")
                if did:
                    doc_ids.add(did)
            offset = data.get("next_page_offset")
            if offset is None:
                break
    return doc_ids


def delete_by_document_ids(
    qdrant_url: str, collection: str, doc_ids: set,
) -> None:
    """Delete chunks matching specific document_ids."""
    # One client reused across all deletes (instead of one per id).
    with httpx.Client(timeout=30.0) as c:
        for did in doc_ids:
            c.post(
                f"{qdrant_url}/collections/{collection}/points/delete",
                json={
                    "filter": {
                        "must": [{
                            "key": "document_id",
                            "match": {"value": did},
                        }]
                    }
                },
            ).raise_for_status()
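
# Deleting per document_id keeps each request small; the ids could also be
# combined into a single delete using a "should" filter, at the cost of a
# larger request body.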


def check_section_rate(
    qdrant_url: str, collection: str, object_name: str,
) -> tuple:
    """Check section rate for a document's chunks. Returns (total, with_section)."""
    total = 0
    with_section = 0
    offset = None
    with httpx.Client(timeout=60.0) as c:
        while True:
            body = {
                "filter": {
                    "must": [{
                        "key": "object_name",
                        "match": {"value": object_name},
                    }]
                },
                "limit": 100,
                "with_payload": ["section"],
            }
            if offset is not None:
                body["offset"] = offset
            resp = c.post(
                f"{qdrant_url}/collections/{collection}/points/scroll",
                json=body,
            )
            resp.raise_for_status()
            data = resp.json()["result"]
            for pt in data["points"]:
                total += 1
                sec = pt.get("payload", {}).get("section", "")
                if sec and sec.strip():
                    with_section += 1
            offset = data.get("next_page_offset")
            if offset is None:
                break
    return total, with_section
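
# "Section rate" here means the share of chunks whose payload carries a
# non-empty "section" field; a low rate indicates extraction failed to
# recover section headings, the symptom this re-ingest targets.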


# -------------------------------------------------------------------
# Upload
# -------------------------------------------------------------------
def download_from_minio(rag_url: str, object_name: str) -> bytes:
    """Download file from MinIO via RAG service presigned URL."""
    with httpx.Client(timeout=60.0, verify=False) as c:
        resp = c.get(f"{rag_url}/api/v1/documents/download/{object_name}")
        resp.raise_for_status()
        presigned_url = resp.json()["url"]

    with httpx.Client(timeout=300.0, verify=False) as c:
        resp = c.get(presigned_url)
        resp.raise_for_status()
        return resp.content
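
# NOTE: verify=False here and in upload_document assumes internal endpoints
# with self-signed TLS certificates; enable verification if the services
# present trusted certs.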


def upload_document(
    rag_url: str,
    file_bytes: bytes,
    filename: str,
    collection: str,
    extra_metadata: dict,
) -> dict:
    """Upload document to RAG service."""
    ct = content_type_from_filename(filename)
    form_data = {
        "collection": collection,
        "data_type": "compliance",
        "bundesland": "bund",
        "use_case": "compliance",
        "year": "2026",
        "chunk_strategy": CHUNK_STRATEGY,
        "chunk_size": str(CHUNK_SIZE),
        "chunk_overlap": str(CHUNK_OVERLAP),
        "metadata_json": json.dumps(extra_metadata, ensure_ascii=False),
    }
    with httpx.Client(timeout=UPLOAD_TIMEOUT, verify=False) as c:
        resp = c.post(
            f"{rag_url}/api/v1/documents/upload",
            files={"file": (filename, file_bytes, ct)},
            data=form_data,
        )
        resp.raise_for_status()
        return resp.json()
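
# The upload response is expected to contain at least
# {"document_id": "...", "chunks_count": <int>}; these are the only fields
# process_document reads below.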


# -------------------------------------------------------------------
# Main processing
# -------------------------------------------------------------------
def process_document(
    doc: dict,
    rag_url: str,
    qdrant_url: str,
    dry_run: bool = False,
) -> dict:
    """Safe re-ingest: upload first, then delete old. Returns result dict."""
    obj = doc["object_name"]
    coll = doc["collection"]
    fname = doc["filename"]

    # 1. Check existing state
    old_count = count_chunks(qdrant_url, coll, obj)
    old_doc_ids = get_old_document_ids(qdrant_url, coll, obj) if old_count > 0 else set()
    logger.info(" [%s] existing: %d chunks, %d document_ids",
                fname, old_count, len(old_doc_ids))

    if dry_run:
        logger.info(" [%s] DRY RUN — would download + upload + delete old", fname)
        return {"status": "dry_run", "old_chunks": old_count}

    # 2. Download from MinIO
    logger.info(" [%s] downloading from MinIO...", fname)
    file_bytes = download_from_minio(rag_url, obj)
    size_mb = len(file_bytes) / (1024 * 1024)
    logger.info(" [%s] downloaded %.1f MB", fname, size_mb)

    # 3. Upload FIRST (creates new chunks)
    logger.info(" [%s] uploading to RAG service...", fname)
    result = upload_document(rag_url, file_bytes, fname, coll, doc["extra_metadata"])
    new_chunks = result.get("chunks_count", 0)
    new_doc_id = result.get("document_id", "")
    logger.info(" [%s] uploaded: %d new chunks (doc_id=%s)", fname, new_chunks, new_doc_id)

    # 4. Verify new chunks exist
    if new_chunks == 0:
        logger.error(" [%s] UPLOAD PRODUCED 0 CHUNKS — keeping old data!", fname)
        return {"status": "error", "error": "0 new chunks", "old_chunks": old_count}

    # 5. Delete old chunks (only if there were any)
    if old_doc_ids:
        logger.info(" [%s] deleting %d old document_ids...", fname, len(old_doc_ids))
        delete_by_document_ids(qdrant_url, coll, old_doc_ids)
        logger.info(" [%s] old chunks deleted", fname)

    # 6. Check section rate
    total, with_sec = check_section_rate(qdrant_url, coll, obj)
    pct = (with_sec / total * 100) if total > 0 else 0
    logger.info(" [%s] section rate: %d/%d (%.0f%%)", fname, with_sec, total, pct)

    return {
        "status": "ok",
        "old_chunks": old_count,
        "new_chunks": new_chunks,
        "new_document_id": new_doc_id,
        "section_rate": round(pct, 1),
    }
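
# Ordering note: if the delete in step 5 fails after a successful upload,
# old and new chunks coexist (duplicates, recoverable by re-running); the
# reverse order would instead risk losing the document entirely.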


def main():
    parser = argparse.ArgumentParser(description="Safe NIST/BSI/ENISA re-ingestion")
    parser.add_argument("--dry-run", action="store_true", help="Show what would happen")
    parser.add_argument("--only-missing", action="store_true",
                        help="Only re-ingest the 4 missing docs (skip low-quality)")
    parser.add_argument("--rag-url", default=DEFAULT_RAG_URL)
    parser.add_argument("--qdrant-url", default=DEFAULT_QDRANT_URL)
    args = parser.parse_args()

    docs = list(MISSING_DOCS)
    if not args.only_missing:
        docs.extend(LOW_QUALITY_DOCS)

    logger.info("=" * 60)
    logger.info("NIST/BSI/ENISA Safe Re-Ingestion")
    logger.info(" Documents: %d (%d missing + %d low-quality)",
                len(docs), len(MISSING_DOCS),
                0 if args.only_missing else len(LOW_QUALITY_DOCS))
    logger.info(" RAG: %s", args.rag_url)
    logger.info(" Qdrant: %s", args.qdrant_url)
    logger.info(" Dry run: %s", args.dry_run)
    logger.info("=" * 60)

    results = {}
    ok = 0
    errors = 0

    for i, doc in enumerate(docs, 1):
        logger.info("[%d/%d] %s → %s", i, len(docs), doc["filename"], doc["collection"])
        try:
            r = process_document(doc, args.rag_url, args.qdrant_url, args.dry_run)
            results[doc["filename"]] = r
            if r["status"] == "ok":
                ok += 1
            elif r["status"] == "error":
                errors += 1
        except Exception as e:
            logger.error(" FAILED: %s", e)
            results[doc["filename"]] = {"status": "error", "error": str(e)}
            errors += 1

        if i < len(docs):
            time.sleep(2)

    # Summary
    logger.info("")
    logger.info("=" * 60)
    logger.info("RESULTS")
    logger.info("=" * 60)
    for fname, r in results.items():
        status = r["status"].upper()
        old = r.get("old_chunks", "?")
        new = r.get("new_chunks", "?")
        sec = r.get("section_rate", "?")
        logger.info(" %-40s %s old=%s new=%s sect=%.0f%%",
                    fname, status, old, new, sec if isinstance(sec, float) else 0)

    logger.info("")
    logger.info("OK: %d, Errors: %d, Total: %d", ok, errors, len(docs))

    if errors > 0:
        sys.exit(1)


if __name__ == "__main__":
    main()