fix(embedding): add NIST control IDs to _SECTION_NUMBER_RE
_SECTION_NUMBER_RE only had patterns for §/Art/Section/Kapitel/Annex but missed NIST-style identifiers (AC-1, GV.OC-01, 3.1, A01:2021). This caused a 0% section rate for all NIST/BSI/ENISA documents even though sections were correctly detected: the section number simply wasn't extracted from the header.

Also adds:

- reupload_legal_strategy.py: re-upload with legal chunking
- extract_and_upload_nist.py: local PDF extraction workaround
- qdrant-snapshot.sh: backup mechanism for Qdrant collections

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
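For reference, a minimal standalone sketch of the identifier shapes the new alternatives cover. This is not the production _SECTION_NUMBER_RE (which also carries the §/Art/Section/Kapitel/Annex branches and re.IGNORECASE); the NIST_STYLE name and sample headers are illustrative only.

# Illustrative sketch only; NIST_STYLE and the sample headers are not committed code.
import re

NIST_STYLE = re.compile(
    r'([A-Z]{2}\.[A-Z]{2}-\d{2})'       # GV.OC-01 (NIST CSF 2.0)
    r'|([A-Z]{2,4}-\d+(?:\(\d+\))?)'    # AC-1, AC-1(1) (NIST controls)
    r'|(\d+\.\d+(?:\.\d+)*)'            # 3.1, 2.3.1 (numbered sections)
    r'|(A\d{2}(?::\d{4})?)'             # A01:2021 (OWASP)
)

for header in ("AC-1 POLICY AND PROCEDURES",
               "GV.OC-01 Organizational Context",
               "3.1 ACCESS CONTROL",
               "A01:2021 Broken Access Control"):
    match = NIST_STYLE.match(header)
    print(header, "->", next(g for g in match.groups() if g))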
control-pipeline/scripts/extract_and_upload_nist.py
@@ -0,0 +1,280 @@
#!/usr/bin/env python3
"""Extract large NIST PDFs locally, then upload as .txt to RAG service.

Workaround for embedding-service container crashing on large PDFs (>5 MB).
Runs pdfplumber + normalization locally, uploads extracted text as .txt.

Usage (on Mac Mini):
    python3 control-pipeline/scripts/extract_and_upload_nist.py
"""

import json
import os
import re
import sys
import tempfile
import unicodedata

import httpx
import pdfplumber

RAG_URL = "https://localhost:8097"
QDRANT_URL = "http://localhost:6333"

DOCS = [
    {
        "object_name": "compliance/bund/compliance/2026/NIST_SP_800_53r5.pdf",
        "collection": "bp_compliance_datenschutz",
        "filename": "NIST_SP_800_53r5.txt",
        "extra_metadata": {
            "regulation_id": "nist_sp800_53r5",
            "source_id": "nist",
            "doc_type": "controls_catalog",
            "guideline_name": "NIST SP 800-53 Rev. 5 Security and Privacy Controls",
            "license": "public_domain_us_gov",
            "attribution": "NIST",
            "source": "nist.gov",
        },
    },
    {
        "object_name": "compliance/bund/compliance/2026/nist_sp_800_82r3.pdf",
        "collection": "bp_compliance_ce",
        "filename": "nist_sp_800_82r3.txt",
        "extra_metadata": {
            "regulation_id": "nist_sp_800_82r3",
            "regulation_name_de": "NIST SP 800-82 Rev. 3 — Guide to OT Security",
            "regulation_name_en": "NIST SP 800-82 Rev. 3 — Guide to OT Security",
            "regulation_short": "NIST SP 800-82",
            "category": "ot_security",
            "license": "public_domain_us",
            "source": "nist.gov",
        },
    },
    {
        "object_name": "compliance/bund/compliance/2026/nist_sp_800_160v1r1.pdf",
        "collection": "bp_compliance_ce",
        "filename": "nist_sp_800_160v1r1.txt",
        "extra_metadata": {
            "regulation_id": "nist_sp_800_160v1r1",
            "regulation_name_de": "NIST SP 800-160 Vol. 1 Rev. 1",
            "regulation_name_en": "NIST SP 800-160 Vol. 1 Rev. 1",
            "regulation_short": "NIST SP 800-160",
            "category": "security_engineering",
            "license": "public_domain_us",
            "source": "nist.gov",
        },
    },
    {
        "object_name": "compliance/bund/compliance/2026/NIST_SP_800_207.pdf",
        "collection": "bp_compliance_datenschutz",
        "filename": "NIST_SP_800_207.txt",
        "extra_metadata": {
            "regulation_id": "nist_sp800_207",
            "source_id": "nist",
            "doc_type": "architecture",
            "guideline_name": "NIST SP 800-207 Zero Trust Architecture",
            "license": "public_domain_us_gov",
            "attribution": "NIST",
            "source": "nist.gov",
        },
    },
]


def normalize_pdf_text(text: str) -> str:
    """Fix broken spacing from multi-column PDF extraction."""
    text = unicodedata.normalize('NFKC', text)
    text = text.replace('\u00ad', '').replace('\u200b', '')
    prev = None
    while prev != text:
        prev = text
        text = re.sub(r'(\d+)\s+\.\s+(\d+)', r'\1.\2', text)
        text = re.sub(r'\b([A-Z]{2,4})\s+-\s+(\d+)\b', r'\1-\2', text)
        text = re.sub(
            r'\b([A-Z]{2})\s*\.\s*([A-Z]{2})\s*-\s*(\d{2})\b', r'\1.\2-\3', text
        )
        text = re.sub(r'\(\s+(\d+)\s+\)', r'(\1)', text)
        text = re.sub(r'[^\S\n]{2,}', ' ', text)
    return text


def extract_pdf_locally(pdf_bytes: bytes) -> str:
    """Extract text from PDF using pdfplumber with normalization."""
    import io
    text_parts = []
    with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
        print(f" Pages: {len(pdf.pages)}")
        for i, page in enumerate(pdf.pages):
            text = page.extract_text(x_tolerance=3, y_tolerance=4)
            if text:
                text_parts.append(text)
            if (i + 1) % 50 == 0:
                print(f" Extracted {i + 1}/{len(pdf.pages)} pages...")
    raw = "\n\n".join(text_parts)
    return normalize_pdf_text(raw)


def download_from_minio(object_name: str) -> bytes:
    """Download file from MinIO via RAG service."""
    with httpx.Client(timeout=60.0, verify=False) as c:
        resp = c.get(f"{RAG_URL}/api/v1/documents/download/{object_name}")
        resp.raise_for_status()
        url = resp.json()["url"]
    with httpx.Client(timeout=300.0, verify=False) as c:
        resp = c.get(url)
        resp.raise_for_status()
        return resp.content


def upload_text(
    text: str, filename: str, collection: str, extra_metadata: dict,
) -> dict:
    """Upload extracted text to RAG service as .txt."""
    form_data = {
        "collection": collection,
        "data_type": "compliance",
        "bundesland": "bund",
        "use_case": "compliance",
        "year": "2026",
        "chunk_strategy": "recursive",
        "chunk_size": "1500",
        "chunk_overlap": "100",
        "metadata_json": json.dumps(extra_metadata, ensure_ascii=False),
    }
    text_bytes = text.encode("utf-8")
    with httpx.Client(timeout=1800.0, verify=False) as c:
        resp = c.post(
            f"{RAG_URL}/api/v1/documents/upload",
            files={"file": (filename, text_bytes, "text/plain")},
            data=form_data,
        )
        resp.raise_for_status()
        return resp.json()


def count_chunks(collection: str, regulation_id: str) -> int:
    """Count chunks for a regulation in Qdrant."""
    with httpx.Client(timeout=30.0) as c:
        resp = c.post(
            f"{QDRANT_URL}/collections/{collection}/points/count",
            json={
                "filter": {
                    "must": [{
                        "key": "regulation_id",
                        "match": {"value": regulation_id},
                    }]
                },
                "exact": True,
            },
        )
        resp.raise_for_status()
        return resp.json()["result"]["count"]


def check_section_rate(collection: str, regulation_id: str) -> tuple:
    """Returns (total_chunks, chunks_with_section)."""
    total = 0
    with_sec = 0
    offset = None
    with httpx.Client(timeout=60.0) as c:
        while True:
            body = {
                "filter": {
                    "must": [{
                        "key": "regulation_id",
                        "match": {"value": regulation_id},
                    }]
                },
                "limit": 100,
                "with_payload": ["section"],
            }
            if offset is not None:
                body["offset"] = offset
            resp = c.post(
                f"{QDRANT_URL}/collections/{collection}/points/scroll",
                json=body,
            )
            resp.raise_for_status()
            data = resp.json()["result"]
            for pt in data["points"]:
                total += 1
                s = pt.get("payload", {}).get("section", "")
                if s and s.strip():
                    with_sec += 1
            offset = data.get("next_page_offset")
            if offset is None:
                break
    return total, with_sec


def main():
    print("=" * 60)
    print("NIST PDF Local Extraction + Upload")
    print("=" * 60)

    results = []
    for i, doc in enumerate(DOCS, 1):
        reg_id = doc["extra_metadata"]["regulation_id"]
        print(f"\n[{i}/{len(DOCS)}] {doc['filename']} → {doc['collection']}")

        # 1. Check current state
        existing = count_chunks(doc["collection"], reg_id)
        print(f" Existing chunks: {existing}")

        # 2. Download PDF from MinIO
        print(f" Downloading from MinIO...")
        pdf_bytes = download_from_minio(doc["object_name"])
        print(f" Downloaded {len(pdf_bytes) / 1024 / 1024:.1f} MB")

        # 3. Extract text locally with pdfplumber
        print(f" Extracting text locally...")
        text = extract_pdf_locally(pdf_bytes)
        print(f" Extracted {len(text):,} chars, {text.count(chr(10)):,} lines")

        # 4. Save extracted text temporarily (for debugging)
        tmp_path = f"/tmp/nist_{reg_id}.txt"
        with open(tmp_path, "w", encoding="utf-8") as f:
            f.write(text)
        print(f" Saved to {tmp_path}")

        # 5. Upload as .txt
        print(f" Uploading as .txt to RAG service...")
        result = upload_text(text, doc["filename"], doc["collection"],
                             doc["extra_metadata"])
        new_chunks = result.get("chunks_count", 0)
        new_doc_id = result.get("document_id", "")
        print(f" Uploaded: {new_chunks} chunks (doc_id={new_doc_id})")

        # 6. Check section rate
        if new_chunks > 0:
            total, with_sec = check_section_rate(doc["collection"], reg_id)
            pct = (with_sec / total * 100) if total > 0 else 0
            print(f" Section rate: {with_sec}/{total} ({pct:.0f}%)")
        else:
            pct = 0
            print(" WARNING: 0 chunks created!")

        results.append({
            "file": doc["filename"],
            "old": existing,
            "new": new_chunks,
            "section_rate": round(pct, 1),
        })

    # Summary
    print("\n" + "=" * 60)
    print("RESULTS")
    print("=" * 60)
    for r in results:
        print(f" {r['file']:<40} old={r['old']} new={r['new']} sect={r['section_rate']}%")

    total_new = sum(r["new"] for r in results)
    print(f"\nTotal new chunks: {total_new}")

    if any(r["new"] == 0 for r in results):
        print("\nWARNING: Some documents produced 0 chunks!")
        sys.exit(1)


if __name__ == "__main__":
    main()
control-pipeline/scripts/reupload_legal_strategy.py
@@ -0,0 +1,431 @@
#!/usr/bin/env python3
"""Re-upload NIST/BSI/ENISA docs with chunk_strategy='legal' for section metadata.

The docs were already uploaded with 'recursive' strategy (no section detection).
This script re-uploads with 'legal' strategy, then deletes old recursive chunks.

Usage (on Mac Mini):
    python3 control-pipeline/scripts/reupload_legal_strategy.py
    python3 control-pipeline/scripts/reupload_legal_strategy.py --dry-run
"""

import argparse
import io
import json
import re
import sys
import time
import unicodedata

import httpx
import pdfplumber

RAG_URL = "https://localhost:8097"
QDRANT_URL = "http://localhost:6333"
UPLOAD_TIMEOUT = 1800.0

# ---- Documents to process ----

DOCS = [
    # 4 NIST docs already extracted at /tmp/nist_*.txt
    {
        "regulation_id": "nist_sp800_53r5",
        "collection": "bp_compliance_datenschutz",
        "upload_filename": "NIST_SP_800_53r5.txt",
        "local_txt": "/tmp/nist_nist_sp800_53r5.txt",
        "minio_pdf": None,  # already extracted
        "extra_metadata": {
            "regulation_id": "nist_sp800_53r5",
            "source_id": "nist",
            "doc_type": "controls_catalog",
            "guideline_name": "NIST SP 800-53 Rev. 5",
            "license": "public_domain_us_gov",
            "source": "nist.gov",
        },
    },
    {
        "regulation_id": "nist_sp_800_82r3",
        "collection": "bp_compliance_ce",
        "upload_filename": "nist_sp_800_82r3.txt",
        "local_txt": "/tmp/nist_nist_sp_800_82r3.txt",
        "minio_pdf": None,
        "extra_metadata": {
            "regulation_id": "nist_sp_800_82r3",
            "regulation_short": "NIST SP 800-82",
            "license": "public_domain_us",
            "source": "nist.gov",
        },
    },
    {
        "regulation_id": "nist_sp_800_160v1r1",
        "collection": "bp_compliance_ce",
        "upload_filename": "nist_sp_800_160v1r1.txt",
        "local_txt": "/tmp/nist_160.txt",
        "minio_pdf": None,
        "extra_metadata": {
            "regulation_id": "nist_sp_800_160v1r1",
            "regulation_short": "NIST SP 800-160",
            "license": "public_domain_us",
            "source": "nist.gov",
        },
    },
    {
        "regulation_id": "nist_sp800_207",
        "collection": "bp_compliance_datenschutz",
        "upload_filename": "NIST_SP_800_207.txt",
        "local_txt": None,  # needs extraction
        "minio_pdf": "compliance/bund/compliance/2026/NIST_SP_800_207.pdf",
        "extra_metadata": {
            "regulation_id": "nist_sp800_207",
            "source_id": "nist",
            "doc_type": "architecture",
            "guideline_name": "NIST SP 800-207 Zero Trust Architecture",
            "license": "public_domain_us_gov",
            "source": "nist.gov",
        },
    },
    # Additional low-quality docs (need extraction from MinIO)
    {
        "regulation_id": "nist_csf_2_0",
        "collection": "bp_compliance_datenschutz",
        "upload_filename": "nist_csf_2_0.txt",
        "local_txt": None,
        "minio_pdf": "compliance/bund/compliance/2026/nist_csf_2_0.pdf",
        "extra_metadata": {
            "regulation_id": "nist_csf_2_0",
            "license": "public_domain_us",
            "source": "nist.gov",
        },
    },
    {
        "regulation_id": "nistir_8259a",
        "collection": "bp_compliance_datenschutz",
        "upload_filename": "nistir_8259a.txt",
        "local_txt": None,
        "minio_pdf": "compliance/bund/compliance/2026/nistir_8259a.pdf",
        "extra_metadata": {
            "regulation_id": "nistir_8259a",
            "license": "public_domain_us",
            "source": "nist.gov",
        },
    },
    {
        "regulation_id": "nist_ai_rmf",
        "collection": "bp_compliance_datenschutz",
        "upload_filename": "nist_ai_rmf.txt",
        "local_txt": None,
        "minio_pdf": "compliance/bund/compliance/2026/nist_ai_rmf.pdf",
        "extra_metadata": {
            "regulation_id": "nist_ai_rmf",
            "license": "public_domain_us",
            "source": "nist.gov",
        },
    },
    {
        "regulation_id": "nist_sp_800_30r1",
        "collection": "bp_compliance_ce",
        "upload_filename": "nist_sp_800_30r1.txt",
        "local_txt": None,
        "minio_pdf": "compliance/bund/compliance/2026/nist_sp_800_30r1.pdf",
        "extra_metadata": {
            "regulation_id": "nist_sp_800_30r1",
            "license": "public_domain_us",
            "source": "nist.gov",
        },
    },
    {
        "regulation_id": "cisa_secure_by_design",
        "collection": "bp_compliance_ce",
        "upload_filename": "cisa_secure_by_design.txt",
        "local_txt": None,
        "minio_pdf": "compliance/bund/compliance/2026/cisa_secure_by_design.pdf",
        "extra_metadata": {
            "regulation_id": "cisa_secure_by_design",
            "license": "public_domain_us",
            "source": "cisa.gov",
        },
    },
    {
        "regulation_id": "cvss_v4_0",
        "collection": "bp_compliance_ce",
        "upload_filename": "cvss_v4_0.txt",
        "local_txt": None,
        "minio_pdf": "compliance/bund/compliance/2026/cvss_v4_0.pdf",
        "extra_metadata": {
            "regulation_id": "cvss_v4_0",
            "license": "public_domain_us",
            "source": "first.org",
        },
    },
    {
        "regulation_id": "enisa_ics_scada_dependencies",
        "collection": "bp_compliance_ce",
        "upload_filename": "enisa_ics_scada.txt",
        "local_txt": None,
        "minio_pdf": "compliance/bund/compliance/2026/enisa_ics_scada.pdf",
        "extra_metadata": {
            "regulation_id": "enisa_ics_scada_dependencies",
            "license": "reuse_with_attribution",
            "source": "enisa.europa.eu",
        },
    },
    {
        "regulation_id": "enisa_threat_landscape_supply_chain",
        "collection": "bp_compliance_ce",
        "upload_filename": "enisa_supply_chain_security.txt",
        "local_txt": None,
        "minio_pdf": "compliance/bund/compliance/2026/enisa_supply_chain_security.pdf",
        "extra_metadata": {
            "regulation_id": "enisa_threat_landscape_supply_chain",
            "license": "reuse_with_attribution",
            "source": "enisa.europa.eu",
        },
    },
    {
        "regulation_id": "enisa_supply_chain_good_practices",
        "collection": "bp_compliance_ce",
        "upload_filename": "enisa_supply_chain_good_practices.txt",
        "local_txt": None,
        "minio_pdf": "compliance/bund/compliance/2026/enisa_supply_chain_good_practices.pdf",
        "extra_metadata": {
            "regulation_id": "enisa_supply_chain_good_practices",
            "license": "reuse_with_attribution",
            "source": "enisa.europa.eu",
        },
    },
]


def normalize_pdf_text(text):
    text = unicodedata.normalize('NFKC', text)
    text = text.replace('\u00ad', '').replace('\u200b', '')
    prev = None
    while prev != text:
        prev = text
        text = re.sub(r'(\d+)\s+\.\s+(\d+)', r'\1.\2', text)
        text = re.sub(r'\b([A-Z]{2,4})\s+-\s+(\d+)\b', r'\1-\2', text)
        text = re.sub(
            r'\b([A-Z]{2})\s*\.\s*([A-Z]{2})\s*-\s*(\d{2})\b', r'\1.\2-\3', text
        )
        text = re.sub(r'\(\s+(\d+)\s+\)', r'(\1)', text)
        text = re.sub(r'[^\S\n]{2,}', ' ', text)
    return text


def get_text(doc):
    """Get document text: from local file or extract from MinIO PDF."""
    if doc["local_txt"]:
        print(f" Reading local: {doc['local_txt']}")
        with open(doc["local_txt"], encoding="utf-8") as f:
            return f.read()

    print(f" Downloading from MinIO: {doc['minio_pdf']}")
    with httpx.Client(timeout=60, verify=False) as c:
        resp = c.get(f"{RAG_URL}/api/v1/documents/download/{doc['minio_pdf']}")
        resp.raise_for_status()
        url = resp.json()["url"]
    with httpx.Client(timeout=300, verify=False) as c:
        pdf_bytes = c.get(url).content
    print(f" Downloaded {len(pdf_bytes) / 1024 / 1024:.1f} MB")

    print(" Extracting with pdfplumber...")
    parts = []
    with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
        for i, page in enumerate(pdf.pages):
            t = page.extract_text(x_tolerance=3, y_tolerance=4)
            if t:
                parts.append(t)
            if (i + 1) % 50 == 0:
                print(f" {i + 1}/{len(pdf.pages)} pages...")
    text = "\n\n".join(parts)
    text = normalize_pdf_text(text)
    print(f" Extracted {len(text):,} chars")
    return text


def get_old_doc_ids(collection, regulation_id):
    """Get all document_ids for existing chunks."""
    doc_ids = set()
    offset = None
    with httpx.Client(timeout=60) as c:
        while True:
            body = {
                "filter": {"must": [
                    {"key": "regulation_id", "match": {"value": regulation_id}}
                ]},
                "limit": 100,
                "with_payload": ["document_id"],
            }
            if offset is not None:
                body["offset"] = offset
            resp = c.post(
                f"{QDRANT_URL}/collections/{collection}/points/scroll",
                json=body,
            )
            resp.raise_for_status()
            data = resp.json()["result"]
            for pt in data["points"]:
                did = pt.get("payload", {}).get("document_id")
                if did:
                    doc_ids.add(did)
            offset = data.get("next_page_offset")
            if offset is None:
                break
    return doc_ids


def upload_text_legal(text, filename, collection, extra_metadata):
    """Upload with chunk_strategy='legal'."""
    form_data = {
        "collection": collection,
        "data_type": "compliance",
        "bundesland": "bund",
        "use_case": "compliance",
        "year": "2026",
        "chunk_strategy": "legal",
        "chunk_size": "1500",
        "chunk_overlap": "100",
        "metadata_json": json.dumps(extra_metadata, ensure_ascii=False),
    }
    with httpx.Client(timeout=UPLOAD_TIMEOUT, verify=False) as c:
        resp = c.post(
            f"{RAG_URL}/api/v1/documents/upload",
            files={"file": (filename, text.encode("utf-8"), "text/plain")},
            data=form_data,
        )
        resp.raise_for_status()
        return resp.json()


def delete_by_doc_ids(collection, doc_ids):
    """Delete chunks matching specific document_ids."""
    with httpx.Client(timeout=30) as c:
        for did in doc_ids:
            c.post(
                f"{QDRANT_URL}/collections/{collection}/points/delete",
                json={"filter": {"must": [
                    {"key": "document_id", "match": {"value": did}}
                ]}},
            ).raise_for_status()


def count_chunks(collection, regulation_id):
    with httpx.Client(timeout=30) as c:
        resp = c.post(
            f"{QDRANT_URL}/collections/{collection}/points/count",
            json={"filter": {"must": [
                {"key": "regulation_id", "match": {"value": regulation_id}}
            ]}, "exact": True},
        )
        resp.raise_for_status()
        return resp.json()["result"]["count"]


def check_section_rate(collection, regulation_id):
    total = 0
    with_sec = 0
    offset = None
    with httpx.Client(timeout=60) as c:
        while True:
            body = {
                "filter": {"must": [
                    {"key": "regulation_id", "match": {"value": regulation_id}}
                ]},
                "limit": 100,
                "with_payload": ["section"],
            }
            if offset is not None:
                body["offset"] = offset
            resp = c.post(
                f"{QDRANT_URL}/collections/{collection}/points/scroll",
                json=body,
            )
            resp.raise_for_status()
            data = resp.json()["result"]
            for pt in data["points"]:
                total += 1
                s = pt.get("payload", {}).get("section", "")
                if s and s.strip():
                    with_sec += 1
            offset = data.get("next_page_offset")
            if offset is None:
                break
    return total, with_sec


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    print("=" * 60)
    print("Re-upload with chunk_strategy='legal'")
    print(f"Documents: {len(DOCS)}, Dry run: {args.dry_run}")
    print("=" * 60)

    results = []
    for i, doc in enumerate(DOCS, 1):
        reg_id = doc["regulation_id"]
        coll = doc["collection"]
        print(f"\n[{i}/{len(DOCS)}] {doc['upload_filename']} → {coll}")

        # 1. Check existing
        old_count = count_chunks(coll, reg_id)
        old_doc_ids = get_old_doc_ids(coll, reg_id) if old_count > 0 else set()
        print(f" Old: {old_count} chunks, {len(old_doc_ids)} doc_ids")

        if args.dry_run:
            print(" DRY RUN — skipping")
            results.append({"file": doc["upload_filename"], "old": old_count,
                            "new": "?", "sect": "?"})
            continue

        # 2. Get text
        text = get_text(doc)

        # 3. Upload with legal strategy
        print(" Uploading with strategy='legal'...")
        result = upload_text_legal(
            text, doc["upload_filename"], coll, doc["extra_metadata"])
        new_chunks = result.get("chunks_count", 0)
        new_doc_id = result.get("document_id", "")
        print(f" New: {new_chunks} chunks (doc_id={new_doc_id})")

        if new_chunks == 0:
            print(" ERROR: 0 chunks — keeping old!")
            results.append({"file": doc["upload_filename"], "old": old_count,
                            "new": 0, "sect": 0})
            continue

        # 4. Delete old chunks (safe: new ones already exist)
        if old_doc_ids:
            # Exclude the new document_id just in case
            old_doc_ids.discard(new_doc_id)
            if old_doc_ids:
                print(f" Deleting {len(old_doc_ids)} old doc_ids...")
                delete_by_doc_ids(coll, old_doc_ids)

        # 5. Check section rate
        total, with_sec = check_section_rate(coll, reg_id)
        pct = (with_sec / total * 100) if total > 0 else 0
        print(f" Section rate: {with_sec}/{total} ({pct:.0f}%)")

        results.append({"file": doc["upload_filename"], "old": old_count,
                        "new": new_chunks, "sect": round(pct, 1)})

        if i < len(DOCS):
            time.sleep(2)

    # Summary
    print("\n" + "=" * 60)
    print("RESULTS")
    print("=" * 60)
    for r in results:
        print(f" {r['file']:<45} old={r['old']:<6} new={r['new']:<6} sect={r['sect']}%")

    total_new = sum(r["new"] for r in results if isinstance(r["new"], int))
    print(f"\nTotal new chunks: {total_new}")


if __name__ == "__main__":
    main()
@@ -389,6 +389,11 @@ _SECTION_NUMBER_RE = re.compile(
    r'|Kapitel\s+(\d+)'                  # Kapitel 2
    r'|Anhang\s+([IVXLC\d]+)'            # Anhang III
    r'|Annex\s+([IVXLC\d]+)'             # Annex XII
    # NIST/ENISA/standard identifiers
    r'|([A-Z]{2}\.[A-Z]{2}-\d{2})'       # GV.OC-01 (NIST CSF 2.0)
    r'|([A-Z]{2,4}-\d+(?:\(\d+\))?)'     # AC-1, AC-1(1) (NIST controls)
    r'|(\d+\.\d+(?:\.\d+)*)'             # 3.1, 2.3.1 (numbered sections)
    r'|(A\d{2}(?::\d{4})?)'              # A01:2021 (OWASP)
    r')',
    re.IGNORECASE
)

@@ -13,7 +13,9 @@ Covers:
from main import (
    _normalize_pdf_text,
    _extract_section_header,
    _parse_section_metadata,
    chunk_text_legal,
    chunk_text_legal_structured,
)


@@ -124,6 +126,66 @@ class TestNistSectionDetection:
        assert _extract_section_header(normalized) is not None


# =========================================================================
# Section metadata extraction (_parse_section_metadata)
# =========================================================================

class TestNistSectionMetadata:

    def test_nist_control_ac1_section(self):
        meta = _parse_section_metadata("AC-1 POLICY AND PROCEDURES")
        assert meta["section"] == "AC-1"

    def test_nist_control_au2_section(self):
        meta = _parse_section_metadata("AU-2 Audit Events")
        assert meta["section"] == "AU-2"

    def test_nist_enhancement_section(self):
        meta = _parse_section_metadata("AC-1(1) Policy and Procedures")
        assert meta["section"] == "AC-1(1)"

    def test_nist_csf_compound_section(self):
        meta = _parse_section_metadata("GV.OC-01 Organizational Context")
        assert meta["section"] == "GV.OC-01"

    def test_numbered_section(self):
        meta = _parse_section_metadata("3.1 ACCESS CONTROL")
        assert meta["section"] == "3.1"

    def test_deep_numbered_section(self):
        meta = _parse_section_metadata("2.3.1 Subtitle")
        assert meta["section"] == "2.3.1"

    def test_owasp_section(self):
        meta = _parse_section_metadata("A01:2021 Broken Access Control")
        assert meta["section"] == "A01:2021"

    def test_section_title_extracted(self):
        meta = _parse_section_metadata("AC-1 POLICY AND PROCEDURES")
        assert meta["section_title"] == "POLICY AND PROCEDURES"

    def test_numbered_section_title(self):
        meta = _parse_section_metadata("3.1 ACCESS CONTROL")
        assert meta["section_title"] == "ACCESS CONTROL"

    def test_structured_chunks_have_section(self):
        text = (
            "3.1 ACCESS CONTROL\n"
            "Overview of access control family.\n\n"
            "AC-1 POLICY AND PROCEDURES\n"
            "The organization develops, documents, and disseminates an access "
            "control policy that addresses purpose, scope, roles, responsibilities, "
            "management commitment, coordination among entities.\n\n"
            "AC-2 ACCOUNT MANAGEMENT\n"
            "The information system enforces approved authorizations for logical "
            "access to information and system resources.\n"
        )
        result = chunk_text_legal_structured(text, chunk_size=300, overlap=50)
        sections = [r.get("section", "") for r in result]
        assert any(s == "AC-1" for s in sections)
        assert any(s == "AC-2" for s in sections)


# =========================================================================
# Chunking with NIST-style text
# =========================================================================

scripts/qdrant-snapshot.sh (executable)
@@ -0,0 +1,65 @@
#!/bin/bash
# Qdrant snapshot: creates snapshots of all collections.
#
# Usage:
#   bash scripts/qdrant-snapshot.sh                    # Create snapshots
#   bash scripts/qdrant-snapshot.sh --list             # List existing snapshots
#   bash scripts/qdrant-snapshot.sh --restore <file>   # Restore (interactive)
#
# Snapshots are stored in the Qdrant volume under /qdrant/storage/snapshots/.
# They are additionally copied to ./backups/qdrant/.

set -euo pipefail

QDRANT_URL="${QDRANT_URL:-http://localhost:6333}"
BACKUP_DIR="${BACKUP_DIR:-./backups/qdrant}"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)

# --- List existing snapshots ---
if [[ "${1:-}" == "--list" ]]; then
  echo "=== Qdrant Snapshots ==="
  for coll in $(curl -sf "$QDRANT_URL/collections" | python3 -c "import sys,json; [print(c['name']) for c in json.load(sys.stdin)['result']['collections']]"); do
    echo ""
    echo "Collection: $coll"
    curl -sf "$QDRANT_URL/collections/$coll/snapshots" | python3 -c "
import sys, json
snaps = json.load(sys.stdin).get('result', [])
if not snaps:
    print(' (no snapshots)')
else:
    for s in snaps:
        print(f\" {s['name']} size={s.get('size',0)/(1024*1024):.1f}MB\")
"
  done
  exit 0
fi

# --- Create snapshots ---
echo "=== Creating Qdrant Snapshots ($TIMESTAMP) ==="
mkdir -p "$BACKUP_DIR"

COLLECTIONS=$(curl -sf "$QDRANT_URL/collections" | python3 -c "import sys,json; [print(c['name']) for c in json.load(sys.stdin)['result']['collections']]")

for coll in $COLLECTIONS; do
  echo ""
  echo "[$coll] Creating snapshot..."

  SNAP=$(curl -sf -X POST "$QDRANT_URL/collections/$coll/snapshots" | python3 -c "import sys,json; print(json.load(sys.stdin)['result']['name'])")

  if [[ -z "$SNAP" ]]; then
    echo "[$coll] ERROR: snapshot creation failed"
    continue
  fi

  echo "[$coll] Snapshot: $SNAP"

  # Download snapshot to backup dir
  OUTFILE="$BACKUP_DIR/${coll}_${TIMESTAMP}.snapshot"
  curl -sf "$QDRANT_URL/collections/$coll/snapshots/$SNAP" -o "$OUTFILE"
  SIZE=$(du -h "$OUTFILE" | cut -f1)
  echo "[$coll] Saved: $OUTFILE ($SIZE)"
done

echo ""
echo "=== Done ==="
ls -lh "$BACKUP_DIR"/*_${TIMESTAMP}.snapshot 2>/dev/null || echo "No snapshots created"