fix(embedding): add NIST control IDs to _SECTION_NUMBER_RE
_SECTION_NUMBER_RE only had patterns for §/Art/Section/Kapitel/Annex but missed NIST-style identifiers (AC-1, GV.OC-01, 3.1, A01:2021). This caused 0% section rate for all NIST/BSI/ENISA documents even though sections were correctly detected — the section NUMBER wasn't extracted from the header. Also adds: - reupload_legal_strategy.py: re-upload with legal chunking - extract_and_upload_nist.py: local PDF extraction workaround - qdrant-snapshot.sh: backup mechanism for Qdrant collections Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,280 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Extract large NIST PDFs locally, then upload as .txt to RAG service.
|
||||||
|
|
||||||
|
Workaround for embedding-service container crashing on large PDFs (>5 MB).
|
||||||
|
Runs pdfplumber + normalization locally, uploads extracted text as .txt.
|
||||||
|
|
||||||
|
Usage (on Mac Mini):
|
||||||
|
python3 control-pipeline/scripts/extract_and_upload_nist.py
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
import tempfile
|
||||||
|
import unicodedata
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
import pdfplumber
|
||||||
|
|
||||||
|
RAG_URL = "https://localhost:8097"
|
||||||
|
QDRANT_URL = "http://localhost:6333"
|
||||||
|
|
||||||
|
DOCS = [
|
||||||
|
{
|
||||||
|
"object_name": "compliance/bund/compliance/2026/NIST_SP_800_53r5.pdf",
|
||||||
|
"collection": "bp_compliance_datenschutz",
|
||||||
|
"filename": "NIST_SP_800_53r5.txt",
|
||||||
|
"extra_metadata": {
|
||||||
|
"regulation_id": "nist_sp800_53r5",
|
||||||
|
"source_id": "nist",
|
||||||
|
"doc_type": "controls_catalog",
|
||||||
|
"guideline_name": "NIST SP 800-53 Rev. 5 Security and Privacy Controls",
|
||||||
|
"license": "public_domain_us_gov",
|
||||||
|
"attribution": "NIST",
|
||||||
|
"source": "nist.gov",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"object_name": "compliance/bund/compliance/2026/nist_sp_800_82r3.pdf",
|
||||||
|
"collection": "bp_compliance_ce",
|
||||||
|
"filename": "nist_sp_800_82r3.txt",
|
||||||
|
"extra_metadata": {
|
||||||
|
"regulation_id": "nist_sp_800_82r3",
|
||||||
|
"regulation_name_de": "NIST SP 800-82 Rev. 3 — Guide to OT Security",
|
||||||
|
"regulation_name_en": "NIST SP 800-82 Rev. 3 — Guide to OT Security",
|
||||||
|
"regulation_short": "NIST SP 800-82",
|
||||||
|
"category": "ot_security",
|
||||||
|
"license": "public_domain_us",
|
||||||
|
"source": "nist.gov",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"object_name": "compliance/bund/compliance/2026/nist_sp_800_160v1r1.pdf",
|
||||||
|
"collection": "bp_compliance_ce",
|
||||||
|
"filename": "nist_sp_800_160v1r1.txt",
|
||||||
|
"extra_metadata": {
|
||||||
|
"regulation_id": "nist_sp_800_160v1r1",
|
||||||
|
"regulation_name_de": "NIST SP 800-160 Vol. 1 Rev. 1",
|
||||||
|
"regulation_name_en": "NIST SP 800-160 Vol. 1 Rev. 1",
|
||||||
|
"regulation_short": "NIST SP 800-160",
|
||||||
|
"category": "security_engineering",
|
||||||
|
"license": "public_domain_us",
|
||||||
|
"source": "nist.gov",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"object_name": "compliance/bund/compliance/2026/NIST_SP_800_207.pdf",
|
||||||
|
"collection": "bp_compliance_datenschutz",
|
||||||
|
"filename": "NIST_SP_800_207.txt",
|
||||||
|
"extra_metadata": {
|
||||||
|
"regulation_id": "nist_sp800_207",
|
||||||
|
"source_id": "nist",
|
||||||
|
"doc_type": "architecture",
|
||||||
|
"guideline_name": "NIST SP 800-207 Zero Trust Architecture",
|
||||||
|
"license": "public_domain_us_gov",
|
||||||
|
"attribution": "NIST",
|
||||||
|
"source": "nist.gov",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_pdf_text(text: str) -> str:
|
||||||
|
"""Fix broken spacing from multi-column PDF extraction."""
|
||||||
|
text = unicodedata.normalize('NFKC', text)
|
||||||
|
text = text.replace('\u00ad', '').replace('\u200b', '')
|
||||||
|
prev = None
|
||||||
|
while prev != text:
|
||||||
|
prev = text
|
||||||
|
text = re.sub(r'(\d+)\s+\.\s+(\d+)', r'\1.\2', text)
|
||||||
|
text = re.sub(r'\b([A-Z]{2,4})\s+-\s+(\d+)\b', r'\1-\2', text)
|
||||||
|
text = re.sub(
|
||||||
|
r'\b([A-Z]{2})\s*\.\s*([A-Z]{2})\s*-\s*(\d{2})\b', r'\1.\2-\3', text
|
||||||
|
)
|
||||||
|
text = re.sub(r'\(\s+(\d+)\s+\)', r'(\1)', text)
|
||||||
|
text = re.sub(r'[^\S\n]{2,}', ' ', text)
|
||||||
|
return text
|
||||||
|
|
||||||
|
|
||||||
|
def extract_pdf_locally(pdf_bytes: bytes) -> str:
|
||||||
|
"""Extract text from PDF using pdfplumber with normalization."""
|
||||||
|
import io
|
||||||
|
text_parts = []
|
||||||
|
with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
|
||||||
|
print(f" Pages: {len(pdf.pages)}")
|
||||||
|
for i, page in enumerate(pdf.pages):
|
||||||
|
text = page.extract_text(x_tolerance=3, y_tolerance=4)
|
||||||
|
if text:
|
||||||
|
text_parts.append(text)
|
||||||
|
if (i + 1) % 50 == 0:
|
||||||
|
print(f" Extracted {i + 1}/{len(pdf.pages)} pages...")
|
||||||
|
raw = "\n\n".join(text_parts)
|
||||||
|
return normalize_pdf_text(raw)
|
||||||
|
|
||||||
|
|
||||||
|
def download_from_minio(object_name: str) -> bytes:
|
||||||
|
"""Download file from MinIO via RAG service."""
|
||||||
|
with httpx.Client(timeout=60.0, verify=False) as c:
|
||||||
|
resp = c.get(f"{RAG_URL}/api/v1/documents/download/{object_name}")
|
||||||
|
resp.raise_for_status()
|
||||||
|
url = resp.json()["url"]
|
||||||
|
with httpx.Client(timeout=300.0, verify=False) as c:
|
||||||
|
resp = c.get(url)
|
||||||
|
resp.raise_for_status()
|
||||||
|
return resp.content
|
||||||
|
|
||||||
|
|
||||||
|
def upload_text(
|
||||||
|
text: str, filename: str, collection: str, extra_metadata: dict,
|
||||||
|
) -> dict:
|
||||||
|
"""Upload extracted text to RAG service as .txt."""
|
||||||
|
form_data = {
|
||||||
|
"collection": collection,
|
||||||
|
"data_type": "compliance",
|
||||||
|
"bundesland": "bund",
|
||||||
|
"use_case": "compliance",
|
||||||
|
"year": "2026",
|
||||||
|
"chunk_strategy": "recursive",
|
||||||
|
"chunk_size": "1500",
|
||||||
|
"chunk_overlap": "100",
|
||||||
|
"metadata_json": json.dumps(extra_metadata, ensure_ascii=False),
|
||||||
|
}
|
||||||
|
text_bytes = text.encode("utf-8")
|
||||||
|
with httpx.Client(timeout=1800.0, verify=False) as c:
|
||||||
|
resp = c.post(
|
||||||
|
f"{RAG_URL}/api/v1/documents/upload",
|
||||||
|
files={"file": (filename, text_bytes, "text/plain")},
|
||||||
|
data=form_data,
|
||||||
|
)
|
||||||
|
resp.raise_for_status()
|
||||||
|
return resp.json()
|
||||||
|
|
||||||
|
|
||||||
|
def count_chunks(collection: str, regulation_id: str) -> int:
|
||||||
|
"""Count chunks for a regulation in Qdrant."""
|
||||||
|
with httpx.Client(timeout=30.0) as c:
|
||||||
|
resp = c.post(
|
||||||
|
f"{QDRANT_URL}/collections/{collection}/points/count",
|
||||||
|
json={
|
||||||
|
"filter": {
|
||||||
|
"must": [{
|
||||||
|
"key": "regulation_id",
|
||||||
|
"match": {"value": regulation_id},
|
||||||
|
}]
|
||||||
|
},
|
||||||
|
"exact": True,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
resp.raise_for_status()
|
||||||
|
return resp.json()["result"]["count"]
|
||||||
|
|
||||||
|
|
||||||
|
def check_section_rate(collection: str, regulation_id: str) -> tuple:
|
||||||
|
"""Returns (total_chunks, chunks_with_section)."""
|
||||||
|
total = 0
|
||||||
|
with_sec = 0
|
||||||
|
offset = None
|
||||||
|
with httpx.Client(timeout=60.0) as c:
|
||||||
|
while True:
|
||||||
|
body = {
|
||||||
|
"filter": {
|
||||||
|
"must": [{
|
||||||
|
"key": "regulation_id",
|
||||||
|
"match": {"value": regulation_id},
|
||||||
|
}]
|
||||||
|
},
|
||||||
|
"limit": 100,
|
||||||
|
"with_payload": ["section"],
|
||||||
|
}
|
||||||
|
if offset is not None:
|
||||||
|
body["offset"] = offset
|
||||||
|
resp = c.post(
|
||||||
|
f"{QDRANT_URL}/collections/{collection}/points/scroll",
|
||||||
|
json=body,
|
||||||
|
)
|
||||||
|
resp.raise_for_status()
|
||||||
|
data = resp.json()["result"]
|
||||||
|
for pt in data["points"]:
|
||||||
|
total += 1
|
||||||
|
s = pt.get("payload", {}).get("section", "")
|
||||||
|
if s and s.strip():
|
||||||
|
with_sec += 1
|
||||||
|
offset = data.get("next_page_offset")
|
||||||
|
if offset is None:
|
||||||
|
break
|
||||||
|
return total, with_sec
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
print("=" * 60)
|
||||||
|
print("NIST PDF Local Extraction + Upload")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
results = []
|
||||||
|
for i, doc in enumerate(DOCS, 1):
|
||||||
|
reg_id = doc["extra_metadata"]["regulation_id"]
|
||||||
|
print(f"\n[{i}/{len(DOCS)}] {doc['filename']} → {doc['collection']}")
|
||||||
|
|
||||||
|
# 1. Check current state
|
||||||
|
existing = count_chunks(doc["collection"], reg_id)
|
||||||
|
print(f" Existing chunks: {existing}")
|
||||||
|
|
||||||
|
# 2. Download PDF from MinIO
|
||||||
|
print(f" Downloading from MinIO...")
|
||||||
|
pdf_bytes = download_from_minio(doc["object_name"])
|
||||||
|
print(f" Downloaded {len(pdf_bytes) / 1024 / 1024:.1f} MB")
|
||||||
|
|
||||||
|
# 3. Extract text locally with pdfplumber
|
||||||
|
print(f" Extracting text locally...")
|
||||||
|
text = extract_pdf_locally(pdf_bytes)
|
||||||
|
print(f" Extracted {len(text):,} chars, {text.count(chr(10)):,} lines")
|
||||||
|
|
||||||
|
# 4. Save extracted text temporarily (for debugging)
|
||||||
|
tmp_path = f"/tmp/nist_{reg_id}.txt"
|
||||||
|
with open(tmp_path, "w", encoding="utf-8") as f:
|
||||||
|
f.write(text)
|
||||||
|
print(f" Saved to {tmp_path}")
|
||||||
|
|
||||||
|
# 5. Upload as .txt
|
||||||
|
print(f" Uploading as .txt to RAG service...")
|
||||||
|
result = upload_text(text, doc["filename"], doc["collection"],
|
||||||
|
doc["extra_metadata"])
|
||||||
|
new_chunks = result.get("chunks_count", 0)
|
||||||
|
new_doc_id = result.get("document_id", "")
|
||||||
|
print(f" Uploaded: {new_chunks} chunks (doc_id={new_doc_id})")
|
||||||
|
|
||||||
|
# 6. Check section rate
|
||||||
|
if new_chunks > 0:
|
||||||
|
total, with_sec = check_section_rate(doc["collection"], reg_id)
|
||||||
|
pct = (with_sec / total * 100) if total > 0 else 0
|
||||||
|
print(f" Section rate: {with_sec}/{total} ({pct:.0f}%)")
|
||||||
|
else:
|
||||||
|
pct = 0
|
||||||
|
print(" WARNING: 0 chunks created!")
|
||||||
|
|
||||||
|
results.append({
|
||||||
|
"file": doc["filename"],
|
||||||
|
"old": existing,
|
||||||
|
"new": new_chunks,
|
||||||
|
"section_rate": round(pct, 1),
|
||||||
|
})
|
||||||
|
|
||||||
|
# Summary
|
||||||
|
print("\n" + "=" * 60)
|
||||||
|
print("RESULTS")
|
||||||
|
print("=" * 60)
|
||||||
|
for r in results:
|
||||||
|
print(f" {r['file']:<40} old={r['old']} new={r['new']} sect={r['section_rate']}%")
|
||||||
|
|
||||||
|
total_new = sum(r["new"] for r in results)
|
||||||
|
print(f"\nTotal new chunks: {total_new}")
|
||||||
|
|
||||||
|
if any(r["new"] == 0 for r in results):
|
||||||
|
print("\nWARNING: Some documents produced 0 chunks!")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
@@ -0,0 +1,431 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Re-upload NIST/BSI/ENISA docs with chunk_strategy='legal' for section metadata.
|
||||||
|
|
||||||
|
The docs were already uploaded with 'recursive' strategy (no section detection).
|
||||||
|
This script re-uploads with 'legal' strategy, then deletes old recursive chunks.
|
||||||
|
|
||||||
|
Usage (on Mac Mini):
|
||||||
|
python3 control-pipeline/scripts/reupload_legal_strategy.py
|
||||||
|
python3 control-pipeline/scripts/reupload_legal_strategy.py --dry-run
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import io
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
import unicodedata
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
import pdfplumber
|
||||||
|
|
||||||
|
RAG_URL = "https://localhost:8097"
|
||||||
|
QDRANT_URL = "http://localhost:6333"
|
||||||
|
UPLOAD_TIMEOUT = 1800.0
|
||||||
|
|
||||||
|
# ---- Documents to process ----
|
||||||
|
|
||||||
|
DOCS = [
|
||||||
|
# 4 NIST docs already extracted at /tmp/nist_*.txt
|
||||||
|
{
|
||||||
|
"regulation_id": "nist_sp800_53r5",
|
||||||
|
"collection": "bp_compliance_datenschutz",
|
||||||
|
"upload_filename": "NIST_SP_800_53r5.txt",
|
||||||
|
"local_txt": "/tmp/nist_nist_sp800_53r5.txt",
|
||||||
|
"minio_pdf": None, # already extracted
|
||||||
|
"extra_metadata": {
|
||||||
|
"regulation_id": "nist_sp800_53r5",
|
||||||
|
"source_id": "nist",
|
||||||
|
"doc_type": "controls_catalog",
|
||||||
|
"guideline_name": "NIST SP 800-53 Rev. 5",
|
||||||
|
"license": "public_domain_us_gov",
|
||||||
|
"source": "nist.gov",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"regulation_id": "nist_sp_800_82r3",
|
||||||
|
"collection": "bp_compliance_ce",
|
||||||
|
"upload_filename": "nist_sp_800_82r3.txt",
|
||||||
|
"local_txt": "/tmp/nist_nist_sp_800_82r3.txt",
|
||||||
|
"minio_pdf": None,
|
||||||
|
"extra_metadata": {
|
||||||
|
"regulation_id": "nist_sp_800_82r3",
|
||||||
|
"regulation_short": "NIST SP 800-82",
|
||||||
|
"license": "public_domain_us",
|
||||||
|
"source": "nist.gov",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"regulation_id": "nist_sp_800_160v1r1",
|
||||||
|
"collection": "bp_compliance_ce",
|
||||||
|
"upload_filename": "nist_sp_800_160v1r1.txt",
|
||||||
|
"local_txt": "/tmp/nist_160.txt",
|
||||||
|
"minio_pdf": None,
|
||||||
|
"extra_metadata": {
|
||||||
|
"regulation_id": "nist_sp_800_160v1r1",
|
||||||
|
"regulation_short": "NIST SP 800-160",
|
||||||
|
"license": "public_domain_us",
|
||||||
|
"source": "nist.gov",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"regulation_id": "nist_sp800_207",
|
||||||
|
"collection": "bp_compliance_datenschutz",
|
||||||
|
"upload_filename": "NIST_SP_800_207.txt",
|
||||||
|
"local_txt": None, # needs extraction
|
||||||
|
"minio_pdf": "compliance/bund/compliance/2026/NIST_SP_800_207.pdf",
|
||||||
|
"extra_metadata": {
|
||||||
|
"regulation_id": "nist_sp800_207",
|
||||||
|
"source_id": "nist",
|
||||||
|
"doc_type": "architecture",
|
||||||
|
"guideline_name": "NIST SP 800-207 Zero Trust Architecture",
|
||||||
|
"license": "public_domain_us_gov",
|
||||||
|
"source": "nist.gov",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
# Additional low-quality docs (need extraction from MinIO)
|
||||||
|
{
|
||||||
|
"regulation_id": "nist_csf_2_0",
|
||||||
|
"collection": "bp_compliance_datenschutz",
|
||||||
|
"upload_filename": "nist_csf_2_0.txt",
|
||||||
|
"local_txt": None,
|
||||||
|
"minio_pdf": "compliance/bund/compliance/2026/nist_csf_2_0.pdf",
|
||||||
|
"extra_metadata": {
|
||||||
|
"regulation_id": "nist_csf_2_0",
|
||||||
|
"license": "public_domain_us",
|
||||||
|
"source": "nist.gov",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"regulation_id": "nistir_8259a",
|
||||||
|
"collection": "bp_compliance_datenschutz",
|
||||||
|
"upload_filename": "nistir_8259a.txt",
|
||||||
|
"local_txt": None,
|
||||||
|
"minio_pdf": "compliance/bund/compliance/2026/nistir_8259a.pdf",
|
||||||
|
"extra_metadata": {
|
||||||
|
"regulation_id": "nistir_8259a",
|
||||||
|
"license": "public_domain_us",
|
||||||
|
"source": "nist.gov",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"regulation_id": "nist_ai_rmf",
|
||||||
|
"collection": "bp_compliance_datenschutz",
|
||||||
|
"upload_filename": "nist_ai_rmf.txt",
|
||||||
|
"local_txt": None,
|
||||||
|
"minio_pdf": "compliance/bund/compliance/2026/nist_ai_rmf.pdf",
|
||||||
|
"extra_metadata": {
|
||||||
|
"regulation_id": "nist_ai_rmf",
|
||||||
|
"license": "public_domain_us",
|
||||||
|
"source": "nist.gov",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"regulation_id": "nist_sp_800_30r1",
|
||||||
|
"collection": "bp_compliance_ce",
|
||||||
|
"upload_filename": "nist_sp_800_30r1.txt",
|
||||||
|
"local_txt": None,
|
||||||
|
"minio_pdf": "compliance/bund/compliance/2026/nist_sp_800_30r1.pdf",
|
||||||
|
"extra_metadata": {
|
||||||
|
"regulation_id": "nist_sp_800_30r1",
|
||||||
|
"license": "public_domain_us",
|
||||||
|
"source": "nist.gov",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"regulation_id": "cisa_secure_by_design",
|
||||||
|
"collection": "bp_compliance_ce",
|
||||||
|
"upload_filename": "cisa_secure_by_design.txt",
|
||||||
|
"local_txt": None,
|
||||||
|
"minio_pdf": "compliance/bund/compliance/2026/cisa_secure_by_design.pdf",
|
||||||
|
"extra_metadata": {
|
||||||
|
"regulation_id": "cisa_secure_by_design",
|
||||||
|
"license": "public_domain_us",
|
||||||
|
"source": "cisa.gov",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"regulation_id": "cvss_v4_0",
|
||||||
|
"collection": "bp_compliance_ce",
|
||||||
|
"upload_filename": "cvss_v4_0.txt",
|
||||||
|
"local_txt": None,
|
||||||
|
"minio_pdf": "compliance/bund/compliance/2026/cvss_v4_0.pdf",
|
||||||
|
"extra_metadata": {
|
||||||
|
"regulation_id": "cvss_v4_0",
|
||||||
|
"license": "public_domain_us",
|
||||||
|
"source": "first.org",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"regulation_id": "enisa_ics_scada_dependencies",
|
||||||
|
"collection": "bp_compliance_ce",
|
||||||
|
"upload_filename": "enisa_ics_scada.txt",
|
||||||
|
"local_txt": None,
|
||||||
|
"minio_pdf": "compliance/bund/compliance/2026/enisa_ics_scada.pdf",
|
||||||
|
"extra_metadata": {
|
||||||
|
"regulation_id": "enisa_ics_scada_dependencies",
|
||||||
|
"license": "reuse_with_attribution",
|
||||||
|
"source": "enisa.europa.eu",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"regulation_id": "enisa_threat_landscape_supply_chain",
|
||||||
|
"collection": "bp_compliance_ce",
|
||||||
|
"upload_filename": "enisa_supply_chain_security.txt",
|
||||||
|
"local_txt": None,
|
||||||
|
"minio_pdf": "compliance/bund/compliance/2026/enisa_supply_chain_security.pdf",
|
||||||
|
"extra_metadata": {
|
||||||
|
"regulation_id": "enisa_threat_landscape_supply_chain",
|
||||||
|
"license": "reuse_with_attribution",
|
||||||
|
"source": "enisa.europa.eu",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"regulation_id": "enisa_supply_chain_good_practices",
|
||||||
|
"collection": "bp_compliance_ce",
|
||||||
|
"upload_filename": "enisa_supply_chain_good_practices.txt",
|
||||||
|
"local_txt": None,
|
||||||
|
"minio_pdf": "compliance/bund/compliance/2026/enisa_supply_chain_good_practices.pdf",
|
||||||
|
"extra_metadata": {
|
||||||
|
"regulation_id": "enisa_supply_chain_good_practices",
|
||||||
|
"license": "reuse_with_attribution",
|
||||||
|
"source": "enisa.europa.eu",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_pdf_text(text):
|
||||||
|
text = unicodedata.normalize('NFKC', text)
|
||||||
|
text = text.replace('\u00ad', '').replace('\u200b', '')
|
||||||
|
prev = None
|
||||||
|
while prev != text:
|
||||||
|
prev = text
|
||||||
|
text = re.sub(r'(\d+)\s+\.\s+(\d+)', r'\1.\2', text)
|
||||||
|
text = re.sub(r'\b([A-Z]{2,4})\s+-\s+(\d+)\b', r'\1-\2', text)
|
||||||
|
text = re.sub(
|
||||||
|
r'\b([A-Z]{2})\s*\.\s*([A-Z]{2})\s*-\s*(\d{2})\b', r'\1.\2-\3', text
|
||||||
|
)
|
||||||
|
text = re.sub(r'\(\s+(\d+)\s+\)', r'(\1)', text)
|
||||||
|
text = re.sub(r'[^\S\n]{2,}', ' ', text)
|
||||||
|
return text
|
||||||
|
|
||||||
|
|
||||||
|
def get_text(doc):
|
||||||
|
"""Get document text: from local file or extract from MinIO PDF."""
|
||||||
|
if doc["local_txt"]:
|
||||||
|
print(f" Reading local: {doc['local_txt']}")
|
||||||
|
with open(doc["local_txt"], encoding="utf-8") as f:
|
||||||
|
return f.read()
|
||||||
|
|
||||||
|
print(f" Downloading from MinIO: {doc['minio_pdf']}")
|
||||||
|
with httpx.Client(timeout=60, verify=False) as c:
|
||||||
|
resp = c.get(f"{RAG_URL}/api/v1/documents/download/{doc['minio_pdf']}")
|
||||||
|
resp.raise_for_status()
|
||||||
|
url = resp.json()["url"]
|
||||||
|
with httpx.Client(timeout=300, verify=False) as c:
|
||||||
|
pdf_bytes = c.get(url).content
|
||||||
|
print(f" Downloaded {len(pdf_bytes) / 1024 / 1024:.1f} MB")
|
||||||
|
|
||||||
|
print(" Extracting with pdfplumber...")
|
||||||
|
parts = []
|
||||||
|
with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
|
||||||
|
for i, page in enumerate(pdf.pages):
|
||||||
|
t = page.extract_text(x_tolerance=3, y_tolerance=4)
|
||||||
|
if t:
|
||||||
|
parts.append(t)
|
||||||
|
if (i + 1) % 50 == 0:
|
||||||
|
print(f" {i + 1}/{len(pdf.pages)} pages...")
|
||||||
|
text = "\n\n".join(parts)
|
||||||
|
text = normalize_pdf_text(text)
|
||||||
|
print(f" Extracted {len(text):,} chars")
|
||||||
|
return text
|
||||||
|
|
||||||
|
|
||||||
|
def get_old_doc_ids(collection, regulation_id):
|
||||||
|
"""Get all document_ids for existing chunks."""
|
||||||
|
doc_ids = set()
|
||||||
|
offset = None
|
||||||
|
with httpx.Client(timeout=60) as c:
|
||||||
|
while True:
|
||||||
|
body = {
|
||||||
|
"filter": {"must": [
|
||||||
|
{"key": "regulation_id", "match": {"value": regulation_id}}
|
||||||
|
]},
|
||||||
|
"limit": 100,
|
||||||
|
"with_payload": ["document_id"],
|
||||||
|
}
|
||||||
|
if offset is not None:
|
||||||
|
body["offset"] = offset
|
||||||
|
resp = c.post(
|
||||||
|
f"{QDRANT_URL}/collections/{collection}/points/scroll",
|
||||||
|
json=body,
|
||||||
|
)
|
||||||
|
resp.raise_for_status()
|
||||||
|
data = resp.json()["result"]
|
||||||
|
for pt in data["points"]:
|
||||||
|
did = pt.get("payload", {}).get("document_id")
|
||||||
|
if did:
|
||||||
|
doc_ids.add(did)
|
||||||
|
offset = data.get("next_page_offset")
|
||||||
|
if offset is None:
|
||||||
|
break
|
||||||
|
return doc_ids
|
||||||
|
|
||||||
|
|
||||||
|
def upload_text_legal(text, filename, collection, extra_metadata):
|
||||||
|
"""Upload with chunk_strategy='legal'."""
|
||||||
|
form_data = {
|
||||||
|
"collection": collection,
|
||||||
|
"data_type": "compliance",
|
||||||
|
"bundesland": "bund",
|
||||||
|
"use_case": "compliance",
|
||||||
|
"year": "2026",
|
||||||
|
"chunk_strategy": "legal",
|
||||||
|
"chunk_size": "1500",
|
||||||
|
"chunk_overlap": "100",
|
||||||
|
"metadata_json": json.dumps(extra_metadata, ensure_ascii=False),
|
||||||
|
}
|
||||||
|
with httpx.Client(timeout=UPLOAD_TIMEOUT, verify=False) as c:
|
||||||
|
resp = c.post(
|
||||||
|
f"{RAG_URL}/api/v1/documents/upload",
|
||||||
|
files={"file": (filename, text.encode("utf-8"), "text/plain")},
|
||||||
|
data=form_data,
|
||||||
|
)
|
||||||
|
resp.raise_for_status()
|
||||||
|
return resp.json()
|
||||||
|
|
||||||
|
|
||||||
|
def delete_by_doc_ids(collection, doc_ids):
|
||||||
|
"""Delete chunks matching specific document_ids."""
|
||||||
|
with httpx.Client(timeout=30) as c:
|
||||||
|
for did in doc_ids:
|
||||||
|
c.post(
|
||||||
|
f"{QDRANT_URL}/collections/{collection}/points/delete",
|
||||||
|
json={"filter": {"must": [
|
||||||
|
{"key": "document_id", "match": {"value": did}}
|
||||||
|
]}},
|
||||||
|
).raise_for_status()
|
||||||
|
|
||||||
|
|
||||||
|
def count_chunks(collection, regulation_id):
|
||||||
|
with httpx.Client(timeout=30) as c:
|
||||||
|
resp = c.post(
|
||||||
|
f"{QDRANT_URL}/collections/{collection}/points/count",
|
||||||
|
json={"filter": {"must": [
|
||||||
|
{"key": "regulation_id", "match": {"value": regulation_id}}
|
||||||
|
]}, "exact": True},
|
||||||
|
)
|
||||||
|
resp.raise_for_status()
|
||||||
|
return resp.json()["result"]["count"]
|
||||||
|
|
||||||
|
|
||||||
|
def check_section_rate(collection, regulation_id):
|
||||||
|
total = 0
|
||||||
|
with_sec = 0
|
||||||
|
offset = None
|
||||||
|
with httpx.Client(timeout=60) as c:
|
||||||
|
while True:
|
||||||
|
body = {
|
||||||
|
"filter": {"must": [
|
||||||
|
{"key": "regulation_id", "match": {"value": regulation_id}}
|
||||||
|
]},
|
||||||
|
"limit": 100,
|
||||||
|
"with_payload": ["section"],
|
||||||
|
}
|
||||||
|
if offset is not None:
|
||||||
|
body["offset"] = offset
|
||||||
|
resp = c.post(
|
||||||
|
f"{QDRANT_URL}/collections/{collection}/points/scroll",
|
||||||
|
json=body,
|
||||||
|
)
|
||||||
|
resp.raise_for_status()
|
||||||
|
data = resp.json()["result"]
|
||||||
|
for pt in data["points"]:
|
||||||
|
total += 1
|
||||||
|
s = pt.get("payload", {}).get("section", "")
|
||||||
|
if s and s.strip():
|
||||||
|
with_sec += 1
|
||||||
|
offset = data.get("next_page_offset")
|
||||||
|
if offset is None:
|
||||||
|
break
|
||||||
|
return total, with_sec
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
parser = argparse.ArgumentParser()
|
||||||
|
parser.add_argument("--dry-run", action="store_true")
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
print("=" * 60)
|
||||||
|
print("Re-upload with chunk_strategy='legal'")
|
||||||
|
print(f"Documents: {len(DOCS)}, Dry run: {args.dry_run}")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
results = []
|
||||||
|
for i, doc in enumerate(DOCS, 1):
|
||||||
|
reg_id = doc["regulation_id"]
|
||||||
|
coll = doc["collection"]
|
||||||
|
print(f"\n[{i}/{len(DOCS)}] {doc['upload_filename']} → {coll}")
|
||||||
|
|
||||||
|
# 1. Check existing
|
||||||
|
old_count = count_chunks(coll, reg_id)
|
||||||
|
old_doc_ids = get_old_doc_ids(coll, reg_id) if old_count > 0 else set()
|
||||||
|
print(f" Old: {old_count} chunks, {len(old_doc_ids)} doc_ids")
|
||||||
|
|
||||||
|
if args.dry_run:
|
||||||
|
print(" DRY RUN — skipping")
|
||||||
|
results.append({"file": doc["upload_filename"], "old": old_count,
|
||||||
|
"new": "?", "sect": "?"})
|
||||||
|
continue
|
||||||
|
|
||||||
|
# 2. Get text
|
||||||
|
text = get_text(doc)
|
||||||
|
|
||||||
|
# 3. Upload with legal strategy
|
||||||
|
print(" Uploading with strategy='legal'...")
|
||||||
|
result = upload_text_legal(
|
||||||
|
text, doc["upload_filename"], coll, doc["extra_metadata"])
|
||||||
|
new_chunks = result.get("chunks_count", 0)
|
||||||
|
new_doc_id = result.get("document_id", "")
|
||||||
|
print(f" New: {new_chunks} chunks (doc_id={new_doc_id})")
|
||||||
|
|
||||||
|
if new_chunks == 0:
|
||||||
|
print(" ERROR: 0 chunks — keeping old!")
|
||||||
|
results.append({"file": doc["upload_filename"], "old": old_count,
|
||||||
|
"new": 0, "sect": 0})
|
||||||
|
continue
|
||||||
|
|
||||||
|
# 4. Delete old chunks (safe: new ones already exist)
|
||||||
|
if old_doc_ids:
|
||||||
|
# Exclude the new document_id just in case
|
||||||
|
old_doc_ids.discard(new_doc_id)
|
||||||
|
if old_doc_ids:
|
||||||
|
print(f" Deleting {len(old_doc_ids)} old doc_ids...")
|
||||||
|
delete_by_doc_ids(coll, old_doc_ids)
|
||||||
|
|
||||||
|
# 5. Check section rate
|
||||||
|
total, with_sec = check_section_rate(coll, reg_id)
|
||||||
|
pct = (with_sec / total * 100) if total > 0 else 0
|
||||||
|
print(f" Section rate: {with_sec}/{total} ({pct:.0f}%)")
|
||||||
|
|
||||||
|
results.append({"file": doc["upload_filename"], "old": old_count,
|
||||||
|
"new": new_chunks, "sect": round(pct, 1)})
|
||||||
|
|
||||||
|
if i < len(DOCS):
|
||||||
|
time.sleep(2)
|
||||||
|
|
||||||
|
# Summary
|
||||||
|
print("\n" + "=" * 60)
|
||||||
|
print("RESULTS")
|
||||||
|
print("=" * 60)
|
||||||
|
for r in results:
|
||||||
|
print(f" {r['file']:<45} old={r['old']:<6} new={r['new']:<6} sect={r['sect']}%")
|
||||||
|
|
||||||
|
total_new = sum(r["new"] for r in results if isinstance(r["new"], int))
|
||||||
|
print(f"\nTotal new chunks: {total_new}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
@@ -389,6 +389,11 @@ _SECTION_NUMBER_RE = re.compile(
|
|||||||
r'|Kapitel\s+(\d+)' # Kapitel 2
|
r'|Kapitel\s+(\d+)' # Kapitel 2
|
||||||
r'|Anhang\s+([IVXLC\d]+)' # Anhang III
|
r'|Anhang\s+([IVXLC\d]+)' # Anhang III
|
||||||
r'|Annex\s+([IVXLC\d]+)' # Annex XII
|
r'|Annex\s+([IVXLC\d]+)' # Annex XII
|
||||||
|
# NIST/ENISA/standard identifiers
|
||||||
|
r'|([A-Z]{2}\.[A-Z]{2}-\d{2})' # GV.OC-01 (NIST CSF 2.0)
|
||||||
|
r'|([A-Z]{2,4}-\d+(?:\(\d+\))?)' # AC-1, AC-1(1) (NIST controls)
|
||||||
|
r'|(\d+\.\d+(?:\.\d+)*)' # 3.1, 2.3.1 (numbered sections)
|
||||||
|
r'|(A\d{2}(?::\d{4})?)' # A01:2021 (OWASP)
|
||||||
r')',
|
r')',
|
||||||
re.IGNORECASE
|
re.IGNORECASE
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -13,7 +13,9 @@ Covers:
|
|||||||
from main import (
|
from main import (
|
||||||
_normalize_pdf_text,
|
_normalize_pdf_text,
|
||||||
_extract_section_header,
|
_extract_section_header,
|
||||||
|
_parse_section_metadata,
|
||||||
chunk_text_legal,
|
chunk_text_legal,
|
||||||
|
chunk_text_legal_structured,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -124,6 +126,66 @@ class TestNistSectionDetection:
|
|||||||
assert _extract_section_header(normalized) is not None
|
assert _extract_section_header(normalized) is not None
|
||||||
|
|
||||||
|
|
||||||
|
# =========================================================================
|
||||||
|
# Section metadata extraction (_parse_section_metadata)
|
||||||
|
# =========================================================================
|
||||||
|
|
||||||
|
class TestNistSectionMetadata:
|
||||||
|
|
||||||
|
def test_nist_control_ac1_section(self):
|
||||||
|
meta = _parse_section_metadata("AC-1 POLICY AND PROCEDURES")
|
||||||
|
assert meta["section"] == "AC-1"
|
||||||
|
|
||||||
|
def test_nist_control_au2_section(self):
|
||||||
|
meta = _parse_section_metadata("AU-2 Audit Events")
|
||||||
|
assert meta["section"] == "AU-2"
|
||||||
|
|
||||||
|
def test_nist_enhancement_section(self):
|
||||||
|
meta = _parse_section_metadata("AC-1(1) Policy and Procedures")
|
||||||
|
assert meta["section"] == "AC-1(1)"
|
||||||
|
|
||||||
|
def test_nist_csf_compound_section(self):
|
||||||
|
meta = _parse_section_metadata("GV.OC-01 Organizational Context")
|
||||||
|
assert meta["section"] == "GV.OC-01"
|
||||||
|
|
||||||
|
def test_numbered_section(self):
|
||||||
|
meta = _parse_section_metadata("3.1 ACCESS CONTROL")
|
||||||
|
assert meta["section"] == "3.1"
|
||||||
|
|
||||||
|
def test_deep_numbered_section(self):
|
||||||
|
meta = _parse_section_metadata("2.3.1 Subtitle")
|
||||||
|
assert meta["section"] == "2.3.1"
|
||||||
|
|
||||||
|
def test_owasp_section(self):
|
||||||
|
meta = _parse_section_metadata("A01:2021 Broken Access Control")
|
||||||
|
assert meta["section"] == "A01:2021"
|
||||||
|
|
||||||
|
def test_section_title_extracted(self):
|
||||||
|
meta = _parse_section_metadata("AC-1 POLICY AND PROCEDURES")
|
||||||
|
assert meta["section_title"] == "POLICY AND PROCEDURES"
|
||||||
|
|
||||||
|
def test_numbered_section_title(self):
|
||||||
|
meta = _parse_section_metadata("3.1 ACCESS CONTROL")
|
||||||
|
assert meta["section_title"] == "ACCESS CONTROL"
|
||||||
|
|
||||||
|
def test_structured_chunks_have_section(self):
|
||||||
|
text = (
|
||||||
|
"3.1 ACCESS CONTROL\n"
|
||||||
|
"Overview of access control family.\n\n"
|
||||||
|
"AC-1 POLICY AND PROCEDURES\n"
|
||||||
|
"The organization develops, documents, and disseminates an access "
|
||||||
|
"control policy that addresses purpose, scope, roles, responsibilities, "
|
||||||
|
"management commitment, coordination among entities.\n\n"
|
||||||
|
"AC-2 ACCOUNT MANAGEMENT\n"
|
||||||
|
"The information system enforces approved authorizations for logical "
|
||||||
|
"access to information and system resources.\n"
|
||||||
|
)
|
||||||
|
result = chunk_text_legal_structured(text, chunk_size=300, overlap=50)
|
||||||
|
sections = [r.get("section", "") for r in result]
|
||||||
|
assert any(s == "AC-1" for s in sections)
|
||||||
|
assert any(s == "AC-2" for s in sections)
|
||||||
|
|
||||||
|
|
||||||
# =========================================================================
|
# =========================================================================
|
||||||
# Chunking with NIST-style text
|
# Chunking with NIST-style text
|
||||||
# =========================================================================
|
# =========================================================================
|
||||||
|
|||||||
Executable
+65
@@ -0,0 +1,65 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# Qdrant Snapshot — erstellt Snapshots aller Collections
|
||||||
|
#
|
||||||
|
# Usage:
|
||||||
|
# bash scripts/qdrant-snapshot.sh # Create snapshots
|
||||||
|
# bash scripts/qdrant-snapshot.sh --list # List existing snapshots
|
||||||
|
# bash scripts/qdrant-snapshot.sh --restore <file> # Restore (interactive)
|
||||||
|
#
|
||||||
|
# Snapshots werden im Qdrant-Volume unter /qdrant/storage/snapshots/ gespeichert.
|
||||||
|
# Zusaetzlich werden sie nach ./backups/qdrant/ kopiert.
|
||||||
|
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
QDRANT_URL="${QDRANT_URL:-http://localhost:6333}"
|
||||||
|
BACKUP_DIR="${BACKUP_DIR:-./backups/qdrant}"
|
||||||
|
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
|
||||||
|
|
||||||
|
# --- List existing snapshots ---
|
||||||
|
if [[ "${1:-}" == "--list" ]]; then
|
||||||
|
echo "=== Qdrant Snapshots ==="
|
||||||
|
for coll in $(curl -sf "$QDRANT_URL/collections" | python3 -c "import sys,json; [print(c['name']) for c in json.load(sys.stdin)['result']['collections']]"); do
|
||||||
|
echo ""
|
||||||
|
echo "Collection: $coll"
|
||||||
|
curl -sf "$QDRANT_URL/collections/$coll/snapshots" | python3 -c "
|
||||||
|
import sys, json
|
||||||
|
snaps = json.load(sys.stdin).get('result', [])
|
||||||
|
if not snaps:
|
||||||
|
print(' (no snapshots)')
|
||||||
|
else:
|
||||||
|
for s in snaps:
|
||||||
|
print(f\" {s['name']} size={s.get('size',0)/(1024*1024):.1f}MB\")
|
||||||
|
"
|
||||||
|
done
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
# --- Create snapshots ---
|
||||||
|
echo "=== Creating Qdrant Snapshots ($TIMESTAMP) ==="
|
||||||
|
mkdir -p "$BACKUP_DIR"
|
||||||
|
|
||||||
|
COLLECTIONS=$(curl -sf "$QDRANT_URL/collections" | python3 -c "import sys,json; [print(c['name']) for c in json.load(sys.stdin)['result']['collections']]")
|
||||||
|
|
||||||
|
for coll in $COLLECTIONS; do
|
||||||
|
echo ""
|
||||||
|
echo "[$coll] Creating snapshot..."
|
||||||
|
|
||||||
|
SNAP=$(curl -sf -X POST "$QDRANT_URL/collections/$coll/snapshots" | python3 -c "import sys,json; print(json.load(sys.stdin)['result']['name'])")
|
||||||
|
|
||||||
|
if [[ -z "$SNAP" ]]; then
|
||||||
|
echo "[$coll] ERROR: snapshot creation failed"
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "[$coll] Snapshot: $SNAP"
|
||||||
|
|
||||||
|
# Download snapshot to backup dir
|
||||||
|
OUTFILE="$BACKUP_DIR/${coll}_${TIMESTAMP}.snapshot"
|
||||||
|
curl -sf "$QDRANT_URL/collections/$coll/snapshots/$SNAP" -o "$OUTFILE"
|
||||||
|
SIZE=$(du -h "$OUTFILE" | cut -f1)
|
||||||
|
echo "[$coll] Saved: $OUTFILE ($SIZE)"
|
||||||
|
done
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo "=== Done ==="
|
||||||
|
ls -lh "$BACKUP_DIR"/*_${TIMESTAMP}.snapshot 2>/dev/null || echo "No snapshots created"
|
||||||
Reference in New Issue
Block a user