From 2f4a3f2ea2eef0e250d507717a4f351929bc9a15 Mon Sep 17 00:00:00 2001
From: Benjamin Admin
Date: Sun, 3 May 2026 07:42:06 +0200
Subject: [PATCH] fix(embedding): add NIST control IDs to _SECTION_NUMBER_RE
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

_SECTION_NUMBER_RE only had patterns for §/Art/Section/Kapitel/Annex but
missed NIST-style identifiers (AC-1, GV.OC-01, 3.1, A01:2021). This caused
a 0% section rate for all NIST/BSI/ENISA documents even though the sections
themselves were correctly detected: the section NUMBER was never extracted
from the header.

Also adds:
- reupload_legal_strategy.py: re-upload with legal chunking
- extract_and_upload_nist.py: local PDF extraction workaround
- qdrant-snapshot.sh: backup mechanism for Qdrant collections

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 .../scripts/extract_and_upload_nist.py        | 280 ++++++++++++
 .../scripts/reupload_legal_strategy.py        | 431 ++++++++++++++++++
 embedding-service/main.py                     |   5 +
 embedding-service/test_nist_normalization.py  |  62 +++
 scripts/qdrant-snapshot.sh                    |  65 +++
 5 files changed, 843 insertions(+)
 create mode 100644 control-pipeline/scripts/extract_and_upload_nist.py
 create mode 100644 control-pipeline/scripts/reupload_legal_strategy.py
 create mode 100755 scripts/qdrant-snapshot.sh

diff --git a/control-pipeline/scripts/extract_and_upload_nist.py b/control-pipeline/scripts/extract_and_upload_nist.py
new file mode 100644
index 0000000..44f2281
--- /dev/null
+++ b/control-pipeline/scripts/extract_and_upload_nist.py
@@ -0,0 +1,280 @@
+#!/usr/bin/env python3
+"""Extract large NIST PDFs locally, then upload as .txt to RAG service.
+
+Workaround for embedding-service container crashing on large PDFs (>5 MB).
+Runs pdfplumber + normalization locally, uploads extracted text as .txt.
+
+Usage (on Mac Mini):
+    python3 control-pipeline/scripts/extract_and_upload_nist.py
+"""
+
+import json
+import os
+import re
+import sys
+import tempfile
+import unicodedata
+
+import httpx
+import pdfplumber
+
+RAG_URL = "https://localhost:8097"
+QDRANT_URL = "http://localhost:6333"
+
+DOCS = [
+    {
+        "object_name": "compliance/bund/compliance/2026/NIST_SP_800_53r5.pdf",
+        "collection": "bp_compliance_datenschutz",
+        "filename": "NIST_SP_800_53r5.txt",
+        "extra_metadata": {
+            "regulation_id": "nist_sp800_53r5",
+            "source_id": "nist",
+            "doc_type": "controls_catalog",
+            "guideline_name": "NIST SP 800-53 Rev. 5 Security and Privacy Controls",
+            "license": "public_domain_us_gov",
+            "attribution": "NIST",
+            "source": "nist.gov",
+        },
+    },
+    {
+        "object_name": "compliance/bund/compliance/2026/nist_sp_800_82r3.pdf",
+        "collection": "bp_compliance_ce",
+        "filename": "nist_sp_800_82r3.txt",
+        "extra_metadata": {
+            "regulation_id": "nist_sp_800_82r3",
+            "regulation_name_de": "NIST SP 800-82 Rev. 3 — Guide to OT Security",
+            "regulation_name_en": "NIST SP 800-82 Rev. 3 — Guide to OT Security",
+            "regulation_short": "NIST SP 800-82",
+            "category": "ot_security",
+            "license": "public_domain_us",
+            "source": "nist.gov",
+        },
+    },
+    {
+        "object_name": "compliance/bund/compliance/2026/nist_sp_800_160v1r1.pdf",
+        "collection": "bp_compliance_ce",
+        "filename": "nist_sp_800_160v1r1.txt",
+        "extra_metadata": {
+            "regulation_id": "nist_sp_800_160v1r1",
+            "regulation_name_de": "NIST SP 800-160 Vol. 1 Rev. 1",
+            "regulation_name_en": "NIST SP 800-160 Vol. 1 Rev. 1",
+            "regulation_short": "NIST SP 800-160",
+            "category": "security_engineering",
+            "license": "public_domain_us",
+            "source": "nist.gov",
+        },
+    },
+    {
+        "object_name": "compliance/bund/compliance/2026/NIST_SP_800_207.pdf",
+        "collection": "bp_compliance_datenschutz",
+        "filename": "NIST_SP_800_207.txt",
+        "extra_metadata": {
+            "regulation_id": "nist_sp800_207",
+            "source_id": "nist",
+            "doc_type": "architecture",
+            "guideline_name": "NIST SP 800-207 Zero Trust Architecture",
+            "license": "public_domain_us_gov",
+            "attribution": "NIST",
+            "source": "nist.gov",
+        },
+    },
+]
+
+
+def normalize_pdf_text(text: str) -> str:
+    """Fix broken spacing from multi-column PDF extraction."""
+    text = unicodedata.normalize('NFKC', text)
+    text = text.replace('\u00ad', '').replace('\u200b', '')
+    prev = None
+    while prev != text:
+        prev = text
+        text = re.sub(r'(\d+)\s+\.\s+(\d+)', r'\1.\2', text)
+        text = re.sub(r'\b([A-Z]{2,4})\s+-\s+(\d+)\b', r'\1-\2', text)
+        text = re.sub(
+            r'\b([A-Z]{2})\s*\.\s*([A-Z]{2})\s*-\s*(\d{2})\b', r'\1.\2-\3', text
+        )
+        text = re.sub(r'\(\s+(\d+)\s+\)', r'(\1)', text)
+        text = re.sub(r'[^\S\n]{2,}', ' ', text)
+    return text
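+
+
+# Illustrative repairs the fixpoint loop above is meant to make. These inputs
+# are assumed typical multi-column extraction artefacts, not taken from a
+# specific PDF; the actual spacing varies per document:
+#   "AC - 1"        -> "AC-1"
+#   "GV . OC - 01"  -> "GV.OC-01"
+#   "3 . 1"         -> "3.1"
+#   "( 1 )"         -> "(1)"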
1", + "regulation_short": "NIST SP 800-160", + "category": "security_engineering", + "license": "public_domain_us", + "source": "nist.gov", + }, + }, + { + "object_name": "compliance/bund/compliance/2026/NIST_SP_800_207.pdf", + "collection": "bp_compliance_datenschutz", + "filename": "NIST_SP_800_207.txt", + "extra_metadata": { + "regulation_id": "nist_sp800_207", + "source_id": "nist", + "doc_type": "architecture", + "guideline_name": "NIST SP 800-207 Zero Trust Architecture", + "license": "public_domain_us_gov", + "attribution": "NIST", + "source": "nist.gov", + }, + }, +] + + +def normalize_pdf_text(text: str) -> str: + """Fix broken spacing from multi-column PDF extraction.""" + text = unicodedata.normalize('NFKC', text) + text = text.replace('\u00ad', '').replace('\u200b', '') + prev = None + while prev != text: + prev = text + text = re.sub(r'(\d+)\s+\.\s+(\d+)', r'\1.\2', text) + text = re.sub(r'\b([A-Z]{2,4})\s+-\s+(\d+)\b', r'\1-\2', text) + text = re.sub( + r'\b([A-Z]{2})\s*\.\s*([A-Z]{2})\s*-\s*(\d{2})\b', r'\1.\2-\3', text + ) + text = re.sub(r'\(\s+(\d+)\s+\)', r'(\1)', text) + text = re.sub(r'[^\S\n]{2,}', ' ', text) + return text + + +def extract_pdf_locally(pdf_bytes: bytes) -> str: + """Extract text from PDF using pdfplumber with normalization.""" + import io + text_parts = [] + with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf: + print(f" Pages: {len(pdf.pages)}") + for i, page in enumerate(pdf.pages): + text = page.extract_text(x_tolerance=3, y_tolerance=4) + if text: + text_parts.append(text) + if (i + 1) % 50 == 0: + print(f" Extracted {i + 1}/{len(pdf.pages)} pages...") + raw = "\n\n".join(text_parts) + return normalize_pdf_text(raw) + + +def download_from_minio(object_name: str) -> bytes: + """Download file from MinIO via RAG service.""" + with httpx.Client(timeout=60.0, verify=False) as c: + resp = c.get(f"{RAG_URL}/api/v1/documents/download/{object_name}") + resp.raise_for_status() + url = resp.json()["url"] + with httpx.Client(timeout=300.0, verify=False) as c: + resp = c.get(url) + resp.raise_for_status() + return resp.content + + +def upload_text( + text: str, filename: str, collection: str, extra_metadata: dict, +) -> dict: + """Upload extracted text to RAG service as .txt.""" + form_data = { + "collection": collection, + "data_type": "compliance", + "bundesland": "bund", + "use_case": "compliance", + "year": "2026", + "chunk_strategy": "recursive", + "chunk_size": "1500", + "chunk_overlap": "100", + "metadata_json": json.dumps(extra_metadata, ensure_ascii=False), + } + text_bytes = text.encode("utf-8") + with httpx.Client(timeout=1800.0, verify=False) as c: + resp = c.post( + f"{RAG_URL}/api/v1/documents/upload", + files={"file": (filename, text_bytes, "text/plain")}, + data=form_data, + ) + resp.raise_for_status() + return resp.json() + + +def count_chunks(collection: str, regulation_id: str) -> int: + """Count chunks for a regulation in Qdrant.""" + with httpx.Client(timeout=30.0) as c: + resp = c.post( + f"{QDRANT_URL}/collections/{collection}/points/count", + json={ + "filter": { + "must": [{ + "key": "regulation_id", + "match": {"value": regulation_id}, + }] + }, + "exact": True, + }, + ) + resp.raise_for_status() + return resp.json()["result"]["count"] + + +def check_section_rate(collection: str, regulation_id: str) -> tuple: + """Returns (total_chunks, chunks_with_section).""" + total = 0 + with_sec = 0 + offset = None + with httpx.Client(timeout=60.0) as c: + while True: + body = { + "filter": { + "must": [{ + "key": "regulation_id", + "match": 
{"value": regulation_id}, + }] + }, + "limit": 100, + "with_payload": ["section"], + } + if offset is not None: + body["offset"] = offset + resp = c.post( + f"{QDRANT_URL}/collections/{collection}/points/scroll", + json=body, + ) + resp.raise_for_status() + data = resp.json()["result"] + for pt in data["points"]: + total += 1 + s = pt.get("payload", {}).get("section", "") + if s and s.strip(): + with_sec += 1 + offset = data.get("next_page_offset") + if offset is None: + break + return total, with_sec + + +def main(): + print("=" * 60) + print("NIST PDF Local Extraction + Upload") + print("=" * 60) + + results = [] + for i, doc in enumerate(DOCS, 1): + reg_id = doc["extra_metadata"]["regulation_id"] + print(f"\n[{i}/{len(DOCS)}] {doc['filename']} → {doc['collection']}") + + # 1. Check current state + existing = count_chunks(doc["collection"], reg_id) + print(f" Existing chunks: {existing}") + + # 2. Download PDF from MinIO + print(f" Downloading from MinIO...") + pdf_bytes = download_from_minio(doc["object_name"]) + print(f" Downloaded {len(pdf_bytes) / 1024 / 1024:.1f} MB") + + # 3. Extract text locally with pdfplumber + print(f" Extracting text locally...") + text = extract_pdf_locally(pdf_bytes) + print(f" Extracted {len(text):,} chars, {text.count(chr(10)):,} lines") + + # 4. Save extracted text temporarily (for debugging) + tmp_path = f"/tmp/nist_{reg_id}.txt" + with open(tmp_path, "w", encoding="utf-8") as f: + f.write(text) + print(f" Saved to {tmp_path}") + + # 5. Upload as .txt + print(f" Uploading as .txt to RAG service...") + result = upload_text(text, doc["filename"], doc["collection"], + doc["extra_metadata"]) + new_chunks = result.get("chunks_count", 0) + new_doc_id = result.get("document_id", "") + print(f" Uploaded: {new_chunks} chunks (doc_id={new_doc_id})") + + # 6. Check section rate + if new_chunks > 0: + total, with_sec = check_section_rate(doc["collection"], reg_id) + pct = (with_sec / total * 100) if total > 0 else 0 + print(f" Section rate: {with_sec}/{total} ({pct:.0f}%)") + else: + pct = 0 + print(" WARNING: 0 chunks created!") + + results.append({ + "file": doc["filename"], + "old": existing, + "new": new_chunks, + "section_rate": round(pct, 1), + }) + + # Summary + print("\n" + "=" * 60) + print("RESULTS") + print("=" * 60) + for r in results: + print(f" {r['file']:<40} old={r['old']} new={r['new']} sect={r['section_rate']}%") + + total_new = sum(r["new"] for r in results) + print(f"\nTotal new chunks: {total_new}") + + if any(r["new"] == 0 for r in results): + print("\nWARNING: Some documents produced 0 chunks!") + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/control-pipeline/scripts/reupload_legal_strategy.py b/control-pipeline/scripts/reupload_legal_strategy.py new file mode 100644 index 0000000..f886ac3 --- /dev/null +++ b/control-pipeline/scripts/reupload_legal_strategy.py @@ -0,0 +1,431 @@ +#!/usr/bin/env python3 +"""Re-upload NIST/BSI/ENISA docs with chunk_strategy='legal' for section metadata. + +The docs were already uploaded with 'recursive' strategy (no section detection). +This script re-uploads with 'legal' strategy, then deletes old recursive chunks. 
+
+Usage (on Mac Mini):
+    python3 control-pipeline/scripts/reupload_legal_strategy.py
+    python3 control-pipeline/scripts/reupload_legal_strategy.py --dry-run
+"""
+
+import argparse
+import io
+import json
+import re
+import sys
+import time
+import unicodedata
+
+import httpx
+import pdfplumber
+
+RAG_URL = "https://localhost:8097"
+QDRANT_URL = "http://localhost:6333"
+UPLOAD_TIMEOUT = 1800.0
+
+# ---- Documents to process ----
+
+DOCS = [
+    # 4 NIST docs already extracted at /tmp/nist_*.txt
+    {
+        "regulation_id": "nist_sp800_53r5",
+        "collection": "bp_compliance_datenschutz",
+        "upload_filename": "NIST_SP_800_53r5.txt",
+        "local_txt": "/tmp/nist_nist_sp800_53r5.txt",
+        "minio_pdf": None,  # already extracted
+        "extra_metadata": {
+            "regulation_id": "nist_sp800_53r5",
+            "source_id": "nist",
+            "doc_type": "controls_catalog",
+            "guideline_name": "NIST SP 800-53 Rev. 5",
+            "license": "public_domain_us_gov",
+            "source": "nist.gov",
+        },
+    },
+    {
+        "regulation_id": "nist_sp_800_82r3",
+        "collection": "bp_compliance_ce",
+        "upload_filename": "nist_sp_800_82r3.txt",
+        "local_txt": "/tmp/nist_nist_sp_800_82r3.txt",
+        "minio_pdf": None,
+        "extra_metadata": {
+            "regulation_id": "nist_sp_800_82r3",
+            "regulation_short": "NIST SP 800-82",
+            "license": "public_domain_us",
+            "source": "nist.gov",
+        },
+    },
+    {
+        "regulation_id": "nist_sp_800_160v1r1",
+        "collection": "bp_compliance_ce",
+        "upload_filename": "nist_sp_800_160v1r1.txt",
+        "local_txt": "/tmp/nist_160.txt",
+        "minio_pdf": None,
+        "extra_metadata": {
+            "regulation_id": "nist_sp_800_160v1r1",
+            "regulation_short": "NIST SP 800-160",
+            "license": "public_domain_us",
+            "source": "nist.gov",
+        },
+    },
+    {
+        "regulation_id": "nist_sp800_207",
+        "collection": "bp_compliance_datenschutz",
+        "upload_filename": "NIST_SP_800_207.txt",
+        "local_txt": None,  # needs extraction
+        "minio_pdf": "compliance/bund/compliance/2026/NIST_SP_800_207.pdf",
+        "extra_metadata": {
+            "regulation_id": "nist_sp800_207",
+            "source_id": "nist",
+            "doc_type": "architecture",
+            "guideline_name": "NIST SP 800-207 Zero Trust Architecture",
+            "license": "public_domain_us_gov",
+            "source": "nist.gov",
+        },
+    },
+    # Additional low-quality docs (need extraction from MinIO)
+    {
+        "regulation_id": "nist_csf_2_0",
+        "collection": "bp_compliance_datenschutz",
+        "upload_filename": "nist_csf_2_0.txt",
+        "local_txt": None,
+        "minio_pdf": "compliance/bund/compliance/2026/nist_csf_2_0.pdf",
+        "extra_metadata": {
+            "regulation_id": "nist_csf_2_0",
+            "license": "public_domain_us",
+            "source": "nist.gov",
+        },
+    },
+    {
+        "regulation_id": "nistir_8259a",
+        "collection": "bp_compliance_datenschutz",
+        "upload_filename": "nistir_8259a.txt",
+        "local_txt": None,
+        "minio_pdf": "compliance/bund/compliance/2026/nistir_8259a.pdf",
+        "extra_metadata": {
+            "regulation_id": "nistir_8259a",
+            "license": "public_domain_us",
+            "source": "nist.gov",
+        },
+    },
+    {
+        "regulation_id": "nist_ai_rmf",
+        "collection": "bp_compliance_datenschutz",
+        "upload_filename": "nist_ai_rmf.txt",
+        "local_txt": None,
+        "minio_pdf": "compliance/bund/compliance/2026/nist_ai_rmf.pdf",
+        "extra_metadata": {
+            "regulation_id": "nist_ai_rmf",
+            "license": "public_domain_us",
+            "source": "nist.gov",
+        },
+    },
+    {
+        "regulation_id": "nist_sp_800_30r1",
+        "collection": "bp_compliance_ce",
+        "upload_filename": "nist_sp_800_30r1.txt",
+        "local_txt": None,
+        "minio_pdf": "compliance/bund/compliance/2026/nist_sp_800_30r1.pdf",
+        "extra_metadata": {
+            "regulation_id": "nist_sp_800_30r1",
+            "license": "public_domain_us",
+            "source": "nist.gov",
+        },
+    },
+    {
+        "regulation_id": "cisa_secure_by_design",
+        "collection": "bp_compliance_ce",
+        "upload_filename": "cisa_secure_by_design.txt",
+        "local_txt": None,
+        "minio_pdf": "compliance/bund/compliance/2026/cisa_secure_by_design.pdf",
+        "extra_metadata": {
+            "regulation_id": "cisa_secure_by_design",
+            "license": "public_domain_us",
+            "source": "cisa.gov",
+        },
+    },
+    {
+        "regulation_id": "cvss_v4_0",
+        "collection": "bp_compliance_ce",
+        "upload_filename": "cvss_v4_0.txt",
+        "local_txt": None,
+        "minio_pdf": "compliance/bund/compliance/2026/cvss_v4_0.pdf",
+        "extra_metadata": {
+            "regulation_id": "cvss_v4_0",
+            "license": "public_domain_us",
+            "source": "first.org",
+        },
+    },
+    {
+        "regulation_id": "enisa_ics_scada_dependencies",
+        "collection": "bp_compliance_ce",
+        "upload_filename": "enisa_ics_scada.txt",
+        "local_txt": None,
+        "minio_pdf": "compliance/bund/compliance/2026/enisa_ics_scada.pdf",
+        "extra_metadata": {
+            "regulation_id": "enisa_ics_scada_dependencies",
+            "license": "reuse_with_attribution",
+            "source": "enisa.europa.eu",
+        },
+    },
+    {
+        "regulation_id": "enisa_threat_landscape_supply_chain",
+        "collection": "bp_compliance_ce",
+        "upload_filename": "enisa_supply_chain_security.txt",
+        "local_txt": None,
+        "minio_pdf": "compliance/bund/compliance/2026/enisa_supply_chain_security.pdf",
+        "extra_metadata": {
+            "regulation_id": "enisa_threat_landscape_supply_chain",
+            "license": "reuse_with_attribution",
+            "source": "enisa.europa.eu",
+        },
+    },
+    {
+        "regulation_id": "enisa_supply_chain_good_practices",
+        "collection": "bp_compliance_ce",
+        "upload_filename": "enisa_supply_chain_good_practices.txt",
+        "local_txt": None,
+        "minio_pdf": "compliance/bund/compliance/2026/enisa_supply_chain_good_practices.pdf",
+        "extra_metadata": {
+            "regulation_id": "enisa_supply_chain_good_practices",
+            "license": "reuse_with_attribution",
+            "source": "enisa.europa.eu",
+        },
+    },
+]
+
+
+def normalize_pdf_text(text):
+    text = unicodedata.normalize('NFKC', text)
+    text = text.replace('\u00ad', '').replace('\u200b', '')
+    prev = None
+    while prev != text:
+        prev = text
+        text = re.sub(r'(\d+)\s+\.\s+(\d+)', r'\1.\2', text)
+        text = re.sub(r'\b([A-Z]{2,4})\s+-\s+(\d+)\b', r'\1-\2', text)
+        text = re.sub(
+            r'\b([A-Z]{2})\s*\.\s*([A-Z]{2})\s*-\s*(\d{2})\b', r'\1.\2-\3', text
+        )
+        text = re.sub(r'\(\s+(\d+)\s+\)', r'(\1)', text)
+        text = re.sub(r'[^\S\n]{2,}', ' ', text)
+    return text
+
+
+def get_text(doc):
+    """Get document text: from local file or extract from MinIO PDF."""
+    if doc["local_txt"]:
+        print(f" Reading local: {doc['local_txt']}")
+        with open(doc["local_txt"], encoding="utf-8") as f:
+            return f.read()
+
+    print(f" Downloading from MinIO: {doc['minio_pdf']}")
+    with httpx.Client(timeout=60, verify=False) as c:
+        resp = c.get(f"{RAG_URL}/api/v1/documents/download/{doc['minio_pdf']}")
+        resp.raise_for_status()
+        url = resp.json()["url"]
+    with httpx.Client(timeout=300, verify=False) as c:
+        pdf_bytes = c.get(url).content
+    print(f" Downloaded {len(pdf_bytes) / 1024 / 1024:.1f} MB")
+
+    print(" Extracting with pdfplumber...")
+    parts = []
+    with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
+        for i, page in enumerate(pdf.pages):
+            t = page.extract_text(x_tolerance=3, y_tolerance=4)
+            if t:
+                parts.append(t)
+            if (i + 1) % 50 == 0:
+                print(f" {i + 1}/{len(pdf.pages)} pages...")
+    text = "\n\n".join(parts)
+    text = normalize_pdf_text(text)
+    print(f" Extracted {len(text):,} chars")
+    return text
+
+
+def get_old_doc_ids(collection, regulation_id):
+    """Get all document_ids for existing chunks."""
+    doc_ids = set()
+    offset = None
+    with httpx.Client(timeout=60) as c:
+        while True:
+            body = {
+                "filter": {"must": [
+                    {"key": "regulation_id", "match": {"value": regulation_id}}
+                ]},
+                "limit": 100,
+                "with_payload": ["document_id"],
+            }
+            if offset is not None:
+                body["offset"] = offset
+            resp = c.post(
+                f"{QDRANT_URL}/collections/{collection}/points/scroll",
+                json=body,
+            )
+            resp.raise_for_status()
+            data = resp.json()["result"]
+            for pt in data["points"]:
+                did = pt.get("payload", {}).get("document_id")
+                if did:
+                    doc_ids.add(did)
+            offset = data.get("next_page_offset")
+            if offset is None:
+                break
+    return doc_ids
+
+
+def upload_text_legal(text, filename, collection, extra_metadata):
+    """Upload with chunk_strategy='legal'."""
+    form_data = {
+        "collection": collection,
+        "data_type": "compliance",
+        "bundesland": "bund",
+        "use_case": "compliance",
+        "year": "2026",
+        "chunk_strategy": "legal",
+        "chunk_size": "1500",
+        "chunk_overlap": "100",
+        "metadata_json": json.dumps(extra_metadata, ensure_ascii=False),
+    }
+    with httpx.Client(timeout=UPLOAD_TIMEOUT, verify=False) as c:
+        resp = c.post(
+            f"{RAG_URL}/api/v1/documents/upload",
+            files={"file": (filename, text.encode("utf-8"), "text/plain")},
+            data=form_data,
+        )
+        resp.raise_for_status()
+        return resp.json()
+
+
+def delete_by_doc_ids(collection, doc_ids):
+    """Delete chunks matching specific document_ids."""
+    with httpx.Client(timeout=30) as c:
+        for did in doc_ids:
+            c.post(
+                f"{QDRANT_URL}/collections/{collection}/points/delete",
+                json={"filter": {"must": [
+                    {"key": "document_id", "match": {"value": did}}
+                ]}},
+            ).raise_for_status()
+
+
+def count_chunks(collection, regulation_id):
+    with httpx.Client(timeout=30) as c:
+        resp = c.post(
+            f"{QDRANT_URL}/collections/{collection}/points/count",
+            json={"filter": {"must": [
+                {"key": "regulation_id", "match": {"value": regulation_id}}
+            ]}, "exact": True},
+        )
+        resp.raise_for_status()
+        return resp.json()["result"]["count"]
+
+
+def check_section_rate(collection, regulation_id):
+    total = 0
+    with_sec = 0
+    offset = None
+    with httpx.Client(timeout=60) as c:
+        while True:
+            body = {
+                "filter": {"must": [
+                    {"key": "regulation_id", "match": {"value": regulation_id}}
+                ]},
+                "limit": 100,
+                "with_payload": ["section"],
+            }
+            if offset is not None:
+                body["offset"] = offset
+            resp = c.post(
+                f"{QDRANT_URL}/collections/{collection}/points/scroll",
+                json=body,
+            )
+            resp.raise_for_status()
+            data = resp.json()["result"]
+            for pt in data["points"]:
+                total += 1
+                s = pt.get("payload", {}).get("section", "")
+                if s and s.strip():
+                    with_sec += 1
+            offset = data.get("next_page_offset")
+            if offset is None:
+                break
+    return total, with_sec
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--dry-run", action="store_true")
+    args = parser.parse_args()
+
+    print("=" * 60)
+    print("Re-upload with chunk_strategy='legal'")
+    print(f"Documents: {len(DOCS)}, Dry run: {args.dry_run}")
+    print("=" * 60)
+
+    results = []
+    for i, doc in enumerate(DOCS, 1):
+        reg_id = doc["regulation_id"]
+        coll = doc["collection"]
+        print(f"\n[{i}/{len(DOCS)}] {doc['upload_filename']} → {coll}")
+
+        # 1. Check existing
+        old_count = count_chunks(coll, reg_id)
+        old_doc_ids = get_old_doc_ids(coll, reg_id) if old_count > 0 else set()
+        print(f" Old: {old_count} chunks, {len(old_doc_ids)} doc_ids")
+
+        if args.dry_run:
+            print(" DRY RUN — skipping")
+            results.append({"file": doc["upload_filename"], "old": old_count,
+                            "new": "?", "sect": "?"})
+            continue
+
+        # 2. Get text
+        text = get_text(doc)
+
+        # 3. Upload with legal strategy
+        print(" Uploading with strategy='legal'...")
+        result = upload_text_legal(
+            text, doc["upload_filename"], coll, doc["extra_metadata"])
+        new_chunks = result.get("chunks_count", 0)
+        new_doc_id = result.get("document_id", "")
+        print(f" New: {new_chunks} chunks (doc_id={new_doc_id})")
+
+        if new_chunks == 0:
+            print(" ERROR: 0 chunks — keeping old!")
+            results.append({"file": doc["upload_filename"], "old": old_count,
+                            "new": 0, "sect": 0})
+            continue
+
+        # 4. Delete old chunks (safe: new ones already exist)
+        if old_doc_ids:
+            # Exclude the new document_id just in case
+            old_doc_ids.discard(new_doc_id)
+            if old_doc_ids:
+                print(f" Deleting {len(old_doc_ids)} old doc_ids...")
+                delete_by_doc_ids(coll, old_doc_ids)
+
+        # 5. Check section rate
+        total, with_sec = check_section_rate(coll, reg_id)
+        pct = (with_sec / total * 100) if total > 0 else 0
+        print(f" Section rate: {with_sec}/{total} ({pct:.0f}%)")
+
+        results.append({"file": doc["upload_filename"], "old": old_count,
+                        "new": new_chunks, "sect": round(pct, 1)})
+
+        if i < len(DOCS):
+            time.sleep(2)
+
+    # Summary
+    print("\n" + "=" * 60)
+    print("RESULTS")
+    print("=" * 60)
+    for r in results:
+        print(f" {r['file']:<45} old={r['old']:<6} new={r['new']:<6} sect={r['sect']}%")
+
+    total_new = sum(r["new"] for r in results if isinstance(r["new"], int))
+    print(f"\nTotal new chunks: {total_new}")
+
+
+if __name__ == "__main__":
+    main()

diff --git a/embedding-service/main.py b/embedding-service/main.py
index cdcceca..fa6d9bf 100644
--- a/embedding-service/main.py
+++ b/embedding-service/main.py
@@ -389,6 +389,11 @@ _SECTION_NUMBER_RE = re.compile(
     r'|Kapitel\s+(\d+)'                    # Kapitel 2
     r'|Anhang\s+([IVXLC\d]+)'              # Anhang III
     r'|Annex\s+([IVXLC\d]+)'               # Annex XII
+    # NIST/ENISA/standard identifiers
+    r'|([A-Z]{2}\.[A-Z]{2}-\d{2})'         # GV.OC-01 (NIST CSF 2.0)
+    r'|([A-Z]{2,4}-\d+(?:\(\d+\))?)'       # AC-1, AC-1(1) (NIST controls)
+    r'|(\d+\.\d+(?:\.\d+)*)'               # 3.1, 2.3.1 (numbered sections)
+    r'|(A\d{2}(?::\d{4})?)'                # A01:2021 (OWASP)
     r')',
     re.IGNORECASE
 )
diff --git a/embedding-service/test_nist_normalization.py b/embedding-service/test_nist_normalization.py
index 6439529..53bd0c4 100644
--- a/embedding-service/test_nist_normalization.py
+++ b/embedding-service/test_nist_normalization.py
@@ -13,7 +13,9 @@ Covers:
 
 from main import (
     _normalize_pdf_text,
     _extract_section_header,
+    _parse_section_metadata,
     chunk_text_legal,
+    chunk_text_legal_structured,
 )
 
@@ -124,6 +126,66 @@ class TestNistSectionDetection:
         assert _extract_section_header(normalized) is not None
 
 
+# =========================================================================
+# Section metadata extraction (_parse_section_metadata)
+# =========================================================================
+
+class TestNistSectionMetadata:
+
+    def test_nist_control_ac1_section(self):
+        meta = _parse_section_metadata("AC-1 POLICY AND PROCEDURES")
+        assert meta["section"] == "AC-1"
+
+    def test_nist_control_au2_section(self):
+        meta = _parse_section_metadata("AU-2 Audit Events")
+        assert meta["section"] == "AU-2"
+
+    def test_nist_enhancement_section(self):
+        meta = _parse_section_metadata("AC-1(1) Policy and Procedures")
+        assert meta["section"] == "AC-1(1)"
+
+    def test_nist_csf_compound_section(self):
+        meta = _parse_section_metadata("GV.OC-01 Organizational Context")
+        assert meta["section"] == "GV.OC-01"
+
+    def test_numbered_section(self):
+        meta = _parse_section_metadata("3.1 ACCESS CONTROL")
+        assert meta["section"] == "3.1"
+
+    def test_deep_numbered_section(self):
+        meta = _parse_section_metadata("2.3.1 Subtitle")
+        assert meta["section"] == "2.3.1"
+
+    def test_owasp_section(self):
+        meta = _parse_section_metadata("A01:2021 Broken Access Control")
+        assert meta["section"] == "A01:2021"
+
+    def test_section_title_extracted(self):
+        meta = _parse_section_metadata("AC-1 POLICY AND PROCEDURES")
+        assert meta["section_title"] == "POLICY AND PROCEDURES"
+
+    def test_numbered_section_title(self):
+        meta = _parse_section_metadata("3.1 ACCESS CONTROL")
+        assert meta["section_title"] == "ACCESS CONTROL"
+
+    def test_structured_chunks_have_section(self):
+        text = (
+            "3.1 ACCESS CONTROL\n"
+            "Overview of access control family.\n\n"
+            "AC-1 POLICY AND PROCEDURES\n"
+            "The organization develops, documents, and disseminates an access "
+            "control policy that addresses purpose, scope, roles, responsibilities, "
+            "management commitment, coordination among entities.\n\n"
+            "AC-2 ACCOUNT MANAGEMENT\n"
+            "The information system enforces approved authorizations for logical "
+            "access to information and system resources.\n"
+        )
+        result = chunk_text_legal_structured(text, chunk_size=300, overlap=50)
+        sections = [r.get("section", "") for r in result]
+        assert any(s == "AC-1" for s in sections)
+        assert any(s == "AC-2" for s in sections)
+
+
 # =========================================================================
 # Chunking with NIST-style text
 # =========================================================================
_parse_section_metadata("AC-1(1) Policy and Procedures") + assert meta["section"] == "AC-1(1)" + + def test_nist_csf_compound_section(self): + meta = _parse_section_metadata("GV.OC-01 Organizational Context") + assert meta["section"] == "GV.OC-01" + + def test_numbered_section(self): + meta = _parse_section_metadata("3.1 ACCESS CONTROL") + assert meta["section"] == "3.1" + + def test_deep_numbered_section(self): + meta = _parse_section_metadata("2.3.1 Subtitle") + assert meta["section"] == "2.3.1" + + def test_owasp_section(self): + meta = _parse_section_metadata("A01:2021 Broken Access Control") + assert meta["section"] == "A01:2021" + + def test_section_title_extracted(self): + meta = _parse_section_metadata("AC-1 POLICY AND PROCEDURES") + assert meta["section_title"] == "POLICY AND PROCEDURES" + + def test_numbered_section_title(self): + meta = _parse_section_metadata("3.1 ACCESS CONTROL") + assert meta["section_title"] == "ACCESS CONTROL" + + def test_structured_chunks_have_section(self): + text = ( + "3.1 ACCESS CONTROL\n" + "Overview of access control family.\n\n" + "AC-1 POLICY AND PROCEDURES\n" + "The organization develops, documents, and disseminates an access " + "control policy that addresses purpose, scope, roles, responsibilities, " + "management commitment, coordination among entities.\n\n" + "AC-2 ACCOUNT MANAGEMENT\n" + "The information system enforces approved authorizations for logical " + "access to information and system resources.\n" + ) + result = chunk_text_legal_structured(text, chunk_size=300, overlap=50) + sections = [r.get("section", "") for r in result] + assert any(s == "AC-1" for s in sections) + assert any(s == "AC-2" for s in sections) + + # ========================================================================= # Chunking with NIST-style text # ========================================================================= diff --git a/scripts/qdrant-snapshot.sh b/scripts/qdrant-snapshot.sh new file mode 100755 index 0000000..8e8cbd2 --- /dev/null +++ b/scripts/qdrant-snapshot.sh @@ -0,0 +1,65 @@ +#!/bin/bash +# Qdrant Snapshot — erstellt Snapshots aller Collections +# +# Usage: +# bash scripts/qdrant-snapshot.sh # Create snapshots +# bash scripts/qdrant-snapshot.sh --list # List existing snapshots +# bash scripts/qdrant-snapshot.sh --restore # Restore (interactive) +# +# Snapshots werden im Qdrant-Volume unter /qdrant/storage/snapshots/ gespeichert. +# Zusaetzlich werden sie nach ./backups/qdrant/ kopiert. 
+
+set -euo pipefail
+
+QDRANT_URL="${QDRANT_URL:-http://localhost:6333}"
+BACKUP_DIR="${BACKUP_DIR:-./backups/qdrant}"
+TIMESTAMP=$(date +%Y%m%d_%H%M%S)
+
+# --- List existing snapshots ---
+if [[ "${1:-}" == "--list" ]]; then
+    echo "=== Qdrant Snapshots ==="
+    for coll in $(curl -sf "$QDRANT_URL/collections" | python3 -c "import sys,json; [print(c['name']) for c in json.load(sys.stdin)['result']['collections']]"); do
+        echo ""
+        echo "Collection: $coll"
+        curl -sf "$QDRANT_URL/collections/$coll/snapshots" | python3 -c "
+import sys, json
+snaps = json.load(sys.stdin).get('result', [])
+if not snaps:
+    print('  (no snapshots)')
+else:
+    for s in snaps:
+        print(f\"  {s['name']}  size={s.get('size',0)/(1024*1024):.1f}MB\")
+"
+    done
+    exit 0
+fi
+
+# --- Create snapshots ---
+echo "=== Creating Qdrant Snapshots ($TIMESTAMP) ==="
+mkdir -p "$BACKUP_DIR"
+
+COLLECTIONS=$(curl -sf "$QDRANT_URL/collections" | python3 -c "import sys,json; [print(c['name']) for c in json.load(sys.stdin)['result']['collections']]")
+
+for coll in $COLLECTIONS; do
+    echo ""
+    echo "[$coll] Creating snapshot..."
+
+    SNAP=$(curl -sf -X POST "$QDRANT_URL/collections/$coll/snapshots" | python3 -c "import sys,json; print(json.load(sys.stdin)['result']['name'])")
+
+    if [[ -z "$SNAP" ]]; then
+        echo "[$coll] ERROR: snapshot creation failed"
+        continue
+    fi
+
+    echo "[$coll] Snapshot: $SNAP"
+
+    # Download snapshot to backup dir
+    OUTFILE="$BACKUP_DIR/${coll}_${TIMESTAMP}.snapshot"
+    curl -sf "$QDRANT_URL/collections/$coll/snapshots/$SNAP" -o "$OUTFILE"
+    SIZE=$(du -h "$OUTFILE" | cut -f1)
+    echo "[$coll] Saved: $OUTFILE ($SIZE)"
+done
+
+echo ""
+echo "=== Done ==="
+ls -lh "$BACKUP_DIR"/*_${TIMESTAMP}.snapshot 2>/dev/null || echo "No snapshots created"