feat(embedding): NIST PDF text normalization + safe re-ingest script
Fix broken multi-column PDF extraction for NIST/BSI/ENISA documents:
- _normalize_pdf_text(): fixes broken section numbers (1 . 1 → 1.1), control IDs (AC - 1 → AC-1), ligatures, soft hyphens
- pdfplumber tolerances increased (x=3, y=4) for better column handling
- 3 new regex patterns: NIST CSF 2.0, NIST enhancements, OWASP Top 10
- reingest_nist.py: safe upload-before-delete for 4 lost NIST PDFs
- reingest_d5.py: safety fix — upload first, verify, then delete old

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -180,6 +180,29 @@ def delete_old_chunks(qdrant_url: str, collection: str, object_name: str) -> int
|
|||||||
return 0 # Qdrant delete doesn't return count
|
return 0 # Qdrant delete doesn't return count
|
||||||
|
|
||||||
|
|
||||||
|
def _delete_old_chunks_safe(
    qdrant_url: str, collection: str, object_name: str, keep_doc_id: str,
) -> None:
    """Delete old chunks for a document, keeping chunks with keep_doc_id.

    Sends one filtered delete to Qdrant: every point whose ``object_name``
    payload matches is removed, except points whose ``document_id`` payload
    equals ``keep_doc_id``.

    Args:
        qdrant_url: Base URL of the Qdrant REST API.
        collection: Name of the collection holding the chunks.
        object_name: Value of the ``object_name`` payload field to match.
        keep_doc_id: ``document_id`` of the freshly uploaded chunks to keep.

    Raises:
        ValueError: If ``keep_doc_id`` is empty. An empty id excludes nothing,
            so the delete would also wipe the chunks that were just uploaded.
        httpx.HTTPStatusError: If Qdrant answers with an error status.
    """
    if not keep_doc_id:
        # Refuse rather than fall back to "delete everything for this object":
        # that is exactly the data loss the upload-before-delete order avoids.
        raise ValueError(
            f"keep_doc_id is empty; refusing to delete chunks for {object_name!r}"
        )

    delete_filter = {
        "must": [{
            "key": "object_name",
            "match": {"value": object_name},
        }],
        "must_not": [{
            "key": "document_id",
            "match": {"value": keep_doc_id},
        }],
    }
    with httpx.Client(timeout=30.0) as c:
        resp = c.post(
            f"{qdrant_url}/collections/{collection}/points/delete",
            json={"filter": delete_filter},
        )
        resp.raise_for_status()
|
||||||
|
|
||||||
|
|
||||||
def reupload_document(
|
def reupload_document(
|
||||||
rag_url: str,
|
rag_url: str,
|
||||||
file_bytes: bytes,
|
file_bytes: bytes,
|
||||||
@@ -220,7 +243,11 @@ def process_document(
|
|||||||
progress: dict,
|
progress: dict,
|
||||||
max_retries: int = 2,
|
max_retries: int = 2,
|
||||||
) -> bool:
|
) -> bool:
|
||||||
"""Process a single document: download → delete → re-upload. Returns success."""
|
"""Process a single document: download → upload → verify → delete old.
|
||||||
|
|
||||||
|
Safe order: new chunks are created FIRST, old chunks deleted only after
|
||||||
|
successful verification (upload-before-delete pattern).
|
||||||
|
"""
|
||||||
key = doc_key(doc["object_name"], doc["collection"])
|
key = doc_key(doc["object_name"], doc["collection"])
|
||||||
|
|
||||||
# Skip if already done
|
# Skip if already done
|
||||||
@@ -237,20 +264,32 @@ def process_document(
|
|||||||
"status": "skipped", "reason": "empty_file"}
|
"status": "skipped", "reason": "empty_file"}
|
||||||
return False
|
return False
|
||||||
|
|
||||||
# 2. Delete old chunks
|
# 2. Upload FIRST (creates new chunks alongside old ones)
|
||||||
delete_old_chunks(qdrant_url, doc["collection"], doc["object_name"])
|
|
||||||
|
|
||||||
# 3. Re-upload
|
|
||||||
result = reupload_document(
|
result = reupload_document(
|
||||||
rag_url, file_bytes, doc["filename"],
|
rag_url, file_bytes, doc["filename"],
|
||||||
doc["collection"], doc["form"], doc["extra_metadata"],
|
doc["collection"], doc["form"], doc["extra_metadata"],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
new_chunks = result.get("chunks_count", 0)
|
||||||
|
new_doc_id = result.get("document_id", "")
|
||||||
|
if new_chunks == 0:
|
||||||
|
logger.error(" Upload produced 0 chunks — keeping old data: %s",
|
||||||
|
doc["object_name"])
|
||||||
|
progress.setdefault("documents", {})[key] = {
|
||||||
|
"status": "error", "error": "0 new chunks"}
|
||||||
|
return False
|
||||||
|
|
||||||
|
# 3. Delete OLD chunks only (exclude the new document_id)
|
||||||
|
_delete_old_chunks_safe(
|
||||||
|
qdrant_url, doc["collection"],
|
||||||
|
doc["object_name"], new_doc_id,
|
||||||
|
)
|
||||||
|
|
||||||
# 4. Record success
|
# 4. Record success
|
||||||
progress.setdefault("documents", {})[key] = {
|
progress.setdefault("documents", {})[key] = {
|
||||||
"status": "done",
|
"status": "done",
|
||||||
"old_chunks": doc["old_chunk_count"],
|
"old_chunks": doc["old_chunk_count"],
|
||||||
"new_chunks": result.get("chunks_count", 0),
|
"new_chunks": new_chunks,
|
||||||
"new_document_id": result.get("document_id", ""),
|
"new_document_id": result.get("document_id", ""),
|
||||||
"completed_at": datetime.now(timezone.utc).isoformat(),
|
"completed_at": datetime.now(timezone.utc).isoformat(),
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -0,0 +1,485 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Safe re-ingestion of NIST/BSI/ENISA PDFs from MinIO.
|
||||||
|
|
||||||
|
Uses upload-before-delete pattern: new chunks are created FIRST,
|
||||||
|
old chunks are only deleted after successful verification.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python3 control-pipeline/scripts/reingest_nist.py [--dry-run]
|
||||||
|
python3 control-pipeline/scripts/reingest_nist.py --only-missing
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
sys.path.insert(0, "control-pipeline/scripts")
|
||||||
|
from reingest_d5_config import ( # noqa: E402
|
||||||
|
CHUNK_OVERLAP,
|
||||||
|
CHUNK_SIZE,
|
||||||
|
CHUNK_STRATEGY,
|
||||||
|
DEFAULT_QDRANT_URL,
|
||||||
|
DEFAULT_RAG_URL,
|
||||||
|
content_type_from_filename,
|
||||||
|
)
|
||||||
|
|
||||||
|
logging.basicConfig(
|
||||||
|
level=logging.INFO,
|
||||||
|
format="%(asctime)s [%(levelname)s] %(message)s",
|
||||||
|
)
|
||||||
|
logger = logging.getLogger("reingest-nist")
|
||||||
|
|
||||||
|
UPLOAD_TIMEOUT = 1800.0 # 30 min for large PDFs
|
||||||
|
|
||||||
|
# -------------------------------------------------------------------
|
||||||
|
# Documents to re-ingest
|
||||||
|
# -------------------------------------------------------------------
|
||||||
|
|
||||||
|
# The 4 documents that currently have 0 chunks (deleted by D5, upload failed).
# Each row is (filename, collection, extra_metadata); the MinIO object name is
# derived from the filename.
MISSING_DOCS = [
    {
        "object_name": "compliance/bund/compliance/2026/" + missing_filename,
        "collection": missing_collection,
        "filename": missing_filename,
        "extra_metadata": missing_metadata,
    }
    for missing_filename, missing_collection, missing_metadata in (
        (
            "NIST_SP_800_53r5.pdf",
            "bp_compliance_datenschutz",
            {
                "regulation_id": "nist_sp800_53r5",
                "source_id": "nist",
                "doc_type": "controls_catalog",
                "guideline_name": "NIST SP 800-53 Rev. 5 Security and Privacy Controls",
                "license": "public_domain_us_gov",
                "attribution": "NIST",
                "source": "nist.gov",
            },
        ),
        (
            "nist_sp_800_82r3.pdf",
            "bp_compliance_ce",
            {
                "regulation_id": "nist_sp_800_82r3",
                "regulation_name_de": "NIST SP 800-82 Rev. 3 — Guide to OT Security",
                "regulation_name_en": "NIST SP 800-82 Rev. 3 — Guide to OT Security",
                "regulation_short": "NIST SP 800-82",
                "category": "ot_security",
                "license": "public_domain_us",
                "source": "nist.gov",
            },
        ),
        (
            "nist_sp_800_160v1r1.pdf",
            "bp_compliance_ce",
            {
                "regulation_id": "nist_sp_800_160v1r1",
                "regulation_name_de": "NIST SP 800-160 Vol. 1 Rev. 1",
                "regulation_name_en": "NIST SP 800-160 Vol. 1 Rev. 1",
                "regulation_short": "NIST SP 800-160",
                "category": "security_engineering",
                "license": "public_domain_us",
                "source": "nist.gov",
            },
        ),
        (
            "NIST_SP_800_207.pdf",
            "bp_compliance_datenschutz",
            {
                "regulation_id": "nist_sp800_207",
                "source_id": "nist",
                "doc_type": "architecture",
                "guideline_name": "NIST SP 800-207 Zero Trust Architecture",
                "license": "public_domain_us_gov",
                "attribution": "NIST",
                "source": "nist.gov",
            },
        ),
    )
]
|
||||||
|
|
||||||
|
# Additional NIST/BSI/ENISA docs with <10% section rate (re-ingest for quality).
# Each row is (filename, collection, regulation_id, license, source); the MinIO
# object name is derived from the filename.
LOW_QUALITY_DOCS = [
    {
        "object_name": "compliance/bund/compliance/2026/" + lq_filename,
        "collection": lq_collection,
        "filename": lq_filename,
        "extra_metadata": {
            "regulation_id": lq_regulation_id,
            "license": lq_license,
            "source": lq_source,
        },
    }
    for lq_filename, lq_collection, lq_regulation_id, lq_license, lq_source in (
        ("nist_csf_2_0.pdf", "bp_compliance_datenschutz",
         "nist_csf_2_0", "public_domain_us", "nist.gov"),
        ("nistir_8259a.pdf", "bp_compliance_datenschutz",
         "nistir_8259a", "public_domain_us", "nist.gov"),
        ("nist_ai_rmf.pdf", "bp_compliance_datenschutz",
         "nist_ai_rmf", "public_domain_us", "nist.gov"),
        ("nist_sp_800_30r1.pdf", "bp_compliance_ce",
         "nist_sp_800_30r1", "public_domain_us", "nist.gov"),
        ("enisa_supply_chain_good_practices.pdf", "bp_compliance_ce",
         "enisa_supply_chain_good_practices", "reuse_with_attribution", "enisa.europa.eu"),
        ("enisa_ics_scada.pdf", "bp_compliance_ce",
         "enisa_ics_scada_dependencies", "reuse_with_attribution", "enisa.europa.eu"),
        ("enisa_supply_chain_security.pdf", "bp_compliance_ce",
         "enisa_threat_landscape_supply_chain", "reuse_with_attribution", "enisa.europa.eu"),
        ("cisa_secure_by_design.pdf", "bp_compliance_ce",
         "cisa_secure_by_design", "public_domain_us", "cisa.gov"),
        ("cvss_v4_0.pdf", "bp_compliance_ce",
         "cvss_v4_0", "public_domain_us", "first.org"),
    )
]
|
||||||
|
|
||||||
|
|
||||||
|
# -------------------------------------------------------------------
|
||||||
|
# Qdrant helpers
|
||||||
|
# -------------------------------------------------------------------
|
||||||
|
def count_chunks(qdrant_url: str, collection: str, object_name: str) -> int:
    """Return the exact number of points stored for ``object_name``.

    Posts a filtered, exact count request to the collection's
    ``points/count`` endpoint and returns ``result.count`` from the reply.
    Raises ``httpx.HTTPStatusError`` on an error response.
    """
    endpoint = f"{qdrant_url}/collections/{collection}/points/count"
    by_object = {
        "must": [{
            "key": "object_name",
            "match": {"value": object_name},
        }]
    }
    with httpx.Client(timeout=30.0) as client:
        response = client.post(endpoint, json={"filter": by_object, "exact": True})
        response.raise_for_status()
        return response.json()["result"]["count"]
|
||||||
|
|
||||||
|
|
||||||
|
def get_old_document_ids(
    qdrant_url: str, collection: str, object_name: str,
) -> set:
    """Collect the distinct ``document_id`` values of a document's chunks.

    Scrolls through all points whose ``object_name`` matches, 100 at a time,
    following ``next_page_offset`` until the server stops returning one.
    Points without a (truthy) ``document_id`` are ignored.
    """
    scroll_url = f"{qdrant_url}/collections/{collection}/points/scroll"
    found = set()
    cursor = None
    with httpx.Client(timeout=60.0) as client:
        while True:
            request = {
                "filter": {
                    "must": [{
                        "key": "object_name",
                        "match": {"value": object_name},
                    }]
                },
                "limit": 100,
                "with_payload": ["document_id"],
            }
            if cursor is not None:
                request["offset"] = cursor

            response = client.post(scroll_url, json=request)
            response.raise_for_status()
            page = response.json()["result"]

            page_ids = (
                point.get("payload", {}).get("document_id")
                for point in page["points"]
            )
            found.update(doc_id for doc_id in page_ids if doc_id)

            cursor = page.get("next_page_offset")
            if cursor is None:
                break
    return found
|
||||||
|
|
||||||
|
|
||||||
|
def delete_by_document_ids(
    qdrant_url: str, collection: str, doc_ids: set,
) -> None:
    """Delete all chunks whose ``document_id`` is in ``doc_ids``.

    One filtered delete request is sent per id, all over a single shared
    HTTP client so the connection is reused instead of re-opened per id.

    Args:
        qdrant_url: Base URL of the Qdrant REST API.
        collection: Name of the collection holding the chunks.
        doc_ids: ``document_id`` values whose chunks should be removed.

    Raises:
        httpx.HTTPStatusError: If any delete request returns an error status;
            ids after the failing one are not processed.
    """
    if not doc_ids:
        # Nothing to delete — avoid opening a client at all.
        return

    delete_url = f"{qdrant_url}/collections/{collection}/points/delete"
    with httpx.Client(timeout=30.0) as c:
        for did in doc_ids:
            c.post(
                delete_url,
                json={
                    "filter": {
                        "must": [{
                            "key": "document_id",
                            "match": {"value": did},
                        }]
                    }
                },
            ).raise_for_status()
|
||||||
|
|
||||||
|
|
||||||
|
def check_section_rate(
    qdrant_url: str, collection: str, object_name: str,
) -> tuple:
    """Count how many of a document's chunks carry a section label.

    Scrolls through every point whose ``object_name`` matches (100 per page)
    and inspects the ``section`` payload field.

    Returns:
        ``(total, with_section)`` — the number of chunks seen and the number
        whose ``section`` is a non-blank string.
    """
    scroll_url = f"{qdrant_url}/collections/{collection}/points/scroll"
    seen = 0
    labelled = 0
    cursor = None
    with httpx.Client(timeout=60.0) as client:
        while True:
            request = {
                "filter": {
                    "must": [{
                        "key": "object_name",
                        "match": {"value": object_name},
                    }]
                },
                "limit": 100,
                "with_payload": ["section"],
            }
            if cursor is not None:
                request["offset"] = cursor

            response = client.post(scroll_url, json=request)
            response.raise_for_status()
            page = response.json()["result"]

            for point in page["points"]:
                seen += 1
                section = point.get("payload", {}).get("section", "")
                if section and section.strip():
                    labelled += 1

            cursor = page.get("next_page_offset")
            if cursor is None:
                break
    return seen, labelled
|
||||||
|
|
||||||
|
|
||||||
|
# -------------------------------------------------------------------
|
||||||
|
# Upload
|
||||||
|
# -------------------------------------------------------------------
|
||||||
|
def download_from_minio(rag_url: str, object_name: str) -> bytes:
    """Fetch a stored file's bytes via a URL handed out by the RAG service.

    First asks the RAG service's download endpoint for ``object_name`` and
    reads the ``url`` field of its JSON reply, then downloads that URL and
    returns the response body. Either request raises
    ``httpx.HTTPStatusError`` on an error status.
    """
    # NOTE(review): verify=False turns off TLS certificate verification for
    # both requests — confirm this is intended (e.g. self-signed internal certs).
    lookup_url = f"{rag_url}/api/v1/documents/download/{object_name}"
    with httpx.Client(timeout=60.0, verify=False) as api:
        lookup = api.get(lookup_url)
        lookup.raise_for_status()
        presigned_url = lookup.json()["url"]

    # The file transfer gets a longer timeout than the URL lookup.
    with httpx.Client(timeout=300.0, verify=False) as storage:
        blob = storage.get(presigned_url)
        blob.raise_for_status()
        return blob.content
|
||||||
|
|
||||||
|
|
||||||
|
def upload_document(
    rag_url: str,
    file_bytes: bytes,
    filename: str,
    collection: str,
    extra_metadata: dict,
) -> dict:
    """Send a file to the RAG service's upload endpoint and return its JSON reply.

    The file goes out as a multipart ``file`` part; the chunking settings
    (``CHUNK_STRATEGY``, ``CHUNK_SIZE``, ``CHUNK_OVERLAP``) and the
    JSON-encoded ``extra_metadata`` are sent as form fields.
    Raises ``httpx.HTTPStatusError`` on an error response.
    """
    upload_url = f"{rag_url}/api/v1/documents/upload"
    mime_type = content_type_from_filename(filename)
    fields = {
        "collection": collection,
        "data_type": "compliance",
        "bundesland": "bund",
        "use_case": "compliance",
        "year": "2026",
        "chunk_strategy": CHUNK_STRATEGY,
        "chunk_size": str(CHUNK_SIZE),
        "chunk_overlap": str(CHUNK_OVERLAP),
        "metadata_json": json.dumps(extra_metadata, ensure_ascii=False),
    }
    multipart = {"file": (filename, file_bytes, mime_type)}
    # NOTE(review): verify=False turns off TLS certificate verification.
    with httpx.Client(timeout=UPLOAD_TIMEOUT, verify=False) as client:
        response = client.post(upload_url, files=multipart, data=fields)
        response.raise_for_status()
        return response.json()
|
||||||
|
|
||||||
|
|
||||||
|
# -------------------------------------------------------------------
|
||||||
|
# Main processing
|
||||||
|
# -------------------------------------------------------------------
|
||||||
|
def process_document(
    doc: dict,
    rag_url: str,
    qdrant_url: str,
    dry_run: bool = False,
) -> dict:
    """Safe re-ingest: upload first, then delete old. Returns result dict.

    Steps: record the existing chunk count and document_ids, download the
    file, upload it (creating new chunks), and only if the upload reports at
    least one chunk delete the previously recorded document_ids. Finally the
    section rate of the chunks now stored for the object is measured.

    Args:
        doc: Entry with ``object_name``, ``collection``, ``filename`` and
            ``extra_metadata`` keys.
        rag_url: Base URL of the RAG service.
        qdrant_url: Base URL of the Qdrant REST API.
        dry_run: If true, only report the existing state and change nothing.

    Returns:
        A dict with a ``status`` of ``"dry_run"``, ``"error"`` or ``"ok"``
        plus chunk counts; ``"ok"`` results also carry ``new_document_id``
        and ``section_rate`` (percent, one decimal).
    """
    obj = doc["object_name"]
    coll = doc["collection"]
    fname = doc["filename"]

    # 1. Check existing state
    old_count = count_chunks(qdrant_url, coll, obj)
    old_doc_ids = get_old_document_ids(qdrant_url, coll, obj) if old_count > 0 else set()
    logger.info(" [%s] existing: %d chunks, %d document_ids",
                fname, old_count, len(old_doc_ids))

    if dry_run:
        logger.info(" [%s] DRY RUN — would download + upload + delete old", fname)
        return {"status": "dry_run", "old_chunks": old_count}

    # 2. Download from MinIO
    logger.info(" [%s] downloading from MinIO...", fname)
    file_bytes = download_from_minio(rag_url, obj)
    size_mb = len(file_bytes) / (1024 * 1024)
    logger.info(" [%s] downloaded %.1f MB", fname, size_mb)

    # 3. Upload FIRST (creates new chunks)
    logger.info(" [%s] uploading to RAG service...", fname)
    result = upload_document(rag_url, file_bytes, fname, coll, doc["extra_metadata"])
    new_chunks = result.get("chunks_count", 0)
    new_doc_id = result.get("document_id", "")
    logger.info(" [%s] uploaded: %d new chunks (doc_id=%s)", fname, new_chunks, new_doc_id)

    # 4. Verify new chunks exist
    if new_chunks == 0:
        logger.error(" [%s] UPLOAD PRODUCED 0 CHUNKS — keeping old data!", fname)
        return {"status": "error", "error": "0 new chunks", "old_chunks": old_count}

    # 5. Delete old chunks (only if there were any).
    # Never delete the id the upload just returned: if the service reuses a
    # document_id for the same file, it is also in old_doc_ids and deleting it
    # would wipe the chunks that were just created.
    stale_doc_ids = old_doc_ids - {new_doc_id}
    if len(stale_doc_ids) != len(old_doc_ids):
        logger.warning(" [%s] new document_id %s already existed — excluded from delete",
                       fname, new_doc_id)
    if stale_doc_ids:
        logger.info(" [%s] deleting %d old document_ids...", fname, len(stale_doc_ids))
        delete_by_document_ids(qdrant_url, coll, stale_doc_ids)
        logger.info(" [%s] old chunks deleted", fname)

    # 6. Check section rate
    total, with_sec = check_section_rate(qdrant_url, coll, obj)
    pct = (with_sec / total * 100) if total > 0 else 0
    logger.info(" [%s] section rate: %d/%d (%.0f%%)", fname, with_sec, total, pct)

    return {
        "status": "ok",
        "old_chunks": old_count,
        "new_chunks": new_chunks,
        "new_document_id": new_doc_id,
        "section_rate": round(pct, 1),
    }
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Command-line entry point: re-ingest the configured documents.

    Processes ``MISSING_DOCS`` and, unless ``--only-missing`` is given,
    ``LOW_QUALITY_DOCS`` one by one, pausing two seconds between documents.
    A failure in one document is logged and counted but does not stop the
    run. A per-document summary is logged at the end, and the process exits
    with status 1 if any document failed.
    """
    parser = argparse.ArgumentParser(description="Safe NIST/BSI/ENISA re-ingestion")
    parser.add_argument("--dry-run", action="store_true", help="Show what would happen")
    parser.add_argument("--only-missing", action="store_true",
                        help="Only re-ingest the 4 missing docs (skip low-quality)")
    parser.add_argument("--rag-url", default=DEFAULT_RAG_URL)
    parser.add_argument("--qdrant-url", default=DEFAULT_QDRANT_URL)
    args = parser.parse_args()

    docs = list(MISSING_DOCS)
    if not args.only_missing:
        docs.extend(LOW_QUALITY_DOCS)

    logger.info("=" * 60)
    logger.info("NIST/BSI/ENISA Safe Re-Ingestion")
    logger.info(" Documents: %d (%d missing + %d low-quality)",
                len(docs), len(MISSING_DOCS),
                0 if args.only_missing else len(LOW_QUALITY_DOCS))
    logger.info(" RAG: %s", args.rag_url)
    logger.info(" Qdrant: %s", args.qdrant_url)
    logger.info(" Dry run: %s", args.dry_run)
    logger.info("=" * 60)

    results = {}
    ok = 0
    errors = 0

    for i, doc in enumerate(docs, 1):
        logger.info("[%d/%d] %s → %s", i, len(docs), doc["filename"], doc["collection"])
        try:
            r = process_document(doc, args.rag_url, args.qdrant_url, args.dry_run)
            results[doc["filename"]] = r
            if r["status"] == "ok":
                ok += 1
            elif r["status"] == "error":
                errors += 1
        except Exception as e:
            # Top-level boundary: keep going so one bad document does not
            # abort the whole batch; the failure is reported in the summary.
            logger.error(" FAILED: %s", e)
            results[doc["filename"]] = {"status": "error", "error": str(e)}
            errors += 1

        # Brief pause between documents (not after the last one).
        if i < len(docs):
            time.sleep(2)

    # Summary
    logger.info("")
    logger.info("=" * 60)
    logger.info("RESULTS")
    logger.info("=" * 60)
    for fname, r in results.items():
        status = r["status"].upper()
        old = r.get("old_chunks", "?")
        new = r.get("new_chunks", "?")
        sec = r.get("section_rate")
        # Dry-run and error rows have no section rate; show "n/a" so they
        # cannot be mistaken for a measured 0%.
        sect = f"{sec:.0f}%" if isinstance(sec, (int, float)) else "n/a"
        logger.info(" %-40s %s old=%s new=%s sect=%s",
                    fname, status, old, new, sect)

    logger.info("")
    logger.info("OK: %d, Errors: %d, Total: %d", ok, errors, len(docs))

    if errors > 0:
        sys.exit(1)


if __name__ == "__main__":
    main()
|
||||||
@@ -12,6 +12,7 @@ This service handles all ML-heavy operations, keeping the main klausur-service l
|
|||||||
|
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
|
import unicodedata
|
||||||
from typing import List, Optional
|
from typing import List, Optional
|
||||||
from contextlib import asynccontextmanager
|
from contextlib import asynccontextmanager
|
||||||
|
|
||||||
@@ -299,6 +300,9 @@ _LEGAL_SECTION_RE = re.compile(
|
|||||||
# NIST/ENISA/standard numbering
|
# NIST/ENISA/standard numbering
|
||||||
r'|\d+\.\d+(?:\.\d+)*\s+[A-ZÄÖÜ]' # 1.1 Title, 2.3.1 Subtitle
|
r'|\d+\.\d+(?:\.\d+)*\s+[A-ZÄÖÜ]' # 1.1 Title, 2.3.1 Subtitle
|
||||||
r'|[A-Z]{2,4}[-\.]\d+(?:\.\d+)*\b' # AC-1, AU-2, PO.1, PW.1.1
|
r'|[A-Z]{2,4}[-\.]\d+(?:\.\d+)*\b' # AC-1, AU-2, PO.1, PW.1.1
|
||||||
|
r'|[A-Z]{2}\.[A-Z]{2}-\d{2}\b' # GV.OC-01 (NIST CSF 2.0)
|
||||||
|
r'|[A-Z]{2,4}-\d+\(\d+\)' # AC-1(1) (NIST enhancements)
|
||||||
|
r'|A\d{2}(?::\d{4})?\b' # A01:2021 (OWASP Top 10)
|
||||||
r'|Table\s+\d+' # Table 1, Table A-1
|
r'|Table\s+\d+' # Table 1, Table A-1
|
||||||
r'|Figure\s+\d+' # Figure 1
|
r'|Figure\s+\d+' # Figure 1
|
||||||
r'|Appendix\s+[A-Z\d]' # Appendix A, Appendix 1
|
r'|Appendix\s+[A-Z\d]' # Appendix A, Appendix 1
|
||||||
@@ -827,6 +831,34 @@ def extract_pdf_unstructured(pdf_content: bytes) -> ExtractPDFResponse:
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
# Patterns for _normalize_pdf_text. "[^\S\n]" means "whitespace except newline":
# repairs must never join two lines, since line starts drive section detection.
_BROKEN_SECTION_NUM_RE = re.compile(r'(?<=\d)[^\S\n]+\.[^\S\n]+(?=\d)')
_BROKEN_CONTROL_ID_RE = re.compile(r'\b([A-Z]{2,4})[^\S\n]+-[^\S\n]+(\d+)\b')
_BROKEN_CSF_ID_RE = re.compile(
    r'\b([A-Z]{2})[^\S\n]*\.[^\S\n]*([A-Z]{2})[^\S\n]*-[^\S\n]*(\d{2})\b'
)
_SPACED_PARENS_RE = re.compile(r'\([^\S\n]*(\d+)[^\S\n]*\)')
_DETACHED_ENHANCEMENT_RE = re.compile(r'\b([A-Z]{2,4}-\d+)[^\S\n]+(\(\d+\))')
_MULTI_HSPACE_RE = re.compile(r'[^\S\n]{2,}')


def _normalize_pdf_text(text: str) -> str:
    """Fix broken spacing from multi-column PDF extraction.

    pdfplumber/pypdf often break section numbers in multi-column NIST/BSI/ENISA
    PDFs: "1 . 1" instead of "1.1", "AC - 1" instead of "AC-1".

    All repairs operate within a single line; newlines are never removed.

    Args:
        text: Raw text as returned by the PDF extraction backend.

    Returns:
        The text with compatibility characters folded (NFKC), soft hyphens and
        zero-width spaces removed, broken identifiers re-joined and runs of
        horizontal whitespace collapsed to one space.
    """
    # Unicode NFKC: fold compatibility characters before other fixes, e.g. the
    # ligature U+FB01 becomes the two letters "fi" and NBSP becomes a space.
    text = unicodedata.normalize('NFKC', text)
    # Remove soft hyphens and zero-width spaces
    text = text.replace('\u00ad', '').replace('\u200b', '')
    # "1 . 1" → "1.1". Lookarounds leave the digits unconsumed, so nested
    # numbers such as "2 . 3 . 1" are repaired in a single pass.
    text = _BROKEN_SECTION_NUM_RE.sub('.', text)
    # "AC - 1" → "AC-1" (broken NIST control IDs, 2-4 uppercase letters)
    text = _BROKEN_CONTROL_ID_RE.sub(r'\1-\2', text)
    # "GV . OC - 01" → "GV.OC-01" (NIST CSF 2.0 compound IDs)
    text = _BROKEN_CSF_ID_RE.sub(r'\1.\2-\3', text)
    # "( 1 )" → "(1)" (spaced parentheses around a number)
    text = _SPACED_PARENS_RE.sub(r'(\1)', text)
    # "AC-1 (1)" → "AC-1(1)" (re-attach a NIST enhancement to its control ID)
    text = _DETACHED_ENHANCEMENT_RE.sub(r'\1\2', text)
    # Collapse multiple horizontal spaces (keep newlines)
    text = _MULTI_HSPACE_RE.sub(' ', text)
    return text
|
||||||
|
|
||||||
|
|
||||||
def extract_pdf_pdfplumber(pdf_content: bytes) -> ExtractPDFResponse:
|
def extract_pdf_pdfplumber(pdf_content: bytes) -> ExtractPDFResponse:
|
||||||
"""Extract PDF using pdfplumber (best for multi-column EU regulation PDFs)."""
|
"""Extract PDF using pdfplumber (best for multi-column EU regulation PDFs)."""
|
||||||
import io
|
import io
|
||||||
@@ -839,12 +871,12 @@ def extract_pdf_pdfplumber(pdf_content: bytes) -> ExtractPDFResponse:
|
|||||||
with pdfplumber.open(pdf_file) as pdf:
|
with pdfplumber.open(pdf_file) as pdf:
|
||||||
page_count = len(pdf.pages)
|
page_count = len(pdf.pages)
|
||||||
for page in pdf.pages:
|
for page in pdf.pages:
|
||||||
text = page.extract_text(x_tolerance=2, y_tolerance=3)
|
text = page.extract_text(x_tolerance=3, y_tolerance=4)
|
||||||
if text:
|
if text:
|
||||||
text_parts.append(text)
|
text_parts.append(text)
|
||||||
|
|
||||||
return ExtractPDFResponse(
|
return ExtractPDFResponse(
|
||||||
text="\n\n".join(text_parts),
|
text=_normalize_pdf_text("\n\n".join(text_parts)),
|
||||||
backend_used="pdfplumber",
|
backend_used="pdfplumber",
|
||||||
pages=page_count,
|
pages=page_count,
|
||||||
table_count=0,
|
table_count=0,
|
||||||
@@ -866,7 +898,7 @@ def extract_pdf_pypdf(pdf_content: bytes) -> ExtractPDFResponse:
|
|||||||
text_parts.append(text)
|
text_parts.append(text)
|
||||||
|
|
||||||
return ExtractPDFResponse(
|
return ExtractPDFResponse(
|
||||||
text="\n\n".join(text_parts),
|
text=_normalize_pdf_text("\n\n".join(text_parts)),
|
||||||
backend_used="pypdf",
|
backend_used="pypdf",
|
||||||
pages=len(reader.pages),
|
pages=len(reader.pages),
|
||||||
table_count=0
|
table_count=0
|
||||||
|
|||||||
@@ -0,0 +1,173 @@
|
|||||||
|
"""
|
||||||
|
Tests for NIST/BSI/ENISA PDF text normalization and section detection.
|
||||||
|
|
||||||
|
Covers:
|
||||||
|
- _normalize_pdf_text() fixing broken multi-column PDF artifacts
|
||||||
|
- Section detection after normalization
|
||||||
|
- NIST CSF 2.0 compound IDs (GV.OC-01)
|
||||||
|
- NIST SP 800-53 control IDs (AC-1, AC-1(1))
|
||||||
|
- OWASP Top 10 IDs (A01:2021)
|
||||||
|
- Unicode normalization (ligatures, soft hyphens)
|
||||||
|
"""
|
||||||
|
|
||||||
|
from main import (
|
||||||
|
_normalize_pdf_text,
|
||||||
|
_extract_section_header,
|
||||||
|
chunk_text_legal,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# =========================================================================
|
||||||
|
# _normalize_pdf_text — broken spacing fixes
|
||||||
|
# =========================================================================
|
||||||
|
|
||||||
|
class TestNormalizePdfText:
    """Unit tests for the spacing and Unicode repairs in _normalize_pdf_text()."""

    def test_broken_section_number(self):
        assert _normalize_pdf_text("1 . 1 Risk Framing") == "1.1 Risk Framing"

    def test_nested_section_number(self):
        assert _normalize_pdf_text("2 . 3 . 1 Subtitle") == "2.3.1 Subtitle"

    def test_broken_nist_control_id(self):
        assert _normalize_pdf_text("AC - 1 Account Management") == "AC-1 Account Management"

    def test_broken_nist_control_au(self):
        assert _normalize_pdf_text("AU - 2 Audit Events") == "AU-2 Audit Events"

    def test_broken_csf_compound_id(self):
        assert _normalize_pdf_text("GV . OC - 01 Context") == "GV.OC-01 Context"

    def test_broken_enhancement_parens(self):
        assert _normalize_pdf_text("AC-1( 1 ) Enhancement") == "AC-1(1) Enhancement"

    def test_soft_hyphen_removed(self):
        assert _normalize_pdf_text("infor\u00admation") == "information"

    def test_zero_width_space_removed(self):
        assert _normalize_pdf_text("data\u200bprotection") == "dataprotection"

    def test_ligature_fi_normalized(self):
        # U+FB01 = fi ligature
        assert _normalize_pdf_text("con\ufb01dential") == "confidential"

    def test_ligature_fl_normalized(self):
        # U+FB02 = fl ligature
        assert _normalize_pdf_text("over\ufb02ow") == "overflow"

    def test_multiple_spaces_collapsed(self):
        # The input must really contain runs of spaces, otherwise the
        # assertion passes without exercising the collapse step.
        assert _normalize_pdf_text("too   many    spaces") == "too many spaces"

    def test_newlines_preserved(self):
        # Single and double newlines must survive untouched.
        text = "line one\nline two\n\nline three"
        assert _normalize_pdf_text(text) == text

    def test_normal_text_unchanged(self):
        text = "AC-1 Account Management requires proper controls."
        assert _normalize_pdf_text(text) == text

    def test_combined_artifacts(self):
        """Multiple broken artifacts in one text block."""
        broken = "1 . 1 Overview\nAC - 1 Account Management\nGV . OC - 01 Context"
        fixed = _normalize_pdf_text(broken)
        assert "1.1 Overview" in fixed
        assert "AC-1 Account Management" in fixed
        assert "GV.OC-01 Context" in fixed
|
||||||
|
|
||||||
|
|
||||||
|
# =========================================================================
|
||||||
|
# Section detection after normalization
|
||||||
|
# =========================================================================
|
||||||
|
|
||||||
|
class TestNistSectionDetection:
    """Checks that _extract_section_header() recognises NIST/OWASP-style headings."""

    def test_nist_control_ac1(self):
        header = _extract_section_header("AC-1 Account Management")
        assert header is not None

    def test_nist_control_au2(self):
        header = _extract_section_header("AU-2 Audit Events")
        assert header is not None

    def test_nist_csf_compound(self):
        header = _extract_section_header("GV.OC-01 Organizational Context")
        assert header is not None

    def test_nist_enhancement(self):
        header = _extract_section_header("AC-1(1) Policy and Procedures")
        assert header is not None

    def test_owasp_top10(self):
        header = _extract_section_header("A01:2021 Broken Access Control")
        assert header is not None

    def test_owasp_without_year(self):
        header = _extract_section_header("A03 Injection")
        assert header is not None

    def test_numbered_section(self):
        header = _extract_section_header("2.1 Risk Framing")
        assert header is not None

    def test_deep_numbered_section(self):
        header = _extract_section_header("3.2.1 Assessment Methodology")
        assert header is not None

    def test_broken_then_normalized_detects(self):
        """After normalization, broken NIST IDs should be detected as sections."""
        repaired = _normalize_pdf_text("AC - 1 Account Management")
        assert _extract_section_header(repaired) is not None

    def test_broken_csf_then_normalized_detects(self):
        """A spaced-out CSF 2.0 ID is detected once normalized."""
        repaired = _normalize_pdf_text("GV . OC - 01 Organizational Context")
        assert _extract_section_header(repaired) is not None

    def test_broken_section_num_then_normalized(self):
        """A spaced-out section number is detected once normalized."""
        repaired = _normalize_pdf_text("2 . 1 Risk Framing")
        assert _extract_section_header(repaired) is not None
|
||||||
|
|
||||||
|
|
||||||
|
# =========================================================================
|
||||||
|
# Chunking with NIST-style text
|
||||||
|
# =========================================================================
|
||||||
|
|
||||||
|
class TestNistChunking:
    """Checks chunk_text_legal() on text laid out like NIST control listings."""

    # Three consecutive controls, each a heading line followed by a paragraph.
    NIST_SAMPLE = (
        "AC-1 Account Management\n"
        "The organization develops, documents, and disseminates an access "
        "control policy that addresses purpose, scope, roles, responsibilities, "
        "management commitment, coordination among organizational entities, "
        "and compliance.\n\n"
        "AC-2 Access Enforcement\n"
        "The information system enforces approved authorizations for logical "
        "access to information and system resources in accordance with "
        "applicable access control policies.\n\n"
        "AC-3 Information Flow Enforcement\n"
        "The system enforces approved authorizations for controlling the flow "
        "of information within the system and between interconnected systems.\n"
    )

    def test_chunks_have_section_prefix(self):
        pieces = chunk_text_legal(self.NIST_SAMPLE, chunk_size=300, overlap=50)
        for marker in ("[AC-1", "[AC-2"):
            assert any(marker in piece for piece in pieces)

    def test_sections_detected(self):
        pieces = chunk_text_legal(self.NIST_SAMPLE, chunk_size=500, overlap=50)
        assert len(pieces) >= 2

    def test_normalized_broken_text_chunks_correctly(self):
        """Broken PDF text should chunk correctly after normalization."""
        broken = (
            "AC - 1 Account Management\n"
            "The organization develops, documents, and disseminates an access "
            "control policy that addresses purpose, scope, roles, responsibilities, "
            "management commitment, coordination among organizational entities, "
            "and compliance with applicable regulations and standards.\n\n"
            "AC - 2 Access Enforcement\n"
            "The information system enforces approved authorizations for logical "
            "access to information and system resources in accordance with "
            "applicable access control policies and procedures.\n"
        )
        repaired = _normalize_pdf_text(broken)
        pieces = chunk_text_legal(repaired, chunk_size=300, overlap=50)
        for marker in ("[AC-1", "[AC-2"):
            assert any(marker in piece for piece in pieces)
|
||||||
Reference in New Issue
Block a user