feat(embedding): NIST PDF text normalization + safe re-ingest script

Fix broken multi-column PDF extraction for NIST/BSI/ENISA documents:
- _normalize_pdf_text(): fixes broken section numbers (1 . 1 → 1.1),
  control IDs (AC - 1 → AC-1), ligatures, soft hyphens
- pdfplumber tolerances increased (x=3,y=4) for better column handling
- 3 new regex patterns: NIST CSF 2.0, NIST enhancements, OWASP Top 10
- reingest_nist.py: safe upload-before-delete for 4 lost NIST PDFs
- reingest_d5.py: safety fix — upload first, verify, then delete old

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-05-03 06:42:46 +02:00
parent 97a7f6f264
commit 0b0eed27b0
4 changed files with 738 additions and 9 deletions
+45 -6
View File
@@ -180,6 +180,29 @@ def delete_old_chunks(qdrant_url: str, collection: str, object_name: str) -> int
return 0 # Qdrant delete doesn't return count
def _delete_old_chunks_safe(
    qdrant_url: str,
    collection: str,
    object_name: str,
    keep_doc_id: str,
    timeout: float = 30.0,
) -> None:
    """Delete a document's old chunks while preserving the newly uploaded ones.

    Issues a Qdrant ``points/delete`` request with a filter that matches every
    point belonging to *object_name* but excludes points whose ``document_id``
    equals *keep_doc_id*. This is the deletion half of the upload-before-delete
    safety pattern: the freshly uploaded chunks are never touched.

    Args:
        qdrant_url: Base URL of the Qdrant server.
        collection: Name of the Qdrant collection to delete from.
        object_name: Payload value identifying the document whose chunks
            should be removed.
        keep_doc_id: ``document_id`` of the new upload whose chunks must
            survive the delete.
        timeout: HTTP timeout in seconds for the delete request
            (default 30.0, matching the previous hard-coded value).

    Raises:
        httpx.HTTPStatusError: If Qdrant responds with a 4xx/5xx status.
    """
    delete_filter = {
        # Match every chunk that belongs to this document...
        "must": [{
            "key": "object_name",
            "match": {"value": object_name},
        }],
        # ...except the chunks created by the new upload.
        "must_not": [{
            "key": "document_id",
            "match": {"value": keep_doc_id},
        }],
    }
    with httpx.Client(timeout=timeout) as client:
        response = client.post(
            f"{qdrant_url}/collections/{collection}/points/delete",
            json={"filter": delete_filter},
        )
        response.raise_for_status()
def reupload_document(
rag_url: str,
file_bytes: bytes,
@@ -220,7 +243,11 @@ def process_document(
progress: dict,
max_retries: int = 2,
) -> bool:
"""Process a single document: download → delete → re-upload. Returns success."""
"""Process a single document: download → upload → verify → delete old.
Safe order: new chunks are created FIRST, old chunks deleted only after
successful verification (upload-before-delete pattern).
"""
key = doc_key(doc["object_name"], doc["collection"])
# Skip if already done
@@ -237,20 +264,32 @@ def process_document(
"status": "skipped", "reason": "empty_file"}
return False
# 2. Delete old chunks
delete_old_chunks(qdrant_url, doc["collection"], doc["object_name"])
# 3. Re-upload
# 2. Upload FIRST (creates new chunks alongside old ones)
result = reupload_document(
rag_url, file_bytes, doc["filename"],
doc["collection"], doc["form"], doc["extra_metadata"],
)
new_chunks = result.get("chunks_count", 0)
new_doc_id = result.get("document_id", "")
if new_chunks == 0:
logger.error(" Upload produced 0 chunks — keeping old data: %s",
doc["object_name"])
progress.setdefault("documents", {})[key] = {
"status": "error", "error": "0 new chunks"}
return False
# 3. Delete OLD chunks only (exclude the new document_id)
_delete_old_chunks_safe(
qdrant_url, doc["collection"],
doc["object_name"], new_doc_id,
)
# 4. Record success
progress.setdefault("documents", {})[key] = {
"status": "done",
"old_chunks": doc["old_chunk_count"],
"new_chunks": result.get("chunks_count", 0),
"new_chunks": new_chunks,
"new_document_id": result.get("document_id", ""),
"completed_at": datetime.now(timezone.utc).isoformat(),
}