feat(embedding): NIST PDF text normalization + safe re-ingest script
Fix broken multi-column PDF extraction for NIST/BSI/ENISA documents:

- _normalize_pdf_text(): fixes broken section numbers (1 . 1 → 1.1), control IDs (AC - 1 → AC-1), ligatures, soft hyphens
- pdfplumber tolerances increased (x=3, y=4) for better column handling
- 3 new regex patterns: NIST CSF 2.0, NIST enhancements, OWASP Top 10
- reingest_nist.py: safe upload-before-delete for 4 lost NIST PDFs
- reingest_d5.py: safety fix — upload first, verify, then delete old

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -180,6 +180,29 @@ def delete_old_chunks(qdrant_url: str, collection: str, object_name: str) -> int
|
||||
return 0 # Qdrant delete doesn't return count
|
||||
|
||||
|
||||
def _delete_old_chunks_safe(
    qdrant_url: str, collection: str, object_name: str, keep_doc_id: str,
) -> None:
    """Delete old chunks for a document, keeping chunks with keep_doc_id.

    Implements the delete half of the upload-before-delete pattern: every
    point whose payload matches ``object_name`` is removed EXCEPT those that
    belong to the freshly uploaded document (``keep_doc_id``), so the new
    chunks are never touched.
    """
    # Qdrant filter: must match the object, must NOT match the new doc id.
    delete_filter = {
        "must": [
            {"key": "object_name", "match": {"value": object_name}},
        ],
        "must_not": [
            {"key": "document_id", "match": {"value": keep_doc_id}},
        ],
    }
    with httpx.Client(timeout=30.0) as client:
        response = client.post(
            f"{qdrant_url}/collections/{collection}/points/delete",
            json={"filter": delete_filter},
        )
        # Surface HTTP-level failures immediately rather than continuing
        # with stale chunks still in the collection.
        response.raise_for_status()
|
||||
|
||||
|
||||
def reupload_document(
|
||||
rag_url: str,
|
||||
file_bytes: bytes,
|
||||
@@ -220,7 +243,11 @@ def process_document(
|
||||
progress: dict,
|
||||
max_retries: int = 2,
|
||||
) -> bool:
|
||||
"""Process a single document: download → delete → re-upload. Returns success."""
|
||||
"""Process a single document: download → upload → verify → delete old.
|
||||
|
||||
Safe order: new chunks are created FIRST, old chunks deleted only after
|
||||
successful verification (upload-before-delete pattern).
|
||||
"""
|
||||
key = doc_key(doc["object_name"], doc["collection"])
|
||||
|
||||
# Skip if already done
|
||||
@@ -237,20 +264,32 @@ def process_document(
|
||||
"status": "skipped", "reason": "empty_file"}
|
||||
return False
|
||||
|
||||
# 2. Delete old chunks
|
||||
delete_old_chunks(qdrant_url, doc["collection"], doc["object_name"])
|
||||
|
||||
# 3. Re-upload
|
||||
# 2. Upload FIRST (creates new chunks alongside old ones)
|
||||
result = reupload_document(
|
||||
rag_url, file_bytes, doc["filename"],
|
||||
doc["collection"], doc["form"], doc["extra_metadata"],
|
||||
)
|
||||
|
||||
new_chunks = result.get("chunks_count", 0)
|
||||
new_doc_id = result.get("document_id", "")
|
||||
if new_chunks == 0:
|
||||
logger.error(" Upload produced 0 chunks — keeping old data: %s",
|
||||
doc["object_name"])
|
||||
progress.setdefault("documents", {})[key] = {
|
||||
"status": "error", "error": "0 new chunks"}
|
||||
return False
|
||||
|
||||
# 3. Delete OLD chunks only (exclude the new document_id)
|
||||
_delete_old_chunks_safe(
|
||||
qdrant_url, doc["collection"],
|
||||
doc["object_name"], new_doc_id,
|
||||
)
|
||||
|
||||
# 4. Record success
|
||||
progress.setdefault("documents", {})[key] = {
|
||||
"status": "done",
|
||||
"old_chunks": doc["old_chunk_count"],
|
||||
"new_chunks": result.get("chunks_count", 0),
|
||||
"new_chunks": new_chunks,
|
||||
"new_document_id": result.get("document_id", ""),
|
||||
"completed_at": datetime.now(timezone.utc).isoformat(),
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user