feat(embedding): NIST PDF text normalization + safe re-ingest script

Fix broken multi-column PDF extraction for NIST/BSI/ENISA documents:
- _normalize_pdf_text(): fixes broken section numbers (1 . 1 → 1.1),
  control IDs (AC - 1 → AC-1), ligatures, soft hyphens
- pdfplumber tolerances increased (x=3,y=4) for better column handling
- 3 new regex patterns: NIST CSF 2.0, NIST enhancements, OWASP Top 10
- reingest_nist.py: safe upload-before-delete for 4 lost NIST PDFs
- reingest_d5.py: safety fix — upload first, verify, then delete old

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-05-03 06:42:46 +02:00
parent 97a7f6f264
commit 0b0eed27b0
4 changed files with 738 additions and 9 deletions
+45 -6
View File
@@ -180,6 +180,29 @@ def delete_old_chunks(qdrant_url: str, collection: str, object_name: str) -> int
return 0 # Qdrant delete doesn't return count
def _delete_old_chunks_safe(
    qdrant_url: str,
    collection: str,
    object_name: str,
    keep_doc_id: str,
    timeout: float = 30.0,
) -> None:
    """Delete a document's old chunks while preserving the newly uploaded ones.

    Issues a Qdrant ``points/delete`` request with a filter that matches every
    point belonging to *object_name* but excludes points whose ``document_id``
    equals *keep_doc_id*. This is the deletion half of the upload-before-delete
    safety pattern: the freshly uploaded chunks are never touched.

    Args:
        qdrant_url: Base URL of the Qdrant server.
        collection: Name of the Qdrant collection to delete from.
        object_name: Payload value identifying the document whose chunks
            should be removed.
        keep_doc_id: ``document_id`` of the new upload whose chunks must
            survive the delete.
        timeout: HTTP timeout in seconds for the delete request
            (default 30.0, matching the previous hard-coded value).

    Raises:
        httpx.HTTPStatusError: If Qdrant responds with a 4xx/5xx status.
    """
    delete_filter = {
        # Match every chunk that belongs to this document...
        "must": [{
            "key": "object_name",
            "match": {"value": object_name},
        }],
        # ...except the chunks created by the new upload.
        "must_not": [{
            "key": "document_id",
            "match": {"value": keep_doc_id},
        }],
    }
    with httpx.Client(timeout=timeout) as client:
        response = client.post(
            f"{qdrant_url}/collections/{collection}/points/delete",
            json={"filter": delete_filter},
        )
        response.raise_for_status()
def reupload_document(
rag_url: str,
file_bytes: bytes,
@@ -220,7 +243,11 @@ def process_document(
progress: dict,
max_retries: int = 2,
) -> bool:
"""Process a single document: download → delete → re-upload. Returns success."""
"""Process a single document: download → upload → verify → delete old.
Safe order: new chunks are created FIRST, old chunks deleted only after
successful verification (upload-before-delete pattern).
"""
key = doc_key(doc["object_name"], doc["collection"])
# Skip if already done
@@ -237,20 +264,32 @@ def process_document(
"status": "skipped", "reason": "empty_file"}
return False
# 2. Delete old chunks
delete_old_chunks(qdrant_url, doc["collection"], doc["object_name"])
# 3. Re-upload
# 2. Upload FIRST (creates new chunks alongside old ones)
result = reupload_document(
rag_url, file_bytes, doc["filename"],
doc["collection"], doc["form"], doc["extra_metadata"],
)
new_chunks = result.get("chunks_count", 0)
new_doc_id = result.get("document_id", "")
if new_chunks == 0:
logger.error(" Upload produced 0 chunks — keeping old data: %s",
doc["object_name"])
progress.setdefault("documents", {})[key] = {
"status": "error", "error": "0 new chunks"}
return False
# 3. Delete OLD chunks only (exclude the new document_id)
_delete_old_chunks_safe(
qdrant_url, doc["collection"],
doc["object_name"], new_doc_id,
)
# 4. Record success
progress.setdefault("documents", {})[key] = {
"status": "done",
"old_chunks": doc["old_chunk_count"],
"new_chunks": result.get("chunks_count", 0),
"new_chunks": new_chunks,
"new_document_id": result.get("document_id", ""),
"completed_at": datetime.now(timezone.utc).isoformat(),
}