0b0eed27b0
Fix broken multi-column PDF extraction for NIST/BSI/ENISA documents:

- _normalize_pdf_text(): fixes broken section numbers (1 . 1 → 1.1), control IDs (AC - 1 → AC-1), ligatures, soft hyphens
- pdfplumber tolerances increased (x=3, y=4) for better column handling
- 3 new regex patterns: NIST CSF 2.0, NIST enhancements, OWASP Top 10
- reingest_nist.py: safe upload-before-delete for 4 lost NIST PDFs
- reingest_d5.py: safety fix — upload first, verify, then delete old

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
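Note: _normalize_pdf_text() lives in the PDF extraction pipeline, not in the script below. A minimal sketch of the repairs the commit message describes, assuming plain re-based substitutions (rule names and order are illustrative, not the actual implementation):

import re


def _normalize_pdf_text(text: str) -> str:
    # Illustrative only; the real helper is not part of this listing.
    text = text.replace("\u00ad", "")  # soft hyphens
    for lig, plain in (("\ufb00", "ff"), ("\ufb01", "fi"), ("\ufb02", "fl")):
        text = text.replace(lig, plain)  # common ligatures
    text = re.sub(r"(?<=\d) ?\. ?(?=\d)", ".", text)  # "1 . 1" -> "1.1"
    text = re.sub(r"\b([A-Z]{2,3}) - (\d+)\b", r"\1-\2", text)  # "AC - 1" -> "AC-1"
    return text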
487 lines
17 KiB
Python
#!/usr/bin/env python3
"""
D5 Re-Ingestion: Re-chunk all ~297 legal sources with structural metadata.

Usage:
    # Dry-run: build manifest, no changes
    python3 scripts/reingest_d5.py --dry-run

    # Re-ingest one collection (test)
    python3 scripts/reingest_d5.py --collection bp_compliance_gesetze

    # Re-ingest all collections (resume-capable)
    python3 scripts/reingest_d5.py --resume

    # Custom URLs
    python3 scripts/reingest_d5.py --rag-url https://macmini:8097 --qdrant-url http://macmini:6333
"""

import argparse
import json
import logging
import random
import sys
import time
from datetime import datetime, timezone

import httpx

from reingest_d5_config import (
    CHUNK_OVERLAP,
    CHUNK_SIZE,
    CHUNK_STRATEGY,
    DEFAULT_QDRANT_URL,
    DEFAULT_RAG_URL,
    MANIFEST_FILE,
    TARGET_COLLECTIONS,
    content_type_from_filename,
    doc_key,
    extract_doc_metadata,
    load_progress,
    save_progress,
)

logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
logger = logging.getLogger("d5-reingest")

UPLOAD_TIMEOUT = httpx.Timeout(timeout=3600.0, connect=30.0)
SCROLL_TIMEOUT = httpx.Timeout(timeout=60.0, connect=10.0)


# ---------------------------------------------------------------------------
# Phase 0: Preflight
# ---------------------------------------------------------------------------
def preflight_checks(rag_url: str, qdrant_url: str) -> dict:
    """Verify services are reachable and record baseline chunk counts."""
    logger.info("Phase 0: Preflight checks...")

    with httpx.Client(timeout=10.0, verify=False) as c:
        r = c.get(f"{rag_url}/health")
        r.raise_for_status()
        logger.info(" RAG service: OK")

    with httpx.Client(timeout=10.0) as c:
        r = c.get(f"{qdrant_url}/collections")
        r.raise_for_status()
        logger.info(" Qdrant: OK")

    before_counts = {}
    with httpx.Client(timeout=10.0) as c:
        for coll in TARGET_COLLECTIONS:
            try:
                r = c.post(f"{qdrant_url}/collections/{coll}/points/count",
                           json={"exact": True})
                r.raise_for_status()
                count = r.json()["result"]["count"]
            except Exception:
                count = 0
            before_counts[coll] = count
            logger.info(" %s: %d chunks", coll, count)

    return before_counts


# ---------------------------------------------------------------------------
# Phase 1: Build manifest
# ---------------------------------------------------------------------------
def build_manifest(qdrant_url: str, collections: list[str]) -> list[dict]:
    """Scroll Qdrant and build a deduplicated document manifest."""
    logger.info("Phase 1: Building document manifest...")
    documents: dict[str, dict] = {}  # keyed by doc_key(object_name, collection)

    with httpx.Client(timeout=SCROLL_TIMEOUT) as client:
        for coll in collections:
            logger.info(" Scrolling %s...", coll)
            offset = None
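            # Scroll pagination: resend the returned next_page_offset until it comes back null.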
            points_seen = 0

            while True:
                body: dict = {
                    "limit": 250,
                    "with_payload": True,
                    "with_vector": False,
                }
                if offset:
                    body["offset"] = offset

                resp = client.post(
                    f"{qdrant_url}/collections/{coll}/points/scroll",
                    json=body,
                )
                resp.raise_for_status()
                data = resp.json()["result"]
                points = data["points"]

                for pt in points:
                    payload = pt.get("payload", {})
                    obj_name = payload.get("object_name", "")
                    if not obj_name:
                        continue

                    key = doc_key(obj_name, coll)
                    if key not in documents:
                        meta = extract_doc_metadata(payload)
                        documents[key] = {
                            "object_name": obj_name,
                            "collection": coll,
                            "filename": payload.get("filename", obj_name.split("/")[-1]),
                            "form": meta["form"],
                            "extra_metadata": meta["extra"],
                            "old_chunk_count": 0,
                        }
                    documents[key]["old_chunk_count"] += 1

                points_seen += len(points)
                offset = data.get("next_page_offset")
                if not offset:
                    break

            logger.info(" %d points → %d unique docs",
                        points_seen,
                        sum(1 for d in documents.values() if d["collection"] == coll))

    manifest = list(documents.values())
    logger.info(" Total: %d unique documents across %d collections",
                len(manifest), len(collections))
    return manifest


# ---------------------------------------------------------------------------
# Phase 2: Per-document re-ingestion
# ---------------------------------------------------------------------------
def download_file(rag_url: str, object_name: str) -> bytes:
    """Download file bytes via MinIO presigned URL."""
    with httpx.Client(timeout=60.0, verify=False) as c:
        resp = c.get(f"{rag_url}/api/v1/documents/download/{object_name}")
        resp.raise_for_status()
        presigned_url = resp.json()["url"]

    with httpx.Client(timeout=120.0, verify=False) as c:
        resp = c.get(presigned_url)
        resp.raise_for_status()
        return resp.content


def delete_old_chunks(qdrant_url: str, collection: str, object_name: str) -> int:
    """Delete all chunks for a document from Qdrant. Returns estimated count."""
    with httpx.Client(timeout=30.0) as c:
        resp = c.post(
            f"{qdrant_url}/collections/{collection}/points/delete",
            json={
                "filter": {
                    "must": [{
                        "key": "object_name",
                        "match": {"value": object_name},
                    }]
                }
            },
        )
        resp.raise_for_status()
    return 0  # Qdrant delete doesn't return count


def _delete_old_chunks_safe(
    qdrant_url: str, collection: str, object_name: str, keep_doc_id: str,
) -> None:
    """Delete old chunks for a document, keeping chunks with keep_doc_id."""
    with httpx.Client(timeout=30.0) as c:
        resp = c.post(
            f"{qdrant_url}/collections/{collection}/points/delete",
            json={
                "filter": {
                    "must": [{
                        "key": "object_name",
                        "match": {"value": object_name},
                    }],
                    "must_not": [{
                        "key": "document_id",
                        "match": {"value": keep_doc_id},
                    }],
                }
            },
        )
        resp.raise_for_status()


def reupload_document(
    rag_url: str,
    file_bytes: bytes,
    filename: str,
    collection: str,
    form_fields: dict,
    extra_metadata: dict,
) -> dict:
    """Upload document to RAG service with new chunking parameters."""
    ct = content_type_from_filename(filename)

    form_data = {
        "collection": collection,
        "data_type": form_fields.get("data_type", "compliance"),
        "bundesland": form_fields.get("bundesland", "bund"),
        "use_case": form_fields.get("use_case", "compliance"),
        "year": form_fields.get("year", "2026"),
        "chunk_strategy": CHUNK_STRATEGY,
        "chunk_size": str(CHUNK_SIZE),
        "chunk_overlap": str(CHUNK_OVERLAP),
        "metadata_json": json.dumps(extra_metadata, ensure_ascii=False),
    }

    with httpx.Client(timeout=UPLOAD_TIMEOUT, verify=False) as c:
        resp = c.post(
            f"{rag_url}/api/v1/documents/upload",
            files={"file": (filename, file_bytes, ct)},
            data=form_data,
        )
        resp.raise_for_status()
        return resp.json()


def process_document(
    doc: dict,
    rag_url: str,
    qdrant_url: str,
    progress: dict,
    max_retries: int = 2,
) -> bool:
    """Process a single document: download → upload → verify → delete old.

    Safe order: new chunks are created FIRST, old chunks deleted only after
    successful verification (upload-before-delete pattern).
    """
    key = doc_key(doc["object_name"], doc["collection"])

    # Skip if already done
    if progress.get("documents", {}).get(key, {}).get("status") == "done":
        return True

    for attempt in range(max_retries + 1):
        try:
            # 1. Download
            file_bytes = download_file(rag_url, doc["object_name"])
            if not file_bytes:
                logger.warning(" Empty file: %s — skipping", doc["object_name"])
                progress.setdefault("documents", {})[key] = {
                    "status": "skipped", "reason": "empty_file"}
                return False

            # 2. Upload FIRST (creates new chunks alongside old ones)
            result = reupload_document(
                rag_url, file_bytes, doc["filename"],
                doc["collection"], doc["form"], doc["extra_metadata"],
            )

            new_chunks = result.get("chunks_count", 0)
            new_doc_id = result.get("document_id", "")
            if new_chunks == 0:
                logger.error(" Upload produced 0 chunks — keeping old data: %s",
                             doc["object_name"])
                progress.setdefault("documents", {})[key] = {
                    "status": "error", "error": "0 new chunks"}
                return False

            # 3. Delete OLD chunks only (exclude the new document_id)
            _delete_old_chunks_safe(
                qdrant_url, doc["collection"],
                doc["object_name"], new_doc_id,
            )

            # 4. Record success
            progress.setdefault("documents", {})[key] = {
                "status": "done",
                "old_chunks": doc["old_chunk_count"],
                "new_chunks": new_chunks,
                "new_document_id": new_doc_id,
                "completed_at": datetime.now(timezone.utc).isoformat(),
            }
            return True

        except httpx.HTTPStatusError as e:
            if e.response.status_code == 404:
                logger.warning(" File not in MinIO (404): %s — skipping", doc["object_name"])
                progress.setdefault("documents", {})[key] = {
                    "status": "skipped", "reason": "not_in_minio"}
                return False
            if attempt < max_retries:
                wait = 5 * (attempt + 1)
                logger.warning(" HTTP %d on attempt %d, retrying in %ds...",
                               e.response.status_code, attempt + 1, wait)
                time.sleep(wait)
            else:
                logger.error(" FAILED after %d retries: %s", max_retries, e)
                progress.setdefault("documents", {})[key] = {
                    "status": "error", "error": str(e), "retries": max_retries}
                return False

        except Exception as e:
            if attempt < max_retries:
                wait = 10 * (attempt + 1)
                logger.warning(" Error on attempt %d: %s — retrying in %ds",
                               attempt + 1, e, wait)
                time.sleep(wait)
            else:
                logger.error(" FAILED after %d retries: %s", max_retries, e)
                progress.setdefault("documents", {})[key] = {
                    "status": "error", "error": str(e), "retries": max_retries}
                return False

    return False


# ---------------------------------------------------------------------------
# Phase 3: Verification
# ---------------------------------------------------------------------------
def verify_results(
    qdrant_url: str,
    before_counts: dict,
    collections: list[str],
    manifest: list[dict],
):
    """Compare before/after counts and spot-check metadata."""
    logger.info("Phase 3: Verification...")

    print("\n" + "=" * 65)
    print("D5 RE-INGESTION VERIFICATION REPORT")
    print("=" * 65)

    after_counts = {}
    with httpx.Client(timeout=10.0) as c:
        for coll in collections:
            try:
                r = c.post(f"{qdrant_url}/collections/{coll}/points/count",
                           json={"exact": True})
                r.raise_for_status()
                after_counts[coll] = r.json()["result"]["count"]
            except Exception:
                after_counts[coll] = -1

    print(f"\n{'Collection':<35} {'Before':>8} {'After':>8} {'Delta':>8}")
    print("-" * 65)
    for coll in collections:
        before = before_counts.get(coll, 0)
        after = after_counts.get(coll, -1)
        delta = after - before if after >= 0 else "?"
        print(f"{coll:<35} {before:>8} {after:>8} {str(delta):>8}")

    # Spot-check: pick 3 random docs and verify metadata
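    # "section"/"paragraph" are the structural payload fields the re-chunking
    # is supposed to add; count how many sampled chunks actually carry them.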
    print("\nSpot-check (3 random docs):")
    sample = random.sample(manifest, min(3, len(manifest)))

    with httpx.Client(timeout=30.0) as c:
        for doc in sample:
            resp = c.post(
                f"{qdrant_url}/collections/{doc['collection']}/points/scroll",
                json={
                    "limit": 3,
                    "with_payload": True,
                    "with_vector": False,
                    "filter": {
                        "must": [{
                            "key": "object_name",
                            "match": {"value": doc["object_name"]},
                        }]
                    },
                },
            )
            if resp.status_code != 200:
                print(f" {doc['object_name']}: QUERY FAILED")
                continue

            points = resp.json()["result"]["points"]
            if not points:
                print(f" {doc['object_name']}: NO CHUNKS FOUND")
                continue

            has_section = sum(1 for p in points if p["payload"].get("section"))
            has_para = sum(1 for p in points if p["payload"].get("paragraph"))
            print(f" {doc['filename'][:40]:<42} "
                  f"chunks={len(points):>3} "
                  f"with_section={has_section}/{len(points)} "
                  f"with_para={has_para}/{len(points)}")

    print()


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main():
    parser = argparse.ArgumentParser(description="D5 Re-Ingestion Script")
    parser.add_argument("--rag-url", default=DEFAULT_RAG_URL)
    parser.add_argument("--qdrant-url", default=DEFAULT_QDRANT_URL)
    parser.add_argument("--dry-run", action="store_true",
                        help="Build manifest only, no changes")
    parser.add_argument("--collection", default=None,
                        help="Process only this collection")
    parser.add_argument("--resume", action="store_true",
                        help="Resume from progress file")
    args = parser.parse_args()

    collections = [args.collection] if args.collection else TARGET_COLLECTIONS

    # Phase 0
    before_counts = preflight_checks(args.rag_url, args.qdrant_url)

    # Phase 1
    manifest = build_manifest(args.qdrant_url, collections)

    # Save manifest for inspection
    with open(MANIFEST_FILE, "w", encoding="utf-8") as f:
        json.dump(manifest, f, indent=2, ensure_ascii=False)
    logger.info("Manifest saved to %s", MANIFEST_FILE)

    if args.dry_run:
        print(f"\nDRY RUN: {len(manifest)} documents found. See {MANIFEST_FILE}")
        for doc in manifest[:10]:
            reg = doc["extra_metadata"].get("regulation_code", "?")
            print(f" {reg:<30} {doc['collection']:<35} chunks={doc['old_chunk_count']}")
        if len(manifest) > 10:
            print(f" ... and {len(manifest) - 10} more")
        sys.exit(0)

    # Phase 2
    progress = load_progress() if args.resume else {"documents": {}}
    progress["started_at"] = datetime.now(timezone.utc).isoformat()
    progress["before_counts"] = before_counts

    done = 0
    skipped = 0
    failed = 0

    for i, doc in enumerate(manifest, 1):
        key = doc_key(doc["object_name"], doc["collection"])
        reg = doc["extra_metadata"].get("regulation_code", "?")

        if progress.get("documents", {}).get(key, {}).get("status") == "done":
            done += 1
            continue

        logger.info("[%d/%d] %s (%s) — %d old chunks",
                    i, len(manifest), reg, doc["collection"], doc["old_chunk_count"])

        ok = process_document(doc, args.rag_url, args.qdrant_url, progress)
        if ok:
            done += 1
            new_chunks = progress["documents"][key].get("new_chunks", "?")
            logger.info(" OK: %d old → %s new chunks", doc["old_chunk_count"], new_chunks)
        elif progress["documents"][key].get("status") == "skipped":
            skipped += 1
        else:
            failed += 1

        save_progress(progress)
        time.sleep(2)

    logger.info("Phase 2 complete: %d done, %d skipped, %d failed", done, skipped, failed)

    # Phase 3
    verify_results(args.qdrant_url, before_counts, collections, manifest)

    print(f"Summary: {done} done, {skipped} skipped, {failed} failed")
    if failed:
        print(f"Re-run with --resume to retry {failed} failed documents")
        sys.exit(1)


if __name__ == "__main__":
    main()
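Note: reingest_d5_config is imported above but not shown on this page. A minimal sketch of two of its pure helpers, assuming doc_key simply namespaces the object name by collection and content_type_from_filename guesses MIME types from the extension (the real module may differ):

import mimetypes


def doc_key(object_name: str, collection: str) -> str:
    # Hypothetical: manifest key, since one object name may appear in several collections.
    return f"{collection}:{object_name}"


def content_type_from_filename(filename: str) -> str:
    # Hypothetical: fall back to a generic binary type when the extension is unknown.
    return mimetypes.guess_type(filename)[0] or "application/octet-stream"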