fix(embedding): add NIST control IDs to _SECTION_NUMBER_RE
_SECTION_NUMBER_RE only had patterns for §/Art/Section/Kapitel/Annex but missed NIST-style identifiers (AC-1, GV.OC-01, 3.1, A01:2021). This caused a 0% section rate for all NIST/BSI/ENISA documents even though sections were correctly detected — the section NUMBER wasn't extracted from the header.

Also adds:
- reupload_legal_strategy.py: re-upload with legal chunking
- extract_and_upload_nist.py: local PDF extraction workaround
- qdrant-snapshot.sh: backup mechanism for Qdrant collections

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Executable
+65
@@ -0,0 +1,65 @@
|
||||
#!/bin/bash
|
||||
# Qdrant Snapshot — creates snapshots of all collections
#
# Usage:
#   bash scripts/qdrant-snapshot.sh                    # Create snapshots
#   bash scripts/qdrant-snapshot.sh --list             # List existing snapshots
#   bash scripts/qdrant-snapshot.sh --restore <file>   # Restore (interactive)
#
# NOTE: --restore is listed above, but this script contains no restore logic yet.
#
# Snapshots are stored in the Qdrant volume under /qdrant/storage/snapshots/.
# In addition, they are copied to ./backups/qdrant/ (override with BACKUP_DIR).
|
||||
|
||||
# Strict mode: abort on a failing command (errexit), on expansion of an unset
# variable (nounset), and when any stage of a pipeline fails (pipefail).
set -o errexit -o nounset -o pipefail

# Base URL of the Qdrant HTTP API. A non-empty value already present in the
# environment is kept; otherwise the default below is assigned.
: "${QDRANT_URL:=http://localhost:6333}"

# Local directory that receives the downloaded snapshot copies (same override
# rule as QDRANT_URL).
: "${BACKUP_DIR:=./backups/qdrant}"

# Timestamp of this run (YYYYmmdd_HHMMSS), computed once so that every file
# name derived from it within a single run shares the same value.
TIMESTAMP="$(date '+%Y%m%d_%H%M%S')"
|
||||
|
||||
# --- List existing snapshots ---

#######################################
# Print, for every collection, the snapshots Qdrant reports for it
# (snapshot name and size in MB).
# Globals:   QDRANT_URL (read)
# Arguments: none
# Outputs:   listing on stdout, diagnostics on stderr
# Returns:   0 on success, 1 if Qdrant cannot be queried or a response
#            cannot be parsed
#######################################
list_snapshots() {
  local names_json names coll

  echo "=== Qdrant Snapshots ==="

  # Fetch the collection list up front and check it explicitly. When the
  # command substitution sits directly in a `for ... in $(...)` word list its
  # exit status is discarded, so an unreachable Qdrant produced an empty
  # listing and exit code 0.
  if ! names_json=$(curl -sf "$QDRANT_URL/collections"); then
    echo "ERROR: could not fetch collections from $QDRANT_URL" >&2
    return 1
  fi
  if ! names=$(python3 -c '
import json, sys
for c in json.load(sys.stdin)["result"]["collections"]:
    print(c["name"])
' <<<"$names_json"); then
    echo "ERROR: unexpected response from $QDRANT_URL/collections" >&2
    return 1
  fi

  # Read the names line by line (on fd 3, so commands in the loop body cannot
  # consume the list) instead of word-splitting them: names are then neither
  # split on whitespace nor glob-expanded.
  while IFS= read -r coll <&3; do
    [[ -n "$coll" ]] || continue
    echo ""
    echo "Collection: $coll"
    curl -sf "$QDRANT_URL/collections/$coll/snapshots" | python3 -c '
import json, sys
snaps = json.load(sys.stdin).get("result", [])
if not snaps:
    print(" (no snapshots)")
else:
    for s in snaps:
        size_mb = (s.get("size") or 0) / (1024 * 1024)
        print(f" {s[\"name\"]} size={size_mb:.1f}MB" if False else " " + s["name"] + " size=" + format(size_mb, ".1f") + "MB")
' || {
      echo "ERROR: could not list snapshots of collection $coll" >&2
      return 1
    }
  done 3<<<"$names"
}

# `--list` mode: show the existing snapshots and stop; nothing is created.
if [[ "${1:-}" == "--list" ]]; then
  list_snapshots || exit 1
  exit 0
fi
|
||||
|
||||
# --- Create snapshots ---
# Default mode: ask Qdrant to create a snapshot of every collection and
# download a copy of each one into $BACKUP_DIR. A collection that fails is
# reported and skipped; the script exits non-zero at the end if any failed.

# The usage header lists --restore, but this script has no restore logic.
# Refuse it explicitly instead of falling through and creating new snapshots.
if [[ "${1:-}" == "--restore" ]]; then
  echo "ERROR: --restore is not implemented by this script" >&2
  exit 2
fi

echo "=== Creating Qdrant Snapshots ($TIMESTAMP) ==="
mkdir -p "$BACKUP_DIR"

if ! COLLECTIONS_JSON=$(curl -sf "$QDRANT_URL/collections"); then
  echo "ERROR: could not fetch collections from $QDRANT_URL" >&2
  exit 1
fi
if ! COLLECTIONS=$(python3 -c "import sys,json; [print(c['name']) for c in json.load(sys.stdin)['result']['collections']]" <<<"$COLLECTIONS_JSON"); then
  echo "ERROR: unexpected response from $QDRANT_URL/collections" >&2
  exit 1
fi

FAILED=0

# Iterate line by line on fd 3: names are not word-split or glob-expanded, and
# commands in the loop body cannot consume the remaining list from stdin.
while IFS= read -r coll <&3; do
  [[ -n "$coll" ]] || continue

  echo ""
  echo "[$coll] Creating snapshot..."

  # A bare `SNAP=$(failing pipeline)` terminates the script under `set -e`,
  # which made the empty-SNAP check below unreachable on failure. Running the
  # steps inside `if` / `||` keeps the per-collection recovery path alive.
  SNAP=""
  if SNAP_JSON=$(curl -sf -X POST "$QDRANT_URL/collections/$coll/snapshots"); then
    SNAP=$(python3 -c "import sys,json; print(json.load(sys.stdin)['result']['name'])" <<<"$SNAP_JSON") || SNAP=""
  fi

  if [[ -z "$SNAP" ]]; then
    echo "[$coll] ERROR: snapshot creation failed" >&2
    FAILED=$((FAILED + 1))
    continue
  fi

  echo "[$coll] Snapshot: $SNAP"

  # Download snapshot to backup dir
  OUTFILE="$BACKUP_DIR/${coll}_${TIMESTAMP}.snapshot"
  if ! curl -sf "$QDRANT_URL/collections/$coll/snapshots/$SNAP" -o "$OUTFILE"; then
    echo "[$coll] ERROR: download of $SNAP failed" >&2
    # Do not leave a truncated file behind that could be mistaken for a backup.
    rm -f -- "$OUTFILE"
    FAILED=$((FAILED + 1))
    continue
  fi

  SIZE=$(du -h "$OUTFILE" | cut -f1)
  echo "[$coll] Saved: $OUTFILE ($SIZE)"
done 3<<<"$COLLECTIONS"

echo ""
echo "=== Done ==="
# ls fails when the glob matches nothing; that case is reported explicitly.
ls -lh "$BACKUP_DIR"/*_"${TIMESTAMP}".snapshot 2>/dev/null || echo "No snapshots created"

if ((FAILED > 0)); then
  echo "ERROR: $FAILED collection(s) could not be backed up" >&2
  exit 1
fi
|
||||
Reference in New Issue
Block a user