fix(embedding): add NIST control IDs to _SECTION_NUMBER_RE

_SECTION_NUMBER_RE only had patterns for §/Art/Section/Kapitel/Annex
but missed NIST-style identifiers (AC-1, GV.OC-01, 3.1, A01:2021).
This caused 0% section rate for all NIST/BSI/ENISA documents even
though sections were correctly detected — the section NUMBER wasn't
extracted from the header.

Also adds:
- reupload_legal_strategy.py: re-upload with legal chunking
- extract_and_upload_nist.py: local PDF extraction workaround
- qdrant-snapshot.sh: backup mechanism for Qdrant collections

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-05-03 07:42:06 +02:00
parent 0b0eed27b0
commit 2f4a3f2ea2
5 changed files with 843 additions and 0 deletions
+65
View File
@@ -0,0 +1,65 @@
#!/bin/bash
# Qdrant Snapshot — creates snapshots of all collections.
#
# Usage:
#   bash scripts/qdrant-snapshot.sh                    # Create snapshots
#   bash scripts/qdrant-snapshot.sh --list             # List existing snapshots
#   bash scripts/qdrant-snapshot.sh --restore <file>   # Restore (interactive)
#
# NOTE(review): --restore is listed above, but no restore logic is visible in
# this script — confirm before relying on it.
#
# Per the original author's note, snapshots are stored in the Qdrant volume
# under /qdrant/storage/snapshots/ and are additionally copied to
# ./backups/qdrant/.
#
# Environment (both optional; an empty value counts as unset):
#   QDRANT_URL  base URL of the Qdrant HTTP API (default http://localhost:6333)
#   BACKUP_DIR  directory for downloaded snapshots (default ./backups/qdrant)

# Strict mode: abort on errors, on unset variables, and on failures in any
# pipeline stage.
set -o errexit -o nounset -o pipefail

# Apply defaults only when the caller did not provide a non-empty value.
: "${QDRANT_URL:=http://localhost:6333}"
: "${BACKUP_DIR:=./backups/qdrant}"

# One timestamp per run, used in the names of the downloaded snapshot files.
TIMESTAMP="$(date '+%Y%m%d_%H%M%S')"
# --- Reject the documented-but-unimplemented restore mode ---
# The usage header advertises --restore, but this script contains no restore
# logic. Without this guard the argument would be ignored and the script would
# fall through to snapshot creation — the opposite of what the caller asked.
if [[ "${1:-}" == "--restore" ]]; then
  echo "ERROR: --restore is not implemented by this script" >&2
  exit 2
fi

# --- List existing snapshots ---
# Prints every collection and its snapshots (name and size in MB), then exits.
# Exit status: 0 on success, 1 if the collection list or any per-collection
# snapshot list could not be fetched.
if [[ "${1:-}" == "--list" ]]; then
  echo "=== Qdrant Snapshots ==="

  # Fetch the names up front and check the result explicitly: a command
  # substitution used directly as a `for` word list does not trip `set -e`,
  # so a failed request would otherwise look like "no collections".
  if ! coll_names=$(curl -sf "$QDRANT_URL/collections" \
      | python3 -c "import sys,json; [print(c['name']) for c in json.load(sys.stdin)['result']['collections']]"); then
    echo "ERROR: could not fetch collection list from $QDRANT_URL" >&2
    exit 1
  fi

  list_rc=0
  # Read one name per line so names are never word-split or glob-expanded.
  while IFS= read -r coll; do
    [[ -n "$coll" ]] || continue
    echo ""
    echo "Collection: $coll"
    # The Python source must start at column 0 inside the quoted string.
    if ! curl -sf "$QDRANT_URL/collections/$coll/snapshots" | python3 -c "
import sys, json
snaps = json.load(sys.stdin).get('result', [])
if not snaps:
    print(' (no snapshots)')
else:
    for s in snaps:
        print(f\" {s['name']} size={s.get('size',0)/(1024*1024):.1f}MB\")
"; then
      # Keep going so one broken collection does not hide the others.
      echo " ERROR: could not list snapshots for $coll" >&2
      list_rc=1
    fi
  done <<< "$coll_names"
  exit "$list_rc"
fi
# --- Create snapshots ---
# For every collection: ask Qdrant to create a snapshot, then download it to
# $BACKUP_DIR/<collection>_<TIMESTAMP>.snapshot.
# A failure for one collection is reported and the loop moves on to the next;
# the script exits with status 1 at the end if any collection failed.
echo "=== Creating Qdrant Snapshots ($TIMESTAMP) ==="
mkdir -p "$BACKUP_DIR"

# Without the collection list nothing can be backed up, so fail with a clear
# message instead of a bare Python traceback.
if ! COLLECTIONS=$(curl -sf "$QDRANT_URL/collections" \
    | python3 -c "import sys,json; [print(c['name']) for c in json.load(sys.stdin)['result']['collections']]"); then
  echo "ERROR: could not fetch collection list from $QDRANT_URL" >&2
  exit 1
fi

failed=0
# Read one name per line so names are never word-split or glob-expanded.
while IFS= read -r coll; do
  [[ -n "$coll" ]] || continue
  echo ""
  echo "[$coll] Creating snapshot..."

  # `|| SNAP=""` keeps `set -e`/pipefail from aborting the whole run here;
  # otherwise the error branch below could never be reached.
  SNAP=$(curl -sf -X POST "$QDRANT_URL/collections/$coll/snapshots" \
    | python3 -c "import sys,json; print(json.load(sys.stdin)['result']['name'])") || SNAP=""
  if [[ -z "$SNAP" ]]; then
    echo "[$coll] ERROR: snapshot creation failed" >&2
    failed=1
    continue
  fi
  echo "[$coll] Snapshot: $SNAP"

  # Download snapshot to backup dir
  OUTFILE="$BACKUP_DIR/${coll}_${TIMESTAMP}.snapshot"
  if ! curl -sf "$QDRANT_URL/collections/$coll/snapshots/$SNAP" -o "$OUTFILE"; then
    echo "[$coll] ERROR: snapshot download failed" >&2
    # Do not leave a truncated file that could be mistaken for a backup.
    rm -f -- "$OUTFILE"
    failed=1
    continue
  fi

  SIZE=$(du -h "$OUTFILE" | cut -f1)
  echo "[$coll] Saved: $OUTFILE ($SIZE)"
done <<< "$COLLECTIONS"

echo ""
echo "=== Done ==="
# ls fails when the glob matches nothing; report that case instead.
ls -lh "$BACKUP_DIR"/*_"${TIMESTAMP}".snapshot 2>/dev/null || echo "No snapshots created"

# Surface partial failure to callers such as cron or CI.
if (( failed )); then
  exit 1
fi