#!/usr/bin/env bash
# Ingest the IACE open-source knowledge-base documents into a RAG/Qdrant target.
#
# Idempotent: creates the two collections (1024-dim, BGE-M3) if missing and
# uploads each versioned source doc with retry. Use it to populate a fresh
# Qdrant (e.g. production) from the repo — the docs under datasources/ are the
# single source of truth.
#
# Usage:
#   RAG_URL=https://rag.prod.example ./scripts/ingest_iace_kb.sh
#   ./scripts/ingest_iace_kb.sh            # defaults to https://127.0.0.1:8097
#
# Env:
#   RAG_URL   base URL of the RAG service (default https://127.0.0.1:8097)
#   INSECURE  set to 0 to disable curl -k (default 1, for self-signed dev certs)
#
# Exit status: 0 when every document uploaded, 1 when at least one document was
# missing or failed, 127 when curl is not installed.

# No `-e` on purpose: per-document failures are collected into the final exit
# code instead of aborting the run at the first one.
set -uo pipefail

RAG_URL="${RAG_URL:-https://127.0.0.1:8097}"
# Drop one trailing slash so "$RAG_URL/api/..." never contains "//api".
RAG_URL="${RAG_URL%/}"
INSECURE="${INSECURE:-1}"
readonly VECTOR_SIZE=1024
readonly MAX_TRIES=3    # upload attempts per document
readonly RETRY_DELAY=4  # seconds to wait between attempts

CURL=(curl -sS --max-time 120)
if [[ "$INSECURE" == "1" ]]; then
  CURL+=(-k)
fi

# Resolve the datasources dir relative to this script (repo-portable).
# `cd` output is discarded so a CDPATH hit cannot leak into the captured path.
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
if [[ -z "$SCRIPT_DIR" ]]; then
  echo "cannot resolve the directory of ${BASH_SOURCE[0]}" >&2
  exit 1
fi
DS_DIR="$SCRIPT_DIR/../internal/iace/datasources"

# Documents to ingest, one pipe-delimited record per entry:
#   file|collection|title|license
# A plain indexed array is used (no associative arrays) to stay compatible
# with macOS bash 3.2.
DOCS=(
  "esaw_accident_stats_2023.md|bp_iace_accident_stats|Accidents at work - ESAW 2023|CC BY 4.0"
  "bls_cfoi_fatal_2024.md|bp_iace_accident_stats|BLS CFOI fatal injuries 2023-24|US Public Domain"
  "prism_risk_methodology.md|bp_iace_safety_kb|PRISM risk methodology|OGL v3"
  "cobot_biomech_limits.md|bp_iace_safety_kb|Cobot biomechanical pain limits|CC BY 4.0"
  "hse_example_risk_assessments.md|bp_iace_safety_kb|HSE example risk assessments|OGL v3"
  "osha_robot_safety.md|bp_iace_safety_kb|OSHA industrial robot safety|US Public Domain"
)

#######################################
# Escape a string for use inside a double-quoted JSON string.
# Only backslash and double quote are escaped; control characters (newline,
# tab, ...) are passed through unchanged, so keep values single-line.
# Arguments: $1 - raw string
# Outputs:   escaped string on stdout
#######################################
json_escape() {
  printf '%s' "$1" | sed -e 's/\\/\\\\/g' -e 's/"/\\"/g'
}

#######################################
# Ask the RAG service to create a collection (best-effort).
# The response body and HTTP status are deliberately ignored so that re-runs
# against an already-populated target keep going; only a transport-level curl
# failure (non-zero curl exit) is reported, as a warning on stderr.
# Globals:   CURL, RAG_URL, VECTOR_SIZE (read)
# Arguments: $1 - collection name
# Returns:   always 0
#######################################
create_collection() {
  local name="$1"
  local body err
  echo " • ensure collection $name (${VECTOR_SIZE}d)"
  body="{\"name\":\"$(json_escape "$name")\",\"vector_size\":$VECTOR_SIZE}"
  # `2>&1 >/dev/null` captures curl's stderr only; the response body is dropped.
  if ! err="$("${CURL[@]}" -X POST "$RAG_URL/api/v1/collections" \
    -H 'Content-Type: application/json' \
    -d "$body" 2>&1 >/dev/null)"; then
    echo " ! could not ensure collection $name: ${err:0:120}" >&2
  fi
  return 0
}

#######################################
# Upload one document, retrying up to MAX_TRIES times.
# An attempt counts as successful when the response contains "chunks_count".
# Globals:   CURL, RAG_URL, DS_DIR, MAX_TRIES, RETRY_DELAY (read)
# Arguments: $1 - file name under DS_DIR
#            $2 - target collection
#            $3 - document title (stored in metadata_json)
#            $4 - license label (stored in metadata_json)
# Outputs:   one status line on stdout
# Returns:   0 on success, 1 if the file is missing or every attempt failed
#######################################
upload() {
  local file="$1" collection="$2" title="$3" license="$4"
  local path="$DS_DIR/$file"
  if [[ ! -f "$path" ]]; then
    echo " ✗ MISSING: $path"
    return 1
  fi

  local meta
  meta="{\"title\":\"$(json_escape "$title")\",\"license\":\"$(json_escape "$license")\",\"source\":\"iace_kb\"}"

  local try resp="" count
  for ((try = 1; try <= MAX_TRIES; try++)); do
    # --form-string sends the JSON literally; plain -F would give ';type=' and
    # a leading '@' or '<' special meaning.
    resp="$("${CURL[@]}" -X POST "$RAG_URL/api/v1/documents/upload" \
      -F "file=@$path" -F "collection=$collection" \
      -F 'data_type=safety_kb' -F 'use_case=iace_risk' -F 'year=2024' \
      --form-string "metadata_json=$meta" 2>&1)"
    # Match in-shell rather than `echo | grep -q`: under pipefail that pipeline
    # can report failure when grep exits early and echo is killed by SIGPIPE.
    if [[ "$resp" == *chunks_count* ]]; then
      count="$(grep -o '"chunks_count":[0-9]*' <<<"$resp")"
      echo " ✓ $file -> $collection ($count)"
      return 0
    fi
    # Wait only between attempts, not after the final one.
    if ((try < MAX_TRIES)); then
      sleep "$RETRY_DELAY"
    fi
  done
  echo " ✗ FAILED $file: ${resp:0:120}"
  return 1
}

#######################################
# Create the collections, upload every entry of DOCS, and exit with 1 if any
# upload failed (0 otherwise).
#######################################
main() {
  local c entry file collection title license
  local rc=0

  # Fail fast: without curl every upload would burn all retries first.
  if ! command -v curl >/dev/null 2>&1; then
    echo "curl is required but was not found in PATH" >&2
    exit 127
  fi

  echo "Ingesting IACE KB into $RAG_URL"

  # Unique collections first.
  for c in bp_iace_accident_stats bp_iace_safety_kb; do
    create_collection "$c"
  done

  for entry in "${DOCS[@]}"; do
    IFS='|' read -r file collection title license <<<"$entry"
    upload "$file" "$collection" "$title" "$license" || rc=1
  done

  echo "Done (exit $rc)."
  exit "$rc"
}

main "$@"