755ea44343
- architecture.go: DataSources now reflect the real ingested set (ESAW 2023, BLS CFOI, OSHA OTM, PRISM, cobot CC-BY, HSE) with their RAG collections; risk stage cites BLS + the searchable RAG layer; matrix stage now mentions the distance-benchmark dimension. - Architektur & Datenfluss tab: new DataFlowDiagram — 4 lanes (input → knowledge/RAG-evidence → deterministic engine → outputs) with live counts. - scripts/ingest_iace_kb.sh: idempotent E1 ingest — creates the 2 collections and uploads the 6 datasources docs against a configurable RAG_URL (for prod Qdrant), with retry. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
80 lines
3.0 KiB
Bash
Executable File
80 lines
3.0 KiB
Bash
Executable File
#!/usr/bin/env bash
# Ingest the IACE open-source knowledge-base documents into a RAG/Qdrant target.
#
# Idempotent: creates the two collections (1024-dim, BGE-M3) if missing and
# uploads each versioned source doc with retry. Use it to populate a fresh
# Qdrant (e.g. production) from the repo — the docs under datasources/ are the
# single source of truth.
#
# Usage:
#   RAG_URL=https://rag.prod.example ./scripts/ingest_iace_kb.sh
#   ./scripts/ingest_iace_kb.sh        # defaults to https://127.0.0.1:8097
#
# Env:
#   RAG_URL   base URL of the RAG service (default https://127.0.0.1:8097)
#   INSECURE  set to 0 to disable curl -k (default 1, for self-signed dev certs).
#             With the default, TLS certificate verification is skipped; set
#             INSECURE=0 for targets that present a valid certificate.
# No `-e`: a failed upload must not abort the run; failures are collected in rc.
set -uo pipefail

# Base URL of the RAG service; overridable from the environment.
RAG_URL="${RAG_URL:-https://127.0.0.1:8097}"
# "1" adds curl -k (skip TLS certificate verification); any other value keeps it on.
INSECURE="${INSECURE:-1}"
# Vector size sent in the collection-creation request.
VECTOR_SIZE=1024

# Base curl command shared by every request: silent but still printing errors,
# capped at 120 seconds per call.
CURL=(curl -sS --max-time 120)
if [ "$INSECURE" = "1" ]; then
  CURL+=(-k)
fi

# Resolve the datasources dir relative to this script (repo-portable).
# CDPATH is cleared and cd's stdout discarded: when cd resolves a relative
# directory through CDPATH it prints the target, which would otherwise end up
# as an extra line inside SCRIPT_DIR.
SCRIPT_DIR="$(CDPATH='' cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null && pwd)" || {
  echo "cannot resolve the directory of ${BASH_SOURCE[0]}" >&2
  exit 1
}
DS_DIR="$SCRIPT_DIR/../internal/iace/datasources"

# One record per document, fields separated by '|':
#   file|collection|title|license
# (single indexed array, no associative arrays; macOS bash 3.2 compatible).
DOCS=(
  "esaw_accident_stats_2023.md|bp_iace_accident_stats|Accidents at work - ESAW 2023|CC BY 4.0"
  "bls_cfoi_fatal_2024.md|bp_iace_accident_stats|BLS CFOI fatal injuries 2023-24|US Public Domain"
  "prism_risk_methodology.md|bp_iace_safety_kb|PRISM risk methodology|OGL v3"
  "cobot_biomech_limits.md|bp_iace_safety_kb|Cobot biomechanical pain limits|CC BY 4.0"
  "hse_example_risk_assessments.md|bp_iace_safety_kb|HSE example risk assessments|OGL v3"
  "osha_robot_safety.md|bp_iace_safety_kb|OSHA industrial robot safety|US Public Domain"
)
# Ask the RAG service to create a collection (best effort).
# Globals:   CURL, RAG_URL, VECTOR_SIZE (read)
# Arguments: $1 - collection name; interpolated verbatim into the JSON body,
#            so it must not contain '"' or '\'.
# Outputs:   a progress line on stdout; a warning on stderr when curl itself
#            fails (DNS, connection, TLS, timeout).
# Returns:   always 0, so a failed create never stops the run.
create_collection() {
  local name="$1"
  local payload curl_err

  echo " • ensure collection $name (${VECTOR_SIZE}d)"
  payload="{\"name\":\"$name\",\"vector_size\":$VECTOR_SIZE}"

  # The response body is discarded and HTTP-level errors are not inspected
  # (curl runs without --fail) -- presumably because an already-existing
  # collection answers with an error; confirm against the service.
  # Only curl's own stderr is captured, so an unreachable target is reported
  # here instead of surfacing later as a string of failed uploads.
  if ! curl_err="$("${CURL[@]}" -X POST "$RAG_URL/api/v1/collections" \
    -H 'Content-Type: application/json' \
    -d "$payload" 2>&1 >/dev/null)"; then
    echo " ! request to $RAG_URL failed while ensuring $name: ${curl_err:-curl error}" >&2
  fi
  return 0
}
# Upload one datasource document to the RAG service, retrying on failure.
# Globals:   DS_DIR, RAG_URL, CURL (read)
# Arguments: $1 - file name under DS_DIR
#            $2 - target collection
#            $3 - title   (stored in metadata_json)
#            $4 - license (stored in metadata_json)
# Outputs:   one status line on stdout: a check mark with the reported chunk count,
#            or a cross with the reason (missing file / first 120 bytes of the
#            last response).
# Returns:   0 as soon as a response contains "chunks_count";
#            1 if the file is missing or all three attempts fail.
upload() {
  local file="$1" collection="$2" title="$3" license="$4"
  local path="$DS_DIR/$file"
  local max_attempts=3
  local meta attempt resp=""

  if [ ! -f "$path" ]; then
    echo " ✗ MISSING: $path"
    return 1
  fi

  # title/license are interpolated verbatim, so they must not contain '"' or '\'.
  meta="{\"title\":\"$title\",\"license\":\"$license\",\"source\":\"iace_kb\"}"

  for ((attempt = 1; attempt <= max_attempts; attempt++)); do
    # stderr is merged into resp so a transport error appears in the FAILED line.
    resp="$("${CURL[@]}" -X POST "$RAG_URL/api/v1/documents/upload" \
      -F "file=@$path" -F "collection=$collection" \
      -F 'data_type=safety_kb' -F 'use_case=iace_risk' -F 'year=2024' \
      -F "metadata_json=$meta" 2>&1)"

    # In-shell substring match instead of `echo | grep -q`: under pipefail the
    # pipeline can report failure on a large response because grep exits at the
    # first match and the writer is killed by SIGPIPE.
    if [[ "$resp" == *chunks_count* ]]; then
      echo " ✓ $file -> $collection ($(grep -o '"chunks_count":[0-9]*' <<<"$resp"))"
      return 0
    fi

    # Back off only when another attempt follows; no delay before reporting failure.
    if ((attempt < max_attempts)); then
      sleep 4
    fi
  done

  echo " ✗ FAILED $file: $(printf '%s' "$resp" | head -c 120)"
  return 1
}
echo "Ingesting IACE KB into $RAG_URL"

# Unique collections first, derived from DOCS in first-seen order so that a
# document added to DOCS with a new collection gets that collection created too.
# seen_collections is a '|'-delimited list (bash 3.2 has no associative arrays).
seen_collections="|"
for entry in "${DOCS[@]}"; do
  IFS='|' read -r _file collection _rest <<<"$entry"
  case "$seen_collections" in
    *"|$collection|"*) ;;
    *)
      seen_collections="$seen_collections$collection|"
      create_collection "$collection"
      ;;
  esac
done

# Upload every document; remember any failure but keep going so one bad
# document does not block the rest.
rc=0
for entry in "${DOCS[@]}"; do
  IFS='|' read -r file collection title license <<<"$entry"
  upload "$file" "$collection" "$title" "$license" || rc=1
done

echo "Done (exit $rc)."
exit "$rc"