#!/usr/bin/env bash
# =============================================================================
# BreakPilot Compliance — IACE Library RAG Ingestion
#
# Exports IACE hazard library, component library, energy sources, protective
# measures, and evidence types from the Go code / database, then ingests them
# into Qdrant collection `bp_iace_libraries` via the Core RAG-API (Port 8097).
#
# Execution on Mac Mini:
#   bash ~/Projekte/breakpilot-compliance/scripts/ingest-iace-libraries.sh
#   bash .../ingest-iace-libraries.sh [--skip-export] [--only PHASE]
#
# Phases: export, create-collection, ingest, verify, version
# =============================================================================
set -euo pipefail

# --- Configuration -----------------------------------------------------------
WORK_DIR="${WORK_DIR:-$HOME/rag-ingestion-iace}"
RAG_URL="https://localhost:8097/api/v1/documents/upload"
QDRANT_URL="http://localhost:6333"
SDK_URL="http://localhost:8090"
COLLECTION="bp_iace_libraries"
# curl options as an array so every flag stays one word — no unquoted
# word-splitting of a flat option string.
CURL_OPTS=(-sk --connect-timeout 15 --max-time 600 --retry 3 --retry-delay 5)
DB_URL="${DB_URL:-postgresql://localhost:5432/breakpilot?search_path=compliance,core,public}"

# Counters (summed across the ingest loop, reported in the summary)
UPLOADED=0
FAILED=0
SKIPPED=0

# --- CLI Args ----------------------------------------------------------------
SKIP_EXPORT=false
ONLY_PHASE=""
while [[ $# -gt 0 ]]; do
  case $1 in
    --skip-export) SKIP_EXPORT=true; shift ;;
    --only)
      # ${2:?} aborts with a clear diagnostic when the phase name is missing.
      ONLY_PHASE="${2:?--only requires a phase argument}"; shift 2 ;;
    -h|--help)
      echo "Usage: $0 [--skip-export] [--only PHASE]"
      echo "Phases: export, create-collection, ingest, verify, version"
      exit 0 ;;
    *) echo "Unknown option: $1"; exit 1 ;;
  esac
done

# --- Helpers -----------------------------------------------------------------
log()  { echo "[$(date '+%H:%M:%S')] $*"; }
ok()   { echo "[$(date '+%H:%M:%S')] ✓ $*"; }
warn() { echo "[$(date '+%H:%M:%S')] ⚠ $*" >&2; }
fail() { echo "[$(date '+%H:%M:%S')] ✗ $*" >&2; }

# should_run PHASE — true when no --only filter is set, or PHASE matches it.
should_run() {
  [[ -z "$ONLY_PHASE" ]] || [[ "$ONLY_PHASE" == "$1" ]]
}

# fetch_export ENDPOINT OUTFILE LABEL
# GET one IACE library endpoint from the SDK API into $WORK_DIR/OUTFILE.
# Failures are logged as warnings but never abort the script (best-effort
# export, same as the original per-endpoint `|| warn` handling).
fetch_export() {
  local endpoint=$1 outfile=$2 label=$3
  log "  Fetching $label..."
  if curl "${CURL_OPTS[@]}" "$SDK_URL/sdk/v1/iace/$endpoint" \
       -H "X-Tenant-ID: system" \
       -o "$WORK_DIR/$outfile" 2>/dev/null; then
    ok "  $label exported"
  else
    warn "  $label export failed"
  fi
}

mkdir -p "$WORK_DIR"

# =============================================================================
# Phase 1: Export IACE library data from the SDK API
# =============================================================================
if should_run "export" && [[ "$SKIP_EXPORT" == "false" ]]; then
  log "Phase 1: Exporting IACE library data from SDK API..."

  fetch_export "hazard-library"              "hazard-library.json"      "Hazard library"
  fetch_export "component-library"           "component-library.json"   "Component library"
  fetch_export "energy-sources"              "energy-sources.json"      "Energy sources"
  fetch_export "protective-measures-library" "protective-measures.json" "Protective measures"
  fetch_export "evidence-types"              "evidence-types.json"      "Evidence types"
  # NOTE(review): tag-taxonomy.json and hazard-patterns.json are exported here
  # but the Phase 3 transform below never chunks them, even though Phase 5
  # records them as part of the corpus — TODO confirm whether that is intended.
  fetch_export "tags"                        "tag-taxonomy.json"        "Tag taxonomy"
  fetch_export "hazard-patterns"             "hazard-patterns.json"     "Hazard patterns"

  ok "Phase 1 complete: Library data exported to $WORK_DIR"
fi

# =============================================================================
# Phase 2: Create Qdrant collection (if not exists)
# =============================================================================
if should_run "create-collection"; then
  log "Phase 2: Creating Qdrant collection '$COLLECTION'..."

  # Existence probe: 200 means the collection is already there.
  HTTP_CODE=$(curl "${CURL_OPTS[@]}" -o /dev/null -w "%{http_code}" \
    "$QDRANT_URL/collections/$COLLECTION" 2>/dev/null)

  if [[ "$HTTP_CODE" == "200" ]]; then
    ok "  Collection '$COLLECTION' already exists"
  else
    # Vector size 1024 matches the bge-m3 embedding model used by the RAG-API.
    log "  Creating collection with bge-m3 (1024 dimensions)..."
    if curl "${CURL_OPTS[@]}" -X PUT "$QDRANT_URL/collections/$COLLECTION" \
         -H "Content-Type: application/json" \
         -d '{
           "vectors": { "size": 1024, "distance": "Cosine" },
           "optimizers_config": { "default_segment_number": 2 }
         }' 2>/dev/null; then
      ok "  Collection '$COLLECTION' created"
    else
      # Deliberately non-fatal (matches original behavior): later phases will
      # surface the problem via empty-collection warnings.
      fail "  Failed to create collection"
    fi
  fi
fi

# =============================================================================
# Phase 3: Transform and ingest via Core RAG-API
# =============================================================================
if should_run "ingest"; then
  log "Phase 3: Ingesting IACE library documents..."

  # Turn the JSON exports into per-entry chunk files ({"text":..., "metadata":...})
  # that the upload loop below feeds to the RAG-API one by one.
  python3 - "$WORK_DIR" <<'PYEOF'
import json, sys, os

work_dir = sys.argv[1]
output_dir = os.path.join(work_dir, "chunks")
os.makedirs(output_dir, exist_ok=True)

chunk_count = 0

def write_chunk(filename, text, metadata):
    # One chunk file per library entry; metadata drives dedup + filtering.
    global chunk_count
    chunk_count += 1
    filepath = os.path.join(output_dir, filename)
    with open(filepath, 'w') as f:
        json.dump({"text": text, "metadata": metadata}, f, ensure_ascii=False)

# --- Hazard Library ---
try:
    with open(os.path.join(work_dir, "hazard-library.json")) as f:
        data = json.load(f)
    # Exports may be either {"hazards": [...]} or a bare list.
    hazards = data.get("hazards", data) if isinstance(data, dict) else data
    for h in hazards:
        text = f"""Gefaehrdung {h.get('id','')}: {h.get('name_de', h.get('name',''))}
Kategorie: {h.get('category','')}
Beschreibung: {h.get('description_de', h.get('description',''))}
Typische Ursachen: {h.get('typical_causes','')}
Gefaehrliche Situation: {h.get('hazardous_situation','')}
Moegliche Schaeden: {h.get('possible_damages','')}
Betroffene Rollen: {', '.join(h.get('affected_roles',[]))}
Lebensphasen: {', '.join(h.get('lifecycle_phases',[]))}
Massnahmenarten: {', '.join(h.get('measure_types',[]))}
Nachweisarten: {', '.join(h.get('evidence_types',[]))}"""
        write_chunk(f"hazard_{h.get('id','unknown')}.json", text, {
            "regulation_id": f"iace_hazard_{h.get('id','')}",
            "regulation_name": h.get('name_de', h.get('name','')),
            "regulation_short": "IACE Hazard Library",
            "category": h.get('category',''),
            "source": "iace_hazard_library"
        })
    print(f"  Hazards: {len(hazards)} chunks")
except Exception as e:
    print(f"  Hazards: ERROR - {e}")

# --- Component Library ---
try:
    with open(os.path.join(work_dir, "component-library.json")) as f:
        data = json.load(f)
    components = data.get("components", data) if isinstance(data, dict) else data
    for c in components:
        text = f"""Maschinenkomponente {c.get('id','')}: {c.get('name_de','')} ({c.get('name_en','')})
Kategorie: {c.get('category','')}
Beschreibung: {c.get('description_de','')}
Typische Gefaehrdungskategorien: {', '.join(c.get('typical_hazard_categories',[]))}
Typische Energiequellen: {', '.join(c.get('typical_energy_sources',[]))}
Komponententyp: {c.get('maps_to_component_type','')}
Tags: {', '.join(c.get('tags',[]))}"""
        write_chunk(f"component_{c.get('id','unknown')}.json", text, {
            "regulation_id": f"iace_component_{c.get('id','')}",
            "regulation_name": c.get('name_de',''),
            "regulation_short": "IACE Component Library",
            "category": c.get('category',''),
            "source": "iace_component_library"
        })
    print(f"  Components: {len(components)} chunks")
except Exception as e:
    print(f"  Components: ERROR - {e}")

# --- Energy Sources ---
try:
    with open(os.path.join(work_dir, "energy-sources.json")) as f:
        data = json.load(f)
    sources = data.get("energy_sources", data) if isinstance(data, dict) else data
    for s in sources:
        text = f"""Energiequelle {s.get('id','')}: {s.get('name_de','')} ({s.get('name_en','')})
Beschreibung: {s.get('description_de','')}
Typische Komponenten: {', '.join(s.get('typical_components',[]))}
Typische Gefaehrdungskategorien: {', '.join(s.get('typical_hazard_categories',[]))}
Tags: {', '.join(s.get('tags',[]))}"""
        write_chunk(f"energy_{s.get('id','unknown')}.json", text, {
            "regulation_id": f"iace_energy_{s.get('id','')}",
            "regulation_name": s.get('name_de',''),
            "regulation_short": "IACE Energy Sources",
            "category": "energy",
            "source": "iace_energy_sources"
        })
    print(f"  Energy Sources: {len(sources)} chunks")
except Exception as e:
    print(f"  Energy Sources: ERROR - {e}")

# --- Protective Measures ---
try:
    with open(os.path.join(work_dir, "protective-measures.json")) as f:
        data = json.load(f)
    measures = data.get("measures", data) if isinstance(data, dict) else data
    for m in measures:
        text = f"""Schutzmassnahme {m.get('id','')}: {m.get('name_de', m.get('name',''))}
Reduktionstyp: {m.get('reduction_type','')}
Beschreibung: {m.get('description_de', m.get('description',''))}
Massnahmenart: {m.get('measure_type','')}
Wirksamkeit: {m.get('effectiveness','')}"""
        write_chunk(f"measure_{m.get('id','unknown')}.json", text, {
            "regulation_id": f"iace_measure_{m.get('id','')}",
            "regulation_name": m.get('name_de', m.get('name','')),
            "regulation_short": "IACE Protective Measures",
            "category": m.get('reduction_type',''),
            "source": "iace_protective_measures"
        })
    print(f"  Measures: {len(measures)} chunks")
except Exception as e:
    print(f"  Measures: ERROR - {e}")

# --- Evidence Types ---
try:
    with open(os.path.join(work_dir, "evidence-types.json")) as f:
        data = json.load(f)
    evidence = data.get("evidence_types", data) if isinstance(data, dict) else data
    # Loop variable named `ev` (not `e`) so it cannot be confused with the
    # `except ... as e` convention used throughout this script.
    for ev in evidence:
        text = f"""Nachweistyp {ev.get('id','')}: {ev.get('name_de', ev.get('name',''))}
Methode: {ev.get('method', ev.get('verification_method',''))}
Beschreibung: {ev.get('description_de', ev.get('description',''))}"""
        write_chunk(f"evidence_{ev.get('id','unknown')}.json", text, {
            "regulation_id": f"iace_evidence_{ev.get('id','')}",
            "regulation_name": ev.get('name_de', ev.get('name','')),
            "regulation_short": "IACE Evidence Types",
            "category": "evidence",
            "source": "iace_evidence_types"
        })
    print(f"  Evidence Types: {len(evidence)} chunks")
except Exception as e:
    print(f"  Evidence Types: ERROR - {e}")

print(f"\nTotal chunks prepared: {chunk_count}")
print(f"Output directory: {output_dir}")
PYEOF

  # Upload each chunk via Core RAG-API
  CHUNK_DIR="$WORK_DIR/chunks"
  if [[ -d "$CHUNK_DIR" ]]; then
    # Glob into an array instead of `ls | wc -l`; nullglob makes an empty
    # directory yield an empty array rather than the literal pattern.
    shopt -s nullglob
    chunk_files=("$CHUNK_DIR"/*.json)
    shopt -u nullglob
    log "  Uploading ${#chunk_files[@]} chunks to Qdrant via RAG-API..."

    for chunk_file in "${chunk_files[@]}"; do
      [[ -f "$chunk_file" ]] || continue
      BASENAME=$(basename "$chunk_file" .json)

      # Read the dedup key from the chunk file. Path is passed via argv so
      # quoting in filenames cannot break the inline Python.
      REG_ID=$(python3 -c 'import json, sys
print(json.load(open(sys.argv[1]))["metadata"]["regulation_id"])' "$chunk_file")

      # Dedup: skip chunks whose regulation_id is already present in Qdrant.
      # REG_ID values are generated by the transform above (ASCII ids), so
      # interpolating them into the JSON filter is safe here.
      EXISTING=$(curl "${CURL_OPTS[@]}" -X POST "$QDRANT_URL/collections/$COLLECTION/points/scroll" \
        -H "Content-Type: application/json" \
        -d "{\"filter\":{\"must\":[{\"key\":\"regulation_id\",\"match\":{\"value\":\"$REG_ID\"}}]},\"limit\":1}" \
        2>/dev/null | python3 -c "import json,sys; d=json.load(sys.stdin); print(len(d.get('result',{}).get('points',[])))" 2>/dev/null || echo "0")

      if [[ "$EXISTING" -gt 0 ]]; then
        SKIPPED=$((SKIPPED + 1))
        continue
      fi

      # Write the chunk text straight to a temp file from Python — avoids the
      # echo "$TEXT" round-trip, which can mangle trailing newlines.
      TMPFILE=$(mktemp "$WORK_DIR/tmp_XXXXXX.txt")
      python3 -c 'import json, sys
with open(sys.argv[2], "w") as out:
    out.write(json.load(open(sys.argv[1]))["text"])
    out.write("\n")' "$chunk_file" "$TMPFILE"

      HTTP_CODE=$(curl "${CURL_OPTS[@]}" -o /dev/null -w "%{http_code}" \
        -X POST "$RAG_URL" \
        -F "file=@$TMPFILE" \
        -F "collection=$COLLECTION" \
        -F "data_type=iace_library" \
        -F "use_case=ce_risk_assessment" \
        -F "year=2026" \
        -F "chunk_strategy=recursive" \
        -F "chunk_size=1024" \
        -F "chunk_overlap=128" \
        2>/dev/null)
      rm -f "$TMPFILE"

      if [[ "$HTTP_CODE" =~ ^2 ]]; then
        UPLOADED=$((UPLOADED + 1))
      else
        FAILED=$((FAILED + 1))
        warn "  Failed to upload $BASENAME (HTTP $HTTP_CODE)"
      fi
    done

    ok "  Ingestion complete: $UPLOADED uploaded, $SKIPPED skipped, $FAILED failed"
  else
    warn "  No chunks directory found at $CHUNK_DIR"
  fi
fi

# =============================================================================
# Phase 4: Verify
# =============================================================================
if should_run "verify"; then
  log "Phase 4: Verifying IACE library collection..."

  POINT_COUNT=$(curl "${CURL_OPTS[@]}" "$QDRANT_URL/collections/$COLLECTION" 2>/dev/null | \
    python3 -c "import json,sys; d=json.load(sys.stdin); print(d.get('result',{}).get('points_count',0))" 2>/dev/null || echo "0")

  log "  Collection '$COLLECTION': $POINT_COUNT points"
  if [[ "$POINT_COUNT" -gt 0 ]]; then
    ok "  Verification passed"
  else
    warn "  Collection is empty — ingestion may have failed"
  fi
fi

# =============================================================================
# Phase 5: Record version in compliance_corpus_versions
# =============================================================================
if should_run "version"; then
  log "Phase 5: Recording corpus version..."

  POINT_COUNT=$(curl "${CURL_OPTS[@]}" "$QDRANT_URL/collections/$COLLECTION" 2>/dev/null | \
    python3 -c "import json,sys; d=json.load(sys.stdin); print(d.get('result',{}).get('points_count',0))" 2>/dev/null || echo "0")

  VERSION="1.0.0"
  # printf instead of echo -n: portable, and immune to echo flag handling.
  # shasum is used (not sha256sum) because the target host is macOS.
  DIGEST=$(printf '%s' "iace-libraries-v1-$(date +%Y%m%d)" | shasum -a 256 | cut -d' ' -f1)

  psql "$DB_URL" -c "
    INSERT INTO compliance_corpus_versions
      (collection_name, version, documents_count, chunks_count, regulations, digest, notes)
    VALUES
      ('$COLLECTION', '$VERSION', 7, $POINT_COUNT,
       ARRAY['iace_hazard_library','iace_component_library','iace_energy_sources','iace_protective_measures','iace_evidence_types','iace_tag_taxonomy','iace_hazard_patterns'],
       '$DIGEST',
       'IACE CE-Risikobeurteilung Bibliotheken: 150 Hazards, 120 Komponenten, 20 Energiequellen, 200 Schutzmassnahmen, 50 Evidenztypen, 85 Tags, 44 Patterns')
    ON CONFLICT DO NOTHING;
  " 2>/dev/null && ok "  Corpus version recorded" || warn "  Version recording failed (table may not exist)"
fi

# =============================================================================
# Summary
# =============================================================================
log "================================================================"
log "IACE Library RAG Ingestion Summary"
log "  Collection: $COLLECTION"
log "  Uploaded:   $UPLOADED"
log "  Skipped:    $SKIPPED"
log "  Failed:     $FAILED"
log "================================================================"