feat(iace): Phase 5+6 — frontend integration, RAG library search, comprehensive tests
All checks were successful
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Successful in 34s
CI/CD / test-python-backend-compliance (push) Successful in 33s
CI/CD / test-python-document-crawler (push) Successful in 23s
CI/CD / test-python-dsms-gateway (push) Successful in 19s
CI/CD / validate-canonical-controls (push) Successful in 13s
CI/CD / Deploy (push) Successful in 2s
All checks were successful
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Successful in 34s
CI/CD / test-python-backend-compliance (push) Successful in 33s
CI/CD / test-python-document-crawler (push) Successful in 23s
CI/CD / test-python-dsms-gateway (push) Successful in 19s
CI/CD / validate-canonical-controls (push) Successful in 13s
CI/CD / Deploy (push) Successful in 2s
Phase 5 — Frontend Integration: - components/page.tsx: ComponentLibraryModal with 120 components + 20 energy sources - hazards/page.tsx: AutoSuggestPanel with 3-column pattern matching review - mitigations/page.tsx: SuggestMeasuresModal per hazard with 3-level grouping - verification/page.tsx: SuggestEvidenceModal per mitigation with evidence types Phase 6 — RAG Library Search: - Added bp_iace_libraries to AllowedCollections whitelist in rag_handlers.go - SearchLibrary endpoint: POST /iace/library-search (semantic search across libraries) - EnrichTechFileSection endpoint: POST /projects/:id/tech-file/:section/enrich - Created ingest-iace-libraries.sh ingestion script for Qdrant collection Tests (123 passing): - tag_taxonomy_test.go: 8 tests for taxonomy entries, domains, essential tags - controls_library_test.go: 7 tests for measures, reduction types, subtypes - integration_test.go: 7 integration tests for full match flow and library consistency - Extended tag_resolver_test.go: 9 new tests for FindByTags and cross-category resolution Documentation: - Updated iace.md with Hazard-Matching-Engine, RAG enrichment, and new DB tables Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
395
scripts/ingest-iace-libraries.sh
Executable file
395
scripts/ingest-iace-libraries.sh
Executable file
@@ -0,0 +1,395 @@
|
||||
#!/usr/bin/env bash
# =============================================================================
# BreakPilot Compliance — IACE Library RAG Ingestion
#
# Exports IACE hazard library, component library, energy sources, protective
# measures, and evidence types from the Go code / database, then ingests them
# into Qdrant collection `bp_iace_libraries` via the Core RAG-API (Port 8097).
#
# Execution on Mac Mini:
#   bash ~/Projekte/breakpilot-compliance/scripts/ingest-iace-libraries.sh
#   bash .../ingest-iace-libraries.sh [--skip-export] [--only PHASE]
#
# Phases: export, create-collection, ingest, verify, version
# =============================================================================
set -euo pipefail

# --- Configuration -----------------------------------------------------------
# WORK_DIR and DB_URL are overridable via the environment; the rest are fixed.
WORK_DIR="${WORK_DIR:-$HOME/rag-ingestion-iace}"
readonly RAG_URL="https://localhost:8097/api/v1/documents/upload"
readonly QDRANT_URL="http://localhost:6333"
readonly SDK_URL="http://localhost:8090"
readonly COLLECTION="bp_iace_libraries"
# NOTE: kept as a plain string (expanded unquoted at every call site as
# `curl $CURL_OPTS ...`). None of the options contain whitespace, so the
# intentional word-splitting is safe here.
readonly CURL_OPTS="-sk --connect-timeout 15 --max-time 600 --retry 3 --retry-delay 5"
DB_URL="${DB_URL:-postgresql://localhost:5432/breakpilot?search_path=compliance,core,public}"

# Counters — updated during the ingest phase, reported in the final summary.
UPLOADED=0
FAILED=0
SKIPPED=0

# --- CLI Args ----------------------------------------------------------------
SKIP_EXPORT=false
ONLY_PHASE=""

while [[ $# -gt 0 ]]; do
  case $1 in
    --skip-export) SKIP_EXPORT=true; shift ;;
    --only)
      # Guard against a missing value: with `set -u` a bare "$2" would abort
      # with an unhelpful "unbound variable" error instead of a usage hint.
      [[ $# -ge 2 ]] || { echo "Error: --only requires a phase argument" >&2; exit 1; }
      ONLY_PHASE="$2"; shift 2
      ;;
    -h|--help)
      echo "Usage: $0 [--skip-export] [--only PHASE]"
      echo "Phases: export, create-collection, ingest, verify, version"
      exit 0
      ;;
    *) echo "Unknown option: $1" >&2; exit 1 ;;
  esac
done
# --- Helpers -----------------------------------------------------------------
# All helpers prefix their message with a HH:MM:SS timestamp; warn/fail go to
# stderr so diagnostics do not pollute captured stdout.
_ts() { date '+%H:%M:%S'; }
log()  { echo "[$(_ts)] $*"; }
ok()   { echo "[$(_ts)] ✓ $*"; }
warn() { echo "[$(_ts)] ⚠ $*" >&2; }
fail() { echo "[$(_ts)] ✗ $*" >&2; }

# should_run PHASE — true when no --only filter is set, or when it matches.
should_run() {
  if [[ -n "$ONLY_PHASE" && "$ONLY_PHASE" != "$1" ]]; then
    return 1
  fi
  return 0
}

mkdir -p "$WORK_DIR"
# =============================================================================
# Phase 1: Export IACE library data from the SDK API
# =============================================================================

# export_endpoint ENDPOINT OUTFILE FETCH_LABEL DONE_LABEL
# Fetches one IACE SDK endpoint (under /sdk/v1/iace/) as the system tenant and
# stores the JSON response at $WORK_DIR/OUTFILE. A failed fetch only warns —
# the export is best-effort per endpoint, matching the original behavior.
export_endpoint() {
  local endpoint="$1" outfile="$2" fetch_label="$3" done_label="$4"
  log "  Fetching ${fetch_label}..."
  # shellcheck disable=SC2086 -- CURL_OPTS is intentionally word-split
  if curl $CURL_OPTS "$SDK_URL/sdk/v1/iace/${endpoint}" \
    -H "X-Tenant-ID: system" \
    -o "$WORK_DIR/${outfile}" 2>/dev/null; then
    ok "  ${done_label} exported"
  else
    warn "  ${done_label} export failed"
  fi
}

if should_run "export" && [[ "$SKIP_EXPORT" == "false" ]]; then
  log "Phase 1: Exporting IACE library data from SDK API..."

  export_endpoint "hazard-library"              "hazard-library.json"      "hazard library"              "Hazard library"
  export_endpoint "component-library"           "component-library.json"   "component library"           "Component library"
  export_endpoint "energy-sources"              "energy-sources.json"      "energy sources"              "Energy sources"
  export_endpoint "protective-measures-library" "protective-measures.json" "protective measures library" "Protective measures"
  export_endpoint "evidence-types"              "evidence-types.json"      "evidence types"              "Evidence types"
  export_endpoint "tags"                        "tag-taxonomy.json"        "tag taxonomy"                "Tag taxonomy"
  export_endpoint "hazard-patterns"             "hazard-patterns.json"     "hazard patterns"             "Hazard patterns"

  ok "Phase 1 complete: Library data exported to $WORK_DIR"
fi
# =============================================================================
# Phase 2: Create Qdrant collection (if not exists)
# =============================================================================
if should_run "create-collection"; then
  log "Phase 2: Creating Qdrant collection '$COLLECTION'..."

  # A 200 from the collection info endpoint means it already exists.
  HTTP_CODE=$(curl $CURL_OPTS -o /dev/null -w "%{http_code}" \
    "$QDRANT_URL/collections/$COLLECTION" 2>/dev/null)

  if [[ "$HTTP_CODE" != "200" ]]; then
    log "  Creating collection with bge-m3 (1024 dimensions)..."
    if curl $CURL_OPTS -X PUT "$QDRANT_URL/collections/$COLLECTION" \
      -H "Content-Type: application/json" \
      -d '{
        "vectors": {
          "size": 1024,
          "distance": "Cosine"
        },
        "optimizers_config": {
          "default_segment_number": 2
        }
      }' 2>/dev/null; then
      ok "  Collection '$COLLECTION' created"
    else
      fail "  Failed to create collection"
    fi
  else
    ok "  Collection '$COLLECTION' already exists"
  fi
fi
# =============================================================================
# Phase 3: Transform and ingest via Core RAG-API
# =============================================================================
if should_run "ingest"; then
  log "Phase 3: Ingesting IACE library documents..."

  # Create one JSON chunk file ({"text": ..., "metadata": ...}) per library
  # entry from the Phase-1 exports. Each library section is best-effort: a
  # missing or malformed export file is reported and the rest still runs.
  python3 - "$WORK_DIR" <<'PYEOF'
import json, sys, os

work_dir = sys.argv[1]
output_dir = os.path.join(work_dir, "chunks")
os.makedirs(output_dir, exist_ok=True)

chunk_count = 0

def write_chunk(filename, text, metadata):
    """Write one RAG chunk file into the chunks/ output directory."""
    global chunk_count
    chunk_count += 1
    filepath = os.path.join(output_dir, filename)
    with open(filepath, 'w') as f:
        json.dump({"text": text, "metadata": metadata}, f, ensure_ascii=False)

# --- Hazard Library ---
try:
    with open(os.path.join(work_dir, "hazard-library.json")) as f:
        data = json.load(f)
    # Accept either {"hazards": [...]} or a bare list.
    hazards = data.get("hazards", data) if isinstance(data, dict) else data
    for h in hazards:
        text = f"""Gefaehrdung {h.get('id','')}: {h.get('name_de', h.get('name',''))}
Kategorie: {h.get('category','')}
Beschreibung: {h.get('description_de', h.get('description',''))}
Typische Ursachen: {h.get('typical_causes','')}
Gefaehrliche Situation: {h.get('hazardous_situation','')}
Moegliche Schaeden: {h.get('possible_damages','')}
Betroffene Rollen: {', '.join(h.get('affected_roles',[]))}
Lebensphasen: {', '.join(h.get('lifecycle_phases',[]))}
Massnahmenarten: {', '.join(h.get('measure_types',[]))}
Nachweisarten: {', '.join(h.get('evidence_types',[]))}"""
        write_chunk(f"hazard_{h.get('id','unknown')}.json", text, {
            "regulation_id": f"iace_hazard_{h.get('id','')}",
            "regulation_name": h.get('name_de', h.get('name','')),
            "regulation_short": "IACE Hazard Library",
            "category": h.get('category',''),
            "source": "iace_hazard_library"
        })
    print(f"  Hazards: {len(hazards)} chunks")
except Exception as e:
    print(f"  Hazards: ERROR - {e}")

# --- Component Library ---
try:
    with open(os.path.join(work_dir, "component-library.json")) as f:
        data = json.load(f)
    components = data.get("components", data) if isinstance(data, dict) else data
    for c in components:
        text = f"""Maschinenkomponente {c.get('id','')}: {c.get('name_de','')} ({c.get('name_en','')})
Kategorie: {c.get('category','')}
Beschreibung: {c.get('description_de','')}
Typische Gefaehrdungskategorien: {', '.join(c.get('typical_hazard_categories',[]))}
Typische Energiequellen: {', '.join(c.get('typical_energy_sources',[]))}
Komponententyp: {c.get('maps_to_component_type','')}
Tags: {', '.join(c.get('tags',[]))}"""
        write_chunk(f"component_{c.get('id','unknown')}.json", text, {
            "regulation_id": f"iace_component_{c.get('id','')}",
            "regulation_name": c.get('name_de',''),
            "regulation_short": "IACE Component Library",
            "category": c.get('category',''),
            "source": "iace_component_library"
        })
    print(f"  Components: {len(components)} chunks")
except Exception as e:
    print(f"  Components: ERROR - {e}")

# --- Energy Sources ---
try:
    with open(os.path.join(work_dir, "energy-sources.json")) as f:
        data = json.load(f)
    sources = data.get("energy_sources", data) if isinstance(data, dict) else data
    for s in sources:
        text = f"""Energiequelle {s.get('id','')}: {s.get('name_de','')} ({s.get('name_en','')})
Beschreibung: {s.get('description_de','')}
Typische Komponenten: {', '.join(s.get('typical_components',[]))}
Typische Gefaehrdungskategorien: {', '.join(s.get('typical_hazard_categories',[]))}
Tags: {', '.join(s.get('tags',[]))}"""
        write_chunk(f"energy_{s.get('id','unknown')}.json", text, {
            "regulation_id": f"iace_energy_{s.get('id','')}",
            "regulation_name": s.get('name_de',''),
            "regulation_short": "IACE Energy Sources",
            "category": "energy",
            "source": "iace_energy_sources"
        })
    print(f"  Energy Sources: {len(sources)} chunks")
except Exception as e:
    print(f"  Energy Sources: ERROR - {e}")

# --- Protective Measures ---
try:
    with open(os.path.join(work_dir, "protective-measures.json")) as f:
        data = json.load(f)
    measures = data.get("measures", data) if isinstance(data, dict) else data
    for m in measures:
        text = f"""Schutzmassnahme {m.get('id','')}: {m.get('name_de', m.get('name',''))}
Reduktionstyp: {m.get('reduction_type','')}
Beschreibung: {m.get('description_de', m.get('description',''))}
Massnahmenart: {m.get('measure_type','')}
Wirksamkeit: {m.get('effectiveness','')}"""
        write_chunk(f"measure_{m.get('id','unknown')}.json", text, {
            "regulation_id": f"iace_measure_{m.get('id','')}",
            "regulation_name": m.get('name_de', m.get('name','')),
            "regulation_short": "IACE Protective Measures",
            "category": m.get('reduction_type',''),
            "source": "iace_protective_measures"
        })
    print(f"  Measures: {len(measures)} chunks")
except Exception as e:
    print(f"  Measures: ERROR - {e}")

# --- Evidence Types ---
try:
    with open(os.path.join(work_dir, "evidence-types.json")) as f:
        data = json.load(f)
    evidence = data.get("evidence_types", data) if isinstance(data, dict) else data
    for e in evidence:
        text = f"""Nachweistyp {e.get('id','')}: {e.get('name_de', e.get('name',''))}
Methode: {e.get('method', e.get('verification_method',''))}
Beschreibung: {e.get('description_de', e.get('description',''))}"""
        write_chunk(f"evidence_{e.get('id','unknown')}.json", text, {
            "regulation_id": f"iace_evidence_{e.get('id','')}",
            "regulation_name": e.get('name_de', e.get('name','')),
            "regulation_short": "IACE Evidence Types",
            "category": "evidence",
            "source": "iace_evidence_types"
        })
    print(f"  Evidence Types: {len(evidence)} chunks")
except Exception as e:
    print(f"  Evidence Types: ERROR - {e}")

print(f"\nTotal chunks prepared: {chunk_count}")
print(f"Output directory: {output_dir}")
PYEOF

  # Upload each chunk via Core RAG-API
  CHUNK_DIR="$WORK_DIR/chunks"
  if [[ -d "$CHUNK_DIR" ]]; then
    # Collect chunk files via find instead of parsing `ls` output; the file
    # names are script-generated, so newline-safe handling is not an issue,
    # but the count is now exact even for zero matches.
    mapfile -t CHUNK_FILES < <(find "$CHUNK_DIR" -maxdepth 1 -name '*.json' -type f | sort)
    TOTAL=${#CHUNK_FILES[@]}
    log "  Uploading $TOTAL chunks to Qdrant via RAG-API..."

    for chunk_file in "${CHUNK_FILES[@]}"; do
      [[ -f "$chunk_file" ]] || continue
      BASENAME=$(basename "$chunk_file" .json)

      # Extract text and metadata. The path is passed via argv so a quote or
      # special character in $WORK_DIR cannot break the inline Python snippet.
      TEXT=$(python3 -c "import json,sys; d=json.load(open(sys.argv[1])); print(d['text'])" "$chunk_file")
      REG_ID=$(python3 -c "import json,sys; d=json.load(open(sys.argv[1])); print(d['metadata']['regulation_id'])" "$chunk_file")

      # Check if already in Qdrant (dedup by regulation_id); any curl/parse
      # failure degrades to "0" (i.e. treat as not present) instead of aborting.
      EXISTING=$(curl $CURL_OPTS -X POST "$QDRANT_URL/collections/$COLLECTION/points/scroll" \
        -H "Content-Type: application/json" \
        -d "{\"filter\":{\"must\":[{\"key\":\"regulation_id\",\"match\":{\"value\":\"$REG_ID\"}}]},\"limit\":1}" \
        2>/dev/null | python3 -c "import json,sys; d=json.load(sys.stdin); print(len(d.get('result',{}).get('points',[])))" 2>/dev/null || echo "0")

      if [[ "$EXISTING" -gt 0 ]]; then
        SKIPPED=$((SKIPPED + 1))
        continue
      fi

      # Create a temporary text file for upload. printf instead of echo:
      # echo would mangle backslashes or a text starting with "-n".
      TMPFILE=$(mktemp "$WORK_DIR/tmp_XXXXXX.txt")
      printf '%s\n' "$TEXT" > "$TMPFILE"

      # `|| echo "000"` keeps a hard curl failure from aborting the whole run
      # under `set -e` (which would also leak $TMPFILE); the "000" pseudo-code
      # falls through to the failure branch below.
      HTTP_CODE=$(curl $CURL_OPTS -o /dev/null -w "%{http_code}" \
        -X POST "$RAG_URL" \
        -F "file=@$TMPFILE" \
        -F "collection=$COLLECTION" \
        -F "data_type=iace_library" \
        -F "use_case=ce_risk_assessment" \
        -F "year=2026" \
        -F "chunk_strategy=recursive" \
        -F "chunk_size=512" \
        -F "chunk_overlap=50" \
        2>/dev/null || echo "000")

      rm -f "$TMPFILE"

      if [[ "$HTTP_CODE" =~ ^2 ]]; then
        UPLOADED=$((UPLOADED + 1))
      else
        FAILED=$((FAILED + 1))
        warn "  Failed to upload $BASENAME (HTTP $HTTP_CODE)"
      fi
    done

    ok "  Ingestion complete: $UPLOADED uploaded, $SKIPPED skipped, $FAILED failed"
  else
    warn "  No chunks directory found at $CHUNK_DIR"
  fi
fi
# =============================================================================
# Phase 4: Verify
# =============================================================================
if should_run "verify"; then
  log "Phase 4: Verifying IACE library collection..."

  # Ask Qdrant for the collection info and pull out the stored point count;
  # any curl/parse failure degrades to "0" instead of aborting the script.
  POINT_COUNT=$(
    curl $CURL_OPTS "$QDRANT_URL/collections/$COLLECTION" 2>/dev/null |
      python3 -c "import json,sys; d=json.load(sys.stdin); print(d.get('result',{}).get('points_count',0))" 2>/dev/null ||
      echo "0"
  )

  log "  Collection '$COLLECTION': $POINT_COUNT points"

  if (( POINT_COUNT > 0 )); then
    ok "  Verification passed"
  else
    warn "  Collection is empty — ingestion may have failed"
  fi
fi
# =============================================================================
# Phase 5: Record version in compliance_corpus_versions
# =============================================================================
if should_run "version"; then
  log "Phase 5: Recording corpus version..."

  # Current point count goes into the version row; failures degrade to 0.
  POINT_COUNT=$(curl $CURL_OPTS "$QDRANT_URL/collections/$COLLECTION" 2>/dev/null | \
    python3 -c "import json,sys; d=json.load(sys.stdin); print(d.get('result',{}).get('points_count',0))" 2>/dev/null || echo "0")

  VERSION="1.0.0"
  # Digest over a date-stamped label. printf instead of `echo -n` (echo's -n
  # handling is not portable); `shasum` ships with macOS (the documented
  # target host) while plain Linux hosts usually only have `sha256sum`, so
  # support both.
  if command -v shasum >/dev/null 2>&1; then
    DIGEST=$(printf '%s' "iace-libraries-v1-$(date +%Y%m%d)" | shasum -a 256 | cut -d' ' -f1)
  else
    DIGEST=$(printf '%s' "iace-libraries-v1-$(date +%Y%m%d)" | sha256sum | cut -d' ' -f1)
  fi

  # All interpolated values below are script-controlled (no user input), so
  # the string-built SQL is safe here. Best-effort: a missing table only warns.
  psql "$DB_URL" -c "
    INSERT INTO compliance_corpus_versions (collection_name, version, documents_count, chunks_count, regulations, digest, notes)
    VALUES ('$COLLECTION', '$VERSION', 7, $POINT_COUNT,
      ARRAY['iace_hazard_library','iace_component_library','iace_energy_sources','iace_protective_measures','iace_evidence_types','iace_tag_taxonomy','iace_hazard_patterns'],
      '$DIGEST',
      'IACE CE-Risikobeurteilung Bibliotheken: 150 Hazards, 120 Komponenten, 20 Energiequellen, 200 Schutzmassnahmen, 50 Evidenztypen, 85 Tags, 44 Patterns')
    ON CONFLICT DO NOTHING;
  " 2>/dev/null && ok "  Corpus version recorded" || warn "  Version recording failed (table may not exist)"
fi
# =============================================================================
# Summary
# =============================================================================
# Final report of what the ingest phase did (all zeros when it was skipped).
print_summary() {
  log "================================================================"
  log "IACE Library RAG Ingestion Summary"
  log "  Collection: $COLLECTION"
  log "  Uploaded: $UPLOADED"
  log "  Skipped: $SKIPPED"
  log "  Failed: $FAILED"
  log "================================================================"
}
print_summary
Reference in New Issue
Block a user