Files
breakpilot-compliance/scripts/ingest-iace-libraries.sh
Benjamin Admin c52dbdb8f1
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 42s
CI/CD / test-python-backend-compliance (push) Successful in 1m38s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 17s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
feat(rag): optimize RAG pipeline — JSON-Mode, CoT, Hybrid Search, Re-Ranking, Cross-Reg Dedup, chunk 1024
Phase 1 (LLM Quality):
- Add format=json to all Ollama payloads (obligation_extractor, control_generator, citation_backfill)
- Add Chain-of-Thought analysis steps to Pass 0a/0b system prompts

Phase 2 (Retrieval Quality):
- Hybrid search via Qdrant Query API with RRF fusion + automatic text index (legal_rag.go)
- Fallback to dense-only search if Query API unavailable
- Cross-encoder re-ranking with BGE Reranker v2 (RERANK_ENABLED=false by default)
- CPU-only PyTorch dependency to keep Docker image small

Phase 3 (Data Layer):
- Cross-regulation dedup pass (threshold 0.95) links controls across regulations
- DedupResult.link_type field distinguishes dedup_merge vs cross_regulation
- Chunk size defaults updated 512/50 → 1024/128 for new ingestions only
- Existing collections and controls are NOT affected

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-21 11:49:43 +01:00

396 lines
16 KiB
Bash
Executable File

#!/usr/bin/env bash
# =============================================================================
# BreakPilot Compliance — IACE Library RAG Ingestion
#
# Exports IACE hazard library, component library, energy sources, protective
# measures, and evidence types from the Go code / database, then ingests them
# into Qdrant collection `bp_iace_libraries` via the Core RAG-API (Port 8097).
#
# Execution on Mac Mini:
# bash ~/Projekte/breakpilot-compliance/scripts/ingest-iace-libraries.sh
# bash .../ingest-iace-libraries.sh [--skip-export] [--only PHASE]
#
# Phases: export, create-collection, ingest, verify, version
# =============================================================================
set -euo pipefail

# --- Configuration -----------------------------------------------------------
# All service endpoints and the collection name are env-overridable, mirroring
# the existing WORK_DIR / DB_URL convention, so the script can run against a
# non-default deployment without editing the file.
WORK_DIR="${WORK_DIR:-$HOME/rag-ingestion-iace}"
RAG_URL="${RAG_URL:-https://localhost:8097/api/v1/documents/upload}"
QDRANT_URL="${QDRANT_URL:-http://localhost:6333}"
SDK_URL="${SDK_URL:-http://localhost:8090}"
COLLECTION="${COLLECTION:-bp_iace_libraries}"
# Deliberately a flat string, expanded unquoted at every call site; safe only
# because none of the options contain whitespace — keep it that way.
CURL_OPTS="-sk --connect-timeout 15 --max-time 600 --retry 3 --retry-delay 5"
DB_URL="${DB_URL:-postgresql://localhost:5432/breakpilot?search_path=compliance,core,public}"

# Counters (mutated by the ingest phase, reported in the summary)
UPLOADED=0
FAILED=0
SKIPPED=0
# --- CLI Args ----------------------------------------------------------------
SKIP_EXPORT=false
ONLY_PHASE=""
while [[ $# -gt 0 ]]; do
  case $1 in
    --skip-export) SKIP_EXPORT=true; shift ;;
    --only)
      # Guard: under `set -u` a missing PHASE argument would previously abort
      # with an opaque "unbound variable" error; fail with a clear message.
      if [[ $# -lt 2 ]]; then
        echo "Error: --only requires a PHASE argument" >&2
        exit 1
      fi
      ONLY_PHASE="$2"; shift 2 ;;
    -h|--help)
      echo "Usage: $0 [--skip-export] [--only PHASE]"
      echo "Phases: export, create-collection, ingest, verify, version"
      exit 0
      ;;
    *) echo "Unknown option: $1"; exit 1 ;;
  esac
done
# --- Helpers -----------------------------------------------------------------
# Timestamped loggers: log/ok go to stdout, warn/fail to stderr.
_ts()  { date '+%H:%M:%S'; }
log()  { echo "[$(_ts)] $*"; }
ok()   { echo "[$(_ts)] ✓ $*"; }
warn() { echo "[$(_ts)] ⚠ $*" >&2; }
fail() { echo "[$(_ts)] ✗ $*" >&2; }
# True when no --only filter was given, or when the filter matches phase $1.
should_run() {
  [[ -z "$ONLY_PHASE" || "$ONLY_PHASE" == "$1" ]]
}
mkdir -p "$WORK_DIR"
# =============================================================================
# Phase 1: Export IACE library data from the SDK API
# =============================================================================
# Fetch one IACE library endpoint into $WORK_DIR (best-effort: a failed
# export is logged but does not abort the run).
#   $1 endpoint path segment under $SDK_URL/sdk/v1/iace/
#   $2 output filename inside $WORK_DIR
#   $3 lower-case label for the "Fetching ..." line
#   $4 capitalized label for the result line
export_iace_library() {
  local endpoint="$1" outfile="$2" fetch_label="$3" done_label="$4"
  log " Fetching ${fetch_label}..."
  # `if curl` keeps a transport failure from tripping `set -e`.
  if curl $CURL_OPTS "$SDK_URL/sdk/v1/iace/$endpoint" \
       -H "X-Tenant-ID: system" \
       -o "$WORK_DIR/$outfile" 2>/dev/null; then
    ok " ${done_label} exported"
  else
    warn " ${done_label} export failed"
  fi
}

if should_run "export" && [[ "$SKIP_EXPORT" == "false" ]]; then
  log "Phase 1: Exporting IACE library data from SDK API..."
  export_iace_library "hazard-library"              "hazard-library.json"      "hazard library"              "Hazard library"
  export_iace_library "component-library"           "component-library.json"   "component library"           "Component library"
  export_iace_library "energy-sources"              "energy-sources.json"      "energy sources"              "Energy sources"
  export_iace_library "protective-measures-library" "protective-measures.json" "protective measures library" "Protective measures"
  export_iace_library "evidence-types"              "evidence-types.json"      "evidence types"              "Evidence types"
  export_iace_library "tags"                        "tag-taxonomy.json"        "tag taxonomy"                "Tag taxonomy"
  export_iace_library "hazard-patterns"             "hazard-patterns.json"     "hazard patterns"             "Hazard patterns"
  ok "Phase 1 complete: Library data exported to $WORK_DIR"
fi
# =============================================================================
# Phase 2: Create Qdrant collection (if not exists)
# =============================================================================
if should_run "create-collection"; then
  log "Phase 2: Creating Qdrant collection '$COLLECTION'..."
  # Check if collection exists. `|| true`: with `set -e`, a hard curl failure
  # (Qdrant down) would otherwise kill the script; curl -w still emits "000".
  HTTP_CODE=$(curl $CURL_OPTS -o /dev/null -w "%{http_code}" \
    "$QDRANT_URL/collections/$COLLECTION" 2>/dev/null) || true
  if [[ "$HTTP_CODE" == "200" ]]; then
    ok " Collection '$COLLECTION' already exists"
  else
    log " Creating collection with bge-m3 (1024 dimensions)..."
    # Judge success by the HTTP status, not curl's exit code: with -s, curl
    # exits 0 even when Qdrant answers 4xx/5xx, which previously reported a
    # failed creation as success.
    CREATE_CODE=$(curl $CURL_OPTS -o /dev/null -w "%{http_code}" \
      -X PUT "$QDRANT_URL/collections/$COLLECTION" \
      -H "Content-Type: application/json" \
      -d '{
        "vectors": {
          "size": 1024,
          "distance": "Cosine"
        },
        "optimizers_config": {
          "default_segment_number": 2
        }
      }' 2>/dev/null) || true
    if [[ "$CREATE_CODE" =~ ^2 ]]; then
      ok " Collection '$COLLECTION' created"
    else
      fail " Failed to create collection (HTTP ${CREATE_CODE:-000})"
    fi
  fi
fi
# =============================================================================
# Phase 3: Transform and ingest via Core RAG-API
# =============================================================================
if should_run "ingest"; then
  log "Phase 3: Ingesting IACE library documents..."
  # Flatten the JSON exports into one small chunk file per library entry:
  # {"text": <German RAG text>, "metadata": {...}} under $WORK_DIR/chunks.
  python3 - "$WORK_DIR" <<'PYEOF'
import json, sys, os

work_dir = sys.argv[1]
output_dir = os.path.join(work_dir, "chunks")
os.makedirs(output_dir, exist_ok=True)

chunk_count = 0

def write_chunk(filename, text, metadata):
    # Persist one chunk as {"text": ..., "metadata": ...} for the upload loop.
    global chunk_count
    chunk_count += 1
    filepath = os.path.join(output_dir, filename)
    with open(filepath, 'w') as f:
        json.dump({"text": text, "metadata": metadata}, f, ensure_ascii=False)

# --- Hazard Library ---
try:
    with open(os.path.join(work_dir, "hazard-library.json")) as f:
        data = json.load(f)
    # Accept either {"hazards": [...]} or a bare list.
    hazards = data.get("hazards", data) if isinstance(data, dict) else data
    for h in hazards:
        text = f"""Gefaehrdung {h.get('id','')}: {h.get('name_de', h.get('name',''))}
Kategorie: {h.get('category','')}
Beschreibung: {h.get('description_de', h.get('description',''))}
Typische Ursachen: {h.get('typical_causes','')}
Gefaehrliche Situation: {h.get('hazardous_situation','')}
Moegliche Schaeden: {h.get('possible_damages','')}
Betroffene Rollen: {', '.join(h.get('affected_roles',[]))}
Lebensphasen: {', '.join(h.get('lifecycle_phases',[]))}
Massnahmenarten: {', '.join(h.get('measure_types',[]))}
Nachweisarten: {', '.join(h.get('evidence_types',[]))}"""
        write_chunk(f"hazard_{h.get('id','unknown')}.json", text, {
            "regulation_id": f"iace_hazard_{h.get('id','')}",
            "regulation_name": h.get('name_de', h.get('name','')),
            "regulation_short": "IACE Hazard Library",
            "category": h.get('category',''),
            "source": "iace_hazard_library"
        })
    print(f" Hazards: {len(hazards)} chunks")
except Exception as e:
    print(f" Hazards: ERROR - {e}")

# --- Component Library ---
try:
    with open(os.path.join(work_dir, "component-library.json")) as f:
        data = json.load(f)
    components = data.get("components", data) if isinstance(data, dict) else data
    for c in components:
        text = f"""Maschinenkomponente {c.get('id','')}: {c.get('name_de','')} ({c.get('name_en','')})
Kategorie: {c.get('category','')}
Beschreibung: {c.get('description_de','')}
Typische Gefaehrdungskategorien: {', '.join(c.get('typical_hazard_categories',[]))}
Typische Energiequellen: {', '.join(c.get('typical_energy_sources',[]))}
Komponententyp: {c.get('maps_to_component_type','')}
Tags: {', '.join(c.get('tags',[]))}"""
        write_chunk(f"component_{c.get('id','unknown')}.json", text, {
            "regulation_id": f"iace_component_{c.get('id','')}",
            "regulation_name": c.get('name_de',''),
            "regulation_short": "IACE Component Library",
            "category": c.get('category',''),
            "source": "iace_component_library"
        })
    print(f" Components: {len(components)} chunks")
except Exception as e:
    print(f" Components: ERROR - {e}")

# --- Energy Sources ---
try:
    with open(os.path.join(work_dir, "energy-sources.json")) as f:
        data = json.load(f)
    sources = data.get("energy_sources", data) if isinstance(data, dict) else data
    for s in sources:
        text = f"""Energiequelle {s.get('id','')}: {s.get('name_de','')} ({s.get('name_en','')})
Beschreibung: {s.get('description_de','')}
Typische Komponenten: {', '.join(s.get('typical_components',[]))}
Typische Gefaehrdungskategorien: {', '.join(s.get('typical_hazard_categories',[]))}
Tags: {', '.join(s.get('tags',[]))}"""
        write_chunk(f"energy_{s.get('id','unknown')}.json", text, {
            "regulation_id": f"iace_energy_{s.get('id','')}",
            "regulation_name": s.get('name_de',''),
            "regulation_short": "IACE Energy Sources",
            "category": "energy",
            "source": "iace_energy_sources"
        })
    print(f" Energy Sources: {len(sources)} chunks")
except Exception as e:
    print(f" Energy Sources: ERROR - {e}")

# --- Protective Measures ---
try:
    with open(os.path.join(work_dir, "protective-measures.json")) as f:
        data = json.load(f)
    measures = data.get("measures", data) if isinstance(data, dict) else data
    for m in measures:
        text = f"""Schutzmassnahme {m.get('id','')}: {m.get('name_de', m.get('name',''))}
Reduktionstyp: {m.get('reduction_type','')}
Beschreibung: {m.get('description_de', m.get('description',''))}
Massnahmenart: {m.get('measure_type','')}
Wirksamkeit: {m.get('effectiveness','')}"""
        write_chunk(f"measure_{m.get('id','unknown')}.json", text, {
            "regulation_id": f"iace_measure_{m.get('id','')}",
            "regulation_name": m.get('name_de', m.get('name','')),
            "regulation_short": "IACE Protective Measures",
            "category": m.get('reduction_type',''),
            "source": "iace_protective_measures"
        })
    print(f" Measures: {len(measures)} chunks")
except Exception as e:
    print(f" Measures: ERROR - {e}")

# --- Evidence Types ---
try:
    with open(os.path.join(work_dir, "evidence-types.json")) as f:
        data = json.load(f)
    evidence = data.get("evidence_types", data) if isinstance(data, dict) else data
    # Loop variable renamed `ev` so it cannot shadow the `except ... as e` name.
    for ev in evidence:
        text = f"""Nachweistyp {ev.get('id','')}: {ev.get('name_de', ev.get('name',''))}
Methode: {ev.get('method', ev.get('verification_method',''))}
Beschreibung: {ev.get('description_de', ev.get('description',''))}"""
        write_chunk(f"evidence_{ev.get('id','unknown')}.json", text, {
            "regulation_id": f"iace_evidence_{ev.get('id','')}",
            "regulation_name": ev.get('name_de', ev.get('name','')),
            "regulation_short": "IACE Evidence Types",
            "category": "evidence",
            "source": "iace_evidence_types"
        })
    print(f" Evidence Types: {len(evidence)} chunks")
except Exception as e:
    print(f" Evidence Types: ERROR - {e}")

print(f"\nTotal chunks prepared: {chunk_count}")
print(f"Output directory: {output_dir}")
PYEOF
  # Upload each chunk via Core RAG-API
  CHUNK_DIR="$WORK_DIR/chunks"
  if [[ -d "$CHUNK_DIR" ]]; then
    # Count via a glob; parsing `ls` output breaks on unusual filenames.
    shopt -s nullglob
    CHUNK_FILES=("$CHUNK_DIR"/*.json)
    shopt -u nullglob
    TOTAL=${#CHUNK_FILES[@]}
    log " Uploading $TOTAL chunks to Qdrant via RAG-API..."
    for chunk_file in "$CHUNK_DIR"/*.json; do
      [[ -f "$chunk_file" ]] || continue
      BASENAME=$(basename "$chunk_file" .json)
      # Pass the path as argv instead of interpolating it into the -c source:
      # quotes/spaces in WORK_DIR can no longer break (or inject into) the
      # Python snippet. A malformed chunk file is counted as failed rather
      # than aborting the whole run via `set -e`.
      TEXT=$(python3 -c "import json,sys; d=json.load(open(sys.argv[1])); print(d['text'])" "$chunk_file" 2>/dev/null) || {
        FAILED=$((FAILED + 1)); warn " Unreadable chunk $BASENAME"; continue; }
      REG_ID=$(python3 -c "import json,sys; d=json.load(open(sys.argv[1])); print(d['metadata']['regulation_id'])" "$chunk_file" 2>/dev/null) || {
        FAILED=$((FAILED + 1)); warn " Unreadable chunk $BASENAME"; continue; }
      # Check if already in Qdrant (dedup by regulation_id). REG_ID is
      # script-generated (iace_* prefixes), so embedding it in the JSON
      # filter is safe.
      EXISTING=$(curl $CURL_OPTS -X POST "$QDRANT_URL/collections/$COLLECTION/points/scroll" \
        -H "Content-Type: application/json" \
        -d "{\"filter\":{\"must\":[{\"key\":\"regulation_id\",\"match\":{\"value\":\"$REG_ID\"}}]},\"limit\":1}" \
        2>/dev/null | python3 -c "import json,sys; d=json.load(sys.stdin); print(len(d.get('result',{}).get('points',[])))" 2>/dev/null || echo "0")
      # Guard: the arithmetic comparison must see a number.
      [[ "$EXISTING" =~ ^[0-9]+$ ]] || EXISTING=0
      if [[ "$EXISTING" -gt 0 ]]; then
        SKIPPED=$((SKIPPED + 1))
        continue
      fi
      # Create a temporary text file for upload. printf instead of echo:
      # immune to TEXT starting with -n/-e.
      TMPFILE=$(mktemp "$WORK_DIR/tmp_XXXXXX.txt")
      printf '%s\n' "$TEXT" > "$TMPFILE"
      # `|| true`: a hard curl failure (after retries) yields HTTP 000 and is
      # counted as failed below instead of killing the script via `set -e`.
      HTTP_CODE=$(curl $CURL_OPTS -o /dev/null -w "%{http_code}" \
        -X POST "$RAG_URL" \
        -F "file=@$TMPFILE" \
        -F "collection=$COLLECTION" \
        -F "data_type=iace_library" \
        -F "use_case=ce_risk_assessment" \
        -F "year=2026" \
        -F "chunk_strategy=recursive" \
        -F "chunk_size=1024" \
        -F "chunk_overlap=128" \
        2>/dev/null) || true
      rm -f "$TMPFILE"
      if [[ "$HTTP_CODE" =~ ^2 ]]; then
        UPLOADED=$((UPLOADED + 1))
      else
        FAILED=$((FAILED + 1))
        warn " Failed to upload $BASENAME (HTTP $HTTP_CODE)"
      fi
    done
    ok " Ingestion complete: $UPLOADED uploaded, $SKIPPED skipped, $FAILED failed"
  else
    warn " No chunks directory found at $CHUNK_DIR"
  fi
fi
# =============================================================================
# Phase 4: Verify
# =============================================================================
# Sanity check: the collection must hold at least one point after ingestion.
if should_run "verify"; then
  log "Phase 4: Verifying IACE library collection..."
  POINTS=$(curl $CURL_OPTS "$QDRANT_URL/collections/$COLLECTION" 2>/dev/null | \
    python3 -c "import json,sys; d=json.load(sys.stdin); print(d.get('result',{}).get('points_count',0))" 2>/dev/null || echo "0")
  log " Collection '$COLLECTION': $POINTS points"
  if [[ "$POINTS" -eq 0 ]]; then
    warn " Collection is empty — ingestion may have failed"
  else
    ok " Verification passed"
  fi
fi
# =============================================================================
# Phase 5: Record version in compliance_corpus_versions
# =============================================================================
if should_run "version"; then
  log "Phase 5: Recording corpus version..."
  # Current point count from Qdrant; falls back to 0 if the API is unreachable.
  POINT_COUNT=$(curl $CURL_OPTS "$QDRANT_URL/collections/$COLLECTION" 2>/dev/null | \
    python3 -c "import json,sys; d=json.load(sys.stdin); print(d.get('result',{}).get('points_count',0))" 2>/dev/null || echo "0")
  VERSION="1.0.0"
  # printf instead of `echo -n`: portable and immune to flag-like input.
  # shasum (not sha256sum) because this targets macOS (Mac Mini).
  DIGEST=$(printf '%s' "iace-libraries-v1-$(date +%Y%m%d)" | shasum -a 256 | cut -d' ' -f1)
  # Best-effort insert; stderr is suppressed deliberately because the table
  # may not exist yet, and the warn branch reports that case.
  if psql "$DB_URL" -c "
    INSERT INTO compliance_corpus_versions (collection_name, version, documents_count, chunks_count, regulations, digest, notes)
    VALUES ('$COLLECTION', '$VERSION', 7, $POINT_COUNT,
    ARRAY['iace_hazard_library','iace_component_library','iace_energy_sources','iace_protective_measures','iace_evidence_types','iace_tag_taxonomy','iace_hazard_patterns'],
    '$DIGEST',
    'IACE CE-Risikobeurteilung Bibliotheken: 150 Hazards, 120 Komponenten, 20 Energiequellen, 200 Schutzmassnahmen, 50 Evidenztypen, 85 Tags, 44 Patterns')
    ON CONFLICT DO NOTHING;
  " 2>/dev/null; then
    ok " Corpus version recorded"
  else
    warn " Version recording failed (table may not exist)"
  fi
fi
# =============================================================================
# Summary
# =============================================================================
# Final report: one log line per entry, identical wording to before.
SUMMARY_LINES=(
  "================================================================"
  "IACE Library RAG Ingestion Summary"
  " Collection: $COLLECTION"
  " Uploaded: $UPLOADED"
  " Skipped: $SKIPPED"
  " Failed: $FAILED"
  "================================================================"
)
for summary_line in "${SUMMARY_LINES[@]}"; do
  log "$summary_line"
done