feat(iace): Phase 5+6 — frontend integration, RAG library search, comprehensive tests
All checks were successful
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Successful in 34s
CI/CD / test-python-backend-compliance (push) Successful in 33s
CI/CD / test-python-document-crawler (push) Successful in 23s
CI/CD / test-python-dsms-gateway (push) Successful in 19s
CI/CD / validate-canonical-controls (push) Successful in 13s
CI/CD / Deploy (push) Successful in 2s
All checks were successful
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Successful in 34s
CI/CD / test-python-backend-compliance (push) Successful in 33s
CI/CD / test-python-document-crawler (push) Successful in 23s
CI/CD / test-python-dsms-gateway (push) Successful in 19s
CI/CD / validate-canonical-controls (push) Successful in 13s
CI/CD / Deploy (push) Successful in 2s
Phase 5 — Frontend Integration: - components/page.tsx: ComponentLibraryModal with 120 components + 20 energy sources - hazards/page.tsx: AutoSuggestPanel with 3-column pattern matching review - mitigations/page.tsx: SuggestMeasuresModal per hazard with 3-level grouping - verification/page.tsx: SuggestEvidenceModal per mitigation with evidence types Phase 6 — RAG Library Search: - Added bp_iace_libraries to AllowedCollections whitelist in rag_handlers.go - SearchLibrary endpoint: POST /iace/library-search (semantic search across libraries) - EnrichTechFileSection endpoint: POST /projects/:id/tech-file/:section/enrich - Created ingest-iace-libraries.sh ingestion script for Qdrant collection Tests (123 passing): - tag_taxonomy_test.go: 8 tests for taxonomy entries, domains, essential tags - controls_library_test.go: 7 tests for measures, reduction types, subtypes - integration_test.go: 7 integration tests for full match flow and library consistency - Extended tag_resolver_test.go: 9 new tests for FindByTags and cross-category resolution Documentation: - Updated iace.md with Hazard-Matching-Engine, RAG enrichment, and new DB tables Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
395
scripts/ingest-iace-libraries.sh
Executable file
395
scripts/ingest-iace-libraries.sh
Executable file
@@ -0,0 +1,395 @@
|
||||
#!/usr/bin/env bash
# =============================================================================
# BreakPilot Compliance — IACE Library RAG Ingestion
#
# Exports IACE hazard library, component library, energy sources, protective
# measures, and evidence types from the Go code / database, then ingests them
# into Qdrant collection `bp_iace_libraries` via the Core RAG-API (Port 8097).
#
# Execution on Mac Mini:
#   bash ~/Projekte/breakpilot-compliance/scripts/ingest-iace-libraries.sh
#   bash .../ingest-iace-libraries.sh [--skip-export] [--only PHASE]
#
# Phases: export, create-collection, ingest, verify, version
# =============================================================================
set -euo pipefail

# --- Configuration -----------------------------------------------------------
# WORK_DIR and DB_URL are overridable via the environment; the rest are fixed.
WORK_DIR="${WORK_DIR:-$HOME/rag-ingestion-iace}"
readonly RAG_URL="https://localhost:8097/api/v1/documents/upload"
readonly QDRANT_URL="http://localhost:6333"
readonly SDK_URL="http://localhost:8090"
readonly COLLECTION="bp_iace_libraries"
# NOTE: kept as a plain string (expanded unquoted at every call site as
# `curl $CURL_OPTS ...`). None of the options contain whitespace, so the
# intentional word-splitting is safe here.
readonly CURL_OPTS="-sk --connect-timeout 15 --max-time 600 --retry 3 --retry-delay 5"
DB_URL="${DB_URL:-postgresql://localhost:5432/breakpilot?search_path=compliance,core,public}"

# Counters — updated during the ingest phase, reported in the final summary.
UPLOADED=0
FAILED=0
SKIPPED=0

# --- CLI Args ----------------------------------------------------------------
SKIP_EXPORT=false
ONLY_PHASE=""

while [[ $# -gt 0 ]]; do
  case $1 in
    --skip-export) SKIP_EXPORT=true; shift ;;
    --only)
      # Guard against a missing value: with `set -u` a bare "$2" would abort
      # with an unhelpful "unbound variable" error instead of a usage hint.
      [[ $# -ge 2 ]] || { echo "Error: --only requires a phase argument" >&2; exit 1; }
      ONLY_PHASE="$2"; shift 2
      ;;
    -h|--help)
      echo "Usage: $0 [--skip-export] [--only PHASE]"
      echo "Phases: export, create-collection, ingest, verify, version"
      exit 0
      ;;
    *) echo "Unknown option: $1" >&2; exit 1 ;;
  esac
done
# --- Helpers -----------------------------------------------------------------
# All helpers prefix their message with a HH:MM:SS timestamp; warn/fail go to
# stderr so diagnostics do not pollute captured stdout.
_ts() { date '+%H:%M:%S'; }
log()  { echo "[$(_ts)] $*"; }
ok()   { echo "[$(_ts)] ✓ $*"; }
warn() { echo "[$(_ts)] ⚠ $*" >&2; }
fail() { echo "[$(_ts)] ✗ $*" >&2; }

# should_run PHASE — true when no --only filter is set, or when it matches.
should_run() {
  if [[ -n "$ONLY_PHASE" && "$ONLY_PHASE" != "$1" ]]; then
    return 1
  fi
  return 0
}

mkdir -p "$WORK_DIR"
# =============================================================================
# Phase 1: Export IACE library data from the SDK API
# =============================================================================

# export_endpoint ENDPOINT OUTFILE FETCH_LABEL DONE_LABEL
# Fetches one IACE SDK endpoint (under /sdk/v1/iace/) as the system tenant and
# stores the JSON response at $WORK_DIR/OUTFILE. A failed fetch only warns —
# the export is best-effort per endpoint, matching the original behavior.
export_endpoint() {
  local endpoint="$1" outfile="$2" fetch_label="$3" done_label="$4"
  log "  Fetching ${fetch_label}..."
  # shellcheck disable=SC2086 -- CURL_OPTS is intentionally word-split
  if curl $CURL_OPTS "$SDK_URL/sdk/v1/iace/${endpoint}" \
    -H "X-Tenant-ID: system" \
    -o "$WORK_DIR/${outfile}" 2>/dev/null; then
    ok "  ${done_label} exported"
  else
    warn "  ${done_label} export failed"
  fi
}

if should_run "export" && [[ "$SKIP_EXPORT" == "false" ]]; then
  log "Phase 1: Exporting IACE library data from SDK API..."

  export_endpoint "hazard-library"              "hazard-library.json"      "hazard library"              "Hazard library"
  export_endpoint "component-library"           "component-library.json"   "component library"           "Component library"
  export_endpoint "energy-sources"              "energy-sources.json"      "energy sources"              "Energy sources"
  export_endpoint "protective-measures-library" "protective-measures.json" "protective measures library" "Protective measures"
  export_endpoint "evidence-types"              "evidence-types.json"      "evidence types"              "Evidence types"
  export_endpoint "tags"                        "tag-taxonomy.json"        "tag taxonomy"                "Tag taxonomy"
  export_endpoint "hazard-patterns"             "hazard-patterns.json"     "hazard patterns"             "Hazard patterns"

  ok "Phase 1 complete: Library data exported to $WORK_DIR"
fi
# =============================================================================
# Phase 2: Create Qdrant collection (if not exists)
# =============================================================================
if should_run "create-collection"; then
  log "Phase 2: Creating Qdrant collection '$COLLECTION'..."

  # A 200 from the collection info endpoint means it already exists.
  HTTP_CODE=$(curl $CURL_OPTS -o /dev/null -w "%{http_code}" \
    "$QDRANT_URL/collections/$COLLECTION" 2>/dev/null)

  if [[ "$HTTP_CODE" != "200" ]]; then
    log "  Creating collection with bge-m3 (1024 dimensions)..."
    if curl $CURL_OPTS -X PUT "$QDRANT_URL/collections/$COLLECTION" \
      -H "Content-Type: application/json" \
      -d '{
        "vectors": {
          "size": 1024,
          "distance": "Cosine"
        },
        "optimizers_config": {
          "default_segment_number": 2
        }
      }' 2>/dev/null; then
      ok "  Collection '$COLLECTION' created"
    else
      fail "  Failed to create collection"
    fi
  else
    ok "  Collection '$COLLECTION' already exists"
  fi
fi
# =============================================================================
# Phase 3: Transform and ingest via Core RAG-API
# =============================================================================
if should_run "ingest"; then
  log "Phase 3: Ingesting IACE library documents..."

  # Create one JSON chunk file ({"text": ..., "metadata": ...}) per library
  # entry from the Phase-1 exports. Each library section is best-effort: a
  # missing or malformed export file is reported and the rest still runs.
  python3 - "$WORK_DIR" <<'PYEOF'
import json, sys, os

work_dir = sys.argv[1]
output_dir = os.path.join(work_dir, "chunks")
os.makedirs(output_dir, exist_ok=True)

chunk_count = 0

def write_chunk(filename, text, metadata):
    """Write one RAG chunk file into the chunks/ output directory."""
    global chunk_count
    chunk_count += 1
    filepath = os.path.join(output_dir, filename)
    with open(filepath, 'w') as f:
        json.dump({"text": text, "metadata": metadata}, f, ensure_ascii=False)

# --- Hazard Library ---
try:
    with open(os.path.join(work_dir, "hazard-library.json")) as f:
        data = json.load(f)
    # Accept either {"hazards": [...]} or a bare list.
    hazards = data.get("hazards", data) if isinstance(data, dict) else data
    for h in hazards:
        text = f"""Gefaehrdung {h.get('id','')}: {h.get('name_de', h.get('name',''))}
Kategorie: {h.get('category','')}
Beschreibung: {h.get('description_de', h.get('description',''))}
Typische Ursachen: {h.get('typical_causes','')}
Gefaehrliche Situation: {h.get('hazardous_situation','')}
Moegliche Schaeden: {h.get('possible_damages','')}
Betroffene Rollen: {', '.join(h.get('affected_roles',[]))}
Lebensphasen: {', '.join(h.get('lifecycle_phases',[]))}
Massnahmenarten: {', '.join(h.get('measure_types',[]))}
Nachweisarten: {', '.join(h.get('evidence_types',[]))}"""
        write_chunk(f"hazard_{h.get('id','unknown')}.json", text, {
            "regulation_id": f"iace_hazard_{h.get('id','')}",
            "regulation_name": h.get('name_de', h.get('name','')),
            "regulation_short": "IACE Hazard Library",
            "category": h.get('category',''),
            "source": "iace_hazard_library"
        })
    print(f"  Hazards: {len(hazards)} chunks")
except Exception as e:
    print(f"  Hazards: ERROR - {e}")

# --- Component Library ---
try:
    with open(os.path.join(work_dir, "component-library.json")) as f:
        data = json.load(f)
    components = data.get("components", data) if isinstance(data, dict) else data
    for c in components:
        text = f"""Maschinenkomponente {c.get('id','')}: {c.get('name_de','')} ({c.get('name_en','')})
Kategorie: {c.get('category','')}
Beschreibung: {c.get('description_de','')}
Typische Gefaehrdungskategorien: {', '.join(c.get('typical_hazard_categories',[]))}
Typische Energiequellen: {', '.join(c.get('typical_energy_sources',[]))}
Komponententyp: {c.get('maps_to_component_type','')}
Tags: {', '.join(c.get('tags',[]))}"""
        write_chunk(f"component_{c.get('id','unknown')}.json", text, {
            "regulation_id": f"iace_component_{c.get('id','')}",
            "regulation_name": c.get('name_de',''),
            "regulation_short": "IACE Component Library",
            "category": c.get('category',''),
            "source": "iace_component_library"
        })
    print(f"  Components: {len(components)} chunks")
except Exception as e:
    print(f"  Components: ERROR - {e}")

# --- Energy Sources ---
try:
    with open(os.path.join(work_dir, "energy-sources.json")) as f:
        data = json.load(f)
    sources = data.get("energy_sources", data) if isinstance(data, dict) else data
    for s in sources:
        text = f"""Energiequelle {s.get('id','')}: {s.get('name_de','')} ({s.get('name_en','')})
Beschreibung: {s.get('description_de','')}
Typische Komponenten: {', '.join(s.get('typical_components',[]))}
Typische Gefaehrdungskategorien: {', '.join(s.get('typical_hazard_categories',[]))}
Tags: {', '.join(s.get('tags',[]))}"""
        write_chunk(f"energy_{s.get('id','unknown')}.json", text, {
            "regulation_id": f"iace_energy_{s.get('id','')}",
            "regulation_name": s.get('name_de',''),
            "regulation_short": "IACE Energy Sources",
            "category": "energy",
            "source": "iace_energy_sources"
        })
    print(f"  Energy Sources: {len(sources)} chunks")
except Exception as e:
    print(f"  Energy Sources: ERROR - {e}")

# --- Protective Measures ---
try:
    with open(os.path.join(work_dir, "protective-measures.json")) as f:
        data = json.load(f)
    measures = data.get("measures", data) if isinstance(data, dict) else data
    for m in measures:
        text = f"""Schutzmassnahme {m.get('id','')}: {m.get('name_de', m.get('name',''))}
Reduktionstyp: {m.get('reduction_type','')}
Beschreibung: {m.get('description_de', m.get('description',''))}
Massnahmenart: {m.get('measure_type','')}
Wirksamkeit: {m.get('effectiveness','')}"""
        write_chunk(f"measure_{m.get('id','unknown')}.json", text, {
            "regulation_id": f"iace_measure_{m.get('id','')}",
            "regulation_name": m.get('name_de', m.get('name','')),
            "regulation_short": "IACE Protective Measures",
            "category": m.get('reduction_type',''),
            "source": "iace_protective_measures"
        })
    print(f"  Measures: {len(measures)} chunks")
except Exception as e:
    print(f"  Measures: ERROR - {e}")

# --- Evidence Types ---
try:
    with open(os.path.join(work_dir, "evidence-types.json")) as f:
        data = json.load(f)
    evidence = data.get("evidence_types", data) if isinstance(data, dict) else data
    for e in evidence:
        text = f"""Nachweistyp {e.get('id','')}: {e.get('name_de', e.get('name',''))}
Methode: {e.get('method', e.get('verification_method',''))}
Beschreibung: {e.get('description_de', e.get('description',''))}"""
        write_chunk(f"evidence_{e.get('id','unknown')}.json", text, {
            "regulation_id": f"iace_evidence_{e.get('id','')}",
            "regulation_name": e.get('name_de', e.get('name','')),
            "regulation_short": "IACE Evidence Types",
            "category": "evidence",
            "source": "iace_evidence_types"
        })
    print(f"  Evidence Types: {len(evidence)} chunks")
except Exception as e:
    print(f"  Evidence Types: ERROR - {e}")

print(f"\nTotal chunks prepared: {chunk_count}")
print(f"Output directory: {output_dir}")
PYEOF

  # Upload each chunk via Core RAG-API
  CHUNK_DIR="$WORK_DIR/chunks"
  if [[ -d "$CHUNK_DIR" ]]; then
    # Collect chunk files via find instead of parsing `ls` output; the file
    # names are script-generated, so newline-safe handling is not an issue,
    # but the count is now exact even for zero matches.
    mapfile -t CHUNK_FILES < <(find "$CHUNK_DIR" -maxdepth 1 -name '*.json' -type f | sort)
    TOTAL=${#CHUNK_FILES[@]}
    log "  Uploading $TOTAL chunks to Qdrant via RAG-API..."

    for chunk_file in "${CHUNK_FILES[@]}"; do
      [[ -f "$chunk_file" ]] || continue
      BASENAME=$(basename "$chunk_file" .json)

      # Extract text and metadata. The path is passed via argv so a quote or
      # special character in $WORK_DIR cannot break the inline Python snippet.
      TEXT=$(python3 -c "import json,sys; d=json.load(open(sys.argv[1])); print(d['text'])" "$chunk_file")
      REG_ID=$(python3 -c "import json,sys; d=json.load(open(sys.argv[1])); print(d['metadata']['regulation_id'])" "$chunk_file")

      # Check if already in Qdrant (dedup by regulation_id); any curl/parse
      # failure degrades to "0" (i.e. treat as not present) instead of aborting.
      EXISTING=$(curl $CURL_OPTS -X POST "$QDRANT_URL/collections/$COLLECTION/points/scroll" \
        -H "Content-Type: application/json" \
        -d "{\"filter\":{\"must\":[{\"key\":\"regulation_id\",\"match\":{\"value\":\"$REG_ID\"}}]},\"limit\":1}" \
        2>/dev/null | python3 -c "import json,sys; d=json.load(sys.stdin); print(len(d.get('result',{}).get('points',[])))" 2>/dev/null || echo "0")

      if [[ "$EXISTING" -gt 0 ]]; then
        SKIPPED=$((SKIPPED + 1))
        continue
      fi

      # Create a temporary text file for upload. printf instead of echo:
      # echo would mangle backslashes or a text starting with "-n".
      TMPFILE=$(mktemp "$WORK_DIR/tmp_XXXXXX.txt")
      printf '%s\n' "$TEXT" > "$TMPFILE"

      # `|| echo "000"` keeps a hard curl failure from aborting the whole run
      # under `set -e` (which would also leak $TMPFILE); the "000" pseudo-code
      # falls through to the failure branch below.
      HTTP_CODE=$(curl $CURL_OPTS -o /dev/null -w "%{http_code}" \
        -X POST "$RAG_URL" \
        -F "file=@$TMPFILE" \
        -F "collection=$COLLECTION" \
        -F "data_type=iace_library" \
        -F "use_case=ce_risk_assessment" \
        -F "year=2026" \
        -F "chunk_strategy=recursive" \
        -F "chunk_size=512" \
        -F "chunk_overlap=50" \
        2>/dev/null || echo "000")

      rm -f "$TMPFILE"

      if [[ "$HTTP_CODE" =~ ^2 ]]; then
        UPLOADED=$((UPLOADED + 1))
      else
        FAILED=$((FAILED + 1))
        warn "  Failed to upload $BASENAME (HTTP $HTTP_CODE)"
      fi
    done

    ok "  Ingestion complete: $UPLOADED uploaded, $SKIPPED skipped, $FAILED failed"
  else
    warn "  No chunks directory found at $CHUNK_DIR"
  fi
fi
# =============================================================================
# Phase 4: Verify
# =============================================================================
if should_run "verify"; then
  log "Phase 4: Verifying IACE library collection..."

  # Ask Qdrant for the collection info and pull out the stored point count;
  # any curl/parse failure degrades to "0" instead of aborting the script.
  POINT_COUNT=$(
    curl $CURL_OPTS "$QDRANT_URL/collections/$COLLECTION" 2>/dev/null |
      python3 -c "import json,sys; d=json.load(sys.stdin); print(d.get('result',{}).get('points_count',0))" 2>/dev/null ||
      echo "0"
  )

  log "  Collection '$COLLECTION': $POINT_COUNT points"

  if (( POINT_COUNT > 0 )); then
    ok "  Verification passed"
  else
    warn "  Collection is empty — ingestion may have failed"
  fi
fi
# =============================================================================
# Phase 5: Record version in compliance_corpus_versions
# =============================================================================
if should_run "version"; then
  log "Phase 5: Recording corpus version..."

  # Current point count goes into the version row; failures degrade to 0.
  POINT_COUNT=$(curl $CURL_OPTS "$QDRANT_URL/collections/$COLLECTION" 2>/dev/null | \
    python3 -c "import json,sys; d=json.load(sys.stdin); print(d.get('result',{}).get('points_count',0))" 2>/dev/null || echo "0")

  VERSION="1.0.0"
  # Digest over a date-stamped label. printf instead of `echo -n` (echo's -n
  # handling is not portable); `shasum` ships with macOS (the documented
  # target host) while plain Linux hosts usually only have `sha256sum`, so
  # support both.
  if command -v shasum >/dev/null 2>&1; then
    DIGEST=$(printf '%s' "iace-libraries-v1-$(date +%Y%m%d)" | shasum -a 256 | cut -d' ' -f1)
  else
    DIGEST=$(printf '%s' "iace-libraries-v1-$(date +%Y%m%d)" | sha256sum | cut -d' ' -f1)
  fi

  # All interpolated values below are script-controlled (no user input), so
  # the string-built SQL is safe here. Best-effort: a missing table only warns.
  psql "$DB_URL" -c "
    INSERT INTO compliance_corpus_versions (collection_name, version, documents_count, chunks_count, regulations, digest, notes)
    VALUES ('$COLLECTION', '$VERSION', 7, $POINT_COUNT,
      ARRAY['iace_hazard_library','iace_component_library','iace_energy_sources','iace_protective_measures','iace_evidence_types','iace_tag_taxonomy','iace_hazard_patterns'],
      '$DIGEST',
      'IACE CE-Risikobeurteilung Bibliotheken: 150 Hazards, 120 Komponenten, 20 Energiequellen, 200 Schutzmassnahmen, 50 Evidenztypen, 85 Tags, 44 Patterns')
    ON CONFLICT DO NOTHING;
  " 2>/dev/null && ok "  Corpus version recorded" || warn "  Version recording failed (table may not exist)"
fi
# =============================================================================
# Summary
# =============================================================================
# Final report of what the ingest phase did (all zeros when it was skipped).
print_summary() {
  log "================================================================"
  log "IACE Library RAG Ingestion Summary"
  log "  Collection: $COLLECTION"
  log "  Uploaded: $UPLOADED"
  log "  Skipped: $SKIPPED"
  log "  Failed: $FAILED"
  log "================================================================"
}
print_summary
Reference in New Issue
Block a user