#!/usr/bin/env bash
# =============================================================================
# BreakPilot Compliance — Phase I RAG Ingestion
#
# Downloads and ingests ~12 technical standards and guidelines, all into the
# bp_compliance_ce collection:
#   - 3 NIST Special Publications (800-160, 800-30, 800-82)
#   - 5 Open Standards (SLSA, SPDX, CycloneDX, OpenTelemetry, CVSS)
#   - 2 EU Guidelines (Machinery Guide, GPAI)
#   - 2 Optional (GPAI Scope, FDA Human Factors)
#
# All documents are commercially usable
# (Public Domain US / Apache-2.0 / CC-BY / EU Public).
#
# Environment overrides:
#   WORK_DIR    working directory; PDFs live in $WORK_DIR/pdfs
#   RAG_URL     upload endpoint of the RAG service
#   QDRANT_URL  base URL of the Qdrant instance used for the dedup check
#
# Requires: bash, curl, python3 (python3 is only used to parse JSON).
#
# Run on Mac Mini:
#   bash ~/Projekte/breakpilot-compliance/scripts/ingest-phase-i.sh
# =============================================================================
set -euo pipefail

WORK_DIR="${WORK_DIR:-$HOME/rag-ingestion-i}"
RAG_URL="${RAG_URL:-https://localhost:8097/api/v1/documents/upload}"
QDRANT_URL="${QDRANT_URL:-http://localhost:6333}"

# Options for the local RAG endpoint. -k stays here because RAG_URL defaults to
# https://localhost — NOTE(review): presumably a self-signed certificate.
CURL_OPTS=(-sk --connect-timeout 10 --max-time 300)
CURL_OPTS_LARGE=(-sk --connect-timeout 10 --max-time 900)
# Options for public downloads: certificate verification stays ON (no -k) and
# HTTP errors are real failures (-f) instead of being saved as "PDFs".
CURL_OPTS_DOWNLOAD=(-fsS --connect-timeout 10 --max-time 900)
readonly WORK_DIR RAG_URL QDRANT_URL
readonly CURL_OPTS CURL_OPTS_LARGE CURL_OPTS_DOWNLOAD

# Web-only specs that have to be printed to PDF by hand and dropped into
# $WORK_DIR/pdfs. Entry format: "file name|display name|source URL".
MANUAL_PDFS=(
  "slsa_v1_0.pdf|SLSA|https://slsa.dev/spec/draft/"
  "cyclonedx_spec.pdf|CycloneDX|https://cyclonedx.org/specification/overview/"
  "opentelemetry_spec.pdf|OpenTelemetry|https://opentelemetry.io/docs/specs/otel/"
  "gpai_code_of_practice.pdf|GPAI Code of Practice|https://digital-strategy.ec.europa.eu/en/policies/contents-code-gpai"
  "gpai_scope_guidelines.pdf|GPAI Scope Guidelines|https://digital-strategy.ec.europa.eu/en/policies/guidelines-gpai-providers"
)
readonly MANUAL_PDFS

# Run statistics, reported by main.
UPLOADED=0
FAILED=0
SKIPPED=0

# Timestamped logging helpers; warn/fail write to stderr.
log()  { printf '[%s] %s\n' "$(date '+%H:%M:%S')" "$*"; }
ok()   { printf '[%s] ok %s\n' "$(date '+%H:%M:%S')" "$*"; }
warn() { printf '[%s] WARN %s\n' "$(date '+%H:%M:%S')" "$*" >&2; }
fail() { printf '[%s] FAIL %s\n' "$(date '+%H:%M:%S')" "$*" >&2; }

#######################################
# Print the size of a file in bytes (0 if it cannot be read).
# Uses wc -c because the stat flags differ between BSD/macOS and GNU.
# Arguments: $1 - path
# Outputs:   size in bytes on stdout
#######################################
file_size() {
  local size
  size=$({ wc -c <"$1"; } 2>/dev/null) || size=0
  # Arithmetic expansion strips the padding BSD wc puts in front of the number.
  printf '%s\n' "$((size))"
}

#######################################
# Check whether a file carries the PDF signature near its start.
# Arguments: $1 - path
# Returns:   0 if "%PDF-" occurs within the first 1024 bytes, non-zero otherwise
#######################################
is_pdf() {
  # grep -c (rather than -q) reads all of its input, so head can never be
  # killed by SIGPIPE and turn a match into a pipefail failure.
  head -c 1024 "$1" 2>/dev/null | LC_ALL=C grep -a -c -- '%PDF-' >/dev/null
}

#######################################
# Download a PDF unless it is already present. Best effort: every failure is
# logged as a warning and the function still returns 0, so one unreachable
# source does not abort the run.
# The payload goes to "<target>.part" first and is renamed only after it
# passed the size and signature checks, so an interrupted run never leaves a
# truncated file that a later run would mistake for a finished download.
# Globals:   CURL_OPTS_DOWNLOAD (read)
# Arguments: $1 - source URL
#            $2 - target path
# Returns:   0 always
#######################################
download_pdf() {
  local url="$1"
  local target="$2"
  local name
  name=$(basename "$target")

  if [[ -f "$target" ]]; then
    log "PDF exists: ${name} (skipping download)"
    return 0
  fi

  log "Downloading: ${name}"
  local partial="${target}.part"
  if ! curl "${CURL_OPTS_DOWNLOAD[@]}" -L "$url" -o "$partial"; then
    warn "Download failed: $url"
    rm -f -- "$partial"
    return 0
  fi

  local fsize
  fsize=$(file_size "$partial")
  if ((fsize < 1000)); then
    warn "Download too small (${fsize}B): ${name}"
    rm -f -- "$partial"
    return 0
  fi

  # Servers sometimes answer 200 with an HTML page (consent wall, bot check).
  if ! is_pdf "$partial"; then
    warn "Download is not a PDF: ${name}"
    rm -f -- "$partial"
    return 0
  fi

  if ! mv -f -- "$partial" "$target"; then
    warn "Download failed: $url"
    rm -f -- "$partial"
    return 0
  fi
  log " Downloaded: $((fsize / 1024))KB"
}

#######################################
# Upload one document to the RAG service, unless Qdrant already holds a point
# with the same regulation_id.
# Globals:   RAG_URL, QDRANT_URL, CURL_OPTS, CURL_OPTS_LARGE (read)
#            UPLOADED, FAILED, SKIPPED (written)
# Arguments: $1 - file path
#            $2 - target collection
#            $3 - data_type form field
#            $4 - use_case form field
#            $5 - year form field
#            $6 - metadata as a JSON object string (regulation_id drives dedup)
#            $7 - label for log output (default: basename of $1)
# Returns:   0 always; the outcome is recorded in the counters
#######################################
upload_file() {
  local file="$1"
  local collection="$2"
  local data_type="$3"
  local use_case="$4"
  local year="$5"
  local metadata_json="$6"
  local label="${7:-$(basename "$file")}"

  if [[ ! -f "$file" ]]; then
    warn "File not found: $file"
    FAILED=$((FAILED + 1))
    return 0
  fi

  # Dedup check. Best effort: if python3 or Qdrant is unavailable the count
  # falls back to 0 and the document is uploaded anyway.
  local reg_id
  reg_id=$(python3 -c "import sys,json; print(json.load(sys.stdin).get('regulation_id',''))" \
    <<<"$metadata_json" 2>/dev/null || echo "")
  if [[ -n "$reg_id" ]]; then
    local existing
    existing=$(
      curl -sk --max-time 5 -X POST "${QDRANT_URL}/collections/${collection}/points/scroll" \
        -H "Content-Type: application/json" \
        -d "{\"filter\":{\"must\":[{\"key\":\"regulation_id\",\"match\":{\"value\":\"$reg_id\"}}]},\"limit\":1}" \
        2>/dev/null \
        | python3 -c "import sys,json; r=json.load(sys.stdin).get('result',{}); print(len(r.get('points',[])))" \
          2>/dev/null || echo "0"
    )
    # Anything that is not a plain number counts as "not found".
    if [[ "$existing" =~ ^[0-9]+$ ]] && ((existing > 0)); then
      log "SKIP (already in Qdrant): $label [regulation_id=$reg_id]"
      SKIPPED=$((SKIPPED + 1))
      return 0
    fi
  fi

  local filesize
  filesize=$(file_size "$file")
  if ((filesize < 100)); then
    warn "File too small (${filesize}B): $label"
    SKIPPED=$((SKIPPED + 1))
    return 0
  fi

  log "Uploading: $label -> $collection ($((filesize / 1024))KB)"

  # Files above ~250 KB get the longer transfer timeout.
  local -a curl_opts=("${CURL_OPTS[@]}")
  if ((filesize > 256000)); then
    curl_opts=("${CURL_OPTS_LARGE[@]}")
  fi

  # --form-string sends text fields literally; with -F curl would interpret
  # a leading '@' or '<' and any ';type=' inside the value.
  local response=""
  response=$(curl "${curl_opts[@]}" -X POST "$RAG_URL" \
    -F "file=@${file}" \
    --form-string "collection=${collection}" \
    --form-string "data_type=${data_type}" \
    --form-string "use_case=${use_case}" \
    --form-string "year=${year}" \
    --form-string "chunk_strategy=recursive" \
    --form-string "chunk_size=1024" \
    --form-string "chunk_overlap=128" \
    --form-string "metadata_json=${metadata_json}" \
    2>/dev/null) || true

  # Success is detected by the presence of a chunk/vector counter in the
  # response. ERE instead of the non-POSIX BRE '\|'; the here-string avoids an
  # echo|grep pipeline that pipefail could turn into a false failure.
  if grep -Eq '"chunks_count"|"vectors_indexed"' <<<"$response"; then
    local chunks
    chunks=$(python3 -c "import sys,json; d=json.load(sys.stdin); print(d.get('chunks_count', d.get('vectors_indexed',0)))" \
      <<<"$response" 2>/dev/null || echo "?")
    ok "$label -> $chunks chunks"
    UPLOADED=$((UPLOADED + 1))
  else
    fail "Upload failed: $label"
    fail "Response: ${response:0:200}"
    FAILED=$((FAILED + 1))
  fi
}

#######################################
# Upload a manually provided PDF if it is present; otherwise warn and count it
# as skipped.
# Globals:   SKIPPED (written)
# Arguments: $1  - warning to print when the file is missing
#            $2… - arguments for upload_file ($2 is the file path)
#######################################
upload_optional() {
  local missing_msg="$1"
  shift
  if [[ -f "$1" ]]; then
    upload_file "$@"
  else
    warn "$missing_msg"
    SKIPPED=$((SKIPPED + 1))
  fi
}

# =============================================================================
# PHASE I-1: Downloads
# =============================================================================
# Fetches every document that has a direct PDF link and reports which web-only
# specs (see MANUAL_PDFS) still have to be provided by hand.
phase_i_download() {
  log "=========================================="
  log "PHASE I-1: Downloads"
  log "=========================================="
  mkdir -p "$WORK_DIR/pdfs"

  # --- Priority 1: NIST Special Publications ---
  log "--- NIST Special Publications ---"

  download_pdf "https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-160v1r1.pdf" \
    "$WORK_DIR/pdfs/nist_sp_800_160v1r1.pdf"

  download_pdf "https://nvlpubs.nist.gov/nistpubs/Legacy/SP/nistspecialpublication800-30r1.pdf" \
    "$WORK_DIR/pdfs/nist_sp_800_30r1.pdf"

  download_pdf "https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-82r3.pdf" \
    "$WORK_DIR/pdfs/nist_sp_800_82r3.pdf"

  # --- Priority 1: Open Standards (available as PDF) ---
  log "--- Open Standards ---"

  download_pdf "https://spdx.dev/wp-content/uploads/sites/31/2024/12/SPDX-3.0.1-1.pdf" \
    "$WORK_DIR/pdfs/spdx_3_0_1.pdf"

  download_pdf "https://www.first.org/cvss/v4-0/cvss-v40-specification.pdf" \
    "$WORK_DIR/pdfs/cvss_v4_0.pdf"

  # --- Priority 2: EU + FDA ---
  log "--- EU + FDA Guidelines ---"

  download_pdf "https://ec.europa.eu/docsroom/documents/60145/attachments/1/translations/en/renditions/pdf" \
    "$WORK_DIR/pdfs/eu_machinery_guide_2006_42.pdf"

  download_pdf "https://www.fda.gov/media/80481/download" \
    "$WORK_DIR/pdfs/fda_human_factors.pdf"

  # --- Web-based specs: no PDF to download, must be provided manually ---
  log "--- Web-basierte Specs (SLSA, CycloneDX, OpenTelemetry) ---"
  log " Diese Specs sind primaer web-basiert. Lade GitHub-Repos als Referenz..."

  local entry file name url
  for entry in "${MANUAL_PDFS[@]}"; do
    IFS='|' read -r file name url <<<"$entry"
    if [[ ! -f "$WORK_DIR/pdfs/$file" ]]; then
      log " ${name}: Kein PDF verfuegbar — muss manuell als PDF gedruckt werden"
      log " Quelle: ${url}"
    fi
  done

  log "Downloads abgeschlossen."
}

# =============================================================================
# PHASE I-2: NIST Special Publications → bp_compliance_ce
# =============================================================================
phase_i_nist() {
  log "=========================================="
  log "PHASE I-2: NIST SPs -> bp_compliance_ce"
  log "=========================================="

  local col="bp_compliance_ce"

  upload_file "$WORK_DIR/pdfs/nist_sp_800_160v1r1.pdf" "$col" "compliance_ce" "security_engineering" "2022" \
    '{"regulation_id":"nist_sp_800_160v1r1","regulation_name_de":"NIST SP 800-160 Vol. 1 Rev. 1 — Engineering Trustworthy Secure Systems","regulation_name_en":"NIST SP 800-160 Vol. 1 Rev. 1 — Engineering Trustworthy Secure Systems","regulation_short":"NIST SP 800-160","category":"security_engineering","license":"public_domain_us","source":"nist.gov"}' \
    "NIST SP 800-160 Vol. 1 Rev. 1 (Trustworthy Secure Systems)"

  upload_file "$WORK_DIR/pdfs/nist_sp_800_30r1.pdf" "$col" "compliance_ce" "risk_assessment" "2012" \
    '{"regulation_id":"nist_sp_800_30r1","regulation_name_de":"NIST SP 800-30 Rev. 1 — Guide for Conducting Risk Assessments","regulation_name_en":"NIST SP 800-30 Rev. 1 — Guide for Conducting Risk Assessments","regulation_short":"NIST SP 800-30","category":"risk_assessment","license":"public_domain_us","source":"nist.gov"}' \
    "NIST SP 800-30 Rev. 1 (Risk Assessments)"

  upload_file "$WORK_DIR/pdfs/nist_sp_800_82r3.pdf" "$col" "compliance_ce" "ot_security" "2023" \
    '{"regulation_id":"nist_sp_800_82r3","regulation_name_de":"NIST SP 800-82 Rev. 3 — Guide to OT Security","regulation_name_en":"NIST SP 800-82 Rev. 3 — Guide to Operational Technology Security","regulation_short":"NIST SP 800-82","category":"ot_security","license":"public_domain_us","source":"nist.gov"}' \
    "NIST SP 800-82 Rev. 3 (OT Security)"
}

# =============================================================================
# PHASE I-3: Open Standards → bp_compliance_ce
# =============================================================================
phase_i_standards() {
  log "=========================================="
  log "PHASE I-3: Open Standards -> bp_compliance_ce"
  log "=========================================="

  local col="bp_compliance_ce"

  upload_file "$WORK_DIR/pdfs/spdx_3_0_1.pdf" "$col" "compliance_ce" "sbom" "2024" \
    '{"regulation_id":"spdx_3_0_1","regulation_name_de":"SPDX 3.0.1 — Software Package Data Exchange","regulation_name_en":"SPDX 3.0.1 — Software Package Data Exchange","regulation_short":"SPDX 3.0","category":"sbom","license":"CC-BY-3.0","source":"spdx.dev"}' \
    "SPDX 3.0.1 Specification"

  upload_file "$WORK_DIR/pdfs/cvss_v4_0.pdf" "$col" "compliance_ce" "vulnerability_scoring" "2023" \
    '{"regulation_id":"cvss_v4_0","regulation_name_de":"CVSS v4.0 — Common Vulnerability Scoring System","regulation_name_en":"CVSS v4.0 — Common Vulnerability Scoring System","regulation_short":"CVSS v4.0","category":"vulnerability_scoring","license":"CC-BY-4.0","source":"first.org"}' \
    "CVSS v4.0 Specification"

  # Web-based specs — only when a PDF was provided manually.
  upload_optional "SLSA PDF nicht vorhanden — uebersprungen (manuell drucken: https://slsa.dev/spec/draft/)" \
    "$WORK_DIR/pdfs/slsa_v1_0.pdf" "$col" "compliance_ce" "supply_chain_security" "2023" \
    '{"regulation_id":"slsa_v1_0","regulation_name_de":"SLSA v1.0 — Supply-chain Levels for Software Artifacts","regulation_name_en":"SLSA v1.0 — Supply-chain Levels for Software Artifacts","regulation_short":"SLSA v1.0","category":"supply_chain_security","license":"Apache-2.0","source":"slsa.dev"}' \
    "SLSA v1.0 Specification"

  upload_optional "CycloneDX PDF nicht vorhanden — uebersprungen (manuell drucken: https://cyclonedx.org/specification/overview/)" \
    "$WORK_DIR/pdfs/cyclonedx_spec.pdf" "$col" "compliance_ce" "sbom" "2024" \
    '{"regulation_id":"cyclonedx_1_6","regulation_name_de":"CycloneDX 1.6 — SBOM Standard","regulation_name_en":"CycloneDX 1.6 — Software Bill of Materials Standard","regulation_short":"CycloneDX 1.6","category":"sbom","license":"Apache-2.0","source":"cyclonedx.org"}' \
    "CycloneDX 1.6 Specification"

  upload_optional "OpenTelemetry PDF nicht vorhanden — uebersprungen (manuell drucken: https://opentelemetry.io/docs/specs/otel/)" \
    "$WORK_DIR/pdfs/opentelemetry_spec.pdf" "$col" "compliance_ce" "observability" "2024" \
    '{"regulation_id":"opentelemetry_spec","regulation_name_de":"OpenTelemetry Specification — Observability Framework","regulation_name_en":"OpenTelemetry Specification — Observability Framework","regulation_short":"OpenTelemetry","category":"observability","license":"Apache-2.0","source":"opentelemetry.io"}' \
    "OpenTelemetry Specification"
}

# =============================================================================
# PHASE I-4: EU Guidelines + FDA → bp_compliance_ce
# =============================================================================
phase_i_guidelines() {
  log "=========================================="
  log "PHASE I-4: EU Guidelines + FDA -> bp_compliance_ce"
  log "=========================================="

  local col="bp_compliance_ce"

  upload_file "$WORK_DIR/pdfs/eu_machinery_guide_2006_42.pdf" "$col" "compliance_ce" "product_safety" "2010" \
    '{"regulation_id":"eu_machinery_guide_2006_42","regulation_name_de":"Leitfaden Maschinenrichtlinie 2006/42/EG (2. Auflage)","regulation_name_en":"Guide to Application of the Machinery Directive 2006/42/EC","regulation_short":"Machinery Guide","category":"product_safety","license":"eu_public","source":"ec.europa.eu"}' \
    "EU Machinery Directive Guide 2006/42/EC"

  upload_file "$WORK_DIR/pdfs/fda_human_factors.pdf" "$col" "compliance_ce" "human_factors" "2016" \
    '{"regulation_id":"fda_human_factors","regulation_name_de":"FDA Human Factors Engineering — Medical Devices","regulation_name_en":"Applying Human Factors and Usability Engineering to Medical Devices","regulation_short":"FDA HFE","category":"human_factors","license":"public_domain_us","source":"fda.gov"}' \
    "FDA Human Factors Guidance"

  # GPAI — only when a PDF was provided manually.
  upload_optional "GPAI Code of Practice PDF nicht vorhanden — uebersprungen" \
    "$WORK_DIR/pdfs/gpai_code_of_practice.pdf" "$col" "compliance_ce" "ai_regulation" "2025" \
    '{"regulation_id":"gpai_code_of_practice","regulation_name_de":"GPAI Code of Practice — Verhaltenskodex fuer KI-Modelle","regulation_name_en":"General-Purpose AI Code of Practice","regulation_short":"GPAI CoP","category":"ai_regulation","license":"eu_public","source":"ec.europa.eu"}' \
    "GPAI Code of Practice"

  upload_optional "GPAI Scope Guidelines PDF nicht vorhanden — uebersprungen" \
    "$WORK_DIR/pdfs/gpai_scope_guidelines.pdf" "$col" "compliance_ce" "ai_regulation" "2025" \
    '{"regulation_id":"gpai_scope_guidelines","regulation_name_de":"GPAI Scope Guidelines — Leitlinien fuer KI-Anbieter","regulation_name_en":"Guidelines for Providers of General-Purpose AI Models","regulation_short":"GPAI Guidelines","category":"ai_regulation","license":"eu_public","source":"ec.europa.eu"}' \
    "GPAI Scope Guidelines"
}

# =============================================================================
# MAIN
# =============================================================================
# Runs the four phases in order, prints the counters and, if any manual PDF is
# still missing, tells the operator which files to provide.
main() {
  log "=========================================="
  log "PHASE I: RAG Ingestion — Technische Standards + Guidelines"
  log "=========================================="
  log "Work dir: $WORK_DIR"
  log "RAG URL: $RAG_URL"
  log ""
  log "Dokumente: 7 direkte PDFs + 5 web-basierte (manuell als PDF)"
  log ""

  phase_i_download
  phase_i_nist
  phase_i_standards
  phase_i_guidelines

  log "=========================================="
  log "PHASE I ABGESCHLOSSEN"
  log " Hochgeladen: $UPLOADED"
  log " Uebersprungen: $SKIPPED"
  log " Fehlgeschlagen: $FAILED"
  log "=========================================="

  # The hint is tied to PDFs that are really missing, not to SKIPPED, because
  # SKIPPED also counts documents that were skipped by the dedup check.
  local -a missing=()
  local missing_count=0
  local entry file name url
  for entry in "${MANUAL_PDFS[@]}"; do
    IFS='|' read -r file name url <<<"$entry"
    if [[ ! -f "$WORK_DIR/pdfs/$file" ]]; then
      missing+=("${file} (${url})")
      missing_count=$((missing_count + 1))
    fi
  done

  if ((missing_count > 0)); then
    log ""
    log "HINWEIS: Web-basierte Specs muessen manuell als PDF gedruckt werden."
    log "Lege die PDFs nach: $WORK_DIR/pdfs/"
    for entry in "${missing[@]}"; do
      log " - ${entry}"
    done
    log "Dann Script erneut ausfuehren — bereits hochgeladene werden per Dedup uebersprungen."
  fi
}

main "$@"