Covers NIST SP 800-160/30/82, SPDX 3.0, CVSS v4.0, SLSA v1.0, CycloneDX 1.6, OpenTelemetry, EU Machinery Guide 2006/42/EC, FDA Human Factors, and 5 GPAI documents (Scope Guidelines, Communication, CoP Safety/Transparency/Copyright). All documents include license metadata in regulation payloads. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
349 lines
16 KiB
Bash
Executable File
349 lines
16 KiB
Bash
Executable File
#!/usr/bin/env bash
# =============================================================================
# BreakPilot Compliance — Phase I RAG Ingestion
#
# Downloads and ingests ~12 new technical standards and guidelines:
#   - 3 NIST Special Publications (800-160, 800-30, 800-82) → bp_compliance_ce
#   - 5 Open Standards (SLSA, SPDX, CycloneDX, OpenTelemetry, CVSS) → bp_compliance_ce
#   - 2 EU Guidelines (Machinery Guide, GPAI) → bp_compliance_ce
#   - 2 Optional (GPAI Scope, FDA Human Factors) → bp_compliance_ce
#
# All documents are commercially usable (Public Domain US / Apache-2.0 / CC-BY / EU Public)
#
# Run on Mac Mini:
#   bash ~/Projekte/breakpilot-compliance/scripts/ingest-phase-i.sh
# =============================================================================
# Strict mode: abort on unhandled errors, unset variables and failing pipeline stages.
set -euo pipefail

# --- Configuration (WORK_DIR, RAG_URL and QDRANT_URL can be overridden via environment) ---
# Directory that holds the downloaded / manually provided PDFs (under "pdfs/").
WORK_DIR="${WORK_DIR:-$HOME/rag-ingestion-i}"
# Document upload endpoint that upload_file POSTs to.
RAG_URL="${RAG_URL:-https://localhost:8097/api/v1/documents/upload}"
# Qdrant base URL used for the "already ingested?" check.
QDRANT_URL="${QDRANT_URL:-http://localhost:6333}"
# curl option strings; they are expanded unquoted on purpose so they split into words.
# -s: silent, -k: skip TLS certificate verification.
CURL_OPTS="-sk --connect-timeout 10 --max-time 300"
# Same options with a longer overall timeout, used for downloads and larger uploads.
CURL_OPTS_LARGE="-sk --connect-timeout 10 --max-time 900"
readonly WORK_DIR RAG_URL QDRANT_URL CURL_OPTS CURL_OPTS_LARGE

# --- Result counters, updated by upload_file and the phase functions ---
UPLOADED=0
FAILED=0
SKIPPED=0

# Timestamped logging helpers. Each prints "[HH:MM:SS] <message>" built from all
# arguments; warn and fail write to stderr, log and ok to stdout.
# printf '%s' is used so message text is never interpreted as echo options/escapes.
log() { printf '[%s] %s\n' "$(date '+%H:%M:%S')" "$*"; }
ok() { printf '[%s] ok %s\n' "$(date '+%H:%M:%S')" "$*"; }
warn() { printf '[%s] WARN %s\n' "$(date '+%H:%M:%S')" "$*" >&2; }
fail() { printf '[%s] FAIL %s\n' "$(date '+%H:%M:%S')" "$*" >&2; }

#######################################
# Download a PDF unless the target file already exists.
# The transfer goes to "<target>.part" and is only moved into place after it
# has been validated, so an aborted or bogus download is never mistaken for a
# finished PDF on the next run.
# Globals:
#   CURL_OPTS_LARGE (read) - space-separated curl options
# Arguments:
#   $1 - source URL
#   $2 - target file path
# Returns:
#   0 on every handled outcome (failures are logged via warn and leave no
#   target file behind), so callers running under `set -e` continue.
#######################################
download_pdf() {
  local url="$1"
  local target="$2"
  local name
  name=$(basename "$target")

  if [[ -f "$target" ]]; then
    log "PDF exists: $name (skipping download)"
    return 0
  fi

  log "Downloading: $name"
  local partial="${target}.part"

  # --fail turns HTTP 4xx/5xx into a curl error instead of saving the error page.
  # NOTE(review): CURL_OPTS_LARGE contains -k, so TLS verification is also
  # disabled for these public downloads — confirm that this is intended.
  # shellcheck disable=SC2086 # option string is split into words on purpose
  if ! curl $CURL_OPTS_LARGE --fail -L "$url" -o "$partial" 2>/dev/null; then
    warn "Download failed: $url"
    rm -f "$partial"
    return 0
  fi

  # BSD stat (macOS) first, GNU stat second; 0 if neither can read the file.
  local fsize
  fsize=$(stat -f%z "$partial" 2>/dev/null || stat -c%s "$partial" 2>/dev/null || echo 0)
  if [[ "$fsize" -lt 1000 ]]; then
    warn "Download too small (${fsize}B): $name"
    rm -f "$partial"
    return 0
  fi

  # A real PDF carries the "%PDF-" marker near the start of the file; an HTML
  # landing/error page delivered with status 200 does not.
  if ! LC_ALL=C head -c 1024 "$partial" | LC_ALL=C grep -aq '%PDF-'; then
    warn "Download is not a PDF: $name"
    rm -f "$partial"
    return 0
  fi

  if ! mv -f "$partial" "$target"; then
    warn "Download failed: $url"
    rm -f "$partial"
    return 0
  fi
  log " Downloaded: $(( fsize / 1024 ))KB"
}

#######################################
# Upload one document to the RAG upload endpoint, unless Qdrant already holds
# a point with the same regulation_id.
# Globals:
#   QDRANT_URL, RAG_URL, CURL_OPTS, CURL_OPTS_LARGE (read)
#   UPLOADED, SKIPPED, FAILED (incremented according to the outcome)
# Arguments:
#   $1 - path of the file to upload
#   $2 - target collection
#   $3 - value of the "data_type" form field
#   $4 - value of the "use_case" form field
#   $5 - value of the "year" form field
#   $6 - metadata as a JSON object; its "regulation_id" drives the dedup check
#   $7 - optional display label (defaults to the file's basename)
# Returns:
#   0 on every handled outcome; the result is reported through the counters
#   so that one bad document does not abort the run under `set -e`.
#######################################
upload_file() {
  local file="$1"
  local collection="$2"
  local data_type="$3"
  local use_case="$4"
  local year="$5"
  local metadata_json="$6"
  local label="${7:-$(basename "$file")}"

  if [[ ! -f "$file" ]]; then
    warn "File not found: $file"
    FAILED=$((FAILED + 1))
    return 0
  fi

  # Dedup check: best effort. If python3 or Qdrant is unavailable the lookup
  # yields ""/"0" and the upload simply proceeds.
  local reg_id
  reg_id=$(python3 -c "import sys,json; print(json.load(sys.stdin).get('regulation_id',''))" \
    <<<"$metadata_json" 2>/dev/null || echo "")
  if [[ -n "$reg_id" ]]; then
    local existing
    existing=$(curl -sk --max-time 5 -X POST "${QDRANT_URL}/collections/${collection}/points/scroll" \
      -H "Content-Type: application/json" \
      -d "{\"filter\":{\"must\":[{\"key\":\"regulation_id\",\"match\":{\"value\":\"$reg_id\"}}]},\"limit\":1}" \
      2>/dev/null \
      | python3 -c "import sys,json; r=json.load(sys.stdin).get('result',{}); print(len(r.get('points',[])))" 2>/dev/null \
      || echo "0")
    # 2>/dev/null hides the arithmetic error if $existing is not a plain number.
    if [[ "$existing" -gt 0 ]] 2>/dev/null; then
      log "SKIP (already in Qdrant): $label [regulation_id=$reg_id]"
      SKIPPED=$((SKIPPED + 1))
      return 0
    fi
  fi

  # BSD stat (macOS) first, GNU stat second; 0 if neither works.
  local filesize
  filesize=$(stat -f%z "$file" 2>/dev/null || stat -c%s "$file" 2>/dev/null || echo 0)
  if [[ "$filesize" -lt 100 ]]; then
    warn "File too small (${filesize}B): $label"
    SKIPPED=$((SKIPPED + 1))
    return 0
  fi

  log "Uploading: $label -> $collection ($(( filesize / 1024 ))KB)"

  # Files above 256000 bytes get the longer timeout.
  local curl_opts="$CURL_OPTS"
  if [[ "$filesize" -gt 256000 ]]; then
    curl_opts="$CURL_OPTS_LARGE"
  fi

  # Text fields use --form-string so their values are sent literally; with -F,
  # curl would interpret ";type=" and a leading "@"/"<" inside the value.
  # A curl failure is tolerated here and detected below via the response body.
  local response
  # shellcheck disable=SC2086 # option string is split into words on purpose
  response=$(curl $curl_opts -X POST "$RAG_URL" \
    -F "file=@${file}" \
    --form-string "collection=${collection}" \
    --form-string "data_type=${data_type}" \
    --form-string "use_case=${use_case}" \
    --form-string "year=${year}" \
    --form-string "chunk_strategy=recursive" \
    --form-string "chunk_size=1024" \
    --form-string "chunk_overlap=128" \
    --form-string "metadata_json=${metadata_json}" \
    2>/dev/null) || true

  # Pattern match in the shell instead of `echo | grep -q`: under pipefail the
  # pipeline can fail with SIGPIPE on a large response and misreport a success.
  if [[ "$response" == *'"chunks_count"'* || "$response" == *'"vectors_indexed"'* ]]; then
    local chunks
    chunks=$(python3 -c "import sys,json; d=json.load(sys.stdin); print(d.get('chunks_count', d.get('vectors_indexed',0)))" \
      <<<"$response" 2>/dev/null || echo "?")
    ok "$label -> $chunks chunks"
    UPLOADED=$((UPLOADED + 1))
  else
    fail "Upload failed: $label"
    fail "Response: ${response:0:200}"
    FAILED=$((FAILED + 1))
  fi
}

# =============================================================================
# PHASE I-1: Downloads
# =============================================================================

# Log a hint that a web-only spec has to be printed to PDF by hand, but only
# while the expected file is still missing from $WORK_DIR/pdfs.
#   $1 - display name, $2 - expected file name, $3 - source URL
_hint_manual_pdf() {
  local name="$1"
  local file="$2"
  local url="$3"
  if [[ ! -f "$WORK_DIR/pdfs/$file" ]]; then
    log " ${name}: Kein PDF verfuegbar — muss manuell als PDF gedruckt werden"
    log " Quelle: ${url}"
  fi
}

#######################################
# Fetch all directly downloadable PDFs into $WORK_DIR/pdfs and log hints for
# the specs that must be provided manually.
# Globals:
#   WORK_DIR (read)
# Arguments:
#   none
#######################################
phase_i_download() {
  log "=========================================="
  log "PHASE I-1: Downloads"
  log "=========================================="
  mkdir -p "$WORK_DIR/pdfs"

  # --- Priority 1: NIST Special Publications ---
  log "--- NIST Special Publications ---"

  download_pdf "https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-160v1r1.pdf" \
    "$WORK_DIR/pdfs/nist_sp_800_160v1r1.pdf"

  download_pdf "https://nvlpubs.nist.gov/nistpubs/Legacy/SP/nistspecialpublication800-30r1.pdf" \
    "$WORK_DIR/pdfs/nist_sp_800_30r1.pdf"

  download_pdf "https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-82r3.pdf" \
    "$WORK_DIR/pdfs/nist_sp_800_82r3.pdf"

  # --- Priority 1: Open Standards (available as PDF) ---
  log "--- Open Standards ---"

  download_pdf "https://spdx.dev/wp-content/uploads/sites/31/2024/12/SPDX-3.0.1-1.pdf" \
    "$WORK_DIR/pdfs/spdx_3_0_1.pdf"

  download_pdf "https://www.first.org/cvss/v4-0/cvss-v40-specification.pdf" \
    "$WORK_DIR/pdfs/cvss_v4_0.pdf"

  # --- Priority 2: EU + FDA ---
  log "--- EU + FDA Guidelines ---"

  download_pdf "https://ec.europa.eu/docsroom/documents/60145/attachments/1/translations/en/renditions/pdf" \
    "$WORK_DIR/pdfs/eu_machinery_guide_2006_42.pdf"

  download_pdf "https://www.fda.gov/media/80481/download" \
    "$WORK_DIR/pdfs/fda_human_factors.pdf"

  # --- Web-based specs: nothing is downloaded here, they have to be placed
  #     into the pdfs directory manually ---
  log "--- Web-basierte Specs (SLSA, CycloneDX, OpenTelemetry) ---"
  log " Diese Specs sind primaer web-basiert. Lade GitHub-Repos als Referenz..."

  # SLSA, CycloneDX, OpenTelemetry: Apache-2.0; GPAI documents: EU Public
  _hint_manual_pdf "SLSA" "slsa_v1_0.pdf" "https://slsa.dev/spec/draft/"
  _hint_manual_pdf "CycloneDX" "cyclonedx_spec.pdf" "https://cyclonedx.org/specification/overview/"
  _hint_manual_pdf "OpenTelemetry" "opentelemetry_spec.pdf" "https://opentelemetry.io/docs/specs/otel/"
  _hint_manual_pdf "GPAI Code of Practice" "gpai_code_of_practice.pdf" \
    "https://digital-strategy.ec.europa.eu/en/policies/contents-code-gpai"
  _hint_manual_pdf "GPAI Scope Guidelines" "gpai_scope_guidelines.pdf" \
    "https://digital-strategy.ec.europa.eu/en/policies/guidelines-gpai-providers"

  log "Downloads abgeschlossen."
}

# =============================================================================
# PHASE I-2: NIST Special Publications → bp_compliance_ce
# =============================================================================

#######################################
# Upload the three NIST Special Publications from $WORK_DIR/pdfs into the
# bp_compliance_ce collection.
# Globals:
#   WORK_DIR (read)
# Arguments:
#   none
#######################################
phase_i_nist() {
  log "=========================================="
  log "PHASE I-2: NIST SPs -> bp_compliance_ce"
  log "=========================================="

  local col="bp_compliance_ce"

  upload_file "$WORK_DIR/pdfs/nist_sp_800_160v1r1.pdf" "$col" "compliance_ce" "security_engineering" "2022" \
    '{"regulation_id":"nist_sp_800_160v1r1","regulation_name_de":"NIST SP 800-160 Vol. 1 Rev. 1 — Engineering Trustworthy Secure Systems","regulation_name_en":"NIST SP 800-160 Vol. 1 Rev. 1 — Engineering Trustworthy Secure Systems","regulation_short":"NIST SP 800-160","category":"security_engineering","license":"public_domain_us","source":"nist.gov"}' \
    "NIST SP 800-160 Vol. 1 Rev. 1 (Trustworthy Secure Systems)"

  upload_file "$WORK_DIR/pdfs/nist_sp_800_30r1.pdf" "$col" "compliance_ce" "risk_assessment" "2012" \
    '{"regulation_id":"nist_sp_800_30r1","regulation_name_de":"NIST SP 800-30 Rev. 1 — Guide for Conducting Risk Assessments","regulation_name_en":"NIST SP 800-30 Rev. 1 — Guide for Conducting Risk Assessments","regulation_short":"NIST SP 800-30","category":"risk_assessment","license":"public_domain_us","source":"nist.gov"}' \
    "NIST SP 800-30 Rev. 1 (Risk Assessments)"

  upload_file "$WORK_DIR/pdfs/nist_sp_800_82r3.pdf" "$col" "compliance_ce" "ot_security" "2023" \
    '{"regulation_id":"nist_sp_800_82r3","regulation_name_de":"NIST SP 800-82 Rev. 3 — Guide to OT Security","regulation_name_en":"NIST SP 800-82 Rev. 3 — Guide to Operational Technology Security","regulation_short":"NIST SP 800-82","category":"ot_security","license":"public_domain_us","source":"nist.gov"}' \
    "NIST SP 800-82 Rev. 3 (OT Security)"
}

# =============================================================================
# PHASE I-3: Open Standards → bp_compliance_ce
# =============================================================================

#######################################
# Upload the open standards into the bp_compliance_ce collection.
# SPDX and CVSS are always attempted; SLSA, CycloneDX and OpenTelemetry are
# only uploaded when their manually provided PDF exists, otherwise a warning
# is logged and SKIPPED is incremented.
# Globals:
#   WORK_DIR (read)
#   SKIPPED (incremented for every missing manual PDF)
# Arguments:
#   none
#######################################
phase_i_standards() {
  log "=========================================="
  log "PHASE I-3: Open Standards -> bp_compliance_ce"
  log "=========================================="

  local col="bp_compliance_ce"

  upload_file "$WORK_DIR/pdfs/spdx_3_0_1.pdf" "$col" "compliance_ce" "sbom" "2024" \
    '{"regulation_id":"spdx_3_0_1","regulation_name_de":"SPDX 3.0.1 — Software Package Data Exchange","regulation_name_en":"SPDX 3.0.1 — Software Package Data Exchange","regulation_short":"SPDX 3.0","category":"sbom","license":"CC-BY-3.0","source":"spdx.dev"}' \
    "SPDX 3.0.1 Specification"

  upload_file "$WORK_DIR/pdfs/cvss_v4_0.pdf" "$col" "compliance_ce" "vulnerability_scoring" "2023" \
    '{"regulation_id":"cvss_v4_0","regulation_name_de":"CVSS v4.0 — Common Vulnerability Scoring System","regulation_name_en":"CVSS v4.0 — Common Vulnerability Scoring System","regulation_short":"CVSS v4.0","category":"vulnerability_scoring","license":"CC-BY-4.0","source":"first.org"}' \
    "CVSS v4.0 Specification"

  # Web-based specs — only when they were provided manually as PDF
  if [[ -f "$WORK_DIR/pdfs/slsa_v1_0.pdf" ]]; then
    upload_file "$WORK_DIR/pdfs/slsa_v1_0.pdf" "$col" "compliance_ce" "supply_chain_security" "2023" \
      '{"regulation_id":"slsa_v1_0","regulation_name_de":"SLSA v1.0 — Supply-chain Levels for Software Artifacts","regulation_name_en":"SLSA v1.0 — Supply-chain Levels for Software Artifacts","regulation_short":"SLSA v1.0","category":"supply_chain_security","license":"Apache-2.0","source":"slsa.dev"}' \
      "SLSA v1.0 Specification"
  else
    warn "SLSA PDF nicht vorhanden — uebersprungen (manuell drucken: https://slsa.dev/spec/draft/)"
    SKIPPED=$((SKIPPED + 1))
  fi

  if [[ -f "$WORK_DIR/pdfs/cyclonedx_spec.pdf" ]]; then
    upload_file "$WORK_DIR/pdfs/cyclonedx_spec.pdf" "$col" "compliance_ce" "sbom" "2024" \
      '{"regulation_id":"cyclonedx_1_6","regulation_name_de":"CycloneDX 1.6 — SBOM Standard","regulation_name_en":"CycloneDX 1.6 — Software Bill of Materials Standard","regulation_short":"CycloneDX 1.6","category":"sbom","license":"Apache-2.0","source":"cyclonedx.org"}' \
      "CycloneDX 1.6 Specification"
  else
    warn "CycloneDX PDF nicht vorhanden — uebersprungen (manuell drucken: https://cyclonedx.org/specification/overview/)"
    SKIPPED=$((SKIPPED + 1))
  fi

  if [[ -f "$WORK_DIR/pdfs/opentelemetry_spec.pdf" ]]; then
    upload_file "$WORK_DIR/pdfs/opentelemetry_spec.pdf" "$col" "compliance_ce" "observability" "2024" \
      '{"regulation_id":"opentelemetry_spec","regulation_name_de":"OpenTelemetry Specification — Observability Framework","regulation_name_en":"OpenTelemetry Specification — Observability Framework","regulation_short":"OpenTelemetry","category":"observability","license":"Apache-2.0","source":"opentelemetry.io"}' \
      "OpenTelemetry Specification"
  else
    warn "OpenTelemetry PDF nicht vorhanden — uebersprungen (manuell drucken: https://opentelemetry.io/docs/specs/otel/)"
    SKIPPED=$((SKIPPED + 1))
  fi
}

# =============================================================================
# PHASE I-4: EU Guidelines + FDA → bp_compliance_ce
# =============================================================================

#######################################
# Upload the EU Machinery Guide and the FDA Human Factors guidance into the
# bp_compliance_ce collection. The two GPAI documents are only uploaded when
# their manually provided PDF exists, otherwise a warning is logged and
# SKIPPED is incremented.
# Globals:
#   WORK_DIR (read)
#   SKIPPED (incremented for every missing GPAI PDF)
# Arguments:
#   none
#######################################
phase_i_guidelines() {
  log "=========================================="
  log "PHASE I-4: EU Guidelines + FDA -> bp_compliance_ce"
  log "=========================================="

  local col="bp_compliance_ce"

  upload_file "$WORK_DIR/pdfs/eu_machinery_guide_2006_42.pdf" "$col" "compliance_ce" "product_safety" "2010" \
    '{"regulation_id":"eu_machinery_guide_2006_42","regulation_name_de":"Leitfaden Maschinenrichtlinie 2006/42/EG (2. Auflage)","regulation_name_en":"Guide to Application of the Machinery Directive 2006/42/EC","regulation_short":"Machinery Guide","category":"product_safety","license":"eu_public","source":"ec.europa.eu"}' \
    "EU Machinery Directive Guide 2006/42/EC"

  upload_file "$WORK_DIR/pdfs/fda_human_factors.pdf" "$col" "compliance_ce" "human_factors" "2016" \
    '{"regulation_id":"fda_human_factors","regulation_name_de":"FDA Human Factors Engineering — Medical Devices","regulation_name_en":"Applying Human Factors and Usability Engineering to Medical Devices","regulation_short":"FDA HFE","category":"human_factors","license":"public_domain_us","source":"fda.gov"}' \
    "FDA Human Factors Guidance"

  # GPAI — only when provided manually as PDF
  if [[ -f "$WORK_DIR/pdfs/gpai_code_of_practice.pdf" ]]; then
    upload_file "$WORK_DIR/pdfs/gpai_code_of_practice.pdf" "$col" "compliance_ce" "ai_regulation" "2025" \
      '{"regulation_id":"gpai_code_of_practice","regulation_name_de":"GPAI Code of Practice — Verhaltenskodex fuer KI-Modelle","regulation_name_en":"General-Purpose AI Code of Practice","regulation_short":"GPAI CoP","category":"ai_regulation","license":"eu_public","source":"ec.europa.eu"}' \
      "GPAI Code of Practice"
  else
    warn "GPAI Code of Practice PDF nicht vorhanden — uebersprungen"
    SKIPPED=$((SKIPPED + 1))
  fi

  if [[ -f "$WORK_DIR/pdfs/gpai_scope_guidelines.pdf" ]]; then
    upload_file "$WORK_DIR/pdfs/gpai_scope_guidelines.pdf" "$col" "compliance_ce" "ai_regulation" "2025" \
      '{"regulation_id":"gpai_scope_guidelines","regulation_name_de":"GPAI Scope Guidelines — Leitlinien fuer KI-Anbieter","regulation_name_en":"Guidelines for Providers of General-Purpose AI Models","regulation_short":"GPAI Guidelines","category":"ai_regulation","license":"eu_public","source":"ec.europa.eu"}' \
      "GPAI Scope Guidelines"
  else
    warn "GPAI Scope Guidelines PDF nicht vorhanden — uebersprungen"
    SKIPPED=$((SKIPPED + 1))
  fi
}

# =============================================================================
# MAIN
# =============================================================================

#######################################
# Run all Phase I steps in order and print a summary of the counters.
# Globals:
#   WORK_DIR, RAG_URL (read)
#   UPLOADED, SKIPPED, FAILED (read for the summary)
# Arguments:
#   none used
# Returns:
#   1 if at least one upload failed, so callers can detect an incomplete
#   ingestion; 0 otherwise.
#######################################
main() {
  log "=========================================="
  log "PHASE I: RAG Ingestion — Technische Standards + Guidelines"
  log "=========================================="
  log "Work dir: $WORK_DIR"
  log "RAG URL: $RAG_URL"
  log ""
  log "Dokumente: 7 direkte PDFs + 5 web-basierte (manuell als PDF)"
  log ""

  phase_i_download
  phase_i_nist
  phase_i_standards
  phase_i_guidelines

  log "=========================================="
  log "PHASE I ABGESCHLOSSEN"
  log " Hochgeladen: $UPLOADED"
  log " Uebersprungen: $SKIPPED"
  log " Fehlgeschlagen: $FAILED"
  log "=========================================="

  # SKIPPED also counts documents that were skipped because they are already
  # in Qdrant, so the manual-print hint is shown only for PDFs that are really
  # missing, and only those are listed.
  if [[ "$SKIPPED" -gt 0 ]]; then
    local hint_shown=0
    local entry
    for entry in \
      "slsa_v1_0.pdf (https://slsa.dev/spec/draft/)" \
      "cyclonedx_spec.pdf (https://cyclonedx.org/specification/overview/)" \
      "opentelemetry_spec.pdf (https://opentelemetry.io/docs/specs/otel/)" \
      "gpai_code_of_practice.pdf (https://digital-strategy.ec.europa.eu/en/policies/contents-code-gpai)" \
      "gpai_scope_guidelines.pdf (https://digital-strategy.ec.europa.eu/en/policies/guidelines-gpai-providers)"; do
      # The file name is everything before the first space of the entry.
      if [[ -f "$WORK_DIR/pdfs/${entry%% *}" ]]; then
        continue
      fi
      if [[ "$hint_shown" -eq 0 ]]; then
        log ""
        log "HINWEIS: Web-basierte Specs muessen manuell als PDF gedruckt werden."
        log "Lege die PDFs nach: $WORK_DIR/pdfs/"
        hint_shown=1
      fi
      log " - $entry"
    done
    if [[ "$hint_shown" -eq 1 ]]; then
      log "Dann Script erneut ausfuehren — bereits hochgeladene werden per Dedup uebersprungen."
    fi
  fi

  if [[ "$FAILED" -gt 0 ]]; then
    return 1
  fi
  return 0
}

# Script entry point: forward all command-line arguments to main.
main "$@"