Files
breakpilot-compliance/scripts/ingest-phase-i.sh
Benjamin Admin b29a7caee7 feat(scripts): add Phase I ingestion script for 12 new documents
Covers NIST SP 800-160/30/82, SPDX 3.0, CVSS v4.0, SLSA v1.0,
CycloneDX 1.6, OpenTelemetry, EU Machinery Guide 2006/42/EC,
FDA Human Factors, and 5 GPAI documents (Scope Guidelines,
Communication, CoP Safety/Transparency/Copyright).

All documents include license metadata in regulation payloads.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-22 09:18:30 +01:00

349 lines
16 KiB
Bash
Executable File

#!/usr/bin/env bash
# =============================================================================
# BreakPilot Compliance — Phase I RAG Ingestion
#
# Downloads and ingests ~12 new technical standards and guidelines:
# - 3 NIST Special Publications (800-160, 800-30, 800-82) → bp_compliance_ce
# - 5 Open Standards (SLSA, SPDX, CycloneDX, OpenTelemetry, CVSS) → bp_compliance_ce
# - 2 EU Guidelines (Machinery Guide, GPAI) → bp_compliance_ce
# - 2 Optional (GPAI Scope, FDA Human Factors) → bp_compliance_ce
#
# All documents are commercially usable (US Public Domain / Apache-2.0 / CC-BY / EU Public)
#
# Run on Mac Mini:
# bash ~/Projekte/breakpilot-compliance/scripts/ingest-phase-i.sh
# =============================================================================
# Strict mode: abort on failing commands, unset variables and failing pipeline stages.
set -euo pipefail

# Working directory and service endpoints. A non-empty value already present in
# the environment wins; otherwise the default below is assigned.
: "${WORK_DIR:=$HOME/rag-ingestion-i}"
: "${RAG_URL:=https://localhost:8097/api/v1/documents/upload}"
: "${QDRANT_URL:=http://localhost:6333}"

# curl option strings (expanded unquoted at the call sites so they split into
# separate flags). -s = silent, -k = skip TLS certificate verification.
# The two variants differ only in their --max-time budget.
CURL_OPTS="-sk --connect-timeout 10 --max-time 300"
CURL_OPTS_LARGE="-sk --connect-timeout 10 --max-time 900"

# Run counters, reported in the final summary.
UPLOADED=0 FAILED=0 SKIPPED=0
# Timestamped logging helpers. Each prints "[HH:MM:SS] " followed by an optional
# tag and all arguments joined into one message.
# log / ok write to stdout; warn / fail write to stderr.
log() {
  printf '[%s] %s\n' "$(date '+%H:%M:%S')" "$*"
}

ok() {
  printf '[%s] ok %s\n' "$(date '+%H:%M:%S')" "$*"
}

warn() {
  printf '[%s] WARN %s\n' "$(date '+%H:%M:%S')" "$*" >&2
}

fail() {
  printf '[%s] FAIL %s\n' "$(date '+%H:%M:%S')" "$*" >&2
}
#######################################
# Download a PDF to a local path unless a file already exists there.
# Best-effort: every problem is reported through warn, the partial or invalid
# file is removed, and the function still returns 0 so that the remaining
# downloads of the run proceed.
# Globals:   CURL_OPTS_LARGE (read)
# Arguments: $1 - source URL
#            $2 - destination file path
# Returns:   0 always
#######################################
download_pdf() {
  local url="$1"
  local target="$2"

  # An existing file counts as cached; it is not re-validated.
  if [[ -f "$target" ]]; then
    log "PDF exists: $(basename "$target") (skipping download)"
    return 0
  fi

  log "Downloading: $(basename "$target")"
  # --fail makes curl exit non-zero on HTTP >= 400 instead of saving the
  # server's error page under a .pdf name.
  # NOTE(review): CURL_OPTS_LARGE contains -k, so TLS certificates of the
  # public download hosts are not verified — confirm this is intended.
  # shellcheck disable=SC2086  # option string is split into flags on purpose
  if ! curl $CURL_OPTS_LARGE --fail -L "$url" -o "$target" 2>/dev/null; then
    warn "Download failed: $url"
    rm -f -- "$target"
    return 0
  fi

  # File size in bytes: BSD stat (macOS) first, GNU stat as fallback.
  local fsize
  fsize=$(stat -f%z "$target" 2>/dev/null || stat -c%s "$target" 2>/dev/null || echo 0)
  if [[ "$fsize" -lt 1000 ]]; then
    warn "Download too small (${fsize}B): $(basename "$target")"
    rm -f -- "$target"
    return 0
  fi

  # Reject HTML pages (consent walls, bot challenges, soft 404s) delivered with
  # HTTP 200: a PDF carries the "%PDF-" signature near the start of the file.
  # grep output goes to /dev/null instead of using -q so grep reads its whole
  # input and the pipeline cannot fail from SIGPIPE under pipefail.
  if ! head -c 1024 "$target" 2>/dev/null | LC_ALL=C grep -a '%PDF-' >/dev/null 2>&1; then
    warn "Download is not a PDF: $(basename "$target")"
    rm -f -- "$target"
    return 0
  fi

  log " Downloaded: $(( fsize / 1024 ))KB"
}
#######################################
# Upload one document to the RAG ingestion endpoint.
# Steps: verify the file exists, skip it when Qdrant already holds a point with
# the same regulation_id, skip files under 100 bytes, then POST the file as
# multipart form data and classify the response.
# Globals:   QDRANT_URL, RAG_URL, CURL_OPTS, CURL_OPTS_LARGE (read)
#            UPLOADED, FAILED, SKIPPED (incremented)
# Arguments: $1 - path of the file to upload
#            $2 - target collection name
#            $3 - value for the data_type form field
#            $4 - value for the use_case form field
#            $5 - value for the year form field
#            $6 - metadata as a JSON object string; "regulation_id" drives dedup
#            $7 - label used in log output (default: basename of $1)
# Returns:   0 always; the outcome is recorded in the counters
#######################################
upload_file() {
  local file="$1"
  local collection="$2"
  local data_type="$3"
  local use_case="$4"
  local year="$5"
  local metadata_json="$6"
  local label="${7:-$(basename "$file")}"

  if [[ ! -f "$file" ]]; then
    warn "File not found: $file"
    FAILED=$((FAILED + 1))
    return 0
  fi

  # Dedup check. Any failure here (python3 missing, invalid JSON, Qdrant not
  # reachable) degrades to "not found" so the upload is attempted anyway.
  local reg_id
  reg_id=$(printf '%s' "$metadata_json" \
    | python3 -c "import sys,json; print(json.load(sys.stdin).get('regulation_id',''))" 2>/dev/null || echo "")
  if [[ -n "$reg_id" ]]; then
    local existing
    existing=$(curl -sk --max-time 5 -X POST "${QDRANT_URL}/collections/${collection}/points/scroll" \
      -H "Content-Type: application/json" \
      -d "{\"filter\":{\"must\":[{\"key\":\"regulation_id\",\"match\":{\"value\":\"$reg_id\"}}]},\"limit\":1}" \
      2>/dev/null | python3 -c "import sys,json; r=json.load(sys.stdin).get('result',{}); print(len(r.get('points',[])))" 2>/dev/null || echo "0")
    # stderr is silenced because a non-numeric $existing makes -gt complain.
    if [[ "$existing" -gt 0 ]] 2>/dev/null; then
      log "SKIP (already in Qdrant): $label [regulation_id=$reg_id]"
      SKIPPED=$((SKIPPED + 1))
      return 0
    fi
  fi

  # File size in bytes: BSD stat (macOS) first, GNU stat as fallback.
  local filesize
  filesize=$(stat -f%z "$file" 2>/dev/null || stat -c%s "$file" 2>/dev/null || echo 0)
  if [[ "$filesize" -lt 100 ]]; then
    warn "File too small (${filesize}B): $label"
    SKIPPED=$((SKIPPED + 1))
    return 0
  fi

  log "Uploading: $label -> $collection ($(( filesize / 1024 ))KB)"

  # Files above 256000 bytes get the longer --max-time budget.
  local curl_opts="$CURL_OPTS"
  if [[ "$filesize" -gt 256000 ]]; then
    curl_opts="$CURL_OPTS_LARGE"
  fi

  # Text fields are sent with --form-string so their values are taken
  # literally: with -F, a ';type=' inside the value or a leading '@' / '<'
  # would be interpreted by curl instead of being transmitted.
  local response
  # shellcheck disable=SC2086  # option string is split into flags on purpose
  response=$(curl $curl_opts -X POST "$RAG_URL" \
    -F "file=@${file}" \
    --form-string "collection=${collection}" \
    --form-string "data_type=${data_type}" \
    --form-string "use_case=${use_case}" \
    --form-string "year=${year}" \
    --form-string "chunk_strategy=recursive" \
    --form-string "chunk_size=1024" \
    --form-string "chunk_overlap=128" \
    --form-string "metadata_json=${metadata_json}" \
    2>/dev/null) || true

  # Success is recognised by either counter key in the response body.
  # A here-string (not echo | grep -q) keeps pipefail from turning an early
  # grep exit into a false "upload failed"; -E avoids the non-portable '\|'.
  if grep -Eq '"chunks_count"|"vectors_indexed"' <<<"$response"; then
    local chunks
    chunks=$(printf '%s' "$response" \
      | python3 -c "import sys,json; d=json.load(sys.stdin); print(d.get('chunks_count', d.get('vectors_indexed',0)))" 2>/dev/null || echo "?")
    ok "$label -> $chunks chunks"
    UPLOADED=$((UPLOADED + 1))
  else
    fail "Upload failed: $label"
    fail "Response: ${response:0:200}"
    FAILED=$((FAILED + 1))
  fi
}
# =============================================================================
# PHASE I-1: Downloads
# =============================================================================
# Phase I-1: fetch every directly downloadable PDF into $WORK_DIR/pdfs and log
# a reminder for each web-only spec whose manually provided PDF is not there.
# Globals: WORK_DIR (read)
phase_i_download() {
  log "=========================================="
  log "PHASE I-1: Downloads"
  log "=========================================="

  local pdf_dir="$WORK_DIR/pdfs"
  mkdir -p "$pdf_dir"

  # --- Priority 1: NIST Special Publications ---
  log "--- NIST Special Publications ---"
  download_pdf \
    "https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-160v1r1.pdf" \
    "$pdf_dir/nist_sp_800_160v1r1.pdf"
  download_pdf \
    "https://nvlpubs.nist.gov/nistpubs/Legacy/SP/nistspecialpublication800-30r1.pdf" \
    "$pdf_dir/nist_sp_800_30r1.pdf"
  download_pdf \
    "https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-82r3.pdf" \
    "$pdf_dir/nist_sp_800_82r3.pdf"

  # --- Priority 1: open standards that are published as PDF ---
  log "--- Open Standards ---"
  download_pdf \
    "https://spdx.dev/wp-content/uploads/sites/31/2024/12/SPDX-3.0.1-1.pdf" \
    "$pdf_dir/spdx_3_0_1.pdf"
  download_pdf \
    "https://www.first.org/cvss/v4-0/cvss-v40-specification.pdf" \
    "$pdf_dir/cvss_v4_0.pdf"

  # --- Priority 2: EU + FDA ---
  log "--- EU + FDA Guidelines ---"
  download_pdf \
    "https://ec.europa.eu/docsroom/documents/60145/attachments/1/translations/en/renditions/pdf" \
    "$pdf_dir/eu_machinery_guide_2006_42.pdf"
  download_pdf \
    "https://www.fda.gov/media/80481/download" \
    "$pdf_dir/fda_human_factors.pdf"

  # --- Web-based specs: no PDF to fetch, must be supplied manually as PDF ---
  log "--- Web-basierte Specs (SLSA, CycloneDX, OpenTelemetry) ---"
  log " Diese Specs sind primaer web-basiert. Lade GitHub-Repos als Referenz..."

  # One entry per manual document: "display name|expected file name|source URL".
  # SLSA, CycloneDX and OpenTelemetry are Apache-2.0; the GPAI documents are EU Public.
  local entry title pdf_name source_url
  for entry in \
    "SLSA|slsa_v1_0.pdf|https://slsa.dev/spec/draft/" \
    "CycloneDX|cyclonedx_spec.pdf|https://cyclonedx.org/specification/overview/" \
    "OpenTelemetry|opentelemetry_spec.pdf|https://opentelemetry.io/docs/specs/otel/" \
    "GPAI Code of Practice|gpai_code_of_practice.pdf|https://digital-strategy.ec.europa.eu/en/policies/contents-code-gpai" \
    "GPAI Scope Guidelines|gpai_scope_guidelines.pdf|https://digital-strategy.ec.europa.eu/en/policies/guidelines-gpai-providers"; do
    IFS='|' read -r title pdf_name source_url <<<"$entry"
    if [[ ! -f "$pdf_dir/$pdf_name" ]]; then
      log " ${title}: Kein PDF verfuegbar — muss manuell als PDF gedruckt werden"
      log " Quelle: ${source_url}"
    fi
  done

  log "Downloads abgeschlossen."
}
# =============================================================================
# PHASE I-2: NIST Special Publications → bp_compliance_ce
# =============================================================================
# Phase I-2: upload the three NIST Special Publications into bp_compliance_ce.
# Each upload_file call passes: file, collection, data_type, use_case, year,
# metadata JSON, log label.
# Globals: WORK_DIR (read)
phase_i_nist() {
  log "=========================================="
  log "PHASE I-2: NIST SPs -> bp_compliance_ce"
  log "=========================================="

  local col="bp_compliance_ce"
  local pdf_dir="$WORK_DIR/pdfs"
  local meta_160 meta_30 meta_82

  meta_160='{"regulation_id":"nist_sp_800_160v1r1","regulation_name_de":"NIST SP 800-160 Vol. 1 Rev. 1 — Engineering Trustworthy Secure Systems","regulation_name_en":"NIST SP 800-160 Vol. 1 Rev. 1 — Engineering Trustworthy Secure Systems","regulation_short":"NIST SP 800-160","category":"security_engineering","license":"public_domain_us","source":"nist.gov"}'
  meta_30='{"regulation_id":"nist_sp_800_30r1","regulation_name_de":"NIST SP 800-30 Rev. 1 — Guide for Conducting Risk Assessments","regulation_name_en":"NIST SP 800-30 Rev. 1 — Guide for Conducting Risk Assessments","regulation_short":"NIST SP 800-30","category":"risk_assessment","license":"public_domain_us","source":"nist.gov"}'
  meta_82='{"regulation_id":"nist_sp_800_82r3","regulation_name_de":"NIST SP 800-82 Rev. 3 — Guide to OT Security","regulation_name_en":"NIST SP 800-82 Rev. 3 — Guide to Operational Technology Security","regulation_short":"NIST SP 800-82","category":"ot_security","license":"public_domain_us","source":"nist.gov"}'

  upload_file "$pdf_dir/nist_sp_800_160v1r1.pdf" "$col" \
    "compliance_ce" "security_engineering" "2022" "$meta_160" \
    "NIST SP 800-160 Vol. 1 Rev. 1 (Trustworthy Secure Systems)"

  upload_file "$pdf_dir/nist_sp_800_30r1.pdf" "$col" \
    "compliance_ce" "risk_assessment" "2012" "$meta_30" \
    "NIST SP 800-30 Rev. 1 (Risk Assessments)"

  upload_file "$pdf_dir/nist_sp_800_82r3.pdf" "$col" \
    "compliance_ce" "ot_security" "2023" "$meta_82" \
    "NIST SP 800-82 Rev. 3 (OT Security)"
}
# =============================================================================
# PHASE I-3: Open Standards → bp_compliance_ce
# =============================================================================
# Phase I-3: upload the open standards into bp_compliance_ce.
# SPDX and CVSS are always attempted. SLSA, CycloneDX and OpenTelemetry are
# web-based specs and are uploaded only when a manually provided PDF exists;
# otherwise a warning is emitted and SKIPPED is incremented.
# Globals: WORK_DIR (read), SKIPPED (incremented)
phase_i_standards() {
  log "=========================================="
  log "PHASE I-3: Open Standards -> bp_compliance_ce"
  log "=========================================="

  local col="bp_compliance_ce"
  local pdf_dir="$WORK_DIR/pdfs"
  local spdx_meta cvss_meta slsa_meta cyclonedx_meta otel_meta

  spdx_meta='{"regulation_id":"spdx_3_0_1","regulation_name_de":"SPDX 3.0.1 — Software Package Data Exchange","regulation_name_en":"SPDX 3.0.1 — Software Package Data Exchange","regulation_short":"SPDX 3.0","category":"sbom","license":"CC-BY-3.0","source":"spdx.dev"}'
  cvss_meta='{"regulation_id":"cvss_v4_0","regulation_name_de":"CVSS v4.0 — Common Vulnerability Scoring System","regulation_name_en":"CVSS v4.0 — Common Vulnerability Scoring System","regulation_short":"CVSS v4.0","category":"vulnerability_scoring","license":"CC-BY-4.0","source":"first.org"}'
  slsa_meta='{"regulation_id":"slsa_v1_0","regulation_name_de":"SLSA v1.0 — Supply-chain Levels for Software Artifacts","regulation_name_en":"SLSA v1.0 — Supply-chain Levels for Software Artifacts","regulation_short":"SLSA v1.0","category":"supply_chain_security","license":"Apache-2.0","source":"slsa.dev"}'
  cyclonedx_meta='{"regulation_id":"cyclonedx_1_6","regulation_name_de":"CycloneDX 1.6 — SBOM Standard","regulation_name_en":"CycloneDX 1.6 — Software Bill of Materials Standard","regulation_short":"CycloneDX 1.6","category":"sbom","license":"Apache-2.0","source":"cyclonedx.org"}'
  otel_meta='{"regulation_id":"opentelemetry_spec","regulation_name_de":"OpenTelemetry Specification — Observability Framework","regulation_name_en":"OpenTelemetry Specification — Observability Framework","regulation_short":"OpenTelemetry","category":"observability","license":"Apache-2.0","source":"opentelemetry.io"}'

  upload_file "$pdf_dir/spdx_3_0_1.pdf" "$col" \
    "compliance_ce" "sbom" "2024" "$spdx_meta" \
    "SPDX 3.0.1 Specification"

  upload_file "$pdf_dir/cvss_v4_0.pdf" "$col" \
    "compliance_ce" "vulnerability_scoring" "2023" "$cvss_meta" \
    "CVSS v4.0 Specification"

  # Web-based specs: only when a PDF was provided manually.
  if [[ ! -f "$pdf_dir/slsa_v1_0.pdf" ]]; then
    warn "SLSA PDF nicht vorhanden — uebersprungen (manuell drucken: https://slsa.dev/spec/draft/)"
    SKIPPED=$((SKIPPED + 1))
  else
    upload_file "$pdf_dir/slsa_v1_0.pdf" "$col" \
      "compliance_ce" "supply_chain_security" "2023" "$slsa_meta" \
      "SLSA v1.0 Specification"
  fi

  if [[ ! -f "$pdf_dir/cyclonedx_spec.pdf" ]]; then
    warn "CycloneDX PDF nicht vorhanden — uebersprungen (manuell drucken: https://cyclonedx.org/specification/overview/)"
    SKIPPED=$((SKIPPED + 1))
  else
    upload_file "$pdf_dir/cyclonedx_spec.pdf" "$col" \
      "compliance_ce" "sbom" "2024" "$cyclonedx_meta" \
      "CycloneDX 1.6 Specification"
  fi

  if [[ ! -f "$pdf_dir/opentelemetry_spec.pdf" ]]; then
    warn "OpenTelemetry PDF nicht vorhanden — uebersprungen (manuell drucken: https://opentelemetry.io/docs/specs/otel/)"
    SKIPPED=$((SKIPPED + 1))
  else
    upload_file "$pdf_dir/opentelemetry_spec.pdf" "$col" \
      "compliance_ce" "observability" "2024" "$otel_meta" \
      "OpenTelemetry Specification"
  fi
}
# =============================================================================
# PHASE I-4: EU Guidelines + FDA → bp_compliance_ce
# =============================================================================
# Phase I-4: upload the EU Machinery Guide and the FDA Human Factors guidance
# into bp_compliance_ce. The two GPAI documents are uploaded only when a
# manually provided PDF exists; otherwise a warning is emitted and SKIPPED is
# incremented.
# Globals: WORK_DIR (read), SKIPPED (incremented)
phase_i_guidelines() {
  log "=========================================="
  log "PHASE I-4: EU Guidelines + FDA -> bp_compliance_ce"
  log "=========================================="

  local col="bp_compliance_ce"
  local pdf_dir="$WORK_DIR/pdfs"
  local machinery_meta fda_meta gpai_cop_meta gpai_scope_meta

  machinery_meta='{"regulation_id":"eu_machinery_guide_2006_42","regulation_name_de":"Leitfaden Maschinenrichtlinie 2006/42/EG (2. Auflage)","regulation_name_en":"Guide to Application of the Machinery Directive 2006/42/EC","regulation_short":"Machinery Guide","category":"product_safety","license":"eu_public","source":"ec.europa.eu"}'
  fda_meta='{"regulation_id":"fda_human_factors","regulation_name_de":"FDA Human Factors Engineering — Medical Devices","regulation_name_en":"Applying Human Factors and Usability Engineering to Medical Devices","regulation_short":"FDA HFE","category":"human_factors","license":"public_domain_us","source":"fda.gov"}'
  gpai_cop_meta='{"regulation_id":"gpai_code_of_practice","regulation_name_de":"GPAI Code of Practice — Verhaltenskodex fuer KI-Modelle","regulation_name_en":"General-Purpose AI Code of Practice","regulation_short":"GPAI CoP","category":"ai_regulation","license":"eu_public","source":"ec.europa.eu"}'
  gpai_scope_meta='{"regulation_id":"gpai_scope_guidelines","regulation_name_de":"GPAI Scope Guidelines — Leitlinien fuer KI-Anbieter","regulation_name_en":"Guidelines for Providers of General-Purpose AI Models","regulation_short":"GPAI Guidelines","category":"ai_regulation","license":"eu_public","source":"ec.europa.eu"}'

  upload_file "$pdf_dir/eu_machinery_guide_2006_42.pdf" "$col" \
    "compliance_ce" "product_safety" "2010" "$machinery_meta" \
    "EU Machinery Directive Guide 2006/42/EC"

  upload_file "$pdf_dir/fda_human_factors.pdf" "$col" \
    "compliance_ce" "human_factors" "2016" "$fda_meta" \
    "FDA Human Factors Guidance"

  # GPAI documents: only when a PDF was provided manually.
  if [[ ! -f "$pdf_dir/gpai_code_of_practice.pdf" ]]; then
    warn "GPAI Code of Practice PDF nicht vorhanden — uebersprungen"
    SKIPPED=$((SKIPPED + 1))
  else
    upload_file "$pdf_dir/gpai_code_of_practice.pdf" "$col" \
      "compliance_ce" "ai_regulation" "2025" "$gpai_cop_meta" \
      "GPAI Code of Practice"
  fi

  if [[ ! -f "$pdf_dir/gpai_scope_guidelines.pdf" ]]; then
    warn "GPAI Scope Guidelines PDF nicht vorhanden — uebersprungen"
    SKIPPED=$((SKIPPED + 1))
  else
    upload_file "$pdf_dir/gpai_scope_guidelines.pdf" "$col" \
      "compliance_ce" "ai_regulation" "2025" "$gpai_scope_meta" \
      "GPAI Scope Guidelines"
  fi
}
# =============================================================================
# MAIN
# =============================================================================
#######################################
# Run all Phase I steps in order (download, NIST, open standards, guidelines),
# print the upload/skip/failure summary, and remind the user about manually
# provided PDFs that are still missing.
# The reminder is driven by the files actually absent from $WORK_DIR/pdfs, not
# by the SKIPPED counter: SKIPPED also counts dedup hits and too-small files,
# so a rerun with everything ingested must not ask for PDFs again.
# Globals:   WORK_DIR, RAG_URL, UPLOADED, SKIPPED, FAILED (read)
# Arguments: none used
# Returns:   0 (failed uploads are reported in the summary, not via exit code)
#######################################
main() {
  log "=========================================="
  log "PHASE I: RAG Ingestion — Technische Standards + Guidelines"
  log "=========================================="
  log "Work dir: $WORK_DIR"
  log "RAG URL: $RAG_URL"
  log ""
  log "Dokumente: 7 direkte PDFs + 5 web-basierte (manuell als PDF)"
  log ""

  phase_i_download
  phase_i_nist
  phase_i_standards
  phase_i_guidelines

  log "=========================================="
  log "PHASE I ABGESCHLOSSEN"
  log " Hochgeladen: $UPLOADED"
  log " Uebersprungen: $SKIPPED"
  log " Fehlgeschlagen: $FAILED"
  log "=========================================="

  # Collect one hint line per manual PDF that is not on disk.
  # Entries are "expected file name|source URL".
  local entry pdf_name source_url hint
  local missing_hints=""
  for entry in \
    "slsa_v1_0.pdf|https://slsa.dev/spec/draft/" \
    "cyclonedx_spec.pdf|https://cyclonedx.org/specification/overview/" \
    "opentelemetry_spec.pdf|https://opentelemetry.io/docs/specs/otel/" \
    "gpai_code_of_practice.pdf|https://digital-strategy.ec.europa.eu/en/policies/contents-code-gpai" \
    "gpai_scope_guidelines.pdf|https://digital-strategy.ec.europa.eu/en/policies/guidelines-gpai-providers"; do
    pdf_name="${entry%%|*}"
    source_url="${entry#*|}"
    if [[ ! -f "$WORK_DIR/pdfs/$pdf_name" ]]; then
      missing_hints+=" - ${pdf_name} (${source_url})"$'\n'
    fi
  done

  if [[ -n "$missing_hints" ]]; then
    log ""
    log "HINWEIS: Web-basierte Specs muessen manuell als PDF gedruckt werden."
    log "Lege die PDFs nach: $WORK_DIR/pdfs/"
    while IFS= read -r hint; do
      # The here-string leaves one empty trailing line; skip it.
      if [[ -n "$hint" ]]; then
        log "$hint"
      fi
    done <<<"$missing_hints"
    log "Dann Script erneut ausfuehren — bereits hochgeladene werden per Dedup uebersprungen."
  fi
}
# Script entry point: invoke main, forwarding all command-line arguments.
main "$@"