#!/usr/bin/env bash
# =============================================================================
# BreakPilot Compliance — Phase I RAG Ingestion
#
# Downloads and ingests ~12 new technical standards and guidelines:
#   - 3 NIST Special Publications (800-160, 800-30, 800-82) → bp_compliance_ce
#   - 5 Open Standards (SLSA, SPDX, CycloneDX, OpenTelemetry, CVSS) → bp_compliance_ce
#   - 2 EU Guidelines (Machinery Guide, GPAI) → bp_compliance_ce
#   - 2 Optional (GPAI Scope, FDA Human Factors) → bp_compliance_ce
#
# All documents may be used commercially
# (Public Domain US / Apache-2.0 / CC-BY / EU Public).
#
# Environment (all optional):
#   WORK_DIR             working directory; PDFs are kept in $WORK_DIR/pdfs
#   RAG_URL              document upload endpoint of the RAG service
#   QDRANT_URL           Qdrant base URL used for the dedup lookup
#   CURL_OPTS_DOWNLOAD   curl options for the public downloads
#
# Run on Mac Mini:
#   bash ~/Projekte/breakpilot-compliance/scripts/ingest-phase-i.sh
# =============================================================================

set -euo pipefail

WORK_DIR="${WORK_DIR:-$HOME/rag-ingestion-i}"
RAG_URL="${RAG_URL:-https://localhost:8097/api/v1/documents/upload}"
QDRANT_URL="${QDRANT_URL:-http://localhost:6333}"

# Options for talking to the RAG service. -k skips TLS verification; the default
# RAG_URL is an https endpoint on localhost.
CURL_OPTS="-sk --connect-timeout 10 --max-time 300"
CURL_OPTS_LARGE="-sk --connect-timeout 10 --max-time 900"
# Options for fetching documents from the public internet. Deliberately without
# -k so the downloaded standards are TLS-verified; override via environment if a
# local setup really needs something else.
CURL_OPTS_DOWNLOAD="${CURL_OPTS_DOWNLOAD:--sS --connect-timeout 10 --max-time 900}"

# Run-wide counters, updated by upload_file and the phase functions.
UPLOADED=0
FAILED=0
SKIPPED=0

# Timestamped log helpers. log/ok write to stdout, warn/fail to stderr.
log() { printf '%s\n' "[$(date '+%H:%M:%S')] $*"; }
ok() { printf '%s\n' "[$(date '+%H:%M:%S')] ok $*"; }
warn() { printf '%s\n' "[$(date '+%H:%M:%S')] WARN $*" >&2; }
fail() { printf '%s\n' "[$(date '+%H:%M:%S')] FAIL $*" >&2; }

#######################################
# Print the size of a file in bytes; prints 0 if it cannot be determined.
# Tries the BSD/macOS stat syntax first, then the GNU one.
# Arguments: $1 - path
# Outputs:   size in bytes on stdout
#######################################
file_size() {
  stat -f%z "$1" 2>/dev/null || stat -c%s "$1" 2>/dev/null || echo 0
}

#######################################
# Check whether a file carries a PDF header within its first 1024 bytes.
# Arguments: $1 - path
# Returns:   0 if "%PDF-" is found, non-zero otherwise
#######################################
looks_like_pdf() {
  # grep -c (not -q) consumes all of head's output, so head can never be killed
  # by SIGPIPE, which would fail the pipeline under `set -o pipefail`.
  head -c 1024 "$1" 2>/dev/null | LC_ALL=C grep -ac '%PDF-' >/dev/null
}

#######################################
# Download a PDF unless the target already exists. Best effort: every failure is
# reported via warn and the function still returns 0, so one broken URL does not
# abort the whole run (the later upload step reports the missing file).
# Globals:   CURL_OPTS_DOWNLOAD (read)
# Arguments: $1 - source URL
#            $2 - target path
# Returns:   0 always
#######################################
download_pdf() {
  local url="$1"
  local target="$2"
  local name
  name=$(basename "$target")

  if [[ -f "$target" ]]; then
    log "PDF exists: $name (skipping download)"
    return 0
  fi

  # Download to a side file first: an interrupted run must not leave a truncated
  # file at $target, which the next run would treat as "already downloaded".
  local part="${target}.part"
  local -a opts
  read -r -a opts <<<"$CURL_OPTS_DOWNLOAD"

  log "Downloading: $name"
  # --fail: turn HTTP errors (404, 500, ...) into a non-zero exit instead of
  # saving the error page as if it were the document.
  if ! curl "${opts[@]}" --fail -L "$url" -o "$part"; then
    warn "Download failed: $url"
    rm -f -- "$part"
    return 0
  fi

  local fsize
  fsize=$(file_size "$part")
  if [[ "$fsize" -lt 1000 ]]; then
    warn "Download too small (${fsize}B): $name"
    rm -f -- "$part"
    return 0
  fi

  # Some servers answer 200 with an HTML landing/consent page; never ingest that.
  if ! looks_like_pdf "$part"; then
    warn "Download is not a PDF: $name"
    rm -f -- "$part"
    return 0
  fi

  mv -f -- "$part" "$target"
  log " Downloaded: $(( fsize / 1024 ))KB"
}

#######################################
# Upload one document to the RAG service and update the run counters.
# A document whose metadata carries a regulation_id that is already present in
# the target Qdrant collection is skipped (dedup).
# Globals:   RAG_URL, QDRANT_URL, CURL_OPTS, CURL_OPTS_LARGE (read)
#            UPLOADED, FAILED, SKIPPED (written)
# Arguments: $1 - file path
#            $2 - collection
#            $3 - data_type
#            $4 - use_case
#            $5 - year
#            $6 - metadata as a JSON object string
#            $7 - display label (optional, defaults to the file's basename)
# Returns:   0 always; the outcome is reflected in the counters
#######################################
upload_file() {
  local file="$1"
  local collection="$2"
  local data_type="$3"
  local use_case="$4"
  local year="$5"
  local metadata_json="$6"
  local label="${7:-$(basename "$file")}"

  if [[ ! -f "$file" ]]; then
    warn "File not found: $file"
    FAILED=$((FAILED + 1))
    return 0
  fi

  # Dedup check: look for an existing point with the same regulation_id.
  # Any error in the lookup (python3 missing, Qdrant unreachable, bad JSON) is
  # treated as "not present" so the upload is attempted.
  local reg_id
  reg_id=$(python3 -c "import sys,json; print(json.load(sys.stdin).get('regulation_id',''))" \
    <<<"$metadata_json" 2>/dev/null || echo "")
  if [[ -n "$reg_id" ]]; then
    local existing
    # NOTE(review): reg_id is interpolated into the JSON body unescaped; this is
    # fine for the fixed identifiers passed in this script.
    existing=$(curl -sk --max-time 5 -X POST "${QDRANT_URL}/collections/${collection}/points/scroll" \
      -H "Content-Type: application/json" \
      -d "{\"filter\":{\"must\":[{\"key\":\"regulation_id\",\"match\":{\"value\":\"$reg_id\"}}]},\"limit\":1}" \
      2>/dev/null \
      | python3 -c "import sys,json; r=json.load(sys.stdin).get('result',{}); print(len(r.get('points',[])))" \
        2>/dev/null || echo "0")
    # Anything that is not a plain positive integer counts as "not found".
    if [[ "$existing" =~ ^[0-9]+$ ]] && (( existing > 0 )); then
      log "SKIP (already in Qdrant): $label [regulation_id=$reg_id]"
      SKIPPED=$((SKIPPED + 1))
      return 0
    fi
  fi

  local filesize
  filesize=$(file_size "$file")
  if [[ "$filesize" -lt 100 ]]; then
    warn "File too small (${filesize}B): $label"
    SKIPPED=$((SKIPPED + 1))
    return 0
  fi

  log "Uploading: $label -> $collection ($(( filesize / 1024 ))KB)"

  # Larger files get the longer timeout. The option strings are split into an
  # array so they can be expanded quoted.
  local -a opts
  if [[ "$filesize" -gt 256000 ]]; then
    read -r -a opts <<<"$CURL_OPTS_LARGE"
  else
    read -r -a opts <<<"$CURL_OPTS"
  fi

  # --form-string sends the value literally. With -F, curl would interpret a
  # leading '@'/'<' or a ';type=' sequence inside the value.
  local response
  response=$(curl "${opts[@]}" -X POST "$RAG_URL" \
    -F "file=@${file}" \
    --form-string "collection=${collection}" \
    --form-string "data_type=${data_type}" \
    --form-string "use_case=${use_case}" \
    --form-string "year=${year}" \
    --form-string "chunk_strategy=recursive" \
    --form-string "chunk_size=1024" \
    --form-string "chunk_overlap=128" \
    --form-string "metadata_json=${metadata_json}" \
    2>/dev/null) || true

  # Substring match instead of `echo | grep -q`: under pipefail a grep that
  # exits early can make the pipeline fail with SIGPIPE on a large response.
  if [[ "$response" == *'"chunks_count"'* || "$response" == *'"vectors_indexed"'* ]]; then
    local chunks
    chunks=$(python3 -c "import sys,json; d=json.load(sys.stdin); print(d.get('chunks_count', d.get('vectors_indexed',0)))" \
      <<<"$response" 2>/dev/null || echo "?")
    ok "$label -> $chunks chunks"
    UPLOADED=$((UPLOADED + 1))
  else
    fail "Upload failed: $label"
    fail "Response: ${response:0:200}"
    FAILED=$((FAILED + 1))
  fi
}
# =============================================================================
# PHASE I-1: Downloads
# =============================================================================

# Logs where to obtain a web-only document, unless its PDF is already on disk.
# Arguments: $1 - expected PDF path, $2 - document title, $3 - source URL
_note_manual_pdf() {
  local pdf="$1"
  local title="$2"
  local source_url="$3"

  if [[ ! -f "$pdf" ]]; then
    log " ${title}: Kein PDF verfuegbar — muss manuell als PDF gedruckt werden"
    log " Quelle: ${source_url}"
  fi
}

# Uploads a manually provided PDF if it exists; otherwise warns and counts it as
# skipped.
# Globals:   SKIPPED (written)
# Arguments: $1 - warning text used when the file is missing
#            $2.. - arguments passed through to upload_file ($2 is the file path)
_upload_if_present() {
  local missing_msg="$1"
  shift

  if [[ -f "$1" ]]; then
    upload_file "$@"
  else
    warn "$missing_msg"
    SKIPPED=$((SKIPPED + 1))
  fi
}

#######################################
# Fetch every document that has a direct PDF URL into $WORK_DIR/pdfs and log a
# hint for each web-only document whose PDF has not been provided manually.
# Globals: WORK_DIR (read)
#######################################
phase_i_download() {
  log "=========================================="
  log "PHASE I-1: Downloads"
  log "=========================================="

  local pdf_dir="$WORK_DIR/pdfs"
  mkdir -p "$pdf_dir"

  # --- Priority 1: NIST Special Publications ---
  log "--- NIST Special Publications ---"
  download_pdf "https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-160v1r1.pdf" \
    "$pdf_dir/nist_sp_800_160v1r1.pdf"
  download_pdf "https://nvlpubs.nist.gov/nistpubs/Legacy/SP/nistspecialpublication800-30r1.pdf" \
    "$pdf_dir/nist_sp_800_30r1.pdf"
  download_pdf "https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-82r3.pdf" \
    "$pdf_dir/nist_sp_800_82r3.pdf"

  # --- Priority 1: Open Standards (available as PDF) ---
  log "--- Open Standards ---"
  download_pdf "https://spdx.dev/wp-content/uploads/sites/31/2024/12/SPDX-3.0.1-1.pdf" \
    "$pdf_dir/spdx_3_0_1.pdf"
  download_pdf "https://www.first.org/cvss/v4-0/cvss-v40-specification.pdf" \
    "$pdf_dir/cvss_v4_0.pdf"

  # --- Priority 2: EU + FDA ---
  log "--- EU + FDA Guidelines ---"
  download_pdf "https://ec.europa.eu/docsroom/documents/60145/attachments/1/translations/en/renditions/pdf" \
    "$pdf_dir/eu_machinery_guide_2006_42.pdf"
  download_pdf "https://www.fda.gov/media/80481/download" \
    "$pdf_dir/fda_human_factors.pdf"

  # --- Web-based specs: nothing is downloaded here; they have to be provided
  # manually as PDFs, this section only logs where to get them. ---
  log "--- Web-basierte Specs (SLSA, CycloneDX, OpenTelemetry) ---"
  log " Diese Specs sind primaer web-basiert. Lade GitHub-Repos als Referenz..."

  # SLSA, CycloneDX, OpenTelemetry: Apache-2.0. GPAI documents: EU Public.
  # NOTE(review): the SLSA file/metadata say v1.0 while the hint points at the
  # draft spec URL — confirm which version is meant to be ingested.
  _note_manual_pdf "$pdf_dir/slsa_v1_0.pdf" "SLSA" \
    "https://slsa.dev/spec/draft/"
  _note_manual_pdf "$pdf_dir/cyclonedx_spec.pdf" "CycloneDX" \
    "https://cyclonedx.org/specification/overview/"
  _note_manual_pdf "$pdf_dir/opentelemetry_spec.pdf" "OpenTelemetry" \
    "https://opentelemetry.io/docs/specs/otel/"
  _note_manual_pdf "$pdf_dir/gpai_code_of_practice.pdf" "GPAI Code of Practice" \
    "https://digital-strategy.ec.europa.eu/en/policies/contents-code-gpai"
  _note_manual_pdf "$pdf_dir/gpai_scope_guidelines.pdf" "GPAI Scope Guidelines" \
    "https://digital-strategy.ec.europa.eu/en/policies/guidelines-gpai-providers"

  log "Downloads abgeschlossen."
}

# =============================================================================
# PHASE I-2: NIST Special Publications → bp_compliance_ce
# =============================================================================

#######################################
# Upload the three NIST Special Publications to bp_compliance_ce.
# Globals: WORK_DIR (read)
#######################################
phase_i_nist() {
  log "=========================================="
  log "PHASE I-2: NIST SPs -> bp_compliance_ce"
  log "=========================================="

  local col="bp_compliance_ce"

  upload_file "$WORK_DIR/pdfs/nist_sp_800_160v1r1.pdf" "$col" "compliance_ce" "security_engineering" "2022" \
    '{"regulation_id":"nist_sp_800_160v1r1","regulation_name_de":"NIST SP 800-160 Vol. 1 Rev. 1 — Engineering Trustworthy Secure Systems","regulation_name_en":"NIST SP 800-160 Vol. 1 Rev. 1 — Engineering Trustworthy Secure Systems","regulation_short":"NIST SP 800-160","category":"security_engineering","license":"public_domain_us","source":"nist.gov"}' \
    "NIST SP 800-160 Vol. 1 Rev. 1 (Trustworthy Secure Systems)"

  upload_file "$WORK_DIR/pdfs/nist_sp_800_30r1.pdf" "$col" "compliance_ce" "risk_assessment" "2012" \
    '{"regulation_id":"nist_sp_800_30r1","regulation_name_de":"NIST SP 800-30 Rev. 1 — Guide for Conducting Risk Assessments","regulation_name_en":"NIST SP 800-30 Rev. 1 — Guide for Conducting Risk Assessments","regulation_short":"NIST SP 800-30","category":"risk_assessment","license":"public_domain_us","source":"nist.gov"}' \
    "NIST SP 800-30 Rev. 1 (Risk Assessments)"

  upload_file "$WORK_DIR/pdfs/nist_sp_800_82r3.pdf" "$col" "compliance_ce" "ot_security" "2023" \
    '{"regulation_id":"nist_sp_800_82r3","regulation_name_de":"NIST SP 800-82 Rev. 3 — Guide to OT Security","regulation_name_en":"NIST SP 800-82 Rev. 3 — Guide to Operational Technology Security","regulation_short":"NIST SP 800-82","category":"ot_security","license":"public_domain_us","source":"nist.gov"}' \
    "NIST SP 800-82 Rev. 3 (OT Security)"
}

# =============================================================================
# PHASE I-3: Open Standards → bp_compliance_ce
# =============================================================================

#######################################
# Upload the open standards to bp_compliance_ce. SPDX and CVSS are always
# attempted; SLSA, CycloneDX and OpenTelemetry only if their PDF was provided
# manually (otherwise they are counted as skipped).
# Globals: WORK_DIR (read), SKIPPED (written)
#######################################
phase_i_standards() {
  log "=========================================="
  log "PHASE I-3: Open Standards -> bp_compliance_ce"
  log "=========================================="

  local col="bp_compliance_ce"

  upload_file "$WORK_DIR/pdfs/spdx_3_0_1.pdf" "$col" "compliance_ce" "sbom" "2024" \
    '{"regulation_id":"spdx_3_0_1","regulation_name_de":"SPDX 3.0.1 — Software Package Data Exchange","regulation_name_en":"SPDX 3.0.1 — Software Package Data Exchange","regulation_short":"SPDX 3.0","category":"sbom","license":"CC-BY-3.0","source":"spdx.dev"}' \
    "SPDX 3.0.1 Specification"

  upload_file "$WORK_DIR/pdfs/cvss_v4_0.pdf" "$col" "compliance_ce" "vulnerability_scoring" "2023" \
    '{"regulation_id":"cvss_v4_0","regulation_name_de":"CVSS v4.0 — Common Vulnerability Scoring System","regulation_name_en":"CVSS v4.0 — Common Vulnerability Scoring System","regulation_short":"CVSS v4.0","category":"vulnerability_scoring","license":"CC-BY-4.0","source":"first.org"}' \
    "CVSS v4.0 Specification"

  # Web-based specs — only if provided manually as PDF.
  _upload_if_present \
    "SLSA PDF nicht vorhanden — uebersprungen (manuell drucken: https://slsa.dev/spec/draft/)" \
    "$WORK_DIR/pdfs/slsa_v1_0.pdf" "$col" "compliance_ce" "supply_chain_security" "2023" \
    '{"regulation_id":"slsa_v1_0","regulation_name_de":"SLSA v1.0 — Supply-chain Levels for Software Artifacts","regulation_name_en":"SLSA v1.0 — Supply-chain Levels for Software Artifacts","regulation_short":"SLSA v1.0","category":"supply_chain_security","license":"Apache-2.0","source":"slsa.dev"}' \
    "SLSA v1.0 Specification"

  _upload_if_present \
    "CycloneDX PDF nicht vorhanden — uebersprungen (manuell drucken: https://cyclonedx.org/specification/overview/)" \
    "$WORK_DIR/pdfs/cyclonedx_spec.pdf" "$col" "compliance_ce" "sbom" "2024" \
    '{"regulation_id":"cyclonedx_1_6","regulation_name_de":"CycloneDX 1.6 — SBOM Standard","regulation_name_en":"CycloneDX 1.6 — Software Bill of Materials Standard","regulation_short":"CycloneDX 1.6","category":"sbom","license":"Apache-2.0","source":"cyclonedx.org"}' \
    "CycloneDX 1.6 Specification"

  _upload_if_present \
    "OpenTelemetry PDF nicht vorhanden — uebersprungen (manuell drucken: https://opentelemetry.io/docs/specs/otel/)" \
    "$WORK_DIR/pdfs/opentelemetry_spec.pdf" "$col" "compliance_ce" "observability" "2024" \
    '{"regulation_id":"opentelemetry_spec","regulation_name_de":"OpenTelemetry Specification — Observability Framework","regulation_name_en":"OpenTelemetry Specification — Observability Framework","regulation_short":"OpenTelemetry","category":"observability","license":"Apache-2.0","source":"opentelemetry.io"}' \
    "OpenTelemetry Specification"
}

# =============================================================================
# PHASE I-4: EU Guidelines + FDA → bp_compliance_ce
# =============================================================================

#######################################
# Upload the EU machinery guide and the FDA human factors guidance to
# bp_compliance_ce; the two GPAI documents only if their PDF was provided
# manually (otherwise they are counted as skipped).
# Globals: WORK_DIR (read), SKIPPED (written)
#######################################
phase_i_guidelines() {
  log "=========================================="
  log "PHASE I-4: EU Guidelines + FDA -> bp_compliance_ce"
  log "=========================================="

  local col="bp_compliance_ce"

  upload_file "$WORK_DIR/pdfs/eu_machinery_guide_2006_42.pdf" "$col" "compliance_ce" "product_safety" "2010" \
    '{"regulation_id":"eu_machinery_guide_2006_42","regulation_name_de":"Leitfaden Maschinenrichtlinie 2006/42/EG (2. Auflage)","regulation_name_en":"Guide to Application of the Machinery Directive 2006/42/EC","regulation_short":"Machinery Guide","category":"product_safety","license":"eu_public","source":"ec.europa.eu"}' \
    "EU Machinery Directive Guide 2006/42/EC"

  upload_file "$WORK_DIR/pdfs/fda_human_factors.pdf" "$col" "compliance_ce" "human_factors" "2016" \
    '{"regulation_id":"fda_human_factors","regulation_name_de":"FDA Human Factors Engineering — Medical Devices","regulation_name_en":"Applying Human Factors and Usability Engineering to Medical Devices","regulation_short":"FDA HFE","category":"human_factors","license":"public_domain_us","source":"fda.gov"}' \
    "FDA Human Factors Guidance"

  # GPAI — only if provided manually as PDF.
  _upload_if_present \
    "GPAI Code of Practice PDF nicht vorhanden — uebersprungen" \
    "$WORK_DIR/pdfs/gpai_code_of_practice.pdf" "$col" "compliance_ce" "ai_regulation" "2025" \
    '{"regulation_id":"gpai_code_of_practice","regulation_name_de":"GPAI Code of Practice — Verhaltenskodex fuer KI-Modelle","regulation_name_en":"General-Purpose AI Code of Practice","regulation_short":"GPAI CoP","category":"ai_regulation","license":"eu_public","source":"ec.europa.eu"}' \
    "GPAI Code of Practice"

  _upload_if_present \
    "GPAI Scope Guidelines PDF nicht vorhanden — uebersprungen" \
    "$WORK_DIR/pdfs/gpai_scope_guidelines.pdf" "$col" "compliance_ce" "ai_regulation" "2025" \
    '{"regulation_id":"gpai_scope_guidelines","regulation_name_de":"GPAI Scope Guidelines — Leitlinien fuer KI-Anbieter","regulation_name_en":"Guidelines for Providers of General-Purpose AI Models","regulation_short":"GPAI Guidelines","category":"ai_regulation","license":"eu_public","source":"ec.europa.eu"}' \
    "GPAI Scope Guidelines"
}

# =============================================================================
# MAIN
# =============================================================================

#######################################
# Entry point: run the download phase and the three upload phases, then print a
# summary of the run counters.
# Globals:   WORK_DIR, RAG_URL, UPLOADED, SKIPPED, FAILED (read)
# Arguments: none used
#######################################
main() {
  log "=========================================="
  log "PHASE I: RAG Ingestion — Technische Standards + Guidelines"
  log "=========================================="
  log "Work dir: $WORK_DIR"
  log "RAG URL: $RAG_URL"
  log ""
  log "Dokumente: 7 direkte PDFs + 5 web-basierte (manuell als PDF)"
  log ""

  phase_i_download
  phase_i_nist
  phase_i_standards
  phase_i_guidelines

  log "=========================================="
  log "PHASE I ABGESCHLOSSEN"
  log " Hochgeladen: $UPLOADED"
  log " Uebersprungen: $SKIPPED"
  log " Fehlgeschlagen: $FAILED"
  log "=========================================="

  # SKIPPED also counts documents skipped by the dedup check, so it says nothing
  # about whether a manual PDF is still missing. Check the files themselves and
  # list only the ones that actually need to be provided.
  local -a manual_pdfs=(
    "slsa_v1_0.pdf|https://slsa.dev/spec/draft/"
    "cyclonedx_spec.pdf|https://cyclonedx.org/specification/overview/"
    "opentelemetry_spec.pdf|https://opentelemetry.io/docs/specs/otel/"
    "gpai_code_of_practice.pdf|https://digital-strategy.ec.europa.eu/en/policies/contents-code-gpai"
    "gpai_scope_guidelines.pdf|https://digital-strategy.ec.europa.eu/en/policies/guidelines-gpai-providers"
  )
  local -a missing=()
  local entry
  for entry in "${manual_pdfs[@]}"; do
    # Each entry is "<file name>|<source URL>".
    if [[ ! -f "$WORK_DIR/pdfs/${entry%%|*}" ]]; then
      missing+=("$entry")
    fi
  done

  # Test the length before expanding: an empty array expansion is an error under
  # `set -u` in older bash versions.
  if (( ${#missing[@]} > 0 )); then
    log ""
    log "HINWEIS: Web-basierte Specs muessen manuell als PDF gedruckt werden."
    log "Lege die PDFs nach: $WORK_DIR/pdfs/"
    for entry in "${missing[@]}"; do
      log " - ${entry%%|*} (${entry#*|})"
    done
    log "Dann Script erneut ausfuehren — bereits hochgeladene werden per Dedup uebersprungen."
  fi
}

main "$@"