Compare commits
2 Commits
a14e2f3a00
...
0027f78fc5
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0027f78fc5 | ||
|
|
b29a7caee7 |
@@ -13,11 +13,14 @@ import (
|
||||
func TestAllowedCollections(t *testing.T) {
|
||||
allowed := []string{
|
||||
"bp_compliance_ce",
|
||||
"bp_compliance_recht",
|
||||
"bp_compliance_gesetze",
|
||||
"bp_compliance_datenschutz",
|
||||
"bp_compliance_gdpr",
|
||||
"bp_dsfa_corpus",
|
||||
"bp_dsfa_templates",
|
||||
"bp_dsfa_risks",
|
||||
"bp_legal_templates",
|
||||
"bp_iace_libraries",
|
||||
}
|
||||
|
||||
for _, c := range allowed {
|
||||
|
||||
348
scripts/ingest-phase-i.sh
Executable file
348
scripts/ingest-phase-i.sh
Executable file
@@ -0,0 +1,348 @@
|
||||
#!/usr/bin/env bash
|
||||
# =============================================================================
|
||||
# BreakPilot Compliance — Phase I RAG Ingestion
|
||||
#
|
||||
# Downloads and ingests ~12 new technical standards and guidelines:
|
||||
# - 3 NIST Special Publications (800-160, 800-30, 800-82) → bp_compliance_ce
|
||||
# - 5 Open Standards (SLSA, SPDX, CycloneDX, OpenTelemetry, CVSS) → bp_compliance_ce
|
||||
# - 2 EU Guidelines (Machinery Guide, GPAI) → bp_compliance_ce
|
||||
# - 2 Optional (GPAI Scope, FDA Human Factors) → bp_compliance_ce
|
||||
#
|
||||
# All documents are licensed for commercial use (US public domain / Apache-2.0 / CC-BY / EU public)
|
||||
#
|
||||
# Run on Mac Mini:
|
||||
# bash ~/Projekte/breakpilot-compliance/scripts/ingest-phase-i.sh
|
||||
# =============================================================================
|
||||
# Strict mode: abort on command errors, unset variables and failed pipeline stages.
set -euo pipefail

# Working directory and service endpoints; each keeps a value already present
# in the environment and otherwise falls back to the default below.
: "${WORK_DIR:=$HOME/rag-ingestion-i}"
: "${RAG_URL:=https://localhost:8097/api/v1/documents/upload}"
: "${QDRANT_URL:=http://localhost:6333}"

# curl flag strings (word-split by the functions that hand them to curl):
# silent, TLS verification disabled (-k), 10s connect timeout. The LARGE
# variant raises the overall time limit from 300s to 900s.
CURL_OPTS="-sk --connect-timeout 10 --max-time 300"
CURL_OPTS_LARGE="-sk --connect-timeout 10 --max-time 900"

# Run counters, printed in the final summary.
UPLOADED=0
FAILED=0
SKIPPED=0
|
||||
|
||||
# Print a timestamped ([HH:MM:SS]) informational message to stdout.
log() { printf '[%s] %s\n' "$(date '+%H:%M:%S')" "$*"; }
|
||||
# Print a timestamped success message (tagged "ok") to stdout.
ok() { printf '[%s] ok %s\n' "$(date '+%H:%M:%S')" "$*"; }
|
||||
# Print a timestamped warning (tagged "WARN") to stderr.
warn() { printf '[%s] WARN %s\n' "$(date '+%H:%M:%S')" "$*" >&2; }
|
||||
# Print a timestamped failure message (tagged "FAIL") to stderr.
fail() { printf '[%s] FAIL %s\n' "$(date '+%H:%M:%S')" "$*" >&2; }
|
||||
|
||||
#######################################
# Fetch a PDF to a local path unless the file is already present.
#
# The download is best-effort: every failure is reported via warn and the
# function still returns 0, so a single unreachable source does not abort the
# run under `set -e`. A file is only placed at the target path after it has
# been fully downloaded and validated.
#
# Globals:   CURL_OPTS_LARGE (read) - whitespace-separated curl flags
# Arguments: $1 - source URL
#            $2 - destination file path
# Outputs:   progress via log, problems via warn
# Returns:   0 always
#######################################
download_pdf() {
  local url="$1"
  local target="$2"

  if [[ -f "$target" ]]; then
    log "PDF exists: $(basename "$target") (skipping download)"
    return 0
  fi

  log "Downloading: $(basename "$target")"

  # Split the flag string explicitly so the call does not depend on the
  # caller's IFS or on unquoted expansion.
  # NOTE(review): CURL_OPTS_LARGE contains -k, i.e. TLS verification is off
  # for these public downloads as well - confirm that this is intended.
  local -a curl_opts=()
  IFS=$' \t\n' read -r -a curl_opts <<<"${CURL_OPTS_LARGE:-}"

  # Download to a side file: an interrupted transfer must never leave a
  # truncated file at $target, because the next run would treat it as done.
  local partial="${target}.part"

  # --fail makes curl exit non-zero on HTTP errors instead of saving the
  # server's error page as if it were the document.
  if ! curl ${curl_opts[@]+"${curl_opts[@]}"} --fail -L "$url" -o "$partial" 2>/dev/null; then
    warn "Download failed: $url"
    rm -f -- "$partial"
    return 0
  fi

  local fsize
  fsize=$(wc -c <"$partial" 2>/dev/null || echo 0)
  fsize=$(( fsize ))   # strip the padding some wc implementations emit

  if [[ "$fsize" -lt 1000 ]]; then
    warn "Download too small (${fsize}B): $(basename "$target")"
    rm -f -- "$partial"
    return 0
  fi

  # Reject HTML block/consent pages that are large enough to pass the size
  # check: look for the PDF header marker near the start of the file.
  if ! head -c 1024 "$partial" | LC_ALL=C grep -aq '%PDF-'; then
    warn "Download is not a PDF: $(basename "$target")"
    rm -f -- "$partial"
    return 0
  fi

  if ! mv -f -- "$partial" "$target"; then
    warn "Download failed: $url"
    rm -f -- "$partial"
    return 0
  fi

  log " Downloaded: $(( fsize / 1024 ))KB"
}
|
||||
|
||||
#######################################
# Upload one document to the RAG upload endpoint, skipping documents whose
# regulation_id is already present in the target Qdrant collection.
#
# Every outcome is recorded in a counter and the function always returns 0,
# so one bad document does not abort the run under `set -e`.
#
# Globals:   RAG_URL, QDRANT_URL, CURL_OPTS, CURL_OPTS_LARGE (read)
#            UPLOADED, FAILED, SKIPPED (incremented)
# Arguments: $1 - path of the file to upload
#            $2 - target collection name
#            $3 - data_type form field
#            $4 - use_case form field
#            $5 - year form field
#            $6 - metadata as a JSON object string; its "regulation_id"
#                 (if any) drives the dedup check
#            $7 - optional display label (default: basename of $1)
# Outputs:   progress via log/ok, problems via warn/fail
# Returns:   0 always
#######################################
upload_file() {
  local file="$1"
  local collection="$2"
  local data_type="$3"
  local use_case="$4"
  local year="$5"
  local metadata_json="$6"
  local label="${7:-$(basename "$file")}"

  if [[ ! -f "$file" ]]; then
    warn "File not found: $file"
    FAILED=$((FAILED + 1))
    return 0
  fi

  # Dedup check: best-effort. If python3 or Qdrant is unavailable the check
  # is skipped and the upload proceeds.
  local reg_id
  reg_id=$(python3 -c "import sys,json; print(json.load(sys.stdin).get('regulation_id',''))" \
    <<<"$metadata_json" 2>/dev/null || echo "")
  if [[ -n "$reg_id" ]]; then
    # Let json.dumps build the filter so quotes/backslashes in the id cannot
    # produce a malformed request body.
    local filter
    filter=$(python3 -c 'import json, sys; print(json.dumps({"filter": {"must": [{"key": "regulation_id", "match": {"value": sys.argv[1]}}]}, "limit": 1}))' \
      "$reg_id" 2>/dev/null || echo "")

    local existing="0"
    if [[ -n "$filter" ]]; then
      existing=$(curl -sk --max-time 5 -X POST "${QDRANT_URL}/collections/${collection}/points/scroll" \
        -H "Content-Type: application/json" \
        -d "$filter" \
        2>/dev/null \
        | python3 -c "import sys,json; r=json.load(sys.stdin).get('result',{}); print(len(r.get('points',[])))" 2>/dev/null \
        || echo "0")
    fi

    # Only trust a clean non-negative integer; anything else means "unknown".
    if [[ "$existing" =~ ^[0-9]+$ ]] && (( 10#$existing > 0 )); then
      log "SKIP (already in Qdrant): $label [regulation_id=$reg_id]"
      SKIPPED=$((SKIPPED + 1))
      return 0
    fi
  fi

  local filesize
  filesize=$(wc -c <"$file" 2>/dev/null || echo 0)
  filesize=$(( filesize ))   # strip the padding some wc implementations emit
  if [[ "$filesize" -lt 100 ]]; then
    warn "File too small (${filesize}B): $label"
    SKIPPED=$((SKIPPED + 1))
    return 0
  fi

  log "Uploading: $label -> $collection ($(( filesize / 1024 ))KB)"

  # Files above 256000 bytes get the longer overall timeout.
  local opts_str="$CURL_OPTS"
  if [[ "$filesize" -gt 256000 ]]; then
    opts_str="$CURL_OPTS_LARGE"
  fi
  local -a curl_opts=()
  IFS=$' \t\n' read -r -a curl_opts <<<"$opts_str"

  # --form-string sends text fields literally; with -F curl would interpret a
  # leading '@' or '<' and any ';type=' inside the value.
  local response
  response=$(curl ${curl_opts[@]+"${curl_opts[@]}"} -X POST "$RAG_URL" \
    -F "file=@${file}" \
    --form-string "collection=${collection}" \
    --form-string "data_type=${data_type}" \
    --form-string "use_case=${use_case}" \
    --form-string "year=${year}" \
    --form-string "chunk_strategy=recursive" \
    --form-string "chunk_size=1024" \
    --form-string "chunk_overlap=128" \
    --form-string "metadata_json=${metadata_json}" \
    2>/dev/null) || true

  # Pattern match in the shell instead of `echo | grep -q`: no reliance on
  # the non-POSIX BRE '\|' and no SIGPIPE-induced false negative under
  # `set -o pipefail` when the response is large.
  if [[ "$response" == *'"chunks_count"'* || "$response" == *'"vectors_indexed"'* ]]; then
    local chunks
    chunks=$(python3 -c "import sys,json; d=json.load(sys.stdin); print(d.get('chunks_count', d.get('vectors_indexed',0)))" \
      <<<"$response" 2>/dev/null || echo "?")
    ok "$label -> $chunks chunks"
    UPLOADED=$((UPLOADED + 1))
  else
    fail "Upload failed: $label"
    fail "Response: ${response:0:200}"
    FAILED=$((FAILED + 1))
  fi
}
|
||||
|
||||
# =============================================================================
|
||||
# PHASE I-1: Downloads
|
||||
# =============================================================================
|
||||
# Phase I-1: fetch every directly downloadable PDF into $WORK_DIR/pdfs and
# point out the web-only specifications that have to be supplied by hand.
phase_i_download() {
  local rule="=========================================="
  log "$rule"
  log "PHASE I-1: Downloads"
  log "$rule"

  local pdf_dir="$WORK_DIR/pdfs"
  mkdir -p "$pdf_dir"

  # --- Priority 1: NIST Special Publications ---
  log "--- NIST Special Publications ---"
  download_pdf "https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-160v1r1.pdf" \
    "$pdf_dir/nist_sp_800_160v1r1.pdf"
  download_pdf "https://nvlpubs.nist.gov/nistpubs/Legacy/SP/nistspecialpublication800-30r1.pdf" \
    "$pdf_dir/nist_sp_800_30r1.pdf"
  download_pdf "https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-82r3.pdf" \
    "$pdf_dir/nist_sp_800_82r3.pdf"

  # --- Priority 1: open standards that are published as PDF ---
  log "--- Open Standards ---"
  download_pdf "https://spdx.dev/wp-content/uploads/sites/31/2024/12/SPDX-3.0.1-1.pdf" \
    "$pdf_dir/spdx_3_0_1.pdf"
  download_pdf "https://www.first.org/cvss/v4-0/cvss-v40-specification.pdf" \
    "$pdf_dir/cvss_v4_0.pdf"

  # --- Priority 2: EU + FDA ---
  log "--- EU + FDA Guidelines ---"
  download_pdf "https://ec.europa.eu/docsroom/documents/60145/attachments/1/translations/en/renditions/pdf" \
    "$pdf_dir/eu_machinery_guide_2006_42.pdf"
  download_pdf "https://www.fda.gov/media/80481/download" \
    "$pdf_dir/fda_human_factors.pdf"

  # --- Web-based specs: no PDF download exists, they must be provided
  # manually. Each entry is "<expected file>|<label>|<source URL>"; a notice
  # is logged for every file that is not present yet.
  log "--- Web-basierte Specs (SLSA, CycloneDX, OpenTelemetry) ---"
  log " Diese Specs sind primaer web-basiert. Lade GitHub-Repos als Referenz..."

  local entry pdf_name label source_url
  for entry in \
    "slsa_v1_0.pdf|SLSA|https://slsa.dev/spec/draft/" \
    "cyclonedx_spec.pdf|CycloneDX|https://cyclonedx.org/specification/overview/" \
    "opentelemetry_spec.pdf|OpenTelemetry|https://opentelemetry.io/docs/specs/otel/" \
    "gpai_code_of_practice.pdf|GPAI Code of Practice|https://digital-strategy.ec.europa.eu/en/policies/contents-code-gpai" \
    "gpai_scope_guidelines.pdf|GPAI Scope Guidelines|https://digital-strategy.ec.europa.eu/en/policies/guidelines-gpai-providers"; do
    IFS='|' read -r pdf_name label source_url <<<"$entry"
    if [[ -f "$pdf_dir/$pdf_name" ]]; then
      continue
    fi
    log " ${label}: Kein PDF verfuegbar — muss manuell als PDF gedruckt werden"
    log " Quelle: ${source_url}"
  done

  log "Downloads abgeschlossen."
}
|
||||
|
||||
# =============================================================================
|
||||
# PHASE I-2: NIST Special Publications → bp_compliance_ce
|
||||
# =============================================================================
|
||||
# Phase I-2: upload the three NIST Special Publications into the
# bp_compliance_ce collection.
phase_i_nist() {
  local rule="=========================================="
  log "$rule"
  log "PHASE I-2: NIST SPs -> bp_compliance_ce"
  log "$rule"

  local target_collection="bp_compliance_ce"
  local pdf_dir="$WORK_DIR/pdfs"
  local meta

  meta='{"regulation_id":"nist_sp_800_160v1r1","regulation_name_de":"NIST SP 800-160 Vol. 1 Rev. 1 — Engineering Trustworthy Secure Systems","regulation_name_en":"NIST SP 800-160 Vol. 1 Rev. 1 — Engineering Trustworthy Secure Systems","regulation_short":"NIST SP 800-160","category":"security_engineering","license":"public_domain_us","source":"nist.gov"}'
  upload_file "$pdf_dir/nist_sp_800_160v1r1.pdf" "$target_collection" "compliance_ce" "security_engineering" "2022" \
    "$meta" \
    "NIST SP 800-160 Vol. 1 Rev. 1 (Trustworthy Secure Systems)"

  meta='{"regulation_id":"nist_sp_800_30r1","regulation_name_de":"NIST SP 800-30 Rev. 1 — Guide for Conducting Risk Assessments","regulation_name_en":"NIST SP 800-30 Rev. 1 — Guide for Conducting Risk Assessments","regulation_short":"NIST SP 800-30","category":"risk_assessment","license":"public_domain_us","source":"nist.gov"}'
  upload_file "$pdf_dir/nist_sp_800_30r1.pdf" "$target_collection" "compliance_ce" "risk_assessment" "2012" \
    "$meta" \
    "NIST SP 800-30 Rev. 1 (Risk Assessments)"

  meta='{"regulation_id":"nist_sp_800_82r3","regulation_name_de":"NIST SP 800-82 Rev. 3 — Guide to OT Security","regulation_name_en":"NIST SP 800-82 Rev. 3 — Guide to Operational Technology Security","regulation_short":"NIST SP 800-82","category":"ot_security","license":"public_domain_us","source":"nist.gov"}'
  upload_file "$pdf_dir/nist_sp_800_82r3.pdf" "$target_collection" "compliance_ce" "ot_security" "2023" \
    "$meta" \
    "NIST SP 800-82 Rev. 3 (OT Security)"
}
|
||||
|
||||
# =============================================================================
|
||||
# PHASE I-3: Open Standards → bp_compliance_ce
|
||||
# =============================================================================
|
||||
# Phase I-3: upload the open standards into bp_compliance_ce. SPDX and CVSS
# are always attempted; SLSA, CycloneDX and OpenTelemetry only when their
# manually provided PDF exists, otherwise they are counted as skipped.
phase_i_standards() {
  local rule="=========================================="
  log "$rule"
  log "PHASE I-3: Open Standards -> bp_compliance_ce"
  log "$rule"

  local target_collection="bp_compliance_ce"
  local pdf_dir="$WORK_DIR/pdfs"
  local meta

  meta='{"regulation_id":"spdx_3_0_1","regulation_name_de":"SPDX 3.0.1 — Software Package Data Exchange","regulation_name_en":"SPDX 3.0.1 — Software Package Data Exchange","regulation_short":"SPDX 3.0","category":"sbom","license":"CC-BY-3.0","source":"spdx.dev"}'
  upload_file "$pdf_dir/spdx_3_0_1.pdf" "$target_collection" "compliance_ce" "sbom" "2024" \
    "$meta" \
    "SPDX 3.0.1 Specification"

  meta='{"regulation_id":"cvss_v4_0","regulation_name_de":"CVSS v4.0 — Common Vulnerability Scoring System","regulation_name_en":"CVSS v4.0 — Common Vulnerability Scoring System","regulation_short":"CVSS v4.0","category":"vulnerability_scoring","license":"CC-BY-4.0","source":"first.org"}'
  upload_file "$pdf_dir/cvss_v4_0.pdf" "$target_collection" "compliance_ce" "vulnerability_scoring" "2023" \
    "$meta" \
    "CVSS v4.0 Specification"

  # Web-based specs: only uploaded when a PDF was provided manually.
  if [[ ! -f "$pdf_dir/slsa_v1_0.pdf" ]]; then
    warn "SLSA PDF nicht vorhanden — uebersprungen (manuell drucken: https://slsa.dev/spec/draft/)"
    SKIPPED=$((SKIPPED + 1))
  else
    meta='{"regulation_id":"slsa_v1_0","regulation_name_de":"SLSA v1.0 — Supply-chain Levels for Software Artifacts","regulation_name_en":"SLSA v1.0 — Supply-chain Levels for Software Artifacts","regulation_short":"SLSA v1.0","category":"supply_chain_security","license":"Apache-2.0","source":"slsa.dev"}'
    upload_file "$pdf_dir/slsa_v1_0.pdf" "$target_collection" "compliance_ce" "supply_chain_security" "2023" \
      "$meta" \
      "SLSA v1.0 Specification"
  fi

  if [[ ! -f "$pdf_dir/cyclonedx_spec.pdf" ]]; then
    warn "CycloneDX PDF nicht vorhanden — uebersprungen (manuell drucken: https://cyclonedx.org/specification/overview/)"
    SKIPPED=$((SKIPPED + 1))
  else
    meta='{"regulation_id":"cyclonedx_1_6","regulation_name_de":"CycloneDX 1.6 — SBOM Standard","regulation_name_en":"CycloneDX 1.6 — Software Bill of Materials Standard","regulation_short":"CycloneDX 1.6","category":"sbom","license":"Apache-2.0","source":"cyclonedx.org"}'
    upload_file "$pdf_dir/cyclonedx_spec.pdf" "$target_collection" "compliance_ce" "sbom" "2024" \
      "$meta" \
      "CycloneDX 1.6 Specification"
  fi

  if [[ ! -f "$pdf_dir/opentelemetry_spec.pdf" ]]; then
    warn "OpenTelemetry PDF nicht vorhanden — uebersprungen (manuell drucken: https://opentelemetry.io/docs/specs/otel/)"
    SKIPPED=$((SKIPPED + 1))
  else
    meta='{"regulation_id":"opentelemetry_spec","regulation_name_de":"OpenTelemetry Specification — Observability Framework","regulation_name_en":"OpenTelemetry Specification — Observability Framework","regulation_short":"OpenTelemetry","category":"observability","license":"Apache-2.0","source":"opentelemetry.io"}'
    upload_file "$pdf_dir/opentelemetry_spec.pdf" "$target_collection" "compliance_ce" "observability" "2024" \
      "$meta" \
      "OpenTelemetry Specification"
  fi
}
|
||||
|
||||
# =============================================================================
|
||||
# PHASE I-4: EU Guidelines + FDA → bp_compliance_ce
|
||||
# =============================================================================
|
||||
# Phase I-4: upload the EU Machinery Guide and the FDA human-factors guidance
# into bp_compliance_ce; the two GPAI documents are uploaded only when their
# manually provided PDF exists, otherwise they are counted as skipped.
phase_i_guidelines() {
  local rule="=========================================="
  log "$rule"
  log "PHASE I-4: EU Guidelines + FDA -> bp_compliance_ce"
  log "$rule"

  local target_collection="bp_compliance_ce"
  local pdf_dir="$WORK_DIR/pdfs"
  local meta

  meta='{"regulation_id":"eu_machinery_guide_2006_42","regulation_name_de":"Leitfaden Maschinenrichtlinie 2006/42/EG (2. Auflage)","regulation_name_en":"Guide to Application of the Machinery Directive 2006/42/EC","regulation_short":"Machinery Guide","category":"product_safety","license":"eu_public","source":"ec.europa.eu"}'
  upload_file "$pdf_dir/eu_machinery_guide_2006_42.pdf" "$target_collection" "compliance_ce" "product_safety" "2010" \
    "$meta" \
    "EU Machinery Directive Guide 2006/42/EC"

  meta='{"regulation_id":"fda_human_factors","regulation_name_de":"FDA Human Factors Engineering — Medical Devices","regulation_name_en":"Applying Human Factors and Usability Engineering to Medical Devices","regulation_short":"FDA HFE","category":"human_factors","license":"public_domain_us","source":"fda.gov"}'
  upload_file "$pdf_dir/fda_human_factors.pdf" "$target_collection" "compliance_ce" "human_factors" "2016" \
    "$meta" \
    "FDA Human Factors Guidance"

  # GPAI documents: only uploaded when a PDF was provided manually.
  if [[ ! -f "$pdf_dir/gpai_code_of_practice.pdf" ]]; then
    warn "GPAI Code of Practice PDF nicht vorhanden — uebersprungen"
    SKIPPED=$((SKIPPED + 1))
  else
    meta='{"regulation_id":"gpai_code_of_practice","regulation_name_de":"GPAI Code of Practice — Verhaltenskodex fuer KI-Modelle","regulation_name_en":"General-Purpose AI Code of Practice","regulation_short":"GPAI CoP","category":"ai_regulation","license":"eu_public","source":"ec.europa.eu"}'
    upload_file "$pdf_dir/gpai_code_of_practice.pdf" "$target_collection" "compliance_ce" "ai_regulation" "2025" \
      "$meta" \
      "GPAI Code of Practice"
  fi

  if [[ ! -f "$pdf_dir/gpai_scope_guidelines.pdf" ]]; then
    warn "GPAI Scope Guidelines PDF nicht vorhanden — uebersprungen"
    SKIPPED=$((SKIPPED + 1))
  else
    meta='{"regulation_id":"gpai_scope_guidelines","regulation_name_de":"GPAI Scope Guidelines — Leitlinien fuer KI-Anbieter","regulation_name_en":"Guidelines for Providers of General-Purpose AI Models","regulation_short":"GPAI Guidelines","category":"ai_regulation","license":"eu_public","source":"ec.europa.eu"}'
    upload_file "$pdf_dir/gpai_scope_guidelines.pdf" "$target_collection" "compliance_ce" "ai_regulation" "2025" \
      "$meta" \
      "GPAI Scope Guidelines"
  fi
}
|
||||
|
||||
# =============================================================================
|
||||
# MAIN
|
||||
# =============================================================================
|
||||
#######################################
# Run all Phase I steps in order (downloads, NIST, open standards,
# guidelines) and print a summary of the upload counters.
#
# After the summary, a hint lists the web-only specifications whose PDF is
# still missing from $WORK_DIR/pdfs. The hint is driven by the files that are
# actually absent rather than by the SKIPPED counter, because that counter
# also includes documents skipped by the dedup check; a re-run with
# everything already ingested would otherwise ask for PDFs that are present.
#
# Globals:   WORK_DIR, RAG_URL, UPLOADED, SKIPPED, FAILED (read)
# Arguments: none used
# Outputs:   progress and summary via log
# Returns:   0
#######################################
main() {
  local rule="=========================================="
  log "$rule"
  log "PHASE I: RAG Ingestion — Technische Standards + Guidelines"
  log "$rule"
  log "Work dir: $WORK_DIR"
  log "RAG URL: $RAG_URL"
  log ""
  log "Dokumente: 7 direkte PDFs + 5 web-basierte (manuell als PDF)"
  log ""

  phase_i_download
  phase_i_nist
  phase_i_standards
  phase_i_guidelines

  log "$rule"
  log "PHASE I ABGESCHLOSSEN"
  log " Hochgeladen: $UPLOADED"
  log " Uebersprungen: $SKIPPED"
  log " Fehlgeschlagen: $FAILED"
  log "$rule"

  # Each entry is "<expected file>|<source URL>". The hint header is printed
  # lazily on the first missing file so nothing is shown when all exist.
  local entry pdf_name source_url
  local missing=0
  for entry in \
    "slsa_v1_0.pdf|https://slsa.dev/spec/draft/" \
    "cyclonedx_spec.pdf|https://cyclonedx.org/specification/overview/" \
    "opentelemetry_spec.pdf|https://opentelemetry.io/docs/specs/otel/" \
    "gpai_code_of_practice.pdf|https://digital-strategy.ec.europa.eu/en/policies/contents-code-gpai" \
    "gpai_scope_guidelines.pdf|https://digital-strategy.ec.europa.eu/en/policies/guidelines-gpai-providers"; do
    IFS='|' read -r pdf_name source_url <<<"$entry"
    if [[ -f "$WORK_DIR/pdfs/$pdf_name" ]]; then
      continue
    fi
    if [[ "$missing" -eq 0 ]]; then
      log ""
      log "HINWEIS: Web-basierte Specs muessen manuell als PDF gedruckt werden."
      log "Lege die PDFs nach: $WORK_DIR/pdfs/"
    fi
    missing=$((missing + 1))
    log " - ${pdf_name} (${source_url})"
  done

  if [[ "$missing" -gt 0 ]]; then
    log "Dann Script erneut ausfuehren — bereits hochgeladene werden per Dedup uebersprungen."
  fi
}
|
||||
|
||||
# Script entry point: forward all command-line arguments to main.
main "$@"
|
||||
Reference in New Issue
Block a user