#!/usr/bin/env bash
# =============================================================================
# BreakPilot Compliance — CE/Safety RAG Corpus Ingestion
#
# Downloads 15 freely available CE/safety-relevant documents and ingests them
# into Qdrant via the core RAG API (port 8097).
#
# Collections:
#   bp_compliance_ce          — machinery law, safety frameworks, OT security
#   bp_compliance_datenschutz — AI / data-protection guidance (ENISA, OECD)
#
# Run on the Mac Mini:
#   bash ~/Projekte/breakpilot-compliance/scripts/ingest-ce-corpus.sh
#   bash .../ingest-ce-corpus.sh [--skip-download] [--only PHASE]
#
# Phases: download, ce, datenschutz, verify, version
#
# Environment overrides: WORK_DIR, DB_URL
# =============================================================================

set -euo pipefail

# --- Configuration -----------------------------------------------------------
WORK_DIR="${WORK_DIR:-$HOME/rag-ingestion-ce}"
RAG_URL="https://localhost:8097/api/v1/documents/upload"
SEARCH_URL="https://localhost:8097/api/v1/search"
QDRANT_URL="http://localhost:6333"
# Kept as an array so the options never depend on word splitting.
CURL_OPTS=(-sk --connect-timeout 15 --max-time 600 --retry 3 --retry-delay 5)
# NOTE(review): "search_path" is not a documented libpq URI parameter; psql may
# reject this default URL (the error would be hidden by the 2>/dev/null below).
# Confirm, and consider "?options=-csearch_path%3Dcompliance,core,public".
DB_URL="${DB_URL:-postgresql://localhost:5432/breakpilot?search_path=compliance,core,public}"

# Files smaller than this many bytes are treated as failed/empty downloads.
readonly MIN_FILE_BYTES=1000

# Counters (updated by upload_file, reported by main)
UPLOADED=0
FAILED=0
SKIPPED=0

# --- CLI Args ----------------------------------------------------------------
SKIP_DOWNLOAD=false
ONLY_PHASE=""

#######################################
# Parse command-line options into SKIP_DOWNLOAD / ONLY_PHASE.
# Globals:   SKIP_DOWNLOAD (written), ONLY_PHASE (written)
# Arguments: the script's positional parameters
# Returns:   exits 0 after printing usage for -h/--help, exits 1 on bad input
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --skip-download)
        SKIP_DOWNLOAD=true
        shift
        ;;
      --only)
        # Without this check "set -u" would abort with an unbound "$2" error.
        if [[ $# -lt 2 ]]; then
          echo "Option --only requires a PHASE argument" >&2
          exit 1
        fi
        ONLY_PHASE="$2"
        shift 2
        ;;
      -h | --help)
        echo "Usage: $0 [--skip-download] [--only PHASE]"
        echo "Phases: download, ce, datenschutz, verify, version"
        exit 0
        ;;
      *)
        echo "Unknown option: $1" >&2
        exit 1
        ;;
    esac
  done
}

# --- Helpers -----------------------------------------------------------------
log()  { echo "[$(date '+%H:%M:%S')] $*"; }
ok()   { echo "[$(date '+%H:%M:%S')] ✓ $*"; }
warn() { echo "[$(date '+%H:%M:%S')] ⚠ $*" >&2; }
fail() { echo "[$(date '+%H:%M:%S')] ✗ $*" >&2; }

#######################################
# Print the size of a file in bytes (0 if it cannot be determined).
# Tries the BSD/macOS stat syntax first, then the GNU syntax.
# Arguments: $1 - path
# Outputs:   byte count on stdout
#######################################
file_size() {
  stat -f%z "$1" 2>/dev/null || stat -c%s "$1" 2>/dev/null || echo 0
}

#######################################
# Convert HTML on stdin to plain text on stdout.
# Drops the contents of <script>/<style> and emits a newline after
# block-level closing tags.
#######################################
html_to_text() {
  python3 -c '
import sys
from html.parser import HTMLParser

BLOCK_TAGS = ("td", "th", "tr", "p", "div", "h1", "h2", "h3", "li")


class Extractor(HTMLParser):
    def __init__(self):
        super().__init__()
        self.parts = []
        self.skip = False

    def handle_starttag(self, tag, attrs):
        if tag in ("script", "style"):
            self.skip = True

    def handle_endtag(self, tag):
        if tag in ("script", "style"):
            self.skip = False
        if tag in BLOCK_TAGS:
            self.parts.append("\n")

    def handle_data(self, data):
        if not self.skip:
            self.parts.append(data)


parser = Extractor()
parser.feed(sys.stdin.read())
print("".join(parser.parts))
'
}

#######################################
# Upload one document to the RAG API and update the global counters.
# Globals:   RAG_URL, CURL_OPTS (read); UPLOADED, FAILED, SKIPPED (written)
# Arguments: $1 - file path
#            $2 - target collection
#            $3 - data_type form field
#            $4 - use_case form field
#            $5 - year form field
#            $6 - metadata as a JSON string
#            $7 - display label (optional, defaults to the file's basename)
# Returns:   0 on success; 1 if the file is missing, too small, or the
#            response contains neither "chunks_count" nor "vectors_indexed".
#            Callers running under "set -e" must guard the call (|| true).
#######################################
upload_file() {
  local file="$1"
  local collection="$2"
  local data_type="$3"
  local use_case="$4"
  local year="$5"
  local metadata_json="$6"
  local label="${7:-$(basename "$file")}"

  if [[ ! -f "$file" ]]; then
    warn "File not found: $file"
    FAILED=$((FAILED + 1))
    return 1
  fi

  local filesize
  filesize=$(file_size "$file")
  if [[ "$filesize" -lt "$MIN_FILE_BYTES" ]]; then
    warn "File too small (${filesize}B), skipping: $label"
    SKIPPED=$((SKIPPED + 1))
    return 1
  fi

  log "Uploading: $label → $collection ($((filesize / 1024))KB)"

  # A curl failure is tolerated here; it shows up as an unusable response.
  local response
  response=$(curl "${CURL_OPTS[@]}" -X POST "$RAG_URL" \
    -F "file=@${file}" \
    -F "collection=${collection}" \
    -F "data_type=${data_type}" \
    -F "use_case=${use_case}" \
    -F "year=${year}" \
    -F "chunk_strategy=recursive" \
    -F "chunk_size=512" \
    -F "chunk_overlap=50" \
    -F "metadata_json=${metadata_json}" \
    2>/dev/null) || true

  # Pattern matches instead of "echo | grep -q": under pipefail the latter can
  # report a false failure when grep exits before echo has finished writing.
  if [[ "$response" == *'"chunks_count"'* ]]; then
    local chunks
    chunks=$(printf '%s' "$response" \
      | python3 -c "import sys,json; print(json.load(sys.stdin).get('chunks_count',0))" \
        2>/dev/null || echo "?")
    ok "$label → $chunks chunks"
    UPLOADED=$((UPLOADED + 1))
  elif [[ "$response" == *'"vectors_indexed"'* ]]; then
    local vectors
    vectors=$(printf '%s' "$response" \
      | python3 -c "import sys,json; print(json.load(sys.stdin).get('vectors_indexed',0))" \
        2>/dev/null || echo "?")
    ok "$label → $vectors vectors"
    UPLOADED=$((UPLOADED + 1))
  else
    fail "Upload failed: $label"
    fail "Response: $response"
    FAILED=$((FAILED + 1))
    return 1
  fi
}

#######################################
# Download a file unless a plausible copy (> MIN_FILE_BYTES) already exists.
# The download goes to "<target>.part" and is moved into place only on
# success, so an interrupted transfer is never mistaken for a finished one.
# Globals:   CURL_OPTS, MIN_FILE_BYTES (read)
# Arguments: $1 - source URL
#            $2 - target path
# Returns:   0 if the file exists or was downloaded, 1 on download failure.
#            Callers running under "set -e" must guard the call (|| true).
#######################################
download_pdf() {
  local url="$1"
  local target="$2"
  local name
  name=$(basename "$target")

  if [[ -f "$target" ]]; then
    if [[ "$(file_size "$target")" -gt "$MIN_FILE_BYTES" ]]; then
      log "PDF exists: $name (skipping download)"
      return 0
    fi
    rm -f -- "$target"
  fi

  log "Downloading: $name from $url"
  local partial="${target}.part"
  # --fail: treat HTTP errors as failures instead of saving the error page.
  if ! curl "${CURL_OPTS[@]}" --fail -L "$url" -o "$partial" 2>/dev/null; then
    rm -f -- "$partial"
    warn "Download failed: $url"
    return 1
  fi
  mv -f -- "$partial" "$target"
}

#######################################
# Download an HTML page and store its plain-text rendering.
# Skips the download when a plausible copy (> MIN_FILE_BYTES) already exists;
# a failed attempt leaves no file behind and only emits a warning.
# Globals:   CURL_OPTS, MIN_FILE_BYTES (read)
# Arguments: $1 - source URL
#            $2 - target text file
#            $3 - label used in log messages
# Returns:   always 0
#######################################
download_html_as_text() {
  local url="$1"
  local target="$2"
  local label="$3"

  if [[ -f "$target" && "$(file_size "$target")" -gt "$MIN_FILE_BYTES" ]]; then
    return 0
  fi

  log "Downloading ${label} (text)"
  local partial="${target}.part"
  if curl "${CURL_OPTS[@]}" --fail -L "$url" 2>/dev/null \
    | html_to_text >"$partial" 2>/dev/null; then
    mv -f -- "$partial" "$target"
  else
    rm -f -- "$partial"
    warn "${label} download failed"
  fi
}

#######################################
# Print the points_count Qdrant reports for a collection, or "?" on error.
# Globals:   QDRANT_URL (read)
# Arguments: $1 - collection name
#######################################
collection_count() {
  local col="$1"
  curl -s "${QDRANT_URL}/collections/${col}" 2>/dev/null \
    | python3 -c "import sys,json; print(json.load(sys.stdin)['result']['points_count'])" \
      2>/dev/null || echo "?"
}

#######################################
# Run one test search against the RAG API and print up to three hits.
# Globals:   SEARCH_URL, CURL_OPTS (read)
# Arguments: $1 - collection name
#            $2 - query text (must not contain characters needing JSON
#                 escaping; it is interpolated into the request body)
#######################################
run_test_search() {
  local collection="$1"
  local query="$2"
  local payload
  payload=$(printf '{"query":"%s","collection":"%s","limit":3,"min_score":0.4}' \
    "$query" "$collection")

  log "Suche: '${query}' in ${collection}"
  curl "${CURL_OPTS[@]}" -X POST "$SEARCH_URL" \
    -H 'Content-Type: application/json' \
    -d "$payload" 2>/dev/null \
    | python3 -c '
import json
import sys

try:
    results = json.load(sys.stdin).get("results", [])
    print(" Treffer: %d" % len(results))
    for hit in results[:3]:
        print(" [%.3f] %s..." % (hit.get("score", 0), hit.get("content", "")[:100]))
except Exception as exc:
    print(" (parse error: %s)" % exc)
' 2>/dev/null || echo " (search failed)"
}

# =============================================================================
# PHASE A: Downloads
# =============================================================================
phase_download() {
  log "=========================================="
  log "PHASE A: Downloads (15 CE/Safety-Dokumente)"
  log "=========================================="

  mkdir -p "$WORK_DIR/pdfs"

  # Every download is guarded with "|| true": download_pdf already warns on
  # failure, and one unavailable source must not abort the run under "set -e".

  # --- EU legal texts (EUR-Lex, public law) ---

  # 1. Machinery Regulation (EU) 2023/1230
  download_pdf \
    "https://eur-lex.europa.eu/legal-content/EN/TXT/PDF/?uri=CELEX:32023R1230" \
    "$WORK_DIR/pdfs/machinery_regulation_2023_1230.pdf" || true

  # 2. Machinery Directive 2006/42/EC
  download_pdf \
    "https://eur-lex.europa.eu/legal-content/EN/TXT/PDF/?uri=CELEX:32006L0042" \
    "$WORK_DIR/pdfs/machinery_directive_2006_42.pdf" || true

  # 3. Low Voltage Directive 2014/35/EU
  download_pdf \
    "https://eur-lex.europa.eu/legal-content/EN/TXT/PDF/?uri=CELEX:32014L0035" \
    "$WORK_DIR/pdfs/lvd_2014_35.pdf" || true

  # 4. EMC Directive 2014/30/EU
  download_pdf \
    "https://eur-lex.europa.eu/legal-content/EN/TXT/PDF/?uri=CELEX:32014L0030" \
    "$WORK_DIR/pdfs/emc_directive_2014_30.pdf" || true

  # 5. Radio Equipment Directive 2014/53/EU
  download_pdf \
    "https://eur-lex.europa.eu/legal-content/EN/TXT/PDF/?uri=CELEX:32014L0053" \
    "$WORK_DIR/pdfs/red_directive_2014_53.pdf" || true

  # 6. AI Act (EU) 2024/1689
  download_pdf \
    "https://eur-lex.europa.eu/legal-content/EN/TXT/PDF/?uri=CELEX:32024R1689" \
    "$WORK_DIR/pdfs/ai_act_2024_1689.pdf" || true

  # 7. Guide to the Machinery Directive (European Commission, public)
  download_pdf \
    "https://single-market-economy.ec.europa.eu/system/files/2021-10/machinery-guide-2010_en.pdf" \
    "$WORK_DIR/pdfs/machinery_directive_guide.pdf" || true

  # --- NIST publications (US Gov, public domain) ---

  # 8. NIST SP 800-218 (SSDF)
  download_pdf \
    "https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-218.pdf" \
    "$WORK_DIR/pdfs/nist_sp800_218_ssdf.pdf" || true

  # 9. NIST AI RMF 1.0
  download_pdf \
    "https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.100-1.pdf" \
    "$WORK_DIR/pdfs/nist_ai_rmf_100_1.pdf" || true

  # --- ENISA (European Union Agency for Cybersecurity, public) ---

  # 10. ENISA Guidelines for Securing the Internet of Things
  download_pdf \
    "https://www.enisa.europa.eu/publications/guidelines-for-securing-the-internet-of-things/@@download/fullReport" \
    "$WORK_DIR/pdfs/enisa_iot_security_guidelines.pdf" || true

  # 11. ENISA Securing Machine Learning Algorithms
  download_pdf \
    "https://www.enisa.europa.eu/publications/securing-machine-learning-algorithms/@@download/fullReport" \
    "$WORK_DIR/pdfs/enisa_securing_ml_algorithms.pdf" || true

  # --- NASA (US Gov, public domain) ---

  # 12. NASA Software Safety Guidebook
  download_pdf \
    "https://swehb.nasa.gov/download/attachments/17957036/NASA-GB-8719.13.pdf" \
    "$WORK_DIR/pdfs/nasa_software_safety_guidebook.pdf" || true

  # --- OWASP (CC BY-SA 4.0) ---

  # 13. OWASP Top 10 2021 (PDF)
  download_pdf \
    "https://owasp.org/Top10/assets/OWASP-Top-10-2021-en.pdf" \
    "$WORK_DIR/pdfs/owasp_top10_2021.pdf" || true

  # --- OECD (publicly accessible) ---

  # 14. OECD AI Principles (HTML page, stored as plain text)
  download_html_as_text \
    "https://www.oecd.org/digital/artificial-intelligence/ai-principles/" \
    "$WORK_DIR/pdfs/oecd_ai_principles.txt" \
    "OECD AI Principles"

  # --- MITRE CWE (MIT License) ---

  # 15. MITRE CWE Top 25 Most Dangerous Software Weaknesses (2023)
  download_html_as_text \
    "https://cwe.mitre.org/top25/archive/2023/2023_top25_list.html" \
    "$WORK_DIR/pdfs/mitre_cwe_top25_2023.txt" \
    "MITRE CWE Top 25"

  log "Download phase complete."
}

# =============================================================================
# PHASE B: CE documents → bp_compliance_ce
# =============================================================================
phase_ce() {
  log "=========================================="
  log "PHASE B: CE/Safety-Dokumente → bp_compliance_ce"
  log "=========================================="

  local col="bp_compliance_ce"
  local before
  before=$(collection_count "$col")
  log "Collection $col: $before chunks (before)"

  # Each upload is guarded with "|| true": upload_file records the outcome in
  # FAILED/SKIPPED itself, and a single bad document must not abort the run
  # under "set -e" before the summary in main is printed.

  # 1. Machinery Regulation 2023/1230
  upload_file "$WORK_DIR/pdfs/machinery_regulation_2023_1230.pdf" "$col" \
    "compliance_ce" "legal_reference" "2023" \
    '{"regulation_id":"EU-2023-1230","regulation_name_en":"Machinery Regulation","category":"ce_machinery","license":"eu_public","source_org":"EUR-Lex","celex":"32023R1230"}' \
    "Machinery Regulation (EU) 2023/1230" || true

  # 2. Machinery Directive 2006/42/EC
  upload_file "$WORK_DIR/pdfs/machinery_directive_2006_42.pdf" "$col" \
    "compliance_ce" "legal_reference" "2006" \
    '{"regulation_id":"EU-2006-42","regulation_name_en":"Machinery Directive","category":"ce_machinery","license":"eu_public","source_org":"EUR-Lex","celex":"32006L0042"}' \
    "Machinery Directive 2006/42/EC" || true

  # 3. Low Voltage Directive 2014/35/EU
  upload_file "$WORK_DIR/pdfs/lvd_2014_35.pdf" "$col" \
    "compliance_ce" "legal_reference" "2014" \
    '{"regulation_id":"EU-2014-35","regulation_name_en":"Low Voltage Directive","category":"ce_electrical","license":"eu_public","source_org":"EUR-Lex","celex":"32014L0035"}' \
    "Low Voltage Directive 2014/35/EU" || true

  # 4. EMC Directive 2014/30/EU
  upload_file "$WORK_DIR/pdfs/emc_directive_2014_30.pdf" "$col" \
    "compliance_ce" "legal_reference" "2014" \
    '{"regulation_id":"EU-2014-30","regulation_name_en":"EMC Directive","category":"ce_emc","license":"eu_public","source_org":"EUR-Lex","celex":"32014L0030"}' \
    "EMC Directive 2014/30/EU" || true

  # 5. Radio Equipment Directive 2014/53/EU
  upload_file "$WORK_DIR/pdfs/red_directive_2014_53.pdf" "$col" \
    "compliance_ce" "legal_reference" "2014" \
    '{"regulation_id":"EU-2014-53","regulation_name_en":"Radio Equipment Directive","category":"ce_radio","license":"eu_public","source_org":"EUR-Lex","celex":"32014L0053"}' \
    "Radio Equipment Directive 2014/53/EU" || true

  # 6. AI Act 2024/1689
  upload_file "$WORK_DIR/pdfs/ai_act_2024_1689.pdf" "$col" \
    "compliance_ce" "legal_reference" "2024" \
    '{"regulation_id":"EU-2024-1689","regulation_name_en":"AI Act","category":"ce_ai","license":"eu_public","source_org":"EUR-Lex","celex":"32024R1689"}' \
    "AI Act (EU) 2024/1689" || true

  # 7. Guide to the Machinery Directive
  upload_file "$WORK_DIR/pdfs/machinery_directive_guide.pdf" "$col" \
    "compliance_ce" "guidance" "2021" \
    '{"regulation_id":"EC-machinery-guide","regulation_name_en":"Guide to the Machinery Directive","category":"ce_machinery_guidance","license":"eu_public","source_org":"European Commission"}' \
    "EC Guide to the Machinery Directive" || true

  # 8. NIST SP 800-218 (SSDF)
  upload_file "$WORK_DIR/pdfs/nist_sp800_218_ssdf.pdf" "$col" \
    "compliance_ce" "guidance" "2022" \
    '{"regulation_id":"NIST-SP-800-218","regulation_name_en":"Secure Software Development Framework","category":"ce_software_safety","license":"us_gov_public","source_org":"NIST"}' \
    "NIST SP 800-218 (SSDF)" || true

  # 9. NIST AI RMF 1.0
  upload_file "$WORK_DIR/pdfs/nist_ai_rmf_100_1.pdf" "$col" \
    "compliance_ce" "guidance" "2023" \
    '{"regulation_id":"NIST-AI-100-1","regulation_name_en":"AI Risk Management Framework","category":"ce_ai_safety","license":"us_gov_public","source_org":"NIST"}' \
    "NIST AI RMF 1.0 (NIST.AI.100-1)" || true

  # 10. ENISA IoT Security Guidelines
  upload_file "$WORK_DIR/pdfs/enisa_iot_security_guidelines.pdf" "$col" \
    "compliance_ce" "guidance" "2019" \
    '{"regulation_id":"ENISA-IoT-Security","regulation_name_en":"Guidelines for Securing the IoT","category":"ce_ot_cybersecurity","license":"eu_public","source_org":"ENISA"}' \
    "ENISA Guidelines for Securing the IoT" || true

  # 12. NASA Software Safety Guidebook
  upload_file "$WORK_DIR/pdfs/nasa_software_safety_guidebook.pdf" "$col" \
    "compliance_ce" "guidance" "2004" \
    '{"regulation_id":"NASA-GB-8719.13","regulation_name_en":"NASA Software Safety Guidebook","category":"ce_software_safety","license":"us_gov_public","source_org":"NASA"}' \
    "NASA Software Safety Guidebook (NASA-GB-8719.13)" || true

  # 13. OWASP Top 10 2021
  upload_file "$WORK_DIR/pdfs/owasp_top10_2021.pdf" "$col" \
    "compliance_ce" "guidance" "2021" \
    '{"regulation_id":"OWASP-Top10-2021","regulation_name_en":"OWASP Top 10 2021","category":"ce_software_security","license":"cc_by_sa_4","source_org":"OWASP"}' \
    "OWASP Top 10 (2021)" || true

  # 15. MITRE CWE Top 25
  upload_file "$WORK_DIR/pdfs/mitre_cwe_top25_2023.txt" "$col" \
    "compliance_ce" "guidance" "2023" \
    '{"regulation_id":"MITRE-CWE-Top25-2023","regulation_name_en":"MITRE CWE Top 25 Most Dangerous Software Weaknesses","category":"ce_software_weaknesses","license":"mit","source_org":"MITRE"}' \
    "MITRE CWE Top 25 (2023)" || true

  local after
  after=$(collection_count "$col")
  log "Collection $col: $before → $after chunks"
}

# =============================================================================
# PHASE C: AI / data-protection documents → bp_compliance_datenschutz
# =============================================================================
phase_datenschutz() {
  log "=========================================="
  log "PHASE C: AI/Datenschutz → bp_compliance_datenschutz"
  log "=========================================="

  local col="bp_compliance_datenschutz"
  local before
  before=$(collection_count "$col")
  log "Collection $col: $before chunks (before)"

  # Guarded with "|| true" for the same reason as in phase_ce.

  # 11. ENISA Securing ML Algorithms
  upload_file "$WORK_DIR/pdfs/enisa_securing_ml_algorithms.pdf" "$col" \
    "compliance_datenschutz" "guidance" "2021" \
    '{"regulation_id":"ENISA-Securing-ML","regulation_name_en":"Securing Machine Learning Algorithms","category":"ai_cybersecurity","license":"eu_public","source_org":"ENISA"}' \
    "ENISA Securing Machine Learning Algorithms" || true

  # 14. OECD AI Principles
  upload_file "$WORK_DIR/pdfs/oecd_ai_principles.txt" "$col" \
    "compliance_datenschutz" "guidance" "2019" \
    '{"regulation_id":"OECD-AI-Principles","regulation_name_en":"OECD Principles on Artificial Intelligence","category":"ai_governance","license":"oecd_public","source_org":"OECD"}' \
    "OECD AI Principles (2019)" || true

  local after
  after=$(collection_count "$col")
  log "Collection $col: $before → $after chunks"
}

# =============================================================================
# PHASE D: Verification
# =============================================================================
phase_verify() {
  log "=========================================="
  log "PHASE D: Verifizierung"
  log "=========================================="

  echo ""
  echo "=== Collection Stats ==="
  local col count
  for col in bp_compliance_ce bp_compliance_datenschutz; do
    count=$(collection_count "$col")
    printf " %-35s %s chunks\n" "$col" "$count"
  done

  echo ""
  echo "=== Test-Suchen ==="

  run_test_search "bp_compliance_ce" \
    "Machinery Regulation software safety requirements"
  run_test_search "bp_compliance_ce" \
    "NIST secure software development practices"
  run_test_search "bp_compliance_datenschutz" \
    "AI risk governance principles transparency accountability"

  echo ""
}

# =============================================================================
# PHASE E: Corpus Version Registration
# =============================================================================
phase_register_version() {
  log "=========================================="
  log "PHASE E: Corpus Version Registration"
  log "=========================================="

  local today
  today=$(date '+%Y-%m-%d')

  local col count existing_count seq version regs digest
  for col in bp_compliance_ce bp_compliance_datenschutz; do
    count=$(collection_count "$col")

    if [[ "$count" == "?" || "$count" == "0" ]]; then
      warn "Skipping version for $col (count=$count)"
      continue
    fi

    existing_count=$(psql "$DB_URL" -tAc \
      "SELECT COUNT(*) FROM compliance_corpus_versions WHERE collection_name='$col' AND version LIKE '${today}.%'" \
      2>/dev/null || echo "0")
    # Guard the arithmetic below against empty or non-numeric psql output.
    [[ "$existing_count" =~ ^[0-9]+$ ]] || existing_count=0
    seq=$((existing_count + 1))
    version="${today}.${seq}"

    # These IDs mirror the regulation_id values sent by phase_ce and
    # phase_datenschutz.
    regs=""
    case "$col" in
      bp_compliance_ce)
        regs='{EU-2023-1230,EU-2006-42,EU-2014-35,EU-2014-30,EU-2014-53,EU-2024-1689,EC-machinery-guide,NIST-SP-800-218,NIST-AI-100-1,ENISA-IoT-Security,NASA-GB-8719.13,OWASP-Top10-2021,MITRE-CWE-Top25-2023}'
        ;;
      bp_compliance_datenschutz)
        regs='{ENISA-Securing-ML,OECD-AI-Principles}'
        ;;
    esac

    # Fingerprint of the collection info Qdrant currently reports.
    digest=$(curl -s "${QDRANT_URL}/collections/${col}" 2>/dev/null \
      | python3 -c "import sys,json,hashlib; d=json.load(sys.stdin); print(hashlib.sha256(json.dumps(d.get('result',{}), sort_keys=True).encode()).hexdigest()[:32])" \
        2>/dev/null || echo "")

    log "Registering version $version for $col ($count chunks)"

    # NOTE(review): documents_count uses the global UPLOADED counter, i.e. the
    # total across both collections for this run (0 with "--only version").
    # Confirm whether a per-collection count is wanted.
    if psql "$DB_URL" -c "
      INSERT INTO compliance_corpus_versions
        (version, collection_name, documents_count, chunks_count, regulations, digest, ingestion_source, created_by)
      VALUES
        ('${version}', '${col}', ${UPLOADED}, ${count}, '${regs}', '${digest}', 'ingest-ce-corpus.sh', 'system')
      ON CONFLICT DO NOTHING
    " 2>/dev/null; then
      ok "Version $version registered for $col"
    else
      warn "Version registration skipped for $col (DB not available?)"
    fi
  done
}

# =============================================================================
# MAIN
# =============================================================================
main() {
  parse_args "$@"

  log "=========================================="
  log "BreakPilot CE/Safety Corpus Ingestion"
  log "=========================================="
  log "Work dir: $WORK_DIR"
  log "RAG API: $RAG_URL"
  log "Qdrant: $QDRANT_URL"
  echo ""

  # Check RAG API: an empty POST is expected to produce an error body that
  # mentions "file" or "detail". The body is captured first so that the check
  # cannot be affected by pipefail.
  local probe
  probe=$(curl "${CURL_OPTS[@]}" "$RAG_URL" -X POST 2>/dev/null) || true
  if [[ "$probe" == *file* || "$probe" == *detail* ]]; then
    ok "RAG API reachable"
  else
    warn "RAG API may not be reachable at $RAG_URL — continuing anyway"
  fi

  # Check Qdrant
  if ! curl -s "$QDRANT_URL/collections" >/dev/null 2>&1; then
    fail "Qdrant not reachable at $QDRANT_URL"
    exit 1
  fi
  ok "Qdrant reachable"
  echo ""

  if [[ -n "$ONLY_PHASE" ]]; then
    case "$ONLY_PHASE" in
      download) phase_download ;;
      ce) phase_ce ;;
      datenschutz) phase_datenschutz ;;
      verify) phase_verify ;;
      version) phase_register_version ;;
      *)
        fail "Unknown phase: $ONLY_PHASE"
        exit 1
        ;;
    esac
  else
    if [[ "$SKIP_DOWNLOAD" != "true" ]]; then
      phase_download
    else
      log "Skipping download phase (--skip-download)"
    fi
    echo ""
    phase_ce
    echo ""
    phase_datenschutz
    echo ""
    phase_verify
    echo ""
    phase_register_version
  fi

  echo ""
  log "=========================================="
  log "ERGEBNIS"
  log "=========================================="
  log "Uploaded: $UPLOADED"
  log "Failed: $FAILED"
  log "Skipped: $SKIPPED"
  log "=========================================="

  if [[ $FAILED -gt 0 ]]; then
    warn "$FAILED uploads fehlgeschlagen!"
    exit 1
  fi

  ok "CE/Safety Corpus Ingestion abgeschlossen!"
}

main "$@"