#!/usr/bin/env bash
# =============================================================================
# BreakPilot Compliance — Industry Compliance Ingestion
#
# Downloads 10 freely available industry compliance documents and ingests
# them into Qdrant via the Core RAG API (port 8097).
#
# Documents:
#    1. EU Machinery Regulation 2023/1230   → bp_compliance_ce
#    2. EU Blue Guide 2022                  → bp_compliance_ce
#    3. ENISA Secure by Design              → bp_compliance_datenschutz
#    4. ENISA Supply Chain Security         → bp_compliance_datenschutz
#    5. NIST SP 800-218 (SSDF)              → bp_compliance_datenschutz
#    6. NIST Cybersecurity Framework 2.0    → bp_compliance_datenschutz
#    7. OECD AI Principles                  → bp_compliance_datenschutz
#    8. EU-IFRS Regulation 2023/1803 (DE)   → bp_compliance_ce
#    9. EU-IFRS Regulation 2023/1803 (EN)   → bp_compliance_ce
#   10. EFRAG Endorsement Status Report     → bp_compliance_datenschutz
#
# Run on the Mac Mini:
#   ~/rag-ingestion/ingest-industry-compliance.sh [--skip-download] [--only PHASE]
#
# Phases: download, ce, datenschutz, verify
#
# Environment:
#   WORK_DIR            working directory (default: $HOME/rag-ingestion)
#   DOWNLOAD_INSECURE   set to 1 to skip TLS certificate verification for the
#                       public document downloads (default: verify)
#
# Exit status: 0 when no upload failed, 1 otherwise (or on a usage error).
# =============================================================================

set -euo pipefail

# --- Configuration -----------------------------------------------------------
WORK_DIR="${WORK_DIR:-$HOME/rag-ingestion}"
RAG_URL="https://localhost:8097/api/v1/documents/upload"
SEARCH_URL="https://localhost:8097/api/v1/search"
QDRANT_URL="http://localhost:6333"

# Options for the local RAG API. -k is used because the API is addressed via
# https://localhost (presumably a self-signed certificate — confirm).
CURL_OPTS=(-sk --connect-timeout 10 --max-time 600)

# Options for downloads from the public internet: follow redirects, treat HTTP
# errors as failures (-f) and, unlike the local API, verify TLS certificates.
DOWNLOAD_OPTS=(-sfL --connect-timeout 10 --max-time 600)
if [[ "${DOWNLOAD_INSECURE:-0}" == "1" ]]; then
  DOWNLOAD_OPTS+=(-k)
fi
DOWNLOAD_USER_AGENT='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)'

# Counters (reported in the final summary)
UPLOADED=0
FAILED=0
SKIPPED=0

# CLI state (set by parse_args)
SKIP_DOWNLOAD=false
ONLY_PHASE=""

# --- Helpers -----------------------------------------------------------------

# Timestamped log helpers; warn/fail write to stderr.
log()  { echo "[$(date '+%H:%M:%S')] $*"; }
ok()   { echo "[$(date '+%H:%M:%S')] ✓ $*"; }
warn() { echo "[$(date '+%H:%M:%S')] ⚠ $*" >&2; }
fail() { echo "[$(date '+%H:%M:%S')] ✗ $*" >&2; }

#######################################
# Parse command-line arguments.
# Globals:   SKIP_DOWNLOAD, ONLY_PHASE (written)
# Arguments: the script's "$@"
# Returns:   exits 0 after printing help, exits 1 on a usage error
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --skip-download)
        SKIP_DOWNLOAD=true
        shift
        ;;
      --only)
        # Without this check "$2" trips `set -u` with an opaque error.
        if [[ $# -lt 2 ]]; then
          fail "--only requires a PHASE argument"
          exit 1
        fi
        ONLY_PHASE="$2"
        shift 2
        ;;
      -h|--help)
        echo "Usage: $0 [--skip-download] [--only PHASE]"
        echo "Phases: download, ce, datenschutz, verify"
        exit 0
        ;;
      *)
        echo "Unknown option: $1" >&2
        exit 1
        ;;
    esac
  done
}

#######################################
# Read a JSON object from stdin and print the value of one top-level key.
# Arguments: $1 - key name
# Outputs:   the value (0 if the key is absent), or "?" if parsing fails
#######################################
json_field() {
  python3 -c "import sys,json; print(json.load(sys.stdin).get(sys.argv[1],0))" "$1" 2>/dev/null \
    || echo "?"
}

#######################################
# Upload one file to the RAG API.
# Globals:   RAG_URL, CURL_OPTS (read); UPLOADED, FAILED, SKIPPED (written)
# Arguments: $1 - file path
#            $2 - target collection
#            $3 - data_type form field
#            $4 - use_case form field
#            $5 - year form field
#            $6 - metadata JSON string
#            $7 - display label (optional, defaults to the file's basename)
# Returns:   0 on success; 1 if the file is missing, smaller than 100 bytes,
#            or the response contains neither "chunks_count" nor
#            "vectors_indexed". Callers that want to continue after a failure
#            must guard the call (`|| true`) because of `set -e`.
#######################################
upload_file() {
  local file="$1"
  local collection="$2"
  local data_type="$3"
  local use_case="$4"
  local year="$5"
  local metadata_json="$6"
  local label="${7:-$(basename "$file")}"

  if [[ ! -f "$file" ]]; then
    warn "File not found: $file"
    FAILED=$((FAILED + 1))
    return 1
  fi

  # BSD stat (macOS) first, then GNU stat.
  local filesize
  filesize=$(stat -f%z "$file" 2>/dev/null || stat -c%s "$file" 2>/dev/null || echo 0)
  if [[ "$filesize" -lt 100 ]]; then
    warn "File too small (${filesize}B), skipping: $label"
    SKIPPED=$((SKIPPED + 1))
    return 1
  fi

  log "Uploading: $label → $collection ($(( filesize / 1024 ))KB)"

  # A curl failure is tolerated here; it shows up as an empty/unknown response
  # and is reported by the final branch below.
  # --form-string sends the metadata literally (plain -F would interpret a
  # leading '@'/'<' or a ';type=' suffix inside the value).
  local response
  response=$(curl "${CURL_OPTS[@]}" -X POST "$RAG_URL" \
    -F "file=@${file}" \
    -F "collection=${collection}" \
    -F "data_type=${data_type}" \
    -F "use_case=${use_case}" \
    -F "year=${year}" \
    -F "chunk_strategy=recursive" \
    -F "chunk_size=1024" \
    -F "chunk_overlap=128" \
    --form-string "metadata_json=${metadata_json}" \
    2>/dev/null) || true

  # Pattern match instead of `echo | grep -q`, which can fail spuriously under
  # pipefail when grep exits before echo has finished writing.
  if [[ "$response" == *'"chunks_count"'* ]]; then
    local chunks
    chunks=$(json_field chunks_count <<<"$response")
    ok "$label → $chunks chunks"
    UPLOADED=$((UPLOADED + 1))
  elif [[ "$response" == *'"vectors_indexed"'* ]]; then
    local vectors
    vectors=$(json_field vectors_indexed <<<"$response")
    ok "$label → $vectors vectors"
    UPLOADED=$((UPLOADED + 1))
  else
    fail "Upload failed: $label"
    fail "Response: $response"
    FAILED=$((FAILED + 1))
    return 1
  fi
}

#######################################
# Download a document unless the target file already exists.
# The download goes to "<target>.part" and is renamed only on success, so an
# interrupted or failed transfer is never mistaken for a finished file on the
# next run.
# Globals:   DOWNLOAD_OPTS, DOWNLOAD_USER_AGENT (read)
# Arguments: $1 - source URL
#            $2 - target path
# Returns:   0 if the file exists or was downloaded, non-zero otherwise
#######################################
download_pdf() {
  local url="$1"
  local target="$2"
  local partial="${target}.part"

  if [[ -f "$target" ]]; then
    log "PDF exists: $(basename "$target") (skipping)"
    return 0
  fi

  log "Downloading: $(basename "$target")"
  if ! curl "${DOWNLOAD_OPTS[@]}" -A "$DOWNLOAD_USER_AGENT" "$url" -o "$partial" 2>/dev/null; then
    rm -f -- "$partial"
    warn "Download failed: $url"
    return 1
  fi
  mv -f -- "$partial" "$target"
}

#######################################
# Print the points_count Qdrant reports for a collection.
# Globals:   QDRANT_URL (read)
# Arguments: $1 - collection name
# Outputs:   the count, or "?" if the request or parsing fails
#######################################
collection_count() {
  local col="$1"
  curl -s "${QDRANT_URL}/collections/${col}" 2>/dev/null \
    | python3 -c "import sys,json; print(json.load(sys.stdin)['result']['points_count'])" 2>/dev/null \
    || echo "?"
}

#######################################
# Run one test search against the RAG API and print up to three hits.
# Globals:   SEARCH_URL, CURL_OPTS (read)
# Arguments: $1 - query text (inserted verbatim into a JSON string, so it must
#                 not contain double quotes or backslashes)
#            $2 - collection name
# Outputs:   hit count and score/snippet lines; " (parse error)" and/or
#            " (search failed)" when the request or parsing fails
#######################################
search_test() {
  local query="$1"
  local collection="$2"
  local payload
  payload=$(printf '{"query":"%s","collection":"%s","top_k":3}' "$query" "$collection")

  log "Suche: '$query' in $collection"
  curl "${CURL_OPTS[@]}" -X POST "$SEARCH_URL" \
    -H 'Content-Type: application/json' \
    -d "$payload" 2>/dev/null \
    | python3 -c "
import sys,json
try:
    data = json.load(sys.stdin)
    results = data.get('results', [])
    print(f' Treffer: {len(results)}')
    for r in results[:3]:
        print(f' [{r.get(\"score\",0):.3f}] {r.get(\"content\",\"\")[:80]}...')
except Exception:
    print(' (parse error)')
" 2>/dev/null || echo " (search failed)"
}

# =============================================================================
# PHASE A: Downloads (10 PDFs)
# =============================================================================

# Fetch all source documents into $WORK_DIR/pdfs.
# Every download is guarded with `|| true`: a failed download must not abort
# the script under `set -e`; the missing file is reported by the upload phase.
phase_download() {
  log "=========================================="
  log "PHASE A: Downloads (10 Industry Compliance PDFs)"
  log "=========================================="

  mkdir -p "$WORK_DIR/pdfs"

  # --- A1: EUR-Lex ---
  log "--- EUR-Lex: Machinery Regulation ---"
  download_pdf \
    "https://eur-lex.europa.eu/legal-content/DE/TXT/PDF/?uri=CELEX:32023R1230" \
    "$WORK_DIR/pdfs/machinery_regulation_2023_1230.pdf" || true

  # --- A2: EU Blue Guide 2022 ---
  log "--- EU Blue Guide 2022 ---"
  download_pdf \
    "https://eur-lex.europa.eu/legal-content/DE/TXT/PDF/?uri=CELEX:52022XC0629(04)" \
    "$WORK_DIR/pdfs/blue_guide_2022.pdf" || true

  # --- A3: ENISA Publications ---
  log "--- ENISA Publications ---"
  download_pdf \
    "https://www.enisa.europa.eu/sites/default/files/publications/ENISA%20Report%20-%20Advancing%20Software%20Security%20in%20the%20EU.pdf" \
    "$WORK_DIR/pdfs/enisa_secure_by_design.pdf" || true

  download_pdf \
    "https://www.enisa.europa.eu/sites/default/files/publications/ENISA%20Threat%20Landscape%20for%20Supply%20Chain%20Attacks.pdf" \
    "$WORK_DIR/pdfs/enisa_supply_chain_security.pdf" || true

  # --- A4: NIST Publications ---
  log "--- NIST Publications ---"
  download_pdf \
    "https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-218.pdf" \
    "$WORK_DIR/pdfs/nist_sp_800_218_ssdf.pdf" || true

  download_pdf \
    "https://nvlpubs.nist.gov/nistpubs/CSWP/NIST.CSWP.29.pdf" \
    "$WORK_DIR/pdfs/nist_csf_2_0.pdf" || true

  # --- A5: OECD AI Principles ---
  log "--- OECD AI Principles ---"
  download_pdf \
    "https://legalinstruments.oecd.org/api/print?ids=648&lang=en" \
    "$WORK_DIR/pdfs/oecd_ai_principles.pdf" || true

  # --- A6: EUR-Lex IFRS (DE + EN) ---
  log "--- EUR-Lex: EU-IFRS Regulation 2023/1803 ---"
  download_pdf \
    "https://eur-lex.europa.eu/legal-content/DE/TXT/PDF/?uri=CELEX:32023R1803" \
    "$WORK_DIR/pdfs/ifrs_regulation_2023_1803_de.pdf" || true

  download_pdf \
    "https://eur-lex.europa.eu/legal-content/EN/TXT/PDF/?uri=CELEX:32023R1803" \
    "$WORK_DIR/pdfs/ifrs_regulation_2023_1803_en.pdf" || true

  # --- A7: EFRAG Endorsement Status Report ---
  log "--- EFRAG Endorsement Status Report ---"
  download_pdf \
    "https://www.efrag.org/sites/default/files/media/document/2025-12/EFRAG%20Endorsement%20Status%20Report%2018%20December%202025.pdf" \
    "$WORK_DIR/pdfs/efrag_endorsement_status_2025.pdf" || true

  log "Downloads complete."
}

# =============================================================================
# PHASE B: EU legal texts → bp_compliance_ce
# =============================================================================

# Upload the EU legal texts. Each upload is guarded with `|| true` so one
# failure does not abort the run; upload_file records it in FAILED/SKIPPED.
phase_ce() {
  log "=========================================="
  log "PHASE B: EU-Rechtstexte → bp_compliance_ce"
  log "=========================================="

  local col="bp_compliance_ce"
  local before
  before=$(collection_count "$col")
  log "Collection $col: $before chunks (before)"

  # 1. Machinery Regulation (EU) 2023/1230
  upload_file "$WORK_DIR/pdfs/machinery_regulation_2023_1230.pdf" "$col" "compliance_ce" "legal_reference" "2023" \
    '{"regulation_id":"eu_2023_1230","regulation_name_de":"Maschinenverordnung","regulation_name_en":"Machinery Regulation","regulation_short":"MACHINERY_REG","category":"maschinensicherheit","celex":"32023R1230","source":"eur-lex","license":"public_law"}' \
    "Maschinenverordnung (EU) 2023/1230" || true

  # 2. Blue Guide 2022
  upload_file "$WORK_DIR/pdfs/blue_guide_2022.pdf" "$col" "compliance_ce" "legal_reference" "2022" \
    '{"regulation_id":"eu_blue_guide_2022","regulation_name_de":"Leitfaden fuer die Umsetzung der Produktvorschriften (Blue Guide)","regulation_name_en":"Blue Guide on EU Product Rules","regulation_short":"BLUE_GUIDE","category":"produktregulierung","celex":"52022XC0629(04)","source":"eur-lex","license":"public_law"}' \
    "Blue Guide 2022 — EU-Produktvorschriften" || true

  # 8. EU-IFRS Regulation 2023/1803 (DE)
  upload_file "$WORK_DIR/pdfs/ifrs_regulation_2023_1803_de.pdf" "$col" "compliance_ce" "legal_reference" "2023" \
    '{"regulation_id":"eu_2023_1803","regulation_name_de":"IFRS-Uebernahmeverordnung","regulation_name_en":"IFRS Adoption Regulation","regulation_short":"EU_IFRS","category":"rechnungslegung","celex":"32023R1803","source":"eur-lex","license":"public_law","language":"de","endorsement_note":"Nur EU-endorsed IFRS. Neuere IASB-Standards sind moeglicherweise noch nicht uebernommen."}' \
    "EU-IFRS Regulation 2023/1803 (DE)" || true

  # 9. EU-IFRS Regulation 2023/1803 (EN)
  upload_file "$WORK_DIR/pdfs/ifrs_regulation_2023_1803_en.pdf" "$col" "compliance_ce" "legal_reference" "2023" \
    '{"regulation_id":"eu_2023_1803","regulation_name_de":"IFRS-Uebernahmeverordnung","regulation_name_en":"IFRS Adoption Regulation","regulation_short":"EU_IFRS","category":"rechnungslegung","celex":"32023R1803","source":"eur-lex","license":"public_law","language":"en","endorsement_note":"EU-endorsed IFRS only. Newer IASB standards may not yet be adopted by the EU."}' \
    "EU-IFRS Regulation 2023/1803 (EN)" || true

  local after
  after=$(collection_count "$col")
  log "Collection $col: $before → $after chunks"
}

# =============================================================================
# PHASE C: Frameworks/Guidance → bp_compliance_datenschutz
# =============================================================================

# Upload the framework/guidance documents. Each upload is guarded with
# `|| true` so one failure does not abort the run under `set -e`.
phase_datenschutz() {
  log "=========================================="
  log "PHASE C: Frameworks/Guidance → bp_compliance_datenschutz"
  log "=========================================="

  local col="bp_compliance_datenschutz"
  local before
  before=$(collection_count "$col")
  log "Collection $col: $before chunks (before)"

  # 3. ENISA Secure by Design
  upload_file "$WORK_DIR/pdfs/enisa_secure_by_design.pdf" "$col" "compliance_datenschutz" "guidance" "2023" \
    '{"source_id":"enisa","doc_type":"guidance","guideline_name":"Advancing Software Security in the EU","license":"reuse_notice","attribution":"European Union Agency for Cybersecurity (ENISA)","source":"enisa.europa.eu"}' \
    "ENISA: Advancing Software Security in the EU" || true

  # 4. ENISA Supply Chain Security
  upload_file "$WORK_DIR/pdfs/enisa_supply_chain_security.pdf" "$col" "compliance_datenschutz" "guidance" "2021" \
    '{"source_id":"enisa","doc_type":"guidance","guideline_name":"Threat Landscape for Supply Chain Attacks","license":"reuse_notice","attribution":"European Union Agency for Cybersecurity (ENISA)","source":"enisa.europa.eu"}' \
    "ENISA: Supply Chain Security Threat Landscape" || true

  # 5. NIST SP 800-218 (SSDF)
  upload_file "$WORK_DIR/pdfs/nist_sp_800_218_ssdf.pdf" "$col" "compliance_datenschutz" "guidance" "2022" \
    '{"source_id":"nist","doc_type":"framework","guideline_name":"Secure Software Development Framework (SSDF) SP 800-218","license":"public_domain","attribution":"National Institute of Standards and Technology (NIST)","source":"nist.gov"}' \
    "NIST SP 800-218 — Secure Software Development Framework" || true

  # 6. NIST Cybersecurity Framework 2.0
  upload_file "$WORK_DIR/pdfs/nist_csf_2_0.pdf" "$col" "compliance_datenschutz" "guidance" "2024" \
    '{"source_id":"nist","doc_type":"framework","guideline_name":"NIST Cybersecurity Framework (CSF) 2.0","license":"public_domain","attribution":"National Institute of Standards and Technology (NIST)","source":"nist.gov"}' \
    "NIST Cybersecurity Framework 2.0" || true

  # 7. OECD AI Principles
  upload_file "$WORK_DIR/pdfs/oecd_ai_principles.pdf" "$col" "compliance_datenschutz" "guidance" "2024" \
    '{"source_id":"oecd","doc_type":"guidance","guideline_name":"OECD Recommendation on Artificial Intelligence (AI Principles)","license":"reuse_notice","attribution":"Organisation for Economic Co-operation and Development (OECD)","source":"oecd.org"}' \
    "OECD AI Principles (Recommendation on AI)" || true

  # 10. EFRAG Endorsement Status Report
  upload_file "$WORK_DIR/pdfs/efrag_endorsement_status_2025.pdf" "$col" "compliance_datenschutz" "guidance" "2025" \
    '{"source_id":"efrag","doc_type":"guidance","guideline_name":"EFRAG Endorsement Status Report (Dec 2025)","license":"reuse_notice","attribution":"European Financial Reporting Advisory Group (EFRAG)","source":"efrag.org"}' \
    "EFRAG Endorsement Status Report (Dec 2025)" || true

  local after
  after=$(collection_count "$col")
  log "Collection $col: $before → $after chunks"
}

# =============================================================================
# PHASE D: Verification
# =============================================================================

# Print per-collection point counts and run a fixed set of test searches.
phase_verify() {
  log "=========================================="
  log "PHASE D: Verifizierung"
  log "=========================================="

  echo ""
  echo "=== Collection Stats ==="
  local col count
  for col in bp_compliance_ce bp_compliance_datenschutz; do
    count=$(collection_count "$col")
    printf " %-30s %s chunks\n" "$col" "$count"
  done

  echo ""
  echo "=== Test-Suchen ==="

  search_test "Maschinenverordnung CE-Kennzeichnung" "bp_compliance_ce"
  search_test "Supply Chain Cybersecurity ENISA" "bp_compliance_datenschutz"
  search_test "NIST Cybersecurity Framework Governance" "bp_compliance_datenschutz"
  search_test "OECD AI Principles transparency accountability" "bp_compliance_datenschutz"
  search_test "IFRS Rechnungslegung EU endorsed" "bp_compliance_ce"
  search_test "EFRAG endorsement status IFRS 18" "bp_compliance_datenschutz"
}

# =============================================================================
# MAIN
# =============================================================================

# Entry point: parse arguments, run the selected phase(s), print the summary.
main() {
  parse_args "$@"

  log "============================================================"
  log "BreakPilot Industry Compliance Ingestion"
  log "Work dir: $WORK_DIR"
  log "RAG API: $RAG_URL"
  log "============================================================"

  if [[ -n "$ONLY_PHASE" ]]; then
    case "$ONLY_PHASE" in
      download)    phase_download ;;
      ce)          phase_ce ;;
      datenschutz) phase_datenschutz ;;
      verify)      phase_verify ;;
      *)
        fail "Unknown phase: $ONLY_PHASE"
        exit 1
        ;;
    esac
  else
    if [[ "$SKIP_DOWNLOAD" == "false" ]]; then
      phase_download
    else
      log "(Skipping downloads)"
    fi
    echo ""
    phase_ce
    echo ""
    phase_datenschutz
    echo ""
    phase_verify
  fi

  echo ""
  log "============================================================"
  log "DONE — Uploaded: $UPLOADED | Failed: $FAILED | Skipped: $SKIPPED"
  log "============================================================"

  if [[ "$FAILED" -gt 0 ]]; then
    exit 1
  fi
}

main "$@"