All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-ai-compliance (push) Successful in 35s
CI / test-python-backend-compliance (push) Successful in 33s
CI / test-python-document-crawler (push) Successful in 21s
CI / test-python-dsms-gateway (push) Successful in 19s
- Hazard-Library: +79 neue Eintraege in 12 Kategorien (software_fault, hmi_error, mechanical_hazard, electrical_hazard, thermal_hazard, emc_hazard, configuration_error, safety_function_failure, logging_audit_failure, integration_error, environmental_hazard, maintenance_hazard) — Gesamtanzahl: ~116 Eintraege in 24 Kategorien - Controls-Library: neue Datei controls_library.go mit 200 Eintraegen in 6 Domaenen (REQ/ARCH/SWDEV/VER/CYBER/DOC) - Handler: GET /sdk/v1/iace/controls-library (?domain=, ?category=) - SEPA: CalculateInherentRisk() + 4. Param Avoidance (0=disabled, 1-5: 3=neutral); RiskComputeInput.Avoidance, RiskAssessment.Avoidance, AssessRiskRequest.Avoidance — backward-kompatibel (A=0 → S×E×P) - Tests: engine_test.go + hazard_library_test.go aktualisiert - Scripts: ingest-ce-corpus.sh — 15 CE/Safety-Dokumente (EUR-Lex, NIST, ENISA, NASA, OWASP, MITRE CWE) in bp_compliance_ce und bp_compliance_datenschutz - Docs: docs-src/services/sdk-modules/iace.md + mkdocs.yml Nav-Eintrag Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
579 lines
22 KiB
Bash
Executable File
579 lines
22 KiB
Bash
Executable File
#!/usr/bin/env bash
# =============================================================================
# BreakPilot Compliance — CE/Safety RAG Corpus Ingestion
#
# Downloads 15 freely available CE/safety-related documents and ingests them
# into Qdrant via the Core RAG API (port 8097).
#
# Collections:
#   bp_compliance_ce           — machinery law, safety frameworks, OT security
#   bp_compliance_datenschutz  — AI/data-protection guidance (ENISA, OECD)
#
# Running it on the Mac Mini:
#   bash ~/Projekte/breakpilot-compliance/scripts/ingest-ce-corpus.sh
#   bash .../ingest-ce-corpus.sh [--skip-download] [--only PHASE]
#
# Phases: download, ce, datenschutz, verify, version
# =============================================================================

# Strict mode: exit on unhandled errors, reject unset variables, and let a
# pipeline fail when any of its stages fails.
set -euo pipefail

# --- Configuration -----------------------------------------------------------
# Each location below can be overridden from the environment; the defaults
# point at the services on localhost.
WORK_DIR="${WORK_DIR:-$HOME/rag-ingestion-ce}"
RAG_URL="${RAG_URL:-https://localhost:8097/api/v1/documents/upload}"
QDRANT_URL="${QDRANT_URL:-http://localhost:6333}"
# Kept as a plain string on purpose: the call sites expand it unquoted so it
# word-splits into separate curl options. Note that -k turns off TLS
# certificate verification for every request that uses these options.
CURL_OPTS="-sk --connect-timeout 15 --max-time 600 --retry 3 --retry-delay 5"
DB_URL="${DB_URL:-postgresql://localhost:5432/breakpilot?search_path=compliance,core,public}"

# Counters (updated by upload_file, reported at the end of main)
UPLOADED=0
FAILED=0
SKIPPED=0

# --- CLI Args ----------------------------------------------------------------
SKIP_DOWNLOAD=false
ONLY_PHASE=""

#######################################
# Parses the script's command line.
# Globals:
#   SKIP_DOWNLOAD (set to "true" by --skip-download)
#   ONLY_PHASE    (set to the value following --only)
# Arguments:
#   The script's positional parameters.
# Outputs:
#   Usage text on stdout for -h/--help; error messages on stderr.
# Returns:
#   Does not return on -h/--help (exit 0) or on a usage error (exit 1).
#######################################
_parse_cli_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --skip-download)
        SKIP_DOWNLOAD=true
        shift
        ;;
      --only)
        # Without this guard "$2" is unset and nounset aborts the script with
        # an unhelpful "unbound variable" message.
        if [[ $# -lt 2 ]]; then
          echo "Option --only requires a PHASE argument" >&2
          exit 1
        fi
        ONLY_PHASE="$2"
        shift 2
        ;;
      -h|--help)
        echo "Usage: $0 [--skip-download] [--only PHASE]"
        echo "Phases: download, ce, datenschutz, verify, version"
        exit 0
        ;;
      *)
        echo "Unknown option: $1" >&2
        exit 1
        ;;
    esac
  done
}

_parse_cli_args "$@"

# --- Helpers -----------------------------------------------------------------
# Console logging helpers. Each prints "[HH:MM:SS] <marker> <message>";
# log and ok write to stdout, warn and fail write to stderr.
log()  { printf '[%s] %s\n'   "$(date '+%H:%M:%S')" "$*"; }
ok()   { printf '[%s] ✓ %s\n' "$(date '+%H:%M:%S')" "$*"; }
warn() { printf '[%s] ⚠ %s\n' "$(date '+%H:%M:%S')" "$*" >&2; }
fail() { printf '[%s] ✗ %s\n' "$(date '+%H:%M:%S')" "$*" >&2; }

#######################################
# Uploads one document to the RAG API and updates the run counters.
# Globals:
#   RAG_URL, CURL_OPTS (read)
#   UPLOADED, FAILED, SKIPPED (incremented)
# Arguments:
#   $1 - path of the file to upload
#   $2 - target collection
#   $3 - data_type form field
#   $4 - use_case form field
#   $5 - year form field
#   $6 - metadata_json form field (sent verbatim)
#   $7 - optional display label (defaults to the file's basename)
# Returns:
#   0 when the response contains "chunks_count" or "vectors_indexed";
#   1 when the file is missing (FAILED), smaller than 1000 bytes (SKIPPED),
#   or the response contains neither marker (FAILED).
#   Callers running under errexit must guard the call (e.g. `|| true`) if a
#   single bad document should not stop the run.
#######################################
upload_file() {
  local file="$1"
  local collection="$2"
  local data_type="$3"
  local use_case="$4"
  local year="$5"
  local metadata_json="$6"
  local label="${7:-$(basename "$file")}"

  if [[ ! -f "$file" ]]; then
    warn "File not found: $file"
    FAILED=$((FAILED + 1))
    return 1
  fi

  # BSD stat (macOS) first, GNU stat second; 0 if neither works.
  local filesize
  filesize=$(stat -f%z "$file" 2>/dev/null || stat -c%s "$file" 2>/dev/null || echo 0)
  if [[ "$filesize" -lt 1000 ]]; then
    warn "File too small (${filesize}B), skipping: $label"
    SKIPPED=$((SKIPPED + 1))
    return 1
  fi

  log "Uploading: $label → $collection ($(( filesize / 1024 ))KB)"

  # Only the file part uses -F (it needs the @ file syntax). Every text field
  # goes through --form-string so curl sends it literally instead of parsing
  # a leading '@'/'<' or a ';type=' suffix out of the value.
  # CURL_OPTS is expanded unquoted on purpose so it splits into options.
  local response
  # shellcheck disable=SC2086
  response=$(curl $CURL_OPTS -X POST "$RAG_URL" \
    -F "file=@${file}" \
    --form-string "collection=${collection}" \
    --form-string "data_type=${data_type}" \
    --form-string "use_case=${use_case}" \
    --form-string "year=${year}" \
    --form-string "chunk_strategy=recursive" \
    --form-string "chunk_size=512" \
    --form-string "chunk_overlap=50" \
    --form-string "metadata_json=${metadata_json}" \
    2>/dev/null) || true

  # The API reports success with one of two counters; check them in the same
  # order as before (chunks_count wins if both are present).
  local key value
  for key in chunks_count vectors_indexed; do
    if [[ "$response" == *"\"${key}\""* ]]; then
      value=$(python3 -c "import sys,json; print(json.load(sys.stdin).get(sys.argv[1],0))" \
        "$key" <<<"$response" 2>/dev/null || echo "?")
      # "chunks_count" -> "chunks", "vectors_indexed" -> "vectors"
      ok "$label → $value ${key%%_*}"
      UPLOADED=$((UPLOADED + 1))
      return 0
    fi
  done

  fail "Upload failed: $label"
  fail "Response: $response"
  FAILED=$((FAILED + 1))
  return 1
}

#######################################
# Downloads a document unless a usable copy is already on disk.
# Globals:
#   CURL_OPTS (read)
# Arguments:
#   $1 - source URL
#   $2 - destination path
# Returns:
#   0 if the file already exists with more than 1000 bytes or was downloaded;
#   non-zero if the download failed (nothing is left at the destination).
#   Callers running under errexit must guard the call if one failed download
#   should not stop the run.
#######################################
download_pdf() {
  local url="$1"
  local target="$2"

  if [[ -f "$target" ]]; then
    # BSD stat (macOS) first, GNU stat second; 0 if neither works.
    local filesize
    filesize=$(stat -f%z "$target" 2>/dev/null || stat -c%s "$target" 2>/dev/null || echo 0)
    if [[ "$filesize" -gt 1000 ]]; then
      log "PDF exists: $(basename "$target") (skipping download)"
      return 0
    fi
    # Too small to be a real document: discard and fetch again.
    rm -f -- "$target"
  fi

  log "Downloading: $(basename "$target") from $url"

  # --fail makes curl return an error for HTTP >= 400 instead of saving the
  # server's error page as if it were the document. Writing to a side file
  # first means an aborted transfer never leaves a truncated file that a later
  # run would accept as "already downloaded".
  # CURL_OPTS is expanded unquoted on purpose so it splits into options.
  local partial="${target}.part"
  # shellcheck disable=SC2086
  if ! curl $CURL_OPTS --fail -L "$url" -o "$partial" 2>/dev/null; then
    rm -f -- "$partial"
    warn "Download failed: $url"
    return 1
  fi

  mv -f -- "$partial" "$target"
}

#######################################
# Prints the points_count Qdrant reports for a collection.
# Globals:
#   QDRANT_URL (read)
# Arguments:
#   $1 - collection name
# Outputs:
#   The count on stdout, or "?" when the request or the JSON lookup fails.
#######################################
collection_count() {
  local col="$1"

  curl -s "${QDRANT_URL}/collections/${col}" 2>/dev/null \
    | python3 -c "import sys,json; print(json.load(sys.stdin)['result']['points_count'])" 2>/dev/null \
    || echo "?"
}

# =============================================================================
# PHASE A: Downloads
# =============================================================================

#######################################
# Fetches an HTML page and stores its text content as a plain-text file.
# Text inside <script>/<style> is dropped and a newline is emitted after
# block-level and table elements. Nothing is fetched if the target exists.
# Globals:
#   CURL_OPTS (read)
# Arguments:
#   $1 - page URL
#   $2 - destination text file
#   $3 - label used in log/warning messages
# Returns:
#   Always 0; a failed fetch is reported through warn and leaves no file
#   behind, so the next run tries again.
#######################################
_download_html_as_text() {
  local url="$1"
  local target="$2"
  local label="$3"

  if [[ -f "$target" ]]; then
    return 0
  fi

  # The page arrives on stdin, so the extractor is passed with -c.
  local extractor='
import sys
from html.parser import HTMLParser

class Extractor(HTMLParser):
    BLOCK_TAGS = {"td", "th", "tr", "p", "div", "h1", "h2", "h3", "li"}
    HIDDEN_TAGS = {"script", "style"}

    def __init__(self):
        super().__init__()
        self.parts = []
        self.hidden = False

    def handle_starttag(self, tag, attrs):
        if tag in self.HIDDEN_TAGS:
            self.hidden = True

    def handle_endtag(self, tag):
        if tag in self.HIDDEN_TAGS:
            self.hidden = False
        if tag in self.BLOCK_TAGS:
            self.parts.append("\n")

    def handle_data(self, data):
        if not self.hidden:
            self.parts.append(data)

parser = Extractor()
parser.feed(sys.stdin.read())
print("".join(parser.parts))
'

  log "Downloading ${label} (text)"

  # Write to a side file and publish it only when the whole pipeline
  # succeeded (the script runs with pipefail, so a curl error counts).
  # CURL_OPTS is expanded unquoted on purpose so it splits into options.
  local partial="${target}.part"
  # shellcheck disable=SC2086
  if curl $CURL_OPTS --fail -L "$url" 2>/dev/null \
      | python3 -c "$extractor" > "$partial" 2>/dev/null \
      && [[ -s "$partial" ]]; then
    mv -f -- "$partial" "$target"
  else
    rm -f -- "$partial"
    warn "${label} download failed"
  fi
  return 0
}

#######################################
# Phase A: fetches all 15 source documents into $WORK_DIR/pdfs.
# A failed download is reported by download_pdf and does not stop the phase;
# the upload phases report the missing file later.
# Globals:
#   WORK_DIR (read)
#######################################
phase_download() {
  log "=========================================="
  log "PHASE A: Downloads (15 CE/Safety-Dokumente)"
  log "=========================================="

  local pdf_dir="$WORK_DIR/pdfs"
  mkdir -p "$pdf_dir"

  # One entry per document: "<file name>|<URL>" (no URL contains a '|').
  local -a pdf_sources=(
    # --- EU legal texts (EUR-Lex, public law) ---
    # 1. Machinery Regulation (EU) 2023/1230
    "machinery_regulation_2023_1230.pdf|https://eur-lex.europa.eu/legal-content/EN/TXT/PDF/?uri=CELEX:32023R1230"
    # 2. Machinery Directive 2006/42/EC
    "machinery_directive_2006_42.pdf|https://eur-lex.europa.eu/legal-content/EN/TXT/PDF/?uri=CELEX:32006L0042"
    # 3. Low Voltage Directive 2014/35/EU
    "lvd_2014_35.pdf|https://eur-lex.europa.eu/legal-content/EN/TXT/PDF/?uri=CELEX:32014L0035"
    # 4. EMC Directive 2014/30/EU
    "emc_directive_2014_30.pdf|https://eur-lex.europa.eu/legal-content/EN/TXT/PDF/?uri=CELEX:32014L0030"
    # 5. Radio Equipment Directive 2014/53/EU
    "red_directive_2014_53.pdf|https://eur-lex.europa.eu/legal-content/EN/TXT/PDF/?uri=CELEX:32014L0053"
    # 6. AI Act (EU) 2024/1689
    "ai_act_2024_1689.pdf|https://eur-lex.europa.eu/legal-content/EN/TXT/PDF/?uri=CELEX:32024R1689"
    # 7. Guide to the Machinery Directive (European Commission, public)
    "machinery_directive_guide.pdf|https://single-market-economy.ec.europa.eu/system/files/2021-10/machinery-guide-2010_en.pdf"
    # --- NIST publications (US Gov, public domain) ---
    # 8. NIST SP 800-218 (SSDF)
    "nist_sp800_218_ssdf.pdf|https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-218.pdf"
    # 9. NIST AI RMF 1.0
    "nist_ai_rmf_100_1.pdf|https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.100-1.pdf"
    # --- ENISA (European Union Agency for Cybersecurity, public) ---
    # 10. ENISA Guidelines for Securing the Internet of Things
    "enisa_iot_security_guidelines.pdf|https://www.enisa.europa.eu/publications/guidelines-for-securing-the-internet-of-things/@@download/fullReport"
    # 11. ENISA Securing Machine Learning Algorithms
    "enisa_securing_ml_algorithms.pdf|https://www.enisa.europa.eu/publications/securing-machine-learning-algorithms/@@download/fullReport"
    # --- NASA (US Gov, public domain) ---
    # 12. NASA Software Safety Guidebook
    "nasa_software_safety_guidebook.pdf|https://swehb.nasa.gov/download/attachments/17957036/NASA-GB-8719.13.pdf"
    # --- OWASP (CC BY-SA 4.0) ---
    # 13. OWASP Top 10 2021 (PDF)
    "owasp_top10_2021.pdf|https://owasp.org/Top10/assets/OWASP-Top-10-2021-en.pdf"
  )

  local entry
  for entry in "${pdf_sources[@]}"; do
    # download_pdf returns non-zero after warning about a failed download;
    # guard it so errexit does not end the whole run on the first bad URL.
    download_pdf "${entry#*|}" "$pdf_dir/${entry%%|*}" || true
  done

  # --- OECD (publicly accessible) ---
  # 14. OECD AI Principles (HTML page stored as plain text)
  _download_html_as_text \
    "https://www.oecd.org/digital/artificial-intelligence/ai-principles/" \
    "$pdf_dir/oecd_ai_principles.txt" \
    "OECD AI Principles"

  # --- MITRE CWE (MIT License) ---
  # 15. MITRE CWE Top 25 Most Dangerous Software Weaknesses (2023)
  _download_html_as_text \
    "https://cwe.mitre.org/top25/archive/2023/2023_top25_list.html" \
    "$pdf_dir/mitre_cwe_top25_2023.txt" \
    "MITRE CWE Top 25"

  log "Download phase complete."
}

# =============================================================================
# PHASE B: CE documents → bp_compliance_ce
# =============================================================================

#######################################
# Phase B: uploads the CE/safety documents into bp_compliance_ce and logs the
# collection size before and after.
# upload_file returns non-zero for a missing, too-small or rejected document
# (it already counts and reports it). Every call is guarded so that, with
# errexit active, one bad document does not end the run before the remaining
# uploads and the final summary.
# Globals:
#   WORK_DIR (read)
#######################################
phase_ce() {
  log "=========================================="
  log "PHASE B: CE/Safety-Dokumente → bp_compliance_ce"
  log "=========================================="

  local col="bp_compliance_ce"
  local pdf_dir="$WORK_DIR/pdfs"
  local before
  before=$(collection_count "$col")
  log "Collection $col: $before chunks (before)"

  # 1. Machinery Regulation 2023/1230
  upload_file "$pdf_dir/machinery_regulation_2023_1230.pdf" "$col" \
    "compliance_ce" "legal_reference" "2023" \
    '{"regulation_id":"EU-2023-1230","regulation_name_en":"Machinery Regulation","category":"ce_machinery","license":"eu_public","source_org":"EUR-Lex","celex":"32023R1230"}' \
    "Machinery Regulation (EU) 2023/1230" || true

  # 2. Machinery Directive 2006/42/EC
  upload_file "$pdf_dir/machinery_directive_2006_42.pdf" "$col" \
    "compliance_ce" "legal_reference" "2006" \
    '{"regulation_id":"EU-2006-42","regulation_name_en":"Machinery Directive","category":"ce_machinery","license":"eu_public","source_org":"EUR-Lex","celex":"32006L0042"}' \
    "Machinery Directive 2006/42/EC" || true

  # 3. Low Voltage Directive 2014/35/EU
  upload_file "$pdf_dir/lvd_2014_35.pdf" "$col" \
    "compliance_ce" "legal_reference" "2014" \
    '{"regulation_id":"EU-2014-35","regulation_name_en":"Low Voltage Directive","category":"ce_electrical","license":"eu_public","source_org":"EUR-Lex","celex":"32014L0035"}' \
    "Low Voltage Directive 2014/35/EU" || true

  # 4. EMC Directive 2014/30/EU
  upload_file "$pdf_dir/emc_directive_2014_30.pdf" "$col" \
    "compliance_ce" "legal_reference" "2014" \
    '{"regulation_id":"EU-2014-30","regulation_name_en":"EMC Directive","category":"ce_emc","license":"eu_public","source_org":"EUR-Lex","celex":"32014L0030"}' \
    "EMC Directive 2014/30/EU" || true

  # 5. Radio Equipment Directive 2014/53/EU
  upload_file "$pdf_dir/red_directive_2014_53.pdf" "$col" \
    "compliance_ce" "legal_reference" "2014" \
    '{"regulation_id":"EU-2014-53","regulation_name_en":"Radio Equipment Directive","category":"ce_radio","license":"eu_public","source_org":"EUR-Lex","celex":"32014L0053"}' \
    "Radio Equipment Directive 2014/53/EU" || true

  # 6. AI Act 2024/1689
  upload_file "$pdf_dir/ai_act_2024_1689.pdf" "$col" \
    "compliance_ce" "legal_reference" "2024" \
    '{"regulation_id":"EU-2024-1689","regulation_name_en":"AI Act","category":"ce_ai","license":"eu_public","source_org":"EUR-Lex","celex":"32024R1689"}' \
    "AI Act (EU) 2024/1689" || true

  # 7. Guide to the Machinery Directive
  upload_file "$pdf_dir/machinery_directive_guide.pdf" "$col" \
    "compliance_ce" "guidance" "2021" \
    '{"regulation_id":"EC-machinery-guide","regulation_name_en":"Guide to the Machinery Directive","category":"ce_machinery_guidance","license":"eu_public","source_org":"European Commission"}' \
    "EC Guide to the Machinery Directive" || true

  # 8. NIST SP 800-218 (SSDF)
  upload_file "$pdf_dir/nist_sp800_218_ssdf.pdf" "$col" \
    "compliance_ce" "guidance" "2022" \
    '{"regulation_id":"NIST-SP-800-218","regulation_name_en":"Secure Software Development Framework","category":"ce_software_safety","license":"us_gov_public","source_org":"NIST"}' \
    "NIST SP 800-218 (SSDF)" || true

  # 9. NIST AI RMF 1.0
  upload_file "$pdf_dir/nist_ai_rmf_100_1.pdf" "$col" \
    "compliance_ce" "guidance" "2023" \
    '{"regulation_id":"NIST-AI-100-1","regulation_name_en":"AI Risk Management Framework","category":"ce_ai_safety","license":"us_gov_public","source_org":"NIST"}' \
    "NIST AI RMF 1.0 (NIST.AI.100-1)" || true

  # 10. ENISA IoT Security Guidelines
  upload_file "$pdf_dir/enisa_iot_security_guidelines.pdf" "$col" \
    "compliance_ce" "guidance" "2019" \
    '{"regulation_id":"ENISA-IoT-Security","regulation_name_en":"Guidelines for Securing the IoT","category":"ce_ot_cybersecurity","license":"eu_public","source_org":"ENISA"}' \
    "ENISA Guidelines for Securing the IoT" || true

  # (Document 11 goes to bp_compliance_datenschutz in phase C.)

  # 12. NASA Software Safety Guidebook
  upload_file "$pdf_dir/nasa_software_safety_guidebook.pdf" "$col" \
    "compliance_ce" "guidance" "2004" \
    '{"regulation_id":"NASA-GB-8719.13","regulation_name_en":"NASA Software Safety Guidebook","category":"ce_software_safety","license":"us_gov_public","source_org":"NASA"}' \
    "NASA Software Safety Guidebook (NASA-GB-8719.13)" || true

  # 13. OWASP Top 10 2021
  upload_file "$pdf_dir/owasp_top10_2021.pdf" "$col" \
    "compliance_ce" "guidance" "2021" \
    '{"regulation_id":"OWASP-Top10-2021","regulation_name_en":"OWASP Top 10 2021","category":"ce_software_security","license":"cc_by_sa_4","source_org":"OWASP"}' \
    "OWASP Top 10 (2021)" || true

  # (Document 14 goes to bp_compliance_datenschutz in phase C.)

  # 15. MITRE CWE Top 25
  upload_file "$pdf_dir/mitre_cwe_top25_2023.txt" "$col" \
    "compliance_ce" "guidance" "2023" \
    '{"regulation_id":"MITRE-CWE-Top25-2023","regulation_name_en":"MITRE CWE Top 25 Most Dangerous Software Weaknesses","category":"ce_software_weaknesses","license":"mit","source_org":"MITRE"}' \
    "MITRE CWE Top 25 (2023)" || true

  local after
  after=$(collection_count "$col")
  log "Collection $col: $before → $after chunks"
}

# =============================================================================
# PHASE C: AI/data-protection documents → bp_compliance_datenschutz
# =============================================================================

#######################################
# Phase C: uploads the AI/data-protection guidance documents into
# bp_compliance_datenschutz and logs the collection size before and after.
# upload_file returns non-zero for a missing, too-small or rejected document
# (it already counts and reports it). Both calls are guarded so that, with
# errexit active, a bad document does not end the run before the summary.
# Globals:
#   WORK_DIR (read)
#######################################
phase_datenschutz() {
  log "=========================================="
  log "PHASE C: AI/Datenschutz → bp_compliance_datenschutz"
  log "=========================================="

  local col="bp_compliance_datenschutz"
  local pdf_dir="$WORK_DIR/pdfs"
  local before
  before=$(collection_count "$col")
  log "Collection $col: $before chunks (before)"

  # 11. ENISA Securing ML Algorithms
  upload_file "$pdf_dir/enisa_securing_ml_algorithms.pdf" "$col" \
    "compliance_datenschutz" "guidance" "2021" \
    '{"regulation_id":"ENISA-Securing-ML","regulation_name_en":"Securing Machine Learning Algorithms","category":"ai_cybersecurity","license":"eu_public","source_org":"ENISA"}' \
    "ENISA Securing Machine Learning Algorithms" || true

  # 14. OECD AI Principles
  upload_file "$pdf_dir/oecd_ai_principles.txt" "$col" \
    "compliance_datenschutz" "guidance" "2019" \
    '{"regulation_id":"OECD-AI-Principles","regulation_name_en":"OECD Principles on Artificial Intelligence","category":"ai_governance","license":"oecd_public","source_org":"OECD"}' \
    "OECD AI Principles (2019)" || true

  local after
  after=$(collection_count "$col")
  log "Collection $col: $before → $after chunks"
}

# =============================================================================
# PHASE D: Verification
# =============================================================================

#######################################
# Runs one sample query against the RAG search endpoint and prints the hit
# count plus score and the first 100 characters of up to three results.
# Globals:
#   RAG_URL (read; the search endpoint is derived from it), CURL_OPTS (read)
# Arguments:
#   $1 - query description shown in the log line
#   $2 - query text sent to the API
#   $3 - collection to search
# Outputs:
#   Result summary on stdout, or " (search failed)" if the request or the
#   response parsing fails.
#######################################
_verify_search() {
  local label="$1"
  local query="$2"
  local collection="$3"

  # Upload endpoint .../api/v1/documents/upload -> search endpoint .../api/v1/search
  local search_url="${RAG_URL%/documents/upload}/search"

  # Queries and collection names are constants from this script, so plain
  # formatting is enough to build the JSON body.
  local payload
  printf -v payload '{"query":"%s","collection":"%s","limit":3,"min_score":0.4}' \
    "$query" "$collection"

  local summarize='
import sys, json
try:
    data = json.load(sys.stdin)
    results = data.get("results", [])
    print(f" Treffer: {len(results)}")
    for r in results[:3]:
        score = r.get("score", 0)
        snippet = r.get("content", "")[:100]
        print(f" [{score:.3f}] {snippet}...")
except Exception as e:
    print(f" (parse error: {e})")
'

  log "Suche: '${label}' in ${collection}"
  # CURL_OPTS is expanded unquoted on purpose so it splits into options.
  # shellcheck disable=SC2086
  curl $CURL_OPTS -X POST "$search_url" \
    -H 'Content-Type: application/json' \
    -d "$payload" 2>/dev/null \
    | python3 -c "$summarize" 2>/dev/null \
    || echo " (search failed)"
}

#######################################
# Phase D: prints the size of both collections and runs three sample searches
# so that a human can sanity-check the ingestion result.
#######################################
phase_verify() {
  log "=========================================="
  log "PHASE D: Verifizierung"
  log "=========================================="

  echo ""
  echo "=== Collection Stats ==="
  local col count
  for col in bp_compliance_ce bp_compliance_datenschutz; do
    count=$(collection_count "$col")
    printf " %-35s %s chunks\n" "$col" "$count"
  done

  echo ""
  echo "=== Test-Suchen ==="

  _verify_search \
    "Machinery Regulation software safety requirements" \
    "Machinery Regulation software safety requirements" \
    "bp_compliance_ce"

  _verify_search \
    "NIST secure software development practices" \
    "NIST secure software development practices" \
    "bp_compliance_ce"

  # The logged description and the query text differ for this probe.
  _verify_search \
    "AI risk governance OECD principles" \
    "AI risk governance principles transparency accountability" \
    "bp_compliance_datenschutz"

  echo ""
}

# =============================================================================
# PHASE E: Corpus Version Registration
# =============================================================================

#######################################
# Phase E: records one row per collection in compliance_corpus_versions.
# The version is "<YYYY-MM-DD>.<n>", where n is one more than the number of
# versions already stored for that collection and day. A collection whose
# point count is unavailable or zero is skipped. Database problems are
# reported as warnings and never abort the run.
# Globals:
#   DB_URL, QDRANT_URL (read)
#   UPLOADED (read; stored as documents_count)
# NOTE(review): UPLOADED counts uploads of this run across both collections
#   (and is 0 with `--only version`), so documents_count is not a
#   per-collection figure — confirm whether that is intended.
#######################################
phase_register_version() {
  log "=========================================="
  log "PHASE E: Corpus Version Registration"
  log "=========================================="

  local today
  today=$(date '+%Y-%m-%d')

  # Values are handed to psql as variables (-v) and referenced as :'name', so
  # psql does the SQL quoting. psql only interpolates variables in input read
  # from stdin/-f, not in -c strings, hence the here-strings below.
  # ON_ERROR_STOP makes a failed statement produce a non-zero exit status.
  local -r count_sql="SELECT COUNT(*) FROM compliance_corpus_versions WHERE collection_name = :'col' AND version LIKE :'pattern';"
  local -r insert_sql="INSERT INTO compliance_corpus_versions
  (version, collection_name, documents_count, chunks_count, regulations, digest, ingestion_source, created_by)
VALUES
  (:'version', :'col', :'documents'::int, :'chunks'::int, :'regs', :'digest', 'ingest-ce-corpus.sh', 'system')
ON CONFLICT DO NOTHING;"

  local col count existing_count seq version regs digest
  for col in bp_compliance_ce bp_compliance_datenschutz; do
    count=$(collection_count "$col")

    # collection_count prints "?" on failure; anything that is not a positive
    # integer is treated the same way.
    if [[ ! "$count" =~ ^[1-9][0-9]*$ ]]; then
      warn "Skipping version for $col (count=$count)"
      continue
    fi

    existing_count=$(psql "$DB_URL" -tA -v ON_ERROR_STOP=1 \
      -v col="$col" -v pattern="${today}.%" \
      <<<"$count_sql" 2>/dev/null || echo "0")
    # Without a usable answer from the database, start the sequence at 1.
    [[ "$existing_count" =~ ^[0-9]+$ ]] || existing_count=0
    seq=$((existing_count + 1))
    version="${today}.${seq}"

    # Postgres array literals listing the regulation_id values that the
    # upload phases attach to the documents of each collection.
    regs=""
    case "$col" in
      bp_compliance_ce)
        regs='{EU-2023-1230,EU-2006-42,EU-2014-35,EU-2014-30,EU-2014-53,EU-2024-1689,EC-machinery-guide,NIST-SP-800-218,NIST-AI-100-1,ENISA-IoT-Security,NASA-GB-8719.13,OWASP-Top10-2021,MITRE-CWE-Top25-2023}'
        ;;
      bp_compliance_datenschutz)
        regs='{ENISA-Securing-ML,OECD-AI-Principles}'
        ;;
    esac

    # Digest: first 32 hex characters of the SHA-256 over Qdrant's collection
    # info ("result" object, keys sorted); empty if it cannot be computed.
    digest=$(curl -s "${QDRANT_URL}/collections/${col}" 2>/dev/null \
      | python3 -c "import sys,json,hashlib; d=json.load(sys.stdin); print(hashlib.sha256(json.dumps(d.get('result',{}), sort_keys=True).encode()).hexdigest()[:32])" \
      2>/dev/null || echo "")

    log "Registering version $version for $col ($count chunks)"

    if psql "$DB_URL" -v ON_ERROR_STOP=1 \
        -v version="$version" -v col="$col" \
        -v documents="$UPLOADED" -v chunks="$count" \
        -v regs="$regs" -v digest="$digest" \
        <<<"$insert_sql" 2>/dev/null; then
      ok "Version $version registered for $col"
    else
      warn "Version registration skipped for $col (DB not available?)"
    fi
  done
}

# =============================================================================
# MAIN
# =============================================================================

#######################################
# Entry point: checks that the services respond, runs either the single
# phase selected with --only or the full sequence (download, ce,
# datenschutz, verify, version), then prints the upload summary.
# Globals:
#   WORK_DIR, RAG_URL, QDRANT_URL, CURL_OPTS, ONLY_PHASE, SKIP_DOWNLOAD (read)
#   UPLOADED, FAILED, SKIPPED (read for the summary)
# Returns:
#   Exits 1 if Qdrant does not answer, the phase name is unknown, or at
#   least one upload failed.
#######################################
main() {
  log "=========================================="
  log "BreakPilot CE/Safety Corpus Ingestion"
  log "=========================================="
  log "Work dir: $WORK_DIR"
  log "RAG API: $RAG_URL"
  log "Qdrant: $QDRANT_URL"
  echo ""

  # Check RAG API: an empty POST is expected to produce a response body that
  # mentions "file" or "detail". The body is captured first and matched in
  # the shell, which avoids the non-portable grep "\|" alternation and a
  # pipefail false negative when grep -q closes the pipe early.
  # CURL_OPTS is expanded unquoted on purpose so it splits into options.
  local probe
  # shellcheck disable=SC2086
  probe=$(curl $CURL_OPTS "$RAG_URL" -X POST 2>/dev/null) || true
  if [[ "$probe" == *file* || "$probe" == *detail* ]]; then
    ok "RAG API reachable"
  else
    warn "RAG API may not be reachable at $RAG_URL — continuing anyway"
  fi

  # Check Qdrant: unlike the RAG API probe this one is fatal.
  if ! curl -s "$QDRANT_URL/collections" >/dev/null 2>&1; then
    fail "Qdrant not reachable at $QDRANT_URL"
    exit 1
  fi
  ok "Qdrant reachable"
  echo ""

  if [[ -n "$ONLY_PHASE" ]]; then
    case "$ONLY_PHASE" in
      download)    phase_download ;;
      ce)          phase_ce ;;
      datenschutz) phase_datenschutz ;;
      verify)      phase_verify ;;
      version)     phase_register_version ;;
      *)
        fail "Unknown phase: $ONLY_PHASE"
        exit 1
        ;;
    esac
  else
    if [[ "$SKIP_DOWNLOAD" != "true" ]]; then
      phase_download
    else
      log "Skipping download phase (--skip-download)"
    fi
    echo ""
    phase_ce
    echo ""
    phase_datenschutz
    echo ""
    phase_verify
    echo ""
    phase_register_version
  fi

  echo ""
  log "=========================================="
  log "ERGEBNIS"
  log "=========================================="
  log "Uploaded: $UPLOADED"
  log "Failed: $FAILED"
  log "Skipped: $SKIPPED"
  log "=========================================="

  if (( FAILED > 0 )); then
    warn "$FAILED uploads fehlgeschlagen!"
    exit 1
  fi

  ok "CE/Safety Corpus Ingestion abgeschlossen!"
}

# Run the ingestion, forwarding the script's arguments unchanged.
main "$@"