diff --git a/.gitea/workflows/rag-ingest.yaml b/.gitea/workflows/rag-ingest.yaml
index 4edd11b..84563e3 100644
--- a/.gitea/workflows/rag-ingest.yaml
+++ b/.gitea/workflows/rag-ingest.yaml
@@ -14,7 +14,7 @@ on:
   workflow_dispatch:
     inputs:
       phase:
-        description: 'Ingestion Phase (gesetze, eu, templates, datenschutz, verbraucherschutz, dach, verify, version, all)'
+        description: 'Ingestion Phase (gesetze, eu, templates, datenschutz, verbraucherschutz, dach, security, verify, version, all)'
         required: true
         default: 'verbraucherschutz'
 
diff --git a/scripts/ingest-legal-corpus.sh b/scripts/ingest-legal-corpus.sh
index 0ad5160..9bd8271 100755
--- a/scripts/ingest-legal-corpus.sh
+++ b/scripts/ingest-legal-corpus.sh
@@ -1451,6 +1451,349 @@ with open('$WORK_DIR/texts/BGer_1C_562_2024.txt', 'w') as f:
     log "Phase I abgeschlossen."
 }
 
+# =============================================================================
+# PHASE J: Security Guidelines & Standards
+# Only license-compatible documents (Public Domain / CC BY / CC BY-SA)
+#
+# Downloads NIST PDFs, clones OWASP repositories (their Markdown files are
+# concatenated into one text file per project) and downloads ENISA PDFs, then
+# hands each document to upload_file. Every download/clone is best-effort: a
+# source that cannot be fetched is skipped and the phase continues.
+# Reads:  $WORK_DIR (scratch directory; pdfs/, texts/ and repos/ are created)
+# Target collections: bp_compliance_datenschutz (NIST, OWASP), bp_compliance_ce (ENISA)
+# =============================================================================
+phase_security() {
+    log "=========================================="
+    log "PHASE J: Security Guidelines & Standards"
+    log "=========================================="
+
+    mkdir -p "$WORK_DIR"/{pdfs,texts,repos}
+
+    # =========================================================================
+    # J1: NIST Standards (Public Domain, US Government Work)
+    # → bp_compliance_datenschutz (already contains NIST CSF + Privacy Framework)
+    # =========================================================================
+    local col="bp_compliance_datenschutz"
+    local before
+    before=$(collection_count "$col")
+    log "--- J1: NIST Security Standards → $col ($before chunks) ---"
+
+    # NIST SP 800-53 Rev. 5 — Security and Privacy Controls (LARGE: ~490 pages)
+    download_pdf \
+        "https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-53r5.pdf" \
+        "$WORK_DIR/pdfs/NIST_SP_800_53r5.pdf" || true
+    if [[ -f "$WORK_DIR/pdfs/NIST_SP_800_53r5.pdf" ]]; then
+        upload_file "$WORK_DIR/pdfs/NIST_SP_800_53r5.pdf" "$col" "compliance_datenschutz" "guidance" "2020" \
+            '{"regulation_id":"nist_sp800_53r5","source_id":"nist","doc_type":"controls_catalog","guideline_name":"NIST SP 800-53 Rev. 5 Security and Privacy Controls","license":"public_domain_us_gov","attribution":"National Institute of Standards and Technology (NIST)","source":"nist.gov"}' \
+            "NIST SP 800-53 Rev. 5 (Security & Privacy Controls)"
+    fi
+
+    # NIST SP 800-218 — Secure Software Development Framework (SSDF)
+    download_pdf \
+        "https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-218.pdf" \
+        "$WORK_DIR/pdfs/NIST_SP_800_218.pdf" || true
+    if [[ -f "$WORK_DIR/pdfs/NIST_SP_800_218.pdf" ]]; then
+        upload_file "$WORK_DIR/pdfs/NIST_SP_800_218.pdf" "$col" "compliance_datenschutz" "guidance" "2022" \
+            '{"regulation_id":"nist_sp800_218","source_id":"nist","doc_type":"framework","guideline_name":"NIST SP 800-218 Secure Software Development Framework (SSDF)","license":"public_domain_us_gov","attribution":"National Institute of Standards and Technology (NIST)","source":"nist.gov"}' \
+            "NIST SP 800-218 SSDF"
+    fi
+
+    # NIST SP 800-63-3 — Digital Identity Guidelines
+    download_pdf \
+        "https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-63-3.pdf" \
+        "$WORK_DIR/pdfs/NIST_SP_800_63_3.pdf" || true
+    if [[ -f "$WORK_DIR/pdfs/NIST_SP_800_63_3.pdf" ]]; then
+        upload_file "$WORK_DIR/pdfs/NIST_SP_800_63_3.pdf" "$col" "compliance_datenschutz" "guidance" "2017" \
+            '{"regulation_id":"nist_sp800_63_3","source_id":"nist","doc_type":"guidelines","guideline_name":"NIST SP 800-63-3 Digital Identity Guidelines","license":"public_domain_us_gov","attribution":"National Institute of Standards and Technology (NIST)","source":"nist.gov"}' \
+            "NIST SP 800-63-3 (Digital Identity)"
+    fi
+
+    # NIST SP 800-207 — Zero Trust Architecture
+    download_pdf \
+        "https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-207.pdf" \
+        "$WORK_DIR/pdfs/NIST_SP_800_207.pdf" || true
+    if [[ -f "$WORK_DIR/pdfs/NIST_SP_800_207.pdf" ]]; then
+        upload_file "$WORK_DIR/pdfs/NIST_SP_800_207.pdf" "$col" "compliance_datenschutz" "guidance" "2020" \
+            '{"regulation_id":"nist_sp800_207","source_id":"nist","doc_type":"architecture","guideline_name":"NIST SP 800-207 Zero Trust Architecture","license":"public_domain_us_gov","attribution":"National Institute of Standards and Technology (NIST)","source":"nist.gov"}' \
+            "NIST SP 800-207 (Zero Trust Architecture)"
+    fi
+
+    # NISTIR 8259A — IoT Device Cybersecurity Core Baseline
+    download_pdf \
+        "https://nvlpubs.nist.gov/nistpubs/ir/2020/NIST.IR.8259A.pdf" \
+        "$WORK_DIR/pdfs/NISTIR_8259A.pdf" || true
+    if [[ -f "$WORK_DIR/pdfs/NISTIR_8259A.pdf" ]]; then
+        upload_file "$WORK_DIR/pdfs/NISTIR_8259A.pdf" "$col" "compliance_datenschutz" "guidance" "2020" \
+            '{"regulation_id":"nistir_8259a","source_id":"nist","doc_type":"baseline","guideline_name":"NISTIR 8259A IoT Device Cybersecurity Core Baseline","license":"public_domain_us_gov","attribution":"National Institute of Standards and Technology (NIST)","source":"nist.gov"}' \
+            "NISTIR 8259A (IoT Core Baseline)"
+    fi
+
+    # NISTIR 8259B — IoT Non-Technical Supporting Capability Core Baseline
+    download_pdf \
+        "https://nvlpubs.nist.gov/nistpubs/ir/2020/NIST.IR.8259B.pdf" \
+        "$WORK_DIR/pdfs/NISTIR_8259B.pdf" || true
+    if [[ -f "$WORK_DIR/pdfs/NISTIR_8259B.pdf" ]]; then
+        upload_file "$WORK_DIR/pdfs/NISTIR_8259B.pdf" "$col" "compliance_datenschutz" "guidance" "2020" \
+            '{"regulation_id":"nistir_8259b","source_id":"nist","doc_type":"baseline","guideline_name":"NISTIR 8259B IoT Non-Technical Supporting Capability Core Baseline","license":"public_domain_us_gov","attribution":"National Institute of Standards and Technology (NIST)","source":"nist.gov"}' \
+            "NISTIR 8259B (IoT Non-Technical Baseline)"
+    fi
+
+    # NIST AI RMF 1.0 — AI Risk Management Framework
+    download_pdf \
+        "https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.100-1.pdf" \
+        "$WORK_DIR/pdfs/NIST_AI_RMF_100_1.pdf" || true
+    if [[ -f "$WORK_DIR/pdfs/NIST_AI_RMF_100_1.pdf" ]]; then
+        upload_file "$WORK_DIR/pdfs/NIST_AI_RMF_100_1.pdf" "$col" "compliance_datenschutz" "guidance" "2023" \
+            '{"regulation_id":"nist_ai_rmf","source_id":"nist","doc_type":"framework","guideline_name":"NIST AI Risk Management Framework (AI RMF) 1.0","license":"public_domain_us_gov","attribution":"National Institute of Standards and Technology (NIST)","source":"nist.gov"}' \
+            "NIST AI RMF 1.0 (AI Risk Management)"
+    fi
+
+    local after
+    after=$(collection_count "$col")
+    local diff=$(( after - before ))
+    log "Collection $col: ${before} → ${after} chunks (+${diff})"
+
+    # =========================================================================
+    # J2: OWASP Security Standards (CC BY / CC BY-SA)
+    # → bp_compliance_datenschutz
+    # Strategy: clone the GitHub repos, extract the Markdown, upload as text
+    # =========================================================================
+    before=$(collection_count "$col")
+    log "--- J2: OWASP Standards → $col ($before chunks) ---"
+
+    # --- J2a: OWASP Top 10 (2021 stable, 2025 in development) ---
+    local owasp_top10_repo="$WORK_DIR/repos/owasp-top10"
+    if [[ ! -d "$owasp_top10_repo" ]]; then
+        git clone --depth 1 "https://github.com/OWASP/Top10.git" "$owasp_top10_repo" 2>/dev/null || true
+    fi
+    if [[ -d "$owasp_top10_repo" ]]; then
+        # Concatenate all Top 10 documents (2021 edition, stable)
+        local top10_dir="$owasp_top10_repo/2021/docs"
+        if [[ -d "$top10_dir" ]]; then
+            find "$top10_dir" -name "*.md" -not -name "index.md" | sort | while IFS= read -r f; do
+                echo "---"
+                echo "# $(basename "$f" .md)"
+                cat "$f"
+                echo ""
+            done > "$WORK_DIR/texts/OWASP_Top10_2021.txt" 2>/dev/null
+            if [[ -f "$WORK_DIR/texts/OWASP_Top10_2021.txt" && $(wc -c < "$WORK_DIR/texts/OWASP_Top10_2021.txt") -gt 1000 ]]; then
+                upload_file "$WORK_DIR/texts/OWASP_Top10_2021.txt" "$col" "compliance_datenschutz" "guidance" "2021" \
+                    '{"regulation_id":"owasp_top10_2021","source_id":"owasp","doc_type":"risk_catalog","guideline_name":"OWASP Top 10 Web Application Security Risks (2021)","license":"CC_BY_3.0","attribution":"OWASP Foundation","source":"github.com/OWASP/Top10"}' \
+                    "OWASP Top 10 (2021)"
+            fi
+        fi
+    fi
+
+    # --- J2b: OWASP API Security Top 10 (2023) ---
+    local owasp_api_repo="$WORK_DIR/repos/owasp-api-security"
+    if [[ ! -d "$owasp_api_repo" ]]; then
+        git clone --depth 1 "https://github.com/OWASP/API-Security.git" "$owasp_api_repo" 2>/dev/null || true
+    fi
+    if [[ -d "$owasp_api_repo" ]]; then
+        local api_dir="$owasp_api_repo/editions/2023/en"
+        if [[ -d "$api_dir" ]]; then
+            find "$api_dir" -name "*.md" | sort | while IFS= read -r f; do
+                echo "---"
+                echo "# $(basename "$f" .md)"
+                cat "$f"
+                echo ""
+            done > "$WORK_DIR/texts/OWASP_API_Security_2023.txt" 2>/dev/null
+            if [[ -f "$WORK_DIR/texts/OWASP_API_Security_2023.txt" && $(wc -c < "$WORK_DIR/texts/OWASP_API_Security_2023.txt") -gt 1000 ]]; then
+                upload_file "$WORK_DIR/texts/OWASP_API_Security_2023.txt" "$col" "compliance_datenschutz" "guidance" "2023" \
+                    '{"regulation_id":"owasp_api_top10_2023","source_id":"owasp","doc_type":"risk_catalog","guideline_name":"OWASP API Security Top 10 (2023)","license":"CC_BY-SA_4.0","attribution":"OWASP Foundation","source":"github.com/OWASP/API-Security"}' \
+                    "OWASP API Security Top 10 (2023)"
+            fi
+        fi
+    fi
+
+    # --- J2c: OWASP ASVS (Application Security Verification Standard) ---
+    local owasp_asvs_repo="$WORK_DIR/repos/owasp-asvs"
+    if [[ ! -d "$owasp_asvs_repo" ]]; then
+        git clone --depth 1 "https://github.com/OWASP/ASVS.git" "$owasp_asvs_repo" 2>/dev/null || true
+    fi
+    if [[ -d "$owasp_asvs_repo" ]]; then
+        # ASVS 5.0 or 4.0 — check which is available
+        local asvs_dir="" candidate
+        for candidate in "$owasp_asvs_repo/5.0/en" "$owasp_asvs_repo/4.0/en"; do
+            if [[ -d "$candidate" ]]; then asvs_dir="$candidate"; break; fi
+        done
+        if [[ -n "$asvs_dir" ]]; then
+            find "$asvs_dir" -name "*.md" | sort | while IFS= read -r f; do
+                echo "---"
+                echo "# $(basename "$f" .md)"
+                cat "$f"
+                echo ""
+            done > "$WORK_DIR/texts/OWASP_ASVS.txt" 2>/dev/null
+            if [[ -f "$WORK_DIR/texts/OWASP_ASVS.txt" && $(wc -c < "$WORK_DIR/texts/OWASP_ASVS.txt") -gt 1000 ]]; then
+                upload_file "$WORK_DIR/texts/OWASP_ASVS.txt" "$col" "compliance_datenschutz" "guidance" "2024" \
+                    '{"regulation_id":"owasp_asvs","source_id":"owasp","doc_type":"verification_standard","guideline_name":"OWASP Application Security Verification Standard (ASVS)","license":"CC_BY-SA_4.0","attribution":"OWASP Foundation","source":"github.com/OWASP/ASVS"}' \
+                    "OWASP ASVS (Application Security Verification Standard)"
+            fi
+        fi
+    fi
+
+    # --- J2d: OWASP MASVS (Mobile Application Security Verification Standard) ---
+    local owasp_masvs_repo="$WORK_DIR/repos/owasp-masvs"
+    if [[ ! -d "$owasp_masvs_repo" ]]; then
+        git clone --depth 1 "https://github.com/OWASP/owasp-masvs.git" "$owasp_masvs_repo" 2>/dev/null || true
+    fi
+    if [[ -d "$owasp_masvs_repo" ]]; then
+        # MASVS v2 structure
+        local masvs_dir="" candidate
+        for candidate in "$owasp_masvs_repo/Document" "$owasp_masvs_repo/document" "$owasp_masvs_repo"; do
+            if [[ -d "$candidate" ]] && find "$candidate" -maxdepth 2 -name "*.md" | head -1 | grep -q .; then
+                masvs_dir="$candidate"; break
+            fi
+        done
+        if [[ -n "$masvs_dir" ]]; then
+            find "$masvs_dir" -name "*.md" -not -name "README.md" -not -name "CONTRIBUTING.md" -not -path "*/.github/*" | sort | while IFS= read -r f; do
+                echo "---"
+                echo "# $(basename "$f" .md)"
+                cat "$f"
+                echo ""
+            done > "$WORK_DIR/texts/OWASP_MASVS.txt" 2>/dev/null
+            if [[ -f "$WORK_DIR/texts/OWASP_MASVS.txt" && $(wc -c < "$WORK_DIR/texts/OWASP_MASVS.txt") -gt 1000 ]]; then
+                upload_file "$WORK_DIR/texts/OWASP_MASVS.txt" "$col" "compliance_datenschutz" "guidance" "2024" \
+                    '{"regulation_id":"owasp_masvs","source_id":"owasp","doc_type":"verification_standard","guideline_name":"OWASP Mobile Application Security Verification Standard (MASVS)","license":"CC_BY-SA_4.0","attribution":"OWASP Foundation","source":"github.com/OWASP/owasp-masvs"}' \
+                    "OWASP MASVS (Mobile Security Verification)"
+            fi
+        fi
+    fi
+
+    # --- J2e: OWASP SAMM (Software Assurance Maturity Model) ---
+    local owasp_samm_repo="$WORK_DIR/repos/owasp-samm"
+    if [[ ! -d "$owasp_samm_repo" ]]; then
+        git clone --depth 1 "https://github.com/OWASP/samm.git" "$owasp_samm_repo" 2>/dev/null || true
+    fi
+    if [[ -d "$owasp_samm_repo" ]]; then
+        # SAMM model content
+        local samm_dir="" candidate
+        for candidate in "$owasp_samm_repo/Website/content" "$owasp_samm_repo/model" "$owasp_samm_repo"; do
+            if [[ -d "$candidate" ]] && find "$candidate" -maxdepth 3 -name "*.md" | head -1 | grep -q .; then
+                samm_dir="$candidate"; break
+            fi
+        done
+        if [[ -n "$samm_dir" ]]; then
+            find "$samm_dir" -name "*.md" -not -name "README.md" -not -name "CONTRIBUTING.md" -not -path "*/.github/*" | sort | while IFS= read -r f; do
+                echo "---"
+                echo "# $(basename "$f" .md)"
+                cat "$f"
+                echo ""
+            done > "$WORK_DIR/texts/OWASP_SAMM.txt" 2>/dev/null
+            if [[ -f "$WORK_DIR/texts/OWASP_SAMM.txt" && $(wc -c < "$WORK_DIR/texts/OWASP_SAMM.txt") -gt 1000 ]]; then
+                upload_file "$WORK_DIR/texts/OWASP_SAMM.txt" "$col" "compliance_datenschutz" "guidance" "2020" \
+                    '{"regulation_id":"owasp_samm","source_id":"owasp","doc_type":"maturity_model","guideline_name":"OWASP Software Assurance Maturity Model (SAMM) v2","license":"CC_BY-SA_4.0","attribution":"OWASP Foundation","source":"github.com/OWASP/samm"}' \
+                    "OWASP SAMM v2 (Software Assurance Maturity Model)"
+            fi
+        fi
+    fi
+
+    # --- J2f: OWASP Mobile Top 10 ---
+    local owasp_mobile_repo="$WORK_DIR/repos/owasp-mobile-top10"
+    if [[ ! -d "$owasp_mobile_repo" ]]; then
+        # Try the official OWASP project repository first so the attribution and
+        # license metadata below match the source; the mirror is only a fallback.
+        git clone --depth 1 "https://github.com/OWASP/www-project-mobile-top-10.git" "$owasp_mobile_repo" 2>/dev/null || \
+        git clone --depth 1 "https://github.com/niccolopetti/owasp_mobile_top_10.git" "$owasp_mobile_repo" 2>/dev/null || true
+    fi
+    # Fallback: fetch the HTML directly from the OWASP site
+    if [[ ! -d "$owasp_mobile_repo" ]] || ! find "$owasp_mobile_repo" -name "*.md" | head -1 | grep -q .; then
+        curl -sL "https://owasp.org/www-project-mobile-top-10/" \
+            -o "$WORK_DIR/texts/OWASP_Mobile_Top10_raw.html" 2>/dev/null || true
+        if [[ -f "$WORK_DIR/texts/OWASP_Mobile_Top10_raw.html" && $(wc -c < "$WORK_DIR/texts/OWASP_Mobile_Top10_raw.html") -gt 1000 ]]; then
+            python3 -c "
+import re
+with open('$WORK_DIR/texts/OWASP_Mobile_Top10_raw.html', encoding='utf-8', errors='replace') as f:
+    html = f.read()
+# Drop script/style elements first so inline JS/CSS is not ingested as text
+html = re.sub(r'(?is)<(script|style)\b.*?</\1\s*>', ' ', html)
+text = re.sub(r'<[^>]+>', ' ', html)
+text = re.sub(r'\s+', ' ', text).strip()
+with open('$WORK_DIR/texts/OWASP_Mobile_Top10.txt', 'w', encoding='utf-8') as f:
+    f.write('OWASP Mobile Top 10\n\n' + text[:100000])
+" 2>/dev/null
+        fi
+    else
+        find "$owasp_mobile_repo" -name "*.md" -not -name "README.md" | sort | while IFS= read -r f; do
+            echo "---"
+            cat "$f"
+            echo ""
+        done > "$WORK_DIR/texts/OWASP_Mobile_Top10.txt" 2>/dev/null
+    fi
+    if [[ -f "$WORK_DIR/texts/OWASP_Mobile_Top10.txt" && $(wc -c < "$WORK_DIR/texts/OWASP_Mobile_Top10.txt") -gt 500 ]]; then
+        upload_file "$WORK_DIR/texts/OWASP_Mobile_Top10.txt" "$col" "compliance_datenschutz" "guidance" "2024" \
+            '{"regulation_id":"owasp_mobile_top10","source_id":"owasp","doc_type":"risk_catalog","guideline_name":"OWASP Mobile Top 10","license":"CC_BY-SA","attribution":"OWASP Foundation","source":"owasp.org"}' \
+            "OWASP Mobile Top 10"
+    fi
+
+    after=$(collection_count "$col")
+    diff=$(( after - before ))
+    log "Collection $col (OWASP): ${before} → ${after} chunks (+${diff})"
+
+    # =========================================================================
+    # J3: ENISA Guidelines (CC BY 4.0)
+    # → bp_compliance_ce (EU-Content)
+    # =========================================================================
+    col="bp_compliance_ce"
+    before=$(collection_count "$col")
+    log "--- J3: ENISA Guidelines → $col ($before chunks) ---"
+
+    # ENISA Procurement Guidelines for Cybersecurity in Hospitals
+    download_pdf \
+        "https://www.enisa.europa.eu/publications/good-practices-for-the-security-of-healthcare-services/@@download/fullReport" \
+        "$WORK_DIR/pdfs/ENISA_Procurement_Hospitals.pdf" || true
+    # Fallback URL if the above doesn't work
+    if [[ ! -f "$WORK_DIR/pdfs/ENISA_Procurement_Hospitals.pdf" ]]; then
+        download_pdf \
+            "https://www.enisa.europa.eu/publications/procurement-guidelines-for-cybersecurity-in-hospitals/@@download/fullReport" \
+            "$WORK_DIR/pdfs/ENISA_Procurement_Hospitals.pdf" || true
+    fi
+    if [[ -f "$WORK_DIR/pdfs/ENISA_Procurement_Hospitals.pdf" ]]; then
+        upload_file "$WORK_DIR/pdfs/ENISA_Procurement_Hospitals.pdf" "$col" "compliance_ce" "guidance" "2024" \
+            '{"regulation_id":"enisa_procurement_hospitals","source_id":"enisa","doc_type":"procurement_guidelines","guideline_name":"ENISA Procurement Guidelines for Cybersecurity in Hospitals","license":"CC_BY_4.0","attribution":"European Union Agency for Cybersecurity (ENISA)","source":"enisa.europa.eu"}' \
+            "ENISA Procurement Guidelines Hospitals"
+    fi
+
+    # ENISA Cloud Security Guide for SMEs
+    download_pdf \
+        "https://www.enisa.europa.eu/publications/cloud-security-guide-for-smes/@@download/fullReport" \
+        "$WORK_DIR/pdfs/ENISA_Cloud_Security_SMEs.pdf" || true
+    if [[ ! -f "$WORK_DIR/pdfs/ENISA_Cloud_Security_SMEs.pdf" ]]; then
+        download_pdf \
+            "https://www.enisa.europa.eu/publications/cloud-security-guide-for-small-and-medium-sized-enterprises/@@download/fullReport" \
+            "$WORK_DIR/pdfs/ENISA_Cloud_Security_SMEs.pdf" || true
+    fi
+    if [[ -f "$WORK_DIR/pdfs/ENISA_Cloud_Security_SMEs.pdf" ]]; then
+        upload_file "$WORK_DIR/pdfs/ENISA_Cloud_Security_SMEs.pdf" "$col" "compliance_ce" "guidance" "2015" \
+            '{"regulation_id":"enisa_cloud_smes","source_id":"enisa","doc_type":"security_guide","guideline_name":"ENISA Cloud Security Guide for SMEs","license":"CC_BY_4.0","attribution":"European Union Agency for Cybersecurity (ENISA)","source":"enisa.europa.eu"}' \
+            "ENISA Cloud Security Guide SMEs"
+    fi
+
+    after=$(collection_count "$col")
+    diff=$(( after - before ))
+    log "Collection $col (ENISA): ${before} → ${after} chunks (+${diff})"
+
+    # =========================================================================
+    # Summary
+    # =========================================================================
+    echo ""
+    log "Phase J abgeschlossen."
+    log "Ingestiert: NIST (7 Standards), OWASP (6 Projekte), ENISA (2 Guides)"
+    log ""
+    log "NICHT ingestiert (lizenzrechtlich ausgeschlossen):"
+    log " BSI TR-03161-1/2/3, TR-03109, TR-03116, TR-03125 (Lizenz unklar)"
+    log " BSI C5:2020, Standards 200-1/2/3/4 (Lizenz unklar)"
+    log " BSI Grundschutz 2023 (Alle Rechte vorbehalten)"
+    log " ETSI EN 303 645 + TS/TR (Copyright, Reproduktion verboten)"
+    log " ISO/IEC 27001, 27002, 27701 (paywalled)"
+    log " ISO/SAE 21434 (paywalled)"
+    log " UN R155, R156 (non-commercial only)"
+    log " IEC 62304, 81001-5-1 (paywalled + KI-Verbot)"
+    log " CSA CCM/CAIQ (non-commercial)"
+    log " CIS Controls v8.1 (CC BY-NC-ND)"
+    log " MDCG 2019-16 (Lizenz unklar)"
+}
+
 # =============================================================================
 # PHASE F: Verifizierung
 # =============================================================================
@@ -1647,6 +1990,7 @@ main() {
             datenschutz) phase_datenschutz ;;
             verbraucherschutz) phase_verbraucherschutz ;;
             dach) phase_dach ;;
+            security) phase_security ;;
             verify) phase_verify ;;
             version) phase_register_version ;;
             *) fail "Unknown phase: $ONLY_PHASE"; exit 1 ;;
@@ -1670,6 +2014,8 @@ main() {
         echo ""
         phase_dach
         echo ""
+        phase_security
+        echo ""
         phase_verify
         echo ""
         phase_register_version