fix(rag): Make download failures non-fatal — prevent set -e from aborting entire ingestion
All checks were successful
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Successful in 43s
CI/CD / test-python-backend-compliance (push) Successful in 38s
CI/CD / test-python-document-crawler (push) Successful in 30s
CI/CD / test-python-dsms-gateway (push) Successful in 23s
CI/CD / deploy-hetzner (push) Successful in 17s
All checks were successful
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Successful in 43s
CI/CD / test-python-backend-compliance (push) Successful in 38s
CI/CD / test-python-document-crawler (push) Successful in 30s
CI/CD / test-python-dsms-gateway (push) Successful in 23s
CI/CD / deploy-hetzner (push) Successful in 17s
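download_pdf() and extract_gesetz_html() now return 0 on failure and remove any partially written output file. This prevents `set -euo pipefail` from aborting the entire script when a single download fails (e.g. EUR-Lex timeout, BSI redirect). Call sites additionally append `|| true`, and several upload_file calls that previously ran unconditionally after a download are now guarded by a file-existence check.

Root cause of the H2 EU loop processing only 1 document in Run #724: the first failed download_pdf call returned 1, which triggered the `set -e` abort of the whole script.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>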
This commit is contained in:
@@ -158,8 +158,16 @@ download_pdf() {
|
||||
log "Downloading: $(basename "$target")"
|
||||
curl $CURL_OPTS -L "$url" -o "$target" 2>/dev/null || {
|
||||
warn "Download failed: $url"
|
||||
return 1
|
||||
rm -f "$target"
|
||||
return 0
|
||||
}
|
||||
# Verify file is not empty/too small (e.g. HTML error page)
|
||||
local fsize
|
||||
fsize=$(stat -f%z "$target" 2>/dev/null || stat -c%s "$target" 2>/dev/null || echo 0)
|
||||
if [[ "$fsize" -lt 1000 ]]; then
|
||||
warn "Download too small (${fsize}B), likely error page: $(basename "$target")"
|
||||
rm -f "$target"
|
||||
fi
|
||||
}
|
||||
|
||||
# Extract text from gesetze-im-internet.de HTML page
|
||||
@@ -212,7 +220,8 @@ parser.feed(sys.stdin.read())
|
||||
print(''.join(parser.text).strip())
|
||||
" > "$output" || {
|
||||
warn "Extraction failed: $label"
|
||||
return 1
|
||||
rm -f "$output"
|
||||
return 0
|
||||
}
|
||||
}
|
||||
|
||||
@@ -271,15 +280,15 @@ phase_download() {
|
||||
|
||||
download_pdf \
|
||||
"https://eur-lex.europa.eu/legal-content/DE/TXT/PDF/?uri=CELEX:32022R2065" \
|
||||
"$WORK_DIR/pdfs/dsa_2022_2065.pdf"
|
||||
"$WORK_DIR/pdfs/dsa_2022_2065.pdf" || true
|
||||
|
||||
download_pdf \
|
||||
"https://eur-lex.europa.eu/legal-content/DE/TXT/PDF/?uri=CELEX:32002L0058" \
|
||||
"$WORK_DIR/pdfs/eprivacy_2002_58.pdf"
|
||||
"$WORK_DIR/pdfs/eprivacy_2002_58.pdf" || true
|
||||
|
||||
download_pdf \
|
||||
"https://eur-lex.europa.eu/legal-content/DE/TXT/PDF/?uri=CELEX:32021D0914" \
|
||||
"$WORK_DIR/pdfs/scc_2021_914.pdf"
|
||||
"$WORK_DIR/pdfs/scc_2021_914.pdf" || true
|
||||
|
||||
# --- A2: Deutsche Gesetze (Einzelparagraphen) ---
|
||||
log "--- Deutsche Gesetze (Einzelparagraphen) ---"
|
||||
@@ -851,7 +860,7 @@ phase_verbraucherschutz() {
|
||||
|
||||
download_pdf \
|
||||
"https://www.gesetze-im-internet.de/${path}.pdf" \
|
||||
"$pdf_file"
|
||||
"$pdf_file" || true
|
||||
|
||||
if [[ -f "$pdf_file" ]]; then
|
||||
upload_file "$pdf_file" "$col" "compliance" "legal_reference" "2025" \
|
||||
@@ -1111,11 +1120,11 @@ with open('$WORK_DIR/pdfs/EGBGB_Widerruf.txt', 'w') as f:
|
||||
if [[ "$celex" == "32024R1689" ]]; then
|
||||
download_pdf \
|
||||
"https://eur-lex.europa.eu/legal-content/DE/TXT/PDF/?uri=OJ:L_202401689" \
|
||||
"$pdf_file"
|
||||
"$pdf_file" || true
|
||||
else
|
||||
download_pdf \
|
||||
"https://eur-lex.europa.eu/legal-content/DE/TXT/PDF/?uri=CELEX:${celex}" \
|
||||
"$pdf_file"
|
||||
"$pdf_file" || true
|
||||
fi
|
||||
|
||||
if [[ -f "$pdf_file" ]]; then
|
||||
@@ -1140,7 +1149,7 @@ with open('$WORK_DIR/pdfs/EGBGB_Widerruf.txt', 'w') as f:
|
||||
# NIST Cybersecurity Framework 2.0
|
||||
download_pdf \
|
||||
"https://nvlpubs.nist.gov/nistpubs/CSWP/NIST.CSWP.29.pdf" \
|
||||
"$WORK_DIR/pdfs/NIST_CSF_2.0.pdf"
|
||||
"$WORK_DIR/pdfs/NIST_CSF_2.0.pdf" || true
|
||||
if [[ -f "$WORK_DIR/pdfs/NIST_CSF_2.0.pdf" ]]; then
|
||||
upload_file "$WORK_DIR/pdfs/NIST_CSF_2.0.pdf" "$col" "compliance_datenschutz" "guidance" "2024" \
|
||||
'{"source_id":"nist","doc_type":"framework","guideline_name":"NIST Cybersecurity Framework 2.0","license":"public_domain_us_gov","attribution":"National Institute of Standards and Technology (NIST)","source":"nist.gov"}' \
|
||||
@@ -1150,7 +1159,7 @@ with open('$WORK_DIR/pdfs/EGBGB_Widerruf.txt', 'w') as f:
|
||||
# NIST Privacy Framework 1.0
|
||||
download_pdf \
|
||||
"https://nvlpubs.nist.gov/nistpubs/CSWP/NIST.CSWP.01162020.pdf" \
|
||||
"$WORK_DIR/pdfs/NIST_Privacy_Framework.pdf"
|
||||
"$WORK_DIR/pdfs/NIST_Privacy_Framework.pdf" || true
|
||||
if [[ -f "$WORK_DIR/pdfs/NIST_Privacy_Framework.pdf" ]]; then
|
||||
upload_file "$WORK_DIR/pdfs/NIST_Privacy_Framework.pdf" "$col" "compliance_datenschutz" "guidance" "2020" \
|
||||
'{"source_id":"nist","doc_type":"framework","guideline_name":"NIST Privacy Framework 1.0","license":"public_domain_us_gov","attribution":"National Institute of Standards and Technology (NIST)","source":"nist.gov"}' \
|
||||
@@ -1160,7 +1169,7 @@ with open('$WORK_DIR/pdfs/EGBGB_Widerruf.txt', 'w') as f:
|
||||
# HLEG Ethics Guidelines for Trustworthy AI
|
||||
download_pdf \
|
||||
"https://op.europa.eu/en/publication-detail/-/publication/d3988569-0434-11ea-8c1f-01aa75ed71a1/language-en/format-PDF" \
|
||||
"$WORK_DIR/pdfs/hleg_trustworthy_ai.pdf"
|
||||
"$WORK_DIR/pdfs/hleg_trustworthy_ai.pdf" || true
|
||||
if [[ -f "$WORK_DIR/pdfs/hleg_trustworthy_ai.pdf" ]]; then
|
||||
upload_file "$WORK_DIR/pdfs/hleg_trustworthy_ai.pdf" "$col" "compliance_datenschutz" "guidance" "2019" \
|
||||
'{"source_id":"hleg","doc_type":"ethics_guidelines","guideline_name":"Ethics Guidelines for Trustworthy AI","license":"CC_BY_4.0","attribution":"High-Level Expert Group on AI (HLEG)","source":"op.europa.eu"}' \
|
||||
@@ -1202,10 +1211,12 @@ phase_dach() {
|
||||
log "--- I1: Fehlende DE-Gesetze → $col ($before chunks) ---"
|
||||
|
||||
# UStG (Umsatzsteuergesetz) — Retention/Rechnungspflichten
|
||||
download_pdf "https://www.gesetze-im-internet.de/ustg_1980/UStG.pdf" "$WORK_DIR/pdfs/UStG.pdf"
|
||||
upload_file "$WORK_DIR/pdfs/UStG.pdf" "$col" "compliance" "legal_reference" "2025" \
|
||||
'{"regulation_id":"ustg","regulation_name_de":"Umsatzsteuergesetz (UStG)","category":"steuerrecht","license":"public_domain_§5_UrhG","source":"gesetze-im-internet.de"}' \
|
||||
"UStG (Umsatzsteuergesetz)"
|
||||
download_pdf "https://www.gesetze-im-internet.de/ustg_1980/UStG.pdf" "$WORK_DIR/pdfs/UStG.pdf" || true
|
||||
if [[ -f "$WORK_DIR/pdfs/UStG.pdf" ]]; then
|
||||
upload_file "$WORK_DIR/pdfs/UStG.pdf" "$col" "compliance" "legal_reference" "2025" \
|
||||
'{"regulation_id":"ustg","regulation_name_de":"Umsatzsteuergesetz (UStG)","category":"steuerrecht","license":"public_domain_§5_UrhG","source":"gesetze-im-internet.de"}' \
|
||||
"UStG (Umsatzsteuergesetz)"
|
||||
fi
|
||||
|
||||
# MStV § 18 — Medienstaatsvertrag (Impressumspflicht)
|
||||
# Nicht als PDF auf gesetze-im-internet.de, daher von Bayern
|
||||
@@ -1234,26 +1245,32 @@ phase_dach() {
|
||||
# DSK Muster-VVT (Art. 30 DSGVO)
|
||||
download_pdf \
|
||||
"https://www.datenschutzkonferenz-online.de/media/ah/201802_ah_muster_verantwortliche.pdf" \
|
||||
"$WORK_DIR/pdfs/DSK_Muster_VVT.pdf"
|
||||
upload_file "$WORK_DIR/pdfs/DSK_Muster_VVT.pdf" "$col" "compliance_template" "template" "2018" \
|
||||
'{"regulation_id":"dsk_muster_vvt","regulation_name_de":"DSK Muster-VVT (Art. 30 DSGVO)","category":"datenschutz","license":"DL-DE/BY-2.0","attribution":"Datenschutzkonferenz (DSK)","source":"datenschutzkonferenz-online.de"}' \
|
||||
"DSK Muster-VVT"
|
||||
"$WORK_DIR/pdfs/DSK_Muster_VVT.pdf" || true
|
||||
if [[ -f "$WORK_DIR/pdfs/DSK_Muster_VVT.pdf" ]]; then
|
||||
upload_file "$WORK_DIR/pdfs/DSK_Muster_VVT.pdf" "$col" "compliance_template" "template" "2018" \
|
||||
'{"regulation_id":"dsk_muster_vvt","regulation_name_de":"DSK Muster-VVT (Art. 30 DSGVO)","category":"datenschutz","license":"DL-DE/BY-2.0","attribution":"Datenschutzkonferenz (DSK)","source":"datenschutzkonferenz-online.de"}' \
|
||||
"DSK Muster-VVT"
|
||||
fi
|
||||
|
||||
# DSK Kurzpapier Nr. 5 DSFA
|
||||
download_pdf \
|
||||
"https://www.datenschutzkonferenz-online.de/media/kp/dsk_kpnr_5.pdf" \
|
||||
"$WORK_DIR/pdfs/DSK_KP5_DSFA.pdf"
|
||||
upload_file "$WORK_DIR/pdfs/DSK_KP5_DSFA.pdf" "$col" "compliance_template" "guidance" "2018" \
|
||||
'{"regulation_id":"dsk_kp5_dsfa","regulation_name_de":"DSK Kurzpapier Nr. 5 Datenschutz-Folgenabschaetzung","category":"datenschutz","license":"DL-DE/BY-2.0","attribution":"Datenschutzkonferenz (DSK)","source":"datenschutzkonferenz-online.de"}' \
|
||||
"DSK Kurzpapier Nr. 5 DSFA"
|
||||
"$WORK_DIR/pdfs/DSK_KP5_DSFA.pdf" || true
|
||||
if [[ -f "$WORK_DIR/pdfs/DSK_KP5_DSFA.pdf" ]]; then
|
||||
upload_file "$WORK_DIR/pdfs/DSK_KP5_DSFA.pdf" "$col" "compliance_template" "guidance" "2018" \
|
||||
'{"regulation_id":"dsk_kp5_dsfa","regulation_name_de":"DSK Kurzpapier Nr. 5 Datenschutz-Folgenabschaetzung","category":"datenschutz","license":"DL-DE/BY-2.0","attribution":"Datenschutzkonferenz (DSK)","source":"datenschutzkonferenz-online.de"}' \
|
||||
"DSK Kurzpapier Nr. 5 DSFA"
|
||||
fi
|
||||
|
||||
# BfDI Beispiel-VVT
|
||||
download_pdf \
|
||||
"https://www.bfdi.bund.de/SharedDocs/Downloads/DE/DokumenteBfDI/AccessForAll/2023/2023_Verzeichnis-Verarbeitungst%C3%A4tigkeiten.pdf?__blob=publicationFile&v=2" \
|
||||
"$WORK_DIR/pdfs/BfDI_Beispiel_VVT.pdf"
|
||||
upload_file "$WORK_DIR/pdfs/BfDI_Beispiel_VVT.pdf" "$col" "compliance_template" "template" "2023" \
|
||||
'{"regulation_id":"bfdi_beispiel_vvt","regulation_name_de":"BfDI Beispiel-VVT mit Loeschfristen","category":"datenschutz","license":"DL-DE/BY-2.0","attribution":"Bundesbeauftragter fuer den Datenschutz und die Informationsfreiheit (BfDI)","source":"bfdi.bund.de"}' \
|
||||
"BfDI Beispiel-VVT"
|
||||
"$WORK_DIR/pdfs/BfDI_Beispiel_VVT.pdf" || true
|
||||
if [[ -f "$WORK_DIR/pdfs/BfDI_Beispiel_VVT.pdf" ]]; then
|
||||
upload_file "$WORK_DIR/pdfs/BfDI_Beispiel_VVT.pdf" "$col" "compliance_template" "template" "2023" \
|
||||
'{"regulation_id":"bfdi_beispiel_vvt","regulation_name_de":"BfDI Beispiel-VVT mit Loeschfristen","category":"datenschutz","license":"DL-DE/BY-2.0","attribution":"Bundesbeauftragter fuer den Datenschutz und die Informationsfreiheit (BfDI)","source":"bfdi.bund.de"}' \
|
||||
"BfDI Beispiel-VVT"
|
||||
fi
|
||||
|
||||
after=$(collection_count "$col")
|
||||
diff=$(( after - before ))
|
||||
@@ -1270,7 +1287,7 @@ phase_dach() {
|
||||
# BSI IT-Grundschutz Kompendium (PDF)
|
||||
download_pdf \
|
||||
"https://www.bsi.bund.de/SharedDocs/Downloads/DE/BSI/Grundschutz/Kompendium/IT_Grundschutz_Kompendium_Edition2024.pdf?__blob=publicationFile&v=4" \
|
||||
"$WORK_DIR/pdfs/BSI_Grundschutz_2024.pdf"
|
||||
"$WORK_DIR/pdfs/BSI_Grundschutz_2024.pdf" || true
|
||||
if [[ -f "$WORK_DIR/pdfs/BSI_Grundschutz_2024.pdf" ]]; then
|
||||
upload_file "$WORK_DIR/pdfs/BSI_Grundschutz_2024.pdf" "$col" "compliance" "guidance" "2024" \
|
||||
'{"regulation_id":"bsi_grundschutz_2024","regulation_name_de":"BSI IT-Grundschutz Kompendium 2024","category":"informationssicherheit","license":"CC_BY-SA_4.0","attribution":"Bundesamt fuer Sicherheit in der Informationstechnik (BSI)","source":"bsi.bund.de"}' \
|
||||
@@ -1401,10 +1418,12 @@ with open('$WORK_DIR/texts/OGH_6Ob102_24d.txt', 'w') as f:
|
||||
# CH: BVGer B-915/2022 (BVGE 2024 IV/2 — DSG-Auskunft vs. Akteneinsicht)
|
||||
download_pdf \
|
||||
"https://jurispub.admin.ch/publiws/download?decisionId=ed176fe0-fb98-425f-8ded-ca9da49c816b" \
|
||||
"$WORK_DIR/pdfs/BVGE_2024_IV_2.pdf"
|
||||
upload_file "$WORK_DIR/pdfs/BVGE_2024_IV_2.pdf" "$col" "compliance_datenschutz" "case_law" "2024" \
|
||||
'{"regulation_id":"bvge_2024_iv_2","doc_type":"court_decision","court":"BVGer","case_number":"B-915/2022","date":"2024-04-03","topic":"DSG-Auskunft vs. Akteneinsicht CH","country":"CH","license":"public_domain_Art5_URG","source":"jurispub.admin.ch"}' \
|
||||
"BVGer B-915/2022 (DSG-Auskunft vs. Akteneinsicht)"
|
||||
"$WORK_DIR/pdfs/BVGE_2024_IV_2.pdf" || true
|
||||
if [[ -f "$WORK_DIR/pdfs/BVGE_2024_IV_2.pdf" ]]; then
|
||||
upload_file "$WORK_DIR/pdfs/BVGE_2024_IV_2.pdf" "$col" "compliance_datenschutz" "case_law" "2024" \
|
||||
'{"regulation_id":"bvge_2024_iv_2","doc_type":"court_decision","court":"BVGer","case_number":"B-915/2022","date":"2024-04-03","topic":"DSG-Auskunft vs. Akteneinsicht CH","country":"CH","license":"public_domain_Art5_URG","source":"jurispub.admin.ch"}' \
|
||||
"BVGer B-915/2022 (DSG-Auskunft vs. Akteneinsicht)"
|
||||
fi
|
||||
|
||||
# CH: BGer 1C 562/2024 (Datensperre/Anonymisierung)
|
||||
curl -sL "https://relevancy.bger.ch/cgi-bin/JumpCGI?id=13.01.2025_1C_562%2F2024" \
|
||||
|
||||
Reference in New Issue
Block a user