diff --git a/scripts/ingest-legal-corpus.sh b/scripts/ingest-legal-corpus.sh index 6cc802a..0ad5160 100755 --- a/scripts/ingest-legal-corpus.sh +++ b/scripts/ingest-legal-corpus.sh @@ -158,8 +158,16 @@ download_pdf() { log "Downloading: $(basename "$target")" curl $CURL_OPTS -L "$url" -o "$target" 2>/dev/null || { warn "Download failed: $url" - return 1 + rm -f "$target" + return 0 } + # Verify file is not empty/too small (e.g. HTML error page) + local fsize + fsize=$(stat -f%z "$target" 2>/dev/null || stat -c%s "$target" 2>/dev/null || echo 0) + if [[ "$fsize" -lt 1000 ]]; then + warn "Download too small (${fsize}B), likely error page: $(basename "$target")" + rm -f "$target" + fi } # Extract text from gesetze-im-internet.de HTML page @@ -212,7 +220,8 @@ parser.feed(sys.stdin.read()) print(''.join(parser.text).strip()) " > "$output" || { warn "Extraction failed: $label" - return 1 + rm -f "$output" + return 0 } } @@ -271,15 +280,15 @@ phase_download() { download_pdf \ "https://eur-lex.europa.eu/legal-content/DE/TXT/PDF/?uri=CELEX:32022R2065" \ - "$WORK_DIR/pdfs/dsa_2022_2065.pdf" + "$WORK_DIR/pdfs/dsa_2022_2065.pdf" || true download_pdf \ "https://eur-lex.europa.eu/legal-content/DE/TXT/PDF/?uri=CELEX:32002L0058" \ - "$WORK_DIR/pdfs/eprivacy_2002_58.pdf" + "$WORK_DIR/pdfs/eprivacy_2002_58.pdf" || true download_pdf \ "https://eur-lex.europa.eu/legal-content/DE/TXT/PDF/?uri=CELEX:32021D0914" \ - "$WORK_DIR/pdfs/scc_2021_914.pdf" + "$WORK_DIR/pdfs/scc_2021_914.pdf" || true # --- A2: Deutsche Gesetze (Einzelparagraphen) --- log "--- Deutsche Gesetze (Einzelparagraphen) ---" @@ -851,7 +860,7 @@ phase_verbraucherschutz() { download_pdf \ "https://www.gesetze-im-internet.de/${path}.pdf" \ - "$pdf_file" + "$pdf_file" || true if [[ -f "$pdf_file" ]]; then upload_file "$pdf_file" "$col" "compliance" "legal_reference" "2025" \ @@ -1111,11 +1120,11 @@ with open('$WORK_DIR/pdfs/EGBGB_Widerruf.txt', 'w') as f: if [[ "$celex" == "32024R1689" ]]; then download_pdf \ "https://eur-lex.europa.eu/legal-content/DE/TXT/PDF/?uri=OJ:L_202401689" \ - "$pdf_file" + "$pdf_file" || true else download_pdf \ "https://eur-lex.europa.eu/legal-content/DE/TXT/PDF/?uri=CELEX:${celex}" \ - "$pdf_file" + "$pdf_file" || true fi if [[ -f "$pdf_file" ]]; then @@ -1140,7 +1149,7 @@ with open('$WORK_DIR/pdfs/EGBGB_Widerruf.txt', 'w') as f: # NIST Cybersecurity Framework 2.0 download_pdf \ "https://nvlpubs.nist.gov/nistpubs/CSWP/NIST.CSWP.29.pdf" \ - "$WORK_DIR/pdfs/NIST_CSF_2.0.pdf" + "$WORK_DIR/pdfs/NIST_CSF_2.0.pdf" || true if [[ -f "$WORK_DIR/pdfs/NIST_CSF_2.0.pdf" ]]; then upload_file "$WORK_DIR/pdfs/NIST_CSF_2.0.pdf" "$col" "compliance_datenschutz" "guidance" "2024" \ '{"source_id":"nist","doc_type":"framework","guideline_name":"NIST Cybersecurity Framework 2.0","license":"public_domain_us_gov","attribution":"National Institute of Standards and Technology (NIST)","source":"nist.gov"}' \ @@ -1150,7 +1159,7 @@ with open('$WORK_DIR/pdfs/EGBGB_Widerruf.txt', 'w') as f: # NIST Privacy Framework 1.0 download_pdf \ "https://nvlpubs.nist.gov/nistpubs/CSWP/NIST.CSWP.01162020.pdf" \ - "$WORK_DIR/pdfs/NIST_Privacy_Framework.pdf" + "$WORK_DIR/pdfs/NIST_Privacy_Framework.pdf" || true if [[ -f "$WORK_DIR/pdfs/NIST_Privacy_Framework.pdf" ]]; then upload_file "$WORK_DIR/pdfs/NIST_Privacy_Framework.pdf" "$col" "compliance_datenschutz" "guidance" "2020" \ '{"source_id":"nist","doc_type":"framework","guideline_name":"NIST Privacy Framework 1.0","license":"public_domain_us_gov","attribution":"National Institute of Standards and Technology (NIST)","source":"nist.gov"}' \ @@ -1160,7 +1169,7 @@ with open('$WORK_DIR/pdfs/EGBGB_Widerruf.txt', 'w') as f: # HLEG Ethics Guidelines for Trustworthy AI download_pdf \ "https://op.europa.eu/en/publication-detail/-/publication/d3988569-0434-11ea-8c1f-01aa75ed71a1/language-en/format-PDF" \ - "$WORK_DIR/pdfs/hleg_trustworthy_ai.pdf" + "$WORK_DIR/pdfs/hleg_trustworthy_ai.pdf" || true if [[ -f "$WORK_DIR/pdfs/hleg_trustworthy_ai.pdf" ]]; then upload_file "$WORK_DIR/pdfs/hleg_trustworthy_ai.pdf" "$col" "compliance_datenschutz" "guidance" "2019" \ '{"source_id":"hleg","doc_type":"ethics_guidelines","guideline_name":"Ethics Guidelines for Trustworthy AI","license":"CC_BY_4.0","attribution":"High-Level Expert Group on AI (HLEG)","source":"op.europa.eu"}' \ @@ -1202,10 +1211,12 @@ phase_dach() { log "--- I1: Fehlende DE-Gesetze → $col ($before chunks) ---" # UStG (Umsatzsteuergesetz) — Retention/Rechnungspflichten - download_pdf "https://www.gesetze-im-internet.de/ustg_1980/UStG.pdf" "$WORK_DIR/pdfs/UStG.pdf" - upload_file "$WORK_DIR/pdfs/UStG.pdf" "$col" "compliance" "legal_reference" "2025" \ - '{"regulation_id":"ustg","regulation_name_de":"Umsatzsteuergesetz (UStG)","category":"steuerrecht","license":"public_domain_§5_UrhG","source":"gesetze-im-internet.de"}' \ - "UStG (Umsatzsteuergesetz)" + download_pdf "https://www.gesetze-im-internet.de/ustg_1980/UStG.pdf" "$WORK_DIR/pdfs/UStG.pdf" || true + if [[ -f "$WORK_DIR/pdfs/UStG.pdf" ]]; then + upload_file "$WORK_DIR/pdfs/UStG.pdf" "$col" "compliance" "legal_reference" "2025" \ + '{"regulation_id":"ustg","regulation_name_de":"Umsatzsteuergesetz (UStG)","category":"steuerrecht","license":"public_domain_§5_UrhG","source":"gesetze-im-internet.de"}' \ + "UStG (Umsatzsteuergesetz)" + fi # MStV § 18 — Medienstaatsvertrag (Impressumspflicht) # Nicht als PDF auf gesetze-im-internet.de, daher von Bayern @@ -1234,26 +1245,32 @@ phase_dach() { # DSK Muster-VVT (Art. 30 DSGVO) download_pdf \ "https://www.datenschutzkonferenz-online.de/media/ah/201802_ah_muster_verantwortliche.pdf" \ - "$WORK_DIR/pdfs/DSK_Muster_VVT.pdf" - upload_file "$WORK_DIR/pdfs/DSK_Muster_VVT.pdf" "$col" "compliance_template" "template" "2018" \ - '{"regulation_id":"dsk_muster_vvt","regulation_name_de":"DSK Muster-VVT (Art. 30 DSGVO)","category":"datenschutz","license":"DL-DE/BY-2.0","attribution":"Datenschutzkonferenz (DSK)","source":"datenschutzkonferenz-online.de"}' \ - "DSK Muster-VVT" + "$WORK_DIR/pdfs/DSK_Muster_VVT.pdf" || true + if [[ -f "$WORK_DIR/pdfs/DSK_Muster_VVT.pdf" ]]; then + upload_file "$WORK_DIR/pdfs/DSK_Muster_VVT.pdf" "$col" "compliance_template" "template" "2018" \ + '{"regulation_id":"dsk_muster_vvt","regulation_name_de":"DSK Muster-VVT (Art. 30 DSGVO)","category":"datenschutz","license":"DL-DE/BY-2.0","attribution":"Datenschutzkonferenz (DSK)","source":"datenschutzkonferenz-online.de"}' \ + "DSK Muster-VVT" + fi # DSK Kurzpapier Nr. 5 DSFA download_pdf \ "https://www.datenschutzkonferenz-online.de/media/kp/dsk_kpnr_5.pdf" \ - "$WORK_DIR/pdfs/DSK_KP5_DSFA.pdf" - upload_file "$WORK_DIR/pdfs/DSK_KP5_DSFA.pdf" "$col" "compliance_template" "guidance" "2018" \ - '{"regulation_id":"dsk_kp5_dsfa","regulation_name_de":"DSK Kurzpapier Nr. 5 Datenschutz-Folgenabschaetzung","category":"datenschutz","license":"DL-DE/BY-2.0","attribution":"Datenschutzkonferenz (DSK)","source":"datenschutzkonferenz-online.de"}' \ - "DSK Kurzpapier Nr. 5 DSFA" + "$WORK_DIR/pdfs/DSK_KP5_DSFA.pdf" || true + if [[ -f "$WORK_DIR/pdfs/DSK_KP5_DSFA.pdf" ]]; then + upload_file "$WORK_DIR/pdfs/DSK_KP5_DSFA.pdf" "$col" "compliance_template" "guidance" "2018" \ + '{"regulation_id":"dsk_kp5_dsfa","regulation_name_de":"DSK Kurzpapier Nr. 5 Datenschutz-Folgenabschaetzung","category":"datenschutz","license":"DL-DE/BY-2.0","attribution":"Datenschutzkonferenz (DSK)","source":"datenschutzkonferenz-online.de"}' \ + "DSK Kurzpapier Nr. 5 DSFA" + fi # BfDI Beispiel-VVT download_pdf \ "https://www.bfdi.bund.de/SharedDocs/Downloads/DE/DokumenteBfDI/AccessForAll/2023/2023_Verzeichnis-Verarbeitungst%C3%A4tigkeiten.pdf?__blob=publicationFile&v=2" \ - "$WORK_DIR/pdfs/BfDI_Beispiel_VVT.pdf" - upload_file "$WORK_DIR/pdfs/BfDI_Beispiel_VVT.pdf" "$col" "compliance_template" "template" "2023" \ - '{"regulation_id":"bfdi_beispiel_vvt","regulation_name_de":"BfDI Beispiel-VVT mit Loeschfristen","category":"datenschutz","license":"DL-DE/BY-2.0","attribution":"Bundesbeauftragter fuer den Datenschutz und die Informationsfreiheit (BfDI)","source":"bfdi.bund.de"}' \ - "BfDI Beispiel-VVT" + "$WORK_DIR/pdfs/BfDI_Beispiel_VVT.pdf" || true + if [[ -f "$WORK_DIR/pdfs/BfDI_Beispiel_VVT.pdf" ]]; then + upload_file "$WORK_DIR/pdfs/BfDI_Beispiel_VVT.pdf" "$col" "compliance_template" "template" "2023" \ + '{"regulation_id":"bfdi_beispiel_vvt","regulation_name_de":"BfDI Beispiel-VVT mit Loeschfristen","category":"datenschutz","license":"DL-DE/BY-2.0","attribution":"Bundesbeauftragter fuer den Datenschutz und die Informationsfreiheit (BfDI)","source":"bfdi.bund.de"}' \ + "BfDI Beispiel-VVT" + fi after=$(collection_count "$col") diff=$(( after - before )) @@ -1270,7 +1287,7 @@ phase_dach() { # BSI IT-Grundschutz Kompendium (PDF) download_pdf \ "https://www.bsi.bund.de/SharedDocs/Downloads/DE/BSI/Grundschutz/Kompendium/IT_Grundschutz_Kompendium_Edition2024.pdf?__blob=publicationFile&v=4" \ - "$WORK_DIR/pdfs/BSI_Grundschutz_2024.pdf" + "$WORK_DIR/pdfs/BSI_Grundschutz_2024.pdf" || true if [[ -f "$WORK_DIR/pdfs/BSI_Grundschutz_2024.pdf" ]]; then upload_file "$WORK_DIR/pdfs/BSI_Grundschutz_2024.pdf" "$col" "compliance" "guidance" "2024" \ '{"regulation_id":"bsi_grundschutz_2024","regulation_name_de":"BSI IT-Grundschutz Kompendium 2024","category":"informationssicherheit","license":"CC_BY-SA_4.0","attribution":"Bundesamt fuer Sicherheit in der Informationstechnik (BSI)","source":"bsi.bund.de"}' \ @@ -1401,10 +1418,12 @@ with open('$WORK_DIR/texts/OGH_6Ob102_24d.txt', 'w') as f: # CH: BVGer B-915/2022 (BVGE 2024 IV/2 — DSG-Auskunft vs. Akteneinsicht) download_pdf \ "https://jurispub.admin.ch/publiws/download?decisionId=ed176fe0-fb98-425f-8ded-ca9da49c816b" \ - "$WORK_DIR/pdfs/BVGE_2024_IV_2.pdf" - upload_file "$WORK_DIR/pdfs/BVGE_2024_IV_2.pdf" "$col" "compliance_datenschutz" "case_law" "2024" \ - '{"regulation_id":"bvge_2024_iv_2","doc_type":"court_decision","court":"BVGer","case_number":"B-915/2022","date":"2024-04-03","topic":"DSG-Auskunft vs. Akteneinsicht CH","country":"CH","license":"public_domain_Art5_URG","source":"jurispub.admin.ch"}' \ - "BVGer B-915/2022 (DSG-Auskunft vs. Akteneinsicht)" + "$WORK_DIR/pdfs/BVGE_2024_IV_2.pdf" || true + if [[ -f "$WORK_DIR/pdfs/BVGE_2024_IV_2.pdf" ]]; then + upload_file "$WORK_DIR/pdfs/BVGE_2024_IV_2.pdf" "$col" "compliance_datenschutz" "case_law" "2024" \ + '{"regulation_id":"bvge_2024_iv_2","doc_type":"court_decision","court":"BVGer","case_number":"B-915/2022","date":"2024-04-03","topic":"DSG-Auskunft vs. Akteneinsicht CH","country":"CH","license":"public_domain_Art5_URG","source":"jurispub.admin.ch"}' \ + "BVGer B-915/2022 (DSG-Auskunft vs. Akteneinsicht)" + fi # CH: BGer 1C 562/2024 (Datensperre/Anonymisierung) curl -sL "https://relevancy.bger.ch/cgi-bin/JumpCGI?id=13.01.2025_1C_562%2F2024" \