From 42ec3cad6de4b5fa7553283d7317579201e5948e Mon Sep 17 00:00:00 2001
From: Benjamin Admin
Date: Thu, 12 Mar 2026 14:36:59 +0100
Subject: [PATCH] =?UTF-8?q?feat(rag):=20Phase=20I=20DACH-Erweiterung=20?=
 =?UTF-8?q?=E2=80=94=20Gesetze,=20Templates,=20Urteile?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

New ingestion phase 'dach' adds missing documents from DACH catalog:

I1: UStG (Retention), MStV (Impressum)
I2: DSK Muster-VVT, DSK KP5 DSFA, BfDI Beispiel-VVT (DL-DE/BY-2.0)
I3: BSI IT-Grundschutz Kompendium 2024 (CC BY-SA 4.0)
I4: 7 Gerichtsentscheidungen as Praxisanker:
    - DE: LG Bonn 1&1, BGH Planet49, BGH Art.82 (2x)
    - AT: OGH Schutzzweck, OGH Art.15+82 EuGH-Vorlage
    - CH: BVGer DSG-Auskunft, BGer Datensperre

Trigger: workflow_dispatch phase=dach

Co-Authored-By: Claude Opus 4.6
---
 .gitea/workflows/rag-ingest.yaml |   2 +-
 scripts/ingest-legal-corpus.sh   | 255 +++++++++++++++++++++++++++++++
 2 files changed, 256 insertions(+), 1 deletion(-)

diff --git a/.gitea/workflows/rag-ingest.yaml b/.gitea/workflows/rag-ingest.yaml
index a4db522..4edd11b 100644
--- a/.gitea/workflows/rag-ingest.yaml
+++ b/.gitea/workflows/rag-ingest.yaml
@@ -14,7 +14,7 @@ on:
   workflow_dispatch:
     inputs:
       phase:
-        description: 'Ingestion Phase (gesetze, eu, templates, datenschutz, verbraucherschutz, verify, version, all)'
+        description: 'Ingestion Phase (gesetze, eu, templates, datenschutz, verbraucherschutz, dach, verify, version, all)'
         required: true
         default: 'verbraucherschutz'
 
diff --git a/scripts/ingest-legal-corpus.sh b/scripts/ingest-legal-corpus.sh
index 2bda2cd..6cc802a 100755
--- a/scripts/ingest-legal-corpus.sh
+++ b/scripts/ingest-legal-corpus.sh
@@ -1183,6 +1183,258 @@ with open('$WORK_DIR/pdfs/EGBGB_Widerruf.txt', 'w') as f:
     log " Layer 5: EuGH + BGH Leitentscheidungen"
 }
 
+# =============================================================================
+# PHASE I: DACH extension (source catalogue)
+# DSK templates, court decisions, missing statutes
+# Licences: §5 UrhG (DE), §7 UrhG (AT), Art.5 URG (CH), DL-DE/BY-2.0, CC BY-SA 4.0
+# =============================================================================
+
+# Fetch one court decision published as HTML, reduce it to plain text and
+# upload it as case law. Best effort: every failure is reported through warn
+# and the decision is skipped, so one dead link never aborts the phase.
+#
+# Arguments:
+#   $1  source URL
+#   $2  file stem (no extension) used under $WORK_DIR/texts
+#   $3  citation line written ahead of the extracted text
+#   $4  target collection
+#   $5  year argument handed to upload_file
+#   $6  JSON metadata string handed to upload_file
+#   $7  label used for upload_file and in warnings
+_dach_ingest_html_decision() {
+    local url="$1" stem="$2" heading="$3" col="$4" year="$5" metadata="$6" label="$7"
+    local html_file="$WORK_DIR/texts/${stem}.html"
+    local txt_file="$WORK_DIR/texts/${stem}.txt"
+
+    # --fail: an HTTP 4xx/5xx must not leave an error page behind that would
+    # pass the size check below and be ingested as if it were the decision.
+    if ! curl -fsL "$url" -o "$html_file" 2>/dev/null; then
+        warn "$label: download failed — skipping"
+        return 0
+    fi
+    if [[ ! -f "$html_file" || $(wc -c < "$html_file") -le 500 ]]; then
+        warn "$label: downloaded page too small — skipping"
+        return 0
+    fi
+
+    # Paths and heading are passed via argv instead of being spliced into the
+    # Python source, so quotes inside them cannot break the snippet.
+    if ! python3 -c '
+import html
+import re
+import sys
+
+src, dst, heading = sys.argv[1:4]
+with open(src, "rb") as fh:
+    raw = fh.read()
+# Not every court site serves UTF-8; ISO-8859-1 can decode any byte string.
+try:
+    page = raw.decode("utf-8")
+except UnicodeDecodeError:
+    page = raw.decode("iso-8859-1")
+# Script and style bodies are not part of the decision text.
+page = re.sub(r"(?is)<(script|style)\b.*?</\1\s*>", " ", page)
+text = re.sub(r"<[^>]+>", " ", page)
+# Turn entities such as &sect; or &auml; back into characters.
+text = html.unescape(text)
+text = re.sub(r"\s+", " ", text).strip()
+with open(dst, "w", encoding="utf-8") as fh:
+    fh.write(heading + "\n\n" + text)
+' "$html_file" "$txt_file" "$heading"; then
+        warn "$label: HTML to text conversion failed — skipping"
+        return 0
+    fi
+
+    upload_file "$txt_file" "$col" "compliance_datenschutz" "case_law" "$year" "$metadata" "$label"
+}
+
+# Ingest the DACH additions: statutes (I1), DSK/BfDI templates (I2), the BSI
+# IT-Grundschutz compendium (I3) and court decisions (I4). Takes no arguments;
+# reads $WORK_DIR and logs the chunk delta of each touched collection.
+phase_dach() {
+    log "=========================================="
+    log "PHASE I: DACH-Erweiterung"
+    log "=========================================="
+
+    # =========================================================================
+    # I1: Missing German statutes → bp_compliance_gesetze
+    # =========================================================================
+    local col="bp_compliance_gesetze"
+    local before
+    before=$(collection_count "$col")
+    log "--- I1: Fehlende DE-Gesetze → $col ($before chunks) ---"
+
+    # UStG (VAT Act): retention and invoicing duties
+    download_pdf "https://www.gesetze-im-internet.de/ustg_1980/UStG.pdf" "$WORK_DIR/pdfs/UStG.pdf"
+    upload_file "$WORK_DIR/pdfs/UStG.pdf" "$col" "compliance" "legal_reference" "2025" \
+        '{"regulation_id":"ustg","regulation_name_de":"Umsatzsteuergesetz (UStG)","category":"steuerrecht","license":"public_domain_§5_UrhG","source":"gesetze-im-internet.de"}' \
+        "UStG (Umsatzsteuergesetz)"
+
+    # MStV § 18: Interstate Media Treaty (imprint duty).
+    # Not offered as a PDF on gesetze-im-internet.de, hence the Bavarian portal.
+    # Accepted only if curl succeeds (--fail rejects HTTP error pages), the file
+    # exceeds 1000 bytes and it starts with the PDF signature.
+    local mstv_pdf="$WORK_DIR/pdfs/MStV.pdf"
+    if curl -fsL "https://www.gesetze-bayern.de/Content/Pdf/MStV" -o "$mstv_pdf" 2>/dev/null \
+        && [[ -f "$mstv_pdf" && $(wc -c < "$mstv_pdf") -gt 1000 ]] \
+        && [[ "$(head -c 5 "$mstv_pdf")" == "%PDF-" ]]; then
+        upload_file "$WORK_DIR/pdfs/MStV.pdf" "$col" "compliance" "legal_reference" "2025" \
+            '{"regulation_id":"mstv","regulation_name_de":"Medienstaatsvertrag (MStV)","category":"medienrecht","license":"public_domain_§5_UrhG","source":"gesetze-bayern.de"}' \
+            "MStV (Medienstaatsvertrag)"
+    else
+        warn "MStV PDF download failed — skipping"
+    fi
+
+    local after
+    after=$(collection_count "$col")
+    local diff=$(( after - before ))
+    log "Collection $col: ${before:-?} → ${after:-?} chunks (+${diff})"
+
+    # =========================================================================
+    # I2: DSK templates & BfDI → bp_legal_templates
+    # Licence: DL-DE/BY-2.0 (commercial use with attribution)
+    # =========================================================================
+    col="bp_legal_templates"
+    before=$(collection_count "$col")
+    log "--- I2: DSK/BfDI Templates → $col ($before chunks) ---"
+
+    # DSK sample record of processing activities (Art. 30 GDPR)
+    download_pdf \
+        "https://www.datenschutzkonferenz-online.de/media/ah/201802_ah_muster_verantwortliche.pdf" \
+        "$WORK_DIR/pdfs/DSK_Muster_VVT.pdf"
+    upload_file "$WORK_DIR/pdfs/DSK_Muster_VVT.pdf" "$col" "compliance_template" "template" "2018" \
+        '{"regulation_id":"dsk_muster_vvt","regulation_name_de":"DSK Muster-VVT (Art. 30 DSGVO)","category":"datenschutz","license":"DL-DE/BY-2.0","attribution":"Datenschutzkonferenz (DSK)","source":"datenschutzkonferenz-online.de"}' \
+        "DSK Muster-VVT"
+
+    # DSK short paper no. 5 (data protection impact assessment)
+    download_pdf \
+        "https://www.datenschutzkonferenz-online.de/media/kp/dsk_kpnr_5.pdf" \
+        "$WORK_DIR/pdfs/DSK_KP5_DSFA.pdf"
+    upload_file "$WORK_DIR/pdfs/DSK_KP5_DSFA.pdf" "$col" "compliance_template" "guidance" "2018" \
+        '{"regulation_id":"dsk_kp5_dsfa","regulation_name_de":"DSK Kurzpapier Nr. 5 Datenschutz-Folgenabschaetzung","category":"datenschutz","license":"DL-DE/BY-2.0","attribution":"Datenschutzkonferenz (DSK)","source":"datenschutzkonferenz-online.de"}' \
+        "DSK Kurzpapier Nr. 5 DSFA"
+
+    # BfDI sample record of processing activities
+    download_pdf \
+        "https://www.bfdi.bund.de/SharedDocs/Downloads/DE/DokumenteBfDI/AccessForAll/2023/2023_Verzeichnis-Verarbeitungst%C3%A4tigkeiten.pdf?__blob=publicationFile&v=2" \
+        "$WORK_DIR/pdfs/BfDI_Beispiel_VVT.pdf"
+    upload_file "$WORK_DIR/pdfs/BfDI_Beispiel_VVT.pdf" "$col" "compliance_template" "template" "2023" \
+        '{"regulation_id":"bfdi_beispiel_vvt","regulation_name_de":"BfDI Beispiel-VVT mit Loeschfristen","category":"datenschutz","license":"DL-DE/BY-2.0","attribution":"Bundesbeauftragter fuer den Datenschutz und die Informationsfreiheit (BfDI)","source":"bfdi.bund.de"}' \
+        "BfDI Beispiel-VVT"
+
+    after=$(collection_count "$col")
+    diff=$(( after - before ))
+    log "Collection $col: ${before:-?} → ${after:-?} chunks (+${diff})"
+
+    # =========================================================================
+    # I3: BSI state of the art → bp_compliance_gesetze
+    # Licence: CC BY-SA 4.0
+    # =========================================================================
+    col="bp_compliance_gesetze"
+    before=$(collection_count "$col")
+    log "--- I3: BSI OSCAL → $col ($before chunks) ---"
+
+    # BSI IT-Grundschutz compendium (PDF); uploaded only if the file exists
+    download_pdf \
+        "https://www.bsi.bund.de/SharedDocs/Downloads/DE/BSI/Grundschutz/Kompendium/IT_Grundschutz_Kompendium_Edition2024.pdf?__blob=publicationFile&v=4" \
+        "$WORK_DIR/pdfs/BSI_Grundschutz_2024.pdf"
+    if [[ -f "$WORK_DIR/pdfs/BSI_Grundschutz_2024.pdf" ]]; then
+        upload_file "$WORK_DIR/pdfs/BSI_Grundschutz_2024.pdf" "$col" "compliance" "guidance" "2024" \
+            '{"regulation_id":"bsi_grundschutz_2024","regulation_name_de":"BSI IT-Grundschutz Kompendium 2024","category":"informationssicherheit","license":"CC_BY-SA_4.0","attribution":"Bundesamt fuer Sicherheit in der Informationstechnik (BSI)","source":"bsi.bund.de"}' \
+            "BSI IT-Grundschutz Kompendium 2024"
+    fi
+
+    after=$(collection_count "$col")
+    diff=$(( after - before ))
+    log "Collection $col: ${before:-?} → ${after:-?} chunks (+${diff})"
+
+    # =========================================================================
+    # I4: Court decisions → bp_compliance_datenschutz
+    # Licence: §5 UrhG (DE), §7 UrhG (AT), Art.5 URG (CH) — official works
+    # =========================================================================
+    col="bp_compliance_datenschutz"
+    before=$(collection_count "$col")
+    log "--- I4: Gerichtsentscheidungen → $col ($before chunks) ---"
+
+    # DE: LG Bonn 29 OWi 1/20 (1&1 fine, Art. 32)
+    _dach_ingest_html_decision \
+        "https://www.justiz.nrw.de/nrwe/lgs/bonn/lg_bonn/j2020/29OWi1_20_Urteil_20201111.html" \
+        "LG_Bonn_29OWi1_20" \
+        "LG Bonn, Urteil vom 11.11.2020, 29 OWi 1/20 (1&1 Telecom)" \
+        "$col" "2020" \
+        '{"regulation_id":"lg_bonn_29owi1_20","doc_type":"court_decision","court":"LG Bonn","case_number":"29 OWi 1/20","date":"2020-11-11","topic":"Art. 32 DSGVO Authentifizierung Bussgeld","country":"DE","license":"public_domain_§5_UrhG","source":"justiz.nrw.de"}' \
+        "LG Bonn 29 OWi 1/20 (Art. 32 Bussgeld)"
+
+    # DE: BGH I ZR 7/16 (Planet49 cookie consent)
+    _dach_ingest_html_decision \
+        "https://www.bundesgerichtshof.de/SharedDocs/Pressemitteilungen/DE/2020/2020067.html" \
+        "BGH_Planet49" \
+        "BGH, Urteil vom 28.05.2020, I ZR 7/16 (Planet49 Cookie-Einwilligung)" \
+        "$col" "2020" \
+        '{"regulation_id":"bgh_i_zr_7_16","doc_type":"court_decision","court":"BGH","case_number":"I ZR 7/16","date":"2020-05-28","topic":"Cookie-Einwilligung Opt-in Planet49","country":"DE","license":"public_domain_§5_UrhG","source":"bundesgerichtshof.de"}' \
+        "BGH I ZR 7/16 (Planet49 Cookie-Einwilligung)"
+
+    # DE: BGH press release 218/2024 (Art. 82 GDPR damages)
+    _dach_ingest_html_decision \
+        "https://www.bundesgerichtshof.de/SharedDocs/Pressemitteilungen/DE/2024/2024218.html" \
+        "BGH_Art82_2024" \
+        "BGH, Pressemitteilung 218/2024, Art. 82 DSGVO Schadensersatz" \
+        "$col" "2024" \
+        '{"regulation_id":"bgh_art82_2024_218","doc_type":"court_decision","court":"BGH","date":"2024-11-18","topic":"Art. 82 DSGVO immaterieller Schadensersatz Bemessung","country":"DE","license":"public_domain_§5_UrhG","source":"bundesgerichtshof.de"}' \
+        "BGH PM 218/2024 (Art. 82 Schadensersatz)"
+
+    # DE: BGH VI ZR 396/24 (Art. 82 refinement)
+    _dach_ingest_html_decision \
+        "https://www.bundesgerichtshof.de/SharedDocs/Entscheidungen/DE/2025/2025-11-11-VIZR396_24.html" \
+        "BGH_VI_ZR_396_24" \
+        "BGH, Urteil vom 11.11.2025, VI ZR 396/24 (Art. 82 DSGVO Konkretisierung)" \
+        "$col" "2025" \
+        '{"regulation_id":"bgh_vi_zr_396_24","doc_type":"court_decision","court":"BGH","case_number":"VI ZR 396/24","date":"2025-11-11","topic":"Art. 82 DSGVO Darlegungslast Schadensbemessung","country":"DE","license":"public_domain_§5_UrhG","source":"bundesgerichtshof.de"}' \
+        "BGH VI ZR 396/24 (Art. 82 Konkretisierung)"
+
+    # AT: OGH 6 Ob 70/24y (protective purpose / causation)
+    _dach_ingest_html_decision \
+        "https://www.ogh.gv.at/entscheidungen/entscheidungen-ogh/datenschutzrecht-zum-schutzzweck-der-datenschutz-grundverordnung-dsgvo-und-des-datenschutzgesetzes-dsg/" \
+        "OGH_6Ob70_24y" \
+        "OGH, 6 Ob 70/24y, 15.05.2024 (Schutzzweck DSGVO/DSG)" \
+        "$col" "2024" \
+        '{"regulation_id":"ogh_6ob70_24y","doc_type":"court_decision","court":"OGH","case_number":"6 Ob 70/24y","date":"2024-05-15","topic":"Schutzzweck DSGVO Kausalitaet Schadensersatz","country":"AT","license":"public_domain_§7_UrhG_AT","source":"ogh.gv.at"}' \
+        "OGH 6 Ob 70/24y (Schutzzweck DSGVO)"
+
+    # AT: OGH 6 Ob 102/24d (Art. 15 origin of data + Art. 82)
+    _dach_ingest_html_decision \
+        "https://www.ogh.gv.at/entscheidungen/vorabentscheidungsersuchen-eugh/vorabentscheidungsersuchen-zum-auskunftsrecht-nach-art-15-dsgvo-und-dem-recht-auf-schadenersatz-nach-art-82-dsgvo/" \
+        "OGH_6Ob102_24d" \
+        "OGH, 6 Ob 102/24d, 18.02.2025 (EuGH-Vorlage Art. 15 + Art. 82)" \
+        "$col" "2025" \
+        '{"regulation_id":"ogh_6ob102_24d","doc_type":"court_decision","court":"OGH","case_number":"6 Ob 102/24d","date":"2025-02-18","topic":"Art. 15 Auskunftsrecht Herkunft Art. 82 Schadensersatz EuGH-Vorlage","country":"AT","license":"public_domain_§7_UrhG_AT","source":"ogh.gv.at"}' \
+        "OGH 6 Ob 102/24d (Art. 15 + Art. 82 EuGH-Vorlage)"
+
+    # CH: BVGer B-915/2022 (BVGE 2024 IV/2 — DSG access request vs. file inspection)
+    download_pdf \
+        "https://jurispub.admin.ch/publiws/download?decisionId=ed176fe0-fb98-425f-8ded-ca9da49c816b" \
+        "$WORK_DIR/pdfs/BVGE_2024_IV_2.pdf"
+    upload_file "$WORK_DIR/pdfs/BVGE_2024_IV_2.pdf" "$col" "compliance_datenschutz" "case_law" "2024" \
+        '{"regulation_id":"bvge_2024_iv_2","doc_type":"court_decision","court":"BVGer","case_number":"B-915/2022","date":"2024-04-03","topic":"DSG-Auskunft vs. Akteneinsicht CH","country":"CH","license":"public_domain_Art5_URG","source":"jurispub.admin.ch"}' \
+        "BVGer B-915/2022 (DSG-Auskunft vs. Akteneinsicht)"
+
+    # CH: BGer 1C 562/2024 (data blocking / anonymisation)
+    _dach_ingest_html_decision \
+        "https://relevancy.bger.ch/cgi-bin/JumpCGI?id=13.01.2025_1C_562%2F2024" \
+        "BGer_1C_562_2024" \
+        "BGer, 1C 562/2024, 13.01.2025 (Datensperre/Anonymisierung)" \
+        "$col" "2025" \
+        '{"regulation_id":"bger_1c_562_2024","doc_type":"court_decision","court":"BGer","case_number":"1C 562/2024","date":"2025-01-13","topic":"Datensperre Anonymisierung DSG","country":"CH","license":"public_domain_Art5_URG","source":"bger.ch"}' \
+        "BGer 1C 562/2024 (Datensperre/Anonymisierung)"
+
+    after=$(collection_count "$col")
+    diff=$(( after - before ))
+    log "Collection $col: ${before:-?} → ${after:-?} chunks (+${diff})"
+
+    echo ""
+    log "Phase I abgeschlossen."
+}
+
 # =============================================================================
 # PHASE F: Verifizierung
 # =============================================================================
@@ -1378,6 +1630,7 @@ main() {
             templates) phase_templates ;;
             datenschutz) phase_datenschutz ;;
             verbraucherschutz) phase_verbraucherschutz ;;
+            dach) phase_dach ;;
             verify) phase_verify ;;
             version) phase_register_version ;;
             *) fail "Unknown phase: $ONLY_PHASE"; exit 1 ;;
@@ -1399,6 +1652,8 @@ main() {
         echo ""
         phase_verbraucherschutz
         echo ""
+        phase_dach
+        echo ""
         phase_verify
         echo ""
         phase_register_version