All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-ai-compliance (push) Successful in 37s
CI / test-python-backend-compliance (push) Successful in 37s
CI / test-python-document-crawler (push) Successful in 23s
CI / test-python-dsms-gateway (push) Successful in 21s
- Crawler erweitert: +26 neue Dokumente (DSK KP 1-20, SDM V3.1, BfDI Loeschkonzept, BayLDA TOM-Checkliste) - RAG-Queries optimiert: 18 Queries mit EDPB/DSK/WP-Referenzen fuer besseres Retrieval - Chat-Route: queryRAG nutzt jetzt Collection + Query-Boost aus DOCUMENT_RAG_CONFIG - TOM Control Library: 180 Controls in 12 Domaenen (ISO Annex-A Style, tom_controls_v1.json) - Risk Engine Spec: Impact/Likelihood 0-10, Score 0-100, 4 Tiers, Loeschfristen-Engine - Soul-Files: DSK-Kurzpapiere, SDM V3.1, BfDI als primaere deutsche Quellen - Manifest CSV: eu_de_privacy_manifest.csv mit Lizenz-Ampel (gruen/gelb/rot) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1722 lines
73 KiB
Python
Executable File
1722 lines
73 KiB
Python
Executable File
#!/usr/bin/env python3
"""
BreakPilot Compliance — EDPB/WP29/DSFA Auto-Crawler

Downloads, versions and ingests all relevant data-protection documents
into the Qdrant collections bp_compliance_datenschutz / bp_dsfa_corpus:

- EDPB Guidelines & Recommendations
- Endorsed WP29 Guidelines
- EDPS Guidance
- DSFA mandatory lists ("Muss-Listen", BfDI + 16 German federal states)
- DSK Kurzpapiere (No. 1-20)
- DSK Orientierungshilfen (SDM V3.1, e-mail encryption, Telemedien)
- BfDI practice documents (Loeschkonzept)
- BayLDA/BayLfD Orientierungshilfen (TOM checklist, erasure)

Directory layout:
  ~/rag-ingestion/sources/eu/edpb/guidelines/            EDPB's own guidelines
  ~/rag-ingestion/sources/eu/edpb/endorsed_wp29/         Endorsed WP29
  ~/rag-ingestion/sources/eu/edps/                       EDPS Guidance
  ~/rag-ingestion/sources/de/bfdi/                       BfDI DSFA list
  ~/rag-ingestion/sources/de/bfdi/praxis/                BfDI practice documents
  ~/rag-ingestion/sources/de/dsk/                        DSK joint list
  ~/rag-ingestion/sources/de/dsk/kurzpapiere/            DSK Kurzpapiere No. 1-20
  ~/rag-ingestion/sources/de/dsk/orientierungshilfen/    SDM, OH E-Mail, OH Telemedien
  ~/rag-ingestion/sources/de/dpas/{bw,by,...}/           State DPA lists
  ~/rag-ingestion/sources/de/baylda/                     BayLDA documents
  ~/rag-ingestion/sources/de/baylfb/                     BayLfD documents
  ~/rag-ingestion/manifests/                             CSV manifests

Usage:
  python3 edpb-crawler.py --all        # Download + ingest everything
  python3 edpb-crawler.py --download   # Only fetch missing PDFs
  python3 edpb-crawler.py --ingest     # Upload downloaded PDFs
  python3 edpb-crawler.py --status     # Overview
  python3 edpb-crawler.py --verify     # RAG test searches
  python3 edpb-crawler.py --migrate    # Move PDFs from pdfs/ into sources/
"""
|
|
|
|
import argparse
import csv
import hashlib
import json
import os
import shutil
import sys
import tempfile
import time
from datetime import datetime, timezone
from pathlib import Path

# Third-party HTTP stack. urllib3 is not part of the standard library (it is
# installed as a dependency of requests), so it is imported under the same
# guard: a missing package then yields the install hint below instead of a
# raw ImportError traceback.
try:
    import requests
    import urllib3
except ImportError:
    print("ERROR: 'requests' not installed. Run: pip3 install requests")
    sys.exit(1)

# PyMuPDF for local text extraction fallback; optional, availability is
# recorded in HAS_PYMUPDF.
try:
    import fitz  # PyMuPDF
    HAS_PYMUPDF = True
except ImportError:
    HAS_PYMUPDF = False

# Suppresses urllib3's InsecureRequestWarning for the whole process.
# NOTE(review): this suggests HTTPS requests are made without certificate
# verification somewhere in this file — confirm that is limited to the
# local RAG endpoint and not applied to the public document downloads.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
|
|
|
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Root working directory; overridable through the WORK_DIR environment
# variable, otherwise ~/rag-ingestion.
WORK_DIR = Path(os.environ.get("WORK_DIR", Path.home() / "rag-ingestion"))
SOURCES_DIR = WORK_DIR / "sources"
MANIFESTS_DIR = WORK_DIR / "manifests"
MANIFEST_PATH = WORK_DIR / "edpb-manifest.json"

# RAG service endpoints; overridable through RAG_URL / RAG_SEARCH_URL.
RAG_URL = os.environ.get("RAG_URL", "https://localhost:8097/api/v1/documents/upload")
RAG_SEARCH_URL = os.environ.get("RAG_SEARCH_URL", "https://localhost:8097/api/v1/search")

# Target collection names: the default one and the one used by entries that
# set "collection": COLLECTION_DSFA.
COLLECTION = "bp_compliance_datenschutz"
COLLECTION_DSFA = "bp_dsfa_corpus"

# NOTE(review): presumably an HTTP timeout in seconds — confirm at call sites.
TIMEOUT = 300
DOWNLOAD_DELAY = 3  # seconds between downloads to avoid 429
|
|
|
|
# ---------------------------------------------------------------------------
# Document Registry
#
# Each entry has: id, title, url, filename, subdir (relative to sources/),
# category, year, source_org, source_url, collection (optional, default
# COLLECTION).
# ---------------------------------------------------------------------------
|
|
|
|
# --- Endorsed WP29 ---
# Article 29 Working Party guidelines endorsed by the EDPB. Every entry is
# stored under eu/edpb/endorsed_wp29 and carries no "collection" key.
WP29_ENDORSED = [
    {
        "id": "wp248_dpia",
        "title": "WP248 rev.01 — Guidelines on DPIA",
        "url": "https://ec.europa.eu/newsroom/article29/document.cfm?action=display&doc_id=47711",
        "filename": "edpb_wp248_dpia.pdf",
        "subdir": "eu/edpb/endorsed_wp29",
        "category": "dpia",
        "year": 2017,
        "source_org": "Article 29 Working Party / EDPB endorsed",
        "source_url": "https://www.edpb.europa.eu/our-work-tools/general-guidance/endorsed-wp29-guidelines_en",
    },
    {
        "id": "wp243_dpo",
        "title": "WP243 rev.01 — Guidelines on DPO",
        "url": "https://ec.europa.eu/newsroom/article29/document.cfm?action=display&doc_id=44100",
        "filename": "edpb_wp243_dpo.pdf",
        "subdir": "eu/edpb/endorsed_wp29",
        "category": "dpo",
        "year": 2016,
        "source_org": "Article 29 Working Party / EDPB endorsed",
        "source_url": "https://www.edpb.europa.eu/our-work-tools/general-guidance/endorsed-wp29-guidelines_en",
    },
    {
        "id": "wp260_transparency",
        "title": "WP260 rev.01 — Guidelines on Transparency",
        "url": "https://ec.europa.eu/newsroom/article29/document.cfm?action=display&doc_id=51025",
        "filename": "edpb_wp260_transparency.pdf",
        "subdir": "eu/edpb/endorsed_wp29",
        "category": "transparency",
        "year": 2018,
        "source_org": "Article 29 Working Party / EDPB endorsed",
        "source_url": "https://ec.europa.eu/newsroom/article29/item-detail.cfm?item_id=622227",
    },
    {
        "id": "wp250_breach",
        "title": "WP250 rev.01 — Guidelines on Data Breach Notification",
        "url": "https://ec.europa.eu/newsroom/article29/document.cfm?action=display&doc_id=47741",
        "filename": "edpb_wp250_breach.pdf",
        "subdir": "eu/edpb/endorsed_wp29",
        "category": "breach",
        "year": 2018,
        "source_org": "Article 29 Working Party / EDPB endorsed",
        "source_url": "https://www.edpb.europa.eu/our-work-tools/general-guidance/endorsed-wp29-guidelines_en",
    },
    {
        "id": "wp259_consent",
        "title": "WP259 rev.01 — Guidelines on Consent under GDPR",
        "url": "https://ec.europa.eu/newsroom/article29/document.cfm?action=display&doc_id=51030",
        "filename": "edpb_wp259_consent.pdf",
        "subdir": "eu/edpb/endorsed_wp29",
        "category": "consent",
        "year": 2018,
        "source_org": "Article 29 Working Party / EDPB endorsed",
        "source_url": "https://www.edpb.europa.eu/our-work-tools/general-guidance/endorsed-wp29-guidelines_en",
    },
    {
        "id": "wp242_portability",
        "title": "WP242 rev.01 — Guidelines on Right to Data Portability",
        "url": "https://ec.europa.eu/newsroom/article29/document.cfm?action=display&doc_id=44099",
        "filename": "edpb_wp242_portability.pdf",
        "subdir": "eu/edpb/endorsed_wp29",
        "category": "portability",
        "year": 2017,
        "source_org": "Article 29 Working Party / EDPB endorsed",
        "source_url": "https://www.edpb.europa.eu/our-work-tools/general-guidance/endorsed-wp29-guidelines_en",
    },
    {
        "id": "wp251_profiling",
        "title": "WP251 rev.01 — Guidelines on Automated Decision-Making/Profiling",
        "url": "https://ec.europa.eu/newsroom/article29/document.cfm?action=display&doc_id=49826",
        "filename": "edpb_wp251_profiling.pdf",
        "subdir": "eu/edpb/endorsed_wp29",
        "category": "profiling",
        "year": 2018,
        "source_org": "Article 29 Working Party / EDPB endorsed",
        "source_url": "https://www.edpb.europa.eu/our-work-tools/general-guidance/endorsed-wp29-guidelines_en",
    },
]
|
|
|
|
# --- EDPB's own Guidelines & Recommendations ---
# Every entry is stored under eu/edpb/guidelines, uses "<id>.pdf" as its
# filename and carries no "collection" key.
EDPB_GUIDELINES = [
    {
        "id": "edpb_consent_05_2020",
        "title": "EDPB Guidelines 05/2020 on Consent",
        "url": "https://www.edpb.europa.eu/sites/default/files/files/file1/edpb_guidelines_202005_consent_en.pdf",
        "filename": "edpb_consent_05_2020.pdf",
        "subdir": "eu/edpb/guidelines",
        "category": "consent",
        "year": 2020,
        "source_org": "European Data Protection Board (EDPB)",
        "source_url": "https://www.edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-052020-consent-under-regulation-2016679_en",
    },
    {
        "id": "edpb_dpbd_04_2019",
        "title": "EDPB Guidelines 4/2019 on Data Protection by Design and Default",
        "url": "https://www.edpb.europa.eu/sites/default/files/files/file1/edpb_guidelines_201904_dataprotection_by_design_and_by_default_v2.0_en.pdf",
        "filename": "edpb_dpbd_04_2019.pdf",
        "subdir": "eu/edpb/guidelines",
        "category": "dpbd",
        "year": 2019,
        "source_org": "European Data Protection Board (EDPB)",
        "source_url": "https://www.edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-42019-article-25-data-protection-design-and_en",
    },
    {
        "id": "edpb_transfers_01_2020",
        "title": "EDPB Recommendations 01/2020 on Supplementary Transfer Measures",
        # NOTE(review): the URL path contains /consultation/ — this may be the
        # public-consultation draft rather than the final adopted text; confirm.
        "url": "https://www.edpb.europa.eu/sites/default/files/consultation/edpb_recommendations_202001_supplementarymeasurestransferstools_en.pdf",
        "filename": "edpb_transfers_01_2020.pdf",
        "subdir": "eu/edpb/guidelines",
        "category": "transfers",
        "year": 2020,
        "source_org": "European Data Protection Board (EDPB)",
        "source_url": "https://www.edpb.europa.eu/our-work-tools/our-documents/recommendations/recommendations-012020-measures-supplement-transfer_en",
    },
    {
        "id": "edpb_controller_processor_07_2020",
        "title": "EDPB Guidelines 07/2020 on Controller and Processor",
        # NOTE(review): the URL path contains /consultation/ — this may be the
        # public-consultation draft rather than the final adopted text; confirm.
        "url": "https://www.edpb.europa.eu/sites/default/files/consultation/edpb_guidelines_202007_controllerprocessor_en.pdf",
        "filename": "edpb_controller_processor_07_2020.pdf",
        "subdir": "eu/edpb/guidelines",
        "category": "controller_processor",
        "year": 2020,
        "source_org": "European Data Protection Board (EDPB)",
        "source_url": "https://www.edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-072020-concepts-controller-and-processor-gdpr_en",
    },
    {
        "id": "edpb_breach_09_2022",
        "title": "EDPB Guidelines 09/2022 on Personal Data Breach Notification",
        "url": "https://www.edpb.europa.eu/system/files/2023-04/edpb_guidelines_202209_personal_data_breach_notification_v2_en.pdf",
        "filename": "edpb_breach_09_2022.pdf",
        "subdir": "eu/edpb/guidelines",
        "category": "breach",
        "year": 2022,
        "source_org": "European Data Protection Board (EDPB)",
        "source_url": "https://www.edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-092022-personal-data-breach-notification-under_en",
    },
    {
        "id": "edpb_access_01_2022",
        "title": "EDPB Guidelines 01/2022 on Right of Access",
        "url": "https://www.edpb.europa.eu/system/files/2023-04/edpb_guidelines_202201_data_subject_rights_access_v2_en.pdf",
        "filename": "edpb_access_01_2022.pdf",
        "subdir": "eu/edpb/guidelines",
        "category": "access",
        "year": 2022,
        "source_org": "European Data Protection Board (EDPB)",
        "source_url": "https://www.edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-012022-data-subject-rights-right-access_en",
    },
    {
        "id": "edpb_fines_04_2022",
        "title": "EDPB Guidelines 04/2022 on Calculation of Administrative Fines",
        "url": "https://www.edpb.europa.eu/system/files/2023-06/edpb_guidelines_042022_calculationofadministrativefines_en.pdf",
        "filename": "edpb_fines_04_2022.pdf",
        "subdir": "eu/edpb/guidelines",
        "category": "fines",
        "year": 2022,
        "source_org": "European Data Protection Board (EDPB)",
        "source_url": "https://www.edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-042022-calculation-administrative-fines-under_en",
    },
    {
        "id": "edpb_article48_02_2024",
        "title": "EDPB Guidelines 02/2024 on Article 48 GDPR",
        "url": "https://www.edpb.europa.eu/system/files/2024-07/edpb_guidelines_202402_article48_en.pdf",
        "filename": "edpb_article48_02_2024.pdf",
        "subdir": "eu/edpb/guidelines",
        "category": "transfers",
        "year": 2024,
        "source_org": "European Data Protection Board (EDPB)",
        "source_url": "https://www.edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-022024-article-48-gdpr_en",
    },
    {
        "id": "edpb_eprivacy_02_2023",
        "title": "EDPB Guidelines 2/2023 on Technical Scope of Art. 5(3) ePrivacy",
        "url": "https://www.edpb.europa.eu/system/files/2023-11/edpb_guidelines_202302_technical_scope_art_53_eprivacydirective_en.pdf",
        "filename": "edpb_eprivacy_02_2023.pdf",
        "subdir": "eu/edpb/guidelines",
        "category": "eprivacy",
        "year": 2023,
        "source_org": "European Data Protection Board (EDPB)",
        "source_url": "https://www.edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-22023-technical-scope-art-53-eprivacy-directive_en",
    },
    {
        "id": "edpb_legitimate_interest_01_2024",
        "title": "EDPB Guidelines 01/2024 on Legitimate Interest (Art. 6(1)(f))",
        "url": "https://www.edpb.europa.eu/system/files/2024-10/edpb_guidelines_202401_legitimate-interest_hyperlinks_en.pdf",
        "filename": "edpb_legitimate_interest_01_2024.pdf",
        "subdir": "eu/edpb/guidelines",
        "category": "legitimate_interest",
        "year": 2024,
        "source_org": "European Data Protection Board (EDPB)",
        "source_url": "https://www.edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-12024-processing-personal-data-based-legitimate_en",
    },
    {
        "id": "edpb_dark_patterns_03_2022",
        "title": "EDPB Guidelines 03/2022 on Dark Patterns in Social Media",
        "url": "https://www.edpb.europa.eu/system/files/2022-03/edpb_03-2022_guidelines_on_dark_patterns_in_social_media_platform_interfaces_en.pdf",
        "filename": "edpb_dark_patterns_03_2022.pdf",
        "subdir": "eu/edpb/guidelines",
        "category": "dark_patterns",
        "year": 2022,
        "source_org": "European Data Protection Board (EDPB)",
        "source_url": "https://www.edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-032022-dark-patterns-social-media-platform_en",
    },
    {
        "id": "edpb_social_media_08_2020",
        "title": "EDPB Guidelines 08/2020 on Targeting Social Media Users",
        "url": "https://www.edpb.europa.eu/system/files/2021-04/edpb_guidelines_082020_on_the_targeting_of_social_media_users_en.pdf",
        "filename": "edpb_social_media_08_2020.pdf",
        "subdir": "eu/edpb/guidelines",
        "category": "social_media",
        "year": 2020,
        "source_org": "European Data Protection Board (EDPB)",
        "source_url": "https://www.edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-082020-targeting-social-media-users_en",
    },
    {
        "id": "edpb_video_03_2019",
        "title": "EDPB Guidelines 3/2019 on Video Surveillance (CCTV)",
        "url": "https://www.edpb.europa.eu/sites/default/files/files/file1/edpb_guidelines_201903_video_devices_en_0.pdf",
        "filename": "edpb_video_03_2019.pdf",
        "subdir": "eu/edpb/guidelines",
        "category": "video",
        "year": 2019,
        "source_org": "European Data Protection Board (EDPB)",
        "source_url": "https://www.edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-32019-processing-personal-data-through-video_en",
    },
    {
        "id": "edpb_connected_vehicles_01_2020",
        "title": "EDPB Guidelines 01/2020 on Connected Vehicles",
        "url": "https://www.edpb.europa.eu/sites/default/files/files/file1/edpb_guidelines_202001_connected_vehicles_v2.0_adopted_en.pdf",
        "filename": "edpb_connected_vehicles_01_2020.pdf",
        "subdir": "eu/edpb/guidelines",
        "category": "connected_vehicles",
        "year": 2020,
        "source_org": "European Data Protection Board (EDPB)",
        "source_url": "https://www.edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-12020-processing-personal-data-context_en",
    },
    {
        "id": "edpb_vva_02_2021",
        "title": "EDPB Guidelines 02/2021 on Virtual Voice Assistants",
        "url": "https://www.edpb.europa.eu/system/files/2021-07/edpb_guidelines_202102_on_vva_v2.0_adopted_en.pdf",
        "filename": "edpb_vva_02_2021.pdf",
        "subdir": "eu/edpb/guidelines",
        "category": "virtual_assistants",
        "year": 2021,
        "source_org": "European Data Protection Board (EDPB)",
        "source_url": "https://www.edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-022021-virtual-voice-assistants_en",
    },
    {
        "id": "edpb_cookie_taskforce_2023",
        "title": "EDPB Cookie Banner Taskforce Report",
        "url": "https://www.edpb.europa.eu/system/files/2023-01/edpb_20230118_report_cookie_banner_taskforce_en.pdf",
        "filename": "edpb_cookie_taskforce_2023.pdf",
        "subdir": "eu/edpb/guidelines",
        "category": "cookies",
        "year": 2023,
        "source_org": "European Data Protection Board (EDPB)",
        "source_url": "https://www.edpb.europa.eu/our-work-tools/our-documents/other/report-work-undertaken-cookie-banner-taskforce_en",
    },
    {
        "id": "edpb_certification_01_2018",
        "title": "EDPB Guidelines 1/2018 on GDPR Certification (v3.0)",
        "url": "https://www.edpb.europa.eu/sites/default/files/files/file1/edpb_guidelines_201801_v3.0_certificationcriteria_annex2_en.pdf",
        "filename": "edpb_certification_01_2018.pdf",
        "subdir": "eu/edpb/guidelines",
        "category": "certification",
        # NOTE(review): year differs from the 1/2018 guideline number —
        # presumably the publication year of v3.0; confirm.
        "year": 2019,
        "source_org": "European Data Protection Board (EDPB)",
        "source_url": "https://www.edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-12018-certification-and-identifying-certification_en",
    },
    {
        "id": "edpb_bcr_01_2022",
        "title": "EDPB Recommendations 01/2022 on BCR Application (v2)",
        "url": "https://www.edpb.europa.eu/system/files/2023-06/edpb_recommendations_20221_bcr-c_v2_en.pdf",
        "filename": "edpb_bcr_01_2022.pdf",
        "subdir": "eu/edpb/guidelines",
        "category": "transfers",
        "year": 2022,
        "source_org": "European Data Protection Board (EDPB)",
        "source_url": "https://www.edpb.europa.eu/our-work-tools/our-documents/recommendations/recommendations-012022-application-approval-binding_en",
    },
    {
        "id": "edpb_rtbf_05_2019",
        "title": "EDPB Guidelines 5/2019 on Right to Erasure (Search Engines)",
        "url": "https://www.edpb.europa.eu/sites/default/files/files/file1/edpb_guidelines_201905_rtbfsearchengines_afterpublicconsultation_en.pdf",
        "filename": "edpb_rtbf_05_2019.pdf",
        "subdir": "eu/edpb/guidelines",
        "category": "erasure",
        "year": 2019,
        "source_org": "European Data Protection Board (EDPB)",
        "source_url": "https://www.edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-52019-criteria-right-be-forgotten-search-engines_en",
    },
    {
        "id": "edpb_dpia_list_recommendation",
        "title": "EDPB DPIA Lists Recommendation (Consistency Mechanism)",
        "url": "https://www.edpb.europa.eu/sites/default/files/files/file2/edpb-dpia_recommendation-list-en_0.pdf",
        "filename": "edpb_dpia_list_recommendation.pdf",
        "subdir": "eu/edpb/guidelines",
        "category": "dpia",
        "year": 2019,
        "source_org": "European Data Protection Board (EDPB)",
        "source_url": "https://www.edpb.europa.eu/our-work-tools/consistency-findings/opinions_en",
    },
    {
        "id": "edpb_health_data_03_2020",
        "title": "EDPB Guidelines 03/2020 on Health Data for Research (COVID-19)",
        "url": "https://www.edpb.europa.eu/sites/default/files/files/file1/edpb_guidelines_202003_healthdatascientificresearchcovid19_en.pdf",
        "filename": "edpb_health_data_03_2020.pdf",
        "subdir": "eu/edpb/guidelines",
        "category": "health_data",
        "year": 2020,
        "source_org": "European Data Protection Board (EDPB)",
        "source_url": "https://www.edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-032020-processing-data-concerning-health-purpose_en",
    },
    {
        "id": "edpb_geolocation_04_2020",
        "title": "EDPB Guidelines 04/2020 on Geolocation/Contact Tracing",
        "url": "https://www.edpb.europa.eu/sites/default/files/files/file1/edpb_guidelines_20200420_contact_tracing_covid_with_annex_en.pdf",
        "filename": "edpb_geolocation_04_2020.pdf",
        "subdir": "eu/edpb/guidelines",
        "category": "geolocation",
        "year": 2020,
        "source_org": "European Data Protection Board (EDPB)",
        "source_url": "https://www.edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-042020-use-location-data-and-contact-tracing_en",
    },
    {
        "id": "edpb_legal_basis_02_2019",
        "title": "EDPB Guidelines 2/2019 on Art. 6(1)(b) GDPR (Online Services)",
        "url": "https://www.edpb.europa.eu/sites/default/files/files/file1/edpb_guidelines-art_6-1-b-adopted_after_public_consultation_en.pdf",
        "filename": "edpb_legal_basis_02_2019.pdf",
        "subdir": "eu/edpb/guidelines",
        "category": "legal_basis",
        "year": 2019,
        "source_org": "European Data Protection Board (EDPB)",
        "source_url": "https://www.edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-22019-processing-personal-data-under-article-61b_en",
    },
    # --- Additional EDPB Guidelines (newly added) ---
    {
        "id": "edpb_rro_09_2020",
        "title": "EDPB Guidelines 09/2020 on Relevant and Reasoned Objection",
        "url": "https://www.edpb.europa.eu/system/files/2021-03/edpb_guidelines_202009_rro_final_en.pdf",
        "filename": "edpb_rro_09_2020.pdf",
        "subdir": "eu/edpb/guidelines",
        "category": "enforcement",
        "year": 2020,
        "source_org": "European Data Protection Board (EDPB)",
        "source_url": "https://www.edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-092020-relevant-and-reasoned-objection-under_en",
    },
    {
        "id": "edpb_facial_recognition_05_2022",
        "title": "EDPB Guidelines 05/2022 on Facial Recognition in Law Enforcement",
        # NOTE(review): the file name says 202304 while the guideline number
        # is 05/2022 — confirm the URL targets the intended document.
        "url": "https://www.edpb.europa.eu/system/files/2023-05/edpb_guidelines_202304_frtlawenforcement_v2_en.pdf",
        "filename": "edpb_facial_recognition_05_2022.pdf",
        "subdir": "eu/edpb/guidelines",
        "category": "facial_recognition",
        "year": 2022,
        "source_org": "European Data Protection Board (EDPB)",
        "source_url": "https://www.edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-052022-use-facial-recognition-technology-area-law_en",
    },
]
|
|
|
|
# --- EDPS Guidance ---
# Documents published by the European Data Protection Supervisor. Every entry
# is stored under eu/edps, uses "<id>.pdf" as its filename and carries no
# "collection" key.
EDPS_GUIDANCE = [
    {
        "id": "edps_dpia_list",
        "title": "EDPS DPIA List (EU Institutions)",
        "url": "https://www.edps.europa.eu/sites/default/files/publication/19-07-16_dpia_list_en.pdf",
        "filename": "edps_dpia_list.pdf",
        "subdir": "eu/edps",
        "category": "dpia",
        "year": 2019,
        "source_org": "European Data Protection Supervisor (EDPS)",
        "source_url": "https://www.edps.europa.eu/data-protection/our-work/publications/lists/dpia-list_en",
    },
    {
        "id": "edps_genai_orientations_2024",
        "title": "EDPS GenAI Orientations (June 2024)",
        "url": "https://www.edps.europa.eu/system/files/2024-06/24-06-03_genai_orientations_en.pdf",
        "filename": "edps_genai_orientations_2024.pdf",
        "subdir": "eu/edps",
        "category": "ai",
        "year": 2024,
        "source_org": "European Data Protection Supervisor (EDPS)",
        "source_url": "https://www.edps.europa.eu/press-publications/publications/guidelines/generative-ai-orientations_en",
    },
    {
        "id": "edps_digital_ethics_2018",
        "title": "EDPS Ethics Advisory Group Report (2018)",
        "url": "https://www.edps.europa.eu/sites/default/files/publication/18-01-25_eag_report_en.pdf",
        "filename": "edps_digital_ethics_2018.pdf",
        "subdir": "eu/edps",
        "category": "ethics",
        "year": 2018,
        "source_org": "European Data Protection Supervisor (EDPS)",
        "source_url": "https://www.edps.europa.eu/data-protection/our-work/publications/ethical-framework/ethics-advisory-group-report-2018_en",
    },
]
|
|
|
|
# --- DSFA Muss-Listen (Bund + 16 Laender) ---
|
|
DSFA_MUSSLISTEN = [
|
|
{
|
|
"id": "dsfa_bfdi_bund",
|
|
"title": "BfDI — DSFA-Liste Art. 35(4) fuer oeffentliche Stellen des Bundes",
|
|
"url": "https://www.bfdi.bund.de/SharedDocs/Downloads/DE/Muster/Liste_VerarbeitungsvorgaengeArt35.pdf?__blob=publicationFile&v=7",
|
|
"filename": "dsfa_bfdi_bund.pdf",
|
|
"subdir": "de/bfdi",
|
|
"category": "dsfa_mussliste",
|
|
"year": 2018,
|
|
"source_org": "Bundesbeauftragte fuer den Datenschutz und die Informationsfreiheit (BfDI)",
|
|
"source_url": "https://www.bfdi.bund.de/DE/Datenschutz/DatenschutzGVO/Hilfsmittel/DSFA/DSFA-node.html",
|
|
"collection": COLLECTION_DSFA,
|
|
},
|
|
{
|
|
"id": "dsfa_dsk_gemeinsam",
|
|
"title": "DSK — Gemeinsame DSFA Muss-Liste (nicht-oeffentlicher Bereich)",
|
|
"url": "https://datenschutz.hessen.de/sites/datenschutz.hessen.de/files/2022-11/dsfa_muss_liste_dsk_de.pdf",
|
|
"filename": "dsfa_dsk_gemeinsam.pdf",
|
|
"subdir": "de/dsk",
|
|
"category": "dsfa_mussliste",
|
|
"year": 2018,
|
|
"source_org": "Datenschutzkonferenz (DSK)",
|
|
"source_url": "https://datenschutz.hessen.de/datenschutz/it-und-datenschutz/datenschutz-folgenabschaetzung",
|
|
"collection": COLLECTION_DSFA,
|
|
},
|
|
{
|
|
"id": "dsfa_bw",
|
|
"title": "Baden-Wuerttemberg — DSFA Muss-Liste",
|
|
"url": "https://www.baden-wuerttemberg.datenschutz.de/wp-content/uploads/2018/05/Liste-von-Verarbeitungsvorg%C3%A4ngen-nach-Art.-35-Abs.-4-DS-GVO-LfDI-BW.pdf",
|
|
"filename": "dsfa_bw.pdf",
|
|
"subdir": "de/dpas/bw",
|
|
"category": "dsfa_mussliste",
|
|
"year": 2018,
|
|
"source_org": "LfDI Baden-Wuerttemberg",
|
|
"source_url": "https://www.baden-wuerttemberg.datenschutz.de/datenschutz-folgenabschaetzung/",
|
|
"collection": COLLECTION_DSFA,
|
|
},
|
|
{
|
|
"id": "dsfa_by",
|
|
"title": "Bayern (BayLDA) — DSFA Muss-Liste (DSK)",
|
|
"url": "https://www.lda.bayern.de/media/dsfa_muss_liste_dsk_de.pdf",
|
|
"filename": "dsfa_by.pdf",
|
|
"subdir": "de/dpas/by",
|
|
"category": "dsfa_mussliste",
|
|
"year": 2018,
|
|
"source_org": "Bayerisches Landesamt fuer Datenschutzaufsicht (BayLDA)",
|
|
"source_url": "https://www.lda.bayern.de/de/datenschutz_eu.html",
|
|
"collection": COLLECTION_DSFA,
|
|
},
|
|
{
|
|
"id": "dsfa_be_noe",
|
|
"title": "Berlin — DSFA-Liste nicht-oeffentlicher Bereich",
|
|
"url": "https://www.datenschutz-berlin.de/fileadmin/user_upload/pdf/dokumente/2018-BlnBDI_DSFA-nicht-oeffentlich.pdf",
|
|
"filename": "dsfa_be_noe.pdf",
|
|
"subdir": "de/dpas/be",
|
|
"category": "dsfa_mussliste",
|
|
"year": 2018,
|
|
"source_org": "Berliner Beauftragte fuer Datenschutz und Informationsfreiheit (BlnBDI)",
|
|
"source_url": "https://www.datenschutz-berlin.de/infothek/publikationen",
|
|
"collection": COLLECTION_DSFA,
|
|
},
|
|
{
|
|
"id": "dsfa_be_oe",
|
|
"title": "Berlin — DSFA-Liste oeffentlicher Bereich",
|
|
"url": "https://www.datenschutz-berlin.de/fileadmin/user_upload/pdf/dokumente/2018-BlnBDI_DSFA-oeffentlich.pdf",
|
|
"filename": "dsfa_be_oe.pdf",
|
|
"subdir": "de/dpas/be",
|
|
"category": "dsfa_mussliste",
|
|
"year": 2018,
|
|
"source_org": "Berliner Beauftragte fuer Datenschutz und Informationsfreiheit (BlnBDI)",
|
|
"source_url": "https://www.datenschutz-berlin.de/infothek/publikationen",
|
|
"collection": COLLECTION_DSFA,
|
|
},
|
|
{
|
|
"id": "dsfa_bb_oe",
|
|
"title": "Brandenburg — DSFA-Liste oeffentlicher Bereich",
|
|
"url": "https://www.lda.brandenburg.de/sixcms/media.php/9/DSFA-Liste_%C3%B6ffentlicher_Bereich.pdf",
|
|
"filename": "dsfa_bb_oe.pdf",
|
|
"subdir": "de/dpas/bb",
|
|
"category": "dsfa_mussliste",
|
|
"year": 2018,
|
|
"source_org": "LDA Brandenburg",
|
|
"source_url": "https://www.lda.brandenburg.de/lda/de/datenschutz/datenschutz-folgenabschaetzung/",
|
|
"collection": COLLECTION_DSFA,
|
|
},
|
|
{
|
|
"id": "dsfa_bb_noe",
|
|
"title": "Brandenburg — DSFA-Liste nicht-oeffentlicher Bereich",
|
|
"url": "https://www.lda.brandenburg.de/sixcms/media.php/9/DSFA-Liste_nicht_%C3%B6ffentlicher_Bereich.pdf",
|
|
"filename": "dsfa_bb_noe.pdf",
|
|
"subdir": "de/dpas/bb",
|
|
"category": "dsfa_mussliste",
|
|
"year": 2018,
|
|
"source_org": "LDA Brandenburg",
|
|
"source_url": "https://www.lda.brandenburg.de/lda/de/datenschutz/datenschutz-folgenabschaetzung/",
|
|
"collection": COLLECTION_DSFA,
|
|
},
|
|
{
|
|
"id": "dsfa_hb",
|
|
"title": "Bremen — DSFA Muss-Liste",
|
|
"url": "https://www.datenschutz.bremen.de/sixcms/media.php/13/DSFA%20Muss-Liste%20LfDI%20HB.pdf",
|
|
"filename": "dsfa_hb.pdf",
|
|
"subdir": "de/dpas/hb",
|
|
"category": "dsfa_mussliste",
|
|
"year": 2018,
|
|
"source_org": "LfDI Bremen",
|
|
"source_url": "https://www.datenschutz.bremen.de/datenschutz/ds-gvo/datenschutz-folgenabschaetzung-18544",
|
|
"collection": COLLECTION_DSFA,
|
|
},
|
|
{
|
|
"id": "dsfa_hh_noe",
|
|
"title": "Hamburg — DSFA Muss-Liste nicht-oeffentlicher Bereich",
|
|
"url": "https://datenschutz-hamburg.de/fileadmin/user_upload/HmbBfDI/Datenschutz/Informationen/DSFA_Muss-Liste_fuer_den_nicht-oeffentlicher_Bereich_-_Stand_17.10.2018.pdf",
|
|
"filename": "dsfa_hh_noe.pdf",
|
|
"subdir": "de/dpas/hh",
|
|
"category": "dsfa_mussliste",
|
|
"year": 2018,
|
|
"source_org": "HmbBfDI Hamburg",
|
|
"source_url": "https://datenschutz-hamburg.de/datenschutz-informationen",
|
|
"collection": COLLECTION_DSFA,
|
|
},
|
|
{
|
|
"id": "dsfa_hh_oe",
|
|
"title": "Hamburg — DSFA Muss-Liste oeffentlicher Bereich",
|
|
"url": "https://datenschutz-hamburg.de/fileadmin/user_upload/HmbBfDI/Datenschutz/Informationen/Liste_Art_35-4_DSGVO_HmbBfDI-oeffentlicher_Bereich_v2.0a.pdf",
|
|
"filename": "dsfa_hh_oe.pdf",
|
|
"subdir": "de/dpas/hh",
|
|
"category": "dsfa_mussliste",
|
|
"year": 2018,
|
|
"source_org": "HmbBfDI Hamburg",
|
|
"source_url": "https://datenschutz-hamburg.de/datenschutz-informationen",
|
|
"collection": COLLECTION_DSFA,
|
|
},
|
|
{
|
|
"id": "dsfa_mv",
|
|
"title": "Mecklenburg-Vorpommern — DSFA Muss-Liste oeffentlicher Bereich",
|
|
"url": "https://www.datenschutz-mv.de/static/DS/Dateien/DS-GVO/HilfsmittelzurUmsetzung/MV-DSFA-Muss-Liste-Oeffentlicher-Bereich.pdf",
|
|
"filename": "dsfa_mv.pdf",
|
|
"subdir": "de/dpas/mv",
|
|
"category": "dsfa_mussliste",
|
|
"year": 2018,
|
|
"source_org": "LfDI Mecklenburg-Vorpommern",
|
|
"source_url": "https://www.datenschutz-mv.de/datenschutz/fuer-verwaltungen/Datenschutz-Folgenabschaetzung/",
|
|
"collection": COLLECTION_DSFA,
|
|
},
|
|
{
|
|
"id": "dsfa_ni",
|
|
"title": "Niedersachsen — DSFA Muss-Liste",
|
|
"url": "https://www.lfd.niedersachsen.de/download/131098/Liste_von_Verarbeitungsvorgaengen_nach_Art._35_Abs._4_DS-GVO.pdf",
|
|
"filename": "dsfa_ni.pdf",
|
|
"subdir": "de/dpas/ni",
|
|
"category": "dsfa_mussliste",
|
|
"year": 2018,
|
|
"source_org": "LfD Niedersachsen",
|
|
"source_url": "https://www.lfd.niedersachsen.de/dsgvo/liste_von_verarbeitungsvorgangen_nach_art_35_abs_4_ds_gvo/muss-listen-zur-datenschutz-folgenabschatzung-179663.html",
|
|
"collection": COLLECTION_DSFA,
|
|
},
|
|
{
|
|
"id": "dsfa_rp",
|
|
"title": "Rheinland-Pfalz — DSFA Muss-Liste oeffentliche Stellen",
|
|
"url": "https://www.datenschutz.rlp.de/fileadmin/datenschutz/Dokumente/Orientierungshilfen/DSFA_-_Muss-Liste_RLP_OE.pdf",
|
|
"filename": "dsfa_rp.pdf",
|
|
"subdir": "de/dpas/rp",
|
|
"category": "dsfa_mussliste",
|
|
"year": 2018,
|
|
"source_org": "LfDI Rheinland-Pfalz",
|
|
"source_url": "https://www.datenschutz.rlp.de/themen/datenschutz-folgenabschaetzung",
|
|
"collection": COLLECTION_DSFA,
|
|
},
|
|
{
|
|
"id": "dsfa_sl",
|
|
"title": "Saarland — DSFA Muss-Liste (DSK)",
|
|
"url": "https://www.datenschutz.saarland.de/fileadmin/user_upload/uds/alle_Dateien_und_Ordner_bis_2025/Download/dsfa_muss_liste_dsk_de.pdf",
|
|
"filename": "dsfa_sl.pdf",
|
|
"subdir": "de/dpas/sl",
|
|
"category": "dsfa_mussliste",
|
|
"year": 2018,
|
|
"source_org": "UDS Saarland",
|
|
"source_url": "https://www.datenschutz.saarland.de/",
|
|
"collection": COLLECTION_DSFA,
|
|
},
|
|
{
|
|
"id": "dsfa_sn",
|
|
"title": "Sachsen — DSFA-Ergaenzungsliste",
|
|
"url": "https://www.datenschutz.sachsen.de/download/Datenschutz-Folgenabschaetzung_Ergaenzung_Liste_V1_20180606.pdf",
|
|
"filename": "dsfa_sn.pdf",
|
|
"subdir": "de/dpas/sn",
|
|
"category": "dsfa_mussliste",
|
|
"year": 2018,
|
|
"source_org": "Saechsischer Datenschutzbeauftragter",
|
|
"source_url": "https://www.datenschutz.sachsen.de/datenschutz-folgenabschaetzung-4156.html",
|
|
"collection": COLLECTION_DSFA,
|
|
},
|
|
{
|
|
"id": "dsfa_st_oe",
|
|
"title": "Sachsen-Anhalt — DSFA-Liste oeffentlicher Bereich",
|
|
"url": "https://datenschutz.sachsen-anhalt.de/fileadmin/Bibliothek/Landesaemter/LfD/Informationen/Internationales/Datenschutz-Grundverordnung/Liste_DSFA/Art-35-Liste-oeffentlicher_Bereich.pdf",
|
|
"filename": "dsfa_st_oe.pdf",
|
|
"subdir": "de/dpas/st",
|
|
"category": "dsfa_mussliste",
|
|
"year": 2018,
|
|
"source_org": "LfD Sachsen-Anhalt",
|
|
"source_url": "https://datenschutz.sachsen-anhalt.de/",
|
|
"collection": COLLECTION_DSFA,
|
|
},
|
|
{
|
|
"id": "dsfa_st_noe",
|
|
"title": "Sachsen-Anhalt — DSFA-Liste nicht-oeffentlicher Bereich",
|
|
"url": "https://datenschutz.sachsen-anhalt.de/fileadmin/Bibliothek/Landesaemter/LfD/Informationen/Internationales/Datenschutz-Grundverordnung/Liste_DSFA/Art-35-Liste-nichtoeffentlicher_Bereich.pdf",
|
|
"filename": "dsfa_st_noe.pdf",
|
|
"subdir": "de/dpas/st",
|
|
"category": "dsfa_mussliste",
|
|
"year": 2018,
|
|
"source_org": "LfD Sachsen-Anhalt",
|
|
"source_url": "https://datenschutz.sachsen-anhalt.de/",
|
|
"collection": COLLECTION_DSFA,
|
|
},
|
|
{
|
|
"id": "dsfa_sh",
|
|
"title": "Schleswig-Holstein — DSFA Muss-Liste",
|
|
"url": "https://www.datenschutzzentrum.de/uploads/datenschutzfolgenabschaetzung/20180525_LfD-SH_DSFA_Muss-Liste_V1.0.pdf",
|
|
"filename": "dsfa_sh.pdf",
|
|
"subdir": "de/dpas/sh",
|
|
"category": "dsfa_mussliste",
|
|
"year": 2018,
|
|
"source_org": "ULD Schleswig-Holstein",
|
|
"source_url": "https://www.datenschutzzentrum.de/datenschutzfolgenabschaetzung/",
|
|
"collection": COLLECTION_DSFA,
|
|
},
|
|
{
|
|
"id": "dsfa_th",
|
|
"title": "Thueringen — Vorlaeufige DSFA Muss-Liste",
|
|
"url": "https://tlfdi.de/fileadmin/tlfdi/datenschutz/dsfa_muss-liste_04_07_18.pdf",
|
|
"filename": "dsfa_th.pdf",
|
|
"subdir": "de/dpas/th",
|
|
"category": "dsfa_mussliste",
|
|
"year": 2018,
|
|
"source_org": "TLfDI Thueringen",
|
|
"source_url": "https://tlfdi.de/",
|
|
"collection": COLLECTION_DSFA,
|
|
},
|
|
]
|
|
|
|
# --- DSK short papers ("Kurzpapiere"); license grade GREEN: public
#     supervisory-authority documents, commercially usable ---
# Every entry follows the same URL/filename scheme, so the list is generated
# from a table of (number, id slug, topic, year).
# NOTE(review): the topic titles were not checked against the DSK's published
# list of Kurzpapiere — confirm they match the actual documents.
DSK_KURZPAPIERE = [
    {
        "id": f"dsk_kp_{number:02d}_{slug}",
        "title": f"DSK Kurzpapier Nr. {number} — {topic}",
        "url": f"https://www.datenschutzkonferenz-online.de/media/kp/dsk_kpnr_{number}.pdf",
        "filename": f"dsk_kpnr_{number}.pdf",
        "subdir": "de/dsk/kurzpapiere",
        "category": "dsk_kurzpapier",
        "year": year,
        "source_org": "Datenschutzkonferenz (DSK)",
        "source_url": "https://www.datenschutzkonferenz-online.de/kurzpapiere.html",
        "license_grade": "green",
    }
    for number, slug, topic, year in (
        (1, "vvt", "Verzeichnis von Verarbeitungstaetigkeiten (Art. 30)", 2018),
        (2, "einwilligung", "Einwilligung (Art. 7)", 2018),
        (3, "zweckbindung", "Zweckbindung und Zweckaenderung", 2018),
        (4, "datenminimierung", "Datenminimierung", 2018),
        (5, "dsfa", "Datenschutz-Folgenabschaetzung (Art. 35)", 2018),
        (6, "auskunftsrecht", "Auskunftsrecht (Art. 15)", 2018),
        (7, "marktortprinzip", "Marktortprinzip (Art. 3)", 2018),
        (8, "datenportabilitaet", "Datenportabilitaet (Art. 20)", 2018),
        (9, "sanktionen", "Sanktionen, Geldbussen und Schadenersatz", 2018),
        (10, "informationspflichten", "Informationspflichten (Art. 12-14)", 2018),
        (11, "loeschung", "Recht auf Loeschung (Art. 17)", 2018),
        (12, "dsb", "Datenschutzbeauftragte (Art. 37-39)", 2018),
        (13, "avv", "Auftragsverarbeitung (Art. 28)", 2018),
        (14, "beschaeftigte", "Beschaeftigtendatenschutz", 2018),
        (15, "videoueberwachung", "Videoueberwachung nach DSGVO", 2018),
        (16, "gemeinsame_verantwortlichkeit", "Gemeinsame Verantwortlichkeit (Art. 26)", 2018),
        (17, "art9", "Besondere Kategorien personenbezogener Daten (Art. 9)", 2018),
        (18, "risiko", "Risiko fuer die Rechte und Freiheiten natuerlicher Personen", 2018),
        (19, "unabhaengigkeit", "Unabhaengigkeit der Datenschutzaufsicht", 2018),
        (20, "evaluierung", "Evaluierung der DSGVO", 2020),
    )
]
|
|
|
|
# --- DSK guidance papers ("Orientierungshilfen"); license grade YELLOW:
#     publicly available, license not stated explicitly ---
DSK_ORIENTIERUNGSHILFEN = [
    dict(
        id="dsk_sdm_v31",
        title="DSK — Standard-Datenschutzmodell (SDM) V3.1",
        url="https://www.datenschutzkonferenz-online.de/media/oh/SDM-Methode-V31.pdf",
        filename="SDM-Methode-V31.pdf",
        subdir="de/dsk/orientierungshilfen",
        category="dsk_orientierungshilfe",
        year=2024,
        source_org="Datenschutzkonferenz (DSK)",
        source_url="https://www.datenschutzkonferenz-online.de/sdm.html",
        license_grade="yellow",
    ),
    dict(
        id="dsk_oh_email_verschl",
        title="DSK Orientierungshilfe — E-Mail-Verschluesselung",
        url="https://www.datenschutzkonferenz-online.de/media/oh/oh_e_mail_verschluesselung.pdf",
        filename="oh_e_mail_verschluesselung.pdf",
        subdir="de/dsk/orientierungshilfen",
        category="dsk_orientierungshilfe",
        year=2021,
        source_org="Datenschutzkonferenz (DSK)",
        source_url="https://www.datenschutzkonferenz-online.de/orientierungshilfen.html",
        license_grade="yellow",
    ),
    dict(
        id="dsk_oh_telemedien",
        title="DSK Orientierungshilfe — Telemedien (Webseiten/Apps)",
        url="https://www.datenschutzkonferenz-online.de/media/oh/oh_telemedien.pdf",
        filename="oh_telemedien.pdf",
        subdir="de/dsk/orientierungshilfen",
        category="dsk_orientierungshilfe",
        year=2022,
        source_org="Datenschutzkonferenz (DSK)",
        source_url="https://www.datenschutzkonferenz-online.de/orientierungshilfen.html",
        license_grade="yellow",
    ),
]
|
|
|
|
# --- BfDI practice documents; license grade YELLOW: publicly available,
#     license not stated explicitly ---
BFDI_PRAXIS = [
    dict(
        id="bfdi_loeschkonzept",
        title="BfDI — Loeschkonzept (2021)",
        url="https://www.bfdi.bund.de/SharedDocs/Downloads/DE/Broschueren/INFO5.pdf?__blob=publicationFile&v=9",
        filename="2021_Loeschkonzept-BfDI.pdf",
        subdir="de/bfdi/praxis",
        category="bfdi_praxis",
        year=2021,
        source_org="Bundesbeauftragte fuer den Datenschutz und die Informationsfreiheit (BfDI)",
        source_url="https://www.bfdi.bund.de/DE/Fachthemen/Inhalte/Loeschen/loeschen-node.html",
        license_grade="yellow",
    ),
]
|
|
|
|
# --- BayLDA / BayLfD documents; license grade YELLOW: publicly available ---
# NOTE(review): the second entry spells its id and subdir "baylfb" although the
# authority is abbreviated "BayLfD" — presumably a typo; left untouched because
# the id is used as a manifest key.
BAYLDA_DOCS = [
    dict(
        id="baylda_tom_checkliste",
        title="BayLDA — Checkliste Technisch-Organisatorische Massnahmen (TOM)",
        url="https://www.lda.bayern.de/media/baylda_checkliste_tom.pdf",
        filename="baylda_checkliste_tom.pdf",
        subdir="de/baylda",
        category="baylda_praxis",
        year=2019,
        source_org="Bayerisches Landesamt fuer Datenschutzaufsicht (BayLDA)",
        source_url="https://www.lda.bayern.de/de/datenschutz_eu.html",
        license_grade="yellow",
    ),
    dict(
        id="baylfb_oh_loeschung",
        title="BayLfD — Orientierungshilfe Loeschung",
        url="https://www.datenschutz-bayern.de/datenschutzreform2018/OH_Loeschung.pdf",
        filename="OH_Loeschung.pdf",
        subdir="de/baylfb",
        category="baylda_praxis",
        year=2019,
        source_org="Bayerischer Landesbeauftragter fuer den Datenschutz (BayLfD)",
        source_url="https://www.datenschutz-bayern.de/datenschutzreform2018/",
        license_grade="yellow",
    ),
]
|
|
|
|
# Combined registry
|
|
# Flat list of every document entry, in the order the per-source lists are
# concatenated (EU sources first, then the German ones).
REGISTRY = [
    *WP29_ENDORSED,
    *EDPB_GUIDELINES,
    *EDPS_GUIDANCE,
    *DSFA_MUSSLISTEN,
    *DSK_KURZPAPIERE,
    *DSK_ORIENTIERUNGSHILFEN,
    *BFDI_PRAXIS,
    *BAYLDA_DOCS,
]
|
|
|
|
# Mapping old filenames (from pdfs/) to new entry IDs for migration
|
|
# Built from (legacy filename stem, registry entry id) pairs; the ".pdf"
# suffix of the legacy filename is appended here. Pair order is the dict's
# insertion order.
OLD_FILENAME_MAP = {
    f"{legacy_stem}.pdf": entry_id
    for legacy_stem, entry_id in (
        ("edpb_wp248_dpia", "wp248_dpia"),
        ("edpb_wp243_dpo", "wp243_dpo"),
        ("edpb_wp260_transparency", "wp260_transparency"),
        ("edpb_wp250_breach", "wp250_breach"),
        ("edpb_wp259_consent", "wp259_consent"),
        ("edpb_wp242_portability", "wp242_portability"),
        ("edpb_wp251_profiling", "wp251_profiling"),
        ("edpb_consent_05_2020", "edpb_consent_05_2020"),
        ("edpb_dpbd_04_2019", "edpb_dpbd_04_2019"),
        ("edpb_transfers_07_2020", "edpb_transfers_01_2020"),
        ("edpb_gl_7_2020", "edpb_controller_processor_07_2020"),
        ("edpb_breach_09_2022", "edpb_breach_09_2022"),
        ("edpb_access_01_2022", "edpb_access_01_2022"),
        ("edpb_gl_04_2022", "edpb_fines_04_2022"),
        ("edpb_article48_02_2024", "edpb_article48_02_2024"),
        ("edpb_eprivacy_02_2023", "edpb_eprivacy_02_2023"),
        ("edpb_legitimate_interest", "edpb_legitimate_interest_01_2024"),
        ("edpb_dark_patterns_03_2022", "edpb_dark_patterns_03_2022"),
        ("edpb_social_media_08_2020", "edpb_social_media_08_2020"),
        ("edpb_gl_3_2019", "edpb_video_03_2019"),
        ("edpb_connected_vehicles_01_2020", "edpb_connected_vehicles_01_2020"),
        ("edpb_vva_02_2021", "edpb_vva_02_2021"),
        ("edpb_cookie_taskforce_2023", "edpb_cookie_taskforce_2023"),
        ("edpb_certification_01_2019", "edpb_certification_01_2018"),
        ("edpb_bcr_01_2022", "edpb_bcr_01_2022"),
        ("edpb_rtbf_05_2019", "edpb_rtbf_05_2019"),
        ("edpb_dpia_list_recommendation", "edpb_dpia_list_recommendation"),
        ("edpb_health_data_03_2020", "edpb_health_data_03_2020"),
        ("edpb_geolocation_04_2020", "edpb_geolocation_04_2020"),
        ("edpb_gl_2_2019", "edpb_legal_basis_02_2019"),
        ("edps_dpia_list", "edps_dpia_list"),
        ("edps_genai_orientations_2024", "edps_genai_orientations_2024"),
        ("edps_digital_ethics_2018", "edps_digital_ethics_2018"),
    )
}
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def ts() -> str:
    """Return the current UTC time formatted as ``YYYY-MM-DDTHH:MM:SSZ``."""
    now_utc = datetime.now(timezone.utc)
    return f"{now_utc:%Y-%m-%dT%H:%M:%SZ}"
|
|
|
|
|
|
def sha256_file(path: Path) -> str:
    """Return the hex SHA-256 digest of the file at *path*.

    The file is read in 8 KiB blocks so it is never held in memory as a whole.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        while True:
            block = stream.read(8192)
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()
|
|
|
|
|
|
def load_manifest() -> dict:
    """Load the JSON manifest from ``MANIFEST_PATH``.

    Returns:
        The parsed manifest, or — when the file does not exist — a fresh
        manifest with today's ``<date>.1`` version, the current timestamp and
        an empty ``documents`` mapping.
    """
    if not MANIFEST_PATH.exists():
        return {
            "version": datetime.now().strftime("%Y-%m-%d.1"),
            "last_updated": ts(),
            "documents": {},
        }
    with open(MANIFEST_PATH) as handle:
        return json.load(handle)
|
|
|
|
|
|
def save_manifest(manifest: dict) -> None:
    """Stamp *manifest* with a new timestamp and version, then write it as JSON.

    The version has the form ``<YYYY-MM-DD>.<n>``: ``n`` is incremented when
    the stored version already carries today's (local) date, otherwise it
    restarts at 1. *manifest* is modified in place. The parent directory of
    ``MANIFEST_PATH`` is created if needed.
    """
    manifest["last_updated"] = ts()

    day = datetime.now().strftime("%Y-%m-%d")
    previous = manifest.get("version", "")
    revision = 1
    if previous.startswith(f"{day}."):
        # Same day as the stored version: bump its trailing counter.
        revision = int(previous.rsplit(".", 1)[-1]) + 1
    manifest["version"] = f"{day}.{revision}"

    MANIFEST_PATH.parent.mkdir(parents=True, exist_ok=True)
    with open(MANIFEST_PATH, "w") as handle:
        json.dump(manifest, handle, indent=2, ensure_ascii=False)
|
|
|
|
|
|
def log(msg):
    """Print *msg* to stdout, prefixed with the local time as ``[HH:MM:SS]``."""
    stamp = datetime.now().strftime("%H:%M:%S")
    print(f"[{stamp}] {msg}")
|
|
def ok(msg):
    """Print *msg* to stdout with a ``[HH:MM:SS]`` prefix and a check mark."""
    stamp = datetime.now().strftime("%H:%M:%S")
    print(f"[{stamp}] \u2713 {msg}")
|
|
def warn(msg):
    """Print *msg* to stderr with a ``[HH:MM:SS]`` prefix and a warning sign."""
    stamp = datetime.now().strftime("%H:%M:%S")
    print(f"[{stamp}] \u26a0 {msg}", file=sys.stderr)
|
|
def fail(msg):
    """Print *msg* to stderr with a ``[HH:MM:SS]`` prefix and a cross mark."""
    stamp = datetime.now().strftime("%H:%M:%S")
    print(f"[{stamp}] \u2717 {msg}", file=sys.stderr)
|
|
|
|
|
|
def is_valid_pdf(path: Path) -> bool:
    """Return True when the file at *path* begins with the ``%PDF-`` magic bytes.

    Used to detect HTML pages (e.g. a Cloudflare block page) that were saved
    under a ``.pdf`` name.
    """
    with open(path, "rb") as stream:
        return stream.read(5) == b"%PDF-"
|
|
|
|
|
|
def extract_text_pymupdf(pdf_path: Path) -> str:
    """Extract the text of a PDF locally with PyMuPDF.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages that contain non-whitespace text, joined by
        blank lines. An empty string when PyMuPDF is not available
        (``HAS_PYMUPDF`` is false).
    """
    if not HAS_PYMUPDF:
        return ""
    doc = fitz.open(str(pdf_path))
    try:
        page_texts = [page.get_text() for page in doc]
    finally:
        # Release the document handle even if text extraction raises.
        doc.close()
    return "\n\n".join(text for text in page_texts if text.strip())
|
|
|
|
|
|
def upload_text_as_file(text: str, filename: str, collection: str,
                        data_type: str, use_case: str, year: str,
                        metadata: dict) -> "requests.Response":
    """Upload extracted text as a ``.txt`` file to the RAG API.

    Args:
        text: Text to upload; sent UTF-8 encoded.
        filename: Name of the source PDF. A trailing ``.pdf`` is replaced by
            ``.txt`` for the upload; other names are sent unchanged.
        collection: Target collection name, passed through as a form field.
        data_type: Passed through as the ``data_type`` form field.
        use_case: Passed through as the ``use_case`` form field.
        year: Passed through as the ``year`` form field.
        metadata: Serialized to JSON and sent as ``metadata_json``.

    Returns:
        The HTTP response object of the POST to ``RAG_URL``; callers inspect
        its status code and JSON body themselves.
    """
    # Swap only the file extension; a ".pdf" elsewhere in the name is kept.
    if filename.endswith(".pdf"):
        txt_filename = filename[: -len(".pdf")] + ".txt"
    else:
        txt_filename = filename

    form_fields = {
        "collection": collection,
        "data_type": data_type,
        "use_case": use_case,
        "year": year,
        "chunk_strategy": "recursive",
        "chunk_size": "512",
        "chunk_overlap": "50",
        "metadata_json": json.dumps(metadata),
    }
    resp = requests.post(
        RAG_URL,
        files={"file": (txt_filename, text.encode("utf-8"), "text/plain")},
        data=form_fields,
        timeout=TIMEOUT,
        # NOTE(review): TLS certificate verification is disabled — confirm the
        # RAG endpoint is only reached over a trusted network.
        verify=False,
    )
    return resp
|
|
|
|
|
|
def entry_path(entry: dict) -> Path:
    """Return the on-disk location of a registry entry below ``SOURCES_DIR``.

    The path is ``SOURCES_DIR / <subdir> / <filename>``.
    """
    return SOURCES_DIR.joinpath(entry["subdir"], entry["filename"])
|
|
|
|
|
|
def entry_collection(entry: dict) -> str:
    """Return the collection a registry entry is ingested into.

    Uses the entry's own ``"collection"`` value when present and falls back to
    the module default ``COLLECTION`` otherwise.
    """
    if "collection" in entry:
        return entry["collection"]
    return COLLECTION
|
|
|
|
|
|
def registry_by_id() -> dict:
    """Return a lookup of ``REGISTRY`` entries keyed by their ``"id"``.

    A new dict is built on every call; if an id occurs more than once, the
    later entry wins.
    """
    lookup = {}
    for item in REGISTRY:
        lookup[item["id"]] = item
    return lookup
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# CSV Manifest Export
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def export_csv_manifests() -> None:
    """Write one CSV manifest per source list plus a unified manifest.

    All files are written into ``MANIFESTS_DIR`` (created if missing). The
    per-list files carry the basic entry fields; the unified file
    ``eu_de_privacy_manifest.csv`` covers the whole ``REGISTRY`` and adds the
    license grade and target collection.
    """
    MANIFESTS_DIR.mkdir(parents=True, exist_ok=True)

    per_list_columns = [
        "id", "title", "url", "filename", "subdir",
        "category", "year", "source_org", "source_url",
    ]
    per_list_exports = (
        ("eu_wp29_endorsed.csv", WP29_ENDORSED),
        ("eu_edpb_guidelines.csv", EDPB_GUIDELINES),
        ("eu_edps_guidance.csv", EDPS_GUIDANCE),
        ("de_dsfa_lists.csv", DSFA_MUSSLISTEN),
        ("de_dsk_kurzpapiere.csv", DSK_KURZPAPIERE),
        ("de_dsk_orientierungshilfen.csv", DSK_ORIENTIERUNGSHILFEN),
        ("de_bfdi_praxis.csv", BFDI_PRAXIS),
        ("de_baylda_docs.csv", BAYLDA_DOCS),
    )
    for csv_name, entries in per_list_exports:
        path = MANIFESTS_DIR / csv_name
        with open(path, "w", newline="", encoding="utf-8") as out:
            # Keys that are not listed as columns (e.g. license_grade) are dropped.
            sheet = csv.DictWriter(out, fieldnames=per_list_columns, extrasaction="ignore")
            sheet.writeheader()
            sheet.writerows(entries)
        ok(f"CSV manifest: {path} ({len(entries)} entries)")

    # Unified manifest with license grades
    unified_path = MANIFESTS_DIR / "eu_de_privacy_manifest.csv"
    unified_columns = [
        "id", "topic", "doc_type", "url", "license_grade",
        "source_org", "source_url", "collection", "notes",
    ]
    with open(unified_path, "w", newline="", encoding="utf-8") as out:
        sheet = csv.DictWriter(out, fieldnames=unified_columns, extrasaction="ignore")
        sheet.writeheader()
        for item in REGISTRY:
            sheet.writerow({
                "id": item["id"],
                # Both columns are filled from the entry's category.
                "topic": item.get("category", ""),
                "doc_type": item.get("category", ""),
                "url": item["url"],
                # Entries without an explicit grade are exported as "green".
                "license_grade": item.get("license_grade", "green"),
                "source_org": item.get("source_org", ""),
                "source_url": item.get("source_url", ""),
                "collection": entry_collection(item),
                "notes": item["title"],
            })
    ok(f"Unified manifest: {unified_path} ({len(REGISTRY)} entries)")
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Migration: pdfs/ -> sources/
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def migrate_from_pdfs(manifest: dict) -> dict:
    """Copy PDFs from the legacy ``pdfs/`` directory into the ``sources/`` tree.

    Each ``*.pdf`` in ``WORK_DIR / "pdfs"`` is mapped to a registry entry via
    ``OLD_FILENAME_MAP`` and copied (metadata preserved) to that entry's
    target path; the legacy file is left in place. Files without a mapping,
    with an unknown entry id, or whose target already exists are skipped.
    A manifest record marked as not yet ingested is written for every copied
    file.

    Args:
        manifest: Manifest dict; its ``"documents"`` mapping is updated in place.

    Returns:
        The same *manifest* object.
    """
    old_dir = WORK_DIR / "pdfs"
    if not old_dir.exists():
        log("No pdfs/ directory found — nothing to migrate")
        return manifest

    # Build the id -> entry lookup once instead of once per file.
    rmap = registry_by_id()

    moved = 0
    for old_file in sorted(old_dir.glob("*.pdf")):
        fname = old_file.name
        entry_id = OLD_FILENAME_MAP.get(fname)
        if not entry_id:
            warn(f"No registry mapping for {fname} — skipping migration")
            continue

        entry = rmap.get(entry_id)
        if not entry:
            warn(f"Entry {entry_id} not in registry — skipping")
            continue

        new_path = entry_path(entry)
        if new_path.exists():
            # Already migrated
            continue

        new_path.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(str(old_file), str(new_path))

        sha = sha256_file(new_path)
        manifest["documents"][entry_id] = {
            "filename": entry["filename"],
            "subdir": entry["subdir"],
            "sha256": sha,
            "downloaded_at": ts(),
            "file_size": new_path.stat().st_size,
            "source_url": entry["url"],
            "source_org": entry["source_org"],
            "source_page": entry.get("source_url", ""),
            "ingested": False,
            "ingested_at": None,
            "collection": entry_collection(entry),
        }
        ok(f"Migrated: {fname} -> sources/{entry['subdir']}/{entry['filename']}")
        moved += 1

    log(f"Migration: {moved} files moved to sources/")
    return manifest
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Download
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def download_new(manifest: dict) -> dict:
    """Download every registry PDF that is not yet present on disk.

    A target file larger than 100 bytes counts as already downloaded; it is
    only (re-)registered in the manifest when its record is missing. Fresh
    downloads are rejected when the body is smaller than 1000 bytes, when the
    content type is HTML, or when the body does not start with ``%PDF-``.
    Any exception during a download is reported and counted as a failure.

    Args:
        manifest: Manifest dict; its ``"documents"`` mapping is updated in place.

    Returns:
        The same *manifest* object.
    """
    def manifest_record(item: dict, digest: str, size: int) -> dict:
        # Record for a file that is on disk but not ingested yet.
        return {
            "filename": item["filename"],
            "subdir": item["subdir"],
            "sha256": digest,
            "downloaded_at": ts(),
            "file_size": size,
            "source_url": item["url"],
            "source_org": item["source_org"],
            "source_page": item.get("source_url", ""),
            "ingested": False,
            "ingested_at": None,
            "collection": entry_collection(item),
        }

    n_new = n_existing = n_failed = 0

    for entry in REGISTRY:
        doc_id = entry["id"]
        target = entry_path(entry)

        # Already on disk?
        if target.exists() and target.stat().st_size > 100:
            if doc_id not in manifest["documents"]:
                digest = sha256_file(target)
                manifest["documents"][doc_id] = manifest_record(
                    entry, digest, target.stat().st_size
                )
            n_existing += 1
            continue

        target.parent.mkdir(parents=True, exist_ok=True)
        log(f"Downloading: {entry['title']}")

        try:
            # Pause before each request (DOWNLOAD_DELAY seconds).
            time.sleep(DOWNLOAD_DELAY)
            resp = requests.get(
                entry["url"],
                headers={"User-Agent": "BreakPilot-Compliance-Crawler/1.0"},
                timeout=60,
                allow_redirects=True,
                # NOTE(review): certificate verification is off for these
                # downloads — confirm this is intended.
                verify=False,
            )
            resp.raise_for_status()
            payload = resp.content

            # Work out why the response must be rejected, if at all.
            rejection = None
            if len(payload) < 1000:
                rejection = f"Response too small ({len(payload)} bytes): {entry['title']}"
            else:
                content_type = resp.headers.get("content-type", "").lower()
                if "html" in content_type and "pdf" not in content_type:
                    rejection = f"Got HTML instead of PDF (Cloudflare?): {entry['title']}"
                elif payload[:5] != b"%PDF-":
                    # Double-check: content starts with %PDF-
                    rejection = (
                        f"Downloaded content is not a PDF (header: {payload[:15]!r}): {entry['title']}"
                    )
            if rejection is not None:
                warn(rejection)
                n_failed += 1
                continue

            target.write_bytes(payload)
            digest = sha256_file(target)
            manifest["documents"][doc_id] = manifest_record(entry, digest, len(payload))
            ok(f"Downloaded: {entry['filename']} ({len(payload) // 1024} KB)")
            n_new += 1

        except Exception as e:
            fail(f"Download failed: {entry['title']} — {e}")
            n_failed += 1

    log(f"Download: {n_new} new, {n_existing} existing, {n_failed} failed")
    return manifest
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Ingest
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def ingest_new(manifest: dict) -> dict:
    """Ingest all manifest documents that are downloaded but not yet ingested.

    Strategy per document:
      1. Locate the file (``sources/`` tree first, legacy ``pdfs/`` second).
      2. If the file does not start with the PDF magic bytes (e.g. an HTML
         block page saved as ``.pdf``), delete it and drop its manifest record
         so a later download run fetches it again.
      3. Upload the PDF to the RAG API at ``RAG_URL``.
      4. If that does not succeed, extract the text locally with PyMuPDF and
         upload it as a ``.txt`` file instead.

    An upload counts as successful when the status code is below 300 and the
    JSON body contains ``chunks_count`` or ``vectors_indexed``.

    Args:
        manifest: Manifest dict; document records are updated in place
            (``ingested``, ``ingested_at``, ``ingest_method``).

    Returns:
        The same *manifest* object.
    """
    rmap = registry_by_id()
    ingested = 0
    skipped = 0
    failed = 0
    fallback_used = 0

    # Iterate over a snapshot: invalid files are removed from the manifest below.
    for doc_id, doc in list(manifest["documents"].items()):
        if doc.get("ingested"):
            skipped += 1
            continue

        # Find file: try new structure first, then old pdfs/ dir
        pdf_path = SOURCES_DIR / doc.get("subdir", "") / doc["filename"]
        if not pdf_path.exists():
            pdf_path = WORK_DIR / "pdfs" / doc["filename"]
        if not pdf_path.exists():
            warn(f"PDF not found: {doc['filename']}")
            failed += 1
            continue

        # Check for HTML files masquerading as PDFs
        if not is_valid_pdf(pdf_path):
            warn(f"Not a valid PDF (HTML/Cloudflare block): {doc['filename']} — deleting, will re-download")
            pdf_path.unlink()
            # Remove from manifest so --download will retry
            manifest["documents"].pop(doc_id, None)
            failed += 1
            continue

        entry = rmap.get(doc_id, {})
        title = entry.get("title", doc_id)
        category = entry.get("category", "general")
        year = str(entry.get("year", "2024"))
        collection = doc.get("collection", entry.get("collection", COLLECTION))

        # Determine source_id based on subdir
        subdir = doc.get("subdir", entry.get("subdir", ""))
        if subdir.startswith("de/"):
            source_id = subdir.split("/")[1]
            # Not every German document is a DSFA must-list: take the document
            # type from the registry category and keep the old value only as a
            # fallback for records without a registry entry.
            doc_type = entry.get("category") or "dsfa_mussliste"
            data_type = "compliance"
            use_case = "legal_reference"
        elif "edps" in subdir:
            source_id = "edps"
            doc_type = "guidance"
            data_type = "compliance_datenschutz"
            use_case = "guidance"
        else:
            source_id = "edpb"
            doc_type = "guidance"
            data_type = "compliance_datenschutz"
            use_case = "guidance"

        metadata = {
            "source_id": source_id,
            "doc_type": doc_type,
            "guideline_id": doc_id,
            "guideline_name": title,
            "category": category,
            "license": "reuse_notice" if subdir.startswith("eu/") else "public_law",
            "attribution": entry.get("source_org", ""),
            "source": entry.get("source_url", entry.get("url", "")),
            "download_url": entry.get("url", doc.get("source_url", "")),
        }

        log(f"Ingesting: {title} ({pdf_path.stat().st_size // 1024} KB) -> {collection}")

        # --- Attempt 1: direct PDF upload ---
        try:
            with open(pdf_path, "rb") as f:
                resp = requests.post(
                    RAG_URL,
                    files={"file": (doc["filename"], f, "application/pdf")},
                    data={
                        "collection": collection,
                        "data_type": data_type,
                        "use_case": use_case,
                        "year": year,
                        "chunk_strategy": "recursive",
                        "chunk_size": "512",
                        "chunk_overlap": "50",
                        "metadata_json": json.dumps(metadata),
                    },
                    timeout=TIMEOUT,
                    # NOTE(review): TLS certificate verification is disabled —
                    # confirm the RAG endpoint is only reached over a trusted network.
                    verify=False,
                )

            result = resp.json()
            chunks = result.get("chunks_count") or result.get("vectors_indexed", "?")

            if resp.status_code < 300 and ("chunks_count" in result or "vectors_indexed" in result):
                doc["ingested"] = True
                doc["ingested_at"] = ts()
                ok(f"Ingested: {title} -> {chunks} chunks")
                ingested += 1
                continue

        except Exception as e:
            # Any upload/parse error falls through to the local-extraction fallback.
            warn(f"PDF upload error: {title} — {e}")

        # --- Attempt 2: PyMuPDF local extraction + text upload ---
        if not HAS_PYMUPDF:
            fail(f"Ingest failed + no PyMuPDF fallback: {title}")
            failed += 1
            continue

        log(" Fallback: extracting text locally with PyMuPDF...")
        try:
            text = extract_text_pymupdf(pdf_path)
            if len(text.strip()) < 100:
                fail(f" PyMuPDF extracted too little text ({len(text)} chars): {title}")
                failed += 1
                continue

            resp = upload_text_as_file(text, doc["filename"], collection,
                                       data_type, use_case, year, metadata)
            result = resp.json()
            chunks = result.get("chunks_count") or result.get("vectors_indexed", "?")

            if resp.status_code < 300 and ("chunks_count" in result or "vectors_indexed" in result):
                doc["ingested"] = True
                doc["ingested_at"] = ts()
                doc["ingest_method"] = "pymupdf_fallback"
                ok(f" Fallback OK: {title} -> {chunks} chunks (PyMuPDF text extraction)")
                ingested += 1
                fallback_used += 1
            else:
                fail(f" Fallback ingest failed ({resp.status_code}): {title}")
                fail(f" Response: {resp.text[:300]}")
                failed += 1

        except Exception as e:
            fail(f" Fallback error: {title} — {e}")
            failed += 1

    log(f"Ingest: {ingested} new ({fallback_used} via PyMuPDF fallback), {skipped} already ingested, {failed} failed")
    return manifest
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Status
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def show_status(manifest: dict) -> None:
    """Print a status table per document section, followed by overall totals.

    Entries with a manifest record are shown as INGESTED (record has a truthy
    "ingested" value) or DOWNLOADED. Entries without a record are shown as
    ON DISK when the file returned by ``entry_path`` exists, else MISSING.
    DOWNLOADED and ON DISK both feed the "downloaded" total.

    Args:
        manifest: Manifest dict; its "documents" mapping is looked up by
            entry id.
    """
    groups = (
        ("=== Endorsed WP29 ===", WP29_ENDORSED),
        ("=== EDPB Guidelines & Recommendations ===", EDPB_GUIDELINES),
        ("=== EDPS Guidance ===", EDPS_GUIDANCE),
        ("=== DSFA Muss-Listen (DE) ===", DSFA_MUSSLISTEN),
        ("=== DSK Kurzpapiere (Nr. 1-20) ===", DSK_KURZPAPIERE),
        ("=== DSK Orientierungshilfen ===", DSK_ORIENTIERUNGSHILFEN),
        ("=== BfDI Praxis-Dokumente ===", BFDI_PRAXIS),
        ("=== BayLDA/BayLfD Dokumente ===", BAYLDA_DOCS),
    )

    tally = {"downloaded": 0, "ingested": 0, "missing": 0}
    table_head = f"{'ID':<40} {'Status':<12} {'File':<45} {'Size':>8}"
    rule = "-" * 110

    print()
    for title, registry_entries in groups:
        print(f"\n{title}")
        print(table_head)
        print(rule)

        for item in registry_entries:
            key = item["id"]
            record = manifest["documents"].get(key)

            if not record:
                # No manifest record: fall back to checking the file itself.
                target = entry_path(item)
                if not target.exists():
                    label = "MISSING"
                    tally["missing"] += 1
                    kib = 0
                else:
                    label = "ON DISK"
                    tally["downloaded"] += 1
                    kib = target.stat().st_size // 1024
            else:
                kib = record.get("file_size", 0) // 1024
                if not record.get("ingested"):
                    label = "DOWNLOADED"
                    tally["downloaded"] += 1
                else:
                    label = "INGESTED"
                    tally["ingested"] += 1

            # Sizes below 1 KB round down to 0 and are rendered as a dash.
            if kib > 0:
                size_cell = f"{kib:>6} KB"
            else:
                size_cell = " -"
            print(f"{key:<40} {label:<12} {item['filename']:<45} {size_cell}")

    print()
    print("=" * 110)
    print(f"Total: {len(REGISTRY)} in registry | {tally['downloaded']} downloaded | {tally['ingested']} ingested | {tally['missing']} missing")
    print(f"Manifest: {MANIFEST_PATH}")
    print(f"Sources: {SOURCES_DIR}")
    print()
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Verify
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def verify(manifest: dict) -> None:
    """Run a fixed set of smoke-test searches against the RAG search endpoint.

    Each query is POSTed to ``RAG_SEARCH_URL`` with its target collection and
    ``top_k=3``. A query counts as OK when the response is 2xx and carries at
    least one hit under "results" (or, if that key is absent, "documents").
    Non-2xx responses, empty result sets and request/parse errors are logged
    and not counted as OK. A summary line is logged at the end.

    Args:
        manifest: Not read by this function; kept so the call in ``main()``
            stays unchanged.
    """
    # (query text, collection to search, hint naming the document expected to match)
    queries = [
        ("DSFA erforderlich Risiko", COLLECTION, "WP248"),
        ("Datenschutzbeauftragter Pflichten", COLLECTION, "WP243"),
        ("Transparenz Informationspflicht", COLLECTION, "WP260"),
        ("Einwilligung Consent Cookie", COLLECTION, "Consent"),
        ("Data Protection by Design", COLLECTION, "DPbD"),
        ("Datenschutz-Folgenabschaetzung Muss-Liste", COLLECTION_DSFA, "DSFA Mussliste"),
        ("Videoüberwachung DSFA erforderlich", COLLECTION_DSFA, "Laender-DPA"),
        # DSK Kurzpapiere
        ("VVT Art. 30 Verarbeitungsverzeichnis", COLLECTION, "DSK KP 1"),
        ("Recht auf Loeschung Art. 17", COLLECTION, "DSK KP 11"),
        ("Auftragsverarbeitung Art. 28", COLLECTION, "DSK KP 13"),
        ("Besondere Kategorien Art. 9", COLLECTION, "DSK KP 17"),
        ("Risiko Rechte Freiheiten natuerlicher Personen", COLLECTION, "DSK KP 18"),
        # SDM / BfDI / BayLDA
        ("Standard-Datenschutzmodell SDM Schutzbedarf", COLLECTION, "SDM V3.1"),
        ("Loeschkonzept Aufbewahrungsfristen", COLLECTION, "BfDI Loeschkonzept"),
        ("TOM Art. 32 Verschluesselung Massnahmen", COLLECTION, "BayLDA TOM"),
    ]

    log("Verifying RAG collections")
    ok_count = 0

    for query, collection, hint in queries:
        try:
            # NOTE(review): TLS certificate verification is disabled here;
            # presumably the endpoint uses a self-signed certificate — confirm.
            resp = requests.post(
                RAG_SEARCH_URL,
                json={"query": query, "collection": collection, "top_k": 3},
                timeout=30,
                verify=False,
            )

            # A non-2xx reply is a service failure, not an empty result set.
            # Report it as such instead of the misleading "0 hits" warning.
            if resp.status_code >= 300:
                fail(f"[{collection}] '{query}' -> HTTP {resp.status_code}: {resp.text[:300]}")
                continue

            results = resp.json()
            hits = results.get("results", results.get("documents", []))
            if hits:
                top_score = hits[0].get("score", hits[0].get("relevance_score", "?"))
                ok(f"[{collection}] '{query}' -> {len(hits)} hits (score: {top_score})")
                ok_count += 1
            else:
                warn(f"[{collection}] '{query}' -> 0 hits (expected: {hint})")
        except Exception as e:
            # Best-effort check: one failing query must not abort the rest.
            fail(f"Search error [{collection}] '{query}': {e}")

    print()
    log(f"Verification: {ok_count}/{len(queries)} queries OK")
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Main
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def main():
    """Parse the command-line flags and run the selected crawler stages.

    Stages run in a fixed order: migrate, download, ingest, CSV export,
    status, verify. ``--all`` enables the first four. The manifest is saved
    after each of migrate, download and ingest. When no flag is given, the
    help text is printed and nothing else happens.
    """
    parser = argparse.ArgumentParser(description="EDPB/WP29/DSFA Auto-Crawler for BreakPilot Compliance")
    switches = (
        ("--all", "Migrate + Download + Ingest + CSV export"),
        ("--download", "Download missing PDFs"),
        ("--ingest", "Ingest downloaded PDFs"),
        ("--status", "Show status overview"),
        ("--verify", "Run RAG test searches"),
        ("--migrate", "Move PDFs from pdfs/ to sources/"),
        ("--csv", "Export CSV manifests"),
    )
    for switch, description in switches:
        parser.add_argument(switch, action="store_true", help=description)
    args = parser.parse_args()

    # Every option is a boolean switch, so "nothing selected" means all False.
    if not any(vars(args).values()):
        parser.print_help()
        return

    manifest = load_manifest()
    run_everything = args.all

    # Stages that rewrite the manifest; it is persisted after each one.
    manifest_stages = (
        (args.migrate, migrate_from_pdfs),
        (args.download, download_new),
        (args.ingest, ingest_new),
    )
    for requested, stage in manifest_stages:
        if requested or run_everything:
            manifest = stage(manifest)
            save_manifest(manifest)

    if args.csv or run_everything:
        export_csv_manifests()

    # Reporting stages are opt-in only; --all does not trigger them.
    if args.status:
        show_status(manifest)

    if args.verify:
        verify(manifest)
|
|
|
|
|
|
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|