Files
breakpilot-compliance/scripts/edpb-crawler.py
Benjamin Admin a181c977c3
All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-ai-compliance (push) Successful in 37s
CI / test-python-backend-compliance (push) Successful in 34s
CI / test-python-document-crawler (push) Successful in 23s
CI / test-python-dsms-gateway (push) Successful in 18s
fix(crawler): korrigierte URLs fuer DSK-OH, BayLDA-TOM (404-Fixes)
- SDM V3.1: media/oh/ → media/ah/
- E-Mail-Verschluesselung: korrekter Dateiname mit Datum
- OH Telemedien: korrekter Dateiname V1.1
- BayLDA TOM: media/checkliste/ Unterordner

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-05 12:12:25 +01:00

1722 lines
74 KiB
Python
Executable File

#!/usr/bin/env python3
"""
BreakPilot Compliance — EDPB/WP29/DSFA Auto-Crawler
Downloads, versioniert und ingestiert alle relevanten Datenschutz-Dokumente
in die Qdrant-Collections bp_compliance_datenschutz / bp_dsfa_corpus:
- EDPB Guidelines & Recommendations
- Endorsed WP29 Guidelines
- EDPS Guidance
- DSFA Muss-Listen (BfDI + 16 Bundeslaender)
- DSK Kurzpapiere (Nr. 1-20)
- DSK Orientierungshilfen (SDM V3.1, E-Mail-Verschluesselung, Telemedien)
- BfDI Praxis-Dokumente (Loeschkonzept)
- BayLDA/BayLfD Orientierungshilfen (TOM-Checkliste, Loeschung)
Ordnerstruktur:
~/rag-ingestion/sources/eu/edpb/guidelines/ EDPB eigene Guidelines
~/rag-ingestion/sources/eu/edpb/endorsed_wp29/ Endorsed WP29
~/rag-ingestion/sources/eu/edps/ EDPS Guidance
~/rag-ingestion/sources/de/bfdi/ BfDI DSFA-Liste
~/rag-ingestion/sources/de/bfdi/praxis/ BfDI Praxis-Dokumente
~/rag-ingestion/sources/de/dsk/ DSK gemeinsame Liste
~/rag-ingestion/sources/de/dsk/kurzpapiere/ DSK Kurzpapiere Nr. 1-20
~/rag-ingestion/sources/de/dsk/orientierungshilfen/ SDM, OH E-Mail, OH Telemedien
~/rag-ingestion/sources/de/dpas/{bw,by,...}/ Laender-DPA Listen
~/rag-ingestion/sources/de/baylda/ BayLDA Dokumente
~/rag-ingestion/sources/de/baylfb/ BayLfD Dokumente
~/rag-ingestion/manifests/ CSV-Manifeste
Usage:
python3 edpb-crawler.py --all # Download + Ingest alles
python3 edpb-crawler.py --download # Nur fehlende PDFs laden
python3 edpb-crawler.py --ingest # Geladene PDFs hochladen
python3 edpb-crawler.py --status # Uebersicht
python3 edpb-crawler.py --verify # RAG-Test-Suchen
python3 edpb-crawler.py --migrate # PDFs aus pdfs/ in sources/ verschieben
"""
import argparse
import csv
import hashlib
import json
import os
import shutil
import sys
import tempfile
import time
from datetime import datetime, timezone
from pathlib import Path

# Hard third-party requirement. urllib3 is imported inside the same guard:
# it is installed as a dependency of requests, so when requests is missing
# urllib3 usually is too, and an unguarded top-level `import urllib3` would
# abort with a raw traceback before the install hint below could be printed.
try:
    import requests
    import urllib3
except ImportError:
    # Diagnostics belong on stderr so they do not mix with regular output.
    print("ERROR: 'requests' not installed. Run: pip3 install requests", file=sys.stderr)
    sys.exit(1)

# PyMuPDF for local text extraction fallback (optional; absence only clears
# the HAS_PYMUPDF flag).
try:
    import fitz  # PyMuPDF
    HAS_PYMUPDF = True
except ImportError:
    HAS_PYMUPDF = False

# Silence urllib3's InsecureRequestWarning process-wide.
# NOTE(review): presumably needed because the HTTPS endpoints are called with
# certificate verification disabled — confirm against the request call sites.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Root of the local working tree. The WORK_DIR environment variable overrides
# the default of <home>/rag-ingestion.
WORK_DIR = Path(os.getenv("WORK_DIR", Path.home() / "rag-ingestion"))
SOURCES_DIR = WORK_DIR.joinpath("sources")
MANIFESTS_DIR = WORK_DIR.joinpath("manifests")
MANIFEST_PATH = WORK_DIR.joinpath("edpb-manifest.json")

# RAG service endpoints; each one can be overridden through the environment
# variable of the same name.
RAG_URL = os.getenv("RAG_URL", "https://localhost:8097/api/v1/documents/upload")
RAG_SEARCH_URL = os.getenv("RAG_SEARCH_URL", "https://localhost:8097/api/v1/search")

# Collection names: COLLECTION is the registry default, COLLECTION_DSFA is
# selected explicitly by entries that carry a "collection" key.
COLLECTION = "bp_compliance_datenschutz"
COLLECTION_DSFA = "bp_dsfa_corpus"

TIMEOUT = 300  # presumably seconds — confirm at the call sites
DOWNLOAD_DELAY = 3  # seconds between downloads to avoid 429
# ---------------------------------------------------------------------------
# Document Registry
#
# Every entry has: id, title, url, filename, subdir (relative to sources/),
# category, year, source_org, source_url, and optionally collection
# (default: COLLECTION) and license_grade.
# ---------------------------------------------------------------------------
# --- Endorsed WP29 ---
# "source_url" shared by every row below except the one that supplies its own.
_EDPB_ENDORSED_WP29_PAGE = "https://www.edpb.europa.eu/our-work-tools/general-guidance/endorsed-wp29-guidelines_en"

# Registry of WP29 guidelines endorsed by the EDPB.
# Each row is (id, title, EC newsroom doc_id, category, year, source_url);
# the download URL is built from the doc_id and the filename from the id.
WP29_ENDORSED = [
    {
        "id": doc_id,
        "title": title,
        "url": f"https://ec.europa.eu/newsroom/article29/document.cfm?action=display&doc_id={ec_doc_id}",
        "filename": f"edpb_{doc_id}.pdf",
        "subdir": "eu/edpb/endorsed_wp29",
        "category": category,
        "year": year,
        "source_org": "Article 29 Working Party / EDPB endorsed",
        "source_url": source_url,
    }
    for doc_id, title, ec_doc_id, category, year, source_url in (
        ("wp248_dpia", "WP248 rev.01 — Guidelines on DPIA",
         47711, "dpia", 2017, _EDPB_ENDORSED_WP29_PAGE),
        ("wp243_dpo", "WP243 rev.01 — Guidelines on DPO",
         44100, "dpo", 2016, _EDPB_ENDORSED_WP29_PAGE),
        ("wp260_transparency", "WP260 rev.01 — Guidelines on Transparency",
         51025, "transparency", 2018,
         "https://ec.europa.eu/newsroom/article29/item-detail.cfm?item_id=622227"),
        ("wp250_breach", "WP250 rev.01 — Guidelines on Data Breach Notification",
         47741, "breach", 2018, _EDPB_ENDORSED_WP29_PAGE),
        ("wp259_consent", "WP259 rev.01 — Guidelines on Consent under GDPR",
         51030, "consent", 2018, _EDPB_ENDORSED_WP29_PAGE),
        ("wp242_portability", "WP242 rev.01 — Guidelines on Right to Data Portability",
         44099, "portability", 2017, _EDPB_ENDORSED_WP29_PAGE),
        ("wp251_profiling", "WP251 rev.01 — Guidelines on Automated Decision-Making/Profiling",
         49826, "profiling", 2018, _EDPB_ENDORSED_WP29_PAGE),
    )
]
# --- EDPB's own Guidelines & Recommendations ---
# Registry of the EDPB's own guidelines, recommendations and reports.
# Each row is (id, title, category, year, url, source_url); the local
# filename is always "<id>.pdf" and every document is stored under
# eu/edpb/guidelines.
EDPB_GUIDELINES = [
    {
        "id": doc_id,
        "title": title,
        "url": url,
        "filename": f"{doc_id}.pdf",
        "subdir": "eu/edpb/guidelines",
        "category": category,
        "year": year,
        "source_org": "European Data Protection Board (EDPB)",
        "source_url": source_url,
    }
    for doc_id, title, category, year, url, source_url in (
        ("edpb_consent_05_2020", "EDPB Guidelines 05/2020 on Consent", "consent", 2020,
         "https://www.edpb.europa.eu/sites/default/files/files/file1/edpb_guidelines_202005_consent_en.pdf",
         "https://www.edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-052020-consent-under-regulation-2016679_en"),
        ("edpb_dpbd_04_2019", "EDPB Guidelines 4/2019 on Data Protection by Design and Default", "dpbd", 2019,
         "https://www.edpb.europa.eu/sites/default/files/files/file1/edpb_guidelines_201904_dataprotection_by_design_and_by_default_v2.0_en.pdf",
         "https://www.edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-42019-article-25-data-protection-design-and_en"),
        # NOTE(review): the URL path contains /consultation/ — this looks like
        # the public-consultation draft rather than the final text; confirm.
        ("edpb_transfers_01_2020", "EDPB Recommendations 01/2020 on Supplementary Transfer Measures", "transfers", 2020,
         "https://www.edpb.europa.eu/sites/default/files/consultation/edpb_recommendations_202001_supplementarymeasurestransferstools_en.pdf",
         "https://www.edpb.europa.eu/our-work-tools/our-documents/recommendations/recommendations-012020-measures-supplement-transfer_en"),
        # NOTE(review): /consultation/ path here as well — confirm this is the
        # intended (final) version.
        ("edpb_controller_processor_07_2020", "EDPB Guidelines 07/2020 on Controller and Processor", "controller_processor", 2020,
         "https://www.edpb.europa.eu/sites/default/files/consultation/edpb_guidelines_202007_controllerprocessor_en.pdf",
         "https://www.edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-072020-concepts-controller-and-processor-gdpr_en"),
        ("edpb_breach_09_2022", "EDPB Guidelines 09/2022 on Personal Data Breach Notification", "breach", 2022,
         "https://www.edpb.europa.eu/system/files/2023-04/edpb_guidelines_202209_personal_data_breach_notification_v2_en.pdf",
         "https://www.edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-092022-personal-data-breach-notification-under_en"),
        ("edpb_access_01_2022", "EDPB Guidelines 01/2022 on Right of Access", "access", 2022,
         "https://www.edpb.europa.eu/system/files/2023-04/edpb_guidelines_202201_data_subject_rights_access_v2_en.pdf",
         "https://www.edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-012022-data-subject-rights-right-access_en"),
        ("edpb_fines_04_2022", "EDPB Guidelines 04/2022 on Calculation of Administrative Fines", "fines", 2022,
         "https://www.edpb.europa.eu/system/files/2023-06/edpb_guidelines_042022_calculationofadministrativefines_en.pdf",
         "https://www.edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-042022-calculation-administrative-fines-under_en"),
        ("edpb_article48_02_2024", "EDPB Guidelines 02/2024 on Article 48 GDPR", "transfers", 2024,
         "https://www.edpb.europa.eu/system/files/2024-07/edpb_guidelines_202402_article48_en.pdf",
         "https://www.edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-022024-article-48-gdpr_en"),
        ("edpb_eprivacy_02_2023", "EDPB Guidelines 2/2023 on Technical Scope of Art. 5(3) ePrivacy", "eprivacy", 2023,
         "https://www.edpb.europa.eu/system/files/2023-11/edpb_guidelines_202302_technical_scope_art_53_eprivacydirective_en.pdf",
         "https://www.edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-22023-technical-scope-art-53-eprivacy-directive_en"),
        ("edpb_legitimate_interest_01_2024", "EDPB Guidelines 01/2024 on Legitimate Interest (Art. 6(1)(f))", "legitimate_interest", 2024,
         "https://www.edpb.europa.eu/system/files/2024-10/edpb_guidelines_202401_legitimate-interest_hyperlinks_en.pdf",
         "https://www.edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-12024-processing-personal-data-based-legitimate_en"),
        ("edpb_dark_patterns_03_2022", "EDPB Guidelines 03/2022 on Dark Patterns in Social Media", "dark_patterns", 2022,
         "https://www.edpb.europa.eu/system/files/2022-03/edpb_03-2022_guidelines_on_dark_patterns_in_social_media_platform_interfaces_en.pdf",
         "https://www.edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-032022-dark-patterns-social-media-platform_en"),
        ("edpb_social_media_08_2020", "EDPB Guidelines 08/2020 on Targeting Social Media Users", "social_media", 2020,
         "https://www.edpb.europa.eu/system/files/2021-04/edpb_guidelines_082020_on_the_targeting_of_social_media_users_en.pdf",
         "https://www.edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-082020-targeting-social-media-users_en"),
        ("edpb_video_03_2019", "EDPB Guidelines 3/2019 on Video Surveillance (CCTV)", "video", 2019,
         "https://www.edpb.europa.eu/sites/default/files/files/file1/edpb_guidelines_201903_video_devices_en_0.pdf",
         "https://www.edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-32019-processing-personal-data-through-video_en"),
        ("edpb_connected_vehicles_01_2020", "EDPB Guidelines 01/2020 on Connected Vehicles", "connected_vehicles", 2020,
         "https://www.edpb.europa.eu/sites/default/files/files/file1/edpb_guidelines_202001_connected_vehicles_v2.0_adopted_en.pdf",
         "https://www.edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-12020-processing-personal-data-context_en"),
        ("edpb_vva_02_2021", "EDPB Guidelines 02/2021 on Virtual Voice Assistants", "virtual_assistants", 2021,
         "https://www.edpb.europa.eu/system/files/2021-07/edpb_guidelines_202102_on_vva_v2.0_adopted_en.pdf",
         "https://www.edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-022021-virtual-voice-assistants_en"),
        ("edpb_cookie_taskforce_2023", "EDPB Cookie Banner Taskforce Report", "cookies", 2023,
         "https://www.edpb.europa.eu/system/files/2023-01/edpb_20230118_report_cookie_banner_taskforce_en.pdf",
         "https://www.edpb.europa.eu/our-work-tools/our-documents/other/report-work-undertaken-cookie-banner-taskforce_en"),
        # The id says 2018 while the recorded year is 2019; both are kept
        # exactly as registered.
        ("edpb_certification_01_2018", "EDPB Guidelines 1/2018 on GDPR Certification (v3.0)", "certification", 2019,
         "https://www.edpb.europa.eu/sites/default/files/files/file1/edpb_guidelines_201801_v3.0_certificationcriteria_annex2_en.pdf",
         "https://www.edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-12018-certification-and-identifying-certification_en"),
        ("edpb_bcr_01_2022", "EDPB Recommendations 01/2022 on BCR Application (v2)", "transfers", 2022,
         "https://www.edpb.europa.eu/system/files/2023-06/edpb_recommendations_20221_bcr-c_v2_en.pdf",
         "https://www.edpb.europa.eu/our-work-tools/our-documents/recommendations/recommendations-012022-application-approval-binding_en"),
        ("edpb_rtbf_05_2019", "EDPB Guidelines 5/2019 on Right to Erasure (Search Engines)", "erasure", 2019,
         "https://www.edpb.europa.eu/sites/default/files/files/file1/edpb_guidelines_201905_rtbfsearchengines_afterpublicconsultation_en.pdf",
         "https://www.edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-52019-criteria-right-be-forgotten-search-engines_en"),
        ("edpb_dpia_list_recommendation", "EDPB DPIA Lists Recommendation (Consistency Mechanism)", "dpia", 2019,
         "https://www.edpb.europa.eu/sites/default/files/files/file2/edpb-dpia_recommendation-list-en_0.pdf",
         "https://www.edpb.europa.eu/our-work-tools/consistency-findings/opinions_en"),
        ("edpb_health_data_03_2020", "EDPB Guidelines 03/2020 on Health Data for Research (COVID-19)", "health_data", 2020,
         "https://www.edpb.europa.eu/sites/default/files/files/file1/edpb_guidelines_202003_healthdatascientificresearchcovid19_en.pdf",
         "https://www.edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-032020-processing-data-concerning-health-purpose_en"),
        ("edpb_geolocation_04_2020", "EDPB Guidelines 04/2020 on Geolocation/Contact Tracing", "geolocation", 2020,
         "https://www.edpb.europa.eu/sites/default/files/files/file1/edpb_guidelines_20200420_contact_tracing_covid_with_annex_en.pdf",
         "https://www.edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-042020-use-location-data-and-contact-tracing_en"),
        ("edpb_legal_basis_02_2019", "EDPB Guidelines 2/2019 on Art. 6(1)(b) GDPR (Online Services)", "legal_basis", 2019,
         "https://www.edpb.europa.eu/sites/default/files/files/file1/edpb_guidelines-art_6-1-b-adopted_after_public_consultation_en.pdf",
         "https://www.edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-22019-processing-personal-data-under-article-61b_en"),
        # --- Additional EDPB guidelines (newly added) ---
        ("edpb_rro_09_2020", "EDPB Guidelines 09/2020 on Relevant and Reasoned Objection", "enforcement", 2020,
         "https://www.edpb.europa.eu/system/files/2021-03/edpb_guidelines_202009_rro_final_en.pdf",
         "https://www.edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-092020-relevant-and-reasoned-objection-under_en"),
        ("edpb_facial_recognition_05_2022", "EDPB Guidelines 05/2022 on Facial Recognition in Law Enforcement", "facial_recognition", 2022,
         "https://www.edpb.europa.eu/system/files/2023-05/edpb_guidelines_202304_frtlawenforcement_v2_en.pdf",
         "https://www.edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-052022-use-facial-recognition-technology-area-law_en"),
    )
]
# --- EDPS Guidance ---
# Registry of EDPS guidance documents.
# Each row is (id, title, category, year, url, source_url); the local
# filename is always "<id>.pdf" and every document is stored under eu/edps.
EDPS_GUIDANCE = [
    {
        "id": doc_id,
        "title": title,
        "url": url,
        "filename": f"{doc_id}.pdf",
        "subdir": "eu/edps",
        "category": category,
        "year": year,
        "source_org": "European Data Protection Supervisor (EDPS)",
        "source_url": source_url,
    }
    for doc_id, title, category, year, url, source_url in (
        ("edps_dpia_list", "EDPS DPIA List (EU Institutions)", "dpia", 2019,
         "https://www.edps.europa.eu/sites/default/files/publication/19-07-16_dpia_list_en.pdf",
         "https://www.edps.europa.eu/data-protection/our-work/publications/lists/dpia-list_en"),
        ("edps_genai_orientations_2024", "EDPS GenAI Orientations (June 2024)", "ai", 2024,
         "https://www.edps.europa.eu/system/files/2024-06/24-06-03_genai_orientations_en.pdf",
         "https://www.edps.europa.eu/press-publications/publications/guidelines/generative-ai-orientations_en"),
        ("edps_digital_ethics_2018", "EDPS Ethics Advisory Group Report (2018)", "ethics", 2018,
         "https://www.edps.europa.eu/sites/default/files/publication/18-01-25_eag_report_en.pdf",
         "https://www.edps.europa.eu/data-protection/our-work/publications/ethical-framework/ethics-advisory-group-report-2018_en"),
    )
]
# --- DSFA mandatory lists ("Muss-Listen"; federal level + 16 federal states) ---
# Registry of the German DSFA mandatory lists ("Muss-Listen").
# Each row is (id, title, subdir, source_org, url, source_url). Every entry
# shares category "dsfa_mussliste" and year 2018, is routed to the
# COLLECTION_DSFA collection, and is saved locally as "<id>.pdf".
DSFA_MUSSLISTEN = [
    {
        "id": doc_id,
        "title": title,
        "url": url,
        "filename": f"{doc_id}.pdf",
        "subdir": subdir,
        "category": "dsfa_mussliste",
        "year": 2018,
        "source_org": source_org,
        "source_url": source_url,
        "collection": COLLECTION_DSFA,
    }
    for doc_id, title, subdir, source_org, url, source_url in (
        ("dsfa_bfdi_bund", "BfDI — DSFA-Liste Art. 35(4) fuer oeffentliche Stellen des Bundes",
         "de/bfdi", "Bundesbeauftragte fuer den Datenschutz und die Informationsfreiheit (BfDI)",
         "https://www.bfdi.bund.de/SharedDocs/Downloads/DE/Muster/Liste_VerarbeitungsvorgaengeArt35.pdf?__blob=publicationFile&v=7",
         "https://www.bfdi.bund.de/DE/Datenschutz/DatenschutzGVO/Hilfsmittel/DSFA/DSFA-node.html"),
        ("dsfa_dsk_gemeinsam", "DSK — Gemeinsame DSFA Muss-Liste (nicht-oeffentlicher Bereich)",
         "de/dsk", "Datenschutzkonferenz (DSK)",
         "https://datenschutz.hessen.de/sites/datenschutz.hessen.de/files/2022-11/dsfa_muss_liste_dsk_de.pdf",
         "https://datenschutz.hessen.de/datenschutz/it-und-datenschutz/datenschutz-folgenabschaetzung"),
        ("dsfa_bw", "Baden-Wuerttemberg — DSFA Muss-Liste",
         "de/dpas/bw", "LfDI Baden-Wuerttemberg",
         "https://www.baden-wuerttemberg.datenschutz.de/wp-content/uploads/2018/05/Liste-von-Verarbeitungsvorg%C3%A4ngen-nach-Art.-35-Abs.-4-DS-GVO-LfDI-BW.pdf",
         "https://www.baden-wuerttemberg.datenschutz.de/datenschutz-folgenabschaetzung/"),
        ("dsfa_by", "Bayern (BayLDA) — DSFA Muss-Liste (DSK)",
         "de/dpas/by", "Bayerisches Landesamt fuer Datenschutzaufsicht (BayLDA)",
         "https://www.lda.bayern.de/media/dsfa_muss_liste_dsk_de.pdf",
         "https://www.lda.bayern.de/de/datenschutz_eu.html"),
        ("dsfa_be_noe", "Berlin — DSFA-Liste nicht-oeffentlicher Bereich",
         "de/dpas/be", "Berliner Beauftragte fuer Datenschutz und Informationsfreiheit (BlnBDI)",
         "https://www.datenschutz-berlin.de/fileadmin/user_upload/pdf/dokumente/2018-BlnBDI_DSFA-nicht-oeffentlich.pdf",
         "https://www.datenschutz-berlin.de/infothek/publikationen"),
        ("dsfa_be_oe", "Berlin — DSFA-Liste oeffentlicher Bereich",
         "de/dpas/be", "Berliner Beauftragte fuer Datenschutz und Informationsfreiheit (BlnBDI)",
         "https://www.datenschutz-berlin.de/fileadmin/user_upload/pdf/dokumente/2018-BlnBDI_DSFA-oeffentlich.pdf",
         "https://www.datenschutz-berlin.de/infothek/publikationen"),
        ("dsfa_bb_oe", "Brandenburg — DSFA-Liste oeffentlicher Bereich",
         "de/dpas/bb", "LDA Brandenburg",
         "https://www.lda.brandenburg.de/sixcms/media.php/9/DSFA-Liste_%C3%B6ffentlicher_Bereich.pdf",
         "https://www.lda.brandenburg.de/lda/de/datenschutz/datenschutz-folgenabschaetzung/"),
        ("dsfa_bb_noe", "Brandenburg — DSFA-Liste nicht-oeffentlicher Bereich",
         "de/dpas/bb", "LDA Brandenburg",
         "https://www.lda.brandenburg.de/sixcms/media.php/9/DSFA-Liste_nicht_%C3%B6ffentlicher_Bereich.pdf",
         "https://www.lda.brandenburg.de/lda/de/datenschutz/datenschutz-folgenabschaetzung/"),
        ("dsfa_hb", "Bremen — DSFA Muss-Liste",
         "de/dpas/hb", "LfDI Bremen",
         "https://www.datenschutz.bremen.de/sixcms/media.php/13/DSFA%20Muss-Liste%20LfDI%20HB.pdf",
         "https://www.datenschutz.bremen.de/datenschutz/ds-gvo/datenschutz-folgenabschaetzung-18544"),
        ("dsfa_hh_noe", "Hamburg — DSFA Muss-Liste nicht-oeffentlicher Bereich",
         "de/dpas/hh", "HmbBfDI Hamburg",
         "https://datenschutz-hamburg.de/fileadmin/user_upload/HmbBfDI/Datenschutz/Informationen/DSFA_Muss-Liste_fuer_den_nicht-oeffentlicher_Bereich_-_Stand_17.10.2018.pdf",
         "https://datenschutz-hamburg.de/datenschutz-informationen"),
        ("dsfa_hh_oe", "Hamburg — DSFA Muss-Liste oeffentlicher Bereich",
         "de/dpas/hh", "HmbBfDI Hamburg",
         "https://datenschutz-hamburg.de/fileadmin/user_upload/HmbBfDI/Datenschutz/Informationen/Liste_Art_35-4_DSGVO_HmbBfDI-oeffentlicher_Bereich_v2.0a.pdf",
         "https://datenschutz-hamburg.de/datenschutz-informationen"),
        ("dsfa_mv", "Mecklenburg-Vorpommern — DSFA Muss-Liste oeffentlicher Bereich",
         "de/dpas/mv", "LfDI Mecklenburg-Vorpommern",
         "https://www.datenschutz-mv.de/static/DS/Dateien/DS-GVO/HilfsmittelzurUmsetzung/MV-DSFA-Muss-Liste-Oeffentlicher-Bereich.pdf",
         "https://www.datenschutz-mv.de/datenschutz/fuer-verwaltungen/Datenschutz-Folgenabschaetzung/"),
        ("dsfa_ni", "Niedersachsen — DSFA Muss-Liste",
         "de/dpas/ni", "LfD Niedersachsen",
         "https://www.lfd.niedersachsen.de/download/131098/Liste_von_Verarbeitungsvorgaengen_nach_Art._35_Abs._4_DS-GVO.pdf",
         "https://www.lfd.niedersachsen.de/dsgvo/liste_von_verarbeitungsvorgangen_nach_art_35_abs_4_ds_gvo/muss-listen-zur-datenschutz-folgenabschatzung-179663.html"),
        ("dsfa_rp", "Rheinland-Pfalz — DSFA Muss-Liste oeffentliche Stellen",
         "de/dpas/rp", "LfDI Rheinland-Pfalz",
         "https://www.datenschutz.rlp.de/fileadmin/datenschutz/Dokumente/Orientierungshilfen/DSFA_-_Muss-Liste_RLP_OE.pdf",
         "https://www.datenschutz.rlp.de/themen/datenschutz-folgenabschaetzung"),
        ("dsfa_sl", "Saarland — DSFA Muss-Liste (DSK)",
         "de/dpas/sl", "UDS Saarland",
         "https://www.datenschutz.saarland.de/fileadmin/user_upload/uds/alle_Dateien_und_Ordner_bis_2025/Download/dsfa_muss_liste_dsk_de.pdf",
         "https://www.datenschutz.saarland.de/"),
        ("dsfa_sn", "Sachsen — DSFA-Ergaenzungsliste",
         "de/dpas/sn", "Saechsischer Datenschutzbeauftragter",
         "https://www.datenschutz.sachsen.de/download/Datenschutz-Folgenabschaetzung_Ergaenzung_Liste_V1_20180606.pdf",
         "https://www.datenschutz.sachsen.de/datenschutz-folgenabschaetzung-4156.html"),
        ("dsfa_st_oe", "Sachsen-Anhalt — DSFA-Liste oeffentlicher Bereich",
         "de/dpas/st", "LfD Sachsen-Anhalt",
         "https://datenschutz.sachsen-anhalt.de/fileadmin/Bibliothek/Landesaemter/LfD/Informationen/Internationales/Datenschutz-Grundverordnung/Liste_DSFA/Art-35-Liste-oeffentlicher_Bereich.pdf",
         "https://datenschutz.sachsen-anhalt.de/"),
        ("dsfa_st_noe", "Sachsen-Anhalt — DSFA-Liste nicht-oeffentlicher Bereich",
         "de/dpas/st", "LfD Sachsen-Anhalt",
         "https://datenschutz.sachsen-anhalt.de/fileadmin/Bibliothek/Landesaemter/LfD/Informationen/Internationales/Datenschutz-Grundverordnung/Liste_DSFA/Art-35-Liste-nichtoeffentlicher_Bereich.pdf",
         "https://datenschutz.sachsen-anhalt.de/"),
        ("dsfa_sh", "Schleswig-Holstein — DSFA Muss-Liste",
         "de/dpas/sh", "ULD Schleswig-Holstein",
         "https://www.datenschutzzentrum.de/uploads/datenschutzfolgenabschaetzung/20180525_LfD-SH_DSFA_Muss-Liste_V1.0.pdf",
         "https://www.datenschutzzentrum.de/datenschutzfolgenabschaetzung/"),
        ("dsfa_th", "Thueringen — Vorlaeufige DSFA Muss-Liste",
         "de/dpas/th", "TLfDI Thueringen",
         "https://tlfdi.de/fileadmin/tlfdi/datenschutz/dsfa_muss-liste_04_07_18.pdf",
         "https://tlfdi.de/"),
    )
]
# --- DSK Kurzpapiere (license: GREEN — public supervisory-authority documents, commercially usable) ---
DSK_KURZPAPIERE = [
{
"id": "dsk_kp_01_vvt",
"title": "DSK Kurzpapier Nr. 1 — Verzeichnis von Verarbeitungstaetigkeiten (Art. 30)",
"url": "https://www.datenschutzkonferenz-online.de/media/kp/dsk_kpnr_1.pdf",
"filename": "dsk_kpnr_1.pdf",
"subdir": "de/dsk/kurzpapiere",
"category": "dsk_kurzpapier",
"year": 2018,
"source_org": "Datenschutzkonferenz (DSK)",
"source_url": "https://www.datenschutzkonferenz-online.de/kurzpapiere.html",
"license_grade": "green",
},
{
"id": "dsk_kp_02_einwilligung",
"title": "DSK Kurzpapier Nr. 2 — Einwilligung (Art. 7)",
"url": "https://www.datenschutzkonferenz-online.de/media/kp/dsk_kpnr_2.pdf",
"filename": "dsk_kpnr_2.pdf",
"subdir": "de/dsk/kurzpapiere",
"category": "dsk_kurzpapier",
"year": 2018,
"source_org": "Datenschutzkonferenz (DSK)",
"source_url": "https://www.datenschutzkonferenz-online.de/kurzpapiere.html",
"license_grade": "green",
},
{
"id": "dsk_kp_03_zweckbindung",
"title": "DSK Kurzpapier Nr. 3 — Zweckbindung und Zweckaenderung",
"url": "https://www.datenschutzkonferenz-online.de/media/kp/dsk_kpnr_3.pdf",
"filename": "dsk_kpnr_3.pdf",
"subdir": "de/dsk/kurzpapiere",
"category": "dsk_kurzpapier",
"year": 2018,
"source_org": "Datenschutzkonferenz (DSK)",
"source_url": "https://www.datenschutzkonferenz-online.de/kurzpapiere.html",
"license_grade": "green",
},
{
"id": "dsk_kp_04_datenminimierung",
"title": "DSK Kurzpapier Nr. 4 — Datenminimierung",
"url": "https://www.datenschutzkonferenz-online.de/media/kp/dsk_kpnr_4.pdf",
"filename": "dsk_kpnr_4.pdf",
"subdir": "de/dsk/kurzpapiere",
"category": "dsk_kurzpapier",
"year": 2018,
"source_org": "Datenschutzkonferenz (DSK)",
"source_url": "https://www.datenschutzkonferenz-online.de/kurzpapiere.html",
"license_grade": "green",
},
{
"id": "dsk_kp_05_dsfa",
"title": "DSK Kurzpapier Nr. 5 — Datenschutz-Folgenabschaetzung (Art. 35)",
"url": "https://www.datenschutzkonferenz-online.de/media/kp/dsk_kpnr_5.pdf",
"filename": "dsk_kpnr_5.pdf",
"subdir": "de/dsk/kurzpapiere",
"category": "dsk_kurzpapier",
"year": 2018,
"source_org": "Datenschutzkonferenz (DSK)",
"source_url": "https://www.datenschutzkonferenz-online.de/kurzpapiere.html",
"license_grade": "green",
},
{
"id": "dsk_kp_06_auskunftsrecht",
"title": "DSK Kurzpapier Nr. 6 — Auskunftsrecht (Art. 15)",
"url": "https://www.datenschutzkonferenz-online.de/media/kp/dsk_kpnr_6.pdf",
"filename": "dsk_kpnr_6.pdf",
"subdir": "de/dsk/kurzpapiere",
"category": "dsk_kurzpapier",
"year": 2018,
"source_org": "Datenschutzkonferenz (DSK)",
"source_url": "https://www.datenschutzkonferenz-online.de/kurzpapiere.html",
"license_grade": "green",
},
{
"id": "dsk_kp_07_marktortprinzip",
"title": "DSK Kurzpapier Nr. 7 — Marktortprinzip (Art. 3)",
"url": "https://www.datenschutzkonferenz-online.de/media/kp/dsk_kpnr_7.pdf",
"filename": "dsk_kpnr_7.pdf",
"subdir": "de/dsk/kurzpapiere",
"category": "dsk_kurzpapier",
"year": 2018,
"source_org": "Datenschutzkonferenz (DSK)",
"source_url": "https://www.datenschutzkonferenz-online.de/kurzpapiere.html",
"license_grade": "green",
},
{
"id": "dsk_kp_08_datenportabilitaet",
"title": "DSK Kurzpapier Nr. 8 — Datenportabilitaet (Art. 20)",
"url": "https://www.datenschutzkonferenz-online.de/media/kp/dsk_kpnr_8.pdf",
"filename": "dsk_kpnr_8.pdf",
"subdir": "de/dsk/kurzpapiere",
"category": "dsk_kurzpapier",
"year": 2018,
"source_org": "Datenschutzkonferenz (DSK)",
"source_url": "https://www.datenschutzkonferenz-online.de/kurzpapiere.html",
"license_grade": "green",
},
{
"id": "dsk_kp_09_sanktionen",
"title": "DSK Kurzpapier Nr. 9 — Sanktionen, Geldbussen und Schadenersatz",
"url": "https://www.datenschutzkonferenz-online.de/media/kp/dsk_kpnr_9.pdf",
"filename": "dsk_kpnr_9.pdf",
"subdir": "de/dsk/kurzpapiere",
"category": "dsk_kurzpapier",
"year": 2018,
"source_org": "Datenschutzkonferenz (DSK)",
"source_url": "https://www.datenschutzkonferenz-online.de/kurzpapiere.html",
"license_grade": "green",
},
{
"id": "dsk_kp_10_informationspflichten",
"title": "DSK Kurzpapier Nr. 10 — Informationspflichten (Art. 12-14)",
"url": "https://www.datenschutzkonferenz-online.de/media/kp/dsk_kpnr_10.pdf",
"filename": "dsk_kpnr_10.pdf",
"subdir": "de/dsk/kurzpapiere",
"category": "dsk_kurzpapier",
"year": 2018,
"source_org": "Datenschutzkonferenz (DSK)",
"source_url": "https://www.datenschutzkonferenz-online.de/kurzpapiere.html",
"license_grade": "green",
},
{
"id": "dsk_kp_11_loeschung",
"title": "DSK Kurzpapier Nr. 11 — Recht auf Loeschung (Art. 17)",
"url": "https://www.datenschutzkonferenz-online.de/media/kp/dsk_kpnr_11.pdf",
"filename": "dsk_kpnr_11.pdf",
"subdir": "de/dsk/kurzpapiere",
"category": "dsk_kurzpapier",
"year": 2018,
"source_org": "Datenschutzkonferenz (DSK)",
"source_url": "https://www.datenschutzkonferenz-online.de/kurzpapiere.html",
"license_grade": "green",
},
{
"id": "dsk_kp_12_dsb",
"title": "DSK Kurzpapier Nr. 12 — Datenschutzbeauftragte (Art. 37-39)",
"url": "https://www.datenschutzkonferenz-online.de/media/kp/dsk_kpnr_12.pdf",
"filename": "dsk_kpnr_12.pdf",
"subdir": "de/dsk/kurzpapiere",
"category": "dsk_kurzpapier",
"year": 2018,
"source_org": "Datenschutzkonferenz (DSK)",
"source_url": "https://www.datenschutzkonferenz-online.de/kurzpapiere.html",
"license_grade": "green",
},
{
"id": "dsk_kp_13_avv",
"title": "DSK Kurzpapier Nr. 13 — Auftragsverarbeitung (Art. 28)",
"url": "https://www.datenschutzkonferenz-online.de/media/kp/dsk_kpnr_13.pdf",
"filename": "dsk_kpnr_13.pdf",
"subdir": "de/dsk/kurzpapiere",
"category": "dsk_kurzpapier",
"year": 2018,
"source_org": "Datenschutzkonferenz (DSK)",
"source_url": "https://www.datenschutzkonferenz-online.de/kurzpapiere.html",
"license_grade": "green",
},
{
"id": "dsk_kp_14_beschaeftigte",
"title": "DSK Kurzpapier Nr. 14 — Beschaeftigtendatenschutz",
"url": "https://www.datenschutzkonferenz-online.de/media/kp/dsk_kpnr_14.pdf",
"filename": "dsk_kpnr_14.pdf",
"subdir": "de/dsk/kurzpapiere",
"category": "dsk_kurzpapier",
"year": 2018,
"source_org": "Datenschutzkonferenz (DSK)",
"source_url": "https://www.datenschutzkonferenz-online.de/kurzpapiere.html",
"license_grade": "green",
},
{
"id": "dsk_kp_15_videoueberwachung",
"title": "DSK Kurzpapier Nr. 15 — Videoueberwachung nach DSGVO",
"url": "https://www.datenschutzkonferenz-online.de/media/kp/dsk_kpnr_15.pdf",
"filename": "dsk_kpnr_15.pdf",
"subdir": "de/dsk/kurzpapiere",
"category": "dsk_kurzpapier",
"year": 2018,
"source_org": "Datenschutzkonferenz (DSK)",
"source_url": "https://www.datenschutzkonferenz-online.de/kurzpapiere.html",
"license_grade": "green",
},
{
"id": "dsk_kp_16_gemeinsame_verantwortlichkeit",
"title": "DSK Kurzpapier Nr. 16 — Gemeinsame Verantwortlichkeit (Art. 26)",
"url": "https://www.datenschutzkonferenz-online.de/media/kp/dsk_kpnr_16.pdf",
"filename": "dsk_kpnr_16.pdf",
"subdir": "de/dsk/kurzpapiere",
"category": "dsk_kurzpapier",
"year": 2018,
"source_org": "Datenschutzkonferenz (DSK)",
"source_url": "https://www.datenschutzkonferenz-online.de/kurzpapiere.html",
"license_grade": "green",
},
{
"id": "dsk_kp_17_art9",
"title": "DSK Kurzpapier Nr. 17 — Besondere Kategorien personenbezogener Daten (Art. 9)",
"url": "https://www.datenschutzkonferenz-online.de/media/kp/dsk_kpnr_17.pdf",
"filename": "dsk_kpnr_17.pdf",
"subdir": "de/dsk/kurzpapiere",
"category": "dsk_kurzpapier",
"year": 2018,
"source_org": "Datenschutzkonferenz (DSK)",
"source_url": "https://www.datenschutzkonferenz-online.de/kurzpapiere.html",
"license_grade": "green",
},
{
"id": "dsk_kp_18_risiko",
"title": "DSK Kurzpapier Nr. 18 — Risiko fuer die Rechte und Freiheiten natuerlicher Personen",
"url": "https://www.datenschutzkonferenz-online.de/media/kp/dsk_kpnr_18.pdf",
"filename": "dsk_kpnr_18.pdf",
"subdir": "de/dsk/kurzpapiere",
"category": "dsk_kurzpapier",
"year": 2018,
"source_org": "Datenschutzkonferenz (DSK)",
"source_url": "https://www.datenschutzkonferenz-online.de/kurzpapiere.html",
"license_grade": "green",
},
{
"id": "dsk_kp_19_unabhaengigkeit",
"title": "DSK Kurzpapier Nr. 19 — Unabhaengigkeit der Datenschutzaufsicht",
"url": "https://www.datenschutzkonferenz-online.de/media/kp/dsk_kpnr_19.pdf",
"filename": "dsk_kpnr_19.pdf",
"subdir": "de/dsk/kurzpapiere",
"category": "dsk_kurzpapier",
"year": 2018,
"source_org": "Datenschutzkonferenz (DSK)",
"source_url": "https://www.datenschutzkonferenz-online.de/kurzpapiere.html",
"license_grade": "green",
},
{
"id": "dsk_kp_20_evaluierung",
"title": "DSK Kurzpapier Nr. 20 — Evaluierung der DSGVO",
"url": "https://www.datenschutzkonferenz-online.de/media/kp/dsk_kpnr_20.pdf",
"filename": "dsk_kpnr_20.pdf",
"subdir": "de/dsk/kurzpapiere",
"category": "dsk_kurzpapier",
"year": 2020,
"source_org": "Datenschutzkonferenz (DSK)",
"source_url": "https://www.datenschutzkonferenz-online.de/kurzpapiere.html",
"license_grade": "green",
},
]
# --- DSK guidance papers ("Orientierungshilfen"; license: YELLOW — publicly available, license not explicit) ---
# One row per document:
# (id, title, path below <site>/media/, local filename, year, landing page below <site>/).
# The local filename is independent of the remote basename, which may carry a date prefix.
_DSK_OH_SITE = "https://www.datenschutzkonferenz-online.de"
_DSK_OH_ROWS = (
    (
        "dsk_sdm_v31",
        "DSK — Standard-Datenschutzmodell (SDM) V3.1",
        "ah/SDM-Methode-V31.pdf",
        "SDM-Methode-V31.pdf",
        2024,
        "sdm.html",
    ),
    (
        "dsk_oh_email_verschl",
        "DSK Orientierungshilfe — E-Mail-Verschluesselung",
        "oh/20210616_orientierungshilfe_e_mail_verschluesselung.pdf",
        "oh_e_mail_verschluesselung.pdf",
        2021,
        "orientierungshilfen.html",
    ),
    (
        "dsk_oh_telemedien",
        "DSK Orientierungshilfe — Telemedien (Webseiten/Apps)",
        "oh/20221205_oh_Telemedien_2021_Version_1_1_Vorlage_104_DSK_final.pdf",
        "oh_telemedien.pdf",
        2022,
        "orientierungshilfen.html",
    ),
)
DSK_ORIENTIERUNGSHILFEN = [
    {
        "id": doc_id,
        "title": title,
        "url": f"{_DSK_OH_SITE}/media/{media_path}",
        "filename": local_name,
        "subdir": "de/dsk/orientierungshilfen",
        "category": "dsk_orientierungshilfe",
        "year": year,
        "source_org": "Datenschutzkonferenz (DSK)",
        "source_url": f"{_DSK_OH_SITE}/{landing_page}",
        "license_grade": "yellow",
    }
    for doc_id, title, media_path, local_name, year, landing_page in _DSK_OH_ROWS
]
# --- BfDI practice documents (license: YELLOW — publicly available, license not explicit) ---
# Single entry so far; kept as a list so it concatenates into REGISTRY like the other groups.
BFDI_PRAXIS = [
    dict(
        id="bfdi_loeschkonzept",
        title="BfDI — Loeschkonzept (2021)",
        url="https://www.bfdi.bund.de/SharedDocs/Downloads/DE/Broschueren/INFO5.pdf?__blob=publicationFile&v=9",
        filename="2021_Loeschkonzept-BfDI.pdf",
        subdir="de/bfdi/praxis",
        category="bfdi_praxis",
        year=2021,
        source_org="Bundesbeauftragte fuer den Datenschutz und die Informationsfreiheit (BfDI)",
        source_url="https://www.bfdi.bund.de/DE/Fachthemen/Inhalte/Loeschen/loeschen-node.html",
        license_grade="yellow",
    ),
]
# --- BayLDA / BayLfD documents (license: YELLOW — publicly available) ---
# Column order of the rows below; it is also the key order of each entry.
_BAYLDA_FIELDS = (
    "id", "title", "url", "filename", "subdir",
    "category", "year", "source_org", "source_url", "license_grade",
)
# NOTE(review): "baylfb" (id and subdir of the second row) looks like a typo for
# "baylfd"; it is kept because the id keys the manifest and the subdir is part of
# the on-disk path — confirm before renaming.
_BAYLDA_ROWS = (
    (
        "baylda_tom_checkliste",
        "BayLDA — Checkliste Technisch-Organisatorische Massnahmen (TOM)",
        "https://www.lda.bayern.de/media/checkliste/baylda_checkliste_tom.pdf",
        "baylda_checkliste_tom.pdf",
        "de/baylda",
        "baylda_praxis",
        2019,
        "Bayerisches Landesamt fuer Datenschutzaufsicht (BayLDA)",
        "https://www.lda.bayern.de/de/datenschutz_eu.html",
        "yellow",
    ),
    (
        "baylfb_oh_loeschung",
        "BayLfD — Orientierungshilfe Loeschung",
        "https://www.datenschutz-bayern.de/datenschutzreform2018/OH_Loeschung.pdf",
        "OH_Loeschung.pdf",
        "de/baylfb",
        "baylda_praxis",
        2019,
        "Bayerischer Landesbeauftragter fuer den Datenschutz (BayLfD)",
        "https://www.datenschutz-bayern.de/datenschutzreform2018/",
        "yellow",
    ),
)
BAYLDA_DOCS = [dict(zip(_BAYLDA_FIELDS, row)) for row in _BAYLDA_ROWS]
# Combined registry: all source groups flattened into one list, in the order
# in which the download step walks them.
REGISTRY = [
    *WP29_ENDORSED,
    *EDPB_GUIDELINES,
    *EDPS_GUIDANCE,
    *DSFA_MUSSLISTEN,
    *DSK_KURZPAPIERE,
    *DSK_ORIENTIERUNGSHILFEN,
    *BFDI_PRAXIS,
    *BAYLDA_DOCS,
]
# Maps filenames from the legacy flat pdfs/ directory to registry entry IDs
# (consulted by the migration step). Grouped by how the old name relates to the ID.

# WP29 papers: the old filename was the ID with an extra "edpb_" prefix.
_LEGACY_WP29_IDS = (
    "wp248_dpia",
    "wp243_dpo",
    "wp260_transparency",
    "wp250_breach",
    "wp259_consent",
    "wp242_portability",
    "wp251_profiling",
)
# Documents whose old filename is simply "<id>.pdf".
_LEGACY_SAME_NAME_IDS = (
    "edpb_consent_05_2020",
    "edpb_dpbd_04_2019",
    "edpb_breach_09_2022",
    "edpb_access_01_2022",
    "edpb_article48_02_2024",
    "edpb_eprivacy_02_2023",
    "edpb_dark_patterns_03_2022",
    "edpb_social_media_08_2020",
    "edpb_connected_vehicles_01_2020",
    "edpb_vva_02_2021",
    "edpb_cookie_taskforce_2023",
    "edpb_bcr_01_2022",
    "edpb_rtbf_05_2019",
    "edpb_dpia_list_recommendation",
    "edpb_health_data_03_2020",
    "edpb_geolocation_04_2020",
    "edps_dpia_list",
    "edps_genai_orientations_2024",
    "edps_digital_ethics_2018",
)
# Documents whose ID differs from the old filename stem.
_LEGACY_RENAMED = {
    "edpb_transfers_07_2020.pdf": "edpb_transfers_01_2020",
    "edpb_gl_7_2020.pdf": "edpb_controller_processor_07_2020",
    "edpb_gl_04_2022.pdf": "edpb_fines_04_2022",
    "edpb_legitimate_interest.pdf": "edpb_legitimate_interest_01_2024",
    "edpb_gl_3_2019.pdf": "edpb_video_03_2019",
    "edpb_certification_01_2019.pdf": "edpb_certification_01_2018",
    "edpb_gl_2_2019.pdf": "edpb_legal_basis_02_2019",
}
OLD_FILENAME_MAP = {
    **{f"edpb_{doc_id}.pdf": doc_id for doc_id in _LEGACY_WP29_IDS},
    **{f"{doc_id}.pdf": doc_id for doc_id in _LEGACY_SAME_NAME_IDS},
    **_LEGACY_RENAMED,
}
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def ts() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ``."""
    now_utc = datetime.now(tz=timezone.utc)
    return f"{now_utc:%Y-%m-%dT%H:%M:%SZ}"
def sha256_file(path: Path) -> str:
    """Return the hex SHA-256 digest of the file at *path*.

    The file is read in 8 KiB blocks so it never has to fit in memory at once.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        while True:
            block = stream.read(8192)
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()
def load_manifest() -> dict:
    """Load the manifest from ``MANIFEST_PATH`` or return a fresh, empty one.

    Returns:
        A dict that always has a ``"documents"`` mapping. A fresh manifest
        additionally carries ``"version"`` (``<local date>.1``) and
        ``"last_updated"`` (UTC timestamp).

    Raises:
        json.JSONDecodeError: if the manifest file exists but is not valid JSON.
    """
    if MANIFEST_PATH.exists():
        # Explicit UTF-8 so reading does not depend on the platform's locale encoding.
        with open(MANIFEST_PATH, encoding="utf-8") as f:
            manifest = json.load(f)
        # Every consumer indexes manifest["documents"]; make sure a hand-edited
        # or truncated manifest cannot trigger a KeyError later on.
        manifest.setdefault("documents", {})
        return manifest
    return {"version": datetime.now().strftime("%Y-%m-%d.1"), "last_updated": ts(), "documents": {}}
def save_manifest(manifest: dict) -> None:
    """Stamp *manifest* with a new version and write it to ``MANIFEST_PATH``.

    ``last_updated`` is set to the current UTC time. ``version`` has the form
    ``<local date>.<n>``: *n* is incremented when the previous version is from
    the same day, otherwise it restarts at 1. The dict is modified in place.

    The file is written to a sibling ``.tmp`` file first and then renamed over
    the target, so an interrupted run cannot leave a truncated manifest behind.
    """
    manifest["last_updated"] = ts()
    today = datetime.now().strftime("%Y-%m-%d")
    old_v = manifest.get("version", "")
    n = 1
    if old_v.startswith(today + "."):
        try:
            n = int(old_v.rsplit(".", 1)[-1]) + 1
        except ValueError:
            # Malformed counter (e.g. hand-edited file): restart at 1 instead of crashing.
            n = 1
    manifest["version"] = f"{today}.{n}"

    MANIFEST_PATH.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = MANIFEST_PATH.with_name(MANIFEST_PATH.name + ".tmp")
    # ensure_ascii=False may emit non-ASCII characters, so the encoding must be
    # explicit rather than the platform default.
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(manifest, f, indent=2, ensure_ascii=False)
    tmp_path.replace(MANIFEST_PATH)
def log(msg):
    """Print *msg* to stdout, prefixed with the local time as ``[HH:MM:SS]``."""
    stamp = datetime.now().strftime("%H:%M:%S")
    print(f"[{stamp}] {msg}")
def ok(msg):
    """Print a success line (check mark) for *msg* to stdout with a local-time prefix."""
    clock = f"{datetime.now():%H:%M:%S}"
    print(f"[{clock}] \u2713 {msg}")
def warn(msg):
    """Print a warning line (warning sign) for *msg* to stderr with a local-time prefix."""
    stamp = datetime.now().strftime("%H:%M:%S")
    line = f"[{stamp}] \u26a0 {msg}"
    print(line, file=sys.stderr)
def fail(msg):
    """Print a failure line (ballot X) for *msg* to stderr with a local-time prefix."""
    prefix = datetime.now().strftime("[%H:%M:%S]")
    print(f"{prefix} \u2717 {msg}", file=sys.stderr)
def is_valid_pdf(path: Path) -> bool:
    """Return True when the file at *path* begins with the PDF magic bytes.

    Used to detect HTML pages (e.g. a Cloudflare block page) that were saved
    under a ``.pdf`` name.
    """
    magic = b"%PDF-"
    with open(path, "rb") as stream:
        return stream.read(len(magic)) == magic
def extract_text_pymupdf(pdf_path: Path) -> str:
    """Extract the text of a PDF locally with PyMuPDF.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages that contain non-whitespace text, joined by a
        blank line; an empty string when PyMuPDF is not available.
    """
    if not HAS_PYMUPDF:
        return ""
    doc = fitz.open(str(pdf_path))
    try:
        page_texts = [page.get_text() for page in doc]
    finally:
        # Release the document handle even if a page fails to extract.
        doc.close()
    return "\n\n".join(text for text in page_texts if text.strip())
def upload_text_as_file(text: str, filename: str, collection: str,
                        data_type: str, use_case: str, year: str,
                        metadata: dict) -> "requests.Response":
    """Upload already-extracted text to the RAG API as a ``.txt`` file.

    Args:
        text: Extracted document text; sent UTF-8 encoded.
        filename: Name of the source document; its ``.pdf`` extension (any
            case) is swapped for ``.txt``, otherwise ``.txt`` is appended.
        collection: Target collection name.
        data_type: Value of the ``data_type`` form field.
        use_case: Value of the ``use_case`` form field.
        year: Value of the ``year`` form field.
        metadata: Extra metadata, sent JSON-encoded as ``metadata_json``.

    Returns:
        The HTTP response object; callers inspect ``status_code`` and ``json()``.
    """
    # Only the trailing extension is rewritten; a plain str.replace would also
    # touch ".pdf" in the middle of a name and miss ".PDF".
    stem, dot, ext = filename.rpartition(".")
    txt_filename = f"{stem}.txt" if dot and ext.lower() == "pdf" else f"{filename}.txt"
    # NOTE(review): verify=False disables TLS certificate validation for this
    # request — confirm this is intentional for the RAG endpoint.
    resp = requests.post(
        RAG_URL,
        files={"file": (txt_filename, text.encode("utf-8"), "text/plain")},
        data={
            "collection": collection,
            "data_type": data_type,
            "use_case": use_case,
            "year": year,
            "chunk_strategy": "recursive",
            "chunk_size": "512",
            "chunk_overlap": "50",
            "metadata_json": json.dumps(metadata),
        },
        timeout=TIMEOUT,
        verify=False,
    )
    return resp
def entry_path(entry: dict) -> Path:
    """Return where a registry entry's file lives: SOURCES_DIR/<subdir>/<filename>."""
    return SOURCES_DIR.joinpath(entry["subdir"], entry["filename"])
def entry_collection(entry: dict) -> str:
    """Return the entry's own ``collection`` if it has one, else the default COLLECTION."""
    if "collection" in entry:
        return entry["collection"]
    return COLLECTION
def registry_by_id() -> dict:
    """Return a fresh ``{entry id: entry}`` index over REGISTRY."""
    index = {}
    for item in REGISTRY:
        # Should two entries share an id, the later one replaces the earlier.
        index[item["id"]] = item
    return index
# ---------------------------------------------------------------------------
# CSV Manifest Export
# ---------------------------------------------------------------------------
def export_csv_manifests() -> None:
    """Write one CSV per source group plus a unified CSV into MANIFESTS_DIR.

    The per-group files contain the registry columns; the unified file lists
    every REGISTRY entry together with its license grade and target collection.
    """
    MANIFESTS_DIR.mkdir(parents=True, exist_ok=True)

    # --- one CSV per source group ---
    per_group = (
        ("eu_wp29_endorsed.csv", WP29_ENDORSED),
        ("eu_edpb_guidelines.csv", EDPB_GUIDELINES),
        ("eu_edps_guidance.csv", EDPS_GUIDANCE),
        ("de_dsfa_lists.csv", DSFA_MUSSLISTEN),
        ("de_dsk_kurzpapiere.csv", DSK_KURZPAPIERE),
        ("de_dsk_orientierungshilfen.csv", DSK_ORIENTIERUNGSHILFEN),
        ("de_bfdi_praxis.csv", BFDI_PRAXIS),
        ("de_baylda_docs.csv", BAYLDA_DOCS),
    )
    columns = ["id", "title", "url", "filename", "subdir", "category", "year", "source_org", "source_url"]
    for csv_name, docs in per_group:
        target = MANIFESTS_DIR / csv_name
        with open(target, "w", newline="", encoding="utf-8") as handle:
            # extrasaction="ignore": entries carry keys that are not CSV columns.
            out = csv.DictWriter(handle, fieldnames=columns, extrasaction="ignore")
            out.writeheader()
            out.writerows(docs)
        ok(f"CSV manifest: {target} ({len(docs)} entries)")

    # --- unified manifest with license grades ---
    unified_columns = [
        "id", "topic", "doc_type", "url", "license_grade",
        "source_org", "source_url", "collection", "notes",
    ]
    unified_rows = (
        {
            "id": doc["id"],
            "topic": doc.get("category", ""),
            "doc_type": doc.get("category", ""),
            "url": doc["url"],
            # Entries without an explicit grade are exported as "green".
            "license_grade": doc.get("license_grade", "green"),
            "source_org": doc.get("source_org", ""),
            "source_url": doc.get("source_url", ""),
            "collection": entry_collection(doc),
            "notes": doc["title"],
        }
        for doc in REGISTRY
    )
    unified_path = MANIFESTS_DIR / "eu_de_privacy_manifest.csv"
    with open(unified_path, "w", newline="", encoding="utf-8") as handle:
        out = csv.DictWriter(handle, fieldnames=unified_columns, extrasaction="ignore")
        out.writeheader()
        out.writerows(unified_rows)
    ok(f"Unified manifest: {unified_path} ({len(REGISTRY)} entries)")
# ---------------------------------------------------------------------------
# Migration: pdfs/ -> sources/
# ---------------------------------------------------------------------------
def migrate_from_pdfs(manifest: dict) -> dict:
    """Copy PDFs from the legacy ``WORK_DIR/pdfs`` directory into the sources/ tree.

    Each ``*.pdf`` in the legacy directory is mapped to a registry entry via
    OLD_FILENAME_MAP and copied (the original file is left in place) to the
    entry's path. Files without a mapping, with an unknown entry id, or whose
    target already exists are skipped. Each copied file gets a manifest record
    marked as not yet ingested.

    Args:
        manifest: Manifest dict; its ``"documents"`` mapping is updated in place.

    Returns:
        The same manifest object.
    """
    old_dir = WORK_DIR / "pdfs"
    if not old_dir.exists():
        log("No pdfs/ directory found — nothing to migrate")
        return manifest

    # Build the id index once; it does not change while migrating.
    rmap = registry_by_id()
    moved = 0
    for old_file in sorted(old_dir.glob("*.pdf")):
        fname = old_file.name
        entry_id = OLD_FILENAME_MAP.get(fname)
        if not entry_id:
            warn(f"No registry mapping for {fname} — skipping migration")
            continue
        entry = rmap.get(entry_id)
        if not entry:
            warn(f"Entry {entry_id} not in registry — skipping")
            continue
        new_path = entry_path(entry)
        if new_path.exists():
            # Already migrated
            continue
        new_path.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(str(old_file), str(new_path))
        sha = sha256_file(new_path)
        manifest["documents"][entry_id] = {
            "filename": entry["filename"],
            "subdir": entry["subdir"],
            "sha256": sha,
            "downloaded_at": ts(),
            "file_size": new_path.stat().st_size,
            "source_url": entry["url"],
            "source_org": entry["source_org"],
            "source_page": entry.get("source_url", ""),
            "ingested": False,
            "ingested_at": None,
            "collection": entry_collection(entry),
        }
        ok(f"Migrated: {fname} -> sources/{entry['subdir']}/{entry['filename']}")
        moved += 1
    log(f"Migration: {moved} files moved to sources/")
    return manifest
# ---------------------------------------------------------------------------
# Download
# ---------------------------------------------------------------------------
def _download_manifest_record(entry: dict, sha: str, size: int) -> dict:
    """Build the manifest record for a registry entry whose file is on disk."""
    return {
        "filename": entry["filename"],
        "subdir": entry["subdir"],
        "sha256": sha,
        "downloaded_at": ts(),
        "file_size": size,
        "source_url": entry["url"],
        "source_org": entry["source_org"],
        "source_page": entry.get("source_url", ""),
        "ingested": False,
        "ingested_at": None,
        "collection": entry_collection(entry),
    }


def download_new(manifest: dict) -> dict:
    """Download every registry PDF that is not yet on disk.

    A file that already exists and is larger than 100 bytes is treated as
    present; it is only registered in the manifest if it has no record yet.
    A download is rejected (counted as failed, nothing written) when the
    response is smaller than 1000 bytes, is served as HTML, or does not start
    with the PDF magic bytes. Any exception for one entry is logged and the
    loop continues with the next entry.

    Args:
        manifest: Manifest dict; its ``"documents"`` mapping is updated in place.

    Returns:
        The same manifest object.
    """
    downloaded = 0
    skipped = 0
    failed = 0
    for entry in REGISTRY:
        doc_id = entry["id"]
        target = entry_path(entry)

        # Already on disk?
        if target.exists() and target.stat().st_size > 100:
            if doc_id not in manifest["documents"]:
                manifest["documents"][doc_id] = _download_manifest_record(
                    entry, sha256_file(target), target.stat().st_size
                )
            skipped += 1
            continue

        target.parent.mkdir(parents=True, exist_ok=True)
        log(f"Downloading: {entry['title']}")
        try:
            # Pause before each request to go easy on the source servers.
            time.sleep(DOWNLOAD_DELAY)
            # NOTE(review): verify=False disables TLS certificate validation, so
            # a document could be altered in transit unnoticed — confirm this is
            # intentional for these public downloads.
            resp = requests.get(
                entry["url"], timeout=60, verify=False,
                headers={"User-Agent": "BreakPilot-Compliance-Crawler/1.0"},
                allow_redirects=True,
            )
            resp.raise_for_status()
            body = resp.content
            if len(body) < 1000:
                warn(f"Response too small ({len(body)} bytes): {entry['title']}")
                failed += 1
                continue
            ct = resp.headers.get("content-type", "").lower()
            if "html" in ct and "pdf" not in ct:
                warn(f"Got HTML instead of PDF (Cloudflare?): {entry['title']}")
                failed += 1
                continue
            # Double-check: content starts with %PDF-
            if body[:5] != b"%PDF-":
                warn(f"Downloaded content is not a PDF (header: {body[:15]!r}): {entry['title']}")
                failed += 1
                continue
            target.write_bytes(body)
            # Hash what actually landed on disk.
            manifest["documents"][doc_id] = _download_manifest_record(
                entry, sha256_file(target), len(body)
            )
            ok(f"Downloaded: {entry['filename']} ({len(body) // 1024} KB)")
            downloaded += 1
        except Exception as e:
            # Best effort per entry: one broken source must not abort the whole run.
            fail(f"Download failed: {entry['title']} — {e}")
            failed += 1
    log(f"Download: {downloaded} new, {skipped} existing, {failed} failed")
    return manifest
# ---------------------------------------------------------------------------
# Ingest
# ---------------------------------------------------------------------------
def _ingest_profile(subdir: str, entry: dict) -> tuple:
    """Return ``(source_id, doc_type, data_type, use_case)`` for one document."""
    if subdir.startswith("de/"):
        # source_id is the second path component (e.g. "dsk", "bfdi", "dpas").
        # doc_type follows the registry category — the same value the unified
        # CSV manifest exports as doc_type — so only real DSFA lists are tagged
        # "dsfa_mussliste". Documents without a known category keep the legacy
        # tag. Documents ingested earlier keep their old tag until re-ingested.
        return (
            subdir.split("/")[1],
            entry.get("category") or "dsfa_mussliste",
            "compliance",
            "legal_reference",
        )
    if "edps" in subdir:
        return ("edps", "guidance", "compliance_datenschutz", "guidance")
    return ("edpb", "guidance", "compliance_datenschutz", "guidance")


def ingest_new(manifest: dict) -> dict:
    """Ingest every manifest document that is downloaded but not yet ingested.

    Strategy per document:
      1. Locate the file (sources/ tree first, then the legacy pdfs/ directory).
      2. If the file does not start with the PDF magic bytes, delete it and
         drop its manifest record so a later download run fetches it again.
      3. Upload the PDF to the RAG API.
      4. If that does not succeed, extract the text locally with PyMuPDF and
         upload it as a ``.txt`` file.

    An upload counts as successful when the status code is below 300 and the
    JSON response contains ``chunks_count`` or ``vectors_indexed``.

    Args:
        manifest: Manifest dict; document records are updated in place.

    Returns:
        The same manifest object.
    """
    rmap = registry_by_id()
    ingested = 0
    skipped = 0
    failed = 0
    fallback_used = 0

    # Iterate over a snapshot: invalid files are removed from the manifest mid-loop.
    for doc_id, doc in list(manifest["documents"].items()):
        if doc.get("ingested"):
            skipped += 1
            continue

        # Find file: try new structure first, then old pdfs/ dir
        pdf_path = SOURCES_DIR / doc.get("subdir", "") / doc["filename"]
        if not pdf_path.exists():
            pdf_path = WORK_DIR / "pdfs" / doc["filename"]
        if not pdf_path.exists():
            warn(f"PDF not found: {doc['filename']}")
            failed += 1
            continue

        # Check for HTML files masquerading as PDFs
        if not is_valid_pdf(pdf_path):
            warn(f"Not a valid PDF (HTML/Cloudflare block): {doc['filename']} — deleting, will re-download")
            pdf_path.unlink()
            # Remove from manifest so --download will retry
            manifest["documents"].pop(doc_id, None)
            failed += 1
            continue

        # A manifest id may no longer be in the registry; fall back to defaults then.
        entry = rmap.get(doc_id, {})
        title = entry.get("title", doc_id)
        category = entry.get("category", "general")
        year = str(entry.get("year", "2024"))
        collection = doc.get("collection", entry.get("collection", COLLECTION))
        subdir = doc.get("subdir", entry.get("subdir", ""))
        source_id, doc_type, data_type, use_case = _ingest_profile(subdir, entry)

        metadata = {
            "source_id": source_id,
            "doc_type": doc_type,
            "guideline_id": doc_id,
            "guideline_name": title,
            "category": category,
            "license": "reuse_notice" if subdir.startswith("eu/") else "public_law",
            "attribution": entry.get("source_org", ""),
            "source": entry.get("source_url", entry.get("url", "")),
            "download_url": entry.get("url", doc.get("source_url", "")),
        }
        log(f"Ingesting: {title} ({pdf_path.stat().st_size // 1024} KB) -> {collection}")

        # --- Attempt 1: direct PDF upload ---
        try:
            with open(pdf_path, "rb") as f:
                resp = requests.post(
                    RAG_URL,
                    files={"file": (doc["filename"], f, "application/pdf")},
                    data={
                        "collection": collection,
                        "data_type": data_type,
                        "use_case": use_case,
                        "year": year,
                        "chunk_strategy": "recursive",
                        "chunk_size": "512",
                        "chunk_overlap": "50",
                        "metadata_json": json.dumps(metadata),
                    },
                    timeout=TIMEOUT,
                    verify=False,
                )
            result = resp.json()
            chunks = result.get("chunks_count") or result.get("vectors_indexed", "?")
            if resp.status_code < 300 and ("chunks_count" in result or "vectors_indexed" in result):
                doc["ingested"] = True
                doc["ingested_at"] = ts()
                ok(f"Ingested: {title} -> {chunks} chunks")
                ingested += 1
                continue
            # Make the reason for the fallback visible instead of falling through silently.
            warn(f"PDF upload rejected ({resp.status_code}): {title}")
        except Exception as e:
            warn(f"PDF upload error: {title} — {e}")

        # --- Attempt 2: PyMuPDF local extraction + text upload ---
        if not HAS_PYMUPDF:
            fail(f"Ingest failed + no PyMuPDF fallback: {title}")
            failed += 1
            continue
        log(" Fallback: extracting text locally with PyMuPDF...")
        try:
            text = extract_text_pymupdf(pdf_path)
            if len(text.strip()) < 100:
                fail(f" PyMuPDF extracted too little text ({len(text)} chars): {title}")
                failed += 1
                continue
            resp = upload_text_as_file(text, doc["filename"], collection,
                                       data_type, use_case, year, metadata)
            result = resp.json()
            chunks = result.get("chunks_count") or result.get("vectors_indexed", "?")
            if resp.status_code < 300 and ("chunks_count" in result or "vectors_indexed" in result):
                doc["ingested"] = True
                doc["ingested_at"] = ts()
                doc["ingest_method"] = "pymupdf_fallback"
                ok(f" Fallback OK: {title} -> {chunks} chunks (PyMuPDF text extraction)")
                ingested += 1
                fallback_used += 1
            else:
                fail(f" Fallback ingest failed ({resp.status_code}): {title}")
                fail(f" Response: {resp.text[:300]}")
                failed += 1
        except Exception as e:
            fail(f" Fallback error: {title} — {e}")
            failed += 1

    log(f"Ingest: {ingested} new ({fallback_used} via PyMuPDF fallback), {skipped} already ingested, {failed} failed")
    return manifest
# ---------------------------------------------------------------------------
# Status
# ---------------------------------------------------------------------------
def show_status(manifest: dict) -> None:
    """Print a status table per source group, followed by overall totals.

    Status per entry: INGESTED or DOWNLOADED when the manifest has a record,
    ON DISK when only the file exists, MISSING otherwise. ON DISK entries are
    counted together with DOWNLOADED in the totals.
    """
    sections = (
        ("=== Endorsed WP29 ===", WP29_ENDORSED),
        ("=== EDPB Guidelines & Recommendations ===", EDPB_GUIDELINES),
        ("=== EDPS Guidance ===", EDPS_GUIDANCE),
        ("=== DSFA Muss-Listen (DE) ===", DSFA_MUSSLISTEN),
        ("=== DSK Kurzpapiere (Nr. 1-20) ===", DSK_KURZPAPIERE),
        ("=== DSK Orientierungshilfen ===", DSK_ORIENTIERUNGSHILFEN),
        ("=== BfDI Praxis-Dokumente ===", BFDI_PRAXIS),
        ("=== BayLDA/BayLfD Dokumente ===", BAYLDA_DOCS),
    )
    tally = {"downloaded": 0, "ingested": 0, "missing": 0}

    print()
    for header, entries in sections:
        print(f"\n{header}")
        print(f"{'ID':<40} {'Status':<12} {'File':<45} {'Size':>8}")
        print("-" * 110)
        for entry in entries:
            doc_id = entry["id"]
            record = manifest["documents"].get(doc_id)
            if record:
                size_kb = record.get("file_size", 0) // 1024
                if record.get("ingested"):
                    status, bucket = "INGESTED", "ingested"
                else:
                    status, bucket = "DOWNLOADED", "downloaded"
            else:
                # No manifest record: fall back to looking at the file system.
                on_disk = entry_path(entry)
                if on_disk.exists():
                    status, bucket = "ON DISK", "downloaded"
                    size_kb = on_disk.stat().st_size // 1024
                else:
                    status, bucket = "MISSING", "missing"
                    size_kb = 0
            tally[bucket] += 1
            size_str = f"{size_kb:>6} KB" if size_kb > 0 else " -"
            print(f"{doc_id:<40} {status:<12} {entry['filename']:<45} {size_str}")

    downloaded = tally["downloaded"]
    ingested_count = tally["ingested"]
    missing = tally["missing"]
    print()
    print("=" * 110)
    print(f"Total: {len(REGISTRY)} in registry | {downloaded} downloaded | {ingested_count} ingested | {missing} missing")
    print(f"Manifest: {MANIFEST_PATH}")
    print(f"Sources: {SOURCES_DIR}")
    print()
# ---------------------------------------------------------------------------
# Verify
# ---------------------------------------------------------------------------
def verify(manifest: dict) -> None:
    """Run a fixed set of test searches against the RAG search endpoint.

    Each probe is ``(query, collection, hint)``; a probe passes when the search
    returns at least one hit. The hint only appears in the warning for an empty
    result. *manifest* is accepted for symmetry with the other commands but is
    not read.
    """
    probes = (
        ("DSFA erforderlich Risiko", COLLECTION, "WP248"),
        ("Datenschutzbeauftragter Pflichten", COLLECTION, "WP243"),
        ("Transparenz Informationspflicht", COLLECTION, "WP260"),
        ("Einwilligung Consent Cookie", COLLECTION, "Consent"),
        ("Data Protection by Design", COLLECTION, "DPbD"),
        ("Datenschutz-Folgenabschaetzung Muss-Liste", COLLECTION_DSFA, "DSFA Mussliste"),
        ("Videoüberwachung DSFA erforderlich", COLLECTION_DSFA, "Laender-DPA"),
        # DSK short papers
        ("VVT Art. 30 Verarbeitungsverzeichnis", COLLECTION, "DSK KP 1"),
        ("Recht auf Loeschung Art. 17", COLLECTION, "DSK KP 11"),
        ("Auftragsverarbeitung Art. 28", COLLECTION, "DSK KP 13"),
        ("Besondere Kategorien Art. 9", COLLECTION, "DSK KP 17"),
        ("Risiko Rechte Freiheiten natuerlicher Personen", COLLECTION, "DSK KP 18"),
        # SDM / BfDI / BayLDA
        ("Standard-Datenschutzmodell SDM Schutzbedarf", COLLECTION, "SDM V3.1"),
        ("Loeschkonzept Aufbewahrungsfristen", COLLECTION, "BfDI Loeschkonzept"),
        ("TOM Art. 32 Verschluesselung Massnahmen", COLLECTION, "BayLDA TOM"),
    )
    log("Verifying RAG collections")
    ok_count = 0
    for query, collection, hint in probes:
        try:
            resp = requests.post(
                RAG_SEARCH_URL,
                json={"query": query, "collection": collection, "top_k": 3},
                timeout=30, verify=False,
            )
            payload = resp.json()
            # The hit list is read from "results", falling back to "documents".
            hits = payload.get("results", payload.get("documents", []))
            if not hits:
                warn(f"[{collection}] '{query}' -> 0 hits (expected: {hint})")
                continue
            best = hits[0]
            top_score = best.get("score", best.get("relevance_score", "?"))
            ok(f"[{collection}] '{query}' -> {len(hits)} hits (score: {top_score})")
            ok_count += 1
        except Exception as e:
            fail(f"Search error: {e}")
    print()
    log(f"Verification: {ok_count}/{len(probes)} queries OK")
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main():
    """Parse the command line and run the requested crawler steps.

    ``--all`` runs migrate, download, ingest and CSV export; ``--status`` and
    ``--verify`` only run when requested explicitly. The manifest is saved
    after each of the three mutating steps. Without any flag, help is printed.
    """
    parser = argparse.ArgumentParser(description="EDPB/WP29/DSFA Auto-Crawler for BreakPilot Compliance")
    switches = (
        ("--all", "Migrate + Download + Ingest + CSV export"),
        ("--download", "Download missing PDFs"),
        ("--ingest", "Ingest downloaded PDFs"),
        ("--status", "Show status overview"),
        ("--verify", "Run RAG test searches"),
        ("--migrate", "Move PDFs from pdfs/ to sources/"),
        ("--csv", "Export CSV manifests"),
    )
    for flag, help_text in switches:
        parser.add_argument(flag, action="store_true", help=help_text)
    args = parser.parse_args()

    # Every option is a boolean switch, so "nothing requested" means all are False.
    if not any(vars(args).values()):
        parser.print_help()
        return

    manifest = load_manifest()
    run_all = args.all

    # Mutating steps, in order; the manifest is persisted after each one.
    pipeline = (
        (args.migrate, migrate_from_pdfs),
        (args.download, download_new),
        (args.ingest, ingest_new),
    )
    for requested, step in pipeline:
        if requested or run_all:
            manifest = step(manifest)
            save_manifest(manifest)

    if args.csv or run_all:
        export_csv_manifests()
    if args.status:
        show_status(manifest)
    if args.verify:
        verify(manifest)


if __name__ == "__main__":
    main()