Files
breakpilot-lehrer/klausur-service/backend/dsfa_corpus_ingestion.py
Benjamin Boenisch 5a31f52310 Initial commit: breakpilot-lehrer - Lehrer KI Platform
Services: Admin-Lehrer, Backend-Lehrer, Studio v2, Website,
Klausur-Service, School-Service, Voice-Service, Geo-Service,
BreakPilot Drive, Agent-Core

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-11 23:47:26 +01:00

1829 lines
73 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
DSFA Corpus Ingestion Pipeline.
Indexes DSFA guidance documents into Qdrant with full source attribution.
Collections:
- bp_dsfa_corpus: All DSFA-related documents (WP248, DSK, Muss-Listen)
Usage:
python dsfa_corpus_ingestion.py --init-sources # Register all sources
python dsfa_corpus_ingestion.py --ingest WP248 # Ingest specific source
python dsfa_corpus_ingestion.py --ingest-all # Ingest all sources
python dsfa_corpus_ingestion.py --status # Show ingestion status
"""
import os
import re
import hashlib
import uuid
import asyncio
import argparse
from typing import List, Dict, Optional, Any
from dataclasses import dataclass, field, asdict
from datetime import datetime
from enum import Enum
import asyncpg
from qdrant_client import QdrantClient
from qdrant_client.models import (
VectorParams, Distance, PointStruct, Filter, FieldCondition, MatchValue
)
# ---------------------------------------------------------------------------
# Runtime configuration
# ---------------------------------------------------------------------------
# Service endpoints are read from the process environment; the fallbacks are
# used whenever the corresponding variable is not set.
QDRANT_URL: str = os.environ.get("QDRANT_URL", "http://qdrant:6333")
DATABASE_URL: str = os.environ.get(
    "DATABASE_URL",
    "postgresql://breakpilot:breakpilot@postgres:5432/breakpilot_db",
)

# Fixed bucket name (not overridable through the environment).
MINIO_BUCKET: str = "dsfa-documents"

# Qdrant collection for the DSFA corpus and the dimensionality of its vectors.
DSFA_COLLECTION: str = "bp_dsfa_corpus"
VECTOR_SIZE: int = 1024  # BGE-M3
# =============================================================================
# License Registry
# =============================================================================
# License metadata keyed by license code.
#
# Every entry carries the same six fields, in this order:
#   name                  - human-readable license title
#   url                   - link to the license text, or None
#   attribution_required  - bool flag
#   modification_allowed  - bool flag
#   commercial_use        - bool flag
#   template              - attribution text, or None; some templates contain
#                           "{source_name}" / "{organization}" placeholders
# NOTE(review): how the template placeholders are filled is not visible in
# this part of the file - confirm against the code that consumes them.
LICENSE_REGISTRY: Dict[str, Dict[str, Any]] = {
    "DL-DE-BY-2.0": dict(
        name="Datenlizenz Deutschland Namensnennung Version 2.0",
        url="https://www.govdata.de/dl-de/by-2-0",
        attribution_required=True,
        modification_allowed=True,
        commercial_use=True,
        template="Quelle: {source_name}, Datenlizenz Deutschland Namensnennung Version 2.0",
    ),
    "DL-DE-ZERO-2.0": dict(
        name="Datenlizenz Deutschland Zero Version 2.0",
        url="https://www.govdata.de/dl-de/zero-2-0",
        attribution_required=False,
        modification_allowed=True,
        commercial_use=True,
        template=None,
    ),
    "CC-BY-4.0": dict(
        name="Creative Commons Attribution 4.0 International",
        url="https://creativecommons.org/licenses/by/4.0/",
        attribution_required=True,
        modification_allowed=True,
        commercial_use=True,
        template="© {organization} | CC BY 4.0",
    ),
    "EDPB-LICENSE": dict(
        name="EDPB Document License",
        url="https://edpb.europa.eu/about-edpb/legal-notice_en",
        attribution_required=True,
        modification_allowed=True,
        commercial_use=True,
        template="Source: {source_name}, European Data Protection Board",
    ),
    "PUBLIC_DOMAIN": dict(
        name="Public Domain",
        url=None,
        attribution_required=False,
        modification_allowed=True,
        commercial_use=True,
        template=None,
    ),
    "PROPRIETARY": dict(
        name="Proprietary (internal use only)",
        url=None,
        attribution_required=False,
        modification_allowed=False,
        commercial_use=True,
        template="© BreakPilot - Internal Use Only",
    ),
    "OGL-3.0": dict(
        name="Open Government Licence v3.0",
        url="https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/",
        attribution_required=True,
        modification_allowed=True,
        commercial_use=True,
        template="Contains public sector information licensed under the Open Government Licence v3.0. Source: {source_name}",
    ),
}
# =============================================================================
# DSFA Sources Registry
# =============================================================================
DSFA_SOURCES = [
# === Primärquellen (EU/DSGVO) ===
{
"source_code": "GDPR_ART35",
"name": "Art. 35 DSGVO - DSFA",
"full_name": "Datenschutz-Folgenabschätzung gemäß Artikel 35 DSGVO",
"organization": "Europäische Union",
"source_url": "https://eur-lex.europa.eu/eli/reg/2016/679/oj/deu",
"eur_lex_celex": "32016R0679",
"license_code": "CC-BY-4.0",
"attribution_text": "Quelle: DSGVO Art. 35 (EUR-Lex)",
"document_type": "regulation",
"language": "de"
},
{
"source_code": "GDPR_ART36",
"name": "Art. 36 DSGVO - Behördenkonsultation",
"full_name": "Vorherige Konsultation gemäß Artikel 36 DSGVO",
"organization": "Europäische Union",
"source_url": "https://eur-lex.europa.eu/eli/reg/2016/679/oj/deu",
"eur_lex_celex": "32016R0679",
"license_code": "CC-BY-4.0",
"attribution_text": "Quelle: DSGVO Art. 36 (EUR-Lex)",
"document_type": "regulation",
"language": "de"
},
{
"source_code": "GDPR_RECITALS",
"name": "Erwägungsgründe 75, 84, 89-91 DSGVO",
"full_name": "Erwägungsgründe zur Datenschutz-Folgenabschätzung",
"organization": "Europäische Union",
"source_url": "https://eur-lex.europa.eu/eli/reg/2016/679/oj/deu",
"eur_lex_celex": "32016R0679",
"license_code": "CC-BY-4.0",
"attribution_text": "Quelle: DSGVO Erwägungsgründe (EUR-Lex)",
"document_type": "regulation",
"language": "de"
},
# === WP29/EDPB Leitlinien ===
{
"source_code": "WP248",
"name": "WP248 rev.01 - Leitlinien zur DSFA",
"full_name": "Leitlinien zur Datenschutz-Folgenabschätzung und Beantwortung der Frage, ob eine Verarbeitung 'wahrscheinlich ein hohes Risiko' birgt",
"organization": "Artikel-29-Datenschutzgruppe / EDPB",
"source_url": "https://ec.europa.eu/newsroom/article29/items/611236/en",
"license_code": "EDPB-LICENSE",
"attribution_text": "Quelle: WP248 rev.01, Artikel-29-Datenschutzgruppe (2017), bestätigt durch EDPB",
"document_type": "guideline",
"language": "de"
},
# === DSK Dokumente ===
{
"source_code": "DSK_KP5",
"name": "Kurzpapier Nr. 5 - DSFA nach Art. 35 DS-GVO",
"full_name": "DSK Kurzpapier Nr. 5: Datenschutz-Folgenabschätzung nach Art. 35 DS-GVO",
"organization": "Datenschutzkonferenz (DSK)",
"source_url": "https://www.datenschutzkonferenz-online.de/media/kp/dsk_kpnr_5.pdf",
"license_code": "DL-DE-BY-2.0",
"license_url": "https://www.govdata.de/dl-de/by-2-0",
"attribution_text": "Quelle: DSK Kurzpapier Nr. 5 (Stand: 2018), Datenlizenz Deutschland Namensnennung Version 2.0",
"document_type": "guideline",
"language": "de"
},
# === Muss-Listen Bund ===
{
"source_code": "BFDI_MUSS_PUBLIC",
"name": "BfDI DSFA-Liste (öffentlicher Bereich)",
"full_name": "Liste der Verarbeitungsvorgänge nach Art. 35 Abs. 4 DSGVO - Öffentlicher Bereich",
"organization": "Bundesbeauftragter für den Datenschutz und die Informationsfreiheit",
"source_url": "https://www.bfdi.bund.de/SharedDocs/Downloads/DE/Muster/Liste_VerarbeitungsvorgaengeArt35.pdf",
"license_code": "DL-DE-ZERO-2.0",
"attribution_text": "Quelle: BfDI, Liste gem. Art. 35 Abs. 4 DSGVO (öffentlicher Bereich)",
"document_type": "checklist",
"language": "de"
},
{
"source_code": "BFDI_MUSS_PRIVATE",
"name": "BfDI DSFA-Liste (nicht-öffentlicher Bereich)",
"full_name": "Liste der Verarbeitungsvorgänge nach Art. 35 Abs. 4 DSGVO - Nicht-öffentlicher Bereich",
"organization": "Bundesbeauftragter für den Datenschutz und die Informationsfreiheit",
"source_url": "https://www.bfdi.bund.de/SharedDocs/Downloads/DE/Muster/Liste_VerarbeitungsvorgaengeArt35.pdf",
"license_code": "DL-DE-ZERO-2.0",
"attribution_text": "Quelle: BfDI, Liste gem. Art. 35 Abs. 4 DSGVO (nicht-öffentlicher Bereich)",
"document_type": "checklist",
"language": "de"
},
# === Muss-Listen Länder ===
# Baden-Württemberg
{
"source_code": "BW_MUSS_PUBLIC",
"name": "LfDI BW DSFA-Liste (öffentlich)",
"organization": "Landesbeauftragter für Datenschutz BW",
"source_url": "https://www.baden-wuerttemberg.datenschutz.de",
"license_code": "DL-DE-BY-2.0",
"attribution_text": "Quelle: LfDI Baden-Württemberg, DSFA-Muss-Liste (öffentlicher Bereich)",
"document_type": "checklist",
"language": "de"
},
{
"source_code": "BW_MUSS_PRIVATE",
"name": "LfDI BW DSFA-Liste (nicht-öffentlich)",
"organization": "Landesbeauftragter für Datenschutz BW",
"source_url": "https://www.baden-wuerttemberg.datenschutz.de",
"license_code": "DL-DE-BY-2.0",
"attribution_text": "Quelle: LfDI Baden-Württemberg, DSFA-Muss-Liste (nicht-öffentlicher Bereich)",
"document_type": "checklist",
"language": "de"
},
# Bayern
{
"source_code": "BY_MUSS_PUBLIC",
"name": "BayLDA DSFA-Liste (öffentlich)",
"organization": "Bayerisches Landesamt für Datenschutzaufsicht",
"source_url": "https://www.lda.bayern.de",
"license_code": "DL-DE-BY-2.0",
"attribution_text": "Quelle: BayLDA, DSFA-Muss-Liste (öffentlicher Bereich)",
"document_type": "checklist",
"language": "de"
},
{
"source_code": "BY_MUSS_PRIVATE",
"name": "BayLDA DSFA-Liste (nicht-öffentlich)",
"organization": "Bayerisches Landesamt für Datenschutzaufsicht",
"source_url": "https://www.lda.bayern.de",
"license_code": "DL-DE-BY-2.0",
"attribution_text": "Quelle: BayLDA, DSFA-Muss-Liste (nicht-öffentlicher Bereich)",
"document_type": "checklist",
"language": "de"
},
# Berlin
{
"source_code": "BE_MUSS_PUBLIC",
"name": "BlnBDI DSFA-Liste (öffentlich)",
"organization": "Berliner Beauftragte für Datenschutz",
"source_url": "https://www.datenschutz-berlin.de",
"license_code": "DL-DE-BY-2.0",
"attribution_text": "Quelle: BlnBDI, DSFA-Muss-Liste (öffentlicher Bereich)",
"document_type": "checklist",
"language": "de"
},
{
"source_code": "BE_MUSS_PRIVATE",
"name": "BlnBDI DSFA-Liste (nicht-öffentlich)",
"organization": "Berliner Beauftragte für Datenschutz",
"source_url": "https://www.datenschutz-berlin.de",
"license_code": "DL-DE-BY-2.0",
"attribution_text": "Quelle: BlnBDI, DSFA-Muss-Liste (nicht-öffentlicher Bereich)",
"document_type": "checklist",
"language": "de"
},
# Brandenburg
{
"source_code": "BB_MUSS_PUBLIC",
"name": "LDA BB DSFA-Liste (öffentlich)",
"organization": "Landesbeauftragte für Datenschutz Brandenburg",
"source_url": "https://www.lda.brandenburg.de",
"license_code": "DL-DE-BY-2.0",
"attribution_text": "Quelle: LDA Brandenburg, DSFA-Muss-Liste (öffentlicher Bereich)",
"document_type": "checklist",
"language": "de"
},
{
"source_code": "BB_MUSS_PRIVATE",
"name": "LDA BB DSFA-Liste (nicht-öffentlich)",
"organization": "Landesbeauftragte für Datenschutz Brandenburg",
"source_url": "https://www.lda.brandenburg.de",
"license_code": "DL-DE-BY-2.0",
"attribution_text": "Quelle: LDA Brandenburg, DSFA-Muss-Liste (nicht-öffentlicher Bereich)",
"document_type": "checklist",
"language": "de"
},
# Bremen
{
"source_code": "HB_MUSS_PUBLIC",
"name": "LfDI HB DSFA-Liste (öffentlich)",
"organization": "Landesbeauftragte für Datenschutz Bremen",
"source_url": "https://www.datenschutz.bremen.de",
"license_code": "DL-DE-BY-2.0",
"attribution_text": "Quelle: LfDI Bremen, DSFA-Muss-Liste (öffentlicher Bereich)",
"document_type": "checklist",
"language": "de"
},
{
"source_code": "HB_MUSS_PRIVATE",
"name": "LfDI HB DSFA-Liste (nicht-öffentlich)",
"organization": "Landesbeauftragte für Datenschutz Bremen",
"source_url": "https://www.datenschutz.bremen.de",
"license_code": "DL-DE-BY-2.0",
"attribution_text": "Quelle: LfDI Bremen, DSFA-Muss-Liste (nicht-öffentlicher Bereich)",
"document_type": "checklist",
"language": "de"
},
# Hamburg
{
"source_code": "HH_MUSS_PUBLIC",
"name": "HmbBfDI DSFA-Liste (öffentlich)",
"organization": "Hamburgische Beauftragte für Datenschutz",
"source_url": "https://datenschutz-hamburg.de",
"license_code": "DL-DE-BY-2.0",
"attribution_text": "Quelle: HmbBfDI, DSFA-Muss-Liste (öffentlicher Bereich)",
"document_type": "checklist",
"language": "de"
},
{
"source_code": "HH_MUSS_PRIVATE",
"name": "HmbBfDI DSFA-Liste (nicht-öffentlich)",
"organization": "Hamburgische Beauftragte für Datenschutz",
"source_url": "https://datenschutz-hamburg.de",
"license_code": "DL-DE-BY-2.0",
"attribution_text": "Quelle: HmbBfDI, DSFA-Muss-Liste (nicht-öffentlicher Bereich)",
"document_type": "checklist",
"language": "de"
},
# Hessen
{
"source_code": "HE_MUSS_PUBLIC",
"name": "HBDI DSFA-Liste (öffentlich)",
"organization": "Hessischer Beauftragter für Datenschutz",
"source_url": "https://datenschutz.hessen.de",
"license_code": "DL-DE-BY-2.0",
"attribution_text": "Quelle: HBDI, DSFA-Muss-Liste (öffentlicher Bereich)",
"document_type": "checklist",
"language": "de"
},
{
"source_code": "HE_MUSS_PRIVATE",
"name": "HBDI DSFA-Liste (nicht-öffentlich)",
"organization": "Hessischer Beauftragter für Datenschutz",
"source_url": "https://datenschutz.hessen.de",
"license_code": "DL-DE-BY-2.0",
"attribution_text": "Quelle: HBDI, DSFA-Muss-Liste (nicht-öffentlicher Bereich)",
"document_type": "checklist",
"language": "de"
},
# Mecklenburg-Vorpommern
{
"source_code": "MV_MUSS_PUBLIC",
"name": "LfDI MV DSFA-Liste (öffentlich)",
"organization": "Landesbeauftragter für Datenschutz MV",
"source_url": "https://www.datenschutz-mv.de",
"license_code": "DL-DE-BY-2.0",
"attribution_text": "Quelle: LfDI MV, DSFA-Muss-Liste (öffentlicher Bereich)",
"document_type": "checklist",
"language": "de"
},
{
"source_code": "MV_MUSS_PRIVATE",
"name": "LfDI MV DSFA-Liste (nicht-öffentlich)",
"organization": "Landesbeauftragter für Datenschutz MV",
"source_url": "https://www.datenschutz-mv.de",
"license_code": "DL-DE-BY-2.0",
"attribution_text": "Quelle: LfDI MV, DSFA-Muss-Liste (nicht-öffentlicher Bereich)",
"document_type": "checklist",
"language": "de"
},
# Niedersachsen
{
"source_code": "NI_MUSS_PUBLIC",
"name": "LfD NI DSFA-Liste (öffentlich)",
"organization": "Die Landesbeauftragte für den Datenschutz Niedersachsen",
"source_url": "https://www.lfd.niedersachsen.de",
"license_code": "DL-DE-BY-2.0",
"attribution_text": "Quelle: LfD Niedersachsen, DSFA-Muss-Liste (öffentlicher Bereich)",
"document_type": "checklist",
"language": "de"
},
{
"source_code": "NI_MUSS_PRIVATE",
"name": "LfD NI DSFA-Liste (nicht-öffentlich)",
"organization": "Die Landesbeauftragte für den Datenschutz Niedersachsen",
"source_url": "https://www.lfd.niedersachsen.de/download/131098",
"license_code": "DL-DE-BY-2.0",
"attribution_text": "Quelle: LfD Niedersachsen, DSFA-Muss-Liste (nicht-öffentlicher Bereich)",
"document_type": "checklist",
"language": "de"
},
# Nordrhein-Westfalen
{
"source_code": "NW_MUSS_PUBLIC",
"name": "LDI NRW DSFA-Liste (öffentlich)",
"organization": "Landesbeauftragte für Datenschutz NRW",
"source_url": "https://www.ldi.nrw.de",
"license_code": "DL-DE-BY-2.0",
"attribution_text": "Quelle: LDI NRW, DSFA-Muss-Liste (öffentlicher Bereich)",
"document_type": "checklist",
"language": "de"
},
{
"source_code": "NW_MUSS_PRIVATE",
"name": "LDI NRW DSFA-Liste (nicht-öffentlich)",
"organization": "Landesbeauftragte für Datenschutz NRW",
"source_url": "https://www.ldi.nrw.de",
"license_code": "DL-DE-BY-2.0",
"attribution_text": "Quelle: LDI NRW, DSFA-Muss-Liste (nicht-öffentlicher Bereich)",
"document_type": "checklist",
"language": "de"
},
# Rheinland-Pfalz
{
"source_code": "RP_MUSS_PUBLIC",
"name": "LfDI RP DSFA-Liste (öffentlich)",
"organization": "Landesbeauftragter für Datenschutz Rheinland-Pfalz",
"source_url": "https://www.datenschutz.rlp.de",
"license_code": "DL-DE-BY-2.0",
"attribution_text": "Quelle: LfDI Rheinland-Pfalz, DSFA-Muss-Liste (öffentlicher Bereich)",
"document_type": "checklist",
"language": "de"
},
{
"source_code": "RP_MUSS_PRIVATE",
"name": "LfDI RP DSFA-Liste (nicht-öffentlich)",
"organization": "Landesbeauftragter für Datenschutz Rheinland-Pfalz",
"source_url": "https://www.datenschutz.rlp.de",
"license_code": "DL-DE-BY-2.0",
"attribution_text": "Quelle: LfDI Rheinland-Pfalz, DSFA-Muss-Liste (nicht-öffentlicher Bereich)",
"document_type": "checklist",
"language": "de"
},
# Saarland
{
"source_code": "SL_MUSS_PUBLIC",
"name": "LfDI SL DSFA-Liste (öffentlich)",
"organization": "Landesbeauftragte für Datenschutz Saarland",
"source_url": "https://www.datenschutz.saarland.de",
"license_code": "DL-DE-BY-2.0",
"attribution_text": "Quelle: LfDI Saarland, DSFA-Muss-Liste (öffentlicher Bereich)",
"document_type": "checklist",
"language": "de"
},
{
"source_code": "SL_MUSS_PRIVATE",
"name": "LfDI SL DSFA-Liste (nicht-öffentlich)",
"organization": "Landesbeauftragte für Datenschutz Saarland",
"source_url": "https://www.datenschutz.saarland.de",
"license_code": "DL-DE-BY-2.0",
"attribution_text": "Quelle: LfDI Saarland, DSFA-Muss-Liste (nicht-öffentlicher Bereich)",
"document_type": "checklist",
"language": "de"
},
# Sachsen
{
"source_code": "SN_MUSS_PUBLIC",
"name": "SDB DSFA-Liste (öffentlich)",
"organization": "Sächsischer Datenschutzbeauftragter",
"source_url": "https://www.saechsdsb.de",
"license_code": "DL-DE-BY-2.0",
"attribution_text": "Quelle: SDB Sachsen, DSFA-Muss-Liste (öffentlicher Bereich)",
"document_type": "checklist",
"language": "de"
},
{
"source_code": "SN_MUSS_PRIVATE",
"name": "SDB DSFA-Liste (nicht-öffentlich)",
"organization": "Sächsischer Datenschutzbeauftragter",
"source_url": "https://www.saechsdsb.de",
"license_code": "DL-DE-BY-2.0",
"attribution_text": "Quelle: SDB Sachsen, DSFA-Muss-Liste (nicht-öffentlicher Bereich)",
"document_type": "checklist",
"language": "de"
},
# Sachsen-Anhalt
{
"source_code": "ST_MUSS_PUBLIC",
"name": "LfD ST DSFA-Liste (öffentlich)",
"organization": "Landesbeauftragter für Datenschutz Sachsen-Anhalt",
"source_url": "https://datenschutz.sachsen-anhalt.de",
"license_code": "DL-DE-BY-2.0",
"attribution_text": "Quelle: LfD Sachsen-Anhalt, DSFA-Muss-Liste (öffentlicher Bereich)",
"document_type": "checklist",
"language": "de"
},
{
"source_code": "ST_MUSS_PRIVATE",
"name": "LfD ST DSFA-Liste (nicht-öffentlich)",
"organization": "Landesbeauftragter für Datenschutz Sachsen-Anhalt",
"source_url": "https://datenschutz.sachsen-anhalt.de",
"license_code": "DL-DE-BY-2.0",
"attribution_text": "Quelle: LfD Sachsen-Anhalt, DSFA-Muss-Liste (nicht-öffentlicher Bereich)",
"document_type": "checklist",
"language": "de"
},
# Schleswig-Holstein
{
"source_code": "SH_MUSS_PUBLIC",
"name": "ULD DSFA-Liste (öffentlich)",
"organization": "Unabhängiges Landeszentrum für Datenschutz SH",
"source_url": "https://www.datenschutzzentrum.de",
"license_code": "DL-DE-BY-2.0",
"attribution_text": "Quelle: ULD Schleswig-Holstein, DSFA-Muss-Liste (öffentlicher Bereich)",
"document_type": "checklist",
"language": "de"
},
{
"source_code": "SH_MUSS_PRIVATE",
"name": "ULD DSFA-Liste (nicht-öffentlich)",
"organization": "Unabhängiges Landeszentrum für Datenschutz SH",
"source_url": "https://www.datenschutzzentrum.de",
"license_code": "DL-DE-BY-2.0",
"attribution_text": "Quelle: ULD Schleswig-Holstein, DSFA-Muss-Liste (nicht-öffentlicher Bereich)",
"document_type": "checklist",
"language": "de"
},
# Thüringen
{
"source_code": "TH_MUSS_PUBLIC",
"name": "TLfDI DSFA-Liste (öffentlich)",
"organization": "Thüringer Landesbeauftragter für Datenschutz",
"source_url": "https://www.tlfdi.de",
"license_code": "DL-DE-BY-2.0",
"attribution_text": "Quelle: TLfDI Thüringen, DSFA-Muss-Liste (öffentlicher Bereich)",
"document_type": "checklist",
"language": "de"
},
{
"source_code": "TH_MUSS_PRIVATE",
"name": "TLfDI DSFA-Liste (nicht-öffentlich)",
"organization": "Thüringer Landesbeauftragter für Datenschutz",
"source_url": "https://www.tlfdi.de",
"license_code": "DL-DE-BY-2.0",
"attribution_text": "Quelle: TLfDI Thüringen, DSFA-Muss-Liste (nicht-öffentlicher Bereich)",
"document_type": "checklist",
"language": "de"
},
# === Sonstige ===
{
"source_code": "AI_ACT_DSFA",
"name": "AI Act Bezüge zu DSFA",
"full_name": "AI Act Artikel mit Bezug zur Datenschutz-Folgenabschätzung",
"organization": "Europäische Union",
"source_url": "https://eur-lex.europa.eu/eli/reg/2024/1689/oj",
"license_code": "CC-BY-4.0",
"attribution_text": "Quelle: AI Act (EU) 2024/1689, EUR-Lex",
"document_type": "regulation",
"language": "de"
},
{
"source_code": "DSK_OH_KI",
"name": "DSK Orientierungshilfe KI",
"full_name": "DSK Orientierungshilfe KI und Datenschutz",
"organization": "Datenschutzkonferenz (DSK)",
"source_url": "https://www.datenschutzkonferenz-online.de",
"license_code": "DL-DE-BY-2.0",
"attribution_text": "Quelle: DSK Orientierungshilfe KI und Datenschutz",
"document_type": "guideline",
"language": "de"
},
{
"source_code": "EDSA_GUIDELINES",
"name": "EDPB Guidelines on DPIA",
"full_name": "European Data Protection Board Guidelines on DPIA",
"organization": "European Data Protection Board",
"source_url": "https://edpb.europa.eu",
"license_code": "EDPB-LICENSE",
"attribution_text": "Source: EDPB Guidelines on Data Protection Impact Assessment",
"document_type": "guideline",
"language": "en"
},
# === DSK Weitere Kurzpapiere ===
{
"source_code": "DSK_KP18",
"name": "Kurzpapier Nr. 18 - Risiko für die Rechte und Freiheiten",
"full_name": "DSK Kurzpapier Nr. 18: Risiko für die Rechte und Freiheiten natürlicher Personen",
"organization": "Datenschutzkonferenz (DSK)",
"source_url": "https://www.datenschutzkonferenz-online.de/media/kp/dsk_kpnr_18.pdf",
"license_code": "DL-DE-BY-2.0",
"license_url": "https://www.govdata.de/dl-de/by-2-0",
"attribution_text": "Quelle: DSK Kurzpapier Nr. 18 (Risiko), Datenlizenz Deutschland Namensnennung Version 2.0",
"document_type": "guideline",
"language": "de"
},
# === Standard-Datenschutzmodell ===
{
"source_code": "SDM_V2",
"name": "Standard-Datenschutzmodell V2.0",
"full_name": "SDM-Methode der Datenschutzaufsichtsbehörden V2.0",
"organization": "Datenschutzkonferenz (DSK)",
"source_url": "https://www.datenschutzkonferenz-online.de/media/ah/20191106_SDM-Methode_V2.0.pdf",
"license_code": "DL-DE-BY-2.0",
"license_url": "https://www.govdata.de/dl-de/by-2-0",
"attribution_text": "Quelle: SDM V2.0, Datenschutzkonferenz (DSK), Datenlizenz Deutschland Namensnennung Version 2.0",
"document_type": "methodology",
"language": "de"
},
# === Internes Dokument ===
{
"source_code": "BREAKPILOT_DSFA_GUIDE",
"name": "Datenschutz-Folgenabschätzung in Deutschland",
"full_name": "BreakPilot DSFA-Leitfaden (intern)",
"organization": "BreakPilot",
"source_url": None,
"license_code": "PROPRIETARY",
"attribution_text": "Quelle: BreakPilot DSFA-Leitfaden (intern)",
"document_type": "guideline",
"language": "de"
},
{
"source_code": "BREAKPILOT_BASELINE",
"name": "Baseline-DSFA Katalog",
"full_name": "BreakPilot Baseline-DSFA Katalog (proprietär)",
"organization": "BreakPilot",
"source_url": None,
"license_code": "PROPRIETARY",
"attribution_text": "Quelle: BreakPilot Baseline-DSFA Katalog (intern)",
"document_type": "catalog",
"language": "de"
},
{
"source_code": "BREAKPILOT_DSFA_DE",
"name": "DSFA in Deutschland Dokument",
"full_name": "BreakPilot DSFA in Deutschland (proprietär)",
"organization": "BreakPilot",
"source_url": None,
"license_code": "PROPRIETARY",
"attribution_text": "Quelle: BreakPilot DSFA in Deutschland (intern)",
"document_type": "guideline",
"language": "de"
},
# === VVT-Quellen (Verarbeitungsverzeichnis Art. 30 DSGVO) ===
{
"source_code": "DSK_KP1",
"name": "Kurzpapier Nr. 1 - Verarbeitungsverzeichnis",
"full_name": "DSK Kurzpapier Nr. 1: Verzeichnis von Verarbeitungstaetigkeiten nach Art. 30 DS-GVO",
"organization": "Datenschutzkonferenz (DSK)",
"source_url": "https://www.datenschutzkonferenz-online.de/media/kp/dsk_kpnr_1.pdf",
"license_code": "DL-DE-BY-2.0",
"license_url": "https://www.govdata.de/dl-de/by-2-0",
"attribution_text": "Quelle: DSK Kurzpapier Nr. 1 (Stand: 2018), Datenlizenz Deutschland Namensnennung Version 2.0",
"document_type": "guideline",
"language": "de"
},
{
"source_code": "ICO_ROPA",
"name": "ICO Records of Processing Activities",
"full_name": "ICO Guidance on Documentation and Records of Processing Activities (RoPA)",
"organization": "Information Commissioner's Office (ICO)",
"source_url": "https://ico.org.uk/for-organisations/uk-gdpr-guidance-and-resources/accountability-and-governance/documentation-record-of-processing-activities/",
"license_code": "OGL-3.0",
"license_url": "https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/",
"attribution_text": "Contains public sector information licensed under the Open Government Licence v3.0. Source: ICO RoPA Guidance",
"document_type": "guideline",
"language": "en"
},
{
"source_code": "BREAKPILOT_VVT_SPEC",
"name": "VVT Generator Spezifikation",
"full_name": "BreakPilot VVT Generator Spezifikation (proprietaer)",
"organization": "BreakPilot",
"source_url": None,
"license_code": "PROPRIETARY",
"attribution_text": "Quelle: BreakPilot VVT Generator Spezifikation (intern)",
"document_type": "specification",
"language": "de"
},
# === SDM Bausteine V3.0 (TOM Gewaehrleistungsziele) ===
{
"source_code": "SDM_BAUSTEINE",
"name": "SDM Bausteine V3.0",
"full_name": "Standard-Datenschutzmodell Bausteine Version 3.0",
"organization": "Konferenz der unabhaengigen Datenschutzaufsichtsbehoerden",
"source_url": "https://www.datenschutz-mv.de/datenschutz/sdm/",
"license_code": "DL-DE-BY-2.0",
"attribution_text": "Quelle: SDM Bausteine V3.0, Konferenz der unabhaengigen Datenschutzaufsichtsbehoerden des Bundes und der Laender, Lizenz: dl-de/by-2-0",
"document_type": "standard",
"language": "de"
},
# === DSK Kurzpapier Nr. 7 (Loeschung) ===
{
"source_code": "DSK_KP7",
"name": "DSK Kurzpapier Nr. 7 - Loeschung",
"full_name": "Kurzpapier Nr. 7: Marktortprinzip und Loeschung personenbezogener Daten",
"organization": "Datenschutzkonferenz (DSK)",
"source_url": "https://www.datenschutzkonferenz-online.de/kurzpapiere.html",
"license_code": "DL-DE-BY-2.0",
"attribution_text": "Quelle: DSK Kurzpapier Nr. 7, Datenschutzkonferenz, Lizenz: dl-de/by-2-0",
"document_type": "guidance",
"language": "de"
},
# === BreakPilot Loeschfristen + TOM Spec (intern) ===
{
"source_code": "BREAKPILOT_LF_TOM_SPEC",
"name": "Loeschfristen & TOM Generator Spezifikation",
"full_name": "BreakPilot Loeschfristen und TOM Generator Spezifikation (proprietaer)",
"organization": "BreakPilot",
"source_url": None,
"license_code": "PROPRIETARY",
"attribution_text": "Quelle: BreakPilot Loeschfristen & TOM Generator Spezifikation (intern)",
"document_type": "specification",
"language": "de"
},
# === Compliance Advisor Agent - Zusaetzliche Quellen ===
{
"source_code": "DSGVO_VOLLTEXT",
"name": "DSGVO Volltext",
"full_name": "Verordnung (EU) 2016/679 - Datenschutz-Grundverordnung (Volltext mit Erwaegsgruenden)",
"organization": "Europaeische Union",
"source_url": "https://eur-lex.europa.eu/eli/reg/2016/679/oj/deu",
"license_code": "CC-BY-4.0",
"attribution_text": "Quelle: DSGVO Volltext, Europaeische Union, CC BY 4.0",
"document_type": "legislation",
"language": "de"
},
{
"source_code": "BDSG_VOLLTEXT",
"name": "BDSG Volltext",
"full_name": "Bundesdatenschutzgesetz (BDSG) - Volltext",
"organization": "Bundesrepublik Deutschland",
"source_url": "https://www.gesetze-im-internet.de/bdsg_2018/",
"license_code": "PUBLIC_DOMAIN",
"attribution_text": "Quelle: BDSG, Bundesrepublik Deutschland",
"document_type": "legislation",
"language": "de"
},
{
"source_code": "AI_ACT_SUMMARY",
"name": "AI Act Zusammenfassung",
"full_name": "EU KI-Verordnung (AI Act) - Zusammenfassung und Kernpunkte",
"organization": "Europaeische Union",
"source_url": "https://eur-lex.europa.eu/legal-content/DE/TXT/?uri=CELEX:32024R1689",
"license_code": "CC-BY-4.0",
"attribution_text": "Quelle: AI Act, Europaeische Union, CC BY 4.0",
"document_type": "legislation",
"language": "de"
},
{
"source_code": "DSK_KURZPAPIERE_ALLE",
"name": "DSK Kurzpapiere (alle 20)",
"full_name": "Datenschutzkonferenz - Alle 20 Kurzpapiere zur DSGVO",
"organization": "Datenschutzkonferenz (DSK)",
"source_url": "https://www.datenschutzkonferenz-online.de/kurzpapiere.html",
"license_code": "DL-DE-BY-2.0",
"attribution_text": "Quelle: DSK Kurzpapiere, Datenschutzkonferenz, Lizenz: dl-de/by-2-0",
"document_type": "guidance",
"language": "de"
},
{
"source_code": "SDM_V3",
"name": "Standard-Datenschutzmodell V3.0",
"full_name": "SDM - Standard-Datenschutzmodell Version 3.0",
"organization": "Datenschutzkonferenz (DSK)",
"source_url": "https://www.datenschutz-mv.de/datenschutz/sdm/",
"license_code": "DL-DE-BY-2.0",
"attribution_text": "Quelle: SDM V3.0, Datenschutzkonferenz, Lizenz: dl-de/by-2-0",
"document_type": "standard",
"language": "de"
},
# === EDPB Ergaenzende Leitlinien ===
# MIGRATED to bp_legal_corpus via migrate_rag_chunks.py (2026-02-10)
{
"source_code": "EDPB_GUIDELINES_2_2019",
"name": "EDPB Leitlinien 2/2019 zu Art. 6(1)(b)",
"full_name": "EDPB Leitlinien 2/2019 zur Verarbeitung personenbezogener Daten auf Grundlage von Art. 6 Abs. 1 lit. b DSGVO",
"organization": "European Data Protection Board",
"source_url": "https://edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-22019-processing-personal-data-under-article-61b_en",
"license_code": "EDPB-LICENSE",
"attribution_text": "Source: EDPB Guidelines 2/2019, European Data Protection Board",
"document_type": "guideline",
"language": "en",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "EDPB_GUIDELINES_3_2019",
"name": "EDPB Leitlinien 3/2019 Videoueberwachung",
"full_name": "EDPB Leitlinien 3/2019 zur Verarbeitung personenbezogener Daten durch Videoueberwachung",
"organization": "European Data Protection Board",
"source_url": "https://edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-32019-processing-personal-data-through-video_en",
"license_code": "EDPB-LICENSE",
"attribution_text": "Source: EDPB Guidelines 3/2019, European Data Protection Board",
"document_type": "guideline",
"language": "en",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "EDPB_GUIDELINES_5_2020",
"name": "EDPB Leitlinien 5/2020 Einwilligung",
"full_name": "EDPB Leitlinien 5/2020 zur Einwilligung gemaess Verordnung 2016/679",
"organization": "European Data Protection Board",
"source_url": "https://edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-052020-consent-under-regulation-2016679_en",
"license_code": "EDPB-LICENSE",
"attribution_text": "Source: EDPB Guidelines 5/2020, European Data Protection Board",
"document_type": "guideline",
"language": "en",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "EDPB_GUIDELINES_7_2020",
"name": "EDPB Leitlinien 7/2020 Controller/Processor",
"full_name": "EDPB Leitlinien 7/2020 zu den Begriffen Verantwortlicher und Auftragsverarbeiter",
"organization": "European Data Protection Board",
"source_url": "https://edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-072020-concepts-controller-and-processor-gdpr_en",
"license_code": "EDPB-LICENSE",
"attribution_text": "Source: EDPB Guidelines 7/2020, European Data Protection Board",
"document_type": "guideline",
"language": "en",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "EDPB_GUIDELINES_1_2022",
"name": "EDPB Leitlinien 1/2022 Bussgelder",
"full_name": "EDPB Leitlinien 04/2022 zur Berechnung von Bussgeldern nach der DSGVO",
"organization": "European Data Protection Board",
"source_url": "https://edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-042022-calculation-administrative-fines-under-gdpr_en",
"license_code": "EDPB-LICENSE",
"attribution_text": "Source: EDPB Guidelines 04/2022, European Data Protection Board",
"document_type": "guideline",
"language": "en",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "SCC_FULL_TEXT",
"name": "Standard Contractual Clauses Volltext",
"full_name": "Standardvertragsklauseln fuer die Uebermittlung personenbezogener Daten an Drittlaender (2021/914/EU)",
"organization": "Europaeische Kommission",
"source_url": "https://eur-lex.europa.eu/eli/dec_impl/2021/914/oj",
"license_code": "PUBLIC_DOMAIN",
"attribution_text": "Quelle: SCC Volltext, Europaeische Kommission (EUR-Lex)",
"document_type": "regulation",
"language": "de",
"migrated_to": "bp_legal_corpus"
},
# === Nationale Datenschutzgesetze (DSGVO-Umsetzungen) ===
# MIGRATED to bp_legal_corpus via migrate_rag_chunks.py (2026-02-10)
# These sources are kept here for reference but will be skipped during ingestion.
# Ingestion should target bp_legal_corpus for these source codes.
{
"source_code": "BDSG_FULL",
"name": "BDSG Volltext (Deutschland)",
"full_name": "Bundesdatenschutzgesetz (BDSG) - Volltext inkl. aller Teile",
"organization": "Bundesrepublik Deutschland",
"source_url": "https://www.gesetze-im-internet.de/bdsg_2018/",
"license_code": "PUBLIC_DOMAIN",
"attribution_text": "Quelle: BDSG, Bundesrepublik Deutschland (gesetze-im-internet.de)",
"document_type": "legislation",
"language": "de",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "AT_DSG",
"name": "DSG Oesterreich",
"full_name": "Bundesgesetz zum Schutz natuerlicher Personen bei der Verarbeitung personenbezogener Daten (Datenschutzgesetz - DSG)",
"organization": "Republik Oesterreich",
"source_url": "https://www.ris.bka.gv.at/GeltendeFassung.wxe?Abfrage=Bundesnormen&Gesetzesnummer=10001597",
"license_code": "PUBLIC_DOMAIN",
"attribution_text": "Quelle: DSG, Republik Oesterreich (RIS)",
"document_type": "legislation",
"language": "de",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "CH_DSG",
"name": "DSG Schweiz (revDSG 2023)",
"full_name": "Bundesgesetz ueber den Datenschutz (Datenschutzgesetz, DSG) - revidierte Fassung 2023",
"organization": "Schweizerische Eidgenossenschaft",
"source_url": "https://www.fedlex.admin.ch/eli/cc/2022/491/de",
"license_code": "PUBLIC_DOMAIN",
"attribution_text": "Quelle: DSG, Schweizerische Eidgenossenschaft (Fedlex)",
"document_type": "legislation",
"language": "de",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "LI_DSG",
"name": "DSG Liechtenstein",
"full_name": "Datenschutzgesetz (DSG) Liechtenstein",
"organization": "Fuerstentum Liechtenstein",
"source_url": "https://www.gesetze.li/konso/2018.272",
"license_code": "PUBLIC_DOMAIN",
"attribution_text": "Quelle: DSG, Fuerstentum Liechtenstein (gesetze.li)",
"document_type": "legislation",
"language": "de",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "FR_CNIL_GUIDE",
"name": "CNIL Guide RGPD",
"full_name": "Guide pratique RGPD - Commission Nationale de l'Informatique et des Libertes",
"organization": "CNIL (France)",
"source_url": "https://www.cnil.fr/fr/rgpd-de-quoi-parle-t-on",
"license_code": "PUBLIC_DOMAIN",
"attribution_text": "Source: CNIL Guide RGPD, Commission Nationale de l'Informatique et des Libertes",
"document_type": "guideline",
"language": "fr",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "ES_LOPDGDD",
"name": "LOPDGDD Spanien",
"full_name": "Ley Organica de Proteccion de Datos Personales y garantia de los derechos digitales",
"organization": "Reino de Espana",
"source_url": "https://www.boe.es/buscar/act.php?id=BOE-A-2018-16673",
"license_code": "PUBLIC_DOMAIN",
"attribution_text": "Fuente: LOPDGDD, Reino de Espana (BOE)",
"document_type": "legislation",
"language": "es",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "IT_CODICE_PRIVACY",
"name": "Codice Privacy Italien",
"full_name": "Codice in materia di protezione dei dati personali (D.Lgs. 196/2003, aggiornato D.Lgs. 101/2018)",
"organization": "Repubblica Italiana",
"source_url": "https://www.garanteprivacy.it/home/docweb/-/docweb-display/docweb/9042678",
"license_code": "PUBLIC_DOMAIN",
"attribution_text": "Fonte: Codice Privacy, Garante per la protezione dei dati personali",
"document_type": "legislation",
"language": "it",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "NL_UAVG",
"name": "UAVG Niederlande",
"full_name": "Uitvoeringswet Algemene verordening gegevensbescherming (UAVG)",
"organization": "Koninkrijk der Nederlanden",
"source_url": "https://wetten.overheid.nl/BWBR0040940/",
"license_code": "PUBLIC_DOMAIN",
"attribution_text": "Bron: UAVG, Koninkrijk der Nederlanden (wetten.overheid.nl)",
"document_type": "legislation",
"language": "nl",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "BE_DPA_LAW",
"name": "Datenschutzgesetz Belgien",
"full_name": "Loi relative a la protection des personnes physiques a l'egard des traitements de donnees a caractere personnel",
"organization": "Royaume de Belgique",
"source_url": "https://www.ejustice.just.fgov.be/cgi_loi/change_lg.pl?language=fr&la=F&cn=2018073046",
"license_code": "PUBLIC_DOMAIN",
"attribution_text": "Source: Loi Protection des Donnees, Royaume de Belgique (eJustice)",
"document_type": "legislation",
"language": "fr",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "LU_DPA_LAW",
"name": "Datenschutzgesetz Luxemburg",
"full_name": "Loi du 1er aout 2018 portant organisation de la Commission nationale pour la protection des donnees",
"organization": "Grand-Duche de Luxembourg",
"source_url": "https://legilux.public.lu/eli/etat/leg/loi/2018/08/01/a686/jo",
"license_code": "PUBLIC_DOMAIN",
"attribution_text": "Source: Loi Protection des Donnees, Grand-Duche de Luxembourg (Legilux)",
"document_type": "legislation",
"language": "fr",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "IE_DPA_2018",
"name": "Data Protection Act 2018 Ireland",
"full_name": "Data Protection Act 2018 (Act No. 7 of 2018) - Ireland",
"organization": "Government of Ireland",
"source_url": "https://www.irishstatutebook.ie/eli/2018/act/7/enacted/en/html",
"license_code": "OGL-3.0",
"attribution_text": "Contains public sector information licensed under the Open Government Licence v3.0. Source: Data Protection Act 2018, Ireland",
"document_type": "legislation",
"language": "en",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "UK_DPA_2018",
"name": "Data Protection Act 2018 UK",
"full_name": "Data Protection Act 2018 (c. 12) - United Kingdom",
"organization": "Government of the United Kingdom",
"source_url": "https://www.legislation.gov.uk/ukpga/2018/12/contents/enacted",
"license_code": "OGL-3.0",
"attribution_text": "Contains public sector information licensed under the Open Government Licence v3.0. Source: Data Protection Act 2018, UK",
"document_type": "legislation",
"language": "en",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "UK_GDPR",
"name": "UK GDPR (retained EU law)",
"full_name": "United Kingdom General Data Protection Regulation (UK GDPR) - retained EU law",
"organization": "Government of the United Kingdom",
"source_url": "https://www.legislation.gov.uk/eur/2016/679/contents",
"license_code": "OGL-3.0",
"attribution_text": "Contains public sector information licensed under the Open Government Licence v3.0. Source: UK GDPR, legislation.gov.uk",
"document_type": "legislation",
"language": "en",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "NO_PERSONOPPLYSNINGSLOVEN",
"name": "Personopplysningsloven Norwegen",
"full_name": "Lov om behandling av personopplysninger (personopplysningsloven)",
"organization": "Kongeriket Norge",
"source_url": "https://lovdata.no/dokument/NL/lov/2018-06-15-38",
"license_code": "PUBLIC_DOMAIN",
"attribution_text": "Kilde: Personopplysningsloven, Kongeriket Norge (Lovdata)",
"document_type": "legislation",
"language": "no",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "SE_DATASKYDDSLAG",
"name": "Dataskyddslag Schweden",
"full_name": "Lag (2018:218) med kompletterande bestammelser till EU:s dataskyddsforordning",
"organization": "Konungariket Sverige",
"source_url": "https://www.riksdagen.se/sv/dokument-och-lagar/dokument/svensk-forfattningssamling/lag-2018218-med-kompletterande-bestammelser-till_sfs-2018-218/",
"license_code": "PUBLIC_DOMAIN",
"attribution_text": "Kalla: Dataskyddslag (2018:218), Konungariket Sverige (Riksdagen)",
"document_type": "legislation",
"language": "sv",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "DK_DATABESKYTTELSESLOVEN",
"name": "Databeskyttelsesloven Daenemark",
"full_name": "Lov om supplerende bestemmelser til forordning om beskyttelse af fysiske personer i forbindelse med behandling af personoplysninger",
"organization": "Kongeriget Danmark",
"source_url": "https://www.retsinformation.dk/eli/lta/2018/502",
"license_code": "PUBLIC_DOMAIN",
"attribution_text": "Kilde: Databeskyttelsesloven, Kongeriget Danmark (Retsinformation)",
"document_type": "legislation",
"language": "da",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "FI_TIETOSUOJALAKI",
"name": "Tietosuojalaki Finnland",
"full_name": "Tietosuojalaki (1050/2018) - Datenschutzgesetz Finnland",
"organization": "Suomen tasavalta",
"source_url": "https://www.finlex.fi/fi/laki/ajantasa/2018/20181050",
"license_code": "PUBLIC_DOMAIN",
"attribution_text": "Lahde: Tietosuojalaki, Suomen tasavalta (Finlex)",
"document_type": "legislation",
"language": "fi",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "PL_UODO",
"name": "UODO Polen",
"full_name": "Ustawa o ochronie danych osobowych - Datenschutzgesetz Polen",
"organization": "Rzeczpospolita Polska",
"source_url": "https://isap.sejm.gov.pl/isap.nsf/DocDetails.xsp?id=WDU20180001000",
"license_code": "PUBLIC_DOMAIN",
"attribution_text": "Zrodlo: Ustawa o ochronie danych osobowych, Rzeczpospolita Polska (ISAP)",
"document_type": "legislation",
"language": "pl",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "CZ_ZOU",
"name": "Zakon o ochrane osobnich udaju Tschechien",
"full_name": "Zakon c. 110/2019 Sb. o zpracovani osobnich udaju",
"organization": "Ceska republika",
"source_url": "https://www.zakonyprolidi.cz/cs/2019-110",
"license_code": "PUBLIC_DOMAIN",
"attribution_text": "Zdroj: Zakon o ochrane osobnich udaju, Ceska republika (zakonyprolidi.cz)",
"document_type": "legislation",
"language": "cs",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "HU_INFOTV",
"name": "Informacios torvenye Ungarn",
"full_name": "2011. evi CXII. torveny az informacios onrendelkezesi jogrol es az informacioszabadsagrol (Infotv.)",
"organization": "Magyarorszag",
"source_url": "https://njt.hu/jogszabaly/2011-112-00-00",
"license_code": "PUBLIC_DOMAIN",
"attribution_text": "Forras: Infotv., Magyarorszag (njt.hu)",
"document_type": "legislation",
"language": "hu",
"migrated_to": "bp_legal_corpus"
},
]
# =============================================================================
# Chunking Configuration
# =============================================================================
# Chunking rules keyed by source code. Every entry names a "strategy"
# ("section_based", "list_item" or "recursive") together with the parameters
# that strategy consumes; "DEFAULT" is the entry used for unknown sources.
DSFA_CHUNK_CONFIG = {
    # WP248: chunk each criterion (K1 .. K9) separately.
    "WP248": {
        "strategy": "section_based",
        "section_markers": [rf"K{n}[:\s]" for n in range(1, 10)],
        "max_chunk_size": 1500,
        "overlap": 200,
    },
    # DSK short paper: each process step ("Schritt" 1 .. 6) separately.
    "DSK_KP5": {
        "strategy": "section_based",
        "section_markers": [rf"Schritt\s*{n}" for n in range(1, 7)],
        "max_chunk_size": 1200,
        "overlap": 150,
    },
    # SDM V2.0: each protection goal ("Gewährleistungsziel") separately.
    "SDM_V2": {
        "strategy": "section_based",
        "section_markers": [
            r"Gewährleistungsziel\s+\d",
            r"\d+\.\d+\s+",
        ],
        "max_chunk_size": 1200,
        "overlap": 150,
        "categories": ["sdm_goal", "methodology", "implementation"],
    },
    # Mandatory lists ("Muss-Listen"): every list entry becomes one chunk.
    "MUSS_LISTEN": {
        "strategy": "list_item",
        "list_markers": [r"^•", r"^-", r"^\d+\."],
        "max_chunk_size": 800,
        "overlap": 0,
    },
    # DSK short paper no. 1: each VVT step separately.
    "DSK_KP1": {
        "strategy": "section_based",
        "section_markers": [r"Schritt\s+\d", r"\d+\.\s+"],
        "max_chunk_size": 1000,
        "overlap": 150,
        "categories": ["vvt_guidance", "art30_requirements", "controller_duties"],
    },
    # ICO RoPA: each section separately.
    "ICO_ROPA": {
        "strategy": "section_based",
        "section_markers": [r"What\s+should", r"How\s+to", r"Controller", r"Processor"],
        "max_chunk_size": 1000,
        "overlap": 150,
        "categories": ["vvt_guidance", "art30_requirements", "ropa_templates"],
    },
    # SDM building blocks ("Bausteine"): each protection goal separately.
    "SDM_BAUSTEINE": {
        "strategy": "section_based",
        "section_markers": [
            r"Baustein\s+\d",
            r"Gewaehrleistungsziel",
            r"\d+\.\d+\s+",
        ],
        "max_chunk_size": 1200,
        "overlap": 150,
        "categories": ["sdm_goal", "tom_measure", "implementation"],
    },
    # DSK short paper no. 7: deletion ("Loeschung").
    "DSK_KP7": {
        "strategy": "section_based",
        "section_markers": [r"Schritt\s+\d", r"\d+\.\s+", r"Loeschkonzept"],
        "max_chunk_size": 1000,
        "overlap": 150,
        "categories": ["loeschung", "art17_requirements", "retention_guidance"],
    },
    # Fallback for any source code without a dedicated entry.
    "DEFAULT": {
        "strategy": "recursive",
        "max_chunk_size": 1000,
        "overlap": 200,
    },
}
# =============================================================================
# Data Classes
# =============================================================================
@dataclass
class DSFAChunkPayload:
    """Payload schema for Qdrant points.

    DSFAQdrantService.index_chunks builds one instance per chunk and stores
    ``dataclasses.asdict(instance)`` as the point payload, so the field names
    below are the payload keys that search filters refer to.
    """
    # Identifiers linking the point back to its chunk, document and source.
    chunk_id: str
    document_id: str
    source_id: str
    # Chunk text that was embedded.
    content: str
    section_title: Optional[str] = None
    # Source attribution and licensing details.
    source_code: str = ""
    source_name: str = ""
    attribution_text: str = ""
    license_code: str = ""
    attribution_required: bool = True
    # Classification fields; "source_code", "document_type" and "category"
    # are the keys DSFAQdrantService.search filters on.
    document_type: str = ""
    category: str = ""
    language: str = "de"
    page_number: Optional[int] = None
@dataclass
class DSFASearchResult:
    """Search result with attribution.

    All fields are required (no defaults). generate_attribution_notice reads
    ``license_code`` and ``source_name`` from these objects.
    """
    # The matched chunk and its similarity score.
    chunk_id: str
    content: str
    score: float
    # Source and licence information needed to attribute the chunk.
    source_code: str
    source_name: str
    attribution_text: str
    license_code: str
    license_url: Optional[str]
    attribution_required: bool
    source_url: Optional[str]
    # Classification and position of the chunk within its document.
    document_type: str
    category: str
    section_title: Optional[str]
    page_number: Optional[int]
# =============================================================================
# Database Operations
# =============================================================================
class DSFACorpusStore:
    """PostgreSQL operations for the DSFA corpus (sources, documents, chunks).

    Every method borrows a connection from the asyncpg pool passed to the
    constructor for the duration of a single call.
    """

    def __init__(self, pool: asyncpg.Pool):
        self.pool = pool

    @staticmethod
    def _source_params(source_data: Dict) -> tuple:
        """Build the positional SQL parameters ($1..$12) for a source row.

        Shared by the UPDATE and INSERT statements in ``register_source`` so
        both always bind the same values in the same order.

        Raises:
            KeyError: if ``source_code``, ``name``, ``license_code`` or
                ``attribution_text`` is missing from ``source_data``.
        """
        license_code = source_data["license_code"]
        # Whether attribution is required comes from the license registry,
        # not from the source entry; unknown licenses default to True.
        attribution_required = LICENSE_REGISTRY.get(license_code, {}).get(
            "attribution_required", True
        )
        return (
            source_data["source_code"],
            source_data["name"],
            source_data.get("full_name"),
            source_data.get("organization"),
            source_data.get("source_url"),
            source_data.get("eur_lex_celex"),
            license_code,
            source_data.get("license_url"),
            attribution_required,
            source_data["attribution_text"],
            source_data.get("document_type"),
            source_data.get("language", "de"),
        )

    async def register_source(self, source_data: Dict) -> str:
        """Register a DSFA source, updating the row if its code already exists.

        Args:
            source_data: Source description; see ``_source_params`` for the
                keys that are read.

        Returns:
            The id of the existing or newly inserted ``dsfa_sources`` row,
            as a string.

        Raises:
            KeyError: if a required key is missing (raised before any
                database access).
        """
        params = self._source_params(source_data)
        async with self.pool.acquire() as conn:
            # NOTE(review): the lookup and the following write are separate
            # statements outside a transaction; concurrent registration of
            # the same source_code could race.
            existing = await conn.fetchval(
                "SELECT id FROM dsfa_sources WHERE source_code = $1",
                params[0],
            )
            if existing:
                await conn.execute(
                    """
                    UPDATE dsfa_sources SET
                        name = $2,
                        full_name = $3,
                        organization = $4,
                        source_url = $5,
                        eur_lex_celex = $6,
                        license_code = $7,
                        license_url = $8,
                        attribution_required = $9,
                        attribution_text = $10,
                        document_type = $11,
                        language = $12,
                        updated_at = NOW()
                    WHERE source_code = $1
                    """,
                    *params,
                )
                return str(existing)

            source_id = await conn.fetchval(
                """
                INSERT INTO dsfa_sources (
                    source_code, name, full_name, organization, source_url,
                    eur_lex_celex, license_code, license_url, attribution_required,
                    attribution_text, document_type, language
                ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12)
                RETURNING id
                """,
                *params,
            )
            return str(source_id)

    async def get_source_by_code(self, source_code: str) -> Optional[Dict]:
        """Return the ``dsfa_sources`` row for ``source_code`` as a dict, or None."""
        async with self.pool.acquire() as conn:
            row = await conn.fetchrow(
                "SELECT * FROM dsfa_sources WHERE source_code = $1",
                source_code,
            )
            return dict(row) if row else None

    async def list_sources(self) -> List[Dict]:
        """Return all registered sources as dicts, ordered by source code."""
        async with self.pool.acquire() as conn:
            rows = await conn.fetch(
                "SELECT * FROM dsfa_sources ORDER BY source_code"
            )
            return [dict(row) for row in rows]

    async def create_document(
        self,
        source_id: str,
        title: str,
        file_name: Optional[str] = None,
        file_type: Optional[str] = None,
        minio_path: Optional[str] = None,
        original_url: Optional[str] = None,
        metadata: Optional[Dict] = None
    ) -> str:
        """Insert a ``dsfa_documents`` row and return its id as a string.

        Args:
            source_id: UUID string of the owning source.
            metadata: Optional dict, stored as JSONB (``{}`` when omitted).

        Raises:
            ValueError: if ``source_id`` is not a valid UUID string.
        """
        import json

        metadata_json = json.dumps(metadata or {})
        async with self.pool.acquire() as conn:
            doc_id = await conn.fetchval(
                """
                INSERT INTO dsfa_documents (
                    source_id, title, file_name, file_type, minio_path,
                    original_url, metadata
                ) VALUES ($1, $2, $3, $4, $5, $6, $7::jsonb)
                RETURNING id
                """,
                uuid.UUID(source_id),
                title,
                file_name,
                file_type,
                minio_path,
                original_url,
                metadata_json,
            )
            return str(doc_id)

    async def create_chunk(
        self,
        document_id: str,
        source_id: str,
        content: str,
        chunk_index: int,
        section_title: Optional[str] = None,
        page_number: Optional[int] = None,
        category: Optional[str] = None,
        qdrant_point_id: Optional[str] = None,
        metadata: Optional[Dict] = None
    ) -> str:
        """Insert a ``dsfa_document_chunks`` row and return its id as a string.

        A SHA-256 hex digest of the UTF-8 encoded ``content`` is stored in
        the ``content_hash`` column.

        Raises:
            ValueError: if ``document_id`` or ``source_id`` is not a valid
                UUID string.
        """
        import json

        content_hash = hashlib.sha256(content.encode("utf-8")).hexdigest()
        async with self.pool.acquire() as conn:
            chunk_id = await conn.fetchval(
                """
                INSERT INTO dsfa_document_chunks (
                    document_id, source_id, content, content_hash, chunk_index,
                    section_title, page_number, category, qdrant_point_id, metadata
                ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10::jsonb)
                RETURNING id
                """,
                uuid.UUID(document_id),
                uuid.UUID(source_id),
                content,
                content_hash,
                chunk_index,
                section_title,
                page_number,
                category,
                qdrant_point_id,
                json.dumps(metadata or {}),
            )
            return str(chunk_id)

    async def get_chunk_with_attribution(self, chunk_id: str) -> Optional[Dict]:
        """Return one row of ``dsfa_chunk_with_attribution`` as a dict, or None.

        NOTE(review): ``dsfa_chunk_with_attribution`` is presumably a view
        created by the SQL migration - confirm there.
        """
        async with self.pool.acquire() as conn:
            row = await conn.fetchrow(
                """
                SELECT * FROM dsfa_chunk_with_attribution
                WHERE chunk_id = $1
                """,
                uuid.UUID(chunk_id),
            )
            return dict(row) if row else None

    async def get_source_stats(self) -> List[Dict]:
        """Return every row of ``dsfa_source_stats`` as a dict."""
        async with self.pool.acquire() as conn:
            rows = await conn.fetch("SELECT * FROM dsfa_source_stats")
            return [dict(row) for row in rows]

    async def update_document_indexed(self, document_id: str, chunks_count: int):
        """Record that a document was indexed.

        Sets ``chunks_generated`` to ``chunks_count``, ``last_indexed_at`` to
        the database's current time and ``text_extracted`` to true.
        """
        async with self.pool.acquire() as conn:
            await conn.execute(
                """
                UPDATE dsfa_documents
                SET chunks_generated = $2,
                    last_indexed_at = NOW(),
                    text_extracted = true
                WHERE id = $1
                """,
                uuid.UUID(document_id),
                chunks_count,
            )
# =============================================================================
# Qdrant Operations
# =============================================================================
class DSFAQdrantService:
    """Qdrant operations for the DSFA corpus collection.

    The methods are declared ``async`` but invoke the synchronous
    ``QdrantClient`` API without awaiting it.
    """

    def __init__(self, url: Optional[str] = None):
        # Fall back to the module-level QDRANT_URL when no URL is given.
        self.url = url or QDRANT_URL
        self._client = None

    @property
    def client(self) -> QdrantClient:
        """Return the Qdrant client, creating it on first access."""
        if self._client is None:
            self._client = QdrantClient(url=self.url, check_compatibility=False)
        return self._client

    async def ensure_collection(self) -> bool:
        """Create the DSFA collection if it does not exist yet.

        Returns:
            True when the collection exists or was created; False when any
            exception occurred (the error is printed, not raised).
        """
        try:
            existing_names = {
                c.name for c in self.client.get_collections().collections
            }
            if DSFA_COLLECTION not in existing_names:
                self.client.create_collection(
                    collection_name=DSFA_COLLECTION,
                    vectors_config=VectorParams(
                        size=VECTOR_SIZE,
                        distance=Distance.COSINE
                    )
                )
                print(f"Created collection: {DSFA_COLLECTION}")
            return True
        except Exception as e:
            # Deliberate best-effort boundary: callers get a boolean result.
            print(f"Error ensuring collection: {e}")
            return False

    async def index_chunks(
        self,
        chunks: List[Dict],
        embeddings: List[List[float]]
    ) -> int:
        """Upsert chunks and their embeddings into the DSFA collection.

        Args:
            chunks: Chunk dicts; ``chunk_id``, ``document_id``, ``source_id``,
                ``content``, ``source_code``, ``source_name``,
                ``attribution_text`` and ``license_code`` are required keys,
                the remaining payload fields are optional.
            embeddings: One vector per chunk, in the same order.

        Returns:
            Number of points sent to Qdrant (0 when either list is empty).
            Chunks and embeddings are paired with ``zip``, so surplus items
            of the longer list are ignored; compare the return value with
            ``len(chunks)`` to detect that.

        Raises:
            KeyError: if a chunk lacks one of the required keys.
        """
        if not chunks or not embeddings:
            return 0

        points = []
        for chunk, embedding in zip(chunks, embeddings):
            # uuid5 is deterministic, so re-indexing the same chunk_id
            # upserts the same point instead of adding a duplicate.
            point_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, chunk["chunk_id"]))
            payload = DSFAChunkPayload(
                chunk_id=chunk["chunk_id"],
                document_id=chunk["document_id"],
                source_id=chunk["source_id"],
                content=chunk["content"],
                section_title=chunk.get("section_title"),
                source_code=chunk["source_code"],
                source_name=chunk["source_name"],
                attribution_text=chunk["attribution_text"],
                license_code=chunk["license_code"],
                attribution_required=chunk.get("attribution_required", True),
                document_type=chunk.get("document_type", ""),
                category=chunk.get("category", ""),
                language=chunk.get("language", "de"),
                page_number=chunk.get("page_number")
            )
            points.append(
                PointStruct(
                    id=point_id,
                    vector=embedding,
                    payload=asdict(payload)
                )
            )

        self.client.upsert(collection_name=DSFA_COLLECTION, points=points)
        return len(points)

    @staticmethod
    def _match_any(key: str, values: List[str]):
        """Build a condition that is true when ``key`` equals any of ``values``.

        A single value yields a plain ``FieldCondition``; several values
        yield a nested ``Filter(should=[...])`` so they are OR-ed.
        """
        conditions = [
            FieldCondition(key=key, match=MatchValue(value=value))
            for value in values
        ]
        if len(conditions) == 1:
            return conditions[0]
        return Filter(should=conditions)

    async def search(
        self,
        query_embedding: List[float],
        source_codes: Optional[List[str]] = None,
        document_types: Optional[List[str]] = None,
        categories: Optional[List[str]] = None,
        limit: int = 10
    ) -> List[Dict]:
        """Search the DSFA corpus with optional payload filters.

        Values inside one filter list are alternatives (OR); the different
        filter lists are combined with AND. Putting every value into one
        flat ``must`` list would require a point to equal all values of the
        same field at once, which can never match for two or more values.

        Returns:
            One dict per hit holding ``id``, ``score`` and the payload
            fields of the point.
        """
        requested = (
            ("source_code", source_codes),
            ("document_type", document_types),
            ("category", categories),
        )
        must_conditions = [
            self._match_any(key, values) for key, values in requested if values
        ]
        query_filter = Filter(must=must_conditions) if must_conditions else None

        # Use query_points for newer qdrant-client API
        results = self.client.query_points(
            collection_name=DSFA_COLLECTION,
            query=query_embedding,
            query_filter=query_filter,
            limit=limit
        )
        return [
            {
                "id": str(r.id),
                "score": r.score,
                # A point stored without payload comes back as None.
                **(r.payload or {})
            }
            for r in results.points
        ]

    async def get_stats(self) -> Dict:
        """Return collection statistics, or an ``error`` dict on failure."""
        try:
            info = self.client.get_collection(DSFA_COLLECTION)
            return {
                "collection": DSFA_COLLECTION,
                # NOTE(review): vectors_count may be absent on newer
                # qdrant-client releases - read defensively so the other
                # statistics are still reported.
                "vectors_count": getattr(info, "vectors_count", None),
                "points_count": info.points_count,
                "status": info.status.value
            }
        except Exception as e:
            return {"error": str(e), "collection": DSFA_COLLECTION}
# =============================================================================
# Chunking Functions
# =============================================================================
def chunk_text_recursive(text: str, max_size: int = 1000, overlap: int = 200) -> List[Dict]:
    """Split text into overlapping chunks, preferring natural break points.

    Each window of at most ``max_size`` characters is shortened to end just
    after the last paragraph break, line break, sentence end, comma or space
    (tried in that order) that lies in the second half of the window.

    Args:
        text: Text to split.
        max_size: Maximum chunk length in characters (values below 1 are
            treated as 1).
        overlap: Number of characters the next chunk re-reads from the end
            of the previous one.

    Returns:
        Dicts with ``content`` (stripped chunk text), ``start_char`` and
        ``end_char`` (offsets of the unstripped window in ``text``). Windows
        that are empty after stripping are omitted.
    """
    separators = ("\n\n", "\n", ". ", ", ", " ")
    # A non-positive window could never consume any text.
    window = max(max_size, 1)
    text_len = len(text)

    chunks: List[Dict] = []
    start = 0
    while start < text_len:
        end = min(start + window, text_len)

        # Find a good break point (paragraph, line, sentence, ...).
        if end < text_len:
            candidate = text[start:end]
            for sep in separators:
                last_sep = candidate.rfind(sep)
                if last_sep > window // 2:
                    end = start + last_sep + len(sep)
                    break

        content = text[start:end].strip()
        if content:
            chunks.append({
                "content": content,
                "start_char": start,
                "end_char": end
            })

        if end >= text_len:
            break
        # Always advance by at least one character: with an overlap as
        # large as the chunk itself, ``end - overlap`` would not move past
        # ``start`` and the loop would never terminate.
        start = max(end - overlap, start + 1)

    return chunks
def chunk_by_sections(text: str, markers: List[str], max_size: int = 1500, overlap: int = 200) -> List[Dict]:
    """Chunk text at section markers.

    A section runs from the start of one marker match to the start of the
    next match (or the end of the text). Text before the first match is not
    part of any section and is not returned. Sections longer than
    ``max_size`` are split further with ``chunk_text_recursive``.

    Args:
        text: Text to split.
        markers: Regular expressions marking section starts; matched
            case-insensitively in multi-line mode.
        max_size: Maximum length of a chunk in characters.
        overlap: Overlap used when an oversized section is split.

    Returns:
        Dicts with ``content``, ``section_title`` (the matched marker text;
        follow-up pieces of a split section get a " (cont.)" suffix),
        ``start_char`` and ``end_char``. When ``markers`` is empty or nothing
        matches, the result of ``chunk_text_recursive`` is returned instead,
        whose dicts carry no ``section_title``.
    """
    # An empty marker list would build an empty pattern, which matches at
    # every position and would produce one-character "sections".
    if not markers:
        return chunk_text_recursive(text, max_size, overlap)

    section_start = re.compile(
        "|".join(f"(?:{m})" for m in markers),
        re.IGNORECASE | re.MULTILINE,
    )
    matches = list(section_start.finditer(text))
    if not matches:
        return chunk_text_recursive(text, max_size, overlap)

    # Every section ends where the next one starts; the last ends with the text.
    boundaries = [m.start() for m in matches] + [len(text)]

    chunks: List[Dict] = []
    for match, start, end in zip(matches, boundaries, boundaries[1:]):
        section_text = text[start:end].strip()
        section_title = match.group(0).strip()

        if len(section_text) <= max_size:
            chunks.append({
                "content": section_text,
                "section_title": section_title,
                "start_char": start,
                "end_char": end
            })
            continue

        # Oversized section: split it and shift offsets back into ``text``.
        for j, sub in enumerate(chunk_text_recursive(section_text, max_size, overlap)):
            chunks.append({
                "content": sub["content"],
                "section_title": section_title if j == 0 else f"{section_title} (cont.)",
                "start_char": start + sub["start_char"],
                "end_char": start + sub["end_char"]
            })

    return chunks
def chunk_by_list_items(text: str, markers: List[str], max_size: int = 800) -> List[Dict]:
    """Chunk text so that every list item becomes one chunk.

    A line whose stripped form matches one of ``markers`` starts a new item;
    all following lines up to the next marker line belong to that item. Text
    before the first marker line forms its own chunk.

    Args:
        text: Text to split.
        markers: Regular expressions recognising the start of a list item.
            With an empty list no line starts an item, so the whole text is
            returned as a single chunk.
        max_size: Accepted for interface compatibility; items are currently
            not split or truncated to this size.

    Returns:
        Dicts with ``content`` (stripped item text), ``start_char`` and
        ``end_char`` (offsets of the item's lines in ``text``, excluding the
        line break that ends the item). Items that are empty after stripping
        are omitted.
    """
    item_start = re.compile("|".join(f"(?:{m})" for m in markers)) if markers else None
    chunks: List[Dict] = []

    def append_item(lines: List[str], offset: int) -> None:
        # Offsets come from the joined lines themselves, so ``end_char`` is
        # exact for the leading block before the first marker as well.
        item_text = "\n".join(lines)
        content = item_text.strip()
        if content:
            chunks.append({
                "content": content,
                "start_char": offset,
                "end_char": offset + len(item_text)
            })

    item_lines: List[str] = []
    item_offset = 0
    line_offset = 0  # running offset of the current line within ``text``
    for line in text.split("\n"):
        if item_start is not None and item_start.match(line.strip()):
            append_item(item_lines, item_offset)
            item_lines = [line]
            item_offset = line_offset
        else:
            item_lines.append(line)
        line_offset += len(line) + 1  # +1 for the "\n" removed by split

    append_item(item_lines, item_offset)
    return chunks
def chunk_document(text: str, source_code: str) -> List[Dict]:
    """Chunk a document with the strategy configured for its source code.

    Source codes ending in ``_MUSS_PUBLIC`` or ``_MUSS_PRIVATE`` always use
    the ``MUSS_LISTEN`` configuration; any other code uses its own entry in
    ``DSFA_CHUNK_CONFIG`` or, if there is none, the ``DEFAULT`` entry.
    """
    if source_code.endswith(("_MUSS_PUBLIC", "_MUSS_PRIVATE")):
        config = DSFA_CHUNK_CONFIG["MUSS_LISTEN"]
    else:
        config = DSFA_CHUNK_CONFIG.get(source_code, DSFA_CHUNK_CONFIG["DEFAULT"])

    strategy = config["strategy"]

    if strategy == "section_based":
        return chunk_by_sections(
            text,
            config["section_markers"],
            config["max_chunk_size"],
            config["overlap"],
        )

    if strategy == "list_item":
        return chunk_by_list_items(
            text,
            config["list_markers"],
            config["max_chunk_size"],
        )

    # Any other strategy value is handled by plain recursive chunking.
    return chunk_text_recursive(
        text,
        config["max_chunk_size"],
        config["overlap"],
    )
# =============================================================================
# Attribution Functions
# =============================================================================
def generate_attribution_notice(results: List[DSFASearchResult]) -> str:
    """Generate a combined attribution notice for all used sources.

    Results are grouped by license code. For every license whose registry
    entry requires attribution (licenses missing from the registry count as
    requiring it), one line "<source names> - <license name>" is emitted.

    Licenses and source names appear in the order they first occur in
    ``results``, so the same input always yields the same text.

    Returns:
        The notice, headed by "Quellennachweis:", or an empty string when no
        license requires attribution.
    """
    # Dicts keep insertion order, so the inner dict acts as an ordered set
    # of source names per license.
    names_by_license: Dict[str, Dict[str, None]] = {}
    for result in results:
        names_by_license.setdefault(result.license_code, {})[result.source_name] = None

    notices = []
    for license_code, names in names_by_license.items():
        license_info = LICENSE_REGISTRY.get(license_code, {})
        if license_info.get("attribution_required", True):
            sources = ", ".join(names)
            license_name = license_info.get("name", license_code)
            notices.append(f"{sources} - {license_name}")

    if notices:
        return "Quellennachweis:\n" + "\n".join(notices)
    return ""
def get_license_label(license_code: str) -> str:
    """Return the registry's display name for a license.

    Falls back to ``license_code`` itself when the code is not in
    ``LICENSE_REGISTRY`` or its entry has no ``name``.
    """
    return LICENSE_REGISTRY.get(license_code, {}).get("name", license_code)
# =============================================================================
# Main Functions
# =============================================================================
async def init_dsfa_tables(pool: asyncpg.Pool):
    """Initialize the DSFA tables by executing the SQL migration file.

    The script ``migrations/003_dsfa_rag_tables.sql`` next to this module is
    read in full and sent to the database in a single ``execute`` call.

    Raises:
        OSError: if the migration file cannot be read (raised before a
            database connection is acquired).
    """
    migration_path = os.path.join(
        os.path.dirname(__file__),
        "migrations",
        "003_dsfa_rag_tables.sql"
    )

    # Read the file first so no pooled connection is held during file I/O.
    # An explicit encoding avoids depending on the platform default.
    # NOTE(review): assumes the migration file is stored as UTF-8 - confirm.
    with open(migration_path, "r", encoding="utf-8") as f:
        migration_sql = f.read()

    async with pool.acquire() as conn:
        await conn.execute(migration_sql)

    print("DSFA tables initialized successfully")
async def register_all_sources(pool: asyncpg.Pool):
    """Register every entry of ``DSFA_SOURCES`` that has not been migrated.

    Entries with a truthy ``migrated_to`` value are only reported and
    counted as skipped; all others are passed to
    ``DSFACorpusStore.register_source``. A summary line is printed at the end.
    """
    store = DSFACorpusStore(pool)
    registered = 0
    skipped = 0

    for source in DSFA_SOURCES:
        if not source.get("migrated_to"):
            source_id = await store.register_source(source)
            print(f"Registered source: {source['source_code']} -> {source_id}")
            registered += 1
        else:
            print(f"Skipping migrated source: {source['source_code']} -> {source['migrated_to']}")
            skipped += 1

    print(f"\nTotal sources registered: {registered} (skipped {skipped} migrated)")
async def get_ingestion_status(pool: asyncpg.Pool):
    """Print the current ingestion status.

    Prints one table row per entry of ``DSFACorpusStore.get_source_stats``
    (documents, chunks, last indexing time) followed by totals, then the
    statistics of the Qdrant collection or the error reported for it.
    """
    store = DSFACorpusStore(pool)
    qdrant = DSFAQdrantService()

    print("\n=== DSFA Corpus Status ===\n")

    # PostgreSQL stats
    stats = await store.get_source_stats()
    rule = "-" * 80
    print("PostgreSQL Sources:")
    print(rule)
    print(f"{'Source Code':<25} {'Documents':>10} {'Chunks':>10} {'Last Indexed':<20}")
    print(rule)

    total_docs = 0
    total_chunks = 0
    for s in stats:
        # ``.get(key, 0)`` still yields None when the column exists but is
        # NULL, which would break both the sums and the ``:>10`` format.
        doc_count = s.get("document_count") or 0
        chunk_count = s.get("chunk_count") or 0
        total_docs += doc_count
        total_chunks += chunk_count

        last_indexed = s.get("last_indexed_at")
        last_indexed_str = last_indexed.strftime("%Y-%m-%d %H:%M") if last_indexed else "Never"
        print(f"{s['source_code']:<25} {doc_count:>10} {chunk_count:>10} {last_indexed_str:<20}")

    print(rule)
    print(f"{'TOTAL':<25} {total_docs:>10} {total_chunks:>10}")

    # Qdrant stats
    print("\nQdrant Collection:")
    qdrant_stats = await qdrant.get_stats()
    if "error" in qdrant_stats:
        print(f" Error: {qdrant_stats['error']}")
    else:
        print(f" Collection: {qdrant_stats['collection']}")
        print(f" Points: {qdrant_stats['points_count']}")
        print(f" Status: {qdrant_stats['status']}")
async def main():
    """Command-line entry point.

    Parses the action flags and runs every requested action in a fixed
    order: tables, sources, Qdrant collection, status, ingestion. When no
    action is requested, the help text is printed and no database
    connection is opened. The connection pool is always closed on exit.
    """
    parser = argparse.ArgumentParser(description="DSFA Corpus Ingestion Pipeline")
    parser.add_argument("--init-sources", action="store_true", help="Register all sources")
    parser.add_argument("--init-tables", action="store_true", help="Initialize database tables")
    parser.add_argument("--ingest", type=str, help="Ingest specific source by code")
    parser.add_argument("--ingest-all", action="store_true", help="Ingest all sources")
    parser.add_argument("--status", action="store_true", help="Show ingestion status")
    parser.add_argument("--init-qdrant", action="store_true", help="Initialize Qdrant collection")
    args = parser.parse_args()

    # Nothing to do: show usage instead of connecting and exiting silently.
    if not any(vars(args).values()):
        parser.print_help()
        return

    # Connect to database
    pool = await asyncpg.create_pool(DATABASE_URL)
    try:
        if args.init_tables:
            await init_dsfa_tables(pool)

        if args.init_sources:
            await register_all_sources(pool)

        if args.init_qdrant:
            qdrant = DSFAQdrantService()
            # ensure_collection reports failure through its return value;
            # only claim success when it actually succeeded.
            if await qdrant.ensure_collection():
                print(f"Qdrant collection {DSFA_COLLECTION} initialized")
            else:
                print(f"Qdrant collection {DSFA_COLLECTION} could not be initialized")

        if args.status:
            await get_ingestion_status(pool)

        if args.ingest:
            print(f"Ingesting source: {args.ingest}")
            # TODO: Implement document ingestion

        if args.ingest_all:
            print("Ingesting all sources...")
            # TODO: Implement bulk ingestion
    finally:
        await pool.close()
# Run the async CLI only when this file is executed as a script.
if __name__ == "__main__":
    asyncio.run(main())