Services: Admin-Lehrer, Backend-Lehrer, Studio v2, Website, Klausur-Service, School-Service, Voice-Service, Geo-Service, BreakPilot Drive, Agent-Core Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1829 lines
73 KiB
Python
1829 lines
73 KiB
Python
"""
|
||
DSFA Corpus Ingestion Pipeline.
|
||
|
||
Indexes DSFA guidance documents into Qdrant with full source attribution.
|
||
|
||
Collections:
|
||
- bp_dsfa_corpus: All DSFA-related documents (WP248, DSK, Muss-Listen)
|
||
|
||
Usage:
|
||
python dsfa_corpus_ingestion.py --init-sources # Register all sources
|
||
python dsfa_corpus_ingestion.py --ingest WP248 # Ingest specific source
|
||
python dsfa_corpus_ingestion.py --ingest-all # Ingest all sources
|
||
python dsfa_corpus_ingestion.py --status # Show ingestion status
|
||
"""
|
||
|
||
import os
|
||
import re
|
||
import hashlib
|
||
import uuid
|
||
import asyncio
|
||
import argparse
|
||
from typing import List, Dict, Optional, Any
|
||
from dataclasses import dataclass, field, asdict
|
||
from datetime import datetime
|
||
from enum import Enum
|
||
|
||
import asyncpg
|
||
from qdrant_client import QdrantClient
|
||
from qdrant_client.models import (
|
||
VectorParams, Distance, PointStruct, Filter, FieldCondition, MatchValue
|
||
)
|
||
|
||
# Configuration
# Service endpoints are read from the environment; the literals below are the
# fallbacks used when the corresponding variable is unset.
QDRANT_URL = os.getenv("QDRANT_URL", "http://qdrant:6333")
# NOTE(review): the fallback DSN embeds a username/password pair - presumably
# intended for a local/dev stack only; confirm DATABASE_URL is always set in
# production deployments.
DATABASE_URL = os.getenv("DATABASE_URL", "postgresql://breakpilot:breakpilot@postgres:5432/breakpilot_db")
MINIO_BUCKET = "dsfa-documents"

# DSFA Collection Configuration
DSFA_COLLECTION = "bp_dsfa_corpus"
VECTOR_SIZE = 1024  # BGE-M3
|
||
|
||
|
||
# =============================================================================
|
||
# License Registry
|
||
# =============================================================================
|
||
|
||
# Registry of the licenses referenced by source entries via "license_code".
# Each entry is keyed by its license code and carries the same six fields:
#   name                 - display name of the license
#   url                  - link to the license text, or None when there is none
#   attribution_required - boolean flag
#   modification_allowed - boolean flag
#   commercial_use       - boolean flag
#   template             - attribution text, or None; may contain the
#                          placeholder fields {source_name} / {organization}
#                          (presumably filled via str.format - confirm
#                          against the caller)
LICENSE_REGISTRY: Dict[str, Dict[str, Any]] = {
    "DL-DE-BY-2.0": {
        "name": "Datenlizenz Deutschland – Namensnennung – Version 2.0",
        "url": "https://www.govdata.de/dl-de/by-2-0",
        "attribution_required": True,
        "modification_allowed": True,
        "commercial_use": True,
        "template": "Quelle: {source_name}, Datenlizenz Deutschland – Namensnennung – Version 2.0",
    },
    "DL-DE-ZERO-2.0": {
        "name": "Datenlizenz Deutschland – Zero – Version 2.0",
        "url": "https://www.govdata.de/dl-de/zero-2-0",
        "attribution_required": False,
        "modification_allowed": True,
        "commercial_use": True,
        "template": None,
    },
    "CC-BY-4.0": {
        "name": "Creative Commons Attribution 4.0 International",
        "url": "https://creativecommons.org/licenses/by/4.0/",
        "attribution_required": True,
        "modification_allowed": True,
        "commercial_use": True,
        "template": "© {organization} | CC BY 4.0",
    },
    "EDPB-LICENSE": {
        "name": "EDPB Document License",
        "url": "https://edpb.europa.eu/about-edpb/legal-notice_en",
        "attribution_required": True,
        "modification_allowed": True,
        "commercial_use": True,
        "template": "Source: {source_name}, European Data Protection Board",
    },
    "PUBLIC_DOMAIN": {
        "name": "Public Domain",
        "url": None,
        "attribution_required": False,
        "modification_allowed": True,
        "commercial_use": True,
        "template": None,
    },
    "PROPRIETARY": {
        "name": "Proprietary (internal use only)",
        "url": None,
        "attribution_required": False,
        "modification_allowed": False,
        "commercial_use": True,
        "template": "© BreakPilot - Internal Use Only",
    },
    "OGL-3.0": {
        "name": "Open Government Licence v3.0",
        "url": "https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/",
        "attribution_required": True,
        "modification_allowed": True,
        "commercial_use": True,
        "template": "Contains public sector information licensed under the Open Government Licence v3.0. Source: {source_name}",
    },
}
|
||
|
||
|
||
# =============================================================================
|
||
# DSFA Sources Registry
|
||
# =============================================================================
|
||
|
||
DSFA_SOURCES = [
|
||
# === Primärquellen (EU/DSGVO) ===
|
||
{
|
||
"source_code": "GDPR_ART35",
|
||
"name": "Art. 35 DSGVO - DSFA",
|
||
"full_name": "Datenschutz-Folgenabschätzung gemäß Artikel 35 DSGVO",
|
||
"organization": "Europäische Union",
|
||
"source_url": "https://eur-lex.europa.eu/eli/reg/2016/679/oj/deu",
|
||
"eur_lex_celex": "32016R0679",
|
||
"license_code": "CC-BY-4.0",
|
||
"attribution_text": "Quelle: DSGVO Art. 35 (EUR-Lex)",
|
||
"document_type": "regulation",
|
||
"language": "de"
|
||
},
|
||
{
|
||
"source_code": "GDPR_ART36",
|
||
"name": "Art. 36 DSGVO - Behördenkonsultation",
|
||
"full_name": "Vorherige Konsultation gemäß Artikel 36 DSGVO",
|
||
"organization": "Europäische Union",
|
||
"source_url": "https://eur-lex.europa.eu/eli/reg/2016/679/oj/deu",
|
||
"eur_lex_celex": "32016R0679",
|
||
"license_code": "CC-BY-4.0",
|
||
"attribution_text": "Quelle: DSGVO Art. 36 (EUR-Lex)",
|
||
"document_type": "regulation",
|
||
"language": "de"
|
||
},
|
||
{
|
||
"source_code": "GDPR_RECITALS",
|
||
"name": "Erwägungsgründe 75, 84, 89-91 DSGVO",
|
||
"full_name": "Erwägungsgründe zur Datenschutz-Folgenabschätzung",
|
||
"organization": "Europäische Union",
|
||
"source_url": "https://eur-lex.europa.eu/eli/reg/2016/679/oj/deu",
|
||
"eur_lex_celex": "32016R0679",
|
||
"license_code": "CC-BY-4.0",
|
||
"attribution_text": "Quelle: DSGVO Erwägungsgründe (EUR-Lex)",
|
||
"document_type": "regulation",
|
||
"language": "de"
|
||
},
|
||
|
||
# === WP29/EDPB Leitlinien ===
|
||
{
|
||
"source_code": "WP248",
|
||
"name": "WP248 rev.01 - Leitlinien zur DSFA",
|
||
"full_name": "Leitlinien zur Datenschutz-Folgenabschätzung und Beantwortung der Frage, ob eine Verarbeitung 'wahrscheinlich ein hohes Risiko' birgt",
|
||
"organization": "Artikel-29-Datenschutzgruppe / EDPB",
|
||
"source_url": "https://ec.europa.eu/newsroom/article29/items/611236/en",
|
||
"license_code": "EDPB-LICENSE",
|
||
"attribution_text": "Quelle: WP248 rev.01, Artikel-29-Datenschutzgruppe (2017), bestätigt durch EDPB",
|
||
"document_type": "guideline",
|
||
"language": "de"
|
||
},
|
||
|
||
# === DSK Dokumente ===
|
||
{
|
||
"source_code": "DSK_KP5",
|
||
"name": "Kurzpapier Nr. 5 - DSFA nach Art. 35 DS-GVO",
|
||
"full_name": "DSK Kurzpapier Nr. 5: Datenschutz-Folgenabschätzung nach Art. 35 DS-GVO",
|
||
"organization": "Datenschutzkonferenz (DSK)",
|
||
"source_url": "https://www.datenschutzkonferenz-online.de/media/kp/dsk_kpnr_5.pdf",
|
||
"license_code": "DL-DE-BY-2.0",
|
||
"license_url": "https://www.govdata.de/dl-de/by-2-0",
|
||
"attribution_text": "Quelle: DSK Kurzpapier Nr. 5 (Stand: 2018), Datenlizenz Deutschland – Namensnennung – Version 2.0",
|
||
"document_type": "guideline",
|
||
"language": "de"
|
||
},
|
||
|
||
# === Muss-Listen Bund ===
|
||
{
|
||
"source_code": "BFDI_MUSS_PUBLIC",
|
||
"name": "BfDI DSFA-Liste (öffentlicher Bereich)",
|
||
"full_name": "Liste der Verarbeitungsvorgänge nach Art. 35 Abs. 4 DSGVO - Öffentlicher Bereich",
|
||
"organization": "Bundesbeauftragter für den Datenschutz und die Informationsfreiheit",
|
||
"source_url": "https://www.bfdi.bund.de/SharedDocs/Downloads/DE/Muster/Liste_VerarbeitungsvorgaengeArt35.pdf",
|
||
"license_code": "DL-DE-ZERO-2.0",
|
||
"attribution_text": "Quelle: BfDI, Liste gem. Art. 35 Abs. 4 DSGVO (öffentlicher Bereich)",
|
||
"document_type": "checklist",
|
||
"language": "de"
|
||
},
|
||
{
|
||
"source_code": "BFDI_MUSS_PRIVATE",
|
||
"name": "BfDI DSFA-Liste (nicht-öffentlicher Bereich)",
|
||
"full_name": "Liste der Verarbeitungsvorgänge nach Art. 35 Abs. 4 DSGVO - Nicht-öffentlicher Bereich",
|
||
"organization": "Bundesbeauftragter für den Datenschutz und die Informationsfreiheit",
|
||
"source_url": "https://www.bfdi.bund.de/SharedDocs/Downloads/DE/Muster/Liste_VerarbeitungsvorgaengeArt35.pdf",
|
||
"license_code": "DL-DE-ZERO-2.0",
|
||
"attribution_text": "Quelle: BfDI, Liste gem. Art. 35 Abs. 4 DSGVO (nicht-öffentlicher Bereich)",
|
||
"document_type": "checklist",
|
||
"language": "de"
|
||
},
|
||
|
||
# === Muss-Listen Länder ===
|
||
# Baden-Württemberg
|
||
{
|
||
"source_code": "BW_MUSS_PUBLIC",
|
||
"name": "LfDI BW DSFA-Liste (öffentlich)",
|
||
"organization": "Landesbeauftragter für Datenschutz BW",
|
||
"source_url": "https://www.baden-wuerttemberg.datenschutz.de",
|
||
"license_code": "DL-DE-BY-2.0",
|
||
"attribution_text": "Quelle: LfDI Baden-Württemberg, DSFA-Muss-Liste (öffentlicher Bereich)",
|
||
"document_type": "checklist",
|
||
"language": "de"
|
||
},
|
||
{
|
||
"source_code": "BW_MUSS_PRIVATE",
|
||
"name": "LfDI BW DSFA-Liste (nicht-öffentlich)",
|
||
"organization": "Landesbeauftragter für Datenschutz BW",
|
||
"source_url": "https://www.baden-wuerttemberg.datenschutz.de",
|
||
"license_code": "DL-DE-BY-2.0",
|
||
"attribution_text": "Quelle: LfDI Baden-Württemberg, DSFA-Muss-Liste (nicht-öffentlicher Bereich)",
|
||
"document_type": "checklist",
|
||
"language": "de"
|
||
},
|
||
# Bayern
|
||
{
|
||
"source_code": "BY_MUSS_PUBLIC",
|
||
"name": "BayLDA DSFA-Liste (öffentlich)",
|
||
"organization": "Bayerisches Landesamt für Datenschutzaufsicht",
|
||
"source_url": "https://www.lda.bayern.de",
|
||
"license_code": "DL-DE-BY-2.0",
|
||
"attribution_text": "Quelle: BayLDA, DSFA-Muss-Liste (öffentlicher Bereich)",
|
||
"document_type": "checklist",
|
||
"language": "de"
|
||
},
|
||
{
|
||
"source_code": "BY_MUSS_PRIVATE",
|
||
"name": "BayLDA DSFA-Liste (nicht-öffentlich)",
|
||
"organization": "Bayerisches Landesamt für Datenschutzaufsicht",
|
||
"source_url": "https://www.lda.bayern.de",
|
||
"license_code": "DL-DE-BY-2.0",
|
||
"attribution_text": "Quelle: BayLDA, DSFA-Muss-Liste (nicht-öffentlicher Bereich)",
|
||
"document_type": "checklist",
|
||
"language": "de"
|
||
},
|
||
# Berlin
|
||
{
|
||
"source_code": "BE_MUSS_PUBLIC",
|
||
"name": "BlnBDI DSFA-Liste (öffentlich)",
|
||
"organization": "Berliner Beauftragte für Datenschutz",
|
||
"source_url": "https://www.datenschutz-berlin.de",
|
||
"license_code": "DL-DE-BY-2.0",
|
||
"attribution_text": "Quelle: BlnBDI, DSFA-Muss-Liste (öffentlicher Bereich)",
|
||
"document_type": "checklist",
|
||
"language": "de"
|
||
},
|
||
{
|
||
"source_code": "BE_MUSS_PRIVATE",
|
||
"name": "BlnBDI DSFA-Liste (nicht-öffentlich)",
|
||
"organization": "Berliner Beauftragte für Datenschutz",
|
||
"source_url": "https://www.datenschutz-berlin.de",
|
||
"license_code": "DL-DE-BY-2.0",
|
||
"attribution_text": "Quelle: BlnBDI, DSFA-Muss-Liste (nicht-öffentlicher Bereich)",
|
||
"document_type": "checklist",
|
||
"language": "de"
|
||
},
|
||
# Brandenburg
|
||
{
|
||
"source_code": "BB_MUSS_PUBLIC",
|
||
"name": "LDA BB DSFA-Liste (öffentlich)",
|
||
"organization": "Landesbeauftragte für Datenschutz Brandenburg",
|
||
"source_url": "https://www.lda.brandenburg.de",
|
||
"license_code": "DL-DE-BY-2.0",
|
||
"attribution_text": "Quelle: LDA Brandenburg, DSFA-Muss-Liste (öffentlicher Bereich)",
|
||
"document_type": "checklist",
|
||
"language": "de"
|
||
},
|
||
{
|
||
"source_code": "BB_MUSS_PRIVATE",
|
||
"name": "LDA BB DSFA-Liste (nicht-öffentlich)",
|
||
"organization": "Landesbeauftragte für Datenschutz Brandenburg",
|
||
"source_url": "https://www.lda.brandenburg.de",
|
||
"license_code": "DL-DE-BY-2.0",
|
||
"attribution_text": "Quelle: LDA Brandenburg, DSFA-Muss-Liste (nicht-öffentlicher Bereich)",
|
||
"document_type": "checklist",
|
||
"language": "de"
|
||
},
|
||
# Bremen
|
||
{
|
||
"source_code": "HB_MUSS_PUBLIC",
|
||
"name": "LfDI HB DSFA-Liste (öffentlich)",
|
||
"organization": "Landesbeauftragte für Datenschutz Bremen",
|
||
"source_url": "https://www.datenschutz.bremen.de",
|
||
"license_code": "DL-DE-BY-2.0",
|
||
"attribution_text": "Quelle: LfDI Bremen, DSFA-Muss-Liste (öffentlicher Bereich)",
|
||
"document_type": "checklist",
|
||
"language": "de"
|
||
},
|
||
{
|
||
"source_code": "HB_MUSS_PRIVATE",
|
||
"name": "LfDI HB DSFA-Liste (nicht-öffentlich)",
|
||
"organization": "Landesbeauftragte für Datenschutz Bremen",
|
||
"source_url": "https://www.datenschutz.bremen.de",
|
||
"license_code": "DL-DE-BY-2.0",
|
||
"attribution_text": "Quelle: LfDI Bremen, DSFA-Muss-Liste (nicht-öffentlicher Bereich)",
|
||
"document_type": "checklist",
|
||
"language": "de"
|
||
},
|
||
# Hamburg
|
||
{
|
||
"source_code": "HH_MUSS_PUBLIC",
|
||
"name": "HmbBfDI DSFA-Liste (öffentlich)",
|
||
"organization": "Hamburgische Beauftragte für Datenschutz",
|
||
"source_url": "https://datenschutz-hamburg.de",
|
||
"license_code": "DL-DE-BY-2.0",
|
||
"attribution_text": "Quelle: HmbBfDI, DSFA-Muss-Liste (öffentlicher Bereich)",
|
||
"document_type": "checklist",
|
||
"language": "de"
|
||
},
|
||
{
|
||
"source_code": "HH_MUSS_PRIVATE",
|
||
"name": "HmbBfDI DSFA-Liste (nicht-öffentlich)",
|
||
"organization": "Hamburgische Beauftragte für Datenschutz",
|
||
"source_url": "https://datenschutz-hamburg.de",
|
||
"license_code": "DL-DE-BY-2.0",
|
||
"attribution_text": "Quelle: HmbBfDI, DSFA-Muss-Liste (nicht-öffentlicher Bereich)",
|
||
"document_type": "checklist",
|
||
"language": "de"
|
||
},
|
||
# Hessen
|
||
{
|
||
"source_code": "HE_MUSS_PUBLIC",
|
||
"name": "HBDI DSFA-Liste (öffentlich)",
|
||
"organization": "Hessischer Beauftragter für Datenschutz",
|
||
"source_url": "https://datenschutz.hessen.de",
|
||
"license_code": "DL-DE-BY-2.0",
|
||
"attribution_text": "Quelle: HBDI, DSFA-Muss-Liste (öffentlicher Bereich)",
|
||
"document_type": "checklist",
|
||
"language": "de"
|
||
},
|
||
{
|
||
"source_code": "HE_MUSS_PRIVATE",
|
||
"name": "HBDI DSFA-Liste (nicht-öffentlich)",
|
||
"organization": "Hessischer Beauftragter für Datenschutz",
|
||
"source_url": "https://datenschutz.hessen.de",
|
||
"license_code": "DL-DE-BY-2.0",
|
||
"attribution_text": "Quelle: HBDI, DSFA-Muss-Liste (nicht-öffentlicher Bereich)",
|
||
"document_type": "checklist",
|
||
"language": "de"
|
||
},
|
||
# Mecklenburg-Vorpommern
|
||
{
|
||
"source_code": "MV_MUSS_PUBLIC",
|
||
"name": "LfDI MV DSFA-Liste (öffentlich)",
|
||
"organization": "Landesbeauftragter für Datenschutz MV",
|
||
"source_url": "https://www.datenschutz-mv.de",
|
||
"license_code": "DL-DE-BY-2.0",
|
||
"attribution_text": "Quelle: LfDI MV, DSFA-Muss-Liste (öffentlicher Bereich)",
|
||
"document_type": "checklist",
|
||
"language": "de"
|
||
},
|
||
{
|
||
"source_code": "MV_MUSS_PRIVATE",
|
||
"name": "LfDI MV DSFA-Liste (nicht-öffentlich)",
|
||
"organization": "Landesbeauftragter für Datenschutz MV",
|
||
"source_url": "https://www.datenschutz-mv.de",
|
||
"license_code": "DL-DE-BY-2.0",
|
||
"attribution_text": "Quelle: LfDI MV, DSFA-Muss-Liste (nicht-öffentlicher Bereich)",
|
||
"document_type": "checklist",
|
||
"language": "de"
|
||
},
|
||
# Niedersachsen
|
||
{
|
||
"source_code": "NI_MUSS_PUBLIC",
|
||
"name": "LfD NI DSFA-Liste (öffentlich)",
|
||
"organization": "Die Landesbeauftragte für den Datenschutz Niedersachsen",
|
||
"source_url": "https://www.lfd.niedersachsen.de",
|
||
"license_code": "DL-DE-BY-2.0",
|
||
"attribution_text": "Quelle: LfD Niedersachsen, DSFA-Muss-Liste (öffentlicher Bereich)",
|
||
"document_type": "checklist",
|
||
"language": "de"
|
||
},
|
||
{
|
||
"source_code": "NI_MUSS_PRIVATE",
|
||
"name": "LfD NI DSFA-Liste (nicht-öffentlich)",
|
||
"organization": "Die Landesbeauftragte für den Datenschutz Niedersachsen",
|
||
"source_url": "https://www.lfd.niedersachsen.de/download/131098",
|
||
"license_code": "DL-DE-BY-2.0",
|
||
"attribution_text": "Quelle: LfD Niedersachsen, DSFA-Muss-Liste (nicht-öffentlicher Bereich)",
|
||
"document_type": "checklist",
|
||
"language": "de"
|
||
},
|
||
# Nordrhein-Westfalen
|
||
{
|
||
"source_code": "NW_MUSS_PUBLIC",
|
||
"name": "LDI NRW DSFA-Liste (öffentlich)",
|
||
"organization": "Landesbeauftragte für Datenschutz NRW",
|
||
"source_url": "https://www.ldi.nrw.de",
|
||
"license_code": "DL-DE-BY-2.0",
|
||
"attribution_text": "Quelle: LDI NRW, DSFA-Muss-Liste (öffentlicher Bereich)",
|
||
"document_type": "checklist",
|
||
"language": "de"
|
||
},
|
||
{
|
||
"source_code": "NW_MUSS_PRIVATE",
|
||
"name": "LDI NRW DSFA-Liste (nicht-öffentlich)",
|
||
"organization": "Landesbeauftragte für Datenschutz NRW",
|
||
"source_url": "https://www.ldi.nrw.de",
|
||
"license_code": "DL-DE-BY-2.0",
|
||
"attribution_text": "Quelle: LDI NRW, DSFA-Muss-Liste (nicht-öffentlicher Bereich)",
|
||
"document_type": "checklist",
|
||
"language": "de"
|
||
},
|
||
# Rheinland-Pfalz
|
||
{
|
||
"source_code": "RP_MUSS_PUBLIC",
|
||
"name": "LfDI RP DSFA-Liste (öffentlich)",
|
||
"organization": "Landesbeauftragter für Datenschutz Rheinland-Pfalz",
|
||
"source_url": "https://www.datenschutz.rlp.de",
|
||
"license_code": "DL-DE-BY-2.0",
|
||
"attribution_text": "Quelle: LfDI Rheinland-Pfalz, DSFA-Muss-Liste (öffentlicher Bereich)",
|
||
"document_type": "checklist",
|
||
"language": "de"
|
||
},
|
||
{
|
||
"source_code": "RP_MUSS_PRIVATE",
|
||
"name": "LfDI RP DSFA-Liste (nicht-öffentlich)",
|
||
"organization": "Landesbeauftragter für Datenschutz Rheinland-Pfalz",
|
||
"source_url": "https://www.datenschutz.rlp.de",
|
||
"license_code": "DL-DE-BY-2.0",
|
||
"attribution_text": "Quelle: LfDI Rheinland-Pfalz, DSFA-Muss-Liste (nicht-öffentlicher Bereich)",
|
||
"document_type": "checklist",
|
||
"language": "de"
|
||
},
|
||
# Saarland
|
||
{
|
||
"source_code": "SL_MUSS_PUBLIC",
|
||
"name": "LfDI SL DSFA-Liste (öffentlich)",
|
||
"organization": "Landesbeauftragte für Datenschutz Saarland",
|
||
"source_url": "https://www.datenschutz.saarland.de",
|
||
"license_code": "DL-DE-BY-2.0",
|
||
"attribution_text": "Quelle: LfDI Saarland, DSFA-Muss-Liste (öffentlicher Bereich)",
|
||
"document_type": "checklist",
|
||
"language": "de"
|
||
},
|
||
{
|
||
"source_code": "SL_MUSS_PRIVATE",
|
||
"name": "LfDI SL DSFA-Liste (nicht-öffentlich)",
|
||
"organization": "Landesbeauftragte für Datenschutz Saarland",
|
||
"source_url": "https://www.datenschutz.saarland.de",
|
||
"license_code": "DL-DE-BY-2.0",
|
||
"attribution_text": "Quelle: LfDI Saarland, DSFA-Muss-Liste (nicht-öffentlicher Bereich)",
|
||
"document_type": "checklist",
|
||
"language": "de"
|
||
},
|
||
# Sachsen
|
||
{
|
||
"source_code": "SN_MUSS_PUBLIC",
|
||
"name": "SDB DSFA-Liste (öffentlich)",
|
||
"organization": "Sächsischer Datenschutzbeauftragter",
|
||
"source_url": "https://www.saechsdsb.de",
|
||
"license_code": "DL-DE-BY-2.0",
|
||
"attribution_text": "Quelle: SDB Sachsen, DSFA-Muss-Liste (öffentlicher Bereich)",
|
||
"document_type": "checklist",
|
||
"language": "de"
|
||
},
|
||
{
|
||
"source_code": "SN_MUSS_PRIVATE",
|
||
"name": "SDB DSFA-Liste (nicht-öffentlich)",
|
||
"organization": "Sächsischer Datenschutzbeauftragter",
|
||
"source_url": "https://www.saechsdsb.de",
|
||
"license_code": "DL-DE-BY-2.0",
|
||
"attribution_text": "Quelle: SDB Sachsen, DSFA-Muss-Liste (nicht-öffentlicher Bereich)",
|
||
"document_type": "checklist",
|
||
"language": "de"
|
||
},
|
||
# Sachsen-Anhalt
|
||
{
|
||
"source_code": "ST_MUSS_PUBLIC",
|
||
"name": "LfD ST DSFA-Liste (öffentlich)",
|
||
"organization": "Landesbeauftragter für Datenschutz Sachsen-Anhalt",
|
||
"source_url": "https://datenschutz.sachsen-anhalt.de",
|
||
"license_code": "DL-DE-BY-2.0",
|
||
"attribution_text": "Quelle: LfD Sachsen-Anhalt, DSFA-Muss-Liste (öffentlicher Bereich)",
|
||
"document_type": "checklist",
|
||
"language": "de"
|
||
},
|
||
{
|
||
"source_code": "ST_MUSS_PRIVATE",
|
||
"name": "LfD ST DSFA-Liste (nicht-öffentlich)",
|
||
"organization": "Landesbeauftragter für Datenschutz Sachsen-Anhalt",
|
||
"source_url": "https://datenschutz.sachsen-anhalt.de",
|
||
"license_code": "DL-DE-BY-2.0",
|
||
"attribution_text": "Quelle: LfD Sachsen-Anhalt, DSFA-Muss-Liste (nicht-öffentlicher Bereich)",
|
||
"document_type": "checklist",
|
||
"language": "de"
|
||
},
|
||
# Schleswig-Holstein
|
||
{
|
||
"source_code": "SH_MUSS_PUBLIC",
|
||
"name": "ULD DSFA-Liste (öffentlich)",
|
||
"organization": "Unabhängiges Landeszentrum für Datenschutz SH",
|
||
"source_url": "https://www.datenschutzzentrum.de",
|
||
"license_code": "DL-DE-BY-2.0",
|
||
"attribution_text": "Quelle: ULD Schleswig-Holstein, DSFA-Muss-Liste (öffentlicher Bereich)",
|
||
"document_type": "checklist",
|
||
"language": "de"
|
||
},
|
||
{
|
||
"source_code": "SH_MUSS_PRIVATE",
|
||
"name": "ULD DSFA-Liste (nicht-öffentlich)",
|
||
"organization": "Unabhängiges Landeszentrum für Datenschutz SH",
|
||
"source_url": "https://www.datenschutzzentrum.de",
|
||
"license_code": "DL-DE-BY-2.0",
|
||
"attribution_text": "Quelle: ULD Schleswig-Holstein, DSFA-Muss-Liste (nicht-öffentlicher Bereich)",
|
||
"document_type": "checklist",
|
||
"language": "de"
|
||
},
|
||
# Thüringen
|
||
{
|
||
"source_code": "TH_MUSS_PUBLIC",
|
||
"name": "TLfDI DSFA-Liste (öffentlich)",
|
||
"organization": "Thüringer Landesbeauftragter für Datenschutz",
|
||
"source_url": "https://www.tlfdi.de",
|
||
"license_code": "DL-DE-BY-2.0",
|
||
"attribution_text": "Quelle: TLfDI Thüringen, DSFA-Muss-Liste (öffentlicher Bereich)",
|
||
"document_type": "checklist",
|
||
"language": "de"
|
||
},
|
||
{
|
||
"source_code": "TH_MUSS_PRIVATE",
|
||
"name": "TLfDI DSFA-Liste (nicht-öffentlich)",
|
||
"organization": "Thüringer Landesbeauftragter für Datenschutz",
|
||
"source_url": "https://www.tlfdi.de",
|
||
"license_code": "DL-DE-BY-2.0",
|
||
"attribution_text": "Quelle: TLfDI Thüringen, DSFA-Muss-Liste (nicht-öffentlicher Bereich)",
|
||
"document_type": "checklist",
|
||
"language": "de"
|
||
},
|
||
|
||
# === Sonstige ===
|
||
{
|
||
"source_code": "AI_ACT_DSFA",
|
||
"name": "AI Act Bezüge zu DSFA",
|
||
"full_name": "AI Act Artikel mit Bezug zur Datenschutz-Folgenabschätzung",
|
||
"organization": "Europäische Union",
|
||
"source_url": "https://eur-lex.europa.eu/eli/reg/2024/1689/oj",
|
||
"license_code": "CC-BY-4.0",
|
||
"attribution_text": "Quelle: AI Act (EU) 2024/1689, EUR-Lex",
|
||
"document_type": "regulation",
|
||
"language": "de"
|
||
},
|
||
{
|
||
"source_code": "DSK_OH_KI",
|
||
"name": "DSK Orientierungshilfe KI",
|
||
"full_name": "DSK Orientierungshilfe KI und Datenschutz",
|
||
"organization": "Datenschutzkonferenz (DSK)",
|
||
"source_url": "https://www.datenschutzkonferenz-online.de",
|
||
"license_code": "DL-DE-BY-2.0",
|
||
"attribution_text": "Quelle: DSK Orientierungshilfe KI und Datenschutz",
|
||
"document_type": "guideline",
|
||
"language": "de"
|
||
},
|
||
{
|
||
"source_code": "EDSA_GUIDELINES",
|
||
"name": "EDPB Guidelines on DPIA",
|
||
"full_name": "European Data Protection Board Guidelines on DPIA",
|
||
"organization": "European Data Protection Board",
|
||
"source_url": "https://edpb.europa.eu",
|
||
"license_code": "EDPB-LICENSE",
|
||
"attribution_text": "Source: EDPB Guidelines on Data Protection Impact Assessment",
|
||
"document_type": "guideline",
|
||
"language": "en"
|
||
},
|
||
|
||
# === DSK Weitere Kurzpapiere ===
|
||
{
|
||
"source_code": "DSK_KP18",
|
||
"name": "Kurzpapier Nr. 18 - Risiko für die Rechte und Freiheiten",
|
||
"full_name": "DSK Kurzpapier Nr. 18: Risiko für die Rechte und Freiheiten natürlicher Personen",
|
||
"organization": "Datenschutzkonferenz (DSK)",
|
||
"source_url": "https://www.datenschutzkonferenz-online.de/media/kp/dsk_kpnr_18.pdf",
|
||
"license_code": "DL-DE-BY-2.0",
|
||
"license_url": "https://www.govdata.de/dl-de/by-2-0",
|
||
"attribution_text": "Quelle: DSK Kurzpapier Nr. 18 (Risiko), Datenlizenz Deutschland – Namensnennung – Version 2.0",
|
||
"document_type": "guideline",
|
||
"language": "de"
|
||
},
|
||
|
||
# === Standard-Datenschutzmodell ===
|
||
{
|
||
"source_code": "SDM_V2",
|
||
"name": "Standard-Datenschutzmodell V2.0",
|
||
"full_name": "SDM-Methode der Datenschutzaufsichtsbehörden V2.0",
|
||
"organization": "Datenschutzkonferenz (DSK)",
|
||
"source_url": "https://www.datenschutzkonferenz-online.de/media/ah/20191106_SDM-Methode_V2.0.pdf",
|
||
"license_code": "DL-DE-BY-2.0",
|
||
"license_url": "https://www.govdata.de/dl-de/by-2-0",
|
||
"attribution_text": "Quelle: SDM V2.0, Datenschutzkonferenz (DSK), Datenlizenz Deutschland – Namensnennung – Version 2.0",
|
||
"document_type": "methodology",
|
||
"language": "de"
|
||
},
|
||
|
||
# === Internes Dokument ===
|
||
{
|
||
"source_code": "BREAKPILOT_DSFA_GUIDE",
|
||
"name": "Datenschutz-Folgenabschätzung in Deutschland",
|
||
"full_name": "BreakPilot DSFA-Leitfaden (intern)",
|
||
"organization": "BreakPilot",
|
||
"source_url": None,
|
||
"license_code": "PROPRIETARY",
|
||
"attribution_text": "Quelle: BreakPilot DSFA-Leitfaden (intern)",
|
||
"document_type": "guideline",
|
||
"language": "de"
|
||
},
|
||
{
|
||
"source_code": "BREAKPILOT_BASELINE",
|
||
"name": "Baseline-DSFA Katalog",
|
||
"full_name": "BreakPilot Baseline-DSFA Katalog (proprietär)",
|
||
"organization": "BreakPilot",
|
||
"source_url": None,
|
||
"license_code": "PROPRIETARY",
|
||
"attribution_text": "Quelle: BreakPilot Baseline-DSFA Katalog (intern)",
|
||
"document_type": "catalog",
|
||
"language": "de"
|
||
},
|
||
{
|
||
"source_code": "BREAKPILOT_DSFA_DE",
|
||
"name": "DSFA in Deutschland Dokument",
|
||
"full_name": "BreakPilot DSFA in Deutschland (proprietär)",
|
||
"organization": "BreakPilot",
|
||
"source_url": None,
|
||
"license_code": "PROPRIETARY",
|
||
"attribution_text": "Quelle: BreakPilot DSFA in Deutschland (intern)",
|
||
"document_type": "guideline",
|
||
"language": "de"
|
||
},
|
||
|
||
# === VVT-Quellen (Verarbeitungsverzeichnis Art. 30 DSGVO) ===
|
||
{
|
||
"source_code": "DSK_KP1",
|
||
"name": "Kurzpapier Nr. 1 - Verarbeitungsverzeichnis",
|
||
"full_name": "DSK Kurzpapier Nr. 1: Verzeichnis von Verarbeitungstaetigkeiten nach Art. 30 DS-GVO",
|
||
"organization": "Datenschutzkonferenz (DSK)",
|
||
"source_url": "https://www.datenschutzkonferenz-online.de/media/kp/dsk_kpnr_1.pdf",
|
||
"license_code": "DL-DE-BY-2.0",
|
||
"license_url": "https://www.govdata.de/dl-de/by-2-0",
|
||
"attribution_text": "Quelle: DSK Kurzpapier Nr. 1 (Stand: 2018), Datenlizenz Deutschland – Namensnennung – Version 2.0",
|
||
"document_type": "guideline",
|
||
"language": "de"
|
||
},
|
||
{
|
||
"source_code": "ICO_ROPA",
|
||
"name": "ICO Records of Processing Activities",
|
||
"full_name": "ICO Guidance on Documentation and Records of Processing Activities (RoPA)",
|
||
"organization": "Information Commissioner's Office (ICO)",
|
||
"source_url": "https://ico.org.uk/for-organisations/uk-gdpr-guidance-and-resources/accountability-and-governance/documentation-record-of-processing-activities/",
|
||
"license_code": "OGL-3.0",
|
||
"license_url": "https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/",
|
||
"attribution_text": "Contains public sector information licensed under the Open Government Licence v3.0. Source: ICO RoPA Guidance",
|
||
"document_type": "guideline",
|
||
"language": "en"
|
||
},
|
||
{
|
||
"source_code": "BREAKPILOT_VVT_SPEC",
|
||
"name": "VVT Generator Spezifikation",
|
||
"full_name": "BreakPilot VVT Generator Spezifikation (proprietaer)",
|
||
"organization": "BreakPilot",
|
||
"source_url": None,
|
||
"license_code": "PROPRIETARY",
|
||
"attribution_text": "Quelle: BreakPilot VVT Generator Spezifikation (intern)",
|
||
"document_type": "specification",
|
||
"language": "de"
|
||
},
|
||
|
||
# === SDM Bausteine V3.0 (TOM Gewaehrleistungsziele) ===
|
||
{
|
||
"source_code": "SDM_BAUSTEINE",
|
||
"name": "SDM Bausteine V3.0",
|
||
"full_name": "Standard-Datenschutzmodell Bausteine Version 3.0",
|
||
"organization": "Konferenz der unabhaengigen Datenschutzaufsichtsbehoerden",
|
||
"source_url": "https://www.datenschutz-mv.de/datenschutz/sdm/",
|
||
"license_code": "DL-DE-BY-2.0",
|
||
"attribution_text": "Quelle: SDM Bausteine V3.0, Konferenz der unabhaengigen Datenschutzaufsichtsbehoerden des Bundes und der Laender, Lizenz: dl-de/by-2-0",
|
||
"document_type": "standard",
|
||
"language": "de"
|
||
},
|
||
|
||
# === DSK Kurzpapier Nr. 7 (Loeschung) ===
|
||
{
|
||
"source_code": "DSK_KP7",
|
||
"name": "DSK Kurzpapier Nr. 7 - Loeschung",
|
||
"full_name": "Kurzpapier Nr. 7: Marktortprinzip und Loeschung personenbezogener Daten",
|
||
"organization": "Datenschutzkonferenz (DSK)",
|
||
"source_url": "https://www.datenschutzkonferenz-online.de/kurzpapiere.html",
|
||
"license_code": "DL-DE-BY-2.0",
|
||
"attribution_text": "Quelle: DSK Kurzpapier Nr. 7, Datenschutzkonferenz, Lizenz: dl-de/by-2-0",
|
||
"document_type": "guidance",
|
||
"language": "de"
|
||
},
|
||
|
||
# === BreakPilot Loeschfristen + TOM Spec (intern) ===
|
||
{
|
||
"source_code": "BREAKPILOT_LF_TOM_SPEC",
|
||
"name": "Loeschfristen & TOM Generator Spezifikation",
|
||
"full_name": "BreakPilot Loeschfristen und TOM Generator Spezifikation (proprietaer)",
|
||
"organization": "BreakPilot",
|
||
"source_url": None,
|
||
"license_code": "PROPRIETARY",
|
||
"attribution_text": "Quelle: BreakPilot Loeschfristen & TOM Generator Spezifikation (intern)",
|
||
"document_type": "specification",
|
||
"language": "de"
|
||
},
|
||
|
||
# === Compliance Advisor Agent - Zusaetzliche Quellen ===
|
||
{
|
||
"source_code": "DSGVO_VOLLTEXT",
|
||
"name": "DSGVO Volltext",
|
||
"full_name": "Verordnung (EU) 2016/679 - Datenschutz-Grundverordnung (Volltext mit Erwaegsgruenden)",
|
||
"organization": "Europaeische Union",
|
||
"source_url": "https://eur-lex.europa.eu/eli/reg/2016/679/oj/deu",
|
||
"license_code": "CC-BY-4.0",
|
||
"attribution_text": "Quelle: DSGVO Volltext, Europaeische Union, CC BY 4.0",
|
||
"document_type": "legislation",
|
||
"language": "de"
|
||
},
|
||
{
|
||
"source_code": "BDSG_VOLLTEXT",
|
||
"name": "BDSG Volltext",
|
||
"full_name": "Bundesdatenschutzgesetz (BDSG) - Volltext",
|
||
"organization": "Bundesrepublik Deutschland",
|
||
"source_url": "https://www.gesetze-im-internet.de/bdsg_2018/",
|
||
"license_code": "PUBLIC_DOMAIN",
|
||
"attribution_text": "Quelle: BDSG, Bundesrepublik Deutschland",
|
||
"document_type": "legislation",
|
||
"language": "de"
|
||
},
|
||
{
|
||
"source_code": "AI_ACT_SUMMARY",
|
||
"name": "AI Act Zusammenfassung",
|
||
"full_name": "EU KI-Verordnung (AI Act) - Zusammenfassung und Kernpunkte",
|
||
"organization": "Europaeische Union",
|
||
"source_url": "https://eur-lex.europa.eu/legal-content/DE/TXT/?uri=CELEX:32024R1689",
|
||
"license_code": "CC-BY-4.0",
|
||
"attribution_text": "Quelle: AI Act, Europaeische Union, CC BY 4.0",
|
||
"document_type": "legislation",
|
||
"language": "de"
|
||
},
|
||
{
|
||
"source_code": "DSK_KURZPAPIERE_ALLE",
|
||
"name": "DSK Kurzpapiere (alle 20)",
|
||
"full_name": "Datenschutzkonferenz - Alle 20 Kurzpapiere zur DSGVO",
|
||
"organization": "Datenschutzkonferenz (DSK)",
|
||
"source_url": "https://www.datenschutzkonferenz-online.de/kurzpapiere.html",
|
||
"license_code": "DL-DE-BY-2.0",
|
||
"attribution_text": "Quelle: DSK Kurzpapiere, Datenschutzkonferenz, Lizenz: dl-de/by-2-0",
|
||
"document_type": "guidance",
|
||
"language": "de"
|
||
},
|
||
{
|
||
"source_code": "SDM_V3",
|
||
"name": "Standard-Datenschutzmodell V3.0",
|
||
"full_name": "SDM - Standard-Datenschutzmodell Version 3.0",
|
||
"organization": "Datenschutzkonferenz (DSK)",
|
||
"source_url": "https://www.datenschutz-mv.de/datenschutz/sdm/",
|
||
"license_code": "DL-DE-BY-2.0",
|
||
"attribution_text": "Quelle: SDM V3.0, Datenschutzkonferenz, Lizenz: dl-de/by-2-0",
|
||
"document_type": "standard",
|
||
"language": "de"
|
||
},
|
||
|
||
# === EDPB Ergaenzende Leitlinien ===
|
||
# MIGRATED to bp_legal_corpus via migrate_rag_chunks.py (2026-02-10)
|
||
{
|
||
"source_code": "EDPB_GUIDELINES_2_2019",
|
||
"name": "EDPB Leitlinien 2/2019 zu Art. 6(1)(b)",
|
||
"full_name": "EDPB Leitlinien 2/2019 zur Verarbeitung personenbezogener Daten auf Grundlage von Art. 6 Abs. 1 lit. b DSGVO",
|
||
"organization": "European Data Protection Board",
|
||
"source_url": "https://edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-22019-processing-personal-data-under-article-61b_en",
|
||
"license_code": "EDPB-LICENSE",
|
||
"attribution_text": "Source: EDPB Guidelines 2/2019, European Data Protection Board",
|
||
"document_type": "guideline",
|
||
"language": "en",
|
||
"migrated_to": "bp_legal_corpus"
|
||
},
|
||
{
|
||
"source_code": "EDPB_GUIDELINES_3_2019",
|
||
"name": "EDPB Leitlinien 3/2019 Videoueberwachung",
|
||
"full_name": "EDPB Leitlinien 3/2019 zur Verarbeitung personenbezogener Daten durch Videoueberwachung",
|
||
"organization": "European Data Protection Board",
|
||
"source_url": "https://edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-32019-processing-personal-data-through-video_en",
|
||
"license_code": "EDPB-LICENSE",
|
||
"attribution_text": "Source: EDPB Guidelines 3/2019, European Data Protection Board",
|
||
"document_type": "guideline",
|
||
"language": "en",
|
||
"migrated_to": "bp_legal_corpus"
|
||
},
|
||
{
|
||
"source_code": "EDPB_GUIDELINES_5_2020",
|
||
"name": "EDPB Leitlinien 5/2020 Einwilligung",
|
||
"full_name": "EDPB Leitlinien 5/2020 zur Einwilligung gemaess Verordnung 2016/679",
|
||
"organization": "European Data Protection Board",
|
||
"source_url": "https://edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-052020-consent-under-regulation-2016679_en",
|
||
"license_code": "EDPB-LICENSE",
|
||
"attribution_text": "Source: EDPB Guidelines 5/2020, European Data Protection Board",
|
||
"document_type": "guideline",
|
||
"language": "en",
|
||
"migrated_to": "bp_legal_corpus"
|
||
},
|
||
{
|
||
"source_code": "EDPB_GUIDELINES_7_2020",
|
||
"name": "EDPB Leitlinien 7/2020 Controller/Processor",
|
||
"full_name": "EDPB Leitlinien 7/2020 zu den Begriffen Verantwortlicher und Auftragsverarbeiter",
|
||
"organization": "European Data Protection Board",
|
||
"source_url": "https://edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-072020-concepts-controller-and-processor-gdpr_en",
|
||
"license_code": "EDPB-LICENSE",
|
||
"attribution_text": "Source: EDPB Guidelines 7/2020, European Data Protection Board",
|
||
"document_type": "guideline",
|
||
"language": "en",
|
||
"migrated_to": "bp_legal_corpus"
|
||
},
|
||
{
|
||
"source_code": "EDPB_GUIDELINES_1_2022",
|
||
"name": "EDPB Leitlinien 1/2022 Bussgelder",
|
||
"full_name": "EDPB Leitlinien 04/2022 zur Berechnung von Bussgeldern nach der DSGVO",
|
||
"organization": "European Data Protection Board",
|
||
"source_url": "https://edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-042022-calculation-administrative-fines-under-gdpr_en",
|
||
"license_code": "EDPB-LICENSE",
|
||
"attribution_text": "Source: EDPB Guidelines 04/2022, European Data Protection Board",
|
||
"document_type": "guideline",
|
||
"language": "en",
|
||
"migrated_to": "bp_legal_corpus"
|
||
},
|
||
{
|
||
"source_code": "SCC_FULL_TEXT",
|
||
"name": "Standard Contractual Clauses Volltext",
|
||
"full_name": "Standardvertragsklauseln fuer die Uebermittlung personenbezogener Daten an Drittlaender (2021/914/EU)",
|
||
"organization": "Europaeische Kommission",
|
||
"source_url": "https://eur-lex.europa.eu/eli/dec_impl/2021/914/oj",
|
||
"license_code": "PUBLIC_DOMAIN",
|
||
"attribution_text": "Quelle: SCC Volltext, Europaeische Kommission (EUR-Lex)",
|
||
"document_type": "regulation",
|
||
"language": "de",
|
||
"migrated_to": "bp_legal_corpus"
|
||
},
|
||
|
||
# === Nationale Datenschutzgesetze (DSGVO-Umsetzungen) ===
|
||
# MIGRATED to bp_legal_corpus via migrate_rag_chunks.py (2026-02-10)
|
||
# These sources are kept here for reference but will be skipped during ingestion.
|
||
# Ingestion should target bp_legal_corpus for these source codes.
|
||
{
|
||
"source_code": "BDSG_FULL",
|
||
"name": "BDSG Volltext (Deutschland)",
|
||
"full_name": "Bundesdatenschutzgesetz (BDSG) - Volltext inkl. aller Teile",
|
||
"organization": "Bundesrepublik Deutschland",
|
||
"source_url": "https://www.gesetze-im-internet.de/bdsg_2018/",
|
||
"license_code": "PUBLIC_DOMAIN",
|
||
"attribution_text": "Quelle: BDSG, Bundesrepublik Deutschland (gesetze-im-internet.de)",
|
||
"document_type": "legislation",
|
||
"language": "de",
|
||
"migrated_to": "bp_legal_corpus"
|
||
},
|
||
{
|
||
"source_code": "AT_DSG",
|
||
"name": "DSG Oesterreich",
|
||
"full_name": "Bundesgesetz zum Schutz natuerlicher Personen bei der Verarbeitung personenbezogener Daten (Datenschutzgesetz - DSG)",
|
||
"organization": "Republik Oesterreich",
|
||
"source_url": "https://www.ris.bka.gv.at/GeltendeFassung.wxe?Abfrage=Bundesnormen&Gesetzesnummer=10001597",
|
||
"license_code": "PUBLIC_DOMAIN",
|
||
"attribution_text": "Quelle: DSG, Republik Oesterreich (RIS)",
|
||
"document_type": "legislation",
|
||
"language": "de",
|
||
"migrated_to": "bp_legal_corpus"
|
||
},
|
||
{
|
||
"source_code": "CH_DSG",
|
||
"name": "DSG Schweiz (revDSG 2023)",
|
||
"full_name": "Bundesgesetz ueber den Datenschutz (Datenschutzgesetz, DSG) - revidierte Fassung 2023",
|
||
"organization": "Schweizerische Eidgenossenschaft",
|
||
"source_url": "https://www.fedlex.admin.ch/eli/cc/2022/491/de",
|
||
"license_code": "PUBLIC_DOMAIN",
|
||
"attribution_text": "Quelle: DSG, Schweizerische Eidgenossenschaft (Fedlex)",
|
||
"document_type": "legislation",
|
||
"language": "de",
|
||
"migrated_to": "bp_legal_corpus"
|
||
},
|
||
{
|
||
"source_code": "LI_DSG",
|
||
"name": "DSG Liechtenstein",
|
||
"full_name": "Datenschutzgesetz (DSG) Liechtenstein",
|
||
"organization": "Fuerstentum Liechtenstein",
|
||
"source_url": "https://www.gesetze.li/konso/2018.272",
|
||
"license_code": "PUBLIC_DOMAIN",
|
||
"attribution_text": "Quelle: DSG, Fuerstentum Liechtenstein (gesetze.li)",
|
||
"document_type": "legislation",
|
||
"language": "de",
|
||
"migrated_to": "bp_legal_corpus"
|
||
},
|
||
{
|
||
"source_code": "FR_CNIL_GUIDE",
|
||
"name": "CNIL Guide RGPD",
|
||
"full_name": "Guide pratique RGPD - Commission Nationale de l'Informatique et des Libertes",
|
||
"organization": "CNIL (France)",
|
||
"source_url": "https://www.cnil.fr/fr/rgpd-de-quoi-parle-t-on",
|
||
"license_code": "PUBLIC_DOMAIN",
|
||
"attribution_text": "Source: CNIL Guide RGPD, Commission Nationale de l'Informatique et des Libertes",
|
||
"document_type": "guideline",
|
||
"language": "fr",
|
||
"migrated_to": "bp_legal_corpus"
|
||
},
|
||
{
|
||
"source_code": "ES_LOPDGDD",
|
||
"name": "LOPDGDD Spanien",
|
||
"full_name": "Ley Organica de Proteccion de Datos Personales y garantia de los derechos digitales",
|
||
"organization": "Reino de Espana",
|
||
"source_url": "https://www.boe.es/buscar/act.php?id=BOE-A-2018-16673",
|
||
"license_code": "PUBLIC_DOMAIN",
|
||
"attribution_text": "Fuente: LOPDGDD, Reino de Espana (BOE)",
|
||
"document_type": "legislation",
|
||
"language": "es",
|
||
"migrated_to": "bp_legal_corpus"
|
||
},
|
||
{
|
||
"source_code": "IT_CODICE_PRIVACY",
|
||
"name": "Codice Privacy Italien",
|
||
"full_name": "Codice in materia di protezione dei dati personali (D.Lgs. 196/2003, aggiornato D.Lgs. 101/2018)",
|
||
"organization": "Repubblica Italiana",
|
||
"source_url": "https://www.garanteprivacy.it/home/docweb/-/docweb-display/docweb/9042678",
|
||
"license_code": "PUBLIC_DOMAIN",
|
||
"attribution_text": "Fonte: Codice Privacy, Garante per la protezione dei dati personali",
|
||
"document_type": "legislation",
|
||
"language": "it",
|
||
"migrated_to": "bp_legal_corpus"
|
||
},
|
||
{
|
||
"source_code": "NL_UAVG",
|
||
"name": "UAVG Niederlande",
|
||
"full_name": "Uitvoeringswet Algemene verordening gegevensbescherming (UAVG)",
|
||
"organization": "Koninkrijk der Nederlanden",
|
||
"source_url": "https://wetten.overheid.nl/BWBR0040940/",
|
||
"license_code": "PUBLIC_DOMAIN",
|
||
"attribution_text": "Bron: UAVG, Koninkrijk der Nederlanden (wetten.overheid.nl)",
|
||
"document_type": "legislation",
|
||
"language": "nl",
|
||
"migrated_to": "bp_legal_corpus"
|
||
},
|
||
{
|
||
"source_code": "BE_DPA_LAW",
|
||
"name": "Datenschutzgesetz Belgien",
|
||
"full_name": "Loi relative a la protection des personnes physiques a l'egard des traitements de donnees a caractere personnel",
|
||
"organization": "Royaume de Belgique",
|
||
"source_url": "https://www.ejustice.just.fgov.be/cgi_loi/change_lg.pl?language=fr&la=F&cn=2018073046",
|
||
"license_code": "PUBLIC_DOMAIN",
|
||
"attribution_text": "Source: Loi Protection des Donnees, Royaume de Belgique (eJustice)",
|
||
"document_type": "legislation",
|
||
"language": "fr",
|
||
"migrated_to": "bp_legal_corpus"
|
||
},
|
||
{
|
||
"source_code": "LU_DPA_LAW",
|
||
"name": "Datenschutzgesetz Luxemburg",
|
||
"full_name": "Loi du 1er aout 2018 portant organisation de la Commission nationale pour la protection des donnees",
|
||
"organization": "Grand-Duche de Luxembourg",
|
||
"source_url": "https://legilux.public.lu/eli/etat/leg/loi/2018/08/01/a686/jo",
|
||
"license_code": "PUBLIC_DOMAIN",
|
||
"attribution_text": "Source: Loi Protection des Donnees, Grand-Duche de Luxembourg (Legilux)",
|
||
"document_type": "legislation",
|
||
"language": "fr",
|
||
"migrated_to": "bp_legal_corpus"
|
||
},
|
||
{
|
||
"source_code": "IE_DPA_2018",
|
||
"name": "Data Protection Act 2018 Ireland",
|
||
"full_name": "Data Protection Act 2018 (Act No. 7 of 2018) - Ireland",
|
||
"organization": "Government of Ireland",
|
||
"source_url": "https://www.irishstatutebook.ie/eli/2018/act/7/enacted/en/html",
|
||
"license_code": "OGL-3.0",
|
||
"attribution_text": "Contains public sector information licensed under the Open Government Licence v3.0. Source: Data Protection Act 2018, Ireland",
|
||
"document_type": "legislation",
|
||
"language": "en",
|
||
"migrated_to": "bp_legal_corpus"
|
||
},
|
||
{
|
||
"source_code": "UK_DPA_2018",
|
||
"name": "Data Protection Act 2018 UK",
|
||
"full_name": "Data Protection Act 2018 (c. 12) - United Kingdom",
|
||
"organization": "Government of the United Kingdom",
|
||
"source_url": "https://www.legislation.gov.uk/ukpga/2018/12/contents/enacted",
|
||
"license_code": "OGL-3.0",
|
||
"attribution_text": "Contains public sector information licensed under the Open Government Licence v3.0. Source: Data Protection Act 2018, UK",
|
||
"document_type": "legislation",
|
||
"language": "en",
|
||
"migrated_to": "bp_legal_corpus"
|
||
},
|
||
{
|
||
"source_code": "UK_GDPR",
|
||
"name": "UK GDPR (retained EU law)",
|
||
"full_name": "United Kingdom General Data Protection Regulation (UK GDPR) - retained EU law",
|
||
"organization": "Government of the United Kingdom",
|
||
"source_url": "https://www.legislation.gov.uk/eur/2016/679/contents",
|
||
"license_code": "OGL-3.0",
|
||
"attribution_text": "Contains public sector information licensed under the Open Government Licence v3.0. Source: UK GDPR, legislation.gov.uk",
|
||
"document_type": "legislation",
|
||
"language": "en",
|
||
"migrated_to": "bp_legal_corpus"
|
||
},
|
||
{
|
||
"source_code": "NO_PERSONOPPLYSNINGSLOVEN",
|
||
"name": "Personopplysningsloven Norwegen",
|
||
"full_name": "Lov om behandling av personopplysninger (personopplysningsloven)",
|
||
"organization": "Kongeriket Norge",
|
||
"source_url": "https://lovdata.no/dokument/NL/lov/2018-06-15-38",
|
||
"license_code": "PUBLIC_DOMAIN",
|
||
"attribution_text": "Kilde: Personopplysningsloven, Kongeriket Norge (Lovdata)",
|
||
"document_type": "legislation",
|
||
"language": "no",
|
||
"migrated_to": "bp_legal_corpus"
|
||
},
|
||
{
|
||
"source_code": "SE_DATASKYDDSLAG",
|
||
"name": "Dataskyddslag Schweden",
|
||
"full_name": "Lag (2018:218) med kompletterande bestammelser till EU:s dataskyddsforordning",
|
||
"organization": "Konungariket Sverige",
|
||
"source_url": "https://www.riksdagen.se/sv/dokument-och-lagar/dokument/svensk-forfattningssamling/lag-2018218-med-kompletterande-bestammelser-till_sfs-2018-218/",
|
||
"license_code": "PUBLIC_DOMAIN",
|
||
"attribution_text": "Kalla: Dataskyddslag (2018:218), Konungariket Sverige (Riksdagen)",
|
||
"document_type": "legislation",
|
||
"language": "sv",
|
||
"migrated_to": "bp_legal_corpus"
|
||
},
|
||
{
|
||
"source_code": "DK_DATABESKYTTELSESLOVEN",
|
||
"name": "Databeskyttelsesloven Daenemark",
|
||
"full_name": "Lov om supplerende bestemmelser til forordning om beskyttelse af fysiske personer i forbindelse med behandling af personoplysninger",
|
||
"organization": "Kongeriget Danmark",
|
||
"source_url": "https://www.retsinformation.dk/eli/lta/2018/502",
|
||
"license_code": "PUBLIC_DOMAIN",
|
||
"attribution_text": "Kilde: Databeskyttelsesloven, Kongeriget Danmark (Retsinformation)",
|
||
"document_type": "legislation",
|
||
"language": "da",
|
||
"migrated_to": "bp_legal_corpus"
|
||
},
|
||
{
|
||
"source_code": "FI_TIETOSUOJALAKI",
|
||
"name": "Tietosuojalaki Finnland",
|
||
"full_name": "Tietosuojalaki (1050/2018) - Datenschutzgesetz Finnland",
|
||
"organization": "Suomen tasavalta",
|
||
"source_url": "https://www.finlex.fi/fi/laki/ajantasa/2018/20181050",
|
||
"license_code": "PUBLIC_DOMAIN",
|
||
"attribution_text": "Lahde: Tietosuojalaki, Suomen tasavalta (Finlex)",
|
||
"document_type": "legislation",
|
||
"language": "fi",
|
||
"migrated_to": "bp_legal_corpus"
|
||
},
|
||
{
|
||
"source_code": "PL_UODO",
|
||
"name": "UODO Polen",
|
||
"full_name": "Ustawa o ochronie danych osobowych - Datenschutzgesetz Polen",
|
||
"organization": "Rzeczpospolita Polska",
|
||
"source_url": "https://isap.sejm.gov.pl/isap.nsf/DocDetails.xsp?id=WDU20180001000",
|
||
"license_code": "PUBLIC_DOMAIN",
|
||
"attribution_text": "Zrodlo: Ustawa o ochronie danych osobowych, Rzeczpospolita Polska (ISAP)",
|
||
"document_type": "legislation",
|
||
"language": "pl",
|
||
"migrated_to": "bp_legal_corpus"
|
||
},
|
||
{
|
||
"source_code": "CZ_ZOU",
|
||
"name": "Zakon o ochrane osobnich udaju Tschechien",
|
||
"full_name": "Zakon c. 110/2019 Sb. o zpracovani osobnich udaju",
|
||
"organization": "Ceska republika",
|
||
"source_url": "https://www.zakonyprolidi.cz/cs/2019-110",
|
||
"license_code": "PUBLIC_DOMAIN",
|
||
"attribution_text": "Zdroj: Zakon o ochrane osobnich udaju, Ceska republika (zakonyprolidi.cz)",
|
||
"document_type": "legislation",
|
||
"language": "cs",
|
||
"migrated_to": "bp_legal_corpus"
|
||
},
|
||
{
|
||
"source_code": "HU_INFOTV",
|
||
"name": "Informacios torvenye Ungarn",
|
||
"full_name": "2011. evi CXII. torveny az informacios onrendelkezesi jogrol es az informacioszabadsagrol (Infotv.)",
|
||
"organization": "Magyarorszag",
|
||
"source_url": "https://njt.hu/jogszabaly/2011-112-00-00",
|
||
"license_code": "PUBLIC_DOMAIN",
|
||
"attribution_text": "Forras: Infotv., Magyarorszag (njt.hu)",
|
||
"document_type": "legislation",
|
||
"language": "hu",
|
||
"migrated_to": "bp_legal_corpus"
|
||
},
|
||
]
|
||
|
||
|
||
# =============================================================================
|
||
# Chunking Configuration
|
||
# =============================================================================
|
||
|
||
# Per-source chunking configuration, keyed by source code.
#
# Each entry selects a "strategy" ("section_based", "list_item" or
# "recursive") plus the parameters that strategy reads:
#   - section_based: "section_markers" (regexes), "max_chunk_size", "overlap"
#   - list_item:     "list_markers" (regexes), "max_chunk_size"
#   - recursive:     "max_chunk_size", "overlap"
# "categories" is optional metadata and is not read by the chunking functions.
#
# German keywords in markers accept both the umlaut spelling and the ASCII
# transliteration (ä/ae, ö/oe), so a marker matches regardless of how the
# extracted document text spells the word.
DSFA_CHUNK_CONFIG = {
    # WP248 - chunk each criterion (K1..K9) separately
    "WP248": {
        "strategy": "section_based",
        "section_markers": [r"K1[:\s]", r"K2[:\s]", r"K3[:\s]", r"K4[:\s]", r"K5[:\s]",
                            r"K6[:\s]", r"K7[:\s]", r"K8[:\s]", r"K9[:\s]"],
        "max_chunk_size": 1500,
        "overlap": 200
    },
    # DSK short paper - chunk each process step separately
    "DSK_KP5": {
        "strategy": "section_based",
        "section_markers": [r"Schritt\s*1", r"Schritt\s*2", r"Schritt\s*3",
                            r"Schritt\s*4", r"Schritt\s*5", r"Schritt\s*6"],
        "max_chunk_size": 1200,
        "overlap": 150
    },
    # SDM V2.0 - chunk each protection goal separately
    "SDM_V2": {
        "strategy": "section_based",
        "section_markers": [
            r"Gew(?:ä|ae)hrleistungsziel\s+\d",
            r"\d+\.\d+\s+",
        ],
        "max_chunk_size": 1200,
        "overlap": 150,
        "categories": ["sdm_goal", "methodology", "implementation"]
    },
    # "Muss-Listen" (mandatory DPIA lists) - every list entry becomes one chunk
    "MUSS_LISTEN": {
        "strategy": "list_item",
        "list_markers": [r"^•", r"^-", r"^\d+\."],
        "max_chunk_size": 800,
        "overlap": 0
    },
    # DSK short paper no. 1 - chunk each records-of-processing step separately
    "DSK_KP1": {
        "strategy": "section_based",
        "section_markers": [r"Schritt\s+\d", r"\d+\.\s+"],
        "max_chunk_size": 1000,
        "overlap": 150,
        "categories": ["vvt_guidance", "art30_requirements", "controller_duties"]
    },
    # ICO RoPA - chunk each section separately
    "ICO_ROPA": {
        "strategy": "section_based",
        "section_markers": [r"What\s+should", r"How\s+to", r"Controller", r"Processor"],
        "max_chunk_size": 1000,
        "overlap": 150,
        "categories": ["vvt_guidance", "art30_requirements", "ropa_templates"]
    },
    # SDM building blocks - chunk each protection goal separately
    "SDM_BAUSTEINE": {
        "strategy": "section_based",
        "section_markers": [
            r"Baustein\s+\d",
            r"Gew(?:ä|ae)hrleistungsziel",
            r"\d+\.\d+\s+",
        ],
        "max_chunk_size": 1200,
        "overlap": 150,
        "categories": ["sdm_goal", "tom_measure", "implementation"]
    },
    # DSK short paper no. 7 - erasure
    "DSK_KP7": {
        "strategy": "section_based",
        "section_markers": [r"Schritt\s+\d", r"\d+\.\s+", r"L(?:ö|oe)schkonzept"],
        "max_chunk_size": 1000,
        "overlap": 150,
        "categories": ["loeschung", "art17_requirements", "retention_guidance"]
    },
    # Fallback for every source code without a dedicated entry
    "DEFAULT": {
        "strategy": "recursive",
        "max_chunk_size": 1000,
        "overlap": 200
    }
}
|
||
|
||
|
||
# =============================================================================
|
||
# Data Classes
|
||
# =============================================================================
|
||
|
||
@dataclass
class DSFAChunkPayload:
    """Payload stored alongside each vector in the DSFA Qdrant collection.

    The first four fields are required; every other field has a default.
    Field order is part of the positional constructor signature.
    """

    # --- identity and text (required) ---
    chunk_id: str
    document_id: str
    source_id: str
    content: str

    # --- position inside the document ---
    section_title: Optional[str] = None

    # --- source attribution ---
    source_code: str = ""
    source_name: str = ""
    attribution_text: str = ""
    license_code: str = ""
    attribution_required: bool = True

    # --- classification ---
    document_type: str = ""
    category: str = ""
    language: str = "de"

    page_number: Optional[int] = None
|
||
|
||
|
||
@dataclass
class DSFASearchResult:
    """One search hit together with the attribution data of its source.

    All fields are required; the Optional ones must still be passed
    explicitly (as None when unknown).
    """

    # --- hit ---
    chunk_id: str
    content: str
    score: float

    # --- source and licence ---
    source_code: str
    source_name: str
    attribution_text: str
    license_code: str
    license_url: Optional[str]
    attribution_required: bool
    source_url: Optional[str]

    # --- classification and position ---
    document_type: str
    category: str
    section_title: Optional[str]
    page_number: Optional[int]
|
||
|
||
|
||
# =============================================================================
|
||
# Database Operations
|
||
# =============================================================================
|
||
|
||
class DSFACorpusStore:
    """Database operations for the DSFA corpus.

    Every method borrows one connection from the asyncpg pool for the
    duration of the call. The SQL touches the relations ``dsfa_sources``,
    ``dsfa_documents``, ``dsfa_document_chunks``,
    ``dsfa_chunk_with_attribution`` and ``dsfa_source_stats``.
    """

    def __init__(self, pool: asyncpg.Pool):
        self.pool = pool

    @staticmethod
    def _source_params(source_data: Dict) -> tuple:
        """Build the 12 positional SQL parameters ($1..$12) for a source row.

        ``source_code``, ``name``, ``license_code`` and ``attribution_text``
        are mandatory (KeyError when missing); everything else is optional.
        ``license_url`` falls back to the ``url`` of the matching
        LICENSE_REGISTRY entry when the source does not carry its own, and
        ``attribution_required`` is always taken from the registry
        (defaulting to True for unknown licence codes).
        """
        license_code = source_data["license_code"]
        license_info = LICENSE_REGISTRY.get(license_code, {})
        return (
            source_data["source_code"],
            source_data["name"],
            source_data.get("full_name"),
            source_data.get("organization"),
            source_data.get("source_url"),
            source_data.get("eur_lex_celex"),
            license_code,
            source_data.get("license_url") or license_info.get("url"),
            license_info.get("attribution_required", True),
            source_data["attribution_text"],
            source_data.get("document_type"),
            source_data.get("language", "de"),
        )

    async def register_source(self, source_data: Dict) -> str:
        """Insert a DSFA source, or update it when its code already exists.

        Returns the source id as a string.

        NOTE(review): the existence check and the following write are two
        separate statements, so two concurrent registrations of the same
        new source code could both take the INSERT path — confirm whether
        dsfa_sources.source_code has a unique constraint.
        """
        params = self._source_params(source_data)

        async with self.pool.acquire() as conn:
            existing = await conn.fetchval(
                "SELECT id FROM dsfa_sources WHERE source_code = $1",
                source_data["source_code"]
            )

            if existing:
                await conn.execute("""
                    UPDATE dsfa_sources SET
                        name = $2,
                        full_name = $3,
                        organization = $4,
                        source_url = $5,
                        eur_lex_celex = $6,
                        license_code = $7,
                        license_url = $8,
                        attribution_required = $9,
                        attribution_text = $10,
                        document_type = $11,
                        language = $12,
                        updated_at = NOW()
                    WHERE source_code = $1
                """, *params)
                return str(existing)

            source_id = await conn.fetchval("""
                INSERT INTO dsfa_sources (
                    source_code, name, full_name, organization, source_url,
                    eur_lex_celex, license_code, license_url, attribution_required,
                    attribution_text, document_type, language
                ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12)
                RETURNING id
            """, *params)
            return str(source_id)

    async def get_source_by_code(self, source_code: str) -> Optional[Dict]:
        """Return the source row for *source_code* as a dict, or None."""
        async with self.pool.acquire() as conn:
            row = await conn.fetchrow(
                "SELECT * FROM dsfa_sources WHERE source_code = $1",
                source_code
            )
            return dict(row) if row else None

    async def list_sources(self) -> List[Dict]:
        """Return all registered sources, ordered by source code."""
        async with self.pool.acquire() as conn:
            rows = await conn.fetch(
                "SELECT * FROM dsfa_sources ORDER BY source_code"
            )
            return [dict(row) for row in rows]

    async def create_document(
        self,
        source_id: str,
        title: str,
        file_name: Optional[str] = None,
        file_type: Optional[str] = None,
        minio_path: Optional[str] = None,
        original_url: Optional[str] = None,
        metadata: Optional[Dict] = None
    ) -> str:
        """Create a document record and return its id as a string.

        *source_id* must be a UUID string (ValueError otherwise). *metadata*
        is serialised to JSON and stored as jsonb; None is stored as ``{}``.
        """
        import json

        metadata_json = json.dumps(metadata or {})

        async with self.pool.acquire() as conn:
            doc_id = await conn.fetchval("""
                INSERT INTO dsfa_documents (
                    source_id, title, file_name, file_type, minio_path,
                    original_url, metadata
                ) VALUES ($1, $2, $3, $4, $5, $6, $7::jsonb)
                RETURNING id
            """,
                uuid.UUID(source_id),
                title,
                file_name,
                file_type,
                minio_path,
                original_url,
                metadata_json
            )
            return str(doc_id)

    async def create_chunk(
        self,
        document_id: str,
        source_id: str,
        content: str,
        chunk_index: int,
        section_title: Optional[str] = None,
        page_number: Optional[int] = None,
        category: Optional[str] = None,
        qdrant_point_id: Optional[str] = None,
        metadata: Optional[Dict] = None
    ) -> str:
        """Create a chunk record and return its id as a string.

        A SHA-256 hex digest of *content* is stored as ``content_hash``.
        *document_id* and *source_id* must be UUID strings.
        """
        import json

        content_hash = hashlib.sha256(content.encode()).hexdigest()

        async with self.pool.acquire() as conn:
            chunk_id = await conn.fetchval("""
                INSERT INTO dsfa_document_chunks (
                    document_id, source_id, content, content_hash, chunk_index,
                    section_title, page_number, category, qdrant_point_id, metadata
                ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10::jsonb)
                RETURNING id
            """,
                uuid.UUID(document_id),
                uuid.UUID(source_id),
                content,
                content_hash,
                chunk_index,
                section_title,
                page_number,
                category,
                qdrant_point_id,
                json.dumps(metadata or {})
            )
            return str(chunk_id)

    async def get_chunk_with_attribution(self, chunk_id: str) -> Optional[Dict]:
        """Return the ``dsfa_chunk_with_attribution`` row for a chunk, or None."""
        async with self.pool.acquire() as conn:
            row = await conn.fetchrow("""
                SELECT * FROM dsfa_chunk_with_attribution
                WHERE chunk_id = $1
            """, uuid.UUID(chunk_id))
            return dict(row) if row else None

    async def get_source_stats(self) -> List[Dict]:
        """Return every row of ``dsfa_source_stats`` as a dict."""
        async with self.pool.acquire() as conn:
            rows = await conn.fetch("SELECT * FROM dsfa_source_stats")
            return [dict(row) for row in rows]

    async def update_document_indexed(self, document_id: str, chunks_count: int):
        """Mark a document as indexed.

        Stores *chunks_count*, sets ``last_indexed_at`` to the database's
        current time and flags ``text_extracted``.
        """
        async with self.pool.acquire() as conn:
            await conn.execute("""
                UPDATE dsfa_documents
                SET chunks_generated = $2,
                    last_indexed_at = NOW(),
                    text_extracted = true
                WHERE id = $1
            """, uuid.UUID(document_id), chunks_count)
|
||
|
||
|
||
# =============================================================================
|
||
# Qdrant Operations
|
||
# =============================================================================
|
||
|
||
class DSFAQdrantService:
    """Qdrant operations for the DSFA corpus collection (DSFA_COLLECTION).

    The methods are declared ``async`` but call the synchronous
    ``QdrantClient`` directly, without awaiting.
    """

    def __init__(self, url: Optional[str] = None):
        # Fall back to the module-level QDRANT_URL when no URL is given.
        self.url = url or QDRANT_URL
        self._client = None

    @property
    def client(self) -> QdrantClient:
        """Return the Qdrant client, creating it on first access."""
        if self._client is None:
            self._client = QdrantClient(url=self.url, check_compatibility=False)
        return self._client

    async def ensure_collection(self) -> bool:
        """Create the DSFA collection if it does not exist yet.

        Returns True on success (including "already exists") and False when
        any exception occurred; the error is printed, not raised.
        """
        try:
            collections = self.client.get_collections().collections
            collection_names = [c.name for c in collections]

            if DSFA_COLLECTION not in collection_names:
                self.client.create_collection(
                    collection_name=DSFA_COLLECTION,
                    vectors_config=VectorParams(
                        size=VECTOR_SIZE,
                        distance=Distance.COSINE
                    )
                )
                print(f"Created collection: {DSFA_COLLECTION}")
            return True
        except Exception as e:
            print(f"Error ensuring collection: {e}")
            return False

    async def index_chunks(
        self,
        chunks: List[Dict],
        embeddings: List[List[float]]
    ) -> int:
        """Upsert chunks with their embeddings and return the number indexed.

        Chunks and embeddings are paired positionally; if the lists differ
        in length the surplus entries of the longer one are ignored. The
        point id is a UUIDv5 derived from ``chunk_id``, so re-indexing the
        same chunk overwrites its point.
        """
        if not chunks or not embeddings:
            return 0

        points = []
        for chunk, embedding in zip(chunks, embeddings):
            point_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, chunk["chunk_id"]))

            payload = DSFAChunkPayload(
                chunk_id=chunk["chunk_id"],
                document_id=chunk["document_id"],
                source_id=chunk["source_id"],
                content=chunk["content"],
                section_title=chunk.get("section_title"),
                source_code=chunk["source_code"],
                source_name=chunk["source_name"],
                attribution_text=chunk["attribution_text"],
                license_code=chunk["license_code"],
                attribution_required=chunk.get("attribution_required", True),
                document_type=chunk.get("document_type", ""),
                category=chunk.get("category", ""),
                language=chunk.get("language", "de"),
                page_number=chunk.get("page_number")
            )

            points.append(
                PointStruct(
                    id=point_id,
                    vector=embedding,
                    payload=asdict(payload)
                )
            )

        self.client.upsert(collection_name=DSFA_COLLECTION, points=points)
        return len(points)

    async def search(
        self,
        query_embedding: List[float],
        source_codes: Optional[List[str]] = None,
        document_types: Optional[List[str]] = None,
        categories: Optional[List[str]] = None,
        limit: int = 10
    ) -> List[Dict]:
        """Search the DSFA corpus with optional payload filters.

        Several values for the same field are alternatives (OR); the
        different fields are combined with AND. Each hit is returned as a
        dict of the point payload plus ``id`` and ``score``.
        """
        must_conditions = []

        field_filters = (
            ("source_code", source_codes),
            ("document_type", document_types),
            ("category", categories),
        )
        for key, values in field_filters:
            if not values:
                continue
            alternatives = [
                FieldCondition(key=key, match=MatchValue(value=value))
                for value in values
            ]
            if len(alternatives) == 1:
                must_conditions.append(alternatives[0])
            else:
                # A payload field holds one value, so requiring it to equal
                # every listed value at once could never match. Wrap the
                # alternatives in a nested "should" filter instead.
                must_conditions.append(Filter(should=alternatives))

        query_filter = Filter(must=must_conditions) if must_conditions else None

        # Use query_points for newer qdrant-client API
        results = self.client.query_points(
            collection_name=DSFA_COLLECTION,
            query=query_embedding,
            query_filter=query_filter,
            limit=limit
        )

        return [
            {
                "id": str(r.id),
                "score": r.score,
                **r.payload
            }
            for r in results.points
        ]

    async def get_stats(self) -> Dict:
        """Return collection statistics, or an ``error`` dict on failure."""
        try:
            info = self.client.get_collection(DSFA_COLLECTION)
            return {
                "collection": DSFA_COLLECTION,
                # NOTE(review): read defensively so a client release without
                # this attribute still yields the other statistics — confirm
                # against the installed qdrant-client version.
                "vectors_count": getattr(info, "vectors_count", None),
                "points_count": info.points_count,
                "status": info.status.value
            }
        except Exception as e:
            return {"error": str(e), "collection": DSFA_COLLECTION}
|
||
|
||
|
||
# =============================================================================
|
||
# Chunking Functions
|
||
# =============================================================================
|
||
|
||
def chunk_text_recursive(text: str, max_size: int = 1000, overlap: int = 200) -> List[Dict]:
    """Split *text* into overlapping chunks of at most *max_size* characters.

    Despite the name the implementation is iterative. Each window is cut
    at the last occurrence of the first separator (paragraph break, line
    break, sentence end, comma, space - tried in that order) that lies in
    the second half of the window; if none qualifies the window is cut at
    *max_size*. The next window starts *overlap* characters before the cut.

    Args:
        text: Text to split.
        max_size: Maximum window length in characters; must be >= 1.
        overlap: Number of characters shared by consecutive windows. When
            stepping back by *overlap* would not advance past the current
            window start, the next window starts at the cut instead.

    Returns:
        A list of dicts with ``content`` (stripped text), ``start_char``
        and ``end_char`` (offsets of the unstripped window in *text*).
        Whitespace-only windows are omitted.

    Raises:
        ValueError: If *max_size* is smaller than 1.
    """
    if max_size < 1:
        raise ValueError("max_size must be a positive integer")

    separators = ("\n\n", "\n", ". ", ", ", " ")
    min_break = max_size // 2
    text_len = len(text)

    chunks = []
    start = 0

    while start < text_len:
        end = min(start + max_size, text_len)

        # Find a good break point, but only for windows that do not already
        # end at the end of the text.
        if end < text_len:
            window = text[start:end]
            for sep in separators:
                last_sep = window.rfind(sep)
                if last_sep > min_break:
                    end = start + last_sep + len(sep)
                    break

        chunk_text = text[start:end].strip()
        if chunk_text:
            chunks.append({
                "content": chunk_text,
                "start_char": start,
                "end_char": end
            })

        if end >= text_len:
            break

        # Guarantee forward progress: an overlap as large as the consumed
        # window would otherwise keep restarting at the same position.
        next_start = end - overlap
        start = next_start if next_start > start else end

    return chunks
|
||
|
||
|
||
def chunk_by_sections(text: str, markers: List[str], max_size: int = 1500, overlap: int = 200) -> List[Dict]:
    """Chunk *text* at section markers.

    A section runs from the start of one marker match to the start of the
    next (or the end of the text). Matching is case-insensitive and
    multi-line. Sections longer than *max_size* are split further with
    ``chunk_text_recursive``; the continuation chunks get the title
    ``"<marker> (cont.)"``.

    Text in front of the first marker is kept as its own chunk(s) with
    ``section_title`` set to None, so introductory material is not lost.
    Sections that are empty after stripping are skipped.

    Args:
        text: Text to split.
        markers: Regular expressions that mark a section start. When the
            list is empty, or nothing matches, the whole text is chunked
            with ``chunk_text_recursive``.
        max_size: Maximum chunk length in characters.
        overlap: Overlap passed on to ``chunk_text_recursive``.

    Returns:
        A list of dicts with ``content``, ``section_title``, ``start_char``
        and ``end_char``. The fallback path returns the dicts of
        ``chunk_text_recursive`` unchanged, i.e. without ``section_title``.
    """
    # An empty alternation would match at every position of the text.
    if not markers:
        return chunk_text_recursive(text, max_size, overlap)

    pattern = "|".join(f"({m})" for m in markers)

    # Find all section starts
    matches = list(re.finditer(pattern, text, re.IGNORECASE | re.MULTILINE))

    if not matches:
        return chunk_text_recursive(text, max_size, overlap)

    chunks: List[Dict] = []

    def add_section(section_text: str, title: Optional[str], start: int, end: int) -> None:
        """Append one section, splitting it when it exceeds max_size."""
        if not section_text:
            return
        if len(section_text) <= max_size:
            chunks.append({
                "content": section_text,
                "section_title": title,
                "start_char": start,
                "end_char": end
            })
            return
        for j, sub in enumerate(chunk_text_recursive(section_text, max_size, overlap)):
            if title is None or j == 0:
                sub_title = title
            else:
                sub_title = f"{title} (cont.)"
            chunks.append({
                "content": sub["content"],
                "section_title": sub_title,
                # Sub-chunk offsets are relative to the section text.
                "start_char": start + sub["start_char"],
                "end_char": start + sub["end_char"]
            })

    # Preamble: everything in front of the first marker.
    first_start = matches[0].start()
    preamble = text[:first_start]
    leading_ws = len(preamble) - len(preamble.lstrip())
    add_section(preamble.strip(), None, leading_ws, first_start)

    for i, match in enumerate(matches):
        start = match.start()
        end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
        add_section(text[start:end].strip(), match.group(0).strip(), start, end)

    return chunks
|
||
|
||
|
||
def chunk_by_list_items(text: str, markers: List[str], max_size: int = 800) -> List[Dict]:
    """Chunk *text* so that every list item becomes one chunk.

    A line whose stripped form matches one of *markers* (matched at the
    start of the line) opens a new item; all following non-matching lines
    are appended to the current item. Lines in front of the first item
    form a chunk of their own.

    Args:
        text: Text to split.
        markers: Regular expressions that identify the start of a list item.
        max_size: Accepted for signature compatibility; it is not enforced,
            so an item longer than this is still returned as a single chunk.

    Returns:
        A list of dicts with ``content`` (stripped item text),
        ``start_char`` (offset of the item's first line in *text*) and
        ``end_char`` (``start_char`` plus the unstripped item length).
    """
    item_start = re.compile("|".join(f"({m})" for m in markers))
    chunks: List[Dict] = []

    def flush(parts: List[str], start: int) -> None:
        """Emit the collected lines as one chunk unless they are blank."""
        item = "\n".join(parts)
        stripped = item.strip()
        if stripped:
            chunks.append({
                "content": stripped,
                "start_char": start,
                "end_char": start + len(item)
            })

    # The leading empty string reproduces the newline that precedes the
    # first line of text collected before any list marker was seen.
    current_parts: List[str] = [""]
    current_start = 0
    offset = 0  # running character offset of the current line within text

    for line in text.split("\n"):
        if item_start.match(line.strip()):
            flush(current_parts, current_start)
            current_parts = [line]
            current_start = offset
        else:
            current_parts.append(line)
        offset += len(line) + 1  # +1 for the newline removed by split

    flush(current_parts, current_start)

    return chunks
|
||
|
||
|
||
def chunk_document(text: str, source_code: str) -> List[Dict]:
    """Chunk a document with the strategy configured for its source.

    The configuration is looked up in ``DSFA_CHUNK_CONFIG`` by
    ``source_code`` and falls back to the ``"DEFAULT"`` entry. Source codes
    ending in ``_MUSS_PUBLIC`` or ``_MUSS_PRIVATE`` always use the
    ``"MUSS_LISTEN"`` entry instead.

    Args:
        text: Full document text.
        source_code: Code identifying the document's source.

    Returns:
        The chunk list produced by the selected chunker: section based,
        list-item based, or recursive for any other strategy value.
    """
    settings = DSFA_CHUNK_CONFIG.get(source_code, DSFA_CHUNK_CONFIG["DEFAULT"])

    # Muss-Listen share one configuration regardless of the issuing source.
    if source_code.endswith(("_MUSS_PUBLIC", "_MUSS_PRIVATE")):
        settings = DSFA_CHUNK_CONFIG["MUSS_LISTEN"]

    strategy = settings["strategy"]

    if strategy == "section_based":
        return chunk_by_sections(
            text,
            settings["section_markers"],
            settings["max_chunk_size"],
            settings["overlap"],
        )

    if strategy == "list_item":
        return chunk_by_list_items(
            text,
            settings["list_markers"],
            settings["max_chunk_size"],
        )

    # Any other strategy value falls through to plain recursive chunking.
    return chunk_text_recursive(
        text,
        settings["max_chunk_size"],
        settings["overlap"],
    )
|
||
|
||
|
||
# =============================================================================
|
||
# Attribution Functions
|
||
# =============================================================================
|
||
|
||
def generate_attribution_notice(results: List[DSFASearchResult]) -> str:
    """Generate a combined attribution notice for all used sources.

    Results are grouped by ``license_code``. For every license whose
    ``LICENSE_REGISTRY`` entry does not set ``attribution_required`` to a
    falsy value (licenses missing from the registry count as requiring
    attribution), one bullet line is emitted that lists the distinct source
    names followed by the license name, or the raw license code when the
    registry has no name for it.

    Args:
        results: Search results providing ``license_code`` and
            ``source_name`` attributes.

    Returns:
        The notice, starting with the heading ``"Quellennachweis:"``, or an
        empty string when no source requires attribution.
    """
    by_license: Dict[Any, list] = {}
    for result in results:
        by_license.setdefault(result.license_code, []).append(result)

    notices = []
    for license_code, items in by_license.items():
        license_info = LICENSE_REGISTRY.get(license_code, {})
        if not license_info.get("attribution_required", True):
            continue

        # dict.fromkeys() removes duplicates while keeping first-seen order.
        # A set would make the order of the listed sources vary between
        # runs, because string hashing is randomised per process.
        sources = ", ".join(dict.fromkeys(r.source_name for r in items))
        license_name = license_info.get("name", license_code)
        notices.append(f"• {sources} - {license_name}")

    if notices:
        return "Quellennachweis:\n" + "\n".join(notices)
    return ""
|
||
|
||
|
||
def get_license_label(license_code: str) -> str:
    """Return the display name registered for a license code.

    Falls back to ``license_code`` itself when the code is not present in
    ``LICENSE_REGISTRY`` or its entry has no ``"name"`` key.
    """
    return LICENSE_REGISTRY.get(license_code, {}).get("name", license_code)
|
||
|
||
|
||
# =============================================================================
|
||
# Main Functions
|
||
# =============================================================================
|
||
|
||
async def init_dsfa_tables(pool: asyncpg.Pool):
    """Initialize DSFA tables by running the SQL migration script.

    Reads ``migrations/003_dsfa_rag_tables.sql``, located relative to this
    module, and executes its content on a connection acquired from ``pool``.
    A confirmation line is printed afterwards.

    Args:
        pool: asyncpg connection pool used to run the migration.

    Raises:
        OSError: If the migration file cannot be opened or read.
    """
    migration_path = os.path.join(
        os.path.dirname(__file__),
        "migrations",
        "003_dsfa_rag_tables.sql",
    )

    # Read the script before acquiring a connection so that no pooled
    # connection is held during file I/O. The encoding is explicit so the
    # result does not depend on the process locale.
    # NOTE(review): assumes the migration file is stored as UTF-8 - confirm.
    with open(migration_path, "r", encoding="utf-8") as f:
        migration_sql = f.read()

    async with pool.acquire() as conn:
        await conn.execute(migration_sql)

    print("DSFA tables initialized successfully")
|
||
|
||
|
||
async def register_all_sources(pool: asyncpg.Pool):
    """Register every entry of ``DSFA_SOURCES`` in the database.

    Entries with a truthy ``migrated_to`` value are skipped. One line is
    printed per source, followed by a summary with the number of registered
    and skipped sources.

    Args:
        pool: asyncpg connection pool handed to ``DSFACorpusStore``.
    """
    corpus_store = DSFACorpusStore(pool)

    registered_count = 0
    skipped_count = 0

    for entry in DSFA_SOURCES:
        if not entry.get("migrated_to"):
            new_id = await corpus_store.register_source(entry)
            print(f"Registered source: {entry['source_code']} -> {new_id}")
            registered_count += 1
        else:
            print(f"Skipping migrated source: {entry['source_code']} -> {entry['migrated_to']}")
            skipped_count += 1

    print(f"\nTotal sources registered: {registered_count} (skipped {skipped_count} migrated)")
|
||
|
||
|
||
async def get_ingestion_status(pool: asyncpg.Pool):
    """Print the current ingestion status of the DSFA corpus.

    The report has two parts: a table with per-source document and chunk
    counts plus the last indexing time, taken from
    ``DSFACorpusStore.get_source_stats()``, and the collection statistics
    returned by ``DSFAQdrantService.get_stats()``, or its error message.

    Args:
        pool: asyncpg connection pool handed to ``DSFACorpusStore``.
    """
    corpus_store = DSFACorpusStore(pool)
    qdrant_service = DSFAQdrantService()
    divider = "-" * 80

    print("\n=== DSFA Corpus Status ===\n")

    # --- PostgreSQL: one table row per registered source ---
    source_stats = await corpus_store.get_source_stats()
    print("PostgreSQL Sources:")
    print(divider)
    print(f"{'Source Code':<25} {'Documents':>10} {'Chunks':>10} {'Last Indexed':<20}")
    print(divider)

    doc_total = 0
    chunk_total = 0
    for row in source_stats:
        doc_count = row.get("document_count", 0)
        chunk_count = row.get("chunk_count", 0)
        doc_total += doc_count
        chunk_total += chunk_count

        indexed_at = row.get("last_indexed_at")
        if indexed_at:
            indexed_label = indexed_at.strftime("%Y-%m-%d %H:%M")
        else:
            indexed_label = "Never"

        print(f"{row['source_code']:<25} {doc_count:>10} {chunk_count:>10} {indexed_label:<20}")

    print(divider)
    print(f"{'TOTAL':<25} {doc_total:>10} {chunk_total:>10}")

    # --- Qdrant: collection summary, or the reported error ---
    print("\nQdrant Collection:")
    collection_stats = await qdrant_service.get_stats()
    if "error" in collection_stats:
        print(f" Error: {collection_stats['error']}")
        return

    print(f" Collection: {collection_stats['collection']}")
    print(f" Points: {collection_stats['points_count']}")
    print(f" Status: {collection_stats['status']}")
|
||
|
||
|
||
async def main():
    """Command-line entry point of the ingestion pipeline.

    Parses the command-line flags, opens an asyncpg pool for
    ``DATABASE_URL`` and runs the selected actions in a fixed order: table
    initialisation, source registration, Qdrant collection setup, status
    report, single-source ingestion and bulk ingestion. The two ingestion
    actions only print a message; the ingestion itself is still marked as
    TODO. The pool is closed even when an action raises.
    """
    cli_options = (
        ("--init-sources", {"action": "store_true", "help": "Register all sources"}),
        ("--init-tables", {"action": "store_true", "help": "Initialize database tables"}),
        ("--ingest", {"type": str, "help": "Ingest specific source by code"}),
        ("--ingest-all", {"action": "store_true", "help": "Ingest all sources"}),
        ("--status", {"action": "store_true", "help": "Show ingestion status"}),
        ("--init-qdrant", {"action": "store_true", "help": "Initialize Qdrant collection"}),
    )

    parser = argparse.ArgumentParser(description="DSFA Corpus Ingestion Pipeline")
    for flag, options in cli_options:
        parser.add_argument(flag, **options)
    args = parser.parse_args()

    # Connect to database
    db_pool = await asyncpg.create_pool(DATABASE_URL)

    try:
        if args.init_tables:
            await init_dsfa_tables(db_pool)

        if args.init_sources:
            await register_all_sources(db_pool)

        if args.init_qdrant:
            qdrant_service = DSFAQdrantService()
            await qdrant_service.ensure_collection()
            print(f"Qdrant collection {DSFA_COLLECTION} initialized")

        if args.status:
            await get_ingestion_status(db_pool)

        if args.ingest:
            print(f"Ingesting source: {args.ingest}")
            # TODO: Implement document ingestion

        if args.ingest_all:
            print("Ingesting all sources...")
            # TODO: Implement bulk ingestion
    finally:
        await db_pool.close()
|
||
|
||
|
||
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    asyncio.run(main())
|