refactor: Consolidate standalone services into admin-v2, add new SDK modules

Remove standalone services (ai-compliance-sdk root, developer-portal,
dsms-gateway, dsms-node, night-scheduler) and legacy compliance/dsgvo pages.
Add new SDK pipeline modules (academy, document-crawler, dsb-portal,
incidents, whistleblower, reporting, sso, multi-tenant, industry-templates).
Add drafting engine, legal corpus files (AT/CH/DE), pitch-deck,
blog and Förderantrag pages.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-02-15 09:05:18 +01:00
parent 626f4966e2
commit 70f2b0ae64
396 changed files with 43163 additions and 80397 deletions

View File

@@ -0,0 +1,204 @@
"""
Country Metadata for EU/EWR Compliance Coverage.
Provides entry points (DPA URLs, legal portals, languages) for all EU/EWR countries.
DACH countries have full RAG coverage; others have entry-point metadata for system prompt injection.
Usage:
from country_metadata import COUNTRY_METADATA, get_country_context, DACH_COUNTRIES
"""
from typing import Dict, Optional
COUNTRY_METADATA: Dict[str, dict] = {
"EU": {
"country_name": "Europaeische Union",
"language_codes": ["de", "en", "fr"],
"legal_portal": {"name": "EUR-Lex", "url": "https://eur-lex.europa.eu/"},
"dpa": {"name": "European Data Protection Board (EDPB)", "url": "https://edpb.europa.eu/"},
"rag_coverage": "full",
"priority_modules": ["DSI", "VVT", "TOMs", "AGB", "Impressum", "Retention", "DSFA", "AVV", "Cookies"],
},
"DE": {
"country_name": "Deutschland",
"language_codes": ["de"],
"legal_portal": {"name": "Gesetze im Internet", "url": "https://www.gesetze-im-internet.de/"},
"dpa": {"name": "Datenschutzkonferenz (DSK)", "url": "https://www.datenschutzkonferenz-online.de/"},
"rag_coverage": "full",
"priority_modules": ["DSI", "VVT", "TOMs", "AGB", "Impressum", "Retention", "DSFA", "AVV", "Cookies"],
},
"AT": {
"country_name": "Oesterreich",
"language_codes": ["de"],
"legal_portal": {"name": "Rechtsinformationssystem (RIS)", "url": "https://www.ris.bka.gv.at/"},
"dpa": {"name": "Datenschutzbehoerde (DSB)", "url": "https://www.dsb.gv.at/"},
"rag_coverage": "full",
"priority_modules": ["DSI", "VVT", "TOMs", "AGB", "Impressum", "Retention", "DSFA", "AVV", "Cookies"],
},
"CH": {
"country_name": "Schweiz",
"language_codes": ["de", "fr", "it"],
"legal_portal": {"name": "Fedlex", "url": "https://www.fedlex.admin.ch/"},
"dpa": {"name": "EDOEB", "url": "https://www.edoeb.admin.ch/"},
"rag_coverage": "full",
"priority_modules": ["DSI", "VVT", "TOMs", "AGB", "Impressum", "Retention", "DSFA", "AVV", "Cookies"],
},
"FR": {
"country_name": "Frankreich",
"language_codes": ["fr"],
"legal_portal": {"name": "Legifrance", "url": "https://www.legifrance.gouv.fr/"},
"dpa": {"name": "CNIL", "url": "https://www.cnil.fr/"},
"rag_coverage": "entry_point",
"priority_modules": ["DSI", "VVT", "TOMs", "AGB", "Impressum", "Retention"],
},
"IT": {
"country_name": "Italien",
"language_codes": ["it"],
"legal_portal": {"name": "Normattiva", "url": "https://www.normattiva.it/"},
"dpa": {"name": "Garante per la protezione dei dati personali", "url": "https://www.garanteprivacy.it/"},
"rag_coverage": "entry_point",
"priority_modules": ["DSI", "VVT", "TOMs", "AGB", "Impressum", "Retention"],
},
"ES": {
"country_name": "Spanien",
"language_codes": ["es"],
"legal_portal": {"name": "BOE", "url": "https://www.boe.es/"},
"dpa": {"name": "AEPD", "url": "https://www.aepd.es/"},
"rag_coverage": "entry_point",
"priority_modules": ["DSI", "VVT", "TOMs", "AGB", "Impressum", "Retention"],
},
"NL": {
"country_name": "Niederlande",
"language_codes": ["nl"],
"legal_portal": {"name": "Overheid.nl", "url": "https://wetten.overheid.nl/"},
"dpa": {"name": "Autoriteit Persoonsgegevens", "url": "https://www.autoriteitpersoonsgegevens.nl/"},
"rag_coverage": "entry_point",
"priority_modules": ["DSI", "VVT", "TOMs", "AGB", "Impressum", "Retention"],
},
"BE": {
"country_name": "Belgien",
"language_codes": ["fr", "nl", "de"],
"legal_portal": {"name": "eJustice Belgium", "url": "https://www.ejustice.just.fgov.be/"},
"dpa": {"name": "Autorite de protection des donnees (APD)", "url": "https://www.autoriteprotectiondonnees.be/"},
"rag_coverage": "entry_point",
"priority_modules": ["DSI", "VVT", "TOMs", "AGB", "Impressum", "Retention"],
},
"LU": {
"country_name": "Luxemburg",
"language_codes": ["fr", "de"],
"legal_portal": {"name": "Legilux", "url": "https://legilux.public.lu/"},
"dpa": {"name": "CNPD", "url": "https://cnpd.public.lu/"},
"rag_coverage": "entry_point",
"priority_modules": ["DSI", "VVT", "TOMs", "AGB", "Impressum", "Retention"],
},
"IE": {
"country_name": "Irland",
"language_codes": ["en"],
"legal_portal": {"name": "Irish Statute Book", "url": "https://www.irishstatutebook.ie/"},
"dpa": {"name": "Data Protection Commission (DPC)", "url": "https://www.dataprotection.ie/"},
"rag_coverage": "entry_point",
"priority_modules": ["DSI", "VVT", "TOMs", "AGB", "Impressum", "Retention"],
},
"DK": {
"country_name": "Daenemark",
"language_codes": ["da"],
"legal_portal": {"name": "Retsinformation", "url": "https://www.retsinformation.dk/"},
"dpa": {"name": "Datatilsynet", "url": "https://www.datatilsynet.dk/"},
"rag_coverage": "entry_point",
"priority_modules": ["DSI", "VVT", "TOMs"],
},
"SE": {
"country_name": "Schweden",
"language_codes": ["sv"],
"legal_portal": {"name": "Riksdagen", "url": "https://www.riksdagen.se/"},
"dpa": {"name": "IMY (Integritetsskyddsmyndigheten)", "url": "https://www.imy.se/"},
"rag_coverage": "entry_point",
"priority_modules": ["DSI", "VVT", "TOMs"],
},
"FI": {
"country_name": "Finnland",
"language_codes": ["fi", "sv"],
"legal_portal": {"name": "Finlex", "url": "https://www.finlex.fi/"},
"dpa": {"name": "Tietosuojavaltuutetun toimisto", "url": "https://tietosuoja.fi/"},
"rag_coverage": "entry_point",
"priority_modules": ["DSI", "VVT", "TOMs"],
},
"NO": {
"country_name": "Norwegen",
"language_codes": ["no"],
"legal_portal": {"name": "Lovdata", "url": "https://lovdata.no/"},
"dpa": {"name": "Datatilsynet", "url": "https://www.datatilsynet.no/"},
"rag_coverage": "entry_point",
"priority_modules": ["DSI", "VVT", "TOMs"],
},
"IS": {
"country_name": "Island",
"language_codes": ["is"],
"legal_portal": {"name": "Althingi", "url": "https://www.althingi.is/"},
"dpa": {"name": "Personuvernd", "url": "https://www.personuvernd.is/"},
"rag_coverage": "entry_point",
"priority_modules": ["DSI", "VVT"],
},
"LI": {
"country_name": "Liechtenstein",
"language_codes": ["de"],
"legal_portal": {"name": "Gesetze.li", "url": "https://www.gesetze.li/"},
"dpa": {"name": "Datenschutzstelle", "url": "https://www.datenschutzstelle.li/"},
"rag_coverage": "entry_point",
"priority_modules": ["DSI", "VVT", "TOMs"],
},
"UK": {
"country_name": "Vereinigtes Koenigreich",
"language_codes": ["en"],
"legal_portal": {"name": "Legislation.gov.uk", "url": "https://www.legislation.gov.uk/"},
"dpa": {"name": "Information Commissioner's Office (ICO)", "url": "https://ico.org.uk/"},
"rag_coverage": "entry_point",
"priority_modules": ["DSI", "VVT", "TOMs", "AGB", "Impressum", "Retention"],
},
}
DACH_COUNTRIES = {"DE", "AT", "CH"}
def get_country_context(country_code: str) -> Optional[str]:
"""
Generate context string for system prompt injection.
For DACH countries: Full RAG search hint.
For other countries: Entry point with DPA and legal portal URLs.
"""
meta = COUNTRY_METADATA.get(country_code.upper())
if not meta:
return None
lines = [f"Land: {meta['country_name']} ({country_code.upper()})"]
lines.append(f"Sprachen: {', '.join(meta['language_codes'])}")
lines.append(f"Rechtsportal: {meta['legal_portal']['name']}{meta['legal_portal']['url']}")
lines.append(f"Datenschutzbehoerde: {meta['dpa']['name']}{meta['dpa']['url']}")
if meta["rag_coverage"] == "full":
lines.append(f"RAG-Abdeckung: VOLLSTAENDIG — Suche in bp_legal_corpus fuer nationale Gesetze verfuegbar.")
lines.append(f"Compliance-Module: {', '.join(meta['priority_modules'])}")
else:
lines.append(f"RAG-Abdeckung: Einstiegspunkt — Keine nationalen Gesetze im RAG. Verweise auf das Rechtsportal und die Datenschutzbehoerde.")
lines.append(f"Hinweis: Fuer detaillierte rechtliche Informationen zu {meta['country_name']} bitte das Rechtsportal oder die DPA konsultieren.")
return "\n".join(lines)
def get_all_countries_summary() -> str:
"""Generate a summary of all covered countries for overview display."""
lines = ["Laenderabdeckung EU/EWR:"]
lines.append("")
lines.append("VOLLSTAENDIG (DACH):")
for code in sorted(DACH_COUNTRIES):
meta = COUNTRY_METADATA[code]
lines.append(f" {code}: {meta['country_name']} — DPA: {meta['dpa']['name']}")
lines.append("")
lines.append("EINSTIEGSPUNKTE:")
for code, meta in sorted(COUNTRY_METADATA.items()):
if code not in DACH_COUNTRIES and code != "EU":
lines.append(f" {code}: {meta['country_name']} — DPA: {meta['dpa']['name']}")
return "\n".join(lines)

View File

@@ -766,6 +766,326 @@ DSFA_SOURCES = [
"document_type": "standard",
"language": "de"
},
# === EDPB Ergaenzende Leitlinien ===
# MIGRATED to bp_legal_corpus via migrate_rag_chunks.py (2026-02-10)
{
"source_code": "EDPB_GUIDELINES_2_2019",
"name": "EDPB Leitlinien 2/2019 zu Art. 6(1)(b)",
"full_name": "EDPB Leitlinien 2/2019 zur Verarbeitung personenbezogener Daten auf Grundlage von Art. 6 Abs. 1 lit. b DSGVO",
"organization": "European Data Protection Board",
"source_url": "https://edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-22019-processing-personal-data-under-article-61b_en",
"license_code": "EDPB-LICENSE",
"attribution_text": "Source: EDPB Guidelines 2/2019, European Data Protection Board",
"document_type": "guideline",
"language": "en",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "EDPB_GUIDELINES_3_2019",
"name": "EDPB Leitlinien 3/2019 Videoueberwachung",
"full_name": "EDPB Leitlinien 3/2019 zur Verarbeitung personenbezogener Daten durch Videoueberwachung",
"organization": "European Data Protection Board",
"source_url": "https://edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-32019-processing-personal-data-through-video_en",
"license_code": "EDPB-LICENSE",
"attribution_text": "Source: EDPB Guidelines 3/2019, European Data Protection Board",
"document_type": "guideline",
"language": "en",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "EDPB_GUIDELINES_5_2020",
"name": "EDPB Leitlinien 5/2020 Einwilligung",
"full_name": "EDPB Leitlinien 5/2020 zur Einwilligung gemaess Verordnung 2016/679",
"organization": "European Data Protection Board",
"source_url": "https://edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-052020-consent-under-regulation-2016679_en",
"license_code": "EDPB-LICENSE",
"attribution_text": "Source: EDPB Guidelines 5/2020, European Data Protection Board",
"document_type": "guideline",
"language": "en",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "EDPB_GUIDELINES_7_2020",
"name": "EDPB Leitlinien 7/2020 Controller/Processor",
"full_name": "EDPB Leitlinien 7/2020 zu den Begriffen Verantwortlicher und Auftragsverarbeiter",
"organization": "European Data Protection Board",
"source_url": "https://edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-072020-concepts-controller-and-processor-gdpr_en",
"license_code": "EDPB-LICENSE",
"attribution_text": "Source: EDPB Guidelines 7/2020, European Data Protection Board",
"document_type": "guideline",
"language": "en",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "EDPB_GUIDELINES_1_2022",
"name": "EDPB Leitlinien 1/2022 Bussgelder",
"full_name": "EDPB Leitlinien 04/2022 zur Berechnung von Bussgeldern nach der DSGVO",
"organization": "European Data Protection Board",
"source_url": "https://edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-042022-calculation-administrative-fines-under-gdpr_en",
"license_code": "EDPB-LICENSE",
"attribution_text": "Source: EDPB Guidelines 04/2022, European Data Protection Board",
"document_type": "guideline",
"language": "en",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "SCC_FULL_TEXT",
"name": "Standard Contractual Clauses Volltext",
"full_name": "Standardvertragsklauseln fuer die Uebermittlung personenbezogener Daten an Drittlaender (2021/914/EU)",
"organization": "Europaeische Kommission",
"source_url": "https://eur-lex.europa.eu/eli/dec_impl/2021/914/oj",
"license_code": "PUBLIC_DOMAIN",
"attribution_text": "Quelle: SCC Volltext, Europaeische Kommission (EUR-Lex)",
"document_type": "regulation",
"language": "de",
"migrated_to": "bp_legal_corpus"
},
# === Nationale Datenschutzgesetze (DSGVO-Umsetzungen) ===
# MIGRATED to bp_legal_corpus via migrate_rag_chunks.py (2026-02-10)
# These sources are kept here for reference but will be skipped during ingestion.
# Ingestion should target bp_legal_corpus for these source codes.
{
"source_code": "BDSG_FULL",
"name": "BDSG Volltext (Deutschland)",
"full_name": "Bundesdatenschutzgesetz (BDSG) - Volltext inkl. aller Teile",
"organization": "Bundesrepublik Deutschland",
"source_url": "https://www.gesetze-im-internet.de/bdsg_2018/",
"license_code": "PUBLIC_DOMAIN",
"attribution_text": "Quelle: BDSG, Bundesrepublik Deutschland (gesetze-im-internet.de)",
"document_type": "legislation",
"language": "de",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "AT_DSG",
"name": "DSG Oesterreich",
"full_name": "Bundesgesetz zum Schutz natuerlicher Personen bei der Verarbeitung personenbezogener Daten (Datenschutzgesetz - DSG)",
"organization": "Republik Oesterreich",
"source_url": "https://www.ris.bka.gv.at/GeltendeFassung.wxe?Abfrage=Bundesnormen&Gesetzesnummer=10001597",
"license_code": "PUBLIC_DOMAIN",
"attribution_text": "Quelle: DSG, Republik Oesterreich (RIS)",
"document_type": "legislation",
"language": "de",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "CH_DSG",
"name": "DSG Schweiz (revDSG 2023)",
"full_name": "Bundesgesetz ueber den Datenschutz (Datenschutzgesetz, DSG) - revidierte Fassung 2023",
"organization": "Schweizerische Eidgenossenschaft",
"source_url": "https://www.fedlex.admin.ch/eli/cc/2022/491/de",
"license_code": "PUBLIC_DOMAIN",
"attribution_text": "Quelle: DSG, Schweizerische Eidgenossenschaft (Fedlex)",
"document_type": "legislation",
"language": "de",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "LI_DSG",
"name": "DSG Liechtenstein",
"full_name": "Datenschutzgesetz (DSG) Liechtenstein",
"organization": "Fuerstentum Liechtenstein",
"source_url": "https://www.gesetze.li/konso/2018.272",
"license_code": "PUBLIC_DOMAIN",
"attribution_text": "Quelle: DSG, Fuerstentum Liechtenstein (gesetze.li)",
"document_type": "legislation",
"language": "de",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "FR_CNIL_GUIDE",
"name": "CNIL Guide RGPD",
"full_name": "Guide pratique RGPD - Commission Nationale de l'Informatique et des Libertes",
"organization": "CNIL (France)",
"source_url": "https://www.cnil.fr/fr/rgpd-de-quoi-parle-t-on",
"license_code": "PUBLIC_DOMAIN",
"attribution_text": "Source: CNIL Guide RGPD, Commission Nationale de l'Informatique et des Libertes",
"document_type": "guideline",
"language": "fr",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "ES_LOPDGDD",
"name": "LOPDGDD Spanien",
"full_name": "Ley Organica de Proteccion de Datos Personales y garantia de los derechos digitales",
"organization": "Reino de Espana",
"source_url": "https://www.boe.es/buscar/act.php?id=BOE-A-2018-16673",
"license_code": "PUBLIC_DOMAIN",
"attribution_text": "Fuente: LOPDGDD, Reino de Espana (BOE)",
"document_type": "legislation",
"language": "es",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "IT_CODICE_PRIVACY",
"name": "Codice Privacy Italien",
"full_name": "Codice in materia di protezione dei dati personali (D.Lgs. 196/2003, aggiornato D.Lgs. 101/2018)",
"organization": "Repubblica Italiana",
"source_url": "https://www.garanteprivacy.it/home/docweb/-/docweb-display/docweb/9042678",
"license_code": "PUBLIC_DOMAIN",
"attribution_text": "Fonte: Codice Privacy, Garante per la protezione dei dati personali",
"document_type": "legislation",
"language": "it",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "NL_UAVG",
"name": "UAVG Niederlande",
"full_name": "Uitvoeringswet Algemene verordening gegevensbescherming (UAVG)",
"organization": "Koninkrijk der Nederlanden",
"source_url": "https://wetten.overheid.nl/BWBR0040940/",
"license_code": "PUBLIC_DOMAIN",
"attribution_text": "Bron: UAVG, Koninkrijk der Nederlanden (wetten.overheid.nl)",
"document_type": "legislation",
"language": "nl",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "BE_DPA_LAW",
"name": "Datenschutzgesetz Belgien",
"full_name": "Loi relative a la protection des personnes physiques a l'egard des traitements de donnees a caractere personnel",
"organization": "Royaume de Belgique",
"source_url": "https://www.ejustice.just.fgov.be/cgi_loi/change_lg.pl?language=fr&la=F&cn=2018073046",
"license_code": "PUBLIC_DOMAIN",
"attribution_text": "Source: Loi Protection des Donnees, Royaume de Belgique (eJustice)",
"document_type": "legislation",
"language": "fr",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "LU_DPA_LAW",
"name": "Datenschutzgesetz Luxemburg",
"full_name": "Loi du 1er aout 2018 portant organisation de la Commission nationale pour la protection des donnees",
"organization": "Grand-Duche de Luxembourg",
"source_url": "https://legilux.public.lu/eli/etat/leg/loi/2018/08/01/a686/jo",
"license_code": "PUBLIC_DOMAIN",
"attribution_text": "Source: Loi Protection des Donnees, Grand-Duche de Luxembourg (Legilux)",
"document_type": "legislation",
"language": "fr",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "IE_DPA_2018",
"name": "Data Protection Act 2018 Ireland",
"full_name": "Data Protection Act 2018 (Act No. 7 of 2018) - Ireland",
"organization": "Government of Ireland",
"source_url": "https://www.irishstatutebook.ie/eli/2018/act/7/enacted/en/html",
"license_code": "OGL-3.0",
"attribution_text": "Contains public sector information licensed under the Open Government Licence v3.0. Source: Data Protection Act 2018, Ireland",
"document_type": "legislation",
"language": "en",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "UK_DPA_2018",
"name": "Data Protection Act 2018 UK",
"full_name": "Data Protection Act 2018 (c. 12) - United Kingdom",
"organization": "Government of the United Kingdom",
"source_url": "https://www.legislation.gov.uk/ukpga/2018/12/contents/enacted",
"license_code": "OGL-3.0",
"attribution_text": "Contains public sector information licensed under the Open Government Licence v3.0. Source: Data Protection Act 2018, UK",
"document_type": "legislation",
"language": "en",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "UK_GDPR",
"name": "UK GDPR (retained EU law)",
"full_name": "United Kingdom General Data Protection Regulation (UK GDPR) - retained EU law",
"organization": "Government of the United Kingdom",
"source_url": "https://www.legislation.gov.uk/eur/2016/679/contents",
"license_code": "OGL-3.0",
"attribution_text": "Contains public sector information licensed under the Open Government Licence v3.0. Source: UK GDPR, legislation.gov.uk",
"document_type": "legislation",
"language": "en",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "NO_PERSONOPPLYSNINGSLOVEN",
"name": "Personopplysningsloven Norwegen",
"full_name": "Lov om behandling av personopplysninger (personopplysningsloven)",
"organization": "Kongeriket Norge",
"source_url": "https://lovdata.no/dokument/NL/lov/2018-06-15-38",
"license_code": "PUBLIC_DOMAIN",
"attribution_text": "Kilde: Personopplysningsloven, Kongeriket Norge (Lovdata)",
"document_type": "legislation",
"language": "no",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "SE_DATASKYDDSLAG",
"name": "Dataskyddslag Schweden",
"full_name": "Lag (2018:218) med kompletterande bestammelser till EU:s dataskyddsforordning",
"organization": "Konungariket Sverige",
"source_url": "https://www.riksdagen.se/sv/dokument-och-lagar/dokument/svensk-forfattningssamling/lag-2018218-med-kompletterande-bestammelser-till_sfs-2018-218/",
"license_code": "PUBLIC_DOMAIN",
"attribution_text": "Kalla: Dataskyddslag (2018:218), Konungariket Sverige (Riksdagen)",
"document_type": "legislation",
"language": "sv",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "DK_DATABESKYTTELSESLOVEN",
"name": "Databeskyttelsesloven Daenemark",
"full_name": "Lov om supplerende bestemmelser til forordning om beskyttelse af fysiske personer i forbindelse med behandling af personoplysninger",
"organization": "Kongeriget Danmark",
"source_url": "https://www.retsinformation.dk/eli/lta/2018/502",
"license_code": "PUBLIC_DOMAIN",
"attribution_text": "Kilde: Databeskyttelsesloven, Kongeriget Danmark (Retsinformation)",
"document_type": "legislation",
"language": "da",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "FI_TIETOSUOJALAKI",
"name": "Tietosuojalaki Finnland",
"full_name": "Tietosuojalaki (1050/2018) - Datenschutzgesetz Finnland",
"organization": "Suomen tasavalta",
"source_url": "https://www.finlex.fi/fi/laki/ajantasa/2018/20181050",
"license_code": "PUBLIC_DOMAIN",
"attribution_text": "Lahde: Tietosuojalaki, Suomen tasavalta (Finlex)",
"document_type": "legislation",
"language": "fi",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "PL_UODO",
"name": "UODO Polen",
"full_name": "Ustawa o ochronie danych osobowych - Datenschutzgesetz Polen",
"organization": "Rzeczpospolita Polska",
"source_url": "https://isap.sejm.gov.pl/isap.nsf/DocDetails.xsp?id=WDU20180001000",
"license_code": "PUBLIC_DOMAIN",
"attribution_text": "Zrodlo: Ustawa o ochronie danych osobowych, Rzeczpospolita Polska (ISAP)",
"document_type": "legislation",
"language": "pl",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "CZ_ZOU",
"name": "Zakon o ochrane osobnich udaju Tschechien",
"full_name": "Zakon c. 110/2019 Sb. o zpracovani osobnich udaju",
"organization": "Ceska republika",
"source_url": "https://www.zakonyprolidi.cz/cs/2019-110",
"license_code": "PUBLIC_DOMAIN",
"attribution_text": "Zdroj: Zakon o ochrane osobnich udaju, Ceska republika (zakonyprolidi.cz)",
"document_type": "legislation",
"language": "cs",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "HU_INFOTV",
"name": "Informacios torvenye Ungarn",
"full_name": "2011. evi CXII. torveny az informacios onrendelkezesi jogrol es az informacioszabadsagrol (Infotv.)",
"organization": "Magyarorszag",
"source_url": "https://njt.hu/jogszabaly/2011-112-00-00",
"license_code": "PUBLIC_DOMAIN",
"attribution_text": "Forras: Infotv., Magyarorszag (njt.hu)",
"document_type": "legislation",
"language": "hu",
"migrated_to": "bp_legal_corpus"
},
]
@@ -1100,7 +1420,7 @@ class DSFAQdrantService:
@property
def client(self) -> QdrantClient:
if self._client is None:
self._client = QdrantClient(url=self.url)
self._client = QdrantClient(url=self.url, check_compatibility=False)
return self._client
async def ensure_collection(self) -> bool:
@@ -1408,14 +1728,21 @@ async def init_dsfa_tables(pool: asyncpg.Pool):
async def register_all_sources(pool: asyncpg.Pool):
"""Register all DSFA sources in the database."""
"""Register all DSFA sources in the database (skips migrated sources)."""
store = DSFACorpusStore(pool)
registered = 0
skipped = 0
for source in DSFA_SOURCES:
if source.get("migrated_to"):
print(f"Skipping migrated source: {source['source_code']} -> {source['migrated_to']}")
skipped += 1
continue
source_id = await store.register_source(source)
print(f"Registered source: {source['source_code']} -> {source_id}")
registered += 1
print(f"\nTotal sources registered: {len(DSFA_SOURCES)}")
print(f"\nTotal sources registered: {registered} (skipped {skipped} migrated)")
async def get_ingestion_status(pool: asyncpg.Pool):

View File

@@ -1,8 +1,9 @@
"""
Legal Corpus Ingestion for UCCA RAG Integration.
Indexes all 19 regulations from the Compliance Hub into Qdrant for
Indexes all regulations from the Compliance Hub into Qdrant for
semantic search during UCCA assessments and explanations.
Includes EU regulations, DACH national laws, and EDPB guidelines.
Collections:
- bp_legal_corpus: All regulation texts (GDPR, AI Act, CRA, BSI, etc.)
@@ -84,7 +85,7 @@ class Regulation:
requirement_count: int = 0
# All 19 regulations from Compliance Hub
# All regulations from Compliance Hub (EU + DACH national laws + guidelines)
REGULATIONS: List[Regulation] = [
Regulation(
code="GDPR",
@@ -323,6 +324,348 @@ REGULATIONS: List[Regulation] = [
celex="32023R1114",
requirement_count=149,
),
# =====================================================================
# DACH National Laws — Deutschland (P1)
# =====================================================================
Regulation(
code="DE_DDG",
name="Digitale-Dienste-Gesetz",
full_name="Digitale-Dienste-Gesetz (DDG)",
regulation_type="de_law",
source_url="https://www.gesetze-im-internet.de/ddg/",
description="Deutsches Umsetzungsgesetz zum DSA. Regelt Impressumspflicht (§5), Informationspflichten fuer digitale Dienste und Cookies.",
requirement_count=30,
),
Regulation(
code="DE_BGB_AGB",
name="BGB AGB-Recht",
full_name="BGB §§305-310, 312-312k — AGB und Fernabsatz",
regulation_type="de_law",
source_url="https://www.gesetze-im-internet.de/bgb/",
description="Deutsches AGB-Recht (§§305-310 BGB) und Fernabsatzrecht (§§312-312k BGB). Klauselverbote, Inhaltskontrolle, Widerrufsrecht, Button-Loesung.",
local_path="DE_BGB_AGB.txt",
requirement_count=40,
),
Regulation(
code="DE_EGBGB",
name="EGBGB Art. 246-248",
full_name="Einfuehrungsgesetz zum BGB — Informationspflichten",
regulation_type="de_law",
source_url="https://www.gesetze-im-internet.de/bgbeg/",
description="Informationspflichten bei Verbrauchervertraegen (Art. 246), Fernabsatz (Art. 246a), E-Commerce (Art. 246c).",
local_path="DE_EGBGB.txt",
requirement_count=20,
),
Regulation(
code="DE_UWG",
name="UWG Deutschland",
full_name="Gesetz gegen den unlauteren Wettbewerb (UWG)",
regulation_type="de_law",
source_url="https://www.gesetze-im-internet.de/uwg_2004/",
description="Unlauterer Wettbewerb: irrefuehrende Werbung, Spam-Verbot, Preisangaben, Online-Marketing-Regeln.",
requirement_count=25,
),
Regulation(
code="DE_HGB_RET",
name="HGB Aufbewahrung",
full_name="HGB §§238-261, 257 — Handelsbuecher und Aufbewahrungsfristen",
regulation_type="de_law",
source_url="https://www.gesetze-im-internet.de/hgb/",
description="Buchfuehrungspflicht, Aufbewahrungsfristen 6/10 Jahre, Anforderungen an elektronische Aufbewahrung.",
local_path="DE_HGB_RET.txt",
requirement_count=15,
),
Regulation(
code="DE_AO_RET",
name="AO Aufbewahrung",
full_name="Abgabenordnung §§140-148 — Steuerliche Aufbewahrungspflichten",
regulation_type="de_law",
source_url="https://www.gesetze-im-internet.de/ao_1977/",
description="Steuerliche Buchfuehrungs- und Aufbewahrungspflichten. 6/10 Jahre Fristen, Datenzugriff durch Finanzbehoerden.",
local_path="DE_AO_RET.txt",
requirement_count=12,
),
Regulation(
code="DE_TKG",
name="TKG 2021",
full_name="Telekommunikationsgesetz 2021",
regulation_type="de_law",
source_url="https://www.gesetze-im-internet.de/tkg_2021/",
description="Telekommunikationsregulierung: Kundenschutz, Datenschutz, Vertragslaufzeiten, Netzinfrastruktur.",
requirement_count=45,
),
# =====================================================================
# DACH National Laws — Oesterreich (P1)
# =====================================================================
Regulation(
code="AT_ECG",
name="E-Commerce-Gesetz AT",
full_name="E-Commerce-Gesetz (ECG) Oesterreich",
regulation_type="at_law",
source_url="https://www.ris.bka.gv.at/GeltendeFassung.wxe?Abfrage=Bundesnormen&Gesetzesnummer=20001703",
description="Oesterreichisches E-Commerce-Gesetz: Impressum/Offenlegungspflicht (§5), Informationspflichten, Haftung von Diensteanbietern.",
language="de",
requirement_count=30,
),
Regulation(
code="AT_TKG",
name="TKG 2021 AT",
full_name="Telekommunikationsgesetz 2021 Oesterreich",
regulation_type="at_law",
source_url="https://www.ris.bka.gv.at/GeltendeFassung.wxe?Abfrage=Bundesnormen&Gesetzesnummer=20011678",
description="Oesterreichisches TKG: Cookie-Bestimmungen (§165), Kommunikationsgeheimnis, Endgeraetezugriff.",
language="de",
requirement_count=40,
),
Regulation(
code="AT_KSCHG",
name="KSchG Oesterreich",
full_name="Konsumentenschutzgesetz (KSchG) Oesterreich",
regulation_type="at_law",
source_url="https://www.ris.bka.gv.at/GeltendeFassung.wxe?Abfrage=Bundesnormen&Gesetzesnummer=10002462",
description="Konsumentenschutz: AGB-Kontrolle (§6 Klauselverbote, §9 Verbandsklage), Ruecktrittsrecht, Informationspflichten.",
language="de",
requirement_count=35,
),
Regulation(
code="AT_FAGG",
name="FAGG Oesterreich",
full_name="Fern- und Auswaertsgeschaefte-Gesetz (FAGG) Oesterreich",
regulation_type="at_law",
source_url="https://www.ris.bka.gv.at/GeltendeFassung.wxe?Abfrage=Bundesnormen&Gesetzesnummer=20008847",
description="Fernabsatzrecht: Informationspflichten, Widerrufsrecht 14 Tage, Button-Loesung, Ausnahmen.",
language="de",
requirement_count=20,
),
Regulation(
code="AT_UGB_RET",
name="UGB Aufbewahrung AT",
full_name="UGB §§189-216, 212 — Rechnungslegung und Aufbewahrung Oesterreich",
regulation_type="at_law",
source_url="https://www.ris.bka.gv.at/GeltendeFassung.wxe?Abfrage=Bundesnormen&Gesetzesnummer=10001702",
description="Oesterreichische Rechnungslegungspflicht und Aufbewahrungsfristen (7 Jahre). Buchfuehrung, Jahresabschluss.",
local_path="AT_UGB_RET.txt",
language="de",
requirement_count=15,
),
Regulation(
code="AT_BAO_RET",
name="BAO §132 AT",
full_name="Bundesabgabenordnung §132 — Aufbewahrung Oesterreich",
regulation_type="at_law",
source_url="https://www.ris.bka.gv.at/GeltendeFassung.wxe?Abfrage=Bundesnormen&Gesetzesnummer=10003940",
description="Steuerliche Aufbewahrungspflicht 7 Jahre fuer Buecher, Aufzeichnungen und Belege. Grundstuecke 22 Jahre.",
language="de",
requirement_count=5,
),
Regulation(
code="AT_MEDIENG",
name="MedienG §§24-25 AT",
full_name="Mediengesetz §§24-25 Oesterreich — Impressum und Offenlegung",
regulation_type="at_law",
source_url="https://www.ris.bka.gv.at/GeltendeFassung.wxe?Abfrage=Bundesnormen&Gesetzesnummer=10000719",
description="Impressum/Offenlegungspflicht fuer periodische Medien und Websites in Oesterreich.",
language="de",
requirement_count=10,
),
# =====================================================================
# DACH National Laws — Schweiz (P1)
# =====================================================================
Regulation(
code="CH_DSV",
name="DSV Schweiz",
full_name="Datenschutzverordnung (DSV) Schweiz — SR 235.11",
regulation_type="ch_law",
source_url="https://www.fedlex.admin.ch/eli/cc/2022/568/de",
description="Ausfuehrungsverordnung zum revDSG: Meldepflichten, DSFA-Verfahren, Auslandtransfers, technische Massnahmen.",
language="de",
requirement_count=30,
),
Regulation(
code="CH_OR_AGB",
name="OR AGB/Aufbewahrung CH",
full_name="Obligationenrecht — AGB-Kontrolle und Aufbewahrung Schweiz (SR 220)",
regulation_type="ch_law",
source_url="https://www.fedlex.admin.ch/eli/cc/27/317_321_377/de",
description="Art. 8 OR (AGB-Inhaltskontrolle), Art. 19/20 (Vertragsfreiheit), Art. 957-958f (Buchfuehrung, 10 Jahre Aufbewahrung).",
local_path="CH_OR_AGB.txt",
language="de",
requirement_count=20,
),
Regulation(
code="CH_UWG",
name="UWG Schweiz",
full_name="Bundesgesetz gegen den unlauteren Wettbewerb Schweiz (SR 241)",
regulation_type="ch_law",
source_url="https://www.fedlex.admin.ch/eli/cc/1988/223_223_223/de",
description="Lauterkeitsrecht: Impressumspflicht, irrefuehrende Werbung, aggressive Verkaufsmethoden, AGB-Transparenz.",
language="de",
requirement_count=20,
),
Regulation(
code="CH_FMG",
name="FMG Schweiz",
full_name="Fernmeldegesetz Schweiz (SR 784.10)",
regulation_type="ch_law",
source_url="https://www.fedlex.admin.ch/eli/cc/1997/2187_2187_2187/de",
description="Telekommunikationsregulierung: Fernmeldegeheimnis, Cookies/Tracking (Art. 45c), Spam-Verbot, Datenschutz.",
language="de",
requirement_count=25,
),
# =====================================================================
# Deutschland P2
# =====================================================================
Regulation(
code="DE_PANGV",
name="PAngV",
full_name="Preisangabenverordnung (PAngV 2022)",
regulation_type="de_law",
source_url="https://www.gesetze-im-internet.de/pangv_2022/",
description="Preisangaben: Gesamtpreis, Grundpreis, Streichpreise (§11), Online-Preisauszeichnung.",
requirement_count=15,
),
Regulation(
code="DE_DLINFOV",
name="DL-InfoV",
full_name="Dienstleistungs-Informationspflichten-Verordnung",
regulation_type="de_law",
source_url="https://www.gesetze-im-internet.de/dlinfov/",
description="Informationspflichten fuer Dienstleister: Identitaet, Kontakt, Berufshaftpflicht, AGB-Zugang.",
requirement_count=10,
),
Regulation(
code="DE_BETRVG",
name="BetrVG §87",
full_name="Betriebsverfassungsgesetz §87 Abs.1 Nr.6",
regulation_type="de_law",
source_url="https://www.gesetze-im-internet.de/betrvg/",
description="Mitbestimmung bei technischer Ueberwachung: Betriebsrat-Beteiligung bei IT-Systemen, die Arbeitnehmerverhalten ueberwachen koennen.",
requirement_count=5,
),
# =====================================================================
# Oesterreich P2
# =====================================================================
Regulation(
code="AT_ABGB_AGB",
name="ABGB AGB-Recht AT",
full_name="ABGB §§861-879, 864a — AGB-Kontrolle Oesterreich",
regulation_type="at_law",
source_url="https://www.ris.bka.gv.at/GeltendeFassung.wxe?Abfrage=Bundesnormen&Gesetzesnummer=10001622",
description="Geltungskontrolle (§864a), Sittenwidrigkeitskontrolle (§879 Abs.3), allgemeine Vertragsregeln.",
local_path="AT_ABGB_AGB.txt",
language="de",
requirement_count=10,
),
Regulation(
code="AT_UWG",
name="UWG Oesterreich",
full_name="Bundesgesetz gegen den unlauteren Wettbewerb Oesterreich",
regulation_type="at_law",
source_url="https://www.ris.bka.gv.at/GeltendeFassung.wxe?Abfrage=Bundesnormen&Gesetzesnummer=10002665",
description="Lauterkeitsrecht AT: irrefuehrende Geschaeftspraktiken, aggressive Praktiken, Preisauszeichnung.",
language="de",
requirement_count=15,
),
# =====================================================================
# Schweiz P2
# =====================================================================
Regulation(
code="CH_GEBUV",
name="GeBuV Schweiz",
full_name="Geschaeftsbuecher-Verordnung Schweiz (SR 221.431)",
regulation_type="ch_law",
source_url="https://www.fedlex.admin.ch/eli/cc/2002/468_468_468/de",
description="Ausfuehrungsvorschriften zur Buchfuehrung: elektronische Aufbewahrung, Integritaet, Datentraeger.",
language="de",
requirement_count=10,
),
Regulation(
code="CH_ZERTES",
name="ZertES Schweiz",
full_name="Bundesgesetz ueber die elektronische Signatur (SR 943.03)",
regulation_type="ch_law",
source_url="https://www.fedlex.admin.ch/eli/cc/2016/752/de",
description="Elektronische Signatur und Zertifizierung: Qualifizierte Signaturen, Zertifizierungsdiensteanbieter.",
language="de",
requirement_count=10,
),
# =====================================================================
# Deutschland P3
# =====================================================================
Regulation(
code="DE_GESCHGEHG",
name="GeschGehG",
full_name="Gesetz zum Schutz von Geschaeftsgeheimnissen",
regulation_type="de_law",
source_url="https://www.gesetze-im-internet.de/geschgehg/",
description="Schutz von Geschaeftsgeheimnissen: Definition, angemessene Geheimhaltungsmassnahmen, Reverse Engineering.",
requirement_count=10,
),
Regulation(
code="DE_BSIG",
name="BSI-Gesetz",
full_name="Gesetz ueber das Bundesamt fuer Sicherheit in der Informationstechnik (BSIG)",
regulation_type="de_law",
source_url="https://www.gesetze-im-internet.de/bsig_2009/",
description="BSI-Aufgaben, KRITIS-Meldepflichten, IT-Sicherheitsstandards, Zertifizierung.",
requirement_count=20,
),
Regulation(
code="DE_USTG_RET",
name="UStG §14b",
full_name="Umsatzsteuergesetz §14b — Aufbewahrung von Rechnungen",
regulation_type="de_law",
source_url="https://www.gesetze-im-internet.de/ustg_1980/",
description="Aufbewahrungspflicht fuer Rechnungen: 10 Jahre, Grundstuecke 20 Jahre, elektronische Aufbewahrung.",
local_path="DE_USTG_RET.txt",
requirement_count=5,
),
# =====================================================================
# Schweiz P3
# =====================================================================
Regulation(
code="CH_ZGB_PERS",
name="ZGB Persoenlichkeitsschutz CH",
full_name="Zivilgesetzbuch Art. 28-28l — Persoenlichkeitsschutz Schweiz (SR 210)",
regulation_type="ch_law",
source_url="https://www.fedlex.admin.ch/eli/cc/24/233_245_233/de",
description="Persoenlichkeitsschutz: Recht am eigenen Bild, Schutz der Privatsphaere, Gegendarstellungsrecht.",
language="de",
requirement_count=8,
),
# =====================================================================
# 3 fehlgeschlagene Quellen mit alternativen URLs nachholen
# =====================================================================
Regulation(
code="LU_DPA_LAW",
name="Datenschutzgesetz Luxemburg",
full_name="Loi du 1er aout 2018 — Datenschutzgesetz Luxemburg",
regulation_type="national_law",
source_url="https://legilux.public.lu/eli/etat/leg/loi/2018/08/01/a686/jo",
description="Luxemburgisches Datenschutzgesetz: Organisation der CNPD, nationale DSGVO-Ergaenzung.",
language="fr",
requirement_count=40,
),
Regulation(
code="DK_DATABESKYTTELSESLOVEN",
name="Databeskyttelsesloven DK",
full_name="Databeskyttelsesloven — Datenschutzgesetz Daenemark",
regulation_type="national_law",
source_url="https://www.retsinformation.dk/eli/lta/2018/502",
description="Daenisches Datenschutzgesetz als ergaenzende Bestimmungen zur DSGVO. Reguliert durch Datatilsynet.",
language="da",
requirement_count=30,
),
Regulation(
code="EDPB_GUIDELINES_1_2022",
name="EDPB GL Bussgelder",
full_name="EDPB Leitlinien 04/2022 zur Berechnung von Bussgeldern nach der DSGVO",
regulation_type="eu_guideline",
source_url="https://www.edpb.europa.eu/system/files/2023-05/edpb_guidelines_042022_calculationofadministrativefines_en.pdf",
description="EDPB-Leitlinien zur Berechnung von Verwaltungsbussgeldern unter der DSGVO.",
language="en",
requirement_count=15,
),
]
@@ -887,7 +1230,7 @@ async def main():
import argparse
parser = argparse.ArgumentParser(description="Legal Corpus Ingestion for UCCA")
parser.add_argument("--ingest-all", action="store_true", help="Ingest all 19 regulations")
parser.add_argument("--ingest-all", action="store_true", help="Ingest all regulations")
parser.add_argument("--ingest", nargs="+", metavar="CODE", help="Ingest specific regulations by code")
parser.add_argument("--status", action="store_true", help="Show collection status")
parser.add_argument("--search", type=str, help="Test search query")
@@ -902,7 +1245,7 @@ async def main():
print(json.dumps(status, indent=2))
elif args.ingest_all:
print("Ingesting all 19 regulations...")
print(f"Ingesting all {len(REGULATIONS)} regulations...")
results = await ingestion.ingest_all()
print("\nResults:")
for code, count in results.items():

View File

@@ -0,0 +1,307 @@
"""
RAG Chunk Migration: bp_dsfa_corpus -> bp_legal_corpus
Verschiebt nationale Datenschutzgesetze und EU-Dokumente aus bp_dsfa_corpus
nach bp_legal_corpus. Vektoren werden 1:1 uebernommen (kein Re-Embedding).
Usage:
python migrate_rag_chunks.py # Dry run (default)
python migrate_rag_chunks.py --execute # Actually migrate
python migrate_rag_chunks.py --verify # Verify after migration
"""
import os
import sys
import argparse
from datetime import datetime, timezone
from typing import List, Dict, Any
from qdrant_client import QdrantClient
from qdrant_client.models import (
PointStruct, Filter, FieldCondition, MatchAny, ScrollRequest
)
# Configuration
QDRANT_URL = os.getenv("QDRANT_URL", "http://qdrant:6333")
SOURCE_COLLECTION = "bp_dsfa_corpus"
TARGET_COLLECTION = "bp_legal_corpus"
# Source codes to migrate from bp_dsfa_corpus -> bp_legal_corpus
SOURCES_TO_MIGRATE = [
# Nationale Datenschutzgesetze
"AT_DSG",
"BDSG_FULL",
"BE_DPA_LAW",
"CH_DSG",
"CZ_ZOU",
"ES_LOPDGDD",
"FI_TIETOSUOJALAKI",
"FR_CNIL_GUIDE",
"HU_INFOTV",
"IE_DPA_2018",
"IT_CODICE_PRIVACY",
"LI_DSG",
"NL_UAVG",
"NO_PERSONOPPLYSNINGSLOVEN",
"PL_UODO",
"SE_DATASKYDDSLAG",
"UK_DPA_2018",
"UK_GDPR",
# EU-Dokumente
"SCC_FULL_TEXT",
"EDPB_GUIDELINES_2_2019",
"EDPB_GUIDELINES_3_2019",
"EDPB_GUIDELINES_5_2020",
"EDPB_GUIDELINES_7_2020",
]
# Mapping: source_code -> regulation_type for bp_legal_corpus
REGULATION_TYPE_MAP = {
"AT_DSG": "national_law",
"BDSG_FULL": "de_law",
"BE_DPA_LAW": "national_law",
"CH_DSG": "national_law",
"CZ_ZOU": "national_law",
"ES_LOPDGDD": "national_law",
"FI_TIETOSUOJALAKI": "national_law",
"FR_CNIL_GUIDE": "national_law",
"HU_INFOTV": "national_law",
"IE_DPA_2018": "national_law",
"IT_CODICE_PRIVACY": "national_law",
"LI_DSG": "national_law",
"NL_UAVG": "national_law",
"NO_PERSONOPPLYSNINGSLOVEN": "national_law",
"PL_UODO": "national_law",
"SE_DATASKYDDSLAG": "national_law",
"UK_DPA_2018": "national_law",
"UK_GDPR": "national_law",
"SCC_FULL_TEXT": "eu_regulation",
"EDPB_GUIDELINES_2_2019": "eu_guideline",
"EDPB_GUIDELINES_3_2019": "eu_guideline",
"EDPB_GUIDELINES_5_2020": "eu_guideline",
"EDPB_GUIDELINES_7_2020": "eu_guideline",
}
# Mapping: source_code -> regulation_name for bp_legal_corpus
REGULATION_NAME_MAP = {
"AT_DSG": "DSG Oesterreich",
"BDSG_FULL": "BDSG",
"BE_DPA_LAW": "Datenschutzgesetz Belgien",
"CH_DSG": "DSG Schweiz",
"CZ_ZOU": "Zakon Tschechien",
"ES_LOPDGDD": "LOPDGDD Spanien",
"FI_TIETOSUOJALAKI": "Tietosuojalaki Finnland",
"FR_CNIL_GUIDE": "CNIL Guide RGPD",
"HU_INFOTV": "Infotv. Ungarn",
"IE_DPA_2018": "DPA 2018 Ireland",
"IT_CODICE_PRIVACY": "Codice Privacy Italien",
"LI_DSG": "DSG Liechtenstein",
"NL_UAVG": "UAVG Niederlande",
"NO_PERSONOPPLYSNINGSLOVEN": "Personopplysningsloven",
"PL_UODO": "UODO Polen",
"SE_DATASKYDDSLAG": "Dataskyddslag Schweden",
"UK_DPA_2018": "DPA 2018 UK",
"UK_GDPR": "UK GDPR",
"SCC_FULL_TEXT": "Standardvertragsklauseln",
"EDPB_GUIDELINES_2_2019": "EDPB GL 2/2019",
"EDPB_GUIDELINES_3_2019": "EDPB GL 3/2019",
"EDPB_GUIDELINES_5_2020": "EDPB GL 5/2020",
"EDPB_GUIDELINES_7_2020": "EDPB GL 7/2020",
}
def transform_payload(dsfa_payload: Dict[str, Any]) -> Dict[str, Any]:
"""Transform bp_dsfa_corpus payload to bp_legal_corpus format."""
source_code = dsfa_payload.get("source_code", "")
now = datetime.now(timezone.utc).isoformat()
return {
"text": dsfa_payload.get("content", ""),
"regulation_code": source_code,
"regulation_name": REGULATION_NAME_MAP.get(source_code, dsfa_payload.get("source_name", "")),
"regulation_full_name": dsfa_payload.get("source_name", ""),
"regulation_type": REGULATION_TYPE_MAP.get(source_code, "national_law"),
"source_url": dsfa_payload.get("source_url", ""),
"chunk_index": dsfa_payload.get("chunk_index", 0),
"chunk_position": dsfa_payload.get("chunk_position", 0),
"article": dsfa_payload.get("article", None),
"paragraph": dsfa_payload.get("paragraph", None),
"language": dsfa_payload.get("language", "de"),
"indexed_at": now,
"training_allowed": False,
}
def scroll_all_points(client: QdrantClient, collection: str, source_codes: List[str]) -> List:
"""Scroll through all points matching the source codes."""
all_points = []
offset = None
batch_size = 100
scroll_filter = Filter(
must=[
FieldCondition(
key="source_code",
match=MatchAny(any=source_codes),
)
]
)
while True:
results, next_offset = client.scroll(
collection_name=collection,
scroll_filter=scroll_filter,
limit=batch_size,
offset=offset,
with_vectors=True,
with_payload=True,
)
all_points.extend(results)
if next_offset is None:
break
offset = next_offset
return all_points
def migrate(execute: bool = False):
"""Run the migration."""
print(f"{'=' * 60}")
print(f"RAG Chunk Migration: {SOURCE_COLLECTION} -> {TARGET_COLLECTION}")
print(f"Mode: {'EXECUTE' if execute else 'DRY RUN'}")
print(f"{'=' * 60}")
print()
client = QdrantClient(url=QDRANT_URL)
# Get initial counts
source_info = client.get_collection(SOURCE_COLLECTION)
target_info = client.get_collection(TARGET_COLLECTION)
print(f"Before migration:")
print(f" {SOURCE_COLLECTION}: {source_info.points_count} points")
print(f" {TARGET_COLLECTION}: {target_info.points_count} points")
print()
# Scroll all points to migrate
print(f"Scrolling points for {len(SOURCES_TO_MIGRATE)} source codes...")
points = scroll_all_points(client, SOURCE_COLLECTION, SOURCES_TO_MIGRATE)
print(f" Found {len(points)} points to migrate")
print()
if not points:
print("No points found to migrate. Exiting.")
return
# Group by source_code for reporting
by_source: Dict[str, int] = {}
for p in points:
sc = p.payload.get("source_code", "UNKNOWN")
by_source[sc] = by_source.get(sc, 0) + 1
print("Points per source:")
for sc in sorted(by_source.keys()):
print(f" {sc}: {by_source[sc]} chunks")
print()
if not execute:
print("DRY RUN complete. Use --execute to actually migrate.")
return
# Transform and upsert in batches
batch_size = 50
upserted = 0
for i in range(0, len(points), batch_size):
batch = points[i:i + batch_size]
new_points = []
for p in batch:
new_payload = transform_payload(p.payload)
new_points.append(PointStruct(
id=p.id,
vector=p.vector,
payload=new_payload,
))
client.upsert(
collection_name=TARGET_COLLECTION,
points=new_points,
)
upserted += len(new_points)
print(f" Upserted {upserted}/{len(points)} points...")
print(f"\nUpsert complete: {upserted} points added to {TARGET_COLLECTION}")
# Delete from source collection
point_ids = [p.id for p in points]
for i in range(0, len(point_ids), 100):
batch_ids = point_ids[i:i + 100]
client.delete(
collection_name=SOURCE_COLLECTION,
points_selector=batch_ids,
)
print(f" Deleted {min(i + 100, len(point_ids))}/{len(point_ids)} from {SOURCE_COLLECTION}...")
print(f"\nDelete complete: {len(point_ids)} points removed from {SOURCE_COLLECTION}")
# Final counts
source_info = client.get_collection(SOURCE_COLLECTION)
target_info = client.get_collection(TARGET_COLLECTION)
print(f"\nAfter migration:")
print(f" {SOURCE_COLLECTION}: {source_info.points_count} points")
print(f" {TARGET_COLLECTION}: {target_info.points_count} points")
print(f"\nMigration complete!")
def verify():
"""Verify migration results."""
print(f"Verifying migration...")
client = QdrantClient(url=QDRANT_URL)
source_info = client.get_collection(SOURCE_COLLECTION)
target_info = client.get_collection(TARGET_COLLECTION)
print(f" {SOURCE_COLLECTION}: {source_info.points_count} points")
print(f" {TARGET_COLLECTION}: {target_info.points_count} points")
# Check that migrated sources are gone from dsfa
remaining = scroll_all_points(client, SOURCE_COLLECTION, SOURCES_TO_MIGRATE)
if remaining:
print(f"\n WARNING: {len(remaining)} points still in {SOURCE_COLLECTION}!")
by_source: Dict[str, int] = {}
for p in remaining:
sc = p.payload.get("source_code", "UNKNOWN")
by_source[sc] = by_source.get(sc, 0) + 1
for sc, cnt in sorted(by_source.items()):
print(f" {sc}: {cnt}")
else:
print(f"\n OK: No migrated sources remaining in {SOURCE_COLLECTION}")
# Check that migrated sources exist in legal
for code in SOURCES_TO_MIGRATE:
results, _ = client.scroll(
collection_name=TARGET_COLLECTION,
scroll_filter=Filter(
must=[FieldCondition(key="regulation_code", match=MatchAny(any=[code]))]
),
limit=1,
with_payload=True,
with_vectors=False,
)
status = f"{len(results)}+ chunks" if results else "MISSING"
print(f" {TARGET_COLLECTION}/{code}: {status}")
def main():
parser = argparse.ArgumentParser(description="Migrate RAG chunks between collections")
parser.add_argument("--execute", action="store_true", help="Actually execute the migration (default: dry run)")
parser.add_argument("--verify", action="store_true", help="Verify migration results")
args = parser.parse_args()
if args.verify:
verify()
else:
migrate(execute=args.execute)
if __name__ == "__main__":
main()