feat(rag): Migrate national DPA laws from bp_dsfa_corpus to bp_legal_corpus

Move 23 sources (18 national data protection laws + 5 EDPB guidelines/SCC)
from bp_dsfa_corpus to bp_legal_corpus with vector preservation. Extend
REGULATIONS array with national_law and eu_guideline types. Mark migrated
sources in dsfa_corpus_ingestion.py to prevent re-ingestion.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
BreakPilot Dev
2026-02-10 23:26:18 +01:00
parent f09e24d52c
commit e636b8cef8
3 changed files with 1289 additions and 24 deletions

View File

@@ -766,6 +766,326 @@ DSFA_SOURCES = [
"document_type": "standard",
"language": "de"
},
# === EDPB Ergaenzende Leitlinien ===
# MIGRATED to bp_legal_corpus via migrate_rag_chunks.py (2026-02-10)
{
"source_code": "EDPB_GUIDELINES_2_2019",
"name": "EDPB Leitlinien 2/2019 zu Art. 6(1)(b)",
"full_name": "EDPB Leitlinien 2/2019 zur Verarbeitung personenbezogener Daten auf Grundlage von Art. 6 Abs. 1 lit. b DSGVO",
"organization": "European Data Protection Board",
"source_url": "https://edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-22019-processing-personal-data-under-article-61b_en",
"license_code": "EDPB-LICENSE",
"attribution_text": "Source: EDPB Guidelines 2/2019, European Data Protection Board",
"document_type": "guideline",
"language": "en",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "EDPB_GUIDELINES_3_2019",
"name": "EDPB Leitlinien 3/2019 Videoueberwachung",
"full_name": "EDPB Leitlinien 3/2019 zur Verarbeitung personenbezogener Daten durch Videoueberwachung",
"organization": "European Data Protection Board",
"source_url": "https://edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-32019-processing-personal-data-through-video_en",
"license_code": "EDPB-LICENSE",
"attribution_text": "Source: EDPB Guidelines 3/2019, European Data Protection Board",
"document_type": "guideline",
"language": "en",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "EDPB_GUIDELINES_5_2020",
"name": "EDPB Leitlinien 5/2020 Einwilligung",
"full_name": "EDPB Leitlinien 5/2020 zur Einwilligung gemaess Verordnung 2016/679",
"organization": "European Data Protection Board",
"source_url": "https://edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-052020-consent-under-regulation-2016679_en",
"license_code": "EDPB-LICENSE",
"attribution_text": "Source: EDPB Guidelines 5/2020, European Data Protection Board",
"document_type": "guideline",
"language": "en",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "EDPB_GUIDELINES_7_2020",
"name": "EDPB Leitlinien 7/2020 Controller/Processor",
"full_name": "EDPB Leitlinien 7/2020 zu den Begriffen Verantwortlicher und Auftragsverarbeiter",
"organization": "European Data Protection Board",
"source_url": "https://edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-072020-concepts-controller-and-processor-gdpr_en",
"license_code": "EDPB-LICENSE",
"attribution_text": "Source: EDPB Guidelines 7/2020, European Data Protection Board",
"document_type": "guideline",
"language": "en",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "EDPB_GUIDELINES_1_2022",
"name": "EDPB Leitlinien 1/2022 Bussgelder",
"full_name": "EDPB Leitlinien 04/2022 zur Berechnung von Bussgeldern nach der DSGVO",
"organization": "European Data Protection Board",
"source_url": "https://edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-042022-calculation-administrative-fines-under-gdpr_en",
"license_code": "EDPB-LICENSE",
"attribution_text": "Source: EDPB Guidelines 04/2022, European Data Protection Board",
"document_type": "guideline",
"language": "en",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "SCC_FULL_TEXT",
"name": "Standard Contractual Clauses Volltext",
"full_name": "Standardvertragsklauseln fuer die Uebermittlung personenbezogener Daten an Drittlaender (2021/914/EU)",
"organization": "Europaeische Kommission",
"source_url": "https://eur-lex.europa.eu/eli/dec_impl/2021/914/oj",
"license_code": "PUBLIC_DOMAIN",
"attribution_text": "Quelle: SCC Volltext, Europaeische Kommission (EUR-Lex)",
"document_type": "regulation",
"language": "de",
"migrated_to": "bp_legal_corpus"
},
# === Nationale Datenschutzgesetze (DSGVO-Umsetzungen) ===
# MIGRATED to bp_legal_corpus via migrate_rag_chunks.py (2026-02-10)
# These sources are kept here for reference but will be skipped during ingestion.
# Ingestion should target bp_legal_corpus for these source codes.
{
"source_code": "BDSG_FULL",
"name": "BDSG Volltext (Deutschland)",
"full_name": "Bundesdatenschutzgesetz (BDSG) - Volltext inkl. aller Teile",
"organization": "Bundesrepublik Deutschland",
"source_url": "https://www.gesetze-im-internet.de/bdsg_2018/",
"license_code": "PUBLIC_DOMAIN",
"attribution_text": "Quelle: BDSG, Bundesrepublik Deutschland (gesetze-im-internet.de)",
"document_type": "legislation",
"language": "de",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "AT_DSG",
"name": "DSG Oesterreich",
"full_name": "Bundesgesetz zum Schutz natuerlicher Personen bei der Verarbeitung personenbezogener Daten (Datenschutzgesetz - DSG)",
"organization": "Republik Oesterreich",
"source_url": "https://www.ris.bka.gv.at/GeltendeFassung.wxe?Abfrage=Bundesnormen&Gesetzesnummer=10001597",
"license_code": "PUBLIC_DOMAIN",
"attribution_text": "Quelle: DSG, Republik Oesterreich (RIS)",
"document_type": "legislation",
"language": "de",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "CH_DSG",
"name": "DSG Schweiz (revDSG 2023)",
"full_name": "Bundesgesetz ueber den Datenschutz (Datenschutzgesetz, DSG) - revidierte Fassung 2023",
"organization": "Schweizerische Eidgenossenschaft",
"source_url": "https://www.fedlex.admin.ch/eli/cc/2022/491/de",
"license_code": "PUBLIC_DOMAIN",
"attribution_text": "Quelle: DSG, Schweizerische Eidgenossenschaft (Fedlex)",
"document_type": "legislation",
"language": "de",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "LI_DSG",
"name": "DSG Liechtenstein",
"full_name": "Datenschutzgesetz (DSG) Liechtenstein",
"organization": "Fuerstentum Liechtenstein",
"source_url": "https://www.gesetze.li/konso/2018.272",
"license_code": "PUBLIC_DOMAIN",
"attribution_text": "Quelle: DSG, Fuerstentum Liechtenstein (gesetze.li)",
"document_type": "legislation",
"language": "de",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "FR_CNIL_GUIDE",
"name": "CNIL Guide RGPD",
"full_name": "Guide pratique RGPD - Commission Nationale de l'Informatique et des Libertes",
"organization": "CNIL (France)",
"source_url": "https://www.cnil.fr/fr/rgpd-de-quoi-parle-t-on",
"license_code": "PUBLIC_DOMAIN",
"attribution_text": "Source: CNIL Guide RGPD, Commission Nationale de l'Informatique et des Libertes",
"document_type": "guideline",
"language": "fr",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "ES_LOPDGDD",
"name": "LOPDGDD Spanien",
"full_name": "Ley Organica de Proteccion de Datos Personales y garantia de los derechos digitales",
"organization": "Reino de Espana",
"source_url": "https://www.boe.es/buscar/act.php?id=BOE-A-2018-16673",
"license_code": "PUBLIC_DOMAIN",
"attribution_text": "Fuente: LOPDGDD, Reino de Espana (BOE)",
"document_type": "legislation",
"language": "es",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "IT_CODICE_PRIVACY",
"name": "Codice Privacy Italien",
"full_name": "Codice in materia di protezione dei dati personali (D.Lgs. 196/2003, aggiornato D.Lgs. 101/2018)",
"organization": "Repubblica Italiana",
"source_url": "https://www.garanteprivacy.it/home/docweb/-/docweb-display/docweb/9042678",
"license_code": "PUBLIC_DOMAIN",
"attribution_text": "Fonte: Codice Privacy, Garante per la protezione dei dati personali",
"document_type": "legislation",
"language": "it",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "NL_UAVG",
"name": "UAVG Niederlande",
"full_name": "Uitvoeringswet Algemene verordening gegevensbescherming (UAVG)",
"organization": "Koninkrijk der Nederlanden",
"source_url": "https://wetten.overheid.nl/BWBR0040940/",
"license_code": "PUBLIC_DOMAIN",
"attribution_text": "Bron: UAVG, Koninkrijk der Nederlanden (wetten.overheid.nl)",
"document_type": "legislation",
"language": "nl",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "BE_DPA_LAW",
"name": "Datenschutzgesetz Belgien",
"full_name": "Loi relative a la protection des personnes physiques a l'egard des traitements de donnees a caractere personnel",
"organization": "Royaume de Belgique",
"source_url": "https://www.ejustice.just.fgov.be/cgi_loi/change_lg.pl?language=fr&la=F&cn=2018073046",
"license_code": "PUBLIC_DOMAIN",
"attribution_text": "Source: Loi Protection des Donnees, Royaume de Belgique (eJustice)",
"document_type": "legislation",
"language": "fr",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "LU_DPA_LAW",
"name": "Datenschutzgesetz Luxemburg",
"full_name": "Loi du 1er aout 2018 portant organisation de la Commission nationale pour la protection des donnees",
"organization": "Grand-Duche de Luxembourg",
"source_url": "https://legilux.public.lu/eli/etat/leg/loi/2018/08/01/a686/jo",
"license_code": "PUBLIC_DOMAIN",
"attribution_text": "Source: Loi Protection des Donnees, Grand-Duche de Luxembourg (Legilux)",
"document_type": "legislation",
"language": "fr",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "IE_DPA_2018",
"name": "Data Protection Act 2018 Ireland",
"full_name": "Data Protection Act 2018 (Act No. 7 of 2018) - Ireland",
"organization": "Government of Ireland",
"source_url": "https://www.irishstatutebook.ie/eli/2018/act/7/enacted/en/html",
"license_code": "OGL-3.0",
"attribution_text": "Contains public sector information licensed under the Open Government Licence v3.0. Source: Data Protection Act 2018, Ireland",
"document_type": "legislation",
"language": "en",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "UK_DPA_2018",
"name": "Data Protection Act 2018 UK",
"full_name": "Data Protection Act 2018 (c. 12) - United Kingdom",
"organization": "Government of the United Kingdom",
"source_url": "https://www.legislation.gov.uk/ukpga/2018/12/contents/enacted",
"license_code": "OGL-3.0",
"attribution_text": "Contains public sector information licensed under the Open Government Licence v3.0. Source: Data Protection Act 2018, UK",
"document_type": "legislation",
"language": "en",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "UK_GDPR",
"name": "UK GDPR (retained EU law)",
"full_name": "United Kingdom General Data Protection Regulation (UK GDPR) - retained EU law",
"organization": "Government of the United Kingdom",
"source_url": "https://www.legislation.gov.uk/eur/2016/679/contents",
"license_code": "OGL-3.0",
"attribution_text": "Contains public sector information licensed under the Open Government Licence v3.0. Source: UK GDPR, legislation.gov.uk",
"document_type": "legislation",
"language": "en",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "NO_PERSONOPPLYSNINGSLOVEN",
"name": "Personopplysningsloven Norwegen",
"full_name": "Lov om behandling av personopplysninger (personopplysningsloven)",
"organization": "Kongeriket Norge",
"source_url": "https://lovdata.no/dokument/NL/lov/2018-06-15-38",
"license_code": "PUBLIC_DOMAIN",
"attribution_text": "Kilde: Personopplysningsloven, Kongeriket Norge (Lovdata)",
"document_type": "legislation",
"language": "no",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "SE_DATASKYDDSLAG",
"name": "Dataskyddslag Schweden",
"full_name": "Lag (2018:218) med kompletterande bestammelser till EU:s dataskyddsforordning",
"organization": "Konungariket Sverige",
"source_url": "https://www.riksdagen.se/sv/dokument-och-lagar/dokument/svensk-forfattningssamling/lag-2018218-med-kompletterande-bestammelser-till_sfs-2018-218/",
"license_code": "PUBLIC_DOMAIN",
"attribution_text": "Kalla: Dataskyddslag (2018:218), Konungariket Sverige (Riksdagen)",
"document_type": "legislation",
"language": "sv",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "DK_DATABESKYTTELSESLOVEN",
"name": "Databeskyttelsesloven Daenemark",
"full_name": "Lov om supplerende bestemmelser til forordning om beskyttelse af fysiske personer i forbindelse med behandling af personoplysninger",
"organization": "Kongeriget Danmark",
"source_url": "https://www.retsinformation.dk/eli/lta/2018/502",
"license_code": "PUBLIC_DOMAIN",
"attribution_text": "Kilde: Databeskyttelsesloven, Kongeriget Danmark (Retsinformation)",
"document_type": "legislation",
"language": "da",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "FI_TIETOSUOJALAKI",
"name": "Tietosuojalaki Finnland",
"full_name": "Tietosuojalaki (1050/2018) - Datenschutzgesetz Finnland",
"organization": "Suomen tasavalta",
"source_url": "https://www.finlex.fi/fi/laki/ajantasa/2018/20181050",
"license_code": "PUBLIC_DOMAIN",
"attribution_text": "Lahde: Tietosuojalaki, Suomen tasavalta (Finlex)",
"document_type": "legislation",
"language": "fi",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "PL_UODO",
"name": "UODO Polen",
"full_name": "Ustawa o ochronie danych osobowych - Datenschutzgesetz Polen",
"organization": "Rzeczpospolita Polska",
"source_url": "https://isap.sejm.gov.pl/isap.nsf/DocDetails.xsp?id=WDU20180001000",
"license_code": "PUBLIC_DOMAIN",
"attribution_text": "Zrodlo: Ustawa o ochronie danych osobowych, Rzeczpospolita Polska (ISAP)",
"document_type": "legislation",
"language": "pl",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "CZ_ZOU",
"name": "Zakon o ochrane osobnich udaju Tschechien",
"full_name": "Zakon c. 110/2019 Sb. o zpracovani osobnich udaju",
"organization": "Ceska republika",
"source_url": "https://www.zakonyprolidi.cz/cs/2019-110",
"license_code": "PUBLIC_DOMAIN",
"attribution_text": "Zdroj: Zakon o ochrane osobnich udaju, Ceska republika (zakonyprolidi.cz)",
"document_type": "legislation",
"language": "cs",
"migrated_to": "bp_legal_corpus"
},
{
"source_code": "HU_INFOTV",
"name": "Informacios torvenye Ungarn",
"full_name": "2011. evi CXII. torveny az informacios onrendelkezesi jogrol es az informacioszabadsagrol (Infotv.)",
"organization": "Magyarorszag",
"source_url": "https://njt.hu/jogszabaly/2011-112-00-00",
"license_code": "PUBLIC_DOMAIN",
"attribution_text": "Forras: Infotv., Magyarorszag (njt.hu)",
"document_type": "legislation",
"language": "hu",
"migrated_to": "bp_legal_corpus"
},
]
@@ -1100,7 +1420,7 @@ class DSFAQdrantService:
@property
def client(self) -> QdrantClient:
if self._client is None:
self._client = QdrantClient(url=self.url)
self._client = QdrantClient(url=self.url, check_compatibility=False)
return self._client
async def ensure_collection(self) -> bool:
@@ -1408,14 +1728,21 @@ async def init_dsfa_tables(pool: asyncpg.Pool):
async def register_all_sources(pool: asyncpg.Pool):
"""Register all DSFA sources in the database."""
"""Register all DSFA sources in the database (skips migrated sources)."""
store = DSFACorpusStore(pool)
registered = 0
skipped = 0
for source in DSFA_SOURCES:
if source.get("migrated_to"):
print(f"Skipping migrated source: {source['source_code']} -> {source['migrated_to']}")
skipped += 1
continue
source_id = await store.register_source(source)
print(f"Registered source: {source['source_code']} -> {source_id}")
registered += 1
print(f"\nTotal sources registered: {len(DSFA_SOURCES)}")
print(f"\nTotal sources registered: {registered} (skipped {skipped} migrated)")
async def get_ingestion_status(pool: asyncpg.Pool):

View File

@@ -0,0 +1,307 @@
"""
RAG Chunk Migration: bp_dsfa_corpus -> bp_legal_corpus
Verschiebt nationale Datenschutzgesetze und EU-Dokumente aus bp_dsfa_corpus
nach bp_legal_corpus. Vektoren werden 1:1 uebernommen (kein Re-Embedding).
Usage:
python migrate_rag_chunks.py # Dry run (default)
python migrate_rag_chunks.py --execute # Actually migrate
python migrate_rag_chunks.py --verify # Verify after migration
"""
import os
import sys
import argparse
from datetime import datetime, timezone
from typing import List, Dict, Any
from qdrant_client import QdrantClient
from qdrant_client.models import (
PointStruct, Filter, FieldCondition, MatchAny, ScrollRequest
)
# Configuration
QDRANT_URL = os.getenv("QDRANT_URL", "http://qdrant:6333")
SOURCE_COLLECTION = "bp_dsfa_corpus"
TARGET_COLLECTION = "bp_legal_corpus"
# Source codes to migrate from bp_dsfa_corpus -> bp_legal_corpus
SOURCES_TO_MIGRATE = [
# Nationale Datenschutzgesetze
"AT_DSG",
"BDSG_FULL",
"BE_DPA_LAW",
"CH_DSG",
"CZ_ZOU",
"ES_LOPDGDD",
"FI_TIETOSUOJALAKI",
"FR_CNIL_GUIDE",
"HU_INFOTV",
"IE_DPA_2018",
"IT_CODICE_PRIVACY",
"LI_DSG",
"NL_UAVG",
"NO_PERSONOPPLYSNINGSLOVEN",
"PL_UODO",
"SE_DATASKYDDSLAG",
"UK_DPA_2018",
"UK_GDPR",
# EU-Dokumente
"SCC_FULL_TEXT",
"EDPB_GUIDELINES_2_2019",
"EDPB_GUIDELINES_3_2019",
"EDPB_GUIDELINES_5_2020",
"EDPB_GUIDELINES_7_2020",
]
# Mapping: source_code -> regulation_type for bp_legal_corpus
REGULATION_TYPE_MAP = {
"AT_DSG": "national_law",
"BDSG_FULL": "de_law",
"BE_DPA_LAW": "national_law",
"CH_DSG": "national_law",
"CZ_ZOU": "national_law",
"ES_LOPDGDD": "national_law",
"FI_TIETOSUOJALAKI": "national_law",
"FR_CNIL_GUIDE": "national_law",
"HU_INFOTV": "national_law",
"IE_DPA_2018": "national_law",
"IT_CODICE_PRIVACY": "national_law",
"LI_DSG": "national_law",
"NL_UAVG": "national_law",
"NO_PERSONOPPLYSNINGSLOVEN": "national_law",
"PL_UODO": "national_law",
"SE_DATASKYDDSLAG": "national_law",
"UK_DPA_2018": "national_law",
"UK_GDPR": "national_law",
"SCC_FULL_TEXT": "eu_regulation",
"EDPB_GUIDELINES_2_2019": "eu_guideline",
"EDPB_GUIDELINES_3_2019": "eu_guideline",
"EDPB_GUIDELINES_5_2020": "eu_guideline",
"EDPB_GUIDELINES_7_2020": "eu_guideline",
}
# Mapping: source_code -> regulation_name for bp_legal_corpus
REGULATION_NAME_MAP = {
"AT_DSG": "DSG Oesterreich",
"BDSG_FULL": "BDSG",
"BE_DPA_LAW": "Datenschutzgesetz Belgien",
"CH_DSG": "DSG Schweiz",
"CZ_ZOU": "Zakon Tschechien",
"ES_LOPDGDD": "LOPDGDD Spanien",
"FI_TIETOSUOJALAKI": "Tietosuojalaki Finnland",
"FR_CNIL_GUIDE": "CNIL Guide RGPD",
"HU_INFOTV": "Infotv. Ungarn",
"IE_DPA_2018": "DPA 2018 Ireland",
"IT_CODICE_PRIVACY": "Codice Privacy Italien",
"LI_DSG": "DSG Liechtenstein",
"NL_UAVG": "UAVG Niederlande",
"NO_PERSONOPPLYSNINGSLOVEN": "Personopplysningsloven",
"PL_UODO": "UODO Polen",
"SE_DATASKYDDSLAG": "Dataskyddslag Schweden",
"UK_DPA_2018": "DPA 2018 UK",
"UK_GDPR": "UK GDPR",
"SCC_FULL_TEXT": "Standardvertragsklauseln",
"EDPB_GUIDELINES_2_2019": "EDPB GL 2/2019",
"EDPB_GUIDELINES_3_2019": "EDPB GL 3/2019",
"EDPB_GUIDELINES_5_2020": "EDPB GL 5/2020",
"EDPB_GUIDELINES_7_2020": "EDPB GL 7/2020",
}
def transform_payload(dsfa_payload: Dict[str, Any]) -> Dict[str, Any]:
"""Transform bp_dsfa_corpus payload to bp_legal_corpus format."""
source_code = dsfa_payload.get("source_code", "")
now = datetime.now(timezone.utc).isoformat()
return {
"text": dsfa_payload.get("content", ""),
"regulation_code": source_code,
"regulation_name": REGULATION_NAME_MAP.get(source_code, dsfa_payload.get("source_name", "")),
"regulation_full_name": dsfa_payload.get("source_name", ""),
"regulation_type": REGULATION_TYPE_MAP.get(source_code, "national_law"),
"source_url": dsfa_payload.get("source_url", ""),
"chunk_index": dsfa_payload.get("chunk_index", 0),
"chunk_position": dsfa_payload.get("chunk_position", 0),
"article": dsfa_payload.get("article", None),
"paragraph": dsfa_payload.get("paragraph", None),
"language": dsfa_payload.get("language", "de"),
"indexed_at": now,
"training_allowed": False,
}
def scroll_all_points(client: QdrantClient, collection: str, source_codes: List[str]) -> List:
"""Scroll through all points matching the source codes."""
all_points = []
offset = None
batch_size = 100
scroll_filter = Filter(
must=[
FieldCondition(
key="source_code",
match=MatchAny(any=source_codes),
)
]
)
while True:
results, next_offset = client.scroll(
collection_name=collection,
scroll_filter=scroll_filter,
limit=batch_size,
offset=offset,
with_vectors=True,
with_payload=True,
)
all_points.extend(results)
if next_offset is None:
break
offset = next_offset
return all_points
def migrate(execute: bool = False):
"""Run the migration."""
print(f"{'=' * 60}")
print(f"RAG Chunk Migration: {SOURCE_COLLECTION} -> {TARGET_COLLECTION}")
print(f"Mode: {'EXECUTE' if execute else 'DRY RUN'}")
print(f"{'=' * 60}")
print()
client = QdrantClient(url=QDRANT_URL)
# Get initial counts
source_info = client.get_collection(SOURCE_COLLECTION)
target_info = client.get_collection(TARGET_COLLECTION)
print(f"Before migration:")
print(f" {SOURCE_COLLECTION}: {source_info.points_count} points")
print(f" {TARGET_COLLECTION}: {target_info.points_count} points")
print()
# Scroll all points to migrate
print(f"Scrolling points for {len(SOURCES_TO_MIGRATE)} source codes...")
points = scroll_all_points(client, SOURCE_COLLECTION, SOURCES_TO_MIGRATE)
print(f" Found {len(points)} points to migrate")
print()
if not points:
print("No points found to migrate. Exiting.")
return
# Group by source_code for reporting
by_source: Dict[str, int] = {}
for p in points:
sc = p.payload.get("source_code", "UNKNOWN")
by_source[sc] = by_source.get(sc, 0) + 1
print("Points per source:")
for sc in sorted(by_source.keys()):
print(f" {sc}: {by_source[sc]} chunks")
print()
if not execute:
print("DRY RUN complete. Use --execute to actually migrate.")
return
# Transform and upsert in batches
batch_size = 50
upserted = 0
for i in range(0, len(points), batch_size):
batch = points[i:i + batch_size]
new_points = []
for p in batch:
new_payload = transform_payload(p.payload)
new_points.append(PointStruct(
id=p.id,
vector=p.vector,
payload=new_payload,
))
client.upsert(
collection_name=TARGET_COLLECTION,
points=new_points,
)
upserted += len(new_points)
print(f" Upserted {upserted}/{len(points)} points...")
print(f"\nUpsert complete: {upserted} points added to {TARGET_COLLECTION}")
# Delete from source collection
point_ids = [p.id for p in points]
for i in range(0, len(point_ids), 100):
batch_ids = point_ids[i:i + 100]
client.delete(
collection_name=SOURCE_COLLECTION,
points_selector=batch_ids,
)
print(f" Deleted {min(i + 100, len(point_ids))}/{len(point_ids)} from {SOURCE_COLLECTION}...")
print(f"\nDelete complete: {len(point_ids)} points removed from {SOURCE_COLLECTION}")
# Final counts
source_info = client.get_collection(SOURCE_COLLECTION)
target_info = client.get_collection(TARGET_COLLECTION)
print(f"\nAfter migration:")
print(f" {SOURCE_COLLECTION}: {source_info.points_count} points")
print(f" {TARGET_COLLECTION}: {target_info.points_count} points")
print(f"\nMigration complete!")
def verify():
"""Verify migration results."""
print(f"Verifying migration...")
client = QdrantClient(url=QDRANT_URL)
source_info = client.get_collection(SOURCE_COLLECTION)
target_info = client.get_collection(TARGET_COLLECTION)
print(f" {SOURCE_COLLECTION}: {source_info.points_count} points")
print(f" {TARGET_COLLECTION}: {target_info.points_count} points")
# Check that migrated sources are gone from dsfa
remaining = scroll_all_points(client, SOURCE_COLLECTION, SOURCES_TO_MIGRATE)
if remaining:
print(f"\n WARNING: {len(remaining)} points still in {SOURCE_COLLECTION}!")
by_source: Dict[str, int] = {}
for p in remaining:
sc = p.payload.get("source_code", "UNKNOWN")
by_source[sc] = by_source.get(sc, 0) + 1
for sc, cnt in sorted(by_source.items()):
print(f" {sc}: {cnt}")
else:
print(f"\n OK: No migrated sources remaining in {SOURCE_COLLECTION}")
# Check that migrated sources exist in legal
for code in SOURCES_TO_MIGRATE:
results, _ = client.scroll(
collection_name=TARGET_COLLECTION,
scroll_filter=Filter(
must=[FieldCondition(key="regulation_code", match=MatchAny(any=[code]))]
),
limit=1,
with_payload=True,
with_vectors=False,
)
status = f"{len(results)}+ chunks" if results else "MISSING"
print(f" {TARGET_COLLECTION}/{code}: {status}")
def main():
parser = argparse.ArgumentParser(description="Migrate RAG chunks between collections")
parser.add_argument("--execute", action="store_true", help="Actually execute the migration (default: dry run)")
parser.add_argument("--verify", action="store_true", help="Verify migration results")
args = parser.parse_args()
if args.verify:
verify()
else:
migrate(execute=args.execute)
if __name__ == "__main__":
main()