#!/usr/bin/env python3
"""
Qdrant Migration: Local → qdrant-dev.breakpilot.ai

Migrates compliance collections only (skips NIBIS/Lehrer collections).
Uses persistent HTTP sessions and rate limiting for hosted Qdrant.
"""
import os
import sys
import time

import requests

SOURCE_URL = "http://macmini:6333"
TARGET_URL = "https://qdrant-dev.breakpilot.ai"
# SECURITY NOTE: hard-coding credentials in source is unsafe. Prefer setting
# the QDRANT_API_KEY environment variable; the literal remains only as a
# backward-compatible fallback and should be rotated.
TARGET_API_KEY = os.environ.get("QDRANT_API_KEY", "z9cKbT74vl1aKPD1QGIlKWfET47VH93u")

BATCH_SIZE = 20
RATE_LIMIT_DELAY = 0.3  # seconds between batches

# requests.Session has no supported `.timeout` attribute — assigning one is
# silently ignored — so every request below passes `timeout=` explicitly.
SOURCE_TIMEOUT = 30   # seconds, local source instance
TARGET_TIMEOUT = 120  # seconds, hosted target instance

SKIP_COLLECTIONS = {
    "bp_nibis",
    "bp_nibis_eh",
    "bp_compliance_schulrecht",
    "bp_eh",
    "bp_vocab",
}

# Persistent sessions for connection reuse
source = requests.Session()
source.headers.update({"Content-Type": "application/json"})

target = requests.Session()
target.headers.update({
    "Content-Type": "application/json",
    "api-key": TARGET_API_KEY,
})


def log(msg):
    """Print an informational message with a green [INFO] tag."""
    print(f"\033[0;32m[INFO]\033[0m {msg}")


def warn(msg):
    """Print a warning message with a yellow [WARN] tag."""
    print(f"\033[1;33m[WARN]\033[0m {msg}")


def fail(msg):
    """Print a failure message with a red [FAIL] tag and exit with status 1."""
    print(f"\033[0;31m[FAIL]\033[0m {msg}")
    sys.exit(1)


def get_collections():
    """Return the names of all collections on the source instance.

    Raises:
        requests.HTTPError: if the source responds with an error status.
    """
    resp = source.get(f"{SOURCE_URL}/collections", timeout=SOURCE_TIMEOUT)
    resp.raise_for_status()
    return [c["name"] for c in resp.json()["result"]["collections"]]


def get_collection_info(url, session, col, timeout=SOURCE_TIMEOUT):
    """Fetch collection metadata (config, points_count) for `col`.

    Args:
        url: Base URL of the Qdrant instance.
        session: The requests.Session to use (carries auth headers).
        col: Collection name.
        timeout: Per-request timeout in seconds (new, defaults to the
            source timeout for backward compatibility).

    Returns:
        The parsed JSON response body.
    """
    resp = session.get(f"{url}/collections/{col}", timeout=timeout)
    resp.raise_for_status()
    return resp.json()


def create_collection(col, config):
    """Create `col` on the target, mirroring the source's vector params.

    Returns True if the collection was created (or already exists, in
    which case points will simply be upserted), False on failure.
    """
    params = config["result"]["config"]["params"]
    body = {"vectors": params["vectors"]}
    # Preserve sharding when the source defines it explicitly.
    if "shard_number" in params:
        body["shard_number"] = params["shard_number"]

    resp = target.put(f"{TARGET_URL}/collections/{col}", json=body,
                      timeout=TARGET_TIMEOUT)
    if resp.status_code == 200:
        return True

    # Check if already exists
    check = target.get(f"{TARGET_URL}/collections/{col}", timeout=TARGET_TIMEOUT)
    if check.status_code == 200 and check.json().get("status") == "ok":
        warn(" Collection already exists, will upsert points")
        return True

    warn(f" Failed to create collection: {resp.text}")
    return False


def scroll_points(col, offset=None, retries=3):
    """Scroll one batch of points (payload + vectors) from the source.

    Args:
        col: Collection name.
        offset: Opaque next-page offset from a previous call, or None
            to start from the beginning.
        retries: Attempts before giving up on connection/timeout errors.

    Returns:
        A (points, next_page_offset) tuple; next_page_offset is None on
        the last page. Exits the process via fail() if the source stays
        unreachable after all retries.
    """
    body = {
        "limit": BATCH_SIZE,
        "with_payload": True,
        "with_vector": True,
    }
    if offset is not None:
        body["offset"] = offset

    for attempt in range(retries):
        try:
            resp = source.post(
                f"{SOURCE_URL}/collections/{col}/points/scroll",
                json=body,
                timeout=60,
            )
            resp.raise_for_status()
            result = resp.json()["result"]
            return result["points"], result.get("next_page_offset")
        except (requests.exceptions.ConnectionError,
                requests.exceptions.Timeout) as e:
            warn(f" Source scroll error (attempt {attempt+1}/{retries}): {e}")
            # Linear backoff: 5s, 10s, 15s, ...
            time.sleep(5 * (attempt + 1))

    fail(f" Source unreachable after {retries} retries")


def upsert_points(col, points, retries=3):
    """Upsert a batch of points into the target collection.

    Retries on 502, timeouts, and connection errors with growing
    backoff; any other non-200 response fails immediately.

    Returns:
        True on success, False once all retries are exhausted or on a
        non-retryable error response.
    """
    for attempt in range(retries):
        try:
            # wait=true makes the write synchronous so a 200 means the
            # points are actually persisted before we advance the cursor.
            resp = target.put(
                f"{TARGET_URL}/collections/{col}/points?wait=true",
                json={"points": points},
                timeout=120,
            )
            if resp.status_code == 200:
                return True
            if resp.status_code == 502:
                warn(f" 502 Bad Gateway (attempt {attempt+1}/{retries}), waiting...")
                time.sleep(5 * (attempt + 1))
                continue
            warn(f" Upsert failed ({resp.status_code}): {resp.text[:200]}")
            return False
        except requests.exceptions.Timeout:
            warn(f" Timeout (attempt {attempt+1}/{retries}), waiting...")
            time.sleep(10 * (attempt + 1))
        except requests.exceptions.ConnectionError:
            warn(f" Connection error (attempt {attempt+1}/{retries}), waiting...")
            time.sleep(15 * (attempt + 1))
    return False


def migrate_collection(col):
    """Migrate one collection from source to target and verify counts.

    Scrolls the source in batches, upserting each batch; a failed batch
    is retried point-by-point so one bad point cannot sink the whole
    batch. Returns the number of points successfully migrated.
    """
    info = get_collection_info(SOURCE_URL, source, col)
    source_count = info["result"]["points_count"]
    log(f" Source points: {source_count}")

    if source_count == 0:
        warn(" Empty collection, skipping")
        return 0

    if not create_collection(col, info):
        return 0

    # Wait for collection to be ready
    time.sleep(3)

    offset = None
    migrated = 0
    failed = 0

    while True:
        points, next_offset = scroll_points(col, offset)
        if not points:
            break

        if upsert_points(col, points):
            migrated += len(points)
        else:
            # Retry one-by-one
            warn(f" Batch failed, retrying {len(points)} points individually...")
            for pt in points:
                time.sleep(RATE_LIMIT_DELAY)
                if upsert_points(col, [pt]):
                    migrated += 1
                else:
                    failed += 1

        # Single-line progress indicator (overwrites itself with \r).
        sys.stdout.write(f"\r Migrated: {migrated}/{source_count} (failed: {failed})")
        sys.stdout.flush()

        if next_offset is None:
            break
        offset = next_offset
        time.sleep(RATE_LIMIT_DELAY)

    print()

    # Verify
    try:
        target_info = get_collection_info(TARGET_URL, target, col,
                                          timeout=TARGET_TIMEOUT)
        target_count = target_info["result"]["points_count"]
        if target_count == source_count:
            log(f" Verified: {target_count}/{source_count} points")
        else:
            warn(f" Count mismatch! Source: {source_count}, Target: {target_count}")
    except Exception as e:
        # Verification is best-effort; the migration itself already succeeded.
        warn(f" Could not verify: {e}")

    return migrated


def main():
    """Run the migration: preflight both endpoints, then migrate every
    non-skipped collection and print a summary."""
    log(f"Source: {SOURCE_URL}")
    log(f"Target: {TARGET_URL}")
    log(f"Batch size: {BATCH_SIZE}, Rate limit: {RATE_LIMIT_DELAY}s")
    print()

    # Preflight
    try:
        source.get(f"{SOURCE_URL}/collections",
                   timeout=SOURCE_TIMEOUT).raise_for_status()
    except Exception:
        fail(f"Source not reachable at {SOURCE_URL}")
    try:
        target.get(f"{TARGET_URL}/collections",
                   timeout=TARGET_TIMEOUT).raise_for_status()
    except Exception:
        fail(f"Target not reachable at {TARGET_URL}")

    collections = get_collections()
    log("Collections on source:")
    for col in collections:
        if col in SKIP_COLLECTIONS:
            print(f" \033[1;33mSKIP\033[0m {col}")
        else:
            print(f" \033[0;32m OK\033[0m {col}")
    print()

    total_migrated = 0
    total_points = 0
    for col in collections:
        if col in SKIP_COLLECTIONS:
            continue
        log(f"=== Migrating: {col} ===")
        points = migrate_collection(col)
        total_migrated += 1
        total_points += points
        print()

    log("=" * 40)
    log("Migration complete!")
    log(f"Collections migrated: {total_migrated}")
    log(f"Total points migrated: {total_points}")
    log("=" * 40)


if __name__ == "__main__":
    main()