#!/usr/bin/env python3
"""
Qdrant Duplicate Cleanup

Removes redundant/duplicate chunks from Qdrant collections. Targets:

1. bp_compliance_recht — entire collection (100% subset of bp_compliance_ce)
2. bp_compliance_gesetze — old versions where _komplett exists
3. bp_compliance_gesetze — BGB section extracts (subset of bgb_komplett)
4. bp_compliance_gesetze — AT law duplicates (renamed copies)
5. bp_compliance_gesetze — stubs (1 chunk placeholders)
6. bp_compliance_gesetze — EU regulations already in bp_compliance_ce
7. bp_compliance_gesetze — dual-naming duplicates (keep newer/longer version)
8. bp_compliance_datenschutz — EDPB/WP duplicate ingestions

Run with --dry-run to preview deletions without executing.
"""

import argparse
import json
import os
import sys
import time

import requests

# ─────────────────────────────────────────────────────────────────────────────
# Config — targets BOTH local and production Qdrant
# ─────────────────────────────────────────────────────────────────────────────
# SECURITY NOTE(review): the production api_key used to be hardcoded below.
# It is kept only as a backward-compatible fallback; prefer setting
# QDRANT_API_KEY in the environment, then rotate the key and drop the fallback.
TARGETS = {
    "local": {
        "url": "http://macmini:6333",
        "api_key": None,
    },
    "production": {
        "url": "https://qdrant-dev.breakpilot.ai",
        "api_key": os.environ.get(
            "QDRANT_API_KEY", "z9cKbT74vl1aKPD1QGIlKWfET47VH93u"
        ),
    },
}

# ─────────────────────────────────────────────────────────────────────────────
# Deletion plan — regulation_ids to remove per collection
# ─────────────────────────────────────────────────────────────────────────────

# 1. bp_compliance_recht: DELETE ENTIRE COLLECTION
#    All 9 regulation_ids are already in bp_compliance_ce with same or more chunks

# 2. bp_compliance_gesetze: old versions (keep _komplett)
GESETZE_OLD_VERSIONS = [
    "ao",     # ao_komplett has 9,669 chunks vs ao's 1,752
    "bdsg",   # bdsg_2018_komplett has 1,056 vs bdsg's 389
    "egbgb",  # egbgb_komplett has 1,412 vs egbgb's 269
    "hgb",    # hgb_komplett has 11,363 vs hgb's 1,937
]

# 3. bp_compliance_gesetze: BGB section extracts (subset of bgb_komplett 4,024 chunks)
GESETZE_BGB_EXTRACTS = [
    "bgb_agb",         # 94 chunks
    "bgb_digital",     # 42 chunks
    "bgb_fernabsatz",  # 71 chunks
    "bgb_kaufrecht",   # 147 chunks
    "bgb_widerruf",    # 50 chunks
]

# 4. bp_compliance_gesetze: AT law duplicates (renamed copies with identical chunks)
GESETZE_AT_DUPLICATES = [
    "at_abgb_agb",  # 2,521 chunks = exact copy of at_abgb
    "at_bao_ret",   # 2,246 chunks = exact copy of at_bao
    "at_ugb_ret",   # 2,828 chunks = exact copy of at_ugb
]

# 5. bp_compliance_gesetze: stubs (1 chunk, incomplete ingestions)
GESETZE_STUBS = [
    "de_uwg",    # 1 chunk (uwg has 157)
    "de_pangv",  # 1 chunk (pangv has 99)
    "de_bsig",   # 1 chunk (standalone stub)
]

# 6. bp_compliance_gesetze: EU regulations already fully in bp_compliance_ce
#    CE has equal or more chunks for all of these
GESETZE_EU_CROSS_COLLECTION = [
    "eu_2016_679",   # GDPR: 423 in both
    "eu_2024_1689",  # AI Act: 726 in both
    "eu_2024_2847",  # CRA: 429 in gesetze, 1365 in CE
    "eu_2022_2555",  # NIS2: 344 in gesetze, 342 in CE (near-identical)
    "eu_2023_1230",  # Machinery: 395 in gesetze, 1271 in CE
]

# 7. bp_compliance_gesetze: dual-naming (keep the longer/newer version)
GESETZE_DUAL_NAMING = [
    "tkg",             # 1,391 chunks — de_tkg has 1,631 (keep de_tkg)
    "ustg",            # 915 chunks — de_ustg_ret has 1,071 (keep de_ustg_ret)
    "ddg_5",           # 40 chunks — ddg has 189 (section extract)
    "egbgb_widerruf",  # 36 chunks — egbgb_komplett has 1,412 (section extract)
]

# 8. bp_compliance_datenschutz: EDPB/WP duplicate ingestions (keep the longer-named version)
DATENSCHUTZ_DUPLICATES = [
    "edpb_rtbf_05_2019",         # 111 chunks — edpb_right_to_be_forgotten_05_2019 has 111 (keep long name)
    "edpb_vva_02_2021",          # 273 chunks — edpb_virtual_voice_assistant_02_2021 has 273 (keep long name)
    "edpb_01_2020",              # 337 chunks — edpb_transfers_01_2020 has 337 (keep long name)
    "wp242_portability",         # 141 chunks — wp242_right_portability has 141 (keep long name)
    "wp250_breach",              # 201 chunks — wp251_data_breach has 201 (keep long name)
    "wp244_profiling",           # 247 chunks — wp251_profiling has 247 (keep long name)
    "edpb_legitimate_interest",  # 672 chunks — edpb_legitimate_interest_01_2024 has 336 (keep dated version)
]

# ─────────────────────────────────────────────────────────────────────────────
# All gesetze deletions combined
# ─────────────────────────────────────────────────────────────────────────────
ALL_GESETZE_DELETIONS = (
    GESETZE_OLD_VERSIONS
    + GESETZE_BGB_EXTRACTS
    + GESETZE_AT_DUPLICATES
    + GESETZE_STUBS
    + GESETZE_EU_CROSS_COLLECTION
    + GESETZE_DUAL_NAMING
)


# ── ANSI-colored console helpers ──
def log(msg):
    """Print a green [OK] line."""
    print(f"\033[0;32m[OK]\033[0m {msg}")


def warn(msg):
    """Print a yellow [WARN] line."""
    print(f"\033[1;33m[WARN]\033[0m {msg}")


def info(msg):
    """Print a cyan [INFO] line."""
    print(f"\033[0;36m[INFO]\033[0m {msg}")


def fail(msg):
    """Print a red [FAIL] line."""
    print(f"\033[0;31m[FAIL]\033[0m {msg}")


def make_session(target_config):
    """Create a requests session for the given target.

    Fix: the original set ``s.timeout = 60``, but requests ignores a
    ``timeout`` attribute on Session objects — timeouts are honored only as
    a per-request keyword argument, so in practice no timeout was applied
    and a hung Qdrant could block this script forever. A small Session
    subclass injects a 60 s default into every request instead.
    """

    class _TimeoutSession(requests.Session):
        # Default timeout (seconds) for every request unless the caller
        # passes an explicit timeout= keyword.
        default_timeout = 60

        def request(self, method, url, **kwargs):
            kwargs.setdefault("timeout", self.default_timeout)
            return super().request(method, url, **kwargs)

    s = _TimeoutSession()
    s.headers.update({"Content-Type": "application/json"})
    if target_config["api_key"]:
        s.headers.update({"api-key": target_config["api_key"]})
    return s


def count_by_regulation_id(session, url, collection, regulation_id):
    """Count points in a collection matching a regulation_id.

    Uses Qdrant's exact count endpoint. Returns the count on success,
    or -1 if the HTTP request failed (callers treat -1 as "count failed").
    """
    resp = session.post(
        f"{url}/collections/{collection}/points/count",
        json={
            "filter": {
                "must": [
                    {"key": "regulation_id", "match": {"value": regulation_id}}
                ]
            },
            "exact": True,
        },
    )
    if resp.status_code == 200:
        return resp.json().get("result", {}).get("count", 0)
    return -1
def count_collection(session, url, collection):
    """Get total point count for a collection, or -1 if missing/error."""
    resp = session.get(f"{url}/collections/{collection}")
    if resp.status_code == 200:
        return resp.json().get("result", {}).get("points_count", 0)
    return -1


def delete_by_regulation_id(session, url, collection, regulation_id, dry_run=True):
    """Delete all points in a collection matching a regulation_id.

    Returns the number of chunks deleted (or that WOULD be deleted in
    dry-run mode); 0 when nothing matched or a request failed.
    """
    count = count_by_regulation_id(session, url, collection, regulation_id)
    if count <= 0:
        # -1 means the count request itself failed; 0 means nothing to do.
        if count == 0:
            info(f" {collection}/{regulation_id}: 0 chunks (already clean)")
        else:
            warn(f" {collection}/{regulation_id}: count failed")
        return 0
    if dry_run:
        info(f" {collection}/{regulation_id}: {count} chunks (would delete)")
        return count
    resp = session.post(
        f"{url}/collections/{collection}/points/delete",
        json={
            "filter": {
                "must": [
                    {"key": "regulation_id", "match": {"value": regulation_id}}
                ]
            }
        },
    )
    if resp.status_code == 200:
        log(f" {collection}/{regulation_id}: {count} chunks deleted")
        return count
    warn(f" {collection}/{regulation_id}: delete failed ({resp.status_code}: {resp.text[:200]})")
    return 0


def delete_collection(session, url, collection, dry_run=True):
    """Delete an entire collection. Returns the number of chunks affected."""
    count = count_collection(session, url, collection)
    if count < 0:
        warn(f" {collection}: not found or error")
        return 0
    if dry_run:
        info(f" {collection}: {count} chunks total (would delete collection)")
        return count
    resp = session.delete(f"{url}/collections/{collection}")
    if resp.status_code == 200:
        log(f" {collection}: deleted ({count} chunks)")
        return count
    warn(f" {collection}: delete failed ({resp.status_code}: {resp.text[:200]})")
    return 0


def run_cleanup(target_name, target_config, dry_run=True):
    """Run the full cleanup for a single Qdrant target.

    Prints a per-step report and a final total. Step 1 is special (it drops
    a whole collection); steps 2-8 all share the same shape — delete a list
    of regulation_ids from one collection — so they are driven from a table
    instead of seven copy-pasted loops.
    """
    url = target_config["url"]
    session = make_session(target_config)

    print(f"\n{'='*60}")
    print(f"Target: {target_name} ({url})")
    print(f"Mode: {'DRY RUN' if dry_run else 'LIVE DELETE'}")
    print(f"{'='*60}")

    # Check connectivity and fetch collection names up front. The broad
    # except is deliberate: this is the top-level boundary for network,
    # HTTP-status, and JSON-shape errors — we log and bail out.
    try:
        resp = session.get(f"{url}/collections")
        resp.raise_for_status()
        collections = [c["name"] for c in resp.json().get("result", {}).get("collections", [])]
        info(f"Connected. Collections: {len(collections)}")
    except Exception as e:
        warn(f"Cannot connect to {url}: {e}")
        return

    total_deleted = 0

    # ── Step 1: Delete bp_compliance_recht (whole-collection drop) ──
    print(f"\n--- Step 1: Delete bp_compliance_recht (100% subset of CE) ---")
    if "bp_compliance_recht" in collections:
        total_deleted += delete_collection(session, url, "bp_compliance_recht", dry_run)
    else:
        info(" bp_compliance_recht: not found (already deleted)")

    # ── Steps 2-8: per-regulation deletions, table-driven ──
    # Each entry: (step title, extra header lines, collection, regulation_ids).
    # Built inside the function so module import needs no globals resolved.
    steps = [
        ("Step 2: Delete old versions in bp_compliance_gesetze",
         [" (ao, bdsg, egbgb, hgb — _komplett versions exist)"],
         "bp_compliance_gesetze", GESETZE_OLD_VERSIONS),
        ("Step 3: Delete BGB section extracts (bgb_komplett covers all)",
         [], "bp_compliance_gesetze", GESETZE_BGB_EXTRACTS),
        ("Step 4: Delete Austrian law duplicates",
         [], "bp_compliance_gesetze", GESETZE_AT_DUPLICATES),
        ("Step 5: Delete stub entries (1-chunk placeholders)",
         [], "bp_compliance_gesetze", GESETZE_STUBS),
        ("Step 6: Delete EU regulations from gesetze (keep CE)",
         [], "bp_compliance_gesetze", GESETZE_EU_CROSS_COLLECTION),
        ("Step 7: Delete dual-naming duplicates in gesetze",
         [], "bp_compliance_gesetze", GESETZE_DUAL_NAMING),
        ("Step 8: Delete EDPB/WP duplicate ingestions",
         [], "bp_compliance_datenschutz", DATENSCHUTZ_DUPLICATES),
    ]
    for title, extra_lines, collection, reg_ids in steps:
        # Headers are printed even when the collection is absent, matching
        # the original step-by-step report format.
        print(f"\n--- {title} ---")
        for line in extra_lines:
            print(line)
        if collection not in collections:
            continue
        for reg_id in reg_ids:
            total_deleted += delete_by_regulation_id(
                session, url, collection, reg_id, dry_run
            )
            if not dry_run:
                time.sleep(0.2)  # throttle live deletes; pointless in a dry run

    # ── Summary ──
    print(f"\n{'='*60}")
    action = "would be deleted" if dry_run else "deleted"
    print(f"Total chunks {action}: {total_deleted:,}")
    print(f"{'='*60}\n")


def main():
    """Parse CLI args, confirm live deletion, and clean each selected target."""
    parser = argparse.ArgumentParser(description="Qdrant duplicate chunk cleanup")
    parser.add_argument(
        "--dry-run",
        action="store_true",
        default=False,
        help="Preview deletions without executing (default: false)",
    )
    parser.add_argument(
        "--target",
        choices=["local", "production", "both"],
        default="both",
        help="Which Qdrant instance to clean (default: both)",
    )
    args = parser.parse_args()

    # Live mode is destructive — require an explicit typed confirmation.
    if not args.dry_run:
        print("\n" + "!" * 60)
        print(" WARNING: LIVE DELETE MODE — chunks will be permanently removed!")
        print("!" * 60)
        answer = input(" Type 'DELETE' to confirm: ")
        if answer != "DELETE":
            print(" Aborted.")
            sys.exit(0)

    targets = (
        TARGETS.items()
        if args.target == "both"
        else [(args.target, TARGETS[args.target])]
    )
    for name, config in targets:
        run_cleanup(name, config, dry_run=args.dry_run)


if __name__ == "__main__":
    main()