""" Task 2: Delete duplicate GPSR document (eu_2023_988) from Qdrant. gpsr and eu_2023_988 are 100% identical (509/509 chunks). Keep gpsr, delete eu_2023_988. Also update any controls that reference eu_2023_988 to use gpsr instead. """ import json import os import sys try: import httpx def http_post(url, data, timeout=30): return httpx.post(url, json=data, timeout=timeout).json() except ImportError: import requests def http_post(url, data, timeout=30): return requests.post(url, json=data, timeout=timeout).json() from sqlalchemy import create_engine, text as sql_text DB_URL = os.environ['DATABASE_URL'] QDRANT_URL = os.environ.get('QDRANT_URL', 'http://host.docker.internal:6333') engine = create_engine(DB_URL, connect_args={"options": "-c search_path=compliance,public"}) DRY_RUN = '--dry-run' in sys.argv # ── Step 1: Count eu_2023_988 points in Qdrant ────────────────────── print("=" * 60) print("TASK 2: DELETE DUPLICATE GPSR (eu_2023_988) FROM QDRANT") print("=" * 60) count_resp = http_post( f"{QDRANT_URL}/collections/bp_compliance_ce/points/count", {"filter": {"must": [{"key": "regulation_id", "match": {"value": "eu_2023_988"}}]}, "exact": True}, ) count = count_resp.get("result", {}).get("count", 0) print(f" eu_2023_988 chunks in Qdrant: {count}") # ── Step 2: Delete from Qdrant ─────────────────────────────────────── if not DRY_RUN and count > 0: del_resp = http_post( f"{QDRANT_URL}/collections/bp_compliance_ce/points/delete", {"filter": {"must": [{"key": "regulation_id", "match": {"value": "eu_2023_988"}}]}}, timeout=60, ) status = del_resp.get("status") print(f" Qdrant delete: {status}") # Verify count_after = http_post( f"{QDRANT_URL}/collections/bp_compliance_ce/points/count", {"filter": {"must": [{"key": "regulation_id", "match": {"value": "eu_2023_988"}}]}, "exact": True}, ) remaining = count_after.get("result", {}).get("count", 0) print(f" Remaining after delete: {remaining}") else: print(f" [DRY RUN] Would delete {count} points") # ── Step 3: Update DB references ───────────────────────────────────── print(f"\n Updating DB references eu_2023_988 → gpsr...") with engine.begin() as conn: # Check controls referencing eu_2023_988 r = conn.execute(sql_text(""" SELECT count(*) FROM compliance.canonical_controls WHERE generation_metadata->>'source_regulation' = 'eu_2023_988' """)) ctrl_count = r.scalar() print(f" Controls with eu_2023_988: {ctrl_count}") if ctrl_count > 0 and not DRY_RUN: # Update generation_metadata.source_regulation conn.execute(sql_text(""" UPDATE compliance.canonical_controls SET generation_metadata = jsonb_set( COALESCE(generation_metadata, '{}'::jsonb), '{source_regulation}', '"gpsr"' ) WHERE generation_metadata->>'source_regulation' = 'eu_2023_988' """)) print(f" Updated {ctrl_count} controls: source_regulation → gpsr") # Check processed_chunks r2 = conn.execute(sql_text(""" SELECT count(*) FROM compliance.canonical_processed_chunks WHERE regulation_code = 'eu_2023_988' """)) chunk_count = r2.scalar() print(f" Processed chunks with eu_2023_988: {chunk_count}") if chunk_count > 0 and not DRY_RUN: conn.execute(sql_text(""" UPDATE compliance.canonical_processed_chunks SET regulation_code = 'gpsr' WHERE regulation_code = 'eu_2023_988' """)) print(f" Updated {chunk_count} processed_chunks: regulation_code → gpsr") print(f"\n DRY RUN: {DRY_RUN}") print(" DONE.")