Files
breakpilot-compliance/scripts/qa/qa_delete_gpsr_dupe.py
Benjamin Admin 9b0f25c105
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 36s
CI/CD / test-python-backend-compliance (push) Successful in 32s
CI/CD / test-python-document-crawler (push) Successful in 22s
CI/CD / test-python-dsms-gateway (push) Successful in 19s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
chore(qa): add PDF-based control QA scripts and results
QA pipeline that matches control source_original_text directly against
original PDF documents to verify article/paragraph assignments. Covers
backfill, dedup, source normalization, Qdrant cleanup, and prod sync.

Key results (2026-03-20):
- 4,110/7,943 controls matched to PDF (100% for major EU regs)
- 3,366 article corrections, 705 new assignments
- 1,290 controls from Erwägungsgründe (preamble) identified
- 779 controls from Anhänge (annexes) identified

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-20 00:56:13 +01:00

102 lines
3.9 KiB
Python

"""
Task 2: Delete duplicate GPSR document (eu_2023_988) from Qdrant.
gpsr and eu_2023_988 are 100% identical (509/509 chunks).
Keep gpsr, delete eu_2023_988.
Also update any controls that reference eu_2023_988 to use gpsr instead.
"""
import json
import os
import sys
try:
    import httpx

    def http_post(url, data, timeout=30):
        """POST ``data`` as a JSON body to ``url`` (httpx) and return the decoded JSON reply.

        Args:
            url: Full endpoint URL.
            data: JSON-serialisable request payload.
            timeout: Request timeout in seconds.

        Raises:
            httpx.HTTPStatusError: on a 4xx/5xx response, so an error payload is
                never mistaken for a valid (empty) result by the caller.
        """
        resp = httpx.post(url, json=data, timeout=timeout)
        resp.raise_for_status()
        return resp.json()
except ImportError:
    # httpx is not installed: fall back to requests behind the same signature.
    import requests

    def http_post(url, data, timeout=30):
        """POST ``data`` as a JSON body to ``url`` (requests) and return the decoded JSON reply.

        Args:
            url: Full endpoint URL.
            data: JSON-serialisable request payload.
            timeout: Request timeout in seconds.

        Raises:
            requests.HTTPError: on a 4xx/5xx response, so an error payload is
                never mistaken for a valid (empty) result by the caller.
        """
        resp = requests.post(url, json=data, timeout=timeout)
        resp.raise_for_status()
        return resp.json()
from sqlalchemy import create_engine, text as sql_text
# Dry-run mode: enabled when --dry-run appears anywhere on the command line.
DRY_RUN = '--dry-run' in sys.argv
# Base URL of the Qdrant REST API; the environment variable overrides the default.
QDRANT_URL = os.getenv('QDRANT_URL', 'http://host.docker.internal:6333')
# DATABASE_URL is mandatory: a missing variable raises KeyError right here.
DB_URL = os.environ['DATABASE_URL']
# The "options" connect argument sets search_path=compliance,public on each connection.
engine = create_engine(
    DB_URL,
    connect_args={"options": "-c search_path=compliance,public"},
)
# ── Step 1: Count eu_2023_988 points in Qdrant ──────────────────────
print("=" * 60)
print("TASK 2: DELETE DUPLICATE GPSR (eu_2023_988) FROM QDRANT")
print("=" * 60)

# Count, delete and verification all target the same collection and filter,
# so both are built once instead of being repeated per request.
points_url = f"{QDRANT_URL}/collections/bp_compliance_ce/points"
dupe_filter = {"must": [{"key": "regulation_id", "match": {"value": "eu_2023_988"}}]}

count_resp = http_post(f"{points_url}/count", {"filter": dupe_filter, "exact": True})
# `or {}` also covers a reply whose "result" key is present but null.
count = (count_resp.get("result") or {}).get("count", 0)
print(f" eu_2023_988 chunks in Qdrant: {count}")

# ── Step 2: Delete from Qdrant ───────────────────────────────────────
if DRY_RUN:
    print(f" [DRY RUN] Would delete {count} points")
elif count == 0:
    # Real run with nothing to remove: say so instead of printing a dry-run message.
    print(" No eu_2023_988 points found in Qdrant; nothing to delete")
else:
    # wait=true makes Qdrant apply the deletion before answering; without it the
    # REST API only acknowledges the operation (per the Qdrant docs) and the
    # verification count below could still see the old points.
    del_resp = http_post(
        f"{points_url}/delete?wait=true",
        {"filter": dupe_filter},
        timeout=60,
    )
    status = del_resp.get("status")
    print(f" Qdrant delete: {status}")

    # Verify: re-run the exact count with the same filter.
    count_after = http_post(f"{points_url}/count", {"filter": dupe_filter, "exact": True})
    remaining = (count_after.get("result") or {}).get("count", 0)
    print(f" Remaining after delete: {remaining}")
# ── Step 3: Update DB references ─────────────────────────────────────
# The statements are named up front so the transaction body below reads as a
# plain sequence of "count, then update" steps.
_COUNT_CONTROLS_SQL = """
SELECT count(*) FROM compliance.canonical_controls
WHERE generation_metadata->>'source_regulation' = 'eu_2023_988'
"""

_UPDATE_CONTROLS_SQL = """
UPDATE compliance.canonical_controls
SET generation_metadata = jsonb_set(
COALESCE(generation_metadata, '{}'::jsonb),
'{source_regulation}',
'"gpsr"'
)
WHERE generation_metadata->>'source_regulation' = 'eu_2023_988'
"""

_COUNT_CHUNKS_SQL = """
SELECT count(*) FROM compliance.canonical_processed_chunks
WHERE regulation_code = 'eu_2023_988'
"""

_UPDATE_CHUNKS_SQL = """
UPDATE compliance.canonical_processed_chunks
SET regulation_code = 'gpsr'
WHERE regulation_code = 'eu_2023_988'
"""

print("\n Updating DB references eu_2023_988 → gpsr...")

# engine.begin() wraps both updates in a single transaction.
with engine.begin() as conn:
    # Controls whose generation_metadata.source_regulation names the duplicate.
    ctrl_count = conn.execute(sql_text(_COUNT_CONTROLS_SQL)).scalar()
    print(f" Controls with eu_2023_988: {ctrl_count}")
    if ctrl_count > 0 and not DRY_RUN:
        # Rewrite only the source_regulation key inside the JSONB document.
        conn.execute(sql_text(_UPDATE_CONTROLS_SQL))
        print(f" Updated {ctrl_count} controls: source_regulation → gpsr")

    # Processed chunks that carry the duplicate's regulation code.
    chunk_count = conn.execute(sql_text(_COUNT_CHUNKS_SQL)).scalar()
    print(f" Processed chunks with eu_2023_988: {chunk_count}")
    if chunk_count > 0 and not DRY_RUN:
        conn.execute(sql_text(_UPDATE_CHUNKS_SQL))
        print(f" Updated {chunk_count} processed_chunks: regulation_code → gpsr")

print(f"\n DRY RUN: {DRY_RUN}")
print(" DONE.")