Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 36s
CI/CD / test-python-backend-compliance (push) Successful in 32s
CI/CD / test-python-document-crawler (push) Successful in 22s
CI/CD / test-python-dsms-gateway (push) Successful in 19s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
QA pipeline that matches control source_original_text directly against original PDF documents to verify article/paragraph assignments. Covers backfill, dedup, source normalization, Qdrant cleanup, and prod sync.

Key results (2026-03-20):
- 4,110/7,943 controls matched to PDF (100% for major EU regs)
- 3,366 article corrections, 705 new assignments
- 1,290 controls from Erwägungsgründe (preamble) identified
- 779 controls from Anhänge (annexes) identified

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
102 lines
3.9 KiB
Python
102 lines
3.9 KiB
Python
"""
|
|
Task 2: Delete duplicate GPSR document (eu_2023_988) from Qdrant.
gpsr and eu_2023_988 are 100% identical (509/509 chunks).
Keep gpsr, delete eu_2023_988.
Also update any controls that reference eu_2023_988 to use gpsr instead.
|
|
"""
|
|
import json
|
|
import os
|
|
import sys
|
|
|
|
# Prefer httpx when it is installed; fall back to requests otherwise.
# Both variants expose the same http_post(url, data, timeout) helper.
try:
    import httpx

    def http_post(url, data, timeout=30):
        """POST ``data`` as a JSON body to ``url`` and return the decoded JSON reply.

        Args:
            url: Full endpoint URL.
            data: JSON-serialisable request payload.
            timeout: Request timeout in seconds.

        Returns:
            The response body parsed from JSON.

        Raises:
            httpx.HTTPStatusError: If the server answers with a 4xx/5xx status.
        """
        response = httpx.post(url, json=data, timeout=timeout)
        # Fail loudly on HTTP errors: an error body would otherwise be
        # decoded like a normal result and read as "0 points" by callers.
        response.raise_for_status()
        return response.json()

except ImportError:
    import requests

    def http_post(url, data, timeout=30):
        """POST ``data`` as a JSON body to ``url`` and return the decoded JSON reply.

        Args:
            url: Full endpoint URL.
            data: JSON-serialisable request payload.
            timeout: Request timeout in seconds.

        Returns:
            The response body parsed from JSON.

        Raises:
            requests.HTTPError: If the server answers with a 4xx/5xx status.
        """
        response = requests.post(url, json=data, timeout=timeout)
        # Fail loudly on HTTP errors: an error body would otherwise be
        # decoded like a normal result and read as "0 points" by callers.
        response.raise_for_status()
        return response.json()
|
|
|
|
from sqlalchemy import create_engine, text as sql_text
|
|
|
|
# Runtime configuration.
# DATABASE_URL is mandatory (a missing variable raises KeyError);
# QDRANT_URL falls back to the Docker host address.
DB_URL = os.environ['DATABASE_URL']
QDRANT_URL = os.getenv('QDRANT_URL', 'http://host.docker.internal:6333')

# Every connection gets search_path set to the compliance schema first, then public.
engine = create_engine(
    DB_URL,
    connect_args={"options": "-c search_path=compliance,public"},
)

# With --dry-run on the command line the script only reports, it changes nothing.
DRY_RUN = sys.argv.count('--dry-run') > 0
|
|
|
|
# ── Step 1: Count eu_2023_988 points in Qdrant ──────────────────────
print("=" * 60)
print("TASK 2: DELETE DUPLICATE GPSR (eu_2023_988) FROM QDRANT")
print("=" * 60)

# One shared filter so that count, delete and verification always target
# exactly the same set of points.
duplicate_filter = {"must": [{"key": "regulation_id", "match": {"value": "eu_2023_988"}}]}
points_url = f"{QDRANT_URL}/collections/bp_compliance_ce/points"

count_resp = http_post(f"{points_url}/count", {"filter": duplicate_filter, "exact": True})
count = count_resp.get("result", {}).get("count", 0)
print(f" eu_2023_988 chunks in Qdrant: {count}")

# ── Step 2: Delete from Qdrant ───────────────────────────────────────
if DRY_RUN:
    print(f" [DRY RUN] Would delete {count} points")
elif count > 0:
    # wait=true makes Qdrant apply the deletion before responding; without
    # it the request may only be acknowledged and the verification count
    # below could still see the old points.
    del_resp = http_post(
        f"{points_url}/delete?wait=true",
        {"filter": duplicate_filter},
        timeout=60,
    )
    status = del_resp.get("status")
    print(f" Qdrant delete: {status}")

    # Verify that nothing matching the filter is left.
    count_after = http_post(f"{points_url}/count", {"filter": duplicate_filter, "exact": True})
    remaining = count_after.get("result", {}).get("count", 0)
    print(f" Remaining after delete: {remaining}")
    if remaining:
        print(f" WARNING: {remaining} eu_2023_988 points still present after delete")
else:
    # Real run, but there is nothing to remove. Do not label this as a dry run.
    print(" Nothing to delete: no eu_2023_988 points found")
|
|
|
|
# ── Step 3: Update DB references ─────────────────────────────────────
print("\n Updating DB references eu_2023_988 → gpsr...")

# Statements are prepared up front; they run inside one transaction below.
count_controls_stmt = sql_text("""
    SELECT count(*) FROM compliance.canonical_controls
    WHERE generation_metadata->>'source_regulation' = 'eu_2023_988'
""")
repoint_controls_stmt = sql_text("""
    UPDATE compliance.canonical_controls
    SET generation_metadata = jsonb_set(
        COALESCE(generation_metadata, '{}'::jsonb),
        '{source_regulation}',
        '"gpsr"'
    )
    WHERE generation_metadata->>'source_regulation' = 'eu_2023_988'
""")
count_chunks_stmt = sql_text("""
    SELECT count(*) FROM compliance.canonical_processed_chunks
    WHERE regulation_code = 'eu_2023_988'
""")
repoint_chunks_stmt = sql_text("""
    UPDATE compliance.canonical_processed_chunks
    SET regulation_code = 'gpsr'
    WHERE regulation_code = 'eu_2023_988'
""")

# In dry-run mode only the counting queries run; no UPDATE is issued.
apply_changes = not DRY_RUN

with engine.begin() as conn:
    # Controls whose generation metadata still names the duplicate regulation.
    ctrl_count = conn.execute(count_controls_stmt).scalar()
    print(f" Controls with eu_2023_988: {ctrl_count}")

    if ctrl_count > 0 and apply_changes:
        # Rewrite generation_metadata.source_regulation to "gpsr".
        conn.execute(repoint_controls_stmt)
        print(f" Updated {ctrl_count} controls: source_regulation → gpsr")

    # Processed chunks recorded under the duplicate regulation code.
    chunk_count = conn.execute(count_chunks_stmt).scalar()
    print(f" Processed chunks with eu_2023_988: {chunk_count}")

    if chunk_count > 0 and apply_changes:
        conn.execute(repoint_chunks_stmt)
        print(f" Updated {chunk_count} processed_chunks: regulation_code → gpsr")

print(f"\n DRY RUN: {DRY_RUN}")
print(" DONE.")
|