chore(qa): add PDF-based control QA scripts and results
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 36s
CI/CD / test-python-backend-compliance (push) Successful in 32s
CI/CD / test-python-document-crawler (push) Successful in 22s
CI/CD / test-python-dsms-gateway (push) Successful in 19s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 36s
CI/CD / test-python-backend-compliance (push) Successful in 32s
CI/CD / test-python-document-crawler (push) Successful in 22s
CI/CD / test-python-dsms-gateway (push) Successful in 19s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
QA pipeline that matches control source_original_text directly against the original PDF documents to verify article/paragraph assignments. Covers backfill, dedup, source normalization, Qdrant cleanup, and prod sync.

Key results (2026-03-20):
- 4,110/7,943 controls matched to PDF (100% for major EU regs)
- 3,366 article corrections, 705 new assignments
- 1,290 controls from Erwägungsgründe (preamble) identified
- 779 controls from Anhänge (annexes) identified

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
101
scripts/qa/qa_delete_gpsr_dupe.py
Normal file
101
scripts/qa/qa_delete_gpsr_dupe.py
Normal file
@@ -0,0 +1,101 @@
|
||||
"""
|
||||
Task 2: Delete duplicate GPSR document (eu_2023_988) from Qdrant.
|
||||
gpsr and eu_2023_988 are 100% identical (509/509 chunks).
|
||||
Keep gpsr, delete eu_2023_988.
|
||||
Also update any controls that reference eu_2023_988 to use gpsr instead.
|
||||
"""
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
|
||||
# Prefer httpx when it is installed and fall back to requests. Both expose a
# module-level ``post(url, json=..., timeout=...)`` whose response offers
# ``raise_for_status()`` and ``json()``, so a single wrapper serves both.
try:
    import httpx

    _post = httpx.post
except ImportError:
    import requests

    _post = requests.post


def http_post(url, data, timeout=30):
    """POST *data* as a JSON body to *url* and return the decoded JSON reply.

    Args:
        url: Full endpoint URL.
        data: JSON-serialisable request body.
        timeout: Request timeout in seconds.

    Returns:
        The response body parsed from JSON.

    Raises:
        The HTTP client's status error (httpx.HTTPStatusError or
        requests.HTTPError) when the server answers with a 4xx/5xx status.
        Without this check an error body would be parsed like a success and
        callers that default missing keys (e.g. a point count) would
        silently see 0.
    """
    response = _post(url, json=data, timeout=timeout)
    response.raise_for_status()
    return response.json()
|
||||
|
||||
from sqlalchemy import create_engine, text as sql_text
|
||||
|
||||
# Runtime configuration.
# --dry-run on the command line turns every destructive step into a report.
DRY_RUN = "--dry-run" in sys.argv

# Qdrant base URL; defaults to http://host.docker.internal:6333 when unset.
QDRANT_URL = os.environ.get("QDRANT_URL", "http://host.docker.internal:6333")

# DATABASE_URL is mandatory: a missing variable raises KeyError right here.
DB_URL = os.environ["DATABASE_URL"]

# Connections are opened with search_path set to compliance,public.
engine = create_engine(
    DB_URL,
    connect_args={"options": "-c search_path=compliance,public"},
)
|
||||
|
||||
# ── Step 1: Count eu_2023_988 points in Qdrant ──────────────────────
_banner = "=" * 60
print(_banner)
print("TASK 2: DELETE DUPLICATE GPSR (eu_2023_988) FROM QDRANT")
print(_banner)

# Exact count of all points whose regulation_id payload is eu_2023_988.
_count_url = f"{QDRANT_URL}/collections/bp_compliance_ce/points/count"
_count_body = {
    "filter": {
        "must": [{"key": "regulation_id", "match": {"value": "eu_2023_988"}}],
    },
    "exact": True,
}
count_resp = http_post(_count_url, _count_body)

# A reply without result/count is treated as zero points.
count = count_resp.get("result", {}).get("count", 0)
print(f" eu_2023_988 chunks in Qdrant: {count}")
|
||||
|
||||
# ── Step 2: Delete from Qdrant ───────────────────────────────────────
# Selects every point belonging to the duplicate document.
_dupe_filter = {
    "must": [{"key": "regulation_id", "match": {"value": "eu_2023_988"}}],
}

if DRY_RUN:
    print(f" [DRY RUN] Would delete {count} points")
elif count <= 0:
    # Live run with nothing to remove: say so instead of printing the
    # dry-run message, which would wrongly suggest no live run happened.
    print(" Nothing to delete: no eu_2023_988 points in Qdrant")
else:
    # wait=true makes Qdrant apply the deletion before it answers; without
    # it the operation may only be acknowledged and the verification count
    # below could still see the old points.
    del_resp = http_post(
        f"{QDRANT_URL}/collections/bp_compliance_ce/points/delete?wait=true",
        {"filter": _dupe_filter},
        timeout=60,
    )
    status = del_resp.get("status")
    print(f" Qdrant delete: {status}")

    # Verify: re-count the points matching the same filter.
    count_after = http_post(
        f"{QDRANT_URL}/collections/bp_compliance_ce/points/count",
        {"filter": _dupe_filter, "exact": True},
    )
    remaining = count_after.get("result", {}).get("count", 0)
    print(f" Remaining after delete: {remaining}")
|
||||
|
||||
# ── Step 3: Update DB references ─────────────────────────────────────
print("\n Updating DB references eu_2023_988 → gpsr...")

# Both tables are handled inside one transaction: engine.begin() commits on
# normal exit and rolls back if any statement raises.
with engine.begin() as conn:
    # Controls whose generation_metadata still names the duplicate document.
    controls_result = conn.execute(sql_text("""
        SELECT count(*) FROM compliance.canonical_controls
        WHERE generation_metadata->>'source_regulation' = 'eu_2023_988'
    """))
    ctrl_count = controls_result.scalar()
    print(f" Controls with eu_2023_988: {ctrl_count}")

    if not DRY_RUN and ctrl_count > 0:
        # Rewrite generation_metadata.source_regulation to the kept document.
        conn.execute(sql_text("""
            UPDATE compliance.canonical_controls
            SET generation_metadata = jsonb_set(
                COALESCE(generation_metadata, '{}'::jsonb),
                '{source_regulation}',
                '"gpsr"'
            )
            WHERE generation_metadata->>'source_regulation' = 'eu_2023_988'
        """))
        print(f" Updated {ctrl_count} controls: source_regulation → gpsr")

    # Processed chunks recorded under the duplicate regulation code.
    chunks_result = conn.execute(sql_text("""
        SELECT count(*) FROM compliance.canonical_processed_chunks
        WHERE regulation_code = 'eu_2023_988'
    """))
    chunk_count = chunks_result.scalar()
    print(f" Processed chunks with eu_2023_988: {chunk_count}")

    if not DRY_RUN and chunk_count > 0:
        conn.execute(sql_text("""
            UPDATE compliance.canonical_processed_chunks
            SET regulation_code = 'gpsr'
            WHERE regulation_code = 'eu_2023_988'
        """))
        print(f" Updated {chunk_count} processed_chunks: regulation_code → gpsr")

print(f"\n DRY RUN: {DRY_RUN}")
print(" DONE.")
|
||||
Reference in New Issue
Block a user