chore(qa): add PDF-based control QA scripts and results
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 36s
CI/CD / test-python-backend-compliance (push) Successful in 32s
CI/CD / test-python-document-crawler (push) Successful in 22s
CI/CD / test-python-dsms-gateway (push) Successful in 19s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 36s
CI/CD / test-python-backend-compliance (push) Successful in 32s
CI/CD / test-python-document-crawler (push) Successful in 22s
CI/CD / test-python-dsms-gateway (push) Successful in 19s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
QA pipeline that matches control source_original_text directly against the original PDF documents to verify article/paragraph assignments. Covers backfill, dedup, source normalization, Qdrant cleanup, and prod sync.

Key results (2026-03-20):
- 4,110/7,943 controls matched to PDF (100% for major EU regs)
- 3,366 article corrections, 705 new assignments
- 1,290 controls from Erwägungsgründe (preamble) identified
- 779 controls from Anhänge (annexes) identified

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
190
scripts/qa/qa_apply_and_dedup.py
Normal file
190
scripts/qa/qa_apply_and_dedup.py
Normal file
@@ -0,0 +1,190 @@
|
||||
"""
Step 3: Apply article mappings to all controls + detect duplicates.

1. Update source_citation article/paragraph for controls that have a better mapping
2. Identify duplicate controls (same regulation + article + paragraph)

Configuration:
    DATABASE_URL   environment variable with the database URL (required;
                   a missing variable raises KeyError at start-up)
    --dry-run      command-line flag, exposed as DRY_RUN

Input:
    /tmp/all_article_mappings.json, loaded as ``article_mapping``
"""
import json
import os
import sys
from collections import defaultdict

from sqlalchemy import create_engine, text as sql_text

DB_URL = os.environ['DATABASE_URL']
# Every connection starts with search_path set to "compliance,public".
engine = create_engine(DB_URL, connect_args={"options": "-c search_path=compliance,public"})
DRY_RUN = '--dry-run' in sys.argv

# Load mappings
# Read explicitly as UTF-8 so decoding does not depend on the locale default.
with open("/tmp/all_article_mappings.json", encoding="utf-8") as f:
    article_mapping = json.load(f)
print(f"Loaded {len(article_mapping)} article mappings")

print(f"\n{'=' * 70}")
print("STEP 3a: UPDATE CONTROLS WITH IMPROVED ARTICLE MAPPINGS")
print(f"{'=' * 70}")
|
||||
|
||||
def _apply_article_mappings(conn):
    """Step 3a: merge mapped article/paragraph values into differing controls.

    Loads the chunk→control links and the current citation of every
    non-rejected control that has a ``source_citation``, looks up the mapping
    for the control's chunk in ``article_mapping`` and, where it differs from
    the stored values, merges it into ``source_citation`` and
    ``generation_metadata``. With ``DRY_RUN`` set the UPDATE is skipped, but
    the counters are still computed and printed.

    Args:
        conn: open SQLAlchemy connection; the caller owns the transaction.
    """
    # Fast approach: load all chunk→control mappings at once
    print(" Loading chunk→control mappings...")
    chunk_rows = conn.execute(sql_text("""
        SELECT chunk_hash, jsonb_array_elements_text(generated_control_ids) as control_id
        FROM compliance.canonical_processed_chunks
        WHERE jsonb_array_length(COALESCE(generated_control_ids, '[]'::jsonb)) > 0
    """)).fetchall()

    # If a control id is listed under several chunks, the last row wins.
    control_to_hash = {control_id: chunk_hash for chunk_hash, control_id in chunk_rows}
    print(f" Unique controls with chunk: {len(control_to_hash)}")

    # Get current article info for controls with citations (skip v1/v2 without citation)
    print(" Loading control article data...")
    ctrl_rows = conn.execute(sql_text("""
        SELECT id,
               source_citation->>'article' as current_article,
               source_citation->>'paragraph' as current_paragraph
        FROM compliance.canonical_controls
        WHERE source_citation IS NOT NULL
          AND release_state NOT IN ('rejected')
    """)).fetchall()
    print(f" Controls with citation: {len(ctrl_rows)}")

    # Built once; only the bound parameters change per control.
    update_stmt = sql_text("""
        UPDATE compliance.canonical_controls
        SET source_citation = COALESCE(source_citation, '{}'::jsonb) || CAST(:citation AS jsonb),
            generation_metadata = COALESCE(generation_metadata, '{}'::jsonb) || CAST(:meta AS jsonb)
        WHERE id = :id
    """)

    updated = 0   # every control whose article or paragraph differs
    improved = 0  # subset: control had no article before
    changed = 0   # subset: control had a different article before

    for ctrl_uuid, stored_art, stored_para in ctrl_rows:
        current_art = stored_art or ""
        current_para = stored_para or ""

        chunk_hash = control_to_hash.get(str(ctrl_uuid))
        if not chunk_hash:
            continue

        mapping = article_mapping.get(chunk_hash)
        if not mapping or not mapping["article"]:
            continue

        new_art = mapping["article"]
        new_para = mapping["paragraph"]

        # Only update if it's an improvement. The mapped paragraph is compared
        # with the same "empty means ''" normalisation as the stored one, so an
        # empty-vs-NULL pair is not rewritten on every run.
        if current_art == new_art and current_para == (new_para or ""):
            continue

        # Paragraph-only differences count towards `updated` only.
        if not current_art and new_art:
            improved += 1
        elif current_art != new_art:
            changed += 1

        if not DRY_RUN:
            citation_patch = json.dumps({"article": new_art, "paragraph": new_para})
            meta_patch = json.dumps({"source_article": new_art, "source_paragraph": new_para})
            conn.execute(
                update_stmt,
                {"id": ctrl_uuid, "citation": citation_patch, "meta": meta_patch},
            )

        updated += 1

    print(f"\n Updated: {updated}")
    print(f" New article (was empty): {improved}")
    print(f" Changed article: {changed}")
    print(f" Dry run: {DRY_RUN}")


def _report_article_coverage(conn):
    """Step 3b: print, per source regulation, how many controls carry an article.

    Only regulations with at least three non-rejected controls are listed.

    Args:
        conn: open SQLAlchemy connection.
    """
    # ── Step 3b: Verification — article coverage after update ─────────
    print(f"\n{'=' * 70}")
    print("STEP 3b: ARTICLE COVERAGE AFTER UPDATE")
    print(f"{'=' * 70}")

    coverage_rows = conn.execute(sql_text("""
        SELECT
            generation_metadata->>'source_regulation' as reg,
            count(*) as total,
            count(*) FILTER (WHERE source_citation->>'article' != '' AND source_citation->>'article' IS NOT NULL) as with_art,
            count(*) FILTER (WHERE source_citation IS NULL) as no_cit
        FROM compliance.canonical_controls
        WHERE release_state NOT IN ('rejected')
        GROUP BY generation_metadata->>'source_regulation'
        HAVING count(*) >= 3
        ORDER BY count(*) DESC
    """)).fetchall()

    print(f"\n {'Regulation':35s} {'Total':>6s} {'WithArt':>7s} {'%':>5s}")
    print(f" {'-' * 60}")
    grand_total = 0
    grand_art = 0
    for reg_name, total, with_art, _no_cit in coverage_rows:
        reg = str(reg_name)[:35] if reg_name else "(none/v1v2)"
        pct = f"{with_art/total*100:.0f}%" if total > 0 else ""
        print(f" {reg:35s} {total:6d} {with_art:7d} {pct:>5s}")
        grand_total += total
        grand_art += with_art

    # The query can return no rows at all; avoid dividing by zero then.
    grand_pct = f"{grand_art/grand_total*100:.0f}%" if grand_total else "n/a"
    print(f"\n TOTAL: {grand_total} controls, {grand_art} with article ({grand_pct})")


def _find_duplicate_groups(conn):
    """Step 3c: return groups of controls sharing regulation, article and paragraph.

    Args:
        conn: open SQLAlchemy connection.

    Returns:
        A list of dicts with keys ``reg``, ``article``, ``paragraph``,
        ``count``, ``ids``, ``titles`` and ``states``, largest group first.
        The three per-control lists are ordered by ``created_at``.
    """
    # ── Step 3c: Duplicate analysis ──────────────────────────────────
    print(f"\n{'=' * 70}")
    print("STEP 3c: DUPLICATE CONTROLS (same reg + article + paragraph, >1)")
    print(f"{'=' * 70}")

    dup_rows = conn.execute(sql_text("""
        SELECT
            generation_metadata->>'source_regulation' as reg,
            source_citation->>'article' as article,
            source_citation->>'paragraph' as paragraph,
            count(*) as cnt,
            array_agg(id ORDER BY created_at) as ids,
            array_agg(title ORDER BY created_at) as titles,
            array_agg(release_state ORDER BY created_at) as states
        FROM compliance.canonical_controls
        WHERE release_state NOT IN ('rejected', 'too_close')
          AND source_citation->>'article' IS NOT NULL
          AND source_citation->>'article' != ''
        GROUP BY
            generation_metadata->>'source_regulation',
            source_citation->>'article',
            source_citation->>'paragraph'
        HAVING count(*) > 1
        ORDER BY count(*) DESC
    """)).fetchall()

    return [
        {
            "reg": reg,
            "article": article,
            "paragraph": paragraph,
            "count": cnt,
            "ids": [str(i) for i in ids],
            "titles": titles,
            "states": states,
        }
        for reg, article, paragraph, cnt, ids, titles, states in dup_rows
    ]


def _print_duplicate_report(dup_groups):
    """Print summary counts and the 30 largest duplicate groups.

    Args:
        dup_groups: list of group dicts as built by ``_find_duplicate_groups``.
    """
    total_dup_controls = sum(g["count"] for g in dup_groups)
    # Keep the oldest control of every group, so one per group is not removable.
    total_removable = total_dup_controls - len(dup_groups)

    print(f"\n Duplicate groups: {len(dup_groups)}")
    print(f" Controls in groups: {total_dup_controls}")
    print(f" Removable (keep oldest): {total_removable}")

    # Show top 30
    print(f"\n {'Reg':25s} {'Article':15s} {'Para':10s} {'Count':>5s}")
    print(f" {'-' * 60}")
    for g in dup_groups[:30]:
        reg, article, paragraph = str(g['reg']), str(g['article']), str(g['paragraph'])
        print(f" {reg:25s} {article:15s} {paragraph:10s} {g['count']:5d}")
        for i, title in enumerate(g['titles'][:3]):
            state = g['states'][i] if i < len(g['states']) else '?'
            marker = "KEEP" if i == 0 else "DUP "
            # A NULL title or state must not abort the report.
            print(f" [{marker}][{str(state):6s}] {(title or '')[:70]}")
        if g['count'] > 3:
            print(f" ... +{g['count'] - 3} more")


# One transaction for the update and both read-only reports: engine.begin()
# commits on normal exit and rolls back if an exception escapes the block.
with engine.begin() as conn:
    _apply_article_mappings(conn)
    _report_article_coverage(conn)
    dup_groups = _find_duplicate_groups(conn)

# Report and plan file are produced after the transaction block has exited, so
# a console or file-system error here cannot roll back the applied updates.
_print_duplicate_report(dup_groups)

# Save dedup plan
with open("/tmp/dedup_plan.json", "w", encoding="utf-8") as f:
    json.dump(dup_groups, f, indent=2, default=str)
print("\n Saved dedup plan to /tmp/dedup_plan.json")
|
||||
Reference in New Issue
Block a user