Files
breakpilot-compliance/scripts/qa/qa_apply_and_dedup.py
Benjamin Admin 9b0f25c105
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 36s
CI/CD / test-python-backend-compliance (push) Successful in 32s
CI/CD / test-python-document-crawler (push) Successful in 22s
CI/CD / test-python-dsms-gateway (push) Successful in 19s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
chore(qa): add PDF-based control QA scripts and results
QA pipeline that matches control source_original_text directly against
original PDF documents to verify article/paragraph assignments. Covers
backfill, dedup, source normalization, Qdrant cleanup, and prod sync.

Key results (2026-03-20):
- 4,110/7,943 controls matched to PDF (100% for major EU regs)
- 3,366 article corrections, 705 new assignments
- 1,290 controls from Erwägungsgründe (preamble) identified
- 779 controls from Anhänge (annexes) identified

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-20 00:56:13 +01:00

191 lines
7.1 KiB
Python

"""
Step 3: Apply article mappings to all controls + detect duplicates.
1. Update source_citation article/paragraph for controls that have a better mapping
2. Identify duplicate controls (same regulation + article + paragraph)
"""
import json
import os
import sys
from collections import defaultdict
from sqlalchemy import create_engine, text as sql_text
# Connection settings. DATABASE_URL must be present in the environment; a
# missing variable raises KeyError here, before any database work starts.
DB_URL = os.environ['DATABASE_URL']
# The search_path option is passed to the server on connect so the compliance
# schema is searched before public.
engine = create_engine(DB_URL, connect_args={"options": "-c search_path=compliance,public"})
# With --dry-run on the command line the script reports what it would change
# but issues no UPDATE statements.
DRY_RUN = '--dry-run' in sys.argv

# Load mappings (looked up by chunk hash further down). JSON is read
# explicitly as UTF-8 so that the result does not depend on the locale's
# default encoding.
with open("/tmp/all_article_mappings.json", encoding="utf-8") as f:
    article_mapping = json.load(f)
print(f"Loaded {len(article_mapping)} article mappings")
# Display limits for the duplicate report and the location of the saved plan.
MAX_DUP_GROUPS_SHOWN = 30
MAX_TITLES_PER_GROUP = 3
DEDUP_PLAN_PATH = "/tmp/dedup_plan.json"


def _apply_article_mappings(conn, mappings, dry_run):
    """Step 3a: patch article/paragraph on controls whose mapping differs.

    Args:
        conn: open SQLAlchemy connection (inside a transaction).
        mappings: dict keyed by chunk hash; each value provides "article"
            and "paragraph".
        dry_run: when true, count the changes but execute no UPDATE.
    """
    # Fast approach: load all chunk→control mappings at once
    print(" Loading chunk→control mappings...")
    chunk_rows = conn.execute(sql_text("""
        SELECT chunk_hash, jsonb_array_elements_text(generated_control_ids) as control_id
        FROM compliance.canonical_processed_chunks
        WHERE jsonb_array_length(COALESCE(generated_control_ids, '[]'::jsonb)) > 0
    """)).fetchall()
    # When several chunks list the same control, the last row read wins.
    control_to_hash = {row[1]: row[0] for row in chunk_rows}
    print(f" Unique controls with chunk: {len(control_to_hash)}")

    # Get current article info for controls with citations (skip v1/v2 without citation)
    print(" Loading control article data...")
    ctrl_rows = conn.execute(sql_text("""
        SELECT id,
               source_citation->>'article' as current_article,
               source_citation->>'paragraph' as current_paragraph
        FROM compliance.canonical_controls
        WHERE source_citation IS NOT NULL
          AND release_state NOT IN ('rejected')
    """)).fetchall()
    print(f" Controls with citation: {len(ctrl_rows)}")

    updated = 0
    improved = 0
    changed = 0
    for ctrl_pk, article, paragraph in ctrl_rows:
        current_art = article or ""
        current_para = paragraph or ""

        chunk_hash = control_to_hash.get(str(ctrl_pk))
        if not chunk_hash:
            continue
        mapping = mappings.get(chunk_hash)
        if not mapping or not mapping["article"]:
            continue
        new_art = mapping["article"]
        new_para = mapping["paragraph"]

        # Nothing to do when the stored citation already matches the mapping.
        if current_art == new_art and current_para == new_para:
            continue

        if not current_art:
            improved += 1
        elif current_art != new_art:
            changed += 1
        # (a paragraph-only change is counted in `updated` alone)

        if not dry_run:
            citation_patch = json.dumps({"article": new_art, "paragraph": new_para})
            meta_patch = json.dumps({"source_article": new_art, "source_paragraph": new_para})
            # jsonb || merges the patch into the existing object, leaving
            # all other keys untouched.
            conn.execute(sql_text("""
                UPDATE compliance.canonical_controls
                SET source_citation = COALESCE(source_citation, '{}'::jsonb) || CAST(:citation AS jsonb),
                    generation_metadata = COALESCE(generation_metadata, '{}'::jsonb) || CAST(:meta AS jsonb)
                WHERE id = :id
            """), {"id": ctrl_pk, "citation": citation_patch, "meta": meta_patch})
        # Counted in dry-run mode too, so the summary shows what would change.
        updated += 1

    print(f"\n Updated: {updated}")
    print(f" New article (was empty): {improved}")
    print(f" Changed article: {changed}")
    print(f" Dry run: {dry_run}")


def _report_article_coverage(conn):
    """Step 3b: print, per regulation, how many controls carry an article.

    Runs on the same connection as step 3a, so it sees the updates made in
    the current transaction.
    """
    print(f"\n{'=' * 70}")
    print("STEP 3b: ARTICLE COVERAGE AFTER UPDATE")
    print(f"{'=' * 70}")
    rows = conn.execute(sql_text("""
        SELECT
            generation_metadata->>'source_regulation' as reg,
            count(*) as total,
            count(*) FILTER (WHERE source_citation->>'article' != '' AND source_citation->>'article' IS NOT NULL) as with_art,
            count(*) FILTER (WHERE source_citation IS NULL) as no_cit
        FROM compliance.canonical_controls
        WHERE release_state NOT IN ('rejected')
        GROUP BY generation_metadata->>'source_regulation'
        HAVING count(*) >= 3
        ORDER BY count(*) DESC
    """)).fetchall()

    print(f"\n {'Regulation':35s} {'Total':>6s} {'WithArt':>7s} {'%':>5s}")
    print(f" {'-' * 60}")
    grand_total = 0
    grand_art = 0
    for reg_name, total, with_art, _no_cit in rows:
        reg = str(reg_name)[:35] if reg_name else "(none/v1v2)"
        pct = f"{with_art/total*100:.0f}%" if total > 0 else ""
        print(f" {reg:35s} {total:6d} {with_art:7d} {pct:>5s}")
        grand_total += total
        grand_art += with_art

    # Guard the overall percentage: with no qualifying rows an unguarded
    # division would raise inside the transaction and roll back step 3a.
    overall_pct = grand_art / grand_total * 100 if grand_total else 0.0
    print(f"\n TOTAL: {grand_total} controls, {grand_art} with article ({overall_pct:.0f}%)")


def _find_duplicate_groups(conn):
    """Step 3c: return groups of controls sharing regulation, article and paragraph.

    Each group is a dict with keys reg, article, paragraph, count, ids,
    titles and states; the three lists are ordered by created_at, so index 0
    is the oldest control of the group. Groups are ordered by size, largest
    first.
    """
    print(f"\n{'=' * 70}")
    print("STEP 3c: DUPLICATE CONTROLS (same reg + article + paragraph, >1)")
    print(f"{'=' * 70}")
    rows = conn.execute(sql_text("""
        SELECT
            generation_metadata->>'source_regulation' as reg,
            source_citation->>'article' as article,
            source_citation->>'paragraph' as paragraph,
            count(*) as cnt,
            array_agg(id ORDER BY created_at) as ids,
            array_agg(title ORDER BY created_at) as titles,
            array_agg(release_state ORDER BY created_at) as states
        FROM compliance.canonical_controls
        WHERE release_state NOT IN ('rejected', 'too_close')
          AND source_citation->>'article' IS NOT NULL
          AND source_citation->>'article' != ''
        GROUP BY
            generation_metadata->>'source_regulation',
            source_citation->>'article',
            source_citation->>'paragraph'
        HAVING count(*) > 1
        ORDER BY count(*) DESC
    """)).fetchall()
    return [
        {
            "reg": row[0],
            "article": row[1],
            "paragraph": row[2],
            "count": row[3],
            "ids": [str(i) for i in row[4]],
            "titles": row[5],
            "states": row[6],
        }
        for row in rows
    ]


def _print_duplicate_report(groups):
    """Print duplicate totals and a preview of the largest groups."""
    total_dup_controls = sum(g["count"] for g in groups)
    total_removable = sum(g["count"] - 1 for g in groups)  # Keep the oldest
    print(f"\n Duplicate groups: {len(groups)}")
    print(f" Controls in groups: {total_dup_controls}")
    print(f" Removable (keep oldest): {total_removable}")

    # Show the MAX_DUP_GROUPS_SHOWN largest groups
    print(f"\n {'Reg':25s} {'Article':15s} {'Para':10s} {'Count':>5s}")
    print(f" {'-' * 60}")
    for g in groups[:MAX_DUP_GROUPS_SHOWN]:
        print(f" {str(g['reg']):25s} {str(g['article']):15s} {str(g['paragraph']):10s} {g['count']:5d}")
        for i, title in enumerate(g['titles'][:MAX_TITLES_PER_GROUP]):
            # str()/"or" keep a NULL state or title from breaking the format.
            state = str(g['states'][i]) if i < len(g['states']) else '?'
            marker = "KEEP" if i == 0 else "DUP "
            print(f" [{marker}][{state:6s}] {(title or '')[:70]}")
        if g['count'] > MAX_TITLES_PER_GROUP:
            print(f" ... +{g['count'] - MAX_TITLES_PER_GROUP} more")


def _save_dedup_plan(groups, path):
    """Write the duplicate groups to *path* as indented JSON."""
    with open(path, "w", encoding="utf-8") as f:
        # default=str stringifies any value json cannot serialize natively.
        json.dump(groups, f, indent=2, default=str)
    print(f"\n Saved dedup plan to {path}")


print(f"\n{'=' * 70}")
print("STEP 3a: UPDATE CONTROLS WITH IMPROVED ARTICLE MAPPINGS")
print(f"{'=' * 70}")
# All database work shares one transaction: engine.begin() commits on normal
# exit and rolls back if an exception escapes the block.
with engine.begin() as conn:
    _apply_article_mappings(conn, article_mapping, DRY_RUN)
    _report_article_coverage(conn)
    dup_groups = _find_duplicate_groups(conn)

# Pure reporting and file output happen after the commit, so a failure here
# cannot undo the updates that were just applied.
_print_duplicate_report(dup_groups)
# Save dedup plan
_save_dedup_plan(dup_groups, DEDUP_PLAN_PATH)