Files
breakpilot-compliance/scripts/qa/apply_pdf_qa_results.py
Benjamin Admin 643b26618f
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 31s
CI/CD / test-python-backend-compliance (push) Successful in 1m35s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 17s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
feat: Control Library UI, dedup migration, QA tooling, docs
- Control Library: parent control display, ObligationTypeBadge,
  GenerationStrategyBadge variants, evidence string fallback
- API: expose parent_control_uuid/id/title in canonical controls
- Fix: DSFA SQLAlchemy 2.0 Row._mapping compatibility
- Migration 074: control_parent_links + control_dedup_reviews tables
- QA scripts: benchmark, gap analysis, OSCAL import, OWASP cleanup,
  phase5 normalize, phase74 gap fill, sync_db, run_job
- Docs: dedup engine, RAG benchmark, lessons learned, pipeline docs

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-21 11:56:08 +01:00

214 lines
6.5 KiB
Python

"""
Apply PDF QA results: update source_citation with the correct article_type and article.

Safety modes:
  --safe (default):  Only set article_type. Set article only when empty.
                     Mark preamble as recital_suspect.
                     This is what runs when no flag is given; the script
                     does not parse a --safe flag.
  --force-article:   Also overwrite existing articles
                     (CAREFUL: NIST substring matching is unreliable).
  --dry-run:         Show what would change without writing.

Usage:
  python3 apply_pdf_qa_results.py                  # safe mode (apply article_type + empty articles)
  python3 apply_pdf_qa_results.py --dry-run        # show changes without writing
  python3 apply_pdf_qa_results.py --force-article  # also overwrite existing articles
"""
import os
import sys
import json
import psycopg2
import urllib.parse
from collections import Counter
# Location of the QA output consumed by this script. Later sections iterate
# over its entries and read "ctrl_id", "article_label" and "article_type"
# from each one.
RESULTS_FILE = "/tmp/pdf_qa_results.json"

# Parse args: switches are detected by plain membership in sys.argv, so any
# other command-line argument is silently ignored.
dry_run = "--dry-run" in sys.argv
force_article = "--force-article" in sys.argv

# Load results. The encoding is given explicitly because open() otherwise
# falls back to the locale's preferred encoding, which can mis-decode
# non-ASCII text in the JSON on hosts that are not configured for UTF-8.
with open(RESULTS_FILE, encoding="utf-8") as f:
    results = json.load(f)
print(f"Loaded {len(results)} results")
# DB connection
# DATABASE_URL is mandatory; a missing variable raises KeyError right here.
db_url = os.environ['DATABASE_URL']
parsed = urllib.parse.urlparse(db_url)


def _url_decode(component):
    """Percent-decode one URL component; None is passed through unchanged."""
    if component is None:
        return None
    return urllib.parse.unquote(component)


# urlparse() hands back username, password and path still percent-encoded,
# so a credential containing a reserved character (for example "@" written
# as "%40" in the URL) has to be decoded before it is given to the driver.
conn = psycopg2.connect(
    host=parsed.hostname,
    port=parsed.port or 5432,  # fall back to 5432 when the URL has no port
    user=_url_decode(parsed.username),
    password=_url_decode(parsed.password),
    # Strip the leading "/" first so an encoded slash inside the name survives.
    dbname=_url_decode(parsed.path.lstrip('/')),
    # Session search_path: "compliance" first, then "public".
    options="-c search_path=compliance,public",
)
# Snapshot the citation fields currently stored for every control named in
# the results, so the update pass can compare old and new values in memory.
cur = conn.cursor()
ctrl_ids = [entry["ctrl_id"] for entry in results]
cur.execute("""
SELECT id,
source_citation->>'article' as article,
source_citation->>'article_type' as article_type,
source_citation->>'source' as source
FROM compliance.canonical_controls
WHERE id = ANY(%s::uuid[])
""", (ctrl_ids,))
# Keyed by the id rendered as a string; a NULL article is stored as "".
db_state = {
    str(ctrl_uuid): {
        "article": article or "",
        "article_type": article_type,
        "source": source,
    }
    for ctrl_uuid, article, article_type, source in cur.fetchall()
}
# Savepoint placed around the statements of a single control. Rolling back to
# it undoes only that control, leaving the rest of the uncommitted batch
# intact (a plain conn.rollback() would throw the whole batch away).
_SAVEPOINT = "pdf_qa_control"

# Merges a JSON object into source_citation; keys in the payload overwrite
# keys of the same name that are already stored.
_MERGE_CITATION_SQL = """
UPDATE compliance.canonical_controls
SET source_citation = COALESCE(source_citation, '{}'::jsonb) || %s::jsonb,
updated_at = now()
WHERE id = %s::uuid
"""

# Sets generation_metadata.recital_suspect to true.
_MARK_RECITAL_SQL = """
UPDATE compliance.canonical_controls
SET generation_metadata = jsonb_set(
COALESCE(generation_metadata, '{}'::jsonb),
'{recital_suspect}',
'true'::jsonb
),
updated_at = now()
WHERE id = %s::uuid
"""

# Counters
stats = Counter()    # planned changes per category; also filled in dry-run
updated_type = 0     # controls whose article_type was actually written
updated_article = 0  # controls whose article was actually written
updated_recital = 0  # controls actually flagged as recital_suspect
errors = 0           # controls whose statements failed and were undone

for i, r in enumerate(results):
    ctrl_id = r["ctrl_id"]
    new_article = r["article_label"]
    new_type = r["article_type"]

    db = db_state.get(ctrl_id)
    if not db:
        stats["missing_in_db"] += 1
        continue

    old_type = db.get("article_type")
    old_article = db.get("article", "").strip()

    # Decide what to update.
    # NOTE(review): an empty or null article_label is written as-is when the
    # stored article is empty or --force-article is given; confirm that this
    # is intended.
    set_type = old_type != new_type
    set_article = (not old_article) or (force_article and old_article != new_article)
    set_recital = new_type == "preamble"

    if set_type:
        stats["type_" + ("new" if not old_type else "changed")] += 1
    else:
        stats["type_unchanged"] += 1

    if not old_article and set_article:
        stats["article_new"] += 1
    elif old_article and old_article != new_article:
        if force_article:
            stats["article_force_changed"] += 1
        else:
            stats["article_skipped"] += 1
    else:
        stats["article_unchanged"] += 1

    if set_recital:
        stats["recital"] += 1

    if dry_run:
        continue

    # Build JSONB update
    updates = {}
    if set_type:
        updates["article_type"] = new_type
    if set_article:
        updates["article"] = new_article

    if updates or set_recital:
        cur.execute(f"SAVEPOINT {_SAVEPOINT}")
        try:
            if updates:
                # Merge into source_citation
                cur.execute(_MERGE_CITATION_SQL, (json.dumps(updates), ctrl_id))
            if set_recital:
                # Mark preamble as recital_suspect
                cur.execute(_MARK_RECITAL_SQL, (ctrl_id,))
        except psycopg2.Error as e:
            # Undo only this control; earlier controls of the batch survive.
            cur.execute(f"ROLLBACK TO SAVEPOINT {_SAVEPOINT}")
            errors += 1
            if errors <= 5:  # keep the log short when many controls fail
                print(f" ERROR {ctrl_id}: {str(e)[:100]}")
        else:
            # Count only once every statement for this control succeeded, so
            # the totals match what will really be committed.
            if set_type:
                updated_type += 1
            if set_article:
                updated_article += 1
            if set_recital:
                updated_recital += 1
        # The savepoint still exists after a rollback to it; release it in
        # both cases so savepoints do not pile up inside the transaction.
        cur.execute(f"RELEASE SAVEPOINT {_SAVEPOINT}")

    # Commit in batches of 1000 result entries. A failed control no longer
    # invalidates the batch, so this checkpoint is reached after errors too.
    if (i + 1) % 1000 == 0:
        conn.commit()
        print(f" Progress: {i+1}/{len(results)}")

if not dry_run:
    conn.commit()
# Summary report: collect every line first, then emit them in one print call.
if dry_run:
    mode = "DRY-RUN"
else:
    mode = "APPLIED"

report = [
    f"\n{'='*60}",
    f" Mode: {mode}",
    f"{'='*60}",
    "\n article_type:",
    f" New (was NULL): {stats['type_new']:5d}",
    f" Changed: {stats['type_changed']:5d}",
    f" Unchanged: {stats['type_unchanged']:5d}",
    "\n article:",
    f" New (was empty): {stats['article_new']:5d}",
]

# Only one of these two categories can be non-zero, depending on the mode.
if force_article:
    report.append(f" Force-changed: {stats['article_force_changed']:5d}")
else:
    report.append(f" Differs (SKIPPED): {stats['article_skipped']:5d}")

report += [
    f" Unchanged: {stats['article_unchanged']:5d}",
    f"\n Preamble/Recital: {stats['recital']:5d}",
    f" Missing in DB: {stats['missing_in_db']:5d}",
]

# The write counters are only reported when something could have been written.
if not dry_run:
    report += [
        "\n Updates written:",
        f" article_type: {updated_type:5d}",
        f" article: {updated_article:5d}",
        f" recital_suspect: {updated_recital:5d}",
        f" Errors: {errors:5d}",
    ]

print("\n".join(report))
# Fetches up to five controls of one article_type (passed as a parameter).
_SAMPLE_SQL = """
SELECT control_id, source_citation->>'article', source_citation->>'article_type',
source_citation->>'source'
FROM compliance.canonical_controls
WHERE source_citation->>'article_type' = %s
LIMIT 5
"""

# Verify: count by article_type
cur.execute("""
SELECT source_citation->>'article_type' as art_type, count(*)
FROM compliance.canonical_controls
WHERE source_citation->>'article_type' IS NOT NULL
GROUP BY 1
ORDER BY count(*) DESC
""")
print("\nArticle type distribution in DB:")
for art_type, type_count in cur.fetchall():
    print(f" {str(art_type):12s}: {type_count:5d}")

# Verify: print up to five sample controls for each of these article types.
for sample_type in ("preamble", "annex"):
    cur.execute(_SAMPLE_SQL, (sample_type,))
    print(f"\nSample {sample_type} controls:")
    for control_id, article, art_type, source in cur.fetchall():
        # source_citation->>'source' can be NULL; slicing None would raise
        # TypeError and abort the report, so fall back to an empty string.
        print(f" {control_id}: {article} ({art_type}) — {(source or '')[:40]}")

cur.close()
conn.close()