Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 31s
CI/CD / test-python-backend-compliance (push) Successful in 1m35s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 17s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
- Control Library: parent control display, ObligationTypeBadge, GenerationStrategyBadge variants, evidence string fallback
- API: expose parent_control_uuid/id/title in canonical controls
- Fix: DSFA SQLAlchemy 2.0 Row._mapping compatibility
- Migration 074: control_parent_links + control_dedup_reviews tables
- QA scripts: benchmark, gap analysis, OSCAL import, OWASP cleanup, phase5 normalize, phase74 gap fill, sync_db, run_job
- Docs: dedup engine, RAG benchmark, lessons learned, pipeline docs

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
214 lines
6.5 KiB
Python
214 lines
6.5 KiB
Python
"""
|
|
Apply PDF QA results: update source_citation with correct article_type + article.
|
|
|
|
Safety modes:
|
|
--safe (default): Only set article_type. Set article only when empty. Mark preamble as recital_suspect.
|
|
--force-article: Also overwrite existing articles (CAREFUL: NIST substring matching is unreliable).
|
|
--dry-run: Show what would change without writing.
|
|
|
|
Usage:
|
|
python3 apply_pdf_qa_results.py # safe mode (apply article_type + empty articles)
|
|
python3 apply_pdf_qa_results.py --dry-run # show changes without writing
|
|
python3 apply_pdf_qa_results.py --force-article # also overwrite existing articles
|
|
"""
|
|
import os
|
|
import sys
|
|
import json
|
|
import psycopg2
|
|
import urllib.parse
|
|
from collections import Counter
|
|
|
|
RESULTS_FILE = "/tmp/pdf_qa_results.json"
|
|
|
|
# Parse args
|
|
dry_run = "--dry-run" in sys.argv
|
|
force_article = "--force-article" in sys.argv
|
|
|
|
# Load results
|
|
with open(RESULTS_FILE) as f:
|
|
results = json.load(f)
|
|
print(f"Loaded {len(results)} results")
|
|
|
|
# DB connection
|
|
db_url = os.environ['DATABASE_URL']
|
|
parsed = urllib.parse.urlparse(db_url)
|
|
conn = psycopg2.connect(
|
|
host=parsed.hostname, port=parsed.port or 5432,
|
|
user=parsed.username, password=parsed.password,
|
|
dbname=parsed.path.lstrip('/'),
|
|
options="-c search_path=compliance,public"
|
|
)
|
|
|
|
# Load current DB state for all affected controls
|
|
cur = conn.cursor()
|
|
ctrl_ids = [r["ctrl_id"] for r in results]
|
|
cur.execute("""
|
|
SELECT id,
|
|
source_citation->>'article' as article,
|
|
source_citation->>'article_type' as article_type,
|
|
source_citation->>'source' as source
|
|
FROM compliance.canonical_controls
|
|
WHERE id = ANY(%s::uuid[])
|
|
""", (ctrl_ids,))
|
|
db_state = {}
|
|
for row in cur.fetchall():
|
|
db_state[str(row[0])] = {"article": row[1] or "", "article_type": row[2], "source": row[3]}
|
|
|
|
# Counters
|
|
stats = Counter()
|
|
updated_type = 0
|
|
updated_article = 0
|
|
updated_recital = 0
|
|
errors = 0
|
|
|
|
for i, r in enumerate(results):
|
|
ctrl_id = r["ctrl_id"]
|
|
new_article = r["article_label"]
|
|
new_type = r["article_type"]
|
|
db = db_state.get(ctrl_id, {})
|
|
|
|
if not db:
|
|
stats["missing_in_db"] += 1
|
|
continue
|
|
|
|
old_type = db.get("article_type")
|
|
old_article = db.get("article", "").strip()
|
|
|
|
# Decide what to update
|
|
set_type = (old_type != new_type)
|
|
set_article = (not old_article) or (force_article and old_article != new_article)
|
|
set_recital = (new_type == "preamble")
|
|
|
|
if set_type:
|
|
stats["type_" + ("new" if not old_type else "changed")] += 1
|
|
else:
|
|
stats["type_unchanged"] += 1
|
|
|
|
if not old_article and set_article:
|
|
stats["article_new"] += 1
|
|
elif old_article and old_article != new_article:
|
|
if force_article:
|
|
stats["article_force_changed"] += 1
|
|
else:
|
|
stats["article_skipped"] += 1
|
|
else:
|
|
stats["article_unchanged"] += 1
|
|
|
|
if set_recital:
|
|
stats["recital"] += 1
|
|
|
|
if dry_run:
|
|
continue
|
|
|
|
try:
|
|
# Build JSONB update
|
|
updates = {}
|
|
if set_type:
|
|
updates["article_type"] = new_type
|
|
if set_article:
|
|
updates["article"] = new_article
|
|
|
|
if updates:
|
|
# Merge into source_citation
|
|
cur.execute("""
|
|
UPDATE compliance.canonical_controls
|
|
SET source_citation = COALESCE(source_citation, '{}'::jsonb) || %s::jsonb,
|
|
updated_at = now()
|
|
WHERE id = %s::uuid
|
|
""", (json.dumps(updates), ctrl_id))
|
|
if set_type:
|
|
updated_type += 1
|
|
if set_article:
|
|
updated_article += 1
|
|
|
|
# Mark preamble as recital_suspect
|
|
if set_recital:
|
|
cur.execute("""
|
|
UPDATE compliance.canonical_controls
|
|
SET generation_metadata = jsonb_set(
|
|
COALESCE(generation_metadata, '{}'::jsonb),
|
|
'{recital_suspect}',
|
|
'true'::jsonb
|
|
),
|
|
updated_at = now()
|
|
WHERE id = %s::uuid
|
|
""", (ctrl_id,))
|
|
updated_recital += 1
|
|
|
|
except Exception as e:
|
|
errors += 1
|
|
if errors <= 5:
|
|
print(f" ERROR {ctrl_id}: {str(e)[:100]}")
|
|
conn.rollback()
|
|
continue
|
|
|
|
if (i + 1) % 1000 == 0:
|
|
conn.commit()
|
|
print(f" Progress: {i+1}/{len(results)}")
|
|
|
|
if not dry_run:
|
|
conn.commit()
|
|
|
|
mode = "DRY-RUN" if dry_run else "APPLIED"
|
|
print(f"\n{'='*60}")
|
|
print(f" Mode: {mode}")
|
|
print(f"{'='*60}")
|
|
print(f"\n article_type:")
|
|
print(f" New (was NULL): {stats['type_new']:5d}")
|
|
print(f" Changed: {stats['type_changed']:5d}")
|
|
print(f" Unchanged: {stats['type_unchanged']:5d}")
|
|
print(f"\n article:")
|
|
print(f" New (was empty): {stats['article_new']:5d}")
|
|
if force_article:
|
|
print(f" Force-changed: {stats['article_force_changed']:5d}")
|
|
else:
|
|
print(f" Differs (SKIPPED): {stats['article_skipped']:5d}")
|
|
print(f" Unchanged: {stats['article_unchanged']:5d}")
|
|
print(f"\n Preamble/Recital: {stats['recital']:5d}")
|
|
print(f" Missing in DB: {stats['missing_in_db']:5d}")
|
|
|
|
if not dry_run:
|
|
print(f"\n Updates written:")
|
|
print(f" article_type: {updated_type:5d}")
|
|
print(f" article: {updated_article:5d}")
|
|
print(f" recital_suspect: {updated_recital:5d}")
|
|
print(f" Errors: {errors:5d}")
|
|
|
|
# Verify: count by article_type
|
|
cur.execute("""
|
|
SELECT source_citation->>'article_type' as art_type, count(*)
|
|
FROM compliance.canonical_controls
|
|
WHERE source_citation->>'article_type' IS NOT NULL
|
|
GROUP BY 1
|
|
ORDER BY count(*) DESC
|
|
""")
|
|
print("\nArticle type distribution in DB:")
|
|
for row in cur.fetchall():
|
|
print(f" {str(row[0]):12s}: {row[1]:5d}")
|
|
|
|
# Verify: sample preamble controls
|
|
cur.execute("""
|
|
SELECT control_id, source_citation->>'article', source_citation->>'article_type',
|
|
source_citation->>'source'
|
|
FROM compliance.canonical_controls
|
|
WHERE source_citation->>'article_type' = 'preamble'
|
|
LIMIT 5
|
|
""")
|
|
print("\nSample preamble controls:")
|
|
for row in cur.fetchall():
|
|
print(f" {row[0]}: {row[1]} ({row[2]}) — {row[3][:40]}")
|
|
|
|
# Verify: sample annex controls
|
|
cur.execute("""
|
|
SELECT control_id, source_citation->>'article', source_citation->>'article_type',
|
|
source_citation->>'source'
|
|
FROM compliance.canonical_controls
|
|
WHERE source_citation->>'article_type' = 'annex'
|
|
LIMIT 5
|
|
""")
|
|
print("\nSample annex controls:")
|
|
for row in cur.fetchall():
|
|
print(f" {row[0]}: {row[1]} ({row[2]}) — {row[3][:40]}")
|
|
|
|
conn.close()
|