"""Apply PDF QA results: update source_citation with correct article_type + article.

Safety modes:
  --safe (default): Only set article_type. Set article only when empty.
                    Mark preamble as recital_suspect. (This is the behavior
                    when no other flag is given; the flag itself is optional.)
  --force-article:  Also overwrite existing articles
                    (CAREFUL: NIST substring matching is unreliable).
  --dry-run:        Show what would change without writing.

Usage:
  python3 apply_pdf_qa_results.py                  # safe mode (apply article_type + empty articles)
  python3 apply_pdf_qa_results.py --dry-run        # show changes without writing
  python3 apply_pdf_qa_results.py --force-article  # also overwrite existing articles
"""

import os
import sys
import json
import urllib.parse
from collections import Counter

RESULTS_FILE = "/tmp/pdf_qa_results.json"

# Number of processed results between intermediate commits.
COMMIT_EVERY = 1000
# Only the first few per-row errors are printed, to keep the output readable.
MAX_ERRORS_SHOWN = 5


def parse_db_url(db_url):
    """Translate a PostgreSQL URL into keyword arguments for ``psycopg2.connect``.

    ``urlparse`` leaves user name, password and path percent-encoded, so they
    are decoded here; otherwise credentials containing characters such as
    ``@`` or ``/`` (written ``%40`` / ``%2F`` in the URL) would be sent to the
    server in their encoded form.

    Args:
        db_url: Connection URL, e.g. ``postgresql://user:pw@host:5432/dbname``.

    Returns:
        dict with ``host``, ``port`` (5432 if absent), ``user``, ``password``,
        ``dbname`` and the ``options`` string that sets the search path.
    """
    parsed = urllib.parse.urlparse(db_url)

    def decode(part):
        return urllib.parse.unquote(part) if part is not None else None

    return {
        "host": parsed.hostname,
        "port": parsed.port or 5432,
        "user": decode(parsed.username),
        "password": decode(parsed.password),
        "dbname": decode(parsed.path.lstrip("/")),
        "options": "-c search_path=compliance,public",
    }


def connect(db_url):
    """Open a psycopg2 connection described by *db_url*."""
    # Imported here rather than at module top so the pure planning helpers in
    # this file can be imported (e.g. by tests) without the PostgreSQL driver.
    import psycopg2

    return psycopg2.connect(**parse_db_url(db_url))


def load_db_state(cur, ctrl_ids):
    """Fetch the current citation fields of the given controls.

    Returns:
        dict mapping the control id (as ``str``) to a dict with keys
        ``article`` (never ``None``; NULL becomes ``""``), ``article_type``
        and ``source``.
    """
    cur.execute(
        """
        SELECT id,
               source_citation->>'article' as article,
               source_citation->>'article_type' as article_type,
               source_citation->>'source' as source
        FROM compliance.canonical_controls
        WHERE id = ANY(%s::uuid[])
        """,
        (ctrl_ids,),
    )
    return {
        str(row[0]): {"article": row[1] or "", "article_type": row[2], "source": row[3]}
        for row in cur.fetchall()
    }


def plan_update(old_type, old_article, new_type, new_article, force_article=False):
    """Decide which fields of one control have to be written.

    Args:
        old_type: article_type currently stored (``None`` if unset).
        old_article: article currently stored, already stripped (``""`` if unset).
        new_type: article_type from the QA result.
        new_article: article label from the QA result.
        force_article: when true, a differing existing article is overwritten.

    Returns:
        Tuple ``(set_type, set_article, set_recital)`` of booleans.
    """
    set_type = old_type != new_type
    # An empty article is always filled; an existing one only on request.
    set_article = (not old_article) or (force_article and old_article != new_article)
    set_recital = new_type == "preamble"
    return set_type, set_article, set_recital


def stat_keys(old_type, old_article, new_article, plan, force_article=False):
    """Return the statistics counters one result contributes to.

    Args:
        old_type: article_type currently stored (``None`` if unset).
        old_article: article currently stored, already stripped.
        new_article: article label from the QA result.
        plan: the ``(set_type, set_article, set_recital)`` tuple from
            :func:`plan_update`.
        force_article: whether ``--force-article`` is active.

    Returns:
        List of counter names, one for the type, one for the article and
        optionally ``"recital"``.
    """
    set_type, set_article, set_recital = plan
    keys = []

    if set_type:
        keys.append("type_changed" if old_type else "type_new")
    else:
        keys.append("type_unchanged")

    if not old_article and set_article:
        keys.append("article_new")
    elif old_article and old_article != new_article:
        keys.append("article_force_changed" if force_article else "article_skipped")
    else:
        keys.append("article_unchanged")

    if set_recital:
        keys.append("recital")
    return keys


def write_row(cur, ctrl_id, updates, set_recital):
    """Execute the UPDATE statements for a single control.

    Args:
        cur: open database cursor.
        ctrl_id: id of the control to update.
        updates: keys to merge into ``source_citation`` (may be empty).
        set_recital: whether to set ``generation_metadata.recital_suspect``.
    """
    if updates:
        # Merge into source_citation, keeping all keys not mentioned in updates.
        cur.execute(
            """
            UPDATE compliance.canonical_controls
            SET source_citation = COALESCE(source_citation, '{}'::jsonb) || %s::jsonb,
                updated_at = now()
            WHERE id = %s::uuid
            """,
            (json.dumps(updates), ctrl_id),
        )

    if set_recital:
        # Mark preamble as recital_suspect.
        cur.execute(
            """
            UPDATE compliance.canonical_controls
            SET generation_metadata = jsonb_set(
                    COALESCE(generation_metadata, '{}'::jsonb),
                    '{recital_suspect}',
                    'true'::jsonb
                ),
                updated_at = now()
            WHERE id = %s::uuid
            """,
            (ctrl_id,),
        )


def apply_results(conn, cur, results, db_state, dry_run, force_article):
    """Walk over all QA results, collect statistics and (unless dry-run) write them.

    Every row is written inside its own savepoint, so a failing row is undone
    on its own and does not take the other rows of the current batch with it.

    Returns:
        Tuple ``(stats, written, errors)``: ``stats`` counts planned changes,
        ``written`` counts changes that actually reached the transaction
        (keys ``article_type``, ``article``, ``recital_suspect``) and
        ``errors`` is the number of rows that failed.
    """
    stats = Counter()
    written = Counter()
    errors = 0

    for i, r in enumerate(results):
        ctrl_id = r["ctrl_id"]
        new_article = r["article_label"]
        new_type = r["article_type"]

        db = db_state.get(ctrl_id)
        if not db:
            stats["missing_in_db"] += 1
            continue

        old_type = db.get("article_type")
        old_article = db.get("article", "").strip()

        plan = plan_update(old_type, old_article, new_type, new_article, force_article)
        set_type, set_article, set_recital = plan
        stats.update(stat_keys(old_type, old_article, new_article, plan, force_article))

        if dry_run:
            continue

        updates = {}
        if set_type:
            updates["article_type"] = new_type
        if set_article:
            updates["article"] = new_article

        if updates or set_recital:
            cur.execute("SAVEPOINT qa_row")
            try:
                write_row(cur, ctrl_id, updates, set_recital)
            except Exception as e:
                # Deliberately broad: any failure of this row is reported and
                # skipped. Only this row is undone; if the connection itself
                # is gone, the statements below raise and abort the run.
                cur.execute("ROLLBACK TO SAVEPOINT qa_row")
                cur.execute("RELEASE SAVEPOINT qa_row")
                errors += 1
                if errors <= MAX_ERRORS_SHOWN:
                    print(f" ERROR {ctrl_id}: {str(e)[:100]}")
                continue
            cur.execute("RELEASE SAVEPOINT qa_row")

            # Count only after the whole row went through.
            if updates:
                if set_type:
                    written["article_type"] += 1
                if set_article:
                    written["article"] += 1
            if set_recital:
                written["recital_suspect"] += 1

        if (i + 1) % COMMIT_EVERY == 0:
            conn.commit()
            print(f" Progress: {i+1}/{len(results)}")

    return stats, written, errors


def print_summary(stats, written, errors, dry_run, force_article):
    """Print the planned-change statistics and, when applied, the write counts."""
    mode = "DRY-RUN" if dry_run else "APPLIED"
    print(f"\n{'='*60}")
    print(f" Mode: {mode}")
    print(f"{'='*60}")

    print("\n article_type:")
    print(f" New (was NULL): {stats['type_new']:5d}")
    print(f" Changed: {stats['type_changed']:5d}")
    print(f" Unchanged: {stats['type_unchanged']:5d}")

    print("\n article:")
    print(f" New (was empty): {stats['article_new']:5d}")
    if force_article:
        print(f" Force-changed: {stats['article_force_changed']:5d}")
    else:
        print(f" Differs (SKIPPED): {stats['article_skipped']:5d}")
    print(f" Unchanged: {stats['article_unchanged']:5d}")

    print(f"\n Preamble/Recital: {stats['recital']:5d}")
    print(f" Missing in DB: {stats['missing_in_db']:5d}")

    if not dry_run:
        print("\n Updates written:")
        print(f" article_type: {written['article_type']:5d}")
        print(f" article: {written['article']:5d}")
        print(f" recital_suspect: {written['recital_suspect']:5d}")
        print(f" Errors: {errors:5d}")


def print_samples(cur, article_type):
    """Print up to five controls whose article_type equals *article_type*."""
    cur.execute(
        """
        SELECT control_id,
               source_citation->>'article',
               source_citation->>'article_type',
               source_citation->>'source'
        FROM compliance.canonical_controls
        WHERE source_citation->>'article_type' = %s
        LIMIT 5
        """,
        (article_type,),
    )
    print(f"\nSample {article_type} controls:")
    for row in cur.fetchall():
        # The source may be NULL in the database; treat it as empty text.
        print(f" {row[0]}: {row[1]} ({row[2]}) — {(row[3] or '')[:40]}")


def print_verification(cur):
    """Print the article_type distribution plus sample preamble and annex rows."""
    cur.execute(
        """
        SELECT source_citation->>'article_type' as art_type, count(*)
        FROM compliance.canonical_controls
        WHERE source_citation->>'article_type' IS NOT NULL
        GROUP BY 1
        ORDER BY count(*) DESC
        """
    )
    print("\nArticle type distribution in DB:")
    for row in cur.fetchall():
        print(f" {str(row[0]):12s}: {row[1]:5d}")

    print_samples(cur, "preamble")
    print_samples(cur, "annex")


def main(argv=None):
    """Run the script.

    Args:
        argv: argument list to inspect for ``--dry-run`` / ``--force-article``;
            defaults to ``sys.argv``.

    Raises:
        KeyError: if the ``DATABASE_URL`` environment variable is not set.
    """
    args = sys.argv if argv is None else argv
    dry_run = "--dry-run" in args
    force_article = "--force-article" in args

    with open(RESULTS_FILE, encoding="utf-8") as f:
        results = json.load(f)
    print(f"Loaded {len(results)} results")

    conn = connect(os.environ["DATABASE_URL"])
    try:
        with conn.cursor() as cur:
            # Load current DB state for all affected controls.
            db_state = load_db_state(cur, [r["ctrl_id"] for r in results])

            stats, written, errors = apply_results(
                conn, cur, results, db_state, dry_run, force_article
            )
            if not dry_run:
                conn.commit()

            print_summary(stats, written, errors, dry_run, force_article)
            print_verification(cur)
    finally:
        # Closing also discards anything left uncommitted after a failure.
        conn.close()


if __name__ == "__main__":
    main()