"""Apply PDF QA results: update source_citation with correct article + article_type.

Reads the QA results from ``RESULTS_FILE`` (a JSON list of records with the
keys ``ctrl_id``, ``article_label`` and ``article_type``), merges ``article``
and ``article_type`` into ``compliance.canonical_controls.source_citation``
for each record, and finally prints a few verification queries.

The PostgreSQL connection URL is read from the ``DATABASE_URL`` environment
variable.
"""

import json
import os
import urllib.parse

import psycopg2

RESULTS_FILE = "/tmp/pdf_qa_results.json"
BATCH_SIZE = 500  # commit (and report progress) after this many records
MAX_ERRORS_SHOWN = 5  # only the first few errors are printed in detail

# COALESCE guards against a NULL source_citation: `NULL || jsonb` is NULL, so
# without it the row would be "updated" to NULL and never actually changed.
UPDATE_SQL = """
    UPDATE compliance.canonical_controls
    SET source_citation = COALESCE(source_citation, '{}'::jsonb)
            || jsonb_build_object('article', %s, 'article_type', %s),
        updated_at = now()
    WHERE id = %s::uuid
      AND (
          source_citation->>'article' IS DISTINCT FROM %s
          OR source_citation->>'article_type' IS DISTINCT FROM %s
      )
"""

SAMPLE_SQL = """
    SELECT control_id,
           source_citation->>'article',
           source_citation->>'article_type',
           source_citation->>'source'
    FROM compliance.canonical_controls
    WHERE source_citation->>'article_type' = %s
    LIMIT 5
"""


def _unquote_or_none(value):
    """Percent-decode a URL component, passing ``None`` through unchanged."""
    return None if value is None else urllib.parse.unquote(value)


def _print_samples(cursor, article_type):
    """Print up to five controls whose article_type equals *article_type*."""
    cursor.execute(SAMPLE_SQL, (article_type,))
    print(f"\nSample {article_type} controls:")
    for row in cursor.fetchall():
        # 'source' may be absent from the citation, in which case it is NULL.
        source = (row[3] or "")[:40]
        print(f" {row[0]}: {row[1]} ({row[2]}) — {source}")


# Load results (JSON is UTF-8; do not depend on the locale encoding)
with open(RESULTS_FILE, encoding="utf-8") as f:
    results = json.load(f)
print(f"Loaded {len(results)} results")

# DB connection. urlparse() leaves percent-escapes in place, so the
# credentials and database name have to be decoded explicitly.
db_url = os.environ['DATABASE_URL']
parsed = urllib.parse.urlparse(db_url)
conn = psycopg2.connect(
    host=parsed.hostname,
    port=parsed.port or 5432,
    user=_unquote_or_none(parsed.username),
    password=_unquote_or_none(parsed.password),
    dbname=urllib.parse.unquote(parsed.path.lstrip('/')),
    options="-c search_path=compliance,public",
)

try:
    with conn.cursor() as cur:
        # Update in batches
        updated = 0
        errors = 0
        unchanged = 0

        for i, r in enumerate(results, start=1):
            try:
                ctrl_id = r["ctrl_id"]
                article_label = r["article_label"]
                # preamble, article, annex, section, unknown
                article_type = r["article_type"]
            except (KeyError, TypeError) as e:
                # Malformed record: count it like any other per-row failure
                # instead of aborting the run and losing the pending batch.
                errors += 1
                if errors <= MAX_ERRORS_SHOWN:
                    print(f" ERROR result #{i}: malformed record ({e!r})")
            else:
                # A savepoint confines a failure to this single row; a plain
                # rollback would also discard every earlier, not yet
                # committed update of the current batch.
                cur.execute("SAVEPOINT qa_row")
                try:
                    # Update source_citation: set article and article_type
                    cur.execute(
                        UPDATE_SQL,
                        (article_label, article_type, ctrl_id,
                         article_label, article_type),
                    )
                    changed = cur.rowcount > 0
                except psycopg2.Error as e:
                    cur.execute("ROLLBACK TO SAVEPOINT qa_row")
                    cur.execute("RELEASE SAVEPOINT qa_row")
                    errors += 1
                    if errors <= MAX_ERRORS_SHOWN:
                        print(f" ERROR {ctrl_id}: {str(e)[:100]}")
                else:
                    cur.execute("RELEASE SAVEPOINT qa_row")
                    if changed:
                        updated += 1
                    else:
                        unchanged += 1

            # Runs for every record, including failed ones, so a batch
            # boundary is never skipped.
            if i % BATCH_SIZE == 0:
                conn.commit()
                print(f" Progress: {i}/{len(results)} (updated: {updated}, unchanged: {unchanged}, errors: {errors})")

        conn.commit()
        print(f"\nDone: {updated} updated, {unchanged} unchanged, {errors} errors out of {len(results)}")

        # Verify: count by article_type
        cur.execute("""
            SELECT source_citation->>'article_type' as art_type, count(*)
            FROM compliance.canonical_controls
            WHERE source_citation->>'article_type' IS NOT NULL
            GROUP BY 1
            ORDER BY count(*) DESC
        """)
        print("\nArticle type distribution in DB:")
        for row in cur.fetchall():
            print(f" {str(row[0]):12s}: {row[1]:5d}")

        # Verify: sample preamble controls
        _print_samples(cur, "preamble")

        # Verify: sample annex controls
        _print_samples(cur, "annex")
finally:
    # Close the connection even when a query or commit fails.
    conn.close()