# breakpilot-compliance/scripts/qa/apply_pdf_qa_results.py

"""
Apply PDF QA results: update source_citation with correct article_type + article.

Safety modes:
  --safe (default): Only set article_type. Set article only when empty. Mark preamble as recital_suspect.
  --force-article:  Also overwrite existing articles (CAREFUL: NIST substring matching is unreliable).
  --dry-run:        Show what would change without writing.

Usage:
    python3 apply_pdf_qa_results.py                    # safe mode (apply article_type + empty articles)
    python3 apply_pdf_qa_results.py --dry-run          # show changes without writing
    python3 apply_pdf_qa_results.py --force-article    # also overwrite existing articles
"""
import os
import sys
import json
import psycopg2
import urllib.parse
from collections import Counter

# Location of the JSON file holding the QA results that this script applies.
RESULTS_FILE = "/tmp/pdf_qa_results.json"

# Parse args
# Flags are detected by plain membership in sys.argv; anything else on the
# command line is ignored.
# NOTE(review): a mistyped flag (e.g. "--dryrun") is therefore silently ignored
# and the script runs in write mode — consider rejecting unknown arguments.
dry_run = "--dry-run" in sys.argv
force_article = "--force-article" in sys.argv

# Load results
# JSON text is UTF-8; pin the encoding so non-ASCII article labels do not
# depend on the locale's default encoding.
with open(RESULTS_FILE, encoding="utf-8") as f:
    results = json.load(f)
print(f"Loaded {len(results)} results")

# DB connection
# DATABASE_URL must be set (a missing variable raises KeyError) and is split
# into the individual keyword arguments psycopg2.connect() expects.
db_url = os.environ['DATABASE_URL']
parsed = urllib.parse.urlparse(db_url)


def _decoded(component):
    """Percent-decode one URL component, passing None through unchanged."""
    return urllib.parse.unquote(component) if component is not None else None


# urlparse() leaves the userinfo and path percent-encoded, so credentials that
# contain escaped characters ("%40" for "@", "%2F" for "/", ...) have to be
# decoded before they are handed to the driver.
conn = psycopg2.connect(
    host=parsed.hostname, port=parsed.port or 5432,
    user=_decoded(parsed.username), password=_decoded(parsed.password),
    dbname=_decoded(parsed.path.lstrip('/')),
    # Resolve unqualified table names in the compliance schema first.
    options="-c search_path=compliance,public"
)

# Snapshot the current citation fields of every control named in the results,
# so the apply loop can compare old and new values without further queries.
cur = conn.cursor()
ctrl_ids = [entry["ctrl_id"] for entry in results]
cur.execute("""
    SELECT id,
           source_citation->>'article' as article,
           source_citation->>'article_type' as article_type,
           source_citation->>'source' as source
    FROM compliance.canonical_controls
    WHERE id = ANY(%s::uuid[])
""", (ctrl_ids,))
# Keyed by the stringified id; a NULL article is normalised to "".
db_state = {
    str(ctrl_uuid): {
        "article": article or "",
        "article_type": article_type,
        "source": source,
    }
    for ctrl_uuid, article, article_type, source in cur.fetchall()
}

# Counters
# `stats` records the decision taken for every result (also in dry-run mode);
# the `updated_*` / `errors` counters record what was actually written.
stats = Counter()
updated_type = 0
updated_article = 0
updated_recital = 0
errors = 0

for i, r in enumerate(results):
    ctrl_id = r["ctrl_id"]
    new_article = r["article_label"]
    new_type = r["article_type"]
    db = db_state.get(ctrl_id, {})

    # Controls that the initial SELECT did not return cannot be updated.
    if not db:
        stats["missing_in_db"] += 1
        continue

    old_type = db.get("article_type")
    old_article = db.get("article", "").strip()

    # Decide what to update
    # - article_type: written whenever it differs from the stored value
    # - article: written when the stored one is empty, or (with
    #   --force-article) whenever it differs from the QA result
    # - recital_suspect: flagged for every control classified as preamble
    set_type = (old_type != new_type)
    set_article = (not old_article) or (force_article and old_article != new_article)
    set_recital = (new_type == "preamble")

    if set_type:
        stats["type_" + ("new" if not old_type else "changed")] += 1
    else:
        stats["type_unchanged"] += 1

    if not old_article and set_article:
        stats["article_new"] += 1
    elif old_article and old_article != new_article:
        if force_article:
            stats["article_force_changed"] += 1
        else:
            stats["article_skipped"] += 1
    else:
        stats["article_unchanged"] += 1

    if set_recital:
        stats["recital"] += 1

    if dry_run:
        continue

    # Build JSONB update
    updates = {}
    if set_type:
        updates["article_type"] = new_type
    if set_article:
        updates["article"] = new_article

    if updates or set_recital:
        # Isolate this row in a savepoint. A plain conn.rollback() on failure
        # would also throw away every successful, not-yet-committed update
        # since the last batch commit, while the counters below would still
        # report those rows as written.
        cur.execute("SAVEPOINT qa_row")
        try:
            if updates:
                # Merge into source_citation
                cur.execute("""
                    UPDATE compliance.canonical_controls
                    SET source_citation = COALESCE(source_citation, '{}'::jsonb) || %s::jsonb,
                        updated_at = now()
                    WHERE id = %s::uuid
                """, (json.dumps(updates), ctrl_id))

            # Mark preamble as recital_suspect
            if set_recital:
                cur.execute("""
                    UPDATE compliance.canonical_controls
                    SET generation_metadata = jsonb_set(
                        COALESCE(generation_metadata, '{}'::jsonb),
                        '{recital_suspect}',
                        'true'::jsonb
                    ),
                    updated_at = now()
                    WHERE id = %s::uuid
                """, (ctrl_id,))

            cur.execute("RELEASE SAVEPOINT qa_row")
        except Exception as e:
            errors += 1
            if errors <= 5:
                print(f"  ERROR {ctrl_id}: {str(e)[:100]}")
            # Undo only this row; earlier rows of the batch stay pending.
            cur.execute("ROLLBACK TO SAVEPOINT qa_row")
            cur.execute("RELEASE SAVEPOINT qa_row")
        else:
            # Count only once every statement for this row has succeeded.
            if set_type:
                updated_type += 1
            if set_article:
                updated_article += 1
            if set_recital:
                updated_recital += 1

    # Commit in batches to keep the open transaction bounded.
    if (i + 1) % 1000 == 0:
        conn.commit()
        print(f"  Progress: {i+1}/{len(results)}")

# Flush the final partial batch.
if not dry_run:
    conn.commit()

# Summary report: a banner naming the run mode, then the per-decision
# counters gathered while walking the results.
mode = "APPLIED"
if dry_run:
    mode = "DRY-RUN"
print(f"\n{'='*60}")
print(f"  Mode: {mode}")
print(f"{'='*60}")

# Decisions taken for article_type.
print(f"\n  article_type:")
print(f"    New (was NULL):    {stats['type_new']:5d}")
print(f"    Changed:           {stats['type_changed']:5d}")
print(f"    Unchanged:         {stats['type_unchanged']:5d}")

# Decisions taken for article; the middle line depends on --force-article.
print(f"\n  article:")
print(f"    New (was empty):   {stats['article_new']:5d}")
if not force_article:
    print(f"    Differs (SKIPPED): {stats['article_skipped']:5d}")
else:
    print(f"    Force-changed:     {stats['article_force_changed']:5d}")
print(f"    Unchanged:         {stats['article_unchanged']:5d}")

print(f"\n  Preamble/Recital:    {stats['recital']:5d}")
print(f"  Missing in DB:       {stats['missing_in_db']:5d}")

# The write counters are only reported when the run was not a dry run.
if not dry_run:
    print(f"\n  Updates written:")
    print(f"    article_type:      {updated_type:5d}")
    print(f"    article:           {updated_article:5d}")
    print(f"    recital_suspect:   {updated_recital:5d}")
    print(f"    Errors:            {errors:5d}")

# Verify: count by article_type
# Runs in every mode (including --dry-run) and reports the state of the whole
# table, not only the controls touched by this run.
cur.execute("""
    SELECT source_citation->>'article_type' as art_type, count(*)
    FROM compliance.canonical_controls
    WHERE source_citation->>'article_type' IS NOT NULL
    GROUP BY 1
    ORDER BY count(*) DESC
""")
print("\nArticle type distribution in DB:")
for art_type, type_count in cur.fetchall():
    print(f"  {str(art_type):12s}: {type_count:5d}")

# Verify: sample preamble and annex controls
# There is no ORDER BY, so the five rows shown per type are arbitrary.
for sample_type in ("preamble", "annex"):
    cur.execute("""
        SELECT control_id, source_citation->>'article', source_citation->>'article_type',
               source_citation->>'source'
        FROM compliance.canonical_controls
        WHERE source_citation->>'article_type' = %s
        LIMIT 5
    """, (sample_type,))
    print(f"\nSample {sample_type} controls:")
    for row in cur.fetchall():
        # The 'source' key may be absent from source_citation, which yields
        # NULL/None here; guard before slicing so the report cannot crash.
        print(f"  {row[0]}: {row[1]} ({row[2]}) — {(row[3] or '')[:40]}")

conn.close()