#!/usr/bin/env python3 """Inherit source_citation from parent to atom controls. Background ========== citation_backfill.py fills source_citation on the *source-bearing* controls (those with source_original_text — ~2-7 %) by re-linking them to the re-ingested, article_label-bearing chunks. The remaining ~93 % are "atom" controls (decompositions) that carry a parent_control_uuid but no own citation. They cite the SAME norm as their parent, so the citation can be inherited — no re-matching needed. Self-written controls (license_rule = 3) are skipped (no external source). Runs in idempotent iterations (atom -> master -> grandmaster) and prints per-stage counts before any write. Safe to rerun — only fills rows whose source_citation lacks an 'article'. Usage:: python3 scripts/atom_citation_inheritance.py --db-host 100.80.114.48 \\ --db-password breakpilot123 --dry-run python3 scripts/atom_citation_inheritance.py --db-host 100.80.114.48 \\ --db-password breakpilot123 --apply """ from __future__ import annotations import argparse import os import sys DB_URL = os.getenv("DATABASE_URL", "postgresql://breakpilot:breakpilot@localhost:5432/breakpilot_db") # A row "needs" a citation when it has no article yet. _NEEDS = ( "(cc.source_citation IS NULL " " OR cc.source_citation->>'article' IS NULL " " OR cc.source_citation->>'article' = '')" ) # A parent can supply one when it carries a real article. _PARENT_HAS = ( "p.source_citation IS NOT NULL " "AND p.source_citation->>'article' IS NOT NULL " "AND p.source_citation->>'article' <> ''" ) SQL_REPORT = f""" SET search_path TO compliance, public; SELECT CASE WHEN cc.parent_control_uuid IS NULL THEN 'no_parent' WHEN ({_PARENT_HAS.replace('p.', 'p2.')}) THEN 'parent_has_article' ELSE 'parent_no_article' END AS bucket, COUNT(*) AS n FROM canonical_controls cc LEFT JOIN canonical_controls p2 ON cc.parent_control_uuid = p2.id WHERE {_NEEDS} AND cc.license_rule IS DISTINCT FROM 3 GROUP BY 1 ORDER BY 2 DESC; """ SQL_INHERIT = f""" SET search_path TO compliance, public; UPDATE canonical_controls cc SET source_citation = p.source_citation, updated_at = NOW() FROM canonical_controls p WHERE cc.parent_control_uuid = p.id AND {_NEEDS} AND {_PARENT_HAS} AND cc.license_rule IS DISTINCT FROM 3; """ def parse_args() -> argparse.Namespace: p = argparse.ArgumentParser(description=__doc__) p.add_argument("--db-url", default=DB_URL, help="Postgres URL (default: $DATABASE_URL)") p.add_argument("--max-iterations", type=int, default=6, help="Cap on inheritance iterations to avoid loops") g = p.add_mutually_exclusive_group(required=True) g.add_argument("--dry-run", action="store_true") g.add_argument("--apply", action="store_true") return p.parse_args() def print_bucket(rows, label: str) -> None: print(f"\n## {label}") total = 0 for bucket, n in rows: print(f" {bucket:20} {n:>8}") total += n print(f" {'TOTAL':20} {total:>8}") def main() -> int: args = parse_args() try: import psycopg2 except ImportError: print("error: psycopg2 not installed", file=sys.stderr) return 2 conn = psycopg2.connect(args.db_url) conn.autocommit = False cur = conn.cursor() print("=" * 60) print(" Atom citation inheritance — source_citation via parent") print(f" Mode: {'DRY-RUN' if args.dry_run else 'APPLY'}") print("=" * 60) cur.execute(SQL_REPORT) print_bucket(cur.fetchall(), "Controls without article (need citation)") if args.dry_run: cur.execute( "SET search_path TO compliance, public; " f"SELECT COUNT(*) FROM canonical_controls cc " f"JOIN canonical_controls p ON cc.parent_control_uuid = p.id " f"WHERE {_NEEDS} AND {_PARENT_HAS} AND cc.license_rule IS DISTINCT FROM 3;" ) print(f"\n## First inherit-pass would fill: {cur.fetchone()[0]} rows") print("\nNo writes performed. Use --apply to execute.") conn.rollback() return 0 total = 0 for i in range(1, args.max_iterations + 1): cur.execute(SQL_INHERIT) updated = cur.rowcount total += updated print(f"\n iteration {i}: {updated} rows inherited") if updated == 0: break conn.commit() print(f"\n✓ Total atoms inherited: {total}") cur.execute(SQL_REPORT) print_bucket(cur.fetchall(), "Remaining without article") return 0 if __name__ == "__main__": raise SystemExit(main())