#!/usr/bin/env python3 """Backfill license_rule on canonical_controls by inheriting from parent. Background ========== Audit (audit_license_classification.py) showed that 279,384 of 314,811 rows in compliance.canonical_controls have NULL license_rule. Drilling in: - 261,980 of those (94%) have a parent_control_uuid whose parent already carries a non-NULL license_rule. The pass0b decomposition pipeline did not propagate the rule to its child controls — this is a clear inheritance bug, fixable without any classification decisions. - 16,617 have a parent that itself has no license_rule (transitive case). Inheriting iteratively converges to either rule-set or root-orphan. - 787 have no parent at all (decomposition roots). These need cluster-based manual classification (see Strategy Notes at the bottom of this file). This script runs the inheritance fix in three idempotent stages and prints per-stage counts before any write happens. Usage:: # Always dry-run first: python3 scripts/backfill_license_rule.py --db-host 100.80.114.48 \\ --db-password breakpilot123 --dry-run # If counts look right: python3 scripts/backfill_license_rule.py --db-host 100.80.114.48 \\ --db-password breakpilot123 --apply The script is safe to rerun — it only touches rows where license_rule IS NULL. """ from __future__ import annotations import argparse import sys def parse_args() -> argparse.Namespace: p = argparse.ArgumentParser(description=__doc__) p.add_argument("--db-host", default="100.80.114.48") p.add_argument("--db-port", type=int, default=5432) p.add_argument("--db-user", default="breakpilot") p.add_argument("--db-name", default="breakpilot_db") p.add_argument("--db-password", required=True) g = p.add_mutually_exclusive_group(required=True) g.add_argument("--dry-run", action="store_true") g.add_argument("--apply", action="store_true") p.add_argument("--max-iterations", type=int, default=5, help="Cap on inheritance iterations to avoid loops") return p.parse_args() # Stage 1: direct parent has license_rule — single UPDATE. # Stage 2: iterative — parent did not have it, but a grandparent does. # We loop until no more rows can be filled or max-iterations. # Stage 3: residual rows with no resolvable parent. Report them clustered # by category/pattern_id so the user can classify by family. SQL_REPORT_NULLS = """ SET search_path TO compliance, public; SELECT CASE WHEN cc.parent_control_uuid IS NULL THEN 'no_parent' WHEN p.license_rule IS NULL THEN 'parent_null' ELSE 'parent_set' END AS bucket, COUNT(*) AS n FROM canonical_controls cc LEFT JOIN canonical_controls p ON cc.parent_control_uuid = p.id WHERE cc.license_rule IS NULL GROUP BY 1 ORDER BY 2 DESC; """ SQL_INHERIT_FROM_PARENT = """ SET search_path TO compliance, public; UPDATE canonical_controls cc SET license_rule = p.license_rule, updated_at = NOW() FROM canonical_controls p WHERE cc.parent_control_uuid = p.id AND cc.license_rule IS NULL AND p.license_rule IS NOT NULL; """ SQL_REPORT_ORPHAN_CLUSTERS = """ SET search_path TO compliance, public; SELECT COALESCE(category, '(null)') AS category, COALESCE(pattern_id, '(null)') AS pattern_id, COALESCE(generation_strategy, '(null)') AS gen, COUNT(*) AS n FROM canonical_controls WHERE license_rule IS NULL AND parent_control_uuid IS NULL GROUP BY 1, 2, 3 ORDER BY n DESC LIMIT 25; """ def print_bucket(rows, label: str) -> None: print(f"\n## {label}") total = 0 for bucket, n in rows: print(f" {bucket:12} {n:>8}") total += n print(f" {'TOTAL':12} {total:>8}") def main() -> int: args = parse_args() try: import psycopg2 except ImportError: print("error: psycopg2 not installed", file=sys.stderr) return 2 conn = psycopg2.connect( host=args.db_host, port=args.db_port, user=args.db_user, dbname=args.db_name, password=args.db_password, ) conn.autocommit = False cur = conn.cursor() print("=" * 60) print(" Backfill — license_rule via parent inheritance") print(f" Mode: {'DRY-RUN' if args.dry_run else 'APPLY'}") print("=" * 60) # Initial bucket report cur.execute(SQL_REPORT_NULLS) rows = cur.fetchall() print_bucket(rows, "Initial NULL distribution") if args.dry_run: # Print what the FIRST inherit pass would resolve (without writing) cur.execute( "SET search_path TO compliance, public; " "SELECT p.license_rule, COUNT(*) " "FROM canonical_controls cc " "JOIN canonical_controls p ON cc.parent_control_uuid = p.id " "WHERE cc.license_rule IS NULL AND p.license_rule IS NOT NULL " "GROUP BY 1 ORDER BY 1;" ) print("\n## First inherit-pass would fill:") for rule, n in cur.fetchall(): print(f" rule={rule} {n:>8} rows") # Show orphan clusters that would remain cur.execute(SQL_REPORT_ORPHAN_CLUSTERS) print("\n## Orphan clusters (no parent + no rule, top 25):") for cat, pid, gen, n in cur.fetchall(): print(f" cat={cat[:20]:20} pat={pid[:20]:20} gen={gen[:20]:20} n={n}") print("\nNo writes performed. Use --apply to execute.") conn.rollback() return 0 # Apply mode — iterative inheritance total_updated = 0 for i in range(1, args.max_iterations + 1): cur.execute(SQL_INHERIT_FROM_PARENT) updated = cur.rowcount total_updated += updated print(f"\n iteration {i}: {updated} rows updated") if updated == 0: break conn.commit() print(f"\n✓ Total rows backfilled: {total_updated}") # Final bucket report cur.execute(SQL_REPORT_NULLS) print_bucket(cur.fetchall(), "Remaining NULL distribution") cur.execute(SQL_REPORT_ORPHAN_CLUSTERS) rows = cur.fetchall() if rows: print("\n## Orphan clusters still need classification:") for cat, pid, gen, n in rows: print(f" cat={cat[:20]:20} pat={pid[:20]:20} gen={gen[:20]:20} n={n}") return 0 if __name__ == "__main__": raise SystemExit(main())