breakpilot-compliance/scripts/qa/sync_db.py
Benjamin Admin 643b26618f
feat: Control Library UI, dedup migration, QA tooling, docs
- Control Library: parent control display, ObligationTypeBadge,
  GenerationStrategyBadge variants, evidence string fallback
- API: expose parent_control_uuid/id/title in canonical controls
- Fix: DSFA SQLAlchemy 2.0 Row._mapping compatibility
- Migration 074: control_parent_links + control_dedup_reviews tables
- QA scripts: benchmark, gap analysis, OSCAL import, OWASP cleanup,
  phase5 normalize, phase74 gap fill, sync_db, run_job
- Docs: dedup engine, RAG benchmark, lessons learned, pipeline docs

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-21 11:56:08 +01:00


#!/usr/bin/env python3
"""Sync canonical control tables between production and local DB.

Modes:
    --pull  Production → Local (initial sync, full table copy)
    --push  Local → Production (incremental, only new obligation_candidates)
    --loop  Run --push every N minutes (default 60)

Usage:
    python3 sync_db.py --pull                              # Full sync production → local
    python3 sync_db.py --push                              # Push new obligations to production
    python3 sync_db.py --loop 60                           # Push every 60 minutes
    python3 sync_db.py --pull --tables canonical_controls  # Only one table
"""
import argparse
import json
import os
import sys
import time
import urllib.parse
import io
import psycopg2
import psycopg2.extras
import psycopg2.extensions
# Register JSON adapter so dicts are automatically converted to JSONB
psycopg2.extensions.register_adapter(dict, psycopg2.extras.Json)
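# With this adapter registered, a plain dict passed as a query parameter is
# serialized the same way as wrapping it explicitly, e.g. (illustrative only,
# hypothetical table name):
#   cur.execute("UPDATE some_table SET meta = %s", ({"source": "sync"},))
# behaves like passing psycopg2.extras.Json({"source": "sync"}).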
# ── DB Config ────────────────────────────────────────────────────────
PROD_URL = os.environ.get(
    "PROD_DATABASE_URL",
    "postgresql://postgres:GmyFD3wnU1NrKBdpU1nwLdE8MLts0A0eez8L5XXdvUCe05lWnWfVp3C6JJ8Yrmt2"
    "@46.225.100.82:54321/postgres?sslmode=require",
)
LOCAL_URL = os.environ.get(
    "LOCAL_DATABASE_URL",
    "postgresql://breakpilot:breakpilot123@localhost:5432/breakpilot_db",
)
SCHEMA = "compliance"
# Tables to sync (production → local)
SYNC_TABLES = [
    "canonical_control_frameworks",
    "canonical_control_licenses",
    "canonical_control_sources",
    "canonical_control_categories",
    "canonical_blocked_sources",
    "canonical_controls",
    "canonical_control_mappings",
    "canonical_processed_chunks",
    "canonical_generation_jobs",
    "control_patterns",
    "crosswalk_matrix",
    "obligation_extractions",
    "obligation_candidates",
]
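# The ordering roughly follows the reference structure (lookup tables first,
# then canonical_controls, then mappings and extraction output); since
# pull_table() creates the local copies without constraints, strict FK order
# is not actually required.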
def connect(url, label="DB"):
    """Open a psycopg2 connection with keepalives and the compliance search_path."""
    parsed = urllib.parse.urlparse(url)
    params = dict(urllib.parse.parse_qsl(parsed.query))
    conn = psycopg2.connect(
        host=parsed.hostname,
        port=parsed.port or 5432,
        user=parsed.username,
        password=parsed.password,
        dbname=parsed.path.lstrip("/"),
        sslmode=params.get("sslmode", "prefer"),
        options=f"-c search_path={SCHEMA},public",
        keepalives=1,
        keepalives_idle=30,
        keepalives_interval=10,
        keepalives_count=5,
    )
    conn.autocommit = False
    print(f" Connected to {label} ({parsed.hostname}:{parsed.port or 5432})")
    return conn
def get_columns(cur, table):
    """Return column names for a table, in ordinal position order."""
    cur.execute(f"""
        SELECT column_name FROM information_schema.columns
        WHERE table_schema = '{SCHEMA}' AND table_name = '{table}'
        ORDER BY ordinal_position
    """)
    return [r[0] for r in cur.fetchall()]
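# Table and schema names are interpolated directly into SQL here and in the
# sync functions below. That is acceptable for an operator tool whose names
# come from SYNC_TABLES, the SCHEMA constant, or the local --tables flag,
# never from untrusted input.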
def pull_table(prod_conn, local_conn, table):
    """Copy entire table from production to local via SELECT + INSERT."""
    prod_cur = prod_conn.cursor()
    local_cur = local_conn.cursor()
    # Check table exists on production
    prod_cur.execute(f"""
        SELECT 1 FROM pg_tables
        WHERE schemaname = '{SCHEMA}' AND tablename = '{table}'
    """)
    if not prod_cur.fetchone():
        print(f" SKIP {table} — not found on production")
        return 0
    # Drop local table
    local_cur.execute(f"DROP TABLE IF EXISTS {SCHEMA}.{table} CASCADE")
    local_conn.commit()
    # Build simple CREATE TABLE (no constraints, no defaults — just for data)
    prod_cur.execute(f"""
        SELECT column_name, data_type, udt_name, character_maximum_length
        FROM information_schema.columns
        WHERE table_schema = '{SCHEMA}' AND table_name = '{table}'
        ORDER BY ordinal_position
    """)
    col_defs = prod_cur.fetchall()
    parts = []
    col_names = []
    jsonb_cols = set()
    for name, dtype, udt, max_len in col_defs:
        col_names.append(name)
        if dtype == "ARRAY":
            type_map = {
                "_text": "text[]", "_varchar": "varchar[]",
                "_int4": "integer[]", "_uuid": "uuid[]",
                "_jsonb": "jsonb[]", "_float8": "float8[]",
            }
            sql_type = type_map.get(udt, f"{udt.lstrip('_')}[]")
        elif dtype == "USER-DEFINED" and udt == "jsonb":
            sql_type = "jsonb"
            jsonb_cols.add(name)
        elif dtype == "USER-DEFINED":
            sql_type = udt
        elif dtype == "jsonb":
            sql_type = "jsonb"
            jsonb_cols.add(name)
        elif max_len:
            sql_type = f"{dtype}({max_len})"
        else:
            sql_type = dtype
        parts.append(f'"{name}" {sql_type}')
    ddl = f"CREATE TABLE {SCHEMA}.{table} ({', '.join(parts)})"
    local_cur.execute(ddl)
    local_conn.commit()
    # Fetch all rows from production
    col_list = ", ".join(f'"{c}"' for c in col_names)
    prod_cur.execute(f"SELECT {col_list} FROM {SCHEMA}.{table}")
    rows = prod_cur.fetchall()
    if rows:
        # Wrap dict/list values in Json for JSONB columns
        adapted_rows = []
        for row in rows:
            adapted = []
            for i, val in enumerate(row):
                if col_names[i] in jsonb_cols and isinstance(val, (dict, list)):
                    adapted.append(psycopg2.extras.Json(val))
                else:
                    adapted.append(val)
            adapted_rows.append(tuple(adapted))
        placeholders = ", ".join(["%s"] * len(col_names))
        insert_sql = f'INSERT INTO {SCHEMA}.{table} ({col_list}) VALUES ({placeholders})'
        psycopg2.extras.execute_batch(local_cur, insert_sql, adapted_rows, page_size=500)
        local_conn.commit()
    print(f" {table}: {len(rows)} rows")
    return len(rows)
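# For large tables, Postgres COPY streaming would avoid per-row INSERTs. A
# minimal sketch of that alternative (hypothetical helper, not wired into any
# mode; assumes the target table already exists, e.g. created by pull_table):
def pull_table_via_copy(prod_conn, local_conn, table, col_names):
    """Stream one table through an in-memory CSV buffer using COPY."""
    col_list = ", ".join(f'"{c}"' for c in col_names)
    buf = io.StringIO()
    with prod_conn.cursor() as src, local_conn.cursor() as dst:
        # Export from production to the buffer, then replay into local
        src.copy_expert(f"COPY (SELECT {col_list} FROM {SCHEMA}.{table}) TO STDOUT WITH CSV", buf)
        buf.seek(0)
        dst.copy_expert(f"COPY {SCHEMA}.{table} ({col_list}) FROM STDIN WITH CSV", buf)
    local_conn.commit()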
def pull(tables=None):
    """Full sync: production → local."""
    print("\n=== PULL: Production → Local ===\n")
    prod_conn = connect(PROD_URL, "Production")
    local_conn = connect(LOCAL_URL, "Local")
    # Ensure schema exists
    local_cur = local_conn.cursor()
    local_cur.execute(f"CREATE SCHEMA IF NOT EXISTS {SCHEMA}")
    local_conn.commit()
    sync_list = tables if tables else SYNC_TABLES
    total = 0
    for table in sync_list:
        try:
            count = pull_table(prod_conn, local_conn, table)
            total += count
        except Exception as e:
            print(f" ERROR {table}: {e}")
            local_conn.rollback()
            prod_conn.rollback()
    print(f"\n Total: {total} rows synced")
    prod_conn.close()
    local_conn.close()
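# A quick way to sanity-check a pull is to diff row counts per table; a small
# sketch (hypothetical QA helper, not called by any mode):
def compare_counts(prod_conn, local_conn, tables=None):
    """Print production vs. local row counts for each synced table."""
    with prod_conn.cursor() as pc, local_conn.cursor() as lc:
        for table in tables or SYNC_TABLES:
            pc.execute(f"SELECT count(*) FROM {SCHEMA}.{table}")
            lc.execute(f"SELECT count(*) FROM {SCHEMA}.{table}")
            prod_n, local_n = pc.fetchone()[0], lc.fetchone()[0]
            marker = "" if prod_n == local_n else "  <-- drift"
            print(f" {table}: prod={prod_n} local={local_n}{marker}")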
def push():
    """Incremental push: new obligation_candidates local → production."""
    print(f"\n=== PUSH: Local → Production ({time.strftime('%H:%M:%S')}) ===\n")
    local_conn = connect(LOCAL_URL, "Local")
    prod_conn = connect(PROD_URL, "Production")
    local_cur = local_conn.cursor()
    prod_cur = prod_conn.cursor()
    # Find obligation_candidates in local that don't exist in production
    # Use candidate_id as the unique key
    local_cur.execute(f"""
        SELECT candidate_id FROM {SCHEMA}.obligation_candidates
    """)
    local_ids = {r[0] for r in local_cur.fetchall()}
    if not local_ids:
        print(" No obligation_candidates in local DB")
        local_conn.close()
        prod_conn.close()
        return 0
    # Check which already exist on production
    prod_cur.execute(f"""
        SELECT candidate_id FROM {SCHEMA}.obligation_candidates
    """)
    prod_ids = {r[0] for r in prod_cur.fetchall()}
    new_ids = local_ids - prod_ids
    if not new_ids:
        print(f" All {len(local_ids)} obligations already on production")
        local_conn.close()
        prod_conn.close()
        return 0
    print(f" {len(new_ids)} new obligations to push (local: {len(local_ids)}, prod: {len(prod_ids)})")
    # Get columns
    columns = get_columns(local_cur, "obligation_candidates")
    col_list = ", ".join(columns)
    placeholders = ", ".join(["%s"] * len(columns))
    # Fetch new rows from local (the ids come straight from the DB itself,
    # so quoting them into the IN list is safe here)
    id_list = ", ".join(f"'{i}'" for i in new_ids)
    local_cur.execute(f"""
        SELECT {col_list} FROM {SCHEMA}.obligation_candidates
        WHERE candidate_id IN ({id_list})
    """)
    rows = local_cur.fetchall()
    # Insert into production
    insert_sql = f"INSERT INTO {SCHEMA}.obligation_candidates ({col_list}) VALUES ({placeholders}) ON CONFLICT DO NOTHING"
    psycopg2.extras.execute_batch(prod_cur, insert_sql, rows, page_size=100)
    prod_conn.commit()
    print(f" Pushed {len(rows)} obligations to production")
    local_conn.close()
    prod_conn.close()
    return len(rows)
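# If rows can change locally after being pushed, ON CONFLICT DO NOTHING keeps
# the stale production copy. A sketch of an upsert variant, assuming production
# has a unique constraint on candidate_id (the key the push logic already uses):
#   update_cols = [c for c in columns if c != "candidate_id"]
#   set_clause = ", ".join(f"{c} = EXCLUDED.{c}" for c in update_cols)
#   insert_sql = (
#       f"INSERT INTO {SCHEMA}.obligation_candidates ({col_list}) "
#       f"VALUES ({placeholders}) "
#       f"ON CONFLICT (candidate_id) DO UPDATE SET {set_clause}"
#   )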
def loop(interval_min):
    """Run push every N minutes."""
    print(f"\n=== SYNC LOOP — Push every {interval_min} min ===")
    print(f" Started at {time.strftime('%Y-%m-%d %H:%M:%S')}")
    print(" Press Ctrl+C to stop\n")
    while True:
        try:
            pushed = push()
            if pushed:
                print(f" Next sync in {interval_min} min...")
        except Exception as e:
            print(f" SYNC ERROR: {e}")
        time.sleep(interval_min * 60)
def main():
    parser = argparse.ArgumentParser(description="Sync canonical control tables")
    parser.add_argument("--pull", action="store_true", help="Production → Local (full copy)")
    parser.add_argument("--push", action="store_true", help="Local → Production (new obligations)")
    parser.add_argument("--loop", type=int, metavar="MIN", help="Push every N minutes")
    parser.add_argument("--tables", nargs="+", help="Only sync specific tables (with --pull)")
    args = parser.parse_args()
    if not any([args.pull, args.push, args.loop]):
        parser.print_help()
        return
    if args.pull:
        pull(args.tables)
    if args.push:
        push()
    if args.loop:
        loop(args.loop)


if __name__ == "__main__":
    main()
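# Note: --loop keeps a foreground process alive. As a hypothetical alternative,
# the push could be scheduled externally instead, e.g. via a crontab entry
# (deployment detail, not part of this script):
#   0 * * * * python3 /path/to/breakpilot-compliance/scripts/qa/sync_db.py --push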