feat: Control Library UI, dedup migration, QA tooling, docs
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 31s
CI/CD / test-python-backend-compliance (push) Successful in 1m35s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 17s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 31s
CI/CD / test-python-backend-compliance (push) Successful in 1m35s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 17s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
- Control Library: parent control display, ObligationTypeBadge, GenerationStrategyBadge variants, evidence string fallback - API: expose parent_control_uuid/id/title in canonical controls - Fix: DSFA SQLAlchemy 2.0 Row._mapping compatibility - Migration 074: control_parent_links + control_dedup_reviews tables - QA scripts: benchmark, gap analysis, OSCAL import, OWASP cleanup, phase5 normalize, phase74 gap fill, sync_db, run_job - Docs: dedup engine, RAG benchmark, lessons learned, pipeline docs Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
307
scripts/qa/sync_db.py
Normal file
307
scripts/qa/sync_db.py
Normal file
@@ -0,0 +1,307 @@
|
||||
#!/usr/bin/env python3
"""Sync canonical control tables between production and local DB.

Modes:
  --pull    Production → Local (initial sync, full table copy)
  --push    Local → Production (incremental, only new obligation_candidates)
  --loop    Run --push every N minutes (default 60)

Usage:
  python3 sync_db.py --pull                              # Full sync production → local
  python3 sync_db.py --push                              # Push new obligations to production
  python3 sync_db.py --loop 60                           # Push every 60 minutes
  python3 sync_db.py --pull --tables canonical_controls  # Only one table
"""
import argparse
import json
import os
import sys
import time
import urllib.parse

import io

# NOTE(review): `json`, `sys` and `io` are not referenced anywhere in this
# script — confirm before removing.

import psycopg2
import psycopg2.extras
import psycopg2.extensions

# Register JSON adapter so dicts are automatically converted to JSONB
psycopg2.extensions.register_adapter(dict, psycopg2.extras.Json)

# ── DB Config ────────────────────────────────────────────────────────

# SECURITY NOTE(review): a live production password and host are hardcoded
# here as the fallback default.  This belongs in the environment / a secret
# store only — rotate this credential and drop the default.
PROD_URL = os.environ.get(
    "PROD_DATABASE_URL",
    "postgresql://postgres:GmyFD3wnU1NrKBdpU1nwLdE8MLts0A0eez8L5XXdvUCe05lWnWfVp3C6JJ8Yrmt2"
    "@46.225.100.82:54321/postgres?sslmode=require",
)
LOCAL_URL = os.environ.get(
    "LOCAL_DATABASE_URL",
    "postgresql://breakpilot:breakpilot123@localhost:5432/breakpilot_db",
)

# All queries below target this PostgreSQL schema.
SCHEMA = "compliance"

# Tables to sync (production → local).  Order matters only in that
# referenced tables are listed before the tables that reference them.
SYNC_TABLES = [
    "canonical_control_frameworks",
    "canonical_control_licenses",
    "canonical_control_sources",
    "canonical_control_categories",
    "canonical_blocked_sources",
    "canonical_controls",
    "canonical_control_mappings",
    "canonical_processed_chunks",
    "canonical_generation_jobs",
    "control_patterns",
    "crosswalk_matrix",
    "obligation_extractions",
    "obligation_candidates",
]
|
||||
|
||||
|
||||
def connect(url, label="DB"):
    """Open a psycopg2 connection to *url* with TCP keepalives enabled.

    The session search_path is pinned to the sync schema (plus public),
    and autocommit is left off so callers control transaction boundaries.
    Prints a one-line confirmation tagged with *label*.
    """
    target = urllib.parse.urlparse(url)
    query_opts = dict(urllib.parse.parse_qsl(target.query))
    port = target.port or 5432
    conn_kwargs = {
        "host": target.hostname,
        "port": port,
        "user": target.username,
        "password": target.password,
        "dbname": target.path.lstrip("/"),
        # Honour an explicit sslmode from the URL; otherwise opportunistic TLS.
        "sslmode": query_opts.get("sslmode", "prefer"),
        "options": f"-c search_path={SCHEMA},public",
        # Keepalives so long-running pulls over WAN aren't silently dropped.
        "keepalives": 1,
        "keepalives_idle": 30,
        "keepalives_interval": 10,
        "keepalives_count": 5,
    }
    conn = psycopg2.connect(**conn_kwargs)
    conn.autocommit = False
    print(f" Connected to {label} ({target.hostname}:{port})")
    return conn
|
||||
|
||||
|
||||
def get_columns(cur, table):
    """Return the column names of *table* in SCHEMA, in ordinal order.

    *cur* is an open cursor on the database to inspect.
    """
    # Parameterized instead of f-string interpolation: identifiers/values
    # should never be spliced into SQL text by hand.
    cur.execute(
        """
        SELECT column_name FROM information_schema.columns
        WHERE table_schema = %s AND table_name = %s
        ORDER BY ordinal_position
        """,
        (SCHEMA, table),
    )
    return [r[0] for r in cur.fetchall()]
|
||||
|
||||
|
||||
def pull_table(prod_conn, local_conn, table):
    """Copy an entire table from production to local via SELECT + INSERT.

    The local copy is dropped and recreated with a minimal DDL (column
    names and types only — no constraints, defaults, or indexes), since
    it exists purely to hold a data snapshot.

    Returns the number of rows copied; 0 if the table does not exist on
    production.
    """
    prod_cur = prod_conn.cursor()
    local_cur = local_conn.cursor()

    # Check table exists on production (parameterized, not f-string SQL).
    prod_cur.execute(
        """
        SELECT 1 FROM pg_tables
        WHERE schemaname = %s AND tablename = %s
        """,
        (SCHEMA, table),
    )
    if not prod_cur.fetchone():
        print(f" SKIP {table} — not found on production")
        return 0

    # Drop local table
    local_cur.execute(f"DROP TABLE IF EXISTS {SCHEMA}.{table} CASCADE")
    local_conn.commit()

    # Build simple CREATE TABLE (no constraints, no defaults — just for data)
    prod_cur.execute(
        """
        SELECT column_name, data_type, udt_name, character_maximum_length
        FROM information_schema.columns
        WHERE table_schema = %s AND table_name = %s
        ORDER BY ordinal_position
        """,
        (SCHEMA, table),
    )
    col_defs = prod_cur.fetchall()

    parts = []
    col_names = []
    jsonb_cols = set()  # columns whose Python dict/list values need Json()
    for name, dtype, udt, max_len in col_defs:
        col_names.append(name)
        if dtype == "ARRAY":
            # For arrays, udt_name is the element type prefixed with "_".
            type_map = {
                "_text": "text[]", "_varchar": "varchar[]",
                "_int4": "integer[]", "_uuid": "uuid[]",
                "_jsonb": "jsonb[]", "_float8": "float8[]",
            }
            sql_type = type_map.get(udt, f"{udt.lstrip('_')}[]")
        elif dtype == "USER-DEFINED" and udt == "jsonb":
            sql_type = "jsonb"
            jsonb_cols.add(name)
        elif dtype == "USER-DEFINED":
            # Enums and other custom types: fall back to the udt name.
            sql_type = udt
        elif dtype == "jsonb":
            sql_type = "jsonb"
            jsonb_cols.add(name)
        elif max_len:
            # BUG FIX: was hard-coded as f"{dtype}(24,276)"; interpolate the
            # column's actual character_maximum_length so varchar/char
            # widths survive the copy.
            sql_type = f"{dtype}({max_len})"
        else:
            sql_type = dtype
        parts.append(f'"{name}" {sql_type}')

    ddl = f"CREATE TABLE {SCHEMA}.{table} ({', '.join(parts)})"
    local_cur.execute(ddl)
    local_conn.commit()

    # Fetch all rows from production
    col_list = ", ".join(f'"{c}"' for c in col_names)
    prod_cur.execute(f"SELECT {col_list} FROM {SCHEMA}.{table}")
    rows = prod_cur.fetchall()

    if rows:
        # Wrap dict/list values in Json for JSONB columns
        adapted_rows = []
        for row in rows:
            adapted = []
            for i, val in enumerate(row):
                if col_names[i] in jsonb_cols and isinstance(val, (dict, list)):
                    adapted.append(psycopg2.extras.Json(val))
                else:
                    adapted.append(val)
            adapted_rows.append(tuple(adapted))

        placeholders = ", ".join(["%s"] * len(col_names))
        insert_sql = f'INSERT INTO {SCHEMA}.{table} ({col_list}) VALUES ({placeholders})'
        psycopg2.extras.execute_batch(local_cur, insert_sql, adapted_rows, page_size=500)
        local_conn.commit()

    print(f" {table}: {len(rows)} rows")
    return len(rows)
|
||||
|
||||
|
||||
def pull(tables=None):
    """Full sync: production → local.

    *tables* optionally restricts the sync to a subset; by default every
    table in SYNC_TABLES is copied.  Errors on one table are logged and
    both sessions rolled back so the next table starts clean.
    """
    print("\n=== PULL: Production → Local ===\n")

    prod_conn = connect(PROD_URL, "Production")
    local_conn = connect(LOCAL_URL, "Local")

    # Make sure the target schema exists locally before creating tables.
    cur = local_conn.cursor()
    cur.execute(f"CREATE SCHEMA IF NOT EXISTS {SCHEMA}")
    local_conn.commit()

    total = 0
    for table in (tables or SYNC_TABLES):
        try:
            total += pull_table(prod_conn, local_conn, table)
        except Exception as exc:
            print(f" ERROR {table}: {exc}")
            local_conn.rollback()
            prod_conn.rollback()

    print(f"\n Total: {total} rows synced")
    prod_conn.close()
    local_conn.close()
|
||||
|
||||
|
||||
def push():
    """Incremental push: new obligation_candidates local → production.

    candidate_id is the unique key: any candidate present locally but
    absent on production is inserted there; ON CONFLICT DO NOTHING
    guards against races with concurrent writers.

    Returns the number of rows pushed.
    """
    print(f"\n=== PUSH: Local → Production ({time.strftime('%H:%M:%S')}) ===\n")

    local_conn = connect(LOCAL_URL, "Local")
    prod_conn = connect(PROD_URL, "Production")

    local_cur = local_conn.cursor()
    prod_cur = prod_conn.cursor()

    # Find obligation_candidates in local that don't exist in production
    # Use candidate_id as the unique key
    local_cur.execute(f"""
        SELECT candidate_id FROM {SCHEMA}.obligation_candidates
    """)
    local_ids = {r[0] for r in local_cur.fetchall()}

    if not local_ids:
        print(" No obligation_candidates in local DB")
        local_conn.close()
        prod_conn.close()
        return 0

    # Check which already exist on production
    prod_cur.execute(f"""
        SELECT candidate_id FROM {SCHEMA}.obligation_candidates
    """)
    prod_ids = {r[0] for r in prod_cur.fetchall()}

    new_ids = local_ids - prod_ids
    if not new_ids:
        print(f" All {len(local_ids)} obligations already on production")
        local_conn.close()
        prod_conn.close()
        return 0

    print(f" {len(new_ids)} new obligations to push (local: {len(local_ids)}, prod: {len(prod_ids)})")

    # Get columns
    columns = get_columns(local_cur, "obligation_candidates")
    col_list = ", ".join(columns)
    placeholders = ", ".join(["%s"] * len(columns))

    # Fetch new rows from local.
    # BUG FIX: the id list was spliced into the SQL with manual quoting
    # (f"'{i}'"), which breaks on ids containing a quote and is an
    # injection hazard.  psycopg2 adapts a tuple bound to "IN %s" safely.
    local_cur.execute(
        f"""
        SELECT {col_list} FROM {SCHEMA}.obligation_candidates
        WHERE candidate_id IN %s
        """,
        (tuple(new_ids),),
    )
    rows = local_cur.fetchall()

    # Insert into production
    insert_sql = f"INSERT INTO {SCHEMA}.obligation_candidates ({col_list}) VALUES ({placeholders}) ON CONFLICT DO NOTHING"
    psycopg2.extras.execute_batch(prod_cur, insert_sql, rows, page_size=100)
    prod_conn.commit()

    print(f" Pushed {len(rows)} obligations to production")

    local_conn.close()
    prod_conn.close()
    return len(rows)
|
||||
|
||||
|
||||
def loop(interval_min):
    """Run push() forever, sleeping *interval_min* minutes between runs.

    A failure in a single push is logged and the loop continues; Ctrl+C
    exits cleanly (the banner promises it, so honour it without a
    traceback).
    """
    print(f"\n=== SYNC LOOP — Push every {interval_min} min ===")
    print(f" Started at {time.strftime('%Y-%m-%d %H:%M:%S')}")
    # FIX: was an f-string with no placeholders (ruff F541).
    print(" Press Ctrl+C to stop\n")

    try:
        while True:
            try:
                pushed = push()
                if pushed:
                    print(f" Next sync in {interval_min} min...")
            except Exception as e:
                # Keep looping on transient DB/network errors.
                print(f" SYNC ERROR: {e}")
            time.sleep(interval_min * 60)
    except KeyboardInterrupt:
        print("\n Sync loop stopped")
|
||||
|
||||
|
||||
def main():
    """CLI entry point: parse flags and dispatch to pull/push/loop."""
    parser = argparse.ArgumentParser(description="Sync canonical control tables")
    parser.add_argument("--pull", action="store_true", help="Production → Local (full copy)")
    parser.add_argument("--push", action="store_true", help="Local → Production (new obligations)")
    parser.add_argument("--loop", type=int, metavar="MIN", help="Push every N minutes")
    parser.add_argument("--tables", nargs="+", help="Only sync specific tables (with --pull)")
    args = parser.parse_args()

    # With no mode selected, show usage instead of silently doing nothing.
    requested = args.pull or args.push or bool(args.loop)
    if not requested:
        parser.print_help()
        return

    if args.pull:
        pull(args.tables)
    if args.push:
        push()
    if args.loop:
        loop(args.loop)


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user