Files
breakpilot-compliance/scripts/qa/sync_controls_to_prod.py
Benjamin Admin 9b0f25c105
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 36s
CI/CD / test-python-backend-compliance (push) Successful in 32s
CI/CD / test-python-document-crawler (push) Successful in 22s
CI/CD / test-python-dsms-gateway (push) Successful in 19s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
chore(qa): add PDF-based control QA scripts and results
QA pipeline that matches control source_original_text directly against
original PDF documents to verify article/paragraph assignments. Covers
backfill, dedup, source normalization, Qdrant cleanup, and prod sync.

Key results (2026-03-20):
- 4,110/7,943 controls matched to PDF (100% for major EU regs)
- 3,366 article corrections, 705 new assignments
- 1,290 controls from Erwägungsgründe (preamble) identified
- 779 controls from Anhänge (annexes) identified

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-20 00:56:13 +01:00

207 lines
10 KiB
Python

"""
Sync controls from Mac Mini (local) to Production (Hetzner).
Both have PostgreSQL. Mac Mini has 6,373 active controls, Production ~3,159.
Strategy:
1. Export all non-duplicate/non-too_close controls from Mac Mini
2. Upsert into Production (ON CONFLICT update, preserve production-only data)
3. Mark controls on Production that don't exist on Mac Mini as deprecated
"""
import json
import os
import sys
from datetime import datetime
from sqlalchemy import create_engine, text as sql_text
# Mac Mini DB (local)
LOCAL_DB = os.environ['DATABASE_URL']
# Production DB (Hetzner) — same env var format
PROD_DB = os.environ.get('PROD_DATABASE_URL', '')
if not PROD_DB:
print("ERROR: PROD_DATABASE_URL not set")
print("Please provide the production database URL")
sys.exit(1)
DRY_RUN = '--dry-run' in sys.argv
local_engine = create_engine(LOCAL_DB, connect_args={"options": "-c search_path=compliance,public"})
prod_engine = create_engine(PROD_DB, connect_args={"options": "-c search_path=compliance,public"})
# ── Step 1: Export from Mac Mini ──────────────────────────────────────
print("=" * 60)
print("SYNC CONTROLS: Mac Mini → Production")
print("=" * 60)

with local_engine.connect() as local_conn:
    # Export every control — duplicates/too_close included, so production
    # learns about their state instead of keeping stale "active" copies.
    rows = local_conn.execute(sql_text("""
        SELECT id, framework_id, control_id, title, objective, rationale,
               scope, requirements, test_procedure, evidence,
               severity, risk_score, implementation_effort, evidence_confidence,
               open_anchors, release_state, tags, created_at, updated_at,
               license_rule, source_original_text, source_citation,
               customer_visible, generation_metadata, verification_method,
               category, target_audience, generation_strategy,
               pattern_id, obligation_ids, parent_control_uuid,
               decomposition_method, pipeline_version,
               applicable_industries, applicable_company_size, scope_conditions
        FROM compliance.canonical_controls
    """)).fetchall()

print(f" Local controls: {len(rows)}")

# Tally how many controls sit in each release_state (column 15 of the
# SELECT above) and print them, most common state first.
state_counts = {}
for exported in rows:
    state_counts[exported[15]] = state_counts.get(exported[15], 0) + 1
for state, count in sorted(state_counts.items(), key=lambda kv: -kv[1]):
    print(f" {state}: {count}")
# ── Step 2: Check Production state ───────────────────────────────────
with prod_engine.connect() as prod_conn:
    # Baseline row count before the sync, for before/after comparison.
    prod_count = prod_conn.execute(sql_text("""
        SELECT count(*) FROM compliance.canonical_controls
    """)).scalar()
    print(f"\n Production controls before sync: {prod_count}")

    # Sanity check: the framework row should already exist on production.
    framework_row = prod_conn.execute(sql_text("""
        SELECT id FROM compliance.canonical_control_frameworks
        WHERE framework_id = 'bp_security_v1' LIMIT 1
    """)).fetchone()
    if framework_row is None:
        print(" WARNING: Framework bp_security_v1 not found on production!")
    else:
        print(f" Framework bp_security_v1: {framework_row[0]}")
# ── Step 3: Upsert to Production ─────────────────────────────────────

# Column order matches the Step-1 SELECT and the INSERT column list below.
_COLUMNS = (
    "id", "framework_id", "control_id", "title", "objective", "rationale",
    "scope", "requirements", "test_procedure", "evidence",
    "severity", "risk_score", "implementation_effort", "evidence_confidence",
    "open_anchors", "release_state", "tags", "created_at", "updated_at",
    "license_rule", "source_original_text", "source_citation",
    "customer_visible", "generation_metadata", "verification_method",
    "category", "target_audience", "generation_strategy",
    "pattern_id", "obligation_ids", "parent_control_uuid",
    "decomposition_method", "pipeline_version",
    "applicable_industries", "applicable_company_size", "scope_conditions",
)

# Positions of columns that may arrive as dict/list (JSON/JSONB columns)
# and must be serialized before binding.
_JSON_IDX = frozenset({6, 7, 8, 9, 14, 16, 21, 23, 26, 29, 33, 34, 35})


def _as_json(value):
    """Serialize dict/list values for JSON(B) columns; pass scalars through."""
    if isinstance(value, (dict, list)):
        return json.dumps(value)
    return value


def _row_params(row):
    """Map one exported row (positional tuple) to named bind parameters."""
    return {
        name: (_as_json(row[i]) if i in _JSON_IDX else row[i])
        for i, name in enumerate(_COLUMNS)
    }


# Compiled once, not per row. On conflict we deliberately do NOT overwrite
# created_at, evidence_confidence, pattern_id, obligation_ids,
# parent_control_uuid or decomposition_method — production-only data is
# preserved (see module docstring).
_UPSERT_SQL = sql_text("""
    INSERT INTO compliance.canonical_controls (
        id, framework_id, control_id, title, objective, rationale,
        scope, requirements, test_procedure, evidence,
        severity, risk_score, implementation_effort, evidence_confidence,
        open_anchors, release_state, tags, created_at, updated_at,
        license_rule, source_original_text, source_citation,
        customer_visible, generation_metadata, verification_method,
        category, target_audience, generation_strategy,
        pattern_id, obligation_ids, parent_control_uuid,
        decomposition_method, pipeline_version,
        applicable_industries, applicable_company_size, scope_conditions
    ) VALUES (
        :id, :framework_id, :control_id, :title, :objective, :rationale,
        :scope, :requirements, :test_procedure, :evidence,
        :severity, :risk_score, :implementation_effort, :evidence_confidence,
        :open_anchors, :release_state, :tags, :created_at, :updated_at,
        :license_rule, :source_original_text, :source_citation,
        :customer_visible, :generation_metadata, :verification_method,
        :category, :target_audience, :generation_strategy,
        :pattern_id, :obligation_ids, :parent_control_uuid,
        :decomposition_method, :pipeline_version,
        :applicable_industries, :applicable_company_size, :scope_conditions
    )
    ON CONFLICT (id) DO UPDATE SET
        title = EXCLUDED.title,
        objective = EXCLUDED.objective,
        rationale = EXCLUDED.rationale,
        scope = EXCLUDED.scope,
        requirements = EXCLUDED.requirements,
        test_procedure = EXCLUDED.test_procedure,
        evidence = EXCLUDED.evidence,
        severity = EXCLUDED.severity,
        risk_score = EXCLUDED.risk_score,
        implementation_effort = EXCLUDED.implementation_effort,
        open_anchors = EXCLUDED.open_anchors,
        release_state = EXCLUDED.release_state,
        tags = EXCLUDED.tags,
        updated_at = EXCLUDED.updated_at,
        license_rule = EXCLUDED.license_rule,
        source_original_text = EXCLUDED.source_original_text,
        source_citation = EXCLUDED.source_citation,
        customer_visible = EXCLUDED.customer_visible,
        generation_metadata = EXCLUDED.generation_metadata,
        verification_method = EXCLUDED.verification_method,
        category = EXCLUDED.category,
        target_audience = EXCLUDED.target_audience,
        generation_strategy = EXCLUDED.generation_strategy,
        pipeline_version = EXCLUDED.pipeline_version,
        applicable_industries = EXCLUDED.applicable_industries,
        applicable_company_size = EXCLUDED.applicable_company_size,
        scope_conditions = EXCLUDED.scope_conditions
""")

print(f"\n Syncing {len(rows)} controls to production...")
if DRY_RUN:
    # Honor --dry-run (previously parsed but ignored): report and skip writes.
    print(f" DRY RUN: would upsert {len(rows)} controls; no writes performed.")
else:
    with prod_engine.begin() as prod_conn:
        errors = 0
        for i, row in enumerate(rows):
            try:
                # Per-row SAVEPOINT: without it, the first SQL error aborts
                # the whole transaction and PostgreSQL rejects every later
                # statement ("current transaction is aborted"), so one bad
                # row would sink the entire sync. begin_nested() rolls back
                # only the failing row and lets the rest commit.
                with prod_conn.begin_nested():
                    prod_conn.execute(_UPSERT_SQL, _row_params(row))
            except Exception as e:
                errors += 1
                if errors <= 5:
                    # row[2] is control_id — enough to locate the bad record.
                    print(f" ERROR on {row[2]}: {str(e)[:100]}")
            if (i + 1) % 1000 == 0:
                sys.stdout.write(f"\r Progress: {i+1}/{len(rows)} (errors: {errors})")
                sys.stdout.flush()
        print(f"\r Synced: {len(rows)} controls (errors: {errors})")
# ── Step 4: Verify ───────────────────────────────────────────────────
with prod_engine.connect() as prod_conn:
    # Per-state breakdown after the sync, largest group first.
    state_rows = prod_conn.execute(sql_text("""
        SELECT release_state, count(*)
        FROM compliance.canonical_controls
        GROUP BY release_state
        ORDER BY count(*) DESC
    """)).fetchall()

    print(f"\n === Production control states after sync ===")
    total = 0
    for state, count in state_rows:
        print(f" {str(state):20s} {count:6d}")
        total += count
    print(f" {'TOTAL':20s} {total:6d}")

    # "Active" = anything not filtered out as duplicate/too_close/deprecated.
    active = prod_conn.execute(sql_text("""
        SELECT count(*) FROM compliance.canonical_controls
        WHERE release_state NOT IN ('duplicate', 'too_close', 'deprecated')
    """)).scalar()
    print(f"\n Active controls on production: {active}")