""" Import compliance backup into local PostgreSQL. Fixes Python-style lists/dicts in JSONB fields to valid JSON. """ import ast import gzip import json import re import sys import psycopg2 DB_URL = "postgresql://breakpilot:breakpilot123@localhost:5432/breakpilot_db" BACKUP_PATH = "/tmp/compliance-db-2026-03-28_16-25-19.sql.gz" # Tables with JSONB columns that need Python→JSON conversion JSONB_TABLES = { "canonical_controls", "canonical_controls_pre_dedup", "obligation_candidates", "control_dedup_reviews", "canonical_generation_jobs", "canonical_processed_chunks", } def fix_python_value(val: str) -> str: """Convert Python repr to JSON string for JSONB fields.""" if val == "NULL": return None # Strip outer SQL quotes if val.startswith("'") and val.endswith("'"): # Unescape SQL single quotes inner = val[1:-1].replace("''", "'") else: return val # Try to parse as Python literal and convert to JSON try: obj = ast.literal_eval(inner) return json.dumps(obj, ensure_ascii=False) except (ValueError, SyntaxError): # Already valid JSON or plain string return inner def process_line(line: str, conn) -> bool: """Process a single SQL line. Returns True if it was an INSERT.""" line = line.strip() if not line.startswith("INSERT INTO"): if line.startswith("SET "): return False return False # Execute directly for non-JSONB tables table_match = re.match(r'INSERT INTO "(\w+)"', line) if not table_match: return False table = table_match.group(1) if table not in JSONB_TABLES: # Execute as-is try: with conn.cursor() as cur: cur.execute(line) return True except Exception as e: conn.rollback() return False # For JSONB tables: use psycopg2 parameterized query # Extract column names and values cols_match = re.match(r'INSERT INTO "\w+" \(([^)]+)\) VALUES \(', line) if not cols_match: return False col_names = [c.strip().strip('"') for c in cols_match.group(1).split(",")] # Extract VALUES portion vals_start = line.index("VALUES (") + 8 vals_str = line[vals_start:-2] # Remove trailing ); # Parse SQL values (handling nested quotes and parentheses) values = [] current = "" in_quote = False depth = 0 i = 0 while i < len(vals_str): c = vals_str[i] if in_quote: if c == "'" and i + 1 < len(vals_str) and vals_str[i + 1] == "'": current += "''" i += 2 continue elif c == "'": current += "'" in_quote = False else: current += c else: if c == "'": current += "'" in_quote = True elif c == "(" : depth += 1 current += c elif c == ")": depth -= 1 current += c elif c == "," and depth == 0: values.append(current.strip()) current = "" else: current += c i += 1 values.append(current.strip()) if len(values) != len(col_names): # Fallback: try direct execution try: with conn.cursor() as cur: cur.execute(line) return True except Exception: conn.rollback() return False # Convert values params = [] placeholders = [] for col, val in zip(col_names, values): if val == "NULL": params.append(None) placeholders.append("%s") elif val in ("TRUE", "true"): params.append(True) placeholders.append("%s") elif val in ("FALSE", "false"): params.append(False) placeholders.append("%s") elif val.startswith("'") and val.endswith("'"): inner = val[1:-1].replace("''", "'") # Check if this looks like a Python literal (list/dict) stripped = inner.strip() if stripped and stripped[0] in ("[", "{") and stripped not in ("[]", "{}"): try: obj = ast.literal_eval(inner) params.append(json.dumps(obj, ensure_ascii=False)) except (ValueError, SyntaxError): params.append(inner) else: params.append(inner) placeholders.append("%s") else: # Numeric or other try: if "." in val: params.append(float(val)) else: params.append(int(val)) except ValueError: params.append(val) placeholders.append("%s") col_list = ", ".join(f'"{c}"' for c in col_names) ph_list = ", ".join(placeholders) sql = f'INSERT INTO "{table}" ({col_list}) VALUES ({ph_list})' try: with conn.cursor() as cur: cur.execute(sql, params) return True except Exception as e: conn.rollback() if "duplicate key" not in str(e): print(f" ERROR [{table}]: {str(e)[:120]}", file=sys.stderr) return False def main(): conn = psycopg2.connect(DB_URL) conn.autocommit = True with conn.cursor() as cur: cur.execute("SET search_path TO compliance, public") total = 0 ok = 0 errors = 0 print(f"Reading {BACKUP_PATH}...") with gzip.open(BACKUP_PATH, "rt", encoding="utf-8") as f: buffer = "" for line in f: buffer += line if not buffer.rstrip().endswith(";"): continue # Complete SQL statement stmt = buffer.strip() buffer = "" if not stmt.startswith("INSERT"): continue total += 1 if process_line(stmt, conn): ok += 1 else: errors += 1 if total % 10000 == 0: print(f" {total:>8} processed, {ok} ok, {errors} errors") print(f"\nDONE: {total} total, {ok} ok, {errors} errors") conn.close() if __name__ == "__main__": main()