Files
breakpilot-core/control-pipeline/scripts/import_backup.py
Benjamin Admin 441d5740bd
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-consent (push) Successful in 35s
CI / test-python-voice (push) Successful in 33s
CI / test-bqas (push) Successful in 37s
CI / Deploy (push) Failing after 2s
feat: Applicability Engine + API-Filter + DB-Sync + Cleanup
- Applicability Engine (deterministisch, kein LLM): filtert Controls
  nach Branche, Unternehmensgröße, Scope-Signalen
- API-Filter auf GET /controls, /controls-count, /controls-meta
- POST /controls/applicable Endpoint für Company-Profile-Matching
- 35 Unit-Tests für Engine
- Port-8098-Konflikt mit Nginx gefixt (nur expose, kein Host-Port)
- CLAUDE.md: control-pipeline Dokumentation ergänzt
- 6 internationale Gesetze gelöscht (ES/FR/HU/NL/SE/CZ — nur DACH)
- DB-Backup-Import-Script (import_backup.py)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-09 21:58:17 +02:00

220 lines
6.2 KiB
Python

"""
Import compliance backup into local PostgreSQL.
Fixes Python-style lists/dicts in JSONB fields to valid JSON.
"""
import ast
import gzip
import json
import re
import sys
from typing import List, Optional

import psycopg2
# NOTE(review): credentials are hard-coded — acceptable only for a throwaway
# local environment; confirm this script is never pointed at shared infra.
DB_URL = "postgresql://breakpilot:breakpilot123@localhost:5432/breakpilot_db"
# Gzipped SQL dump to replay (pg_dump-style INSERT statements).
BACKUP_PATH = "/tmp/compliance-db-2026-03-28_16-25-19.sql.gz"
# Tables with JSONB columns that need Python→JSON conversion
JSONB_TABLES = {
    "canonical_controls",
    "canonical_controls_pre_dedup",
    "obligation_candidates",
    "control_dedup_reviews",
    "canonical_generation_jobs",
    "canonical_processed_chunks",
}
def fix_python_value(val: str) -> Optional[str]:
    """Convert a raw SQL value token to a JSON-safe string for JSONB fields.

    NOTE(review): this helper is currently unused — ``process_line``
    duplicates the same conversion inline. Kept for callers outside this
    chunk / interactive use.

    Args:
        val: One SQL value token, e.g. ``NULL``, ``'...'`` (single-quoted
            with ``''`` escapes), or an unquoted literal.

    Returns:
        ``None`` for SQL NULL (the original annotation claimed ``str``;
        fixed), a JSON string when the quoted content parses as a Python
        literal, otherwise the (unquoted) value unchanged.
    """
    if val == "NULL":
        return None
    # Only single-quoted values can carry Python reprs; anything else
    # (numbers, booleans, keywords) passes through untouched.
    if not (val.startswith("'") and val.endswith("'")):
        return val
    # Undo SQL escaping: '' -> '
    inner = val[1:-1].replace("''", "'")
    try:
        # Python repr (e.g. "['a', 'b']", "{'k': 1}") -> canonical JSON.
        obj = ast.literal_eval(inner)
        return json.dumps(obj, ensure_ascii=False)
    except (ValueError, SyntaxError):
        # Already valid JSON or a plain string — keep as-is.
        return inner
def _split_sql_values(vals_str: str) -> List[str]:
    """Split a SQL VALUES payload into individual value tokens.

    Respects single-quoted strings (with ``''`` escapes, kept intact for
    later unescaping) and nested parentheses, so commas embedded in values
    do not split them.
    """
    values = []
    current = ""
    in_quote = False
    depth = 0
    i = 0
    n = len(vals_str)
    while i < n:
        c = vals_str[i]
        if in_quote:
            if c == "'" and i + 1 < n and vals_str[i + 1] == "'":
                current += "''"  # escaped quote — consume both characters
                i += 2
                continue
            if c == "'":
                in_quote = False
            current += c
        else:
            if c == "'":
                in_quote = True
                current += c
            elif c == "(":
                depth += 1
                current += c
            elif c == ")":
                depth -= 1
                current += c
            elif c == "," and depth == 0:
                values.append(current.strip())
                current = ""
            else:
                current += c
        i += 1
    values.append(current.strip())
    return values


def _convert_sql_value(val: str):
    """Convert one SQL value token into a psycopg2 query parameter.

    NULL/TRUE/FALSE map to None/True/False. Quoted strings are unescaped;
    Python-repr lists/dicts are re-serialized as JSON for JSONB columns
    (empty "[]"/"{}" are already valid JSON and left alone). Bare tokens
    are parsed as int/float when possible, else passed through.
    """
    if val == "NULL":
        return None
    if val in ("TRUE", "true"):
        return True
    if val in ("FALSE", "false"):
        return False
    if val.startswith("'") and val.endswith("'"):
        inner = val[1:-1].replace("''", "'")
        stripped = inner.strip()
        if stripped and stripped[0] in ("[", "{") and stripped not in ("[]", "{}"):
            try:
                return json.dumps(ast.literal_eval(inner), ensure_ascii=False)
            except (ValueError, SyntaxError):
                return inner  # already valid JSON or a plain string
        return inner
    # Numeric or other bare literal.
    try:
        return float(val) if "." in val else int(val)
    except ValueError:
        return val


def process_line(line: str, conn) -> bool:
    """Process a single SQL statement; return True if an INSERT succeeded.

    Non-INSERT statements are ignored. INSERTs into tables without JSONB
    columns are executed verbatim; JSONB tables are re-parsed and replayed
    as parameterized inserts so Python-repr payloads become valid JSON.
    """
    line = line.strip()
    # Non-INSERT statements (SET, COPY, DDL, ...) are all skipped — the
    # original had a separate dead branch for "SET " that also returned False.
    if not line.startswith("INSERT INTO"):
        return False
    table_match = re.match(r'INSERT INTO "(\w+)"', line)
    if not table_match:
        return False
    table = table_match.group(1)
    if table not in JSONB_TABLES:
        # No JSONB columns — the dump line is already valid SQL.
        try:
            with conn.cursor() as cur:
                cur.execute(line)
            return True
        except Exception:
            conn.rollback()
            return False
    # Extract column names and the raw VALUES payload.
    cols_match = re.match(r'INSERT INTO "\w+" \(([^)]+)\) VALUES \(', line)
    if not cols_match:
        return False
    col_names = [c.strip().strip('"') for c in cols_match.group(1).split(",")]
    vals_start = line.index("VALUES (") + 8
    vals_str = line[vals_start:-2]  # drop the trailing ");"
    values = _split_sql_values(vals_str)
    if len(values) != len(col_names):
        # Parsing failed — fall back to executing the raw statement.
        try:
            with conn.cursor() as cur:
                cur.execute(line)
            return True
        except Exception:
            conn.rollback()
            return False
    params = [_convert_sql_value(v) for v in values]
    col_list = ", ".join(f'"{c}"' for c in col_names)
    ph_list = ", ".join(["%s"] * len(params))
    sql = f'INSERT INTO "{table}" ({col_list}) VALUES ({ph_list})'
    try:
        with conn.cursor() as cur:
            cur.execute(sql, params)
        return True
    except Exception as e:
        conn.rollback()
        # Duplicate keys are expected on re-runs; only report other errors.
        if "duplicate key" not in str(e):
            print(f" ERROR [{table}]: {str(e)[:120]}", file=sys.stderr)
        return False
def main():
    """Stream the gzipped SQL dump and replay its INSERT statements.

    Statements may span multiple lines; they are accumulated until a
    terminating ";" is seen. Runs with autocommit so each INSERT stands
    alone (failed rows don't poison a transaction).
    """
    conn = psycopg2.connect(DB_URL)
    conn.autocommit = True
    try:
        with conn.cursor() as cur:
            # Target schema first; fall back to public for shared objects.
            cur.execute("SET search_path TO compliance, public")
        total = 0
        ok = 0
        errors = 0
        print(f"Reading {BACKUP_PATH}...")
        with gzip.open(BACKUP_PATH, "rt", encoding="utf-8") as f:
            buffer = ""
            for raw_line in f:
                buffer += raw_line
                # Wait until the statement is complete.
                if not buffer.rstrip().endswith(";"):
                    continue
                stmt = buffer.strip()
                buffer = ""
                if not stmt.startswith("INSERT"):
                    continue  # only INSERTs are replayed
                total += 1
                if process_line(stmt, conn):
                    ok += 1
                else:
                    errors += 1
                if total % 10000 == 0:
                    print(f" {total:>8} processed, {ok} ok, {errors} errors")
        print(f"\nDONE: {total} total, {ok} ok, {errors} errors")
    finally:
        # Was leaked on any exception before; always release the connection.
        conn.close()
# Allow importing this module (e.g. to reuse fix_python_value) without
# triggering the import run.
if __name__ == "__main__":
    main()