#!/usr/bin/env python3
"""
F3 Migration: Populate object_synonyms from hardcoded dict.

Source: _OBJECT_SYNONYMS (control_dedup.py) — 75 synonyms

Usage:
    python3 scripts/f3_migrate_objects.py --dry-run
    python3 scripts/f3_migrate_objects.py --db-host macmini

Database credentials default to the values this script has always used and
can be overridden with the BREAKPILOT_DB_USER, BREAKPILOT_DB_PASSWORD and
BREAKPILOT_DB_NAME environment variables.
"""

import argparse
import os
import sys
from collections import Counter
from pathlib import Path
from typing import Mapping, Optional, Sequence

# Make the project root importable when the script is run from scripts/.
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

# Characters a synonym may consist of to be treated as "plain ASCII".
# NOTE(review): punctuation such as "/" or "." is not in this set, so a
# synonym containing it is tagged "de". Kept as-is because changing it would
# alter the (synonym, language) keys of rows that were already migrated.
_ASCII_TOKEN_CHARS = frozenset("abcdefghijklmnopqrstuvwxyz0123456789 -_")

# German words without umlauts that the ASCII heuristic would mislabel as "en".
_GERMAN_ASCII_WORDS = frozenset({
    "passwort", "kennwort", "zugangsdaten", "fernzugriff", "sitzung",
    "firewall", "netzwerk", "vorfall", "schwachstelle", "richtlinie",
    "schulung", "protokoll", "datensicherung", "wiederherstellung",
})

# Upsert keyed on (synonym, language); re-running the migration refreshes
# canonical_token and source instead of failing on the conflict.
_UPSERT_SQL = """
    INSERT INTO object_synonyms (canonical_token, synonym, language, source)
    VALUES (:canonical_token, :synonym, :language, :source)
    ON CONFLICT (synonym, language) DO UPDATE
    SET canonical_token = EXCLUDED.canonical_token,
        source = EXCLUDED.source
"""


def _load_default_synonyms() -> Mapping[str, str]:
    """Import and return the hardcoded synonym dict from control_dedup."""
    # Imported lazily so the pure row-building logic can be used (and tested)
    # without the project package being importable.
    from services.control_dedup import _OBJECT_SYNONYMS

    return _OBJECT_SYNONYMS


def _detect_language(lowered: str) -> str:
    """Return "en" or "de" for an already lower-cased synonym.

    Heuristic: anything outside the plain-ASCII character set (e.g. umlauts)
    is German; plain-ASCII text is English unless it is one of the known
    German words without umlauts.
    """
    if lowered in _GERMAN_ASCII_WORDS:
        return "de"
    if _ASCII_TOKEN_CHARS.issuperset(lowered):
        return "en"
    return "de"


def build_rows(synonyms: Optional[Mapping[str, str]] = None) -> list[dict]:
    """Build object_synonyms rows.

    Args:
        synonyms: Mapping of synonym -> canonical token. When omitted, the
            hardcoded ``_OBJECT_SYNONYMS`` dict from ``control_dedup`` is used.

    Returns:
        One dict per synonym with the keys ``canonical_token``, ``synonym``
        (lower-cased), ``language`` ("de" or "en") and ``source``
        (always "migration"), in the iteration order of the mapping.
    """
    if synonyms is None:
        synonyms = _load_default_synonyms()

    rows = []
    for synonym, canonical in synonyms.items():
        lowered = synonym.lower()
        rows.append({
            "canonical_token": canonical,
            "synonym": lowered,
            "language": _detect_language(lowered),
            "source": "migration",
        })
    return rows


def insert_via_sqlalchemy(rows: list[dict], db_host: str) -> None:
    """Insert rows using SQLAlchemy.

    All rows are upserted inside a single transaction against the
    ``compliance`` schema: either every row is written or, if any statement
    raises, none is.

    Args:
        rows: Row dicts as produced by ``build_rows``.
        db_host: PostgreSQL host name; the port is fixed at 5432.
    """
    from sqlalchemy import create_engine, text
    from sqlalchemy.engine import URL

    # URL.create escapes special characters in the credentials, which plain
    # string interpolation into a URL does not.
    url = URL.create(
        "postgresql",
        username=os.environ.get("BREAKPILOT_DB_USER", "breakpilot"),
        password=os.environ.get("BREAKPILOT_DB_PASSWORD", "breakpilot123"),
        host=db_host,
        port=5432,
        database=os.environ.get("BREAKPILOT_DB_NAME", "breakpilot_db"),
    )
    engine = create_engine(url)
    upsert = text(_UPSERT_SQL)  # built once, reused for every row

    inserted = 0
    try:
        # engine.begin() commits on success and rolls back on any exception.
        with engine.begin() as conn:
            conn.execute(text("SET search_path TO compliance, public"))
            # Rows are executed one by one on purpose: lower-casing may yield
            # duplicate (synonym, language) keys, which a single multi-row
            # upsert statement would reject.
            for row in rows:
                conn.execute(upsert, row)
                inserted += 1
    finally:
        # Release pooled connections even when the migration fails.
        engine.dispose()

    print("Inserted %d object_synonyms" % inserted)


def main(argv: Optional[Sequence[str]] = None) -> None:
    """Command-line entry point: print statistics and optionally migrate.

    Args:
        argv: Argument list to parse; defaults to ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description="Migrate object synonyms")
    parser.add_argument("--dry-run", action="store_true", help="Print stats only")
    parser.add_argument("--db-host", default="localhost", help="PostgreSQL host")
    args = parser.parse_args(argv)

    rows = build_rows()
    print("Object synonyms: %d" % len(rows))

    # Number of synonyms per canonical token.
    per_canonical = Counter(row["canonical_token"] for row in rows)
    print("Unique canonical tokens: %d" % len(per_canonical))
    print("Top tokens: %s" % dict(per_canonical.most_common(10)))

    if args.dry_run:
        return

    insert_via_sqlalchemy(rows, args.db_host)


if __name__ == "__main__":
    main()