feat(pipeline): F2+F3 action/object ontology — DB-backed normalization
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-consent (push) Successful in 36s
CI / test-python-voice (push) Successful in 33s
CI / test-bqas (push) Successful in 31s

Migrates ACTION_TYPES (26+8 types), _NEGATIVE_PATTERNS (22), _ACTION_SYNONYMS
(65), and _OBJECT_SYNONYMS (75) from hardcoded dicts to DB tables.

- SQL migration: 003_action_object_ontology.sql (3 tables)
- Migration scripts: f2_migrate_actions.py (34 types, 145 synonyms), f3_migrate_objects.py (75 object synonyms)
- OntologyRegistry cache: 5min TTL, raises RuntimeError if empty (safe fallback to dicts)
- control_ontology.classify_action/get_phase delegate to DB with dict fallback
- control_dedup.normalize_action/normalize_object delegate to DB with dict fallback
- 25 new tests, 446 total pass, 0 regressions

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-05-03 23:47:53 +02:00
parent aab8eeb335
commit 652e3a65a3
7 changed files with 854 additions and 16 deletions
@@ -0,0 +1,100 @@
#!/usr/bin/env python3
"""
F3 Migration: Populate object_synonyms from hardcoded dict.
Source: _OBJECT_SYNONYMS (control_dedup.py) — 75 synonyms
Usage:
python3 scripts/f3_migrate_objects.py --dry-run
python3 scripts/f3_migrate_objects.py --db-host macmini
"""
import argparse
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from services.control_dedup import _OBJECT_SYNONYMS # noqa: E402
# Characters a lowercased synonym may consist of to count as "plain ASCII"
# in the language heuristic below.
_ASCII_TOKEN_CHARS = frozenset("abcdefghijklmnopqrstuvwxyz0123456789 -_")

# German terms written without umlauts; the plain-ASCII check alone would
# label them as English, so they are forced back to "de".
_KNOWN_GERMAN_ASCII = frozenset({
    "passwort", "kennwort", "zugangsdaten", "fernzugriff",
    "sitzung", "firewall", "netzwerk", "vorfall",
    "schwachstelle", "richtlinie", "schulung",
    "protokoll", "datensicherung", "wiederherstellung",
})


def _detect_language(lowered: str) -> str:
    """Return the language code ("en" or "de") for a lowercased synonym."""
    if lowered in _KNOWN_GERMAN_ASCII:
        return "de"
    # Only a-z, digits, space, "-" and "_": treated as English.
    if all(ch in _ASCII_TOKEN_CHARS for ch in lowered):
        return "en"
    # Anything else (umlauts, other punctuation, ...) is treated as German.
    return "de"


def build_rows(synonyms: "dict[str, str] | None" = None) -> list[dict]:
    """Build object_synonyms rows.

    Args:
        synonyms: Mapping of synonym -> canonical token. When omitted, the
            ``_OBJECT_SYNONYMS`` dict imported at the top of this file is used.

    Returns:
        One dict per mapping entry, in mapping order, with the keys
        ``canonical_token``, ``synonym`` (lowercased), ``language``
        ("de" or "en", chosen by a character/word-list heuristic) and
        ``source`` (always "migration").
    """
    # "is None" rather than truthiness so an explicitly empty mapping yields [].
    if synonyms is None:
        synonyms = _OBJECT_SYNONYMS

    rows = []
    for synonym, canonical in synonyms.items():
        lowered = synonym.lower()
        rows.append({
            "canonical_token": canonical,
            "synonym": lowered,
            "language": _detect_language(lowered),
            "source": "migration",
        })
    return rows
def insert_via_sqlalchemy(rows: list[dict], db_host: str) -> None:
    """Upsert rows into ``object_synonyms`` using SQLAlchemy.

    Args:
        rows: Parameter dicts providing the bind values ``canonical_token``,
            ``synonym``, ``language`` and ``source``.
        db_host: Host name of the PostgreSQL server. Port, database name and
            credentials are fixed in the URL below.

    All rows are written inside one transaction: it is committed once every
    statement has succeeded and rolled back if any statement raises. A row
    whose ``(synonym, language)`` pair already exists has its
    ``canonical_token`` and ``source`` overwritten.
    """
    from sqlalchemy import create_engine, text

    # NOTE(review): the database credentials are hard-coded in this URL;
    # consider supplying them via environment/configuration instead.
    url = "postgresql://breakpilot:breakpilot123@%s:5432/breakpilot_db" % db_host

    # Built once: the statement is identical for every row.
    upsert = text("""
        INSERT INTO object_synonyms
            (canonical_token, synonym, language, source)
        VALUES
            (:canonical_token, :synonym, :language, :source)
        ON CONFLICT (synonym, language) DO UPDATE SET
            canonical_token = EXCLUDED.canonical_token,
            source = EXCLUDED.source
    """)

    engine = create_engine(url)
    try:
        # engine.begin() commits on normal exit and rolls back on error.
        with engine.begin() as conn:
            conn.execute(text("SET search_path TO compliance, public"))
            for row in rows:
                conn.execute(upsert, row)
    finally:
        # Release the connection pool even when the transaction failed.
        engine.dispose()

    print("Inserted %d object_synonyms" % len(rows))
def main():
    """Command-line entry point.

    Builds the synonym rows, prints summary statistics (row count, number of
    distinct canonical tokens, the ten most frequent tokens) and, unless
    ``--dry-run`` was given, inserts the rows into the database on
    ``--db-host``.
    """
    cli = argparse.ArgumentParser(description="Migrate object synonyms")
    cli.add_argument("--dry-run", action="store_true", help="Print stats only")
    cli.add_argument("--db-host", default="localhost", help="PostgreSQL host")
    options = cli.parse_args()

    rows = build_rows()
    print("Object synonyms: %d" % len(rows))

    # Tally how many synonyms point at each canonical token.
    per_token = {}
    for entry in rows:
        token = entry["canonical_token"]
        if token in per_token:
            per_token[token] += 1
        else:
            per_token[token] = 1
    print("Unique canonical tokens: %d" % len(per_token))

    # Stable descending sort: tokens with equal counts keep insertion order.
    ranked = sorted(per_token.items(), key=lambda pair: pair[1], reverse=True)
    print("Top tokens: %s" % dict(ranked[:10]))

    if not options.dry_run:
        insert_via_sqlalchemy(rows, options.db_host)


if __name__ == "__main__":
    main()