feat(pipeline): F2+F3 action/object ontology — DB-backed normalization
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-consent (push) Successful in 36s
CI / test-python-voice (push) Successful in 33s
CI / test-bqas (push) Successful in 31s

Migrates ACTION_TYPES (26+8 types), _NEGATIVE_PATTERNS (22), _ACTION_SYNONYMS
(65), and _OBJECT_SYNONYMS (75) from hardcoded dicts to DB tables.

- SQL migration: 003_action_object_ontology.sql (3 tables)
- Migration scripts: f2_migrate_actions.py (34 types, 145 synonyms), f3_migrate_objects.py (75 objects)
- OntologyRegistry cache: 5min TTL; raises RuntimeError when it has no data, so callers can safely fall back to the hardcoded dicts
- control_ontology.classify_action/get_phase delegate to DB with dict fallback
- control_dedup.normalize_action/normalize_object delegate to DB with dict fallback
- 25 new tests, 446 total pass, 0 regressions

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-05-03 23:47:53 +02:00
parent aab8eeb335
commit 652e3a65a3
7 changed files with 854 additions and 16 deletions
@@ -0,0 +1,217 @@
"""
DB-backed Action & Object Ontology Registry with in-memory cache.
Replaces hardcoded ACTION_TYPES, _NEGATIVE_PATTERNS, _ACTION_SYNONYMS,
and _OBJECT_SYNONYMS with PostgreSQL tables.
Cache TTL: 5 minutes. Refreshes are guarded only by a timestamp check — concurrent reloads are possible but benign, since the cached structures are rebound wholesale.
Falls back to hardcoded dicts if DB is unavailable.
"""
import logging
import re
import time
from typing import Optional
from sqlalchemy import text
from sqlalchemy.exc import SQLAlchemyError
from db.session import SessionLocal
logger = logging.getLogger(__name__)
_CACHE_TTL_SECONDS = 300 # 5 minutes
class OntologyRegistry:
    """In-memory cache of action_types, action_synonyms, and object_synonyms.

    Data is loaded lazily from the DB and refreshed once it is older than
    ``_CACHE_TTL_SECONDS``. A failed refresh keeps serving the previously
    loaded (stale) data; if nothing was ever loaded, lookups raise
    ``RuntimeError`` so callers can fall back to the hardcoded dicts.
    """

    def __init__(self):
        # Action types: canonical_name → phase
        self._action_phases: dict[str, str] = {}
        # Alias → canonical action (for classify_action)
        self._alias_to_action: dict[str, str] = {}
        # Alias items pre-sorted longest-first (hoisted out of classify_action)
        self._alias_items_sorted: list[tuple[str, str]] = []
        # Negative patterns: [(pattern, action_type)] ordered longest first
        self._negative_patterns: list[tuple[str, str]] = []
        # Action synonyms for dedup: synonym → canonical (for normalize_action)
        self._action_synonyms: dict[str, str] = {}
        # Object synonyms: synonym → canonical_token (for normalize_object)
        self._object_synonyms: dict[str, str] = {}
        # Sorted object keys (longest first) for substring matching
        self._object_keys_sorted: list[str] = []
        # time.monotonic() of the last successful load; 0.0 = never loaded
        self._loaded_at: float = 0.0

    def _is_stale(self) -> bool:
        """True if the cached data is older than the TTL."""
        return (time.monotonic() - self._loaded_at) > _CACHE_TTL_SECONDS

    def _load(self) -> bool:
        """Load all ontology data from DB into memory.

        Returns True on success. On DB errors the existing cache is left
        untouched, so previously loaded (stale) data keeps being served.
        """
        try:
            db = SessionLocal()
            try:
                return self._load_from_db(db)
            finally:
                db.close()
        except SQLAlchemyError:
            logger.warning(
                "Failed to load ontology from DB — using stale cache",
                exc_info=True,
            )
            return False

    def _load_from_db(self, db) -> bool:
        """Load from DB session, then atomically rebind the cached structures."""
        # 1. Action types
        rows = db.execute(text(
            "SELECT canonical_name, phase FROM action_types"
        )).fetchall()
        action_phases = {r[0]: r[1] for r in rows}
        # 2. Action synonyms (aliases + negative patterns)
        rows = db.execute(text(
            "SELECT canonical_action, synonym, pattern_type FROM action_synonyms"
        )).fetchall()
        alias_to_action: dict[str, str] = {}
        negative_patterns: list[tuple[str, str]] = []
        action_synonyms: dict[str, str] = {}
        for canonical, synonym, ptype in rows:
            if ptype == "negative_pattern":
                negative_patterns.append((synonym, canonical))
            else:
                alias_to_action[synonym] = canonical
                action_synonyms[synonym] = canonical
        # Sort negative patterns: longest first (for priority matching)
        negative_patterns.sort(key=lambda x: -len(x[0]))
        # Pre-sort aliases longest-first once here, instead of re-sorting on
        # every classify_action call.
        alias_items_sorted = sorted(
            alias_to_action.items(), key=lambda x: -len(x[0])
        )
        # 3. Object synonyms
        rows = db.execute(text(
            "SELECT canonical_token, synonym FROM object_synonyms"
        )).fetchall()
        object_synonyms = {r[1]: r[0] for r in rows}
        object_keys_sorted = sorted(object_synonyms.keys(), key=len, reverse=True)
        # Commit to cache: rebind whole structures so concurrent readers
        # never observe a half-updated dict.
        self._action_phases = action_phases
        self._alias_to_action = alias_to_action
        self._alias_items_sorted = alias_items_sorted
        self._negative_patterns = negative_patterns
        self._action_synonyms = action_synonyms
        self._object_synonyms = object_synonyms
        self._object_keys_sorted = object_keys_sorted
        self._loaded_at = time.monotonic()
        logger.info(
            "Ontology loaded: %d action_types, %d aliases, %d neg_patterns, %d object_synonyms",
            len(action_phases), len(alias_to_action),
            len(negative_patterns), len(object_synonyms),
        )
        return True

    @property
    def is_loaded(self) -> bool:
        """True if the cache has any data."""
        return len(self._action_phases) > 0

    def _ensure_loaded(self) -> None:
        """Refresh the cache if needed; raise RuntimeError if still empty.

        BUGFIX: also attempt a load whenever nothing is cached yet.
        ``_loaded_at == 0.0`` is not reliably "stale", because
        ``time.monotonic()`` has an arbitrary epoch (on Linux it is seconds
        since boot) — shortly after boot the staleness check alone could
        skip the initial load and raise RuntimeError despite a healthy DB.
        """
        if not self.is_loaded or self._is_stale():
            self._load()
        if not self.is_loaded:
            raise RuntimeError("OntologyRegistry has no data")

    # ── Action Classification (replaces control_ontology.classify_action) ──
    def classify_action(self, text_input: str) -> str:
        """Classify text into a canonical action_type.

        Precedence: negative patterns (longest first), then exact alias
        match, then longest alias found as a substring. Defaults to
        "implement" when nothing matches.
        """
        self._ensure_loaded()
        text_lower = text_input.lower().strip()
        # Check negative patterns first
        for pattern, action_type in self._negative_patterns:
            if pattern in text_lower:
                return action_type
        # Direct alias match
        if text_lower in self._alias_to_action:
            return self._alias_to_action[text_lower]
        # Substring match. The alias list is pre-sorted longest-first at
        # load time; fall back to sorting on the fly in case only
        # _alias_to_action was populated (e.g. directly by tests).
        alias_items = self._alias_items_sorted or sorted(
            self._alias_to_action.items(), key=lambda x: -len(x[0])
        )
        # First containing alias is the longest one — equivalent to the
        # previous best-match scan, but stops early.
        for alias, action_type in alias_items:
            if alias in text_lower:
                return action_type
        return "implement"

    def get_phase(self, action_type: str) -> str:
        """Get the control_phase for an action_type (default: "implementation")."""
        self._ensure_loaded()
        return self._action_phases.get(action_type, "implementation")

    # ── Action Normalization (replaces control_dedup.normalize_action) ──
    def normalize_action(self, action: str) -> str:
        """Normalize an action verb to a canonical English form.

        Tries the exact verb, then the verb with a common inflection suffix
        stripped, then a mutual-prefix match against the synonym table.
        Unknown verbs are returned lowercased/stripped.
        """
        self._ensure_loaded()
        if not action:
            return ""
        action = action.strip().lower()
        # Strip a common verb ending (en/t/st/e/te/tet/end) — presumably
        # German conjugation suffixes; TODO confirm against migration data.
        action_base = re.sub(r"(en|t|st|e|te|tet|end)$", "", action)
        if action in self._action_synonyms:
            return self._action_synonyms[action]
        if action_base in self._action_synonyms:
            return self._action_synonyms[action_base]
        # Loose fallback: match when either string is a prefix of the other.
        for verb, canonical in self._action_synonyms.items():
            if action.startswith(verb) or verb.startswith(action):
                return canonical
        return action

    # ── Object Normalization (replaces control_dedup.normalize_object) ──
    def normalize_object(self, obj: str) -> str:
        """Normalize an object to a canonical token.

        Exact synonym match first, then longest-phrase substring match;
        unknown objects are returned lowercased/stripped.
        """
        self._ensure_loaded()
        if not obj:
            return ""
        obj_lower = obj.strip().lower()
        # Exact match
        if obj_lower in self._object_synonyms:
            return self._object_synonyms[obj_lower]
        # Substring match (longest phrase first)
        for phrase in self._object_keys_sorted:
            if phrase in obj_lower:
                return self._object_synonyms[phrase]
        return obj_lower

    def get_action_types(self) -> dict[str, str]:
        """Return a copy of all action_type → phase mappings."""
        self._ensure_loaded()
        return dict(self._action_phases)

    def get_object_synonyms(self) -> dict[str, str]:
        """Return a copy of all object synonym → canonical mappings."""
        self._ensure_loaded()
        return dict(self._object_synonyms)
# Process-wide singleton instance, created on first access.
_registry: Optional[OntologyRegistry] = None


def get_ontology_registry() -> OntologyRegistry:
    """Return the shared OntologyRegistry, constructing it lazily on first use."""
    global _registry
    registry = _registry
    if registry is None:
        registry = OntologyRegistry()
        _registry = registry
    return registry