652e3a65a3
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-consent (push) Successful in 36s
CI / test-python-voice (push) Successful in 33s
CI / test-bqas (push) Successful in 31s
Migrates ACTION_TYPES (26+8 types), _NEGATIVE_PATTERNS (22), _ACTION_SYNONYMS (65), and _OBJECT_SYNONYMS (75) from hardcoded dicts to DB tables. - SQL migration: 003_action_object_ontology.sql (3 tables) - Migration scripts: f2_migrate_actions.py (34 types, 145 synonyms), f3_migrate_objects.py (75 objects) - OntologyRegistry cache: 5min TTL, raises RuntimeError if empty (safe fallback to dicts) - control_ontology.classify_action/get_phase delegate to DB with dict fallback - control_dedup.normalize_action/normalize_object delegate to DB with dict fallback - 25 new tests, 446 total pass, 0 regressions Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
218 lines
7.3 KiB
Python
218 lines
7.3 KiB
Python
"""
|
|
DB-backed Action & Object Ontology Registry with in-memory cache.
|
|
|
|
Replaces hardcoded ACTION_TYPES, _NEGATIVE_PATTERNS, _ACTION_SYNONYMS,
|
|
and _OBJECT_SYNONYMS with PostgreSQL tables.
|
|
|
|
Cache TTL: 5 minutes. Thread-safe via simple timestamp check.
|
|
Falls back to hardcoded dicts if DB is unavailable.
|
|
"""
|
|
|
|
import logging
|
|
import re
|
|
import time
|
|
from typing import Optional
|
|
|
|
from sqlalchemy import text
|
|
from sqlalchemy.exc import SQLAlchemyError
|
|
|
|
from db.session import SessionLocal
|
|
|
|
# Module-level logger, named after this module per the logging convention.
logger = logging.getLogger(__name__)

# Seconds before the in-memory ontology cache is considered stale and a
# DB reload is attempted.
_CACHE_TTL_SECONDS = 300  # 5 minutes
|
|
|
|
|
|
class OntologyRegistry:
    """In-memory cache of action_types, action_synonyms, and object_synonyms.

    Loads the ontology tables from PostgreSQL and caches them for
    _CACHE_TTL_SECONDS. If a reload fails, the previous (stale) cache keeps
    serving lookups; if no data has ever been loaded, lookups raise
    RuntimeError so callers can fall back to their hardcoded dicts.
    """

    def __init__(self):
        # Action types: canonical_name → phase
        self._action_phases: dict[str, str] = {}
        # Alias → canonical action (for classify_action)
        self._alias_to_action: dict[str, str] = {}
        # (alias, action) pairs ordered longest-alias-first; precomputed at
        # load time so classify_action does not re-sort on every call.
        self._alias_pairs_sorted: list[tuple[str, str]] = []
        # Negative patterns: [(pattern, action_type)] ordered longest first
        self._negative_patterns: list[tuple[str, str]] = []
        # Action synonyms for dedup: synonym → canonical (for normalize_action)
        self._action_synonyms: dict[str, str] = {}
        # Object synonyms: synonym → canonical_token (for normalize_object)
        self._object_synonyms: dict[str, str] = {}
        # Sorted object keys (longest first) for substring matching
        self._object_keys_sorted: list[str] = []
        # time.monotonic() of the last successful load; 0.0 = never loaded
        self._loaded_at: float = 0.0

    def _is_stale(self) -> bool:
        """True if the cache is older than the TTL (or was never loaded)."""
        return (time.monotonic() - self._loaded_at) > _CACHE_TTL_SECONDS

    def _load(self) -> bool:
        """Load all ontology data from DB into memory.

        Returns True on success. On a DB error the existing cache is left
        untouched, so lookups keep serving the previous (stale) data.
        """
        try:
            db = SessionLocal()
            try:
                return self._load_from_db(db)
            finally:
                db.close()
        except SQLAlchemyError:
            logger.warning(
                "Failed to load ontology from DB — using stale cache",
                exc_info=True,
            )
            return False

    def _load_from_db(self, db) -> bool:
        """Load from an open DB session.

        All cache attributes are replaced only after every query succeeded,
        so a mid-load failure never leaves the cache half-updated.
        """
        # 1. Action types
        rows = db.execute(text(
            "SELECT canonical_name, phase FROM action_types"
        )).fetchall()
        action_phases = {r[0]: r[1] for r in rows}

        # 2. Action synonyms (aliases + negative patterns)
        rows = db.execute(text(
            "SELECT canonical_action, synonym, pattern_type FROM action_synonyms"
        )).fetchall()

        alias_to_action: dict[str, str] = {}
        negative_patterns: list[tuple[str, str]] = []
        action_synonyms: dict[str, str] = {}

        for canonical, synonym, ptype in rows:
            if ptype == "negative_pattern":
                negative_patterns.append((synonym, canonical))
            else:
                alias_to_action[synonym] = canonical
                action_synonyms[synonym] = canonical

        # Sort negative patterns: longest first (for priority matching)
        negative_patterns.sort(key=lambda x: -len(x[0]))
        # Pre-sort aliases longest-first so classify_action's substring scan
        # never has to sort per call.
        alias_pairs_sorted = sorted(
            alias_to_action.items(), key=lambda kv: -len(kv[0])
        )

        # 3. Object synonyms
        rows = db.execute(text(
            "SELECT canonical_token, synonym FROM object_synonyms"
        )).fetchall()
        object_synonyms = {r[1]: r[0] for r in rows}
        object_keys_sorted = sorted(object_synonyms.keys(), key=len, reverse=True)

        # Commit to cache
        self._action_phases = action_phases
        self._alias_to_action = alias_to_action
        self._alias_pairs_sorted = alias_pairs_sorted
        self._negative_patterns = negative_patterns
        self._action_synonyms = action_synonyms
        self._object_synonyms = object_synonyms
        self._object_keys_sorted = object_keys_sorted
        self._loaded_at = time.monotonic()

        logger.info(
            "Ontology loaded: %d action_types, %d aliases, %d neg_patterns, %d object_synonyms",
            len(action_phases), len(alias_to_action),
            len(negative_patterns), len(object_synonyms),
        )
        return True

    @property
    def is_loaded(self) -> bool:
        """True if the cache has any data."""
        return len(self._action_phases) > 0

    def _ensure_loaded(self) -> None:
        """Refresh a stale cache; raise RuntimeError if no data is available.

        A failed refresh on top of previously loaded data is tolerated
        (stale data keeps serving); only a completely empty cache raises.
        """
        if self._is_stale():
            self._load()
        if not self.is_loaded:
            raise RuntimeError("OntologyRegistry has no data")

    # ── Action Classification (replaces control_ontology.classify_action) ──

    def classify_action(self, text_input: str) -> str:
        """Classify text into a canonical action_type.

        Precedence: negative patterns (longest first) → exact alias match →
        longest alias found as a substring. Defaults to "implement" when
        nothing matches.
        """
        self._ensure_loaded()
        text_lower = text_input.lower().strip()

        # Check negative patterns first
        for pattern, action_type in self._negative_patterns:
            if pattern in text_lower:
                return action_type

        # Direct alias match
        if text_lower in self._alias_to_action:
            return self._alias_to_action[text_lower]

        # Substring match: the pair list is pre-sorted longest-first at load
        # time, so the first hit is the longest (ties: insertion order, same
        # as the previous per-call stable sort). Fallback sort covers a cache
        # populated without _alias_pairs_sorted (e.g. tests injecting dicts).
        pairs = self._alias_pairs_sorted or sorted(
            self._alias_to_action.items(), key=lambda kv: -len(kv[0])
        )
        for alias, action_type in pairs:
            if alias in text_lower:
                return action_type

        return "implement"

    def get_phase(self, action_type: str) -> str:
        """Get the control_phase for an action_type (default: "implementation")."""
        self._ensure_loaded()
        return self._action_phases.get(action_type, "implementation")

    # ── Action Normalization (replaces control_dedup.normalize_action) ──

    def normalize_action(self, action: str) -> str:
        """Normalize an action verb to a canonical English form.

        Lookup order: exact synonym → suffix-stripped form (German-style verb
        endings, e.g. "testen" → "test") → prefix match in either direction.
        Unknown verbs pass through lowercased/stripped.
        """
        self._ensure_loaded()
        if not action:
            return ""
        action = action.strip().lower()
        # Fix: check again AFTER stripping. Previously a whitespace-only
        # input reached the prefix loop below, where verb.startswith("")
        # is always True and an arbitrary canonical action was returned.
        if not action:
            return ""
        action_base = re.sub(r"(en|t|st|e|te|tet|end)$", "", action)

        if action in self._action_synonyms:
            return self._action_synonyms[action]
        if action_base in self._action_synonyms:
            return self._action_synonyms[action_base]

        # Loose prefix match in either direction; dict insertion order
        # decides between multiple candidates (first one wins).
        for verb, canonical in self._action_synonyms.items():
            if action.startswith(verb) or verb.startswith(action):
                return canonical

        return action

    # ── Object Normalization (replaces control_dedup.normalize_object) ──

    def normalize_object(self, obj: str) -> str:
        """Normalize an object to a canonical token.

        Exact synonym match first, then longest-phrase substring match;
        unknown objects pass through lowercased/stripped.
        """
        self._ensure_loaded()
        if not obj:
            return ""
        obj_lower = obj.strip().lower()

        # Exact match
        if obj_lower in self._object_synonyms:
            return self._object_synonyms[obj_lower]

        # Substring match (longest phrase first)
        for phrase in self._object_keys_sorted:
            if phrase in obj_lower:
                return self._object_synonyms[phrase]

        return obj_lower

    def get_action_types(self) -> dict[str, str]:
        """Return a copy of all action_type → phase mappings."""
        self._ensure_loaded()
        return dict(self._action_phases)

    def get_object_synonyms(self) -> dict[str, str]:
        """Return a copy of all object synonym → canonical mappings."""
        self._ensure_loaded()
        return dict(self._object_synonyms)
|
|
|
|
|
|
# Process-wide singleton, created lazily by get_ontology_registry().
_registry: Optional[OntologyRegistry] = None


def get_ontology_registry() -> OntologyRegistry:
    """Return the shared OntologyRegistry, constructing it on first use."""
    global _registry
    registry = _registry
    if registry is None:
        registry = OntologyRegistry()
        _registry = registry
    return registry
|