feat(pipeline): F2+F3 action/object ontology — DB-backed normalization
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-consent (push) Successful in 36s
CI / test-python-voice (push) Successful in 33s
CI / test-bqas (push) Successful in 31s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-consent (push) Successful in 36s
CI / test-python-voice (push) Successful in 33s
CI / test-bqas (push) Successful in 31s
Migrates ACTION_TYPES (26+8 types), _NEGATIVE_PATTERNS (22), _ACTION_SYNONYMS (65), and _OBJECT_SYNONYMS (75) from hardcoded dicts to DB tables. - SQL migration: 003_action_object_ontology.sql (3 tables) - Migration scripts: f2_migrate_actions.py (34 types, 145 synonyms), f3_migrate_objects.py (75 objects) - OntologyRegistry cache: 5min TTL, raises RuntimeError if empty (safe fallback to dicts) - control_ontology.classify_action/get_phase delegate to DB with dict fallback - control_dedup.normalize_action/normalize_object delegate to DB with dict fallback - 25 new tests, 446 total pass, 0 regressions Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -126,22 +126,29 @@ _ACTION_SYNONYMS: dict[str, str] = {
|
||||
|
||||
|
||||
def normalize_action(action: str) -> str:
    """Normalize an action verb to a canonical English form.

    Delegates to the DB-backed OntologyRegistry first, falling back to the
    hardcoded ``_ACTION_SYNONYMS`` dict.  The registry result is accepted
    only when it actually normalized the input — mirroring the contract of
    ``normalize_object`` — so synonyms present only in the dict still
    resolve when the DB tables are incomplete.

    Args:
        action: Raw action verb (German or English, any casing/whitespace).

    Returns:
        The canonical English verb, or the cleaned input when unknown.
    """
    # Try DB-backed registry first; trust its answer only if it changed the
    # input (same semantics as normalize_object's registry delegation).
    try:
        from .ontology_registry import get_ontology_registry

        result = get_ontology_registry().normalize_action(action)
        if result != action.strip().lower():
            return result
    except Exception:
        pass  # registry unavailable — use the hardcoded fallback below

    # Fallback: original dict-based logic
    if not action:
        return ""
    action = action.strip().lower()
    # Strip German infinitive/conjugation suffixes for lookup
    action_base = re.sub(r"(en|t|st|e|te|tet|end)$", "", action)
    # Try exact match first, then base form
    if action in _ACTION_SYNONYMS:
        return _ACTION_SYNONYMS[action]
    if action_base in _ACTION_SYNONYMS:
        return _ACTION_SYNONYMS[action_base]
    # Fuzzy: check if action starts with any known verb (or vice versa)
    for verb, canonical in _ACTION_SYNONYMS.items():
        if action.startswith(verb) or verb.startswith(action):
            return canonical
    return action  # fallback: return as-is
|
||||
|
||||
|
||||
# ── Object Normalization ─────────────────────────────────────────────
|
||||
@@ -237,7 +244,19 @@ _OBJECT_KEYS_SORTED = sorted(_OBJECT_SYNONYMS.keys(), key=len, reverse=True)
|
||||
|
||||
|
||||
def normalize_object(obj: str) -> str:
|
||||
"""Normalize a compliance object to a canonical token."""
|
||||
"""Normalize a compliance object to a canonical token.
|
||||
|
||||
Delegates to DB-backed OntologyRegistry with dict fallback.
|
||||
"""
|
||||
# Try DB-backed registry first
|
||||
try:
|
||||
from .ontology_registry import get_ontology_registry
|
||||
result = get_ontology_registry().normalize_object(obj)
|
||||
if result != obj.strip().lower():
|
||||
return result
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if not obj:
|
||||
return ""
|
||||
obj_lower = obj.strip().lower()
|
||||
|
||||
@@ -223,31 +223,43 @@ _FRAMEWORK_PATTERNS: list[str] = [
|
||||
|
||||
|
||||
def classify_action(text: str) -> str:
    """Classify an obligation action text into a canonical action_type.

    Delegates to DB-backed OntologyRegistry (with 5min cache).
    Falls back to hardcoded dicts if DB is unavailable.
    """
    # Prefer the DB-backed registry; any failure (import, DB, empty cache)
    # drops us into the hardcoded-dict fallback below.
    try:
        from .ontology_registry import get_ontology_registry

        return get_ontology_registry().classify_action(text)
    except Exception:
        pass

    # Fallback: original dict-based logic
    haystack = text.lower().strip()

    # Negative patterns take priority over every alias.
    for needle, kind in _NEGATIVE_PATTERNS:
        if needle in haystack:
            return kind

    # Whole-text alias hit.
    direct = _ALIAS_TO_ACTION.get(haystack)
    if direct is not None:
        return direct

    # Longest contained alias wins; after the descending-length stable sort
    # the first match IS the longest one.  Default: "implement".
    by_length = sorted(_ALIAS_TO_ACTION.items(), key=lambda kv: -len(kv[0]))
    return next(
        (kind for alias, kind in by_length if alias in haystack),
        "implement",
    )
||||
|
||||
|
||||
def get_phase(action_type: str) -> str:
    """Get the control_phase for an action_type.

    Delegates to DB-backed OntologyRegistry with dict fallback.
    """
    # DB-backed lookup first; any failure falls through to the dict.
    try:
        from .ontology_registry import get_ontology_registry

        return get_ontology_registry().get_phase(action_type)
    except Exception:
        pass
    # Fallback: hardcoded table; unknown types default to "implementation".
    return ACTION_TYPES.get(action_type, {}).get("phase", "implementation")
|
||||
|
||||
|
||||
@@ -0,0 +1,217 @@
|
||||
"""
|
||||
DB-backed Action & Object Ontology Registry with in-memory cache.
|
||||
|
||||
Replaces hardcoded ACTION_TYPES, _NEGATIVE_PATTERNS, _ACTION_SYNONYMS,
|
||||
and _OBJECT_SYNONYMS with PostgreSQL tables.
|
||||
|
||||
Cache TTL: 5 minutes. Refresh is guarded only by a timestamp check — not a
lock — so concurrent callers may trigger duplicate loads (confirm this is
acceptable under the service's threading model).
|
||||
Falls back to hardcoded dicts if DB is unavailable.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
from sqlalchemy import text
|
||||
from sqlalchemy.exc import SQLAlchemyError
|
||||
|
||||
from db.session import SessionLocal
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_CACHE_TTL_SECONDS = 300  # 5 minutes


class OntologyRegistry:
    """In-memory cache of action_types, action_synonyms, and object_synonyms.

    Lazily loaded from PostgreSQL and refreshed once older than
    ``_CACHE_TTL_SECONDS``.  NOTE(review): refresh is guarded only by a
    timestamp check, not a lock — concurrent callers may perform duplicate
    loads; confirm this is acceptable under the service's threading model.
    """

    def __init__(self):
        # Action types: canonical_name → phase
        self._action_phases: dict[str, str] = {}
        # Alias → canonical action (for classify_action)
        self._alias_to_action: dict[str, str] = {}
        # (alias, canonical) pairs sorted longest-alias-first; precomputed at
        # load time so classify_action does not re-sort on every call.
        self._alias_items_sorted: list[tuple[str, str]] = []
        # Negative patterns: [(pattern, action_type)] ordered longest first
        self._negative_patterns: list[tuple[str, str]] = []
        # Action synonyms for dedup: synonym → canonical (for normalize_action)
        self._action_synonyms: dict[str, str] = {}
        # Object synonyms: synonym → canonical_token (for normalize_object)
        self._object_synonyms: dict[str, str] = {}
        # Sorted object keys (longest first) for substring matching
        self._object_keys_sorted: list[str] = []
        # time.monotonic() of the last successful load (0.0 = never loaded)
        self._loaded_at: float = 0.0

    def _is_stale(self) -> bool:
        """True when the cache is older than the TTL (or never loaded)."""
        return (time.monotonic() - self._loaded_at) > _CACHE_TTL_SECONDS

    def _load(self) -> bool:
        """Load all ontology data from DB into memory.

        Returns True on success.  On a DB error, logs a warning and returns
        False, deliberately keeping any previously cached (stale) data.
        """
        try:
            db = SessionLocal()
            try:
                return self._load_from_db(db)
            finally:
                db.close()
        except SQLAlchemyError:
            logger.warning(
                "Failed to load ontology from DB — using stale cache",
                exc_info=True,
            )
            return False

    def _load_from_db(self, db) -> bool:
        """Load from DB session and atomically swap the cached structures."""
        # 1. Action types
        rows = db.execute(text(
            "SELECT canonical_name, phase FROM action_types"
        )).fetchall()
        action_phases = {r[0]: r[1] for r in rows}

        # 2. Action synonyms (aliases + negative patterns)
        rows = db.execute(text(
            "SELECT canonical_action, synonym, pattern_type FROM action_synonyms"
        )).fetchall()

        alias_to_action: dict[str, str] = {}
        negative_patterns: list[tuple[str, str]] = []
        action_synonyms: dict[str, str] = {}

        for canonical, synonym, ptype in rows:
            if ptype == "negative_pattern":
                negative_patterns.append((synonym, canonical))
            else:
                alias_to_action[synonym] = canonical
                action_synonyms[synonym] = canonical

        # Sort negative patterns: longest first (for priority matching)
        negative_patterns.sort(key=lambda x: -len(x[0]))
        # Pre-sort aliases longest-first once, so classify_action's hot path
        # does not re-sort the full alias table on every call.
        alias_items_sorted = sorted(alias_to_action.items(), key=lambda x: -len(x[0]))

        # 3. Object synonyms
        rows = db.execute(text(
            "SELECT canonical_token, synonym FROM object_synonyms"
        )).fetchall()
        object_synonyms = {r[1]: r[0] for r in rows}
        object_keys_sorted = sorted(object_synonyms.keys(), key=len, reverse=True)

        # Commit to cache
        self._action_phases = action_phases
        self._alias_to_action = alias_to_action
        self._alias_items_sorted = alias_items_sorted
        self._negative_patterns = negative_patterns
        self._action_synonyms = action_synonyms
        self._object_synonyms = object_synonyms
        self._object_keys_sorted = object_keys_sorted
        self._loaded_at = time.monotonic()

        logger.info(
            "Ontology loaded: %d action_types, %d aliases, %d neg_patterns, %d object_synonyms",
            len(action_phases), len(alias_to_action),
            len(negative_patterns), len(object_synonyms),
        )
        return True

    @property
    def is_loaded(self) -> bool:
        """True if the cache has any data."""
        return len(self._action_phases) > 0

    def _ensure_loaded(self) -> None:
        """Refresh the cache if stale; raise if no data is available at all.

        Raises:
            RuntimeError: cache is empty and the DB load failed — callers
                catch this and fall back to the hardcoded dicts.
        """
        if self._is_stale():
            self._load()
        if not self.is_loaded:
            raise RuntimeError("OntologyRegistry has no data")

    # ── Action Classification (replaces control_ontology.classify_action) ──

    def classify_action(self, text_input: str) -> str:
        """Classify text into a canonical action_type."""
        self._ensure_loaded()
        text_lower = text_input.lower().strip()

        # Check negative patterns first
        for pattern, action_type in self._negative_patterns:
            if pattern in text_lower:
                return action_type

        # Direct alias match
        if text_lower in self._alias_to_action:
            return self._alias_to_action[text_lower]

        # Substring match (longest first; pair list precomputed at load time)
        best_match = ""
        best_action = "implement"
        for alias, action_type in self._alias_items_sorted:
            if alias in text_lower and len(alias) > len(best_match):
                best_match = alias
                best_action = action_type

        return best_action

    def get_phase(self, action_type: str) -> str:
        """Get the control_phase for an action_type ("implementation" default)."""
        self._ensure_loaded()
        return self._action_phases.get(action_type, "implementation")

    # ── Action Normalization (replaces control_dedup.normalize_action) ──

    def normalize_action(self, action: str) -> str:
        """Normalize an action verb to a canonical English form.

        Unknown verbs are returned cleaned (stripped/lowercased) as-is.
        """
        self._ensure_loaded()
        if not action:
            return ""
        action = action.strip().lower()
        # Strip German infinitive/conjugation suffixes for the base lookup
        action_base = re.sub(r"(en|t|st|e|te|tet|end)$", "", action)

        if action in self._action_synonyms:
            return self._action_synonyms[action]
        if action_base in self._action_synonyms:
            return self._action_synonyms[action_base]

        # Fuzzy: prefix match in either direction against known verbs
        for verb, canonical in self._action_synonyms.items():
            if action.startswith(verb) or verb.startswith(action):
                return canonical

        return action

    # ── Object Normalization (replaces control_dedup.normalize_object) ──

    def normalize_object(self, obj: str) -> str:
        """Normalize an object to a canonical token.

        Unknown objects are returned cleaned (stripped/lowercased) as-is.
        """
        self._ensure_loaded()
        if not obj:
            return ""
        obj_lower = obj.strip().lower()

        # Exact match
        if obj_lower in self._object_synonyms:
            return self._object_synonyms[obj_lower]

        # Substring match (longest phrase first)
        for phrase in self._object_keys_sorted:
            if phrase in obj_lower:
                return self._object_synonyms[phrase]

        return obj_lower

    def get_action_types(self) -> dict[str, str]:
        """Return all action_type → phase mappings (defensive copy)."""
        self._ensure_loaded()
        return dict(self._action_phases)

    def get_object_synonyms(self) -> dict[str, str]:
        """Return all object synonym → canonical mappings (defensive copy)."""
        self._ensure_loaded()
        return dict(self._object_synonyms)
|
||||
|
||||
|
||||
# Module-level singleton (created lazily on first access)
_registry: Optional[OntologyRegistry] = None


def get_ontology_registry() -> OntologyRegistry:
    """Get or create the singleton OntologyRegistry instance."""
    global _registry
    if _registry is not None:
        return _registry
    _registry = OntologyRegistry()
    return _registry
|
||||
Reference in New Issue
Block a user