# mypy: disable-error-code="no-any-return,arg-type"
"""Use-Case → Controls retrieval — the SHARED layer the document agents AND the
CRA finding-mapper query to pull the controls that belong to a topic.

Read-only over the existing ``mc_use_case_mappings`` seed (no schema change).
The seed is recall-oriented ("this MC comes from a law about the topic"); the
ranking here is a deterministic *precision proxy* — is_primary + mapping
confidence + cluster size, plus a keyword-relevance score derived from the
use-case registry. The LLM precision pass (Phase B) refines this later; the
ranking field stays the same so consumers do not change.
"""

from __future__ import annotations

from typing import Any, Optional

from sqlalchemy import text
from sqlalchemy.orm import Session

from compliance.data.use_case_registry import REGISTRY, is_valid_use_case
from compliance.domain import NotFoundError


def relevance_score(
    title: Optional[str],
    objective: Optional[str],
    keyword_tokens: tuple[str, ...],
    is_primary: Optional[bool],
    confidence: Optional[float],
) -> float:
    """Deterministic precision proxy in [0, 1]. Pure → unit-testable.

    Combines the recall signals already on the mapping (primary flag, mapping
    confidence) with a content signal: how many of the use-case's registry
    keyword tokens appear in the control's own representative text. The content
    term is what separates "actually about this topic" from "merely from a
    related law" — the core of the precision problem.

    Weights: 0.5 for a primary mapping, 0.3 × confidence, 0.2 × keyword score,
    where the keyword score saturates at three matching tokens.

    Args:
        title: Representative control title; ``None`` is treated as empty.
        objective: Representative control objective; ``None`` is treated as
            empty.
        keyword_tokens: Registry keyword tokens for the use-case. Matching is
            a case-insensitive substring test; empty tokens are ignored.
        is_primary: Whether the mapping is flagged primary; ``None`` counts as
            not primary.
        confidence: Mapping confidence; ``None`` counts as 0.0.

    Returns:
        The combined score, clamped to [0, 1] and rounded to 3 decimals.
    """
    haystack = f"{title or ''} {objective or ''}".lower()
    # Tolerate a missing token tuple instead of failing on iteration.
    tokens = keyword_tokens or ()
    # The haystack is lower-cased, so the tokens must be too — otherwise a
    # mixed-case registry token could never match.
    hits = sum(1 for kw in tokens if kw and kw.lower() in haystack)
    kw_score = min(hits / 3.0, 1.0) if tokens else 0.0
    score = (
        (0.5 if is_primary else 0.0)
        + 0.3 * float(confidence or 0.0)
        + 0.2 * kw_score
    )
    # Clamp on BOTH sides: an out-of-range (e.g. negative) confidence must not
    # push the result outside the documented [0, 1] interval.
    return round(min(max(score, 0.0), 1.0), 3)


# Representative member (most severe, then lowest control_id) carries the
# human-readable title/objective — master_controls.canonical_name is only the
# merge token, so we surface a real member control per master.
# Master-grain listing query. Bind parameters: ``:uc`` (use-case key),
# ``:primary_only`` (bool; when true only primary mappings are returned),
# ``:lim`` / ``:off`` (page window). The ORDER BY ends in ``mc.id`` so that
# LIMIT/OFFSET pages are stable — without a unique tiebreaker, rows that tie
# on (is_primary, confidence, total_controls) could repeat or go missing
# between pages. The primary-regulation subquery is ordered for the same
# reason: LIMIT 1 without ORDER BY picks an arbitrary row.
_LIST_SQL = text("""
    SELECT mc.id, mc.master_control_id, mc.canonical_name, mc.total_controls,
           m.is_primary, m.confidence,
           (SELECT r.source_regulation
              FROM mc_regulations r
             WHERE r.master_control_uuid = mc.id AND r.is_primary
             ORDER BY r.source_regulation
             LIMIT 1) AS primary_regulation,
           rep.title, rep.objective, rep.severity, rep.category
    FROM master_controls mc
    JOIN mc_use_case_mappings m
      ON m.master_control_uuid = mc.id AND m.use_case = :uc
    LEFT JOIN LATERAL (
        SELECT cc.title, cc.objective, cc.severity, cc.category
        FROM master_control_members mcm
        JOIN canonical_controls cc ON cc.id = mcm.control_uuid
        WHERE mcm.master_control_uuid = mc.id
        ORDER BY CASE cc.severity WHEN 'critical' THEN 0 WHEN 'high' THEN 1
                                  WHEN 'medium' THEN 2 ELSE 3 END,
                 cc.control_id
        LIMIT 1
    ) rep ON true
    WHERE (:primary_only = false OR m.is_primary)
    ORDER BY m.is_primary DESC, m.confidence DESC NULLS LAST,
             mc.total_controls DESC, mc.id
    LIMIT :lim OFFSET :off
""")

# Atom-grain path: the one-time Haiku classification (atom_classification) gives
# per-atom relevance + sub-topic. Far more precise + organized than the master
# seed. Preferred whenever the use-case has been processed.
# Atom-grain listing query. Bind parameters: ``:uc`` (use-case key), ``:sub``
# (optional sub-topic filter; NULL disables it), ``:lim`` / ``:off`` (page
# window). Both the lateral pick and the outer ORDER BY end in tiebreakers so
# LIMIT 1 and LIMIT/OFFSET return the same rows on every execution.
_ATOM_LIST_SQL = text("""
    SELECT ac.control_uuid, ac.sub_topic, ac.canonical_obligation,
           cc.control_id, cc.title, cc.objective, cc.severity,
           cpl.source_regulation, cpl.source_article
    FROM atom_classification ac
    JOIN canonical_controls cc ON cc.id = ac.control_uuid
    LEFT JOIN LATERAL (
        SELECT cpl.source_regulation, cpl.source_article
        FROM control_parent_links cpl
        WHERE cpl.control_uuid = ac.control_uuid
        ORDER BY cpl.source_regulation, cpl.source_article
        LIMIT 1
    ) cpl ON true
    WHERE ac.use_case = :uc AND ac.relevant = true
      AND (:sub IS NULL OR ac.sub_topic = :sub)
    ORDER BY ac.sub_topic NULLS LAST,
             CASE cc.severity WHEN 'critical' THEN 0 WHEN 'high' THEN 1
                              WHEN 'medium' THEN 2 ELSE 3 END,
             cc.title, cc.control_id, ac.control_uuid
    LIMIT :lim OFFSET :off
""")


class UseCaseControlsService:
    """Topic → controls retrieval over the seeded use-case mappings."""

    def __init__(self, db: Session) -> None:
        """Bind the service to the SQLAlchemy session used for all queries."""
        self.db = db

    def list_use_cases(self) -> list[dict[str, Any]]:
        """Registry use-cases with live counts — atom-grain (Haiku
        classification) plus the legacy master seed. Backs the coverage
        overview so every topic is visible with how many obligations it
        actually carries.

        Returns:
            One dict per enabled registry use-case, sorted descending by
            ``(atom_relevant, mapped_controls)``. Use-cases without any rows
            report zero counts.
        """
        counts = {
            row[0]: int(row[1])
            for row in self.db.execute(text(
                "SELECT use_case, count(*) FROM mc_use_case_mappings "
                "GROUP BY use_case"
            )).fetchall()
        }

        # use_case -> (total classified atoms, atoms flagged relevant).
        atom: dict[str, tuple[int, int]] = {}
        if self._atom_table_exists():
            atom = {
                row[0]: (int(row[1]), int(row[2]))
                for row in self.db.execute(text(
                    "SELECT use_case, count(*), count(*) FILTER (WHERE relevant) "
                    "FROM atom_classification GROUP BY use_case"
                )).fetchall()
            }

        out: list[dict[str, Any]] = []
        for uc in REGISTRY.values():
            if not uc.enabled:
                continue
            atom_total, atom_relevant = atom.get(uc.key, (0, 0))
            out.append({
                "key": uc.key,
                "label": uc.label,
                "group": uc.group,
                "regulations": list(uc.regulations),
                "verification_methods": list(uc.verification_methods),
                "mapped_controls": counts.get(uc.key, 0),
                "atom_total": atom_total,
                "atom_relevant": atom_relevant,
            })
        out.sort(
            key=lambda x: (x["atom_relevant"], x["mapped_controls"]),
            reverse=True,
        )
        return out

    def controls_for_use_case(
        self,
        use_case: str,
        primary_only: bool = False,
        limit: int = 50,
        offset: int = 0,
        sub_topic: Optional[str] = None,
    ) -> dict[str, Any]:
        """Controls for ``use_case``. Prefers the atom-grain Haiku
        classification (precise + sub-topic-organized) when present; falls
        back to the master-grain seed otherwise.

        Args:
            use_case: Registry key of the use-case.
            primary_only: Restrict to primary mappings. Only honoured on the
                master-grain fallback; the atom-grain path does not use it.
            limit: Page size, clamped to 1..200.
            offset: Page start, clamped to >= 0.
            sub_topic: Optional sub-topic filter. Only honoured on the
                atom-grain path; the master-grain fallback does not use it.

        Returns:
            A page envelope whose ``granularity`` is ``"atom"`` or
            ``"master"``; the shape of the ``controls`` entries differs
            between the two.

        Raises:
            NotFoundError: If ``use_case`` is not a valid registry key.
        """
        if not is_valid_use_case(use_case):
            raise NotFoundError(f"Unknown use_case '{use_case}'")
        uc = REGISTRY[use_case]
        lim = min(max(int(limit), 1), 200)
        off = max(int(offset), 0)

        if self._has_atom_grain(use_case):
            return self._atom_grain(uc, lim, off, sub_topic)

        # --- master-grain fallback (recall seed) ---
        # Only a constant fragment is appended; the value itself is bound.
        count_sql = (
            "SELECT count(*) FROM mc_use_case_mappings WHERE use_case = :uc"
            + (" AND is_primary" if primary_only else "")
        )
        total = self.db.execute(text(count_sql), {"uc": use_case}).scalar() or 0
        rows = self.db.execute(_LIST_SQL, {
            "uc": use_case,
            "primary_only": bool(primary_only),
            "lim": lim,
            "off": off,
        }).fetchall()

        controls = [
            {
                "id": str(r.id),
                "master_control_id": r.master_control_id,
                # canonical_name is only a fallback when the master has no
                # representative member row.
                "title": r.title or r.canonical_name,
                "objective": r.objective,
                "severity": r.severity,
                "category": r.category,
                "member_count": r.total_controls,
                "is_primary": bool(r.is_primary),
                "confidence": (
                    float(r.confidence) if r.confidence is not None else None
                ),
                "primary_regulation": r.primary_regulation,
                "relevance": relevance_score(
                    r.title, r.objective, uc.keyword_tokens,
                    r.is_primary, r.confidence,
                ),
            }
            for r in rows
        ]
        return {
            "use_case": uc.key,
            "label": uc.label,
            "group": uc.group,
            "granularity": "master",
            "total": int(total),
            "limit": lim,
            "offset": off,
            "primary_only": bool(primary_only),
            "controls": controls,
        }

    def _atom_table_exists(self) -> bool:
        """Return True when ``compliance.atom_classification`` exists."""
        # to_regclass yields NULL (-> None) for an unknown relation instead
        # of raising, so this probe is safe before the table is created.
        return self.db.execute(
            text("SELECT to_regclass('compliance.atom_classification')")
        ).scalar() is not None

    def _has_atom_grain(self, use_case: str) -> bool:
        """Return True when ``use_case`` has any atom classification rows.

        Rows count regardless of their ``relevant`` flag: a use-case that was
        classified but has no relevant atoms still takes the atom-grain path.
        """
        if not self._atom_table_exists():
            return False
        # EXISTS stops at the first matching row; no need to count them all.
        return bool(self.db.execute(
            text(
                "SELECT EXISTS (SELECT 1 FROM atom_classification "
                "WHERE use_case = :uc)"
            ),
            {"uc": use_case},
        ).scalar())

    def _atom_grain(
        self,
        uc: Any,
        lim: int,
        off: int,
        sub_topic: Optional[str],
    ) -> dict[str, Any]:
        """Build the atom-grain page for the registry entry ``uc``.

        Args:
            uc: Registry entry; ``key``, ``label`` and ``group`` are read.
            lim: Page size (already clamped by the caller).
            off: Page offset (already clamped by the caller).
            sub_topic: Optional sub-topic filter; ``None`` disables it.

        Returns:
            A page envelope with ``granularity == "atom"``. ``total`` honours
            the ``sub_topic`` filter; ``subtopic_counts`` always covers every
            sub-topic of the use-case.
        """
        total = self.db.execute(text(
            "SELECT count(*) FROM atom_classification "
            "WHERE use_case = :uc AND relevant = true "
            "AND (:sub IS NULL OR sub_topic = :sub)"
        ), {"uc": uc.key, "sub": sub_topic}).scalar() or 0

        # Facet counts deliberately ignore the sub_topic filter.
        # NOTE(review): NULL sub-topics are bucketed under "(none)", but
        # passing sub_topic="(none)" as a filter will not match those rows
        # (``sub_topic = :sub`` never matches NULL) — confirm whether
        # consumers rely on that.
        facet = {
            row[0]: int(row[1])
            for row in self.db.execute(text(
                "SELECT COALESCE(sub_topic, '(none)'), count(*) "
                "FROM atom_classification WHERE use_case = :uc AND relevant = true "
                "GROUP BY 1 ORDER BY 2 DESC"
            ), {"uc": uc.key}).fetchall()
        }

        rows = self.db.execute(_ATOM_LIST_SQL, {
            "uc": uc.key,
            "sub": sub_topic,
            "lim": lim,
            "off": off,
        }).fetchall()
        controls = [
            {
                "id": str(r.control_uuid),
                "control_id": r.control_id,
                "title": r.title,
                "objective": r.objective,
                "severity": r.severity,
                "sub_topic": r.sub_topic,
                "canonical_obligation": r.canonical_obligation,
                "source_regulation": r.source_regulation,
                "source_article": r.source_article,
            }
            for r in rows
        ]
        return {
            "use_case": uc.key,
            "label": uc.label,
            "group": uc.group,
            "granularity": "atom",
            "total": int(total),
            "limit": lim,
            "offset": off,
            "sub_topic": sub_topic,
            "subtopic_counts": facet,
            "controls": controls,
        }