# breakpilot-compliance/backend-compliance/compliance/services/use_case_controls.py

# mypy: disable-error-code="no-any-return,arg-type"
"""Use-Case → Controls retrieval — the SHARED layer the document agents AND the
CRA finding-mapper query to pull the controls that belong to a topic.

Read-only over the existing ``mc_use_case_mappings`` seed (no schema change).
The seed is recall-oriented ("this MC comes from a law about the topic"); the
ranking here is a deterministic *precision proxy* — is_primary + mapping
confidence + cluster size, plus a keyword-relevance score derived from the
use-case registry. The LLM precision pass (Phase B) refines this later; the
ranking field stays the same so consumers do not change.
"""

from __future__ import annotations

from typing import Any, Optional

from sqlalchemy import bindparam, text
from sqlalchemy.orm import Session

from compliance.data.use_case_registry import REGISTRY, is_valid_use_case
from compliance.domain import NotFoundError


def relevance_score(
    title: Optional[str],
    objective: Optional[str],
    keyword_tokens: tuple[str, ...],
    is_primary: Optional[bool],
    confidence: Optional[float],
) -> float:
    """Deterministic precision proxy in [0, 1]. Pure → unit-testable.

    Combines the recall signals already on the mapping (primary flag, mapping
    confidence) with a content signal: how many of the use-case's registry
    keyword tokens appear in the control's own representative text. The content
    term is what separates "actually about this topic" from "merely from a
    related law" — the core of the precision problem.

    Weights: 0.5 for a primary mapping, 0.3 × confidence, 0.2 × keyword score,
    where the keyword score saturates at three matching tokens.

    Args:
        title: Control title; ``None`` is treated as empty text.
        objective: Control objective; ``None`` is treated as empty text.
        keyword_tokens: Registry keywords; matched case-insensitively as
            substrings of ``title + objective``. Empty/``None`` → no content
            signal.
        is_primary: Truthy when the mapping is the primary one.
        confidence: Mapping confidence; ``None`` counts as 0.

    Returns:
        The score rounded to three decimals, clamped to [0, 1].
    """
    haystack = f"{title or ''} {objective or ''}".lower()
    tokens = keyword_tokens or ()
    # Lower-case the token as well: the haystack is lower-cased, so a token with
    # any upper-case character could otherwise never match.
    hits = sum(1 for kw in tokens if kw and kw.lower() in haystack)
    kw_score = min(hits / 3.0, 1.0) if tokens else 0.0
    score = (0.5 if is_primary else 0.0) + 0.3 * float(confidence or 0.0) + 0.2 * kw_score
    # Clamp on both sides so an out-of-range (negative) confidence cannot push
    # the result below the documented [0, 1] interval.
    return round(min(max(score, 0.0), 1.0), 3)


def tier_label(relevant: bool) -> str:
    """Map the relevance flag to a soft tier name.

    Truthy → 'core' (validated obligation); anything else → 'review' (still
    shown, but flagged for expert curation). This is a label, not a filter: the
    line between 'concrete' and 'generic' is fuzzy, and hiding 'review' dropped
    ~25% of the corpus, much of it real (filter validation 2026-06-15).
    """
    if relevant:
        return "core"
    return "review"


def source_type(license_rule: Optional[int]) -> str:
    """Provenance tag derived from the control's ``license_rule``.

    ``license_rule == 3`` → 'own_library' (self-written, no commercial source).
    Every other value, including ``None`` → 'derived' (lifted from a sourced
    document — license 1/2, the document is in source_regulation).
    """
    if license_rule == 3:
        return "own_library"
    return "derived"


_OUT_OF_SCOPE_ADDRESSEES = ("aufsichtsbefugnis", "staat_eu", "dritter", "meta")


def addressee_applicable(addressee: Optional[str]) -> bool:
    """An obligation is applicable to a (potential) customer unless its addressee
    is clearly someone else: a supervisory authority's power, a member state / EU
    institution, a foreign third party, or pure meta. NULL = not yet classified =
    treated as applicable (conservative — nothing hidden by default)."""
    return addressee not in _OUT_OF_SCOPE_ADDRESSEES


def addressee_is_gov(addressee: Optional[str]) -> bool:
    """Flag obligations whose obligor is a public body.

    This is an additive GOV hint (Kommune/Stadt = potential public-sector
    customer): the atom keeps its use-case, the flag is only a tag. True
    exactly when the addressee code is 'oeffentliche_stelle'.
    """
    public_body_code = "oeffentliche_stelle"
    return addressee == public_body_code


# Master-grain list query (fallback path). Binds: :uc (use-case key),
# :primary_only (bool; true keeps only primary mappings), :lim / :off (page).
#
# Representative member (most severe, then lowest control_id) carries the
# human-readable title/objective — master_controls.canonical_name is only the
# merge token, so we surface a real member control per master.
#
# The ORDER BY ends in mc.id as a tiebreaker: is_primary / confidence /
# total_controls can tie, and PostgreSQL does not promise a stable order for
# tied rows, so LIMIT/OFFSET pages could otherwise repeat or skip rows.
_LIST_SQL = text("""
    SELECT mc.id, mc.master_control_id, mc.canonical_name, mc.total_controls,
           m.is_primary, m.confidence,
           (SELECT r.source_regulation FROM mc_regulations r
              WHERE r.master_control_uuid = mc.id AND r.is_primary LIMIT 1)
              AS primary_regulation,
           rep.title, rep.objective, rep.severity, rep.category
    FROM master_controls mc
    JOIN mc_use_case_mappings m
      ON m.master_control_uuid = mc.id AND m.use_case = :uc
    LEFT JOIN LATERAL (
        SELECT cc.title, cc.objective, cc.severity, cc.category
        FROM master_control_members mcm
        JOIN canonical_controls cc ON cc.id = mcm.control_uuid
        WHERE mcm.master_control_uuid = mc.id
        ORDER BY CASE cc.severity WHEN 'critical' THEN 0 WHEN 'high' THEN 1
                                  WHEN 'medium' THEN 2 ELSE 3 END, cc.control_id
        LIMIT 1
    ) rep ON true
    WHERE (:primary_only = false OR m.is_primary)
    ORDER BY m.is_primary DESC, m.confidence DESC NULLS LAST,
             mc.total_controls DESC, mc.id
    LIMIT :lim OFFSET :off
""")


# Atom-grain path: the one-time Haiku classification (atom_classification) gives
# per-atom relevance + sub-topic. Far more precise + organized than the master
# seed. Preferred whenever the use-case has been processed.
#
# Binds: :uc, :all (true = include non-relevant 'review' atoms), :incl_oos
# (true = keep out-of-scope addressees), :incl_suppressed, :tenant (NULL never
# matches a suppression row, so nothing is marked suppressed), :sub (NULL = all
# sub-topics), :lim / :off.
#
# The ORDER BY ends in cc.control_id, ac.control_uuid as tiebreakers: titles
# and severities can tie, and without a fully determined order LIMIT/OFFSET
# pages could repeat or skip rows.
# NOTE(review): the control_parent_links lateral takes LIMIT 1 without an
# ORDER BY, so which parent link is shown is arbitrary when a control has
# several — confirm whether that is intended.
_ATOM_LIST_SQL = text("""
    SELECT ac.control_uuid, ac.sub_topic, ac.canonical_obligation, ac.relevant,
           ac.addressee, (cs.control_uuid IS NOT NULL) AS suppressed,
           cc.control_id, cc.title, cc.objective, cc.severity, cc.license_rule,
           cpl.source_regulation, cpl.source_article
    FROM atom_classification ac
    JOIN canonical_controls cc ON cc.id = ac.control_uuid
    LEFT JOIN control_suppressions cs
      ON cs.control_uuid = ac.control_uuid AND cs.tenant_id = :tenant AND cs.active
    LEFT JOIN LATERAL (
        SELECT cpl.source_regulation, cpl.source_article
        FROM control_parent_links cpl
        WHERE cpl.control_uuid = ac.control_uuid LIMIT 1
    ) cpl ON true
    WHERE ac.use_case = :uc AND (:all = true OR ac.relevant = true)
      AND (:incl_oos = true OR ac.addressee IS NULL
           OR ac.addressee NOT IN ('aufsichtsbefugnis','staat_eu','dritter','meta'))
      AND (:incl_suppressed = true OR cs.control_uuid IS NULL)
      AND (:sub IS NULL OR ac.sub_topic = :sub)
    ORDER BY ac.relevant DESC, ac.sub_topic NULLS LAST,
             CASE cc.severity WHEN 'critical' THEN 0 WHEN 'high' THEN 1
                              WHEN 'medium' THEN 2 ELSE 3 END, cc.title,
             cc.control_id, ac.control_uuid
    LIMIT :lim OFFSET :off
""")


# Same WHERE as the list (tier + addressee + suppression + sub-topic) → exact
# `total` for pagination, without arithmetic over overlapping filters.
# Takes the same binds as the list query minus the paging ones: :uc, :all,
# :incl_oos, :incl_suppressed, :tenant, :sub.
# NOTE(review): unlike the list query this does not join canonical_controls, so
# the count would exceed the listed rows if an atom had no matching canonical
# control — presumably ruled out by a foreign key; confirm.
_ATOM_COUNT_SQL = text("""
    SELECT count(*)
    FROM atom_classification ac
    LEFT JOIN control_suppressions cs
      ON cs.control_uuid = ac.control_uuid AND cs.tenant_id = :tenant AND cs.active
    WHERE ac.use_case = :uc AND (:all = true OR ac.relevant = true)
      AND (:incl_oos = true OR ac.addressee IS NULL
           OR ac.addressee NOT IN ('aufsichtsbefugnis','staat_eu','dritter','meta'))
      AND (:incl_suppressed = true OR cs.control_uuid IS NULL)
      AND (:sub IS NULL OR ac.sub_topic = :sub)
""")


# Breadth fast-path: top-N atom controls for MANY (use_case, sub_topic) pairs in
# ONE query. The CRA enrichment only needs this list — NOT the counts/facets/total
# that controls_for_use_case also computes (those are 5 extra aggregate scans per
# call, discarded by the caller). On prod (atom_classification currently lacks the
# (use_case, sub_topic) index after the DB swap) collapsing ~6 queries × N pairs
# into one scan is the difference between ~38s and a few seconds.
#
# Binds: :pairs (expanding list of (use_case, sub_topic) tuples), :per (rows
# kept per pair). Only relevant, in-scope atoms are considered.
#
# Determinism: the window order ends in cc.control_id so ties on severity/title
# always pick the same top-N, and the outer ORDER BY returns each pair's rows in
# rank order — a subquery's row order is not guaranteed to survive otherwise.
_ATOM_BREADTH_BATCH_SQL = text("""
    SELECT q.use_case, q.sub_topic, q.control_id, q.title, q.severity,
           q.source_regulation, q.source_article
    FROM (
        SELECT ac.use_case, ac.sub_topic, cc.control_id, cc.title, cc.severity,
               cpl.source_regulation, cpl.source_article,
               row_number() OVER (
                   PARTITION BY ac.use_case, ac.sub_topic
                   ORDER BY CASE cc.severity WHEN 'critical' THEN 0 WHEN 'high' THEN 1
                                             WHEN 'medium' THEN 2 ELSE 3 END, cc.title,
                            cc.control_id
               ) AS rn
        FROM atom_classification ac
        JOIN canonical_controls cc ON cc.id = ac.control_uuid
        LEFT JOIN LATERAL (
            SELECT cpl.source_regulation, cpl.source_article
            FROM control_parent_links cpl
            WHERE cpl.control_uuid = ac.control_uuid LIMIT 1
        ) cpl ON true
        WHERE ac.relevant = true
          AND (ac.addressee IS NULL OR ac.addressee NOT IN
               ('aufsichtsbefugnis','staat_eu','dritter','meta'))
          AND (ac.use_case, ac.sub_topic) IN :pairs
    ) q
    WHERE q.rn <= :per
    ORDER BY q.use_case, q.sub_topic, q.rn
""").bindparams(bindparam("pairs", expanding=True))

# Process-level memo: does the atom table exist? (never changes at runtime)
# Slot "v" is None until the first probe, then holds the cached True/False
# written by UseCaseControlsService._atom_table_exists. A dict is used so the
# value can be updated in place without a `global` statement.
_ATOM_TABLE_EXISTS: dict[str, Optional[bool]] = {"v": None}


class UseCaseControlsService:
    """Topic → controls retrieval over the seeded use-case mappings."""

    def __init__(self, db: Session) -> None:
        """Keep the SQLAlchemy session that every query of this service runs on."""
        self.db = db

    def _atom_table_exists(self) -> bool:
        if _ATOM_TABLE_EXISTS["v"] is None:
            _ATOM_TABLE_EXISTS["v"] = self.db.execute(
                text("SELECT to_regclass('compliance.atom_classification')")
            ).scalar() is not None
        return bool(_ATOM_TABLE_EXISTS["v"])

    def breadth_controls_batch(
        self, pairs, per: int = 3,
    ) -> dict[tuple[str, str], list[dict[str, Any]]]:
        """Top-``per`` atom controls for each (use_case, sub_topic) pair, in ONE
        query. Returns {(use_case, sub_topic): [control dicts]}. Best-effort:
        empty dict on any error or when the atom table is absent (caller then
        leaves breadth empty — never breaks the assessment)."""
        uniq = sorted({(uc, st) for uc, st in pairs if uc and st})
        if not uniq or not self._atom_table_exists():
            return {}
        try:
            rows = self.db.execute(
                _ATOM_BREADTH_BATCH_SQL,
                {"pairs": uniq, "per": min(max(int(per), 1), 50)},
            ).fetchall()
        except Exception:
            return {}
        out: dict[tuple[str, str], list[dict[str, Any]]] = {}
        for r in rows:
            out.setdefault((r.use_case, r.sub_topic), []).append({
                "control_id": r.control_id, "title": r.title,
                "source_regulation": r.source_regulation,
                "source_article": r.source_article,
                "severity": r.severity, "use_case": r.use_case,
            })
        return out

    def list_use_cases(self) -> list[dict[str, Any]]:
        """Registry use-cases with live counts — atom-grain (Haiku classification)
        plus the legacy master seed. Backs the coverage overview so every topic is
        visible with how many obligations it actually carries."""
        counts = {
            row[0]: int(row[1])
            for row in self.db.execute(text(
                "SELECT use_case, count(*) FROM mc_use_case_mappings "
                "GROUP BY use_case"
            )).fetchall()
        }
        atom: dict[str, tuple[int, int]] = {}
        if self.db.execute(text(
            "SELECT to_regclass('compliance.atom_classification')"
        )).scalar() is not None:
            atom = {
                row[0]: (int(row[1]), int(row[2]))
                for row in self.db.execute(text(
                    "SELECT use_case, count(*), count(*) FILTER (WHERE relevant) "
                    "FROM atom_classification GROUP BY use_case"
                )).fetchall()
            }
        out = [
            {
                "key": uc.key,
                "label": uc.label,
                "group": uc.group,
                "regulations": list(uc.regulations),
                "verification_methods": list(uc.verification_methods),
                "mapped_controls": counts.get(uc.key, 0),
                "atom_total": atom.get(uc.key, (0, 0))[0],
                "atom_relevant": atom.get(uc.key, (0, 0))[1],
            }
            for uc in REGISTRY.values() if uc.enabled
        ]
        out.sort(key=lambda x: (x["atom_relevant"], x["mapped_controls"]),
                 reverse=True)
        return out

    def controls_for_use_case(
        self,
        use_case: str,
        primary_only: bool = False,
        limit: int = 50,
        offset: int = 0,
        sub_topic: Optional[str] = None,
        tier: str = "core",
        include_out_of_scope: bool = False,
        tenant_id: Optional[str] = None,
        include_suppressed: bool = False,
    ) -> dict[str, Any]:
        """Controls for ``use_case``. Prefers the atom-grain Haiku classification
        (precise + sub-topic-organized) when present; falls back to the
        master-grain seed otherwise.

        ``tier`` (atom-grain only): 'core' = validated obligations only (default,
        keeps the agent/CRA callers precise); 'all' = everything incl. the
        'review' tier (shown, flagged) so the human browse view loses nothing.
        ``include_out_of_scope``: by default out-of-scope addressees (authority
        power / member-state / foreign / meta) are hidden (advisory, never
        deleted); set true to surface them.
        ``tenant_id`` (+ ``include_suppressed``): controls the tenant marked
        not-applicable are hidden by default (reversible, audited); None tenant
        (agent/CRA path) = no suppression filter."""
        if not is_valid_use_case(use_case):
            raise NotFoundError(f"Unknown use_case '{use_case}'")
        uc = REGISTRY[use_case]
        lim = min(max(int(limit), 1), 200)
        off = max(int(offset), 0)
        tier = tier if tier in ("core", "all") else "core"

        if self._has_atom_grain(use_case):
            return self._atom_grain(uc, lim, off, sub_topic, tier,
                                    bool(include_out_of_scope),
                                    tenant_id, bool(include_suppressed))

        # --- master-grain fallback (recall seed) ---
        count_sql = (
            "SELECT count(*) FROM mc_use_case_mappings WHERE use_case = :uc"
            + (" AND is_primary" if primary_only else "")
        )
        total = self.db.execute(text(count_sql), {"uc": use_case}).scalar() or 0
        rows = self.db.execute(_LIST_SQL, {
            "uc": use_case, "primary_only": bool(primary_only), "lim": lim, "off": off,
        }).fetchall()
        controls = [
            {
                "id": str(r.id),
                "master_control_id": r.master_control_id,
                "title": r.title or r.canonical_name,
                "objective": r.objective,
                "severity": r.severity,
                "category": r.category,
                "member_count": r.total_controls,
                "is_primary": bool(r.is_primary),
                "confidence": float(r.confidence) if r.confidence is not None else None,
                "primary_regulation": r.primary_regulation,
                "relevance": relevance_score(
                    r.title, r.objective, uc.keyword_tokens, r.is_primary, r.confidence,
                ),
            }
            for r in rows
        ]
        return {
            "use_case": uc.key, "label": uc.label, "group": uc.group,
            "granularity": "master", "total": int(total), "limit": lim, "offset": off,
            "primary_only": bool(primary_only), "controls": controls,
        }

    def _has_atom_grain(self, use_case: str) -> bool:
        if self.db.execute(
            text("SELECT to_regclass('compliance.atom_classification')")
        ).scalar() is None:
            return False
        return (self.db.execute(
            text("SELECT count(*) FROM atom_classification WHERE use_case = :uc"),
            {"uc": use_case},
        ).scalar() or 0) > 0

    def _atom_grain(
        self, uc, lim: int, off: int, sub_topic: Optional[str], tier: str = "core",
        include_out_of_scope: bool = False, tenant_id: Optional[str] = None,
        include_suppressed: bool = False,
    ) -> dict[str, Any]:
        all_flag = tier == "all"
        p = {"uc": uc.key, "all": all_flag, "incl_oos": include_out_of_scope,
             "incl_suppressed": include_suppressed, "tenant": tenant_id,
             "sub": sub_topic}
        counts = self.db.execute(text(
            "SELECT count(*) FILTER (WHERE relevant), "
            "count(*) FILTER (WHERE NOT relevant), "
            "count(*) FILTER (WHERE addressee IN "
            "  ('aufsichtsbefugnis','staat_eu','dritter','meta') "
            "  AND (:all = true OR relevant = true)) "
            "FROM atom_classification "
            "WHERE use_case = :uc AND (:sub IS NULL OR sub_topic = :sub)"
        ), {"uc": uc.key, "all": all_flag, "sub": sub_topic}).first()
        core_count = int((counts[0] if counts else 0) or 0)
        review_count = int((counts[1] if counts else 0) or 0)
        oos_count = int((counts[2] if counts else 0) or 0)
        suppressed_count = 0
        if tenant_id:
            suppressed_count = int(self.db.execute(text(
                "SELECT count(*) FROM atom_classification ac "
                "JOIN control_suppressions cs ON cs.control_uuid = ac.control_uuid "
                "  AND cs.tenant_id = :tenant AND cs.active "
                "WHERE ac.use_case = :uc AND (:all = true OR ac.relevant = true) "
                "AND (:sub IS NULL OR ac.sub_topic = :sub)"
            ), {"uc": uc.key, "all": all_flag, "tenant": tenant_id,
                "sub": sub_topic}).scalar() or 0)
        total = int(self.db.execute(_ATOM_COUNT_SQL, p).scalar() or 0)
        facet = {
            row[0]: int(row[1])
            for row in self.db.execute(text(
                "SELECT COALESCE(ac.sub_topic, '(none)'), count(*) "
                "FROM atom_classification ac "
                "LEFT JOIN control_suppressions cs ON cs.control_uuid = ac.control_uuid "
                "  AND cs.tenant_id = :tenant AND cs.active "
                "WHERE ac.use_case = :uc AND (:all = true OR ac.relevant = true) "
                "AND (:incl_oos = true OR ac.addressee IS NULL OR ac.addressee NOT IN "
                "    ('aufsichtsbefugnis','staat_eu','dritter','meta')) "
                "AND (:incl_suppressed = true OR cs.control_uuid IS NULL) "
                "GROUP BY 1 ORDER BY 2 DESC"
            ), p).fetchall()
        }
        rows = self.db.execute(
            _ATOM_LIST_SQL, {**p, "lim": lim, "off": off}).fetchall()
        controls = [
            {
                "id": str(r.control_uuid),
                "control_id": r.control_id,
                "title": r.title,
                "objective": r.objective,
                "severity": r.severity,
                "sub_topic": r.sub_topic,
                "canonical_obligation": r.canonical_obligation,
                "source_regulation": r.source_regulation,
                "source_article": r.source_article,
                "relevant": bool(r.relevant),
                "tier": tier_label(r.relevant),
                "source_type": source_type(r.license_rule),
                "addressee": r.addressee,
                "applicable": addressee_applicable(r.addressee),
                "is_gov": addressee_is_gov(r.addressee),
                "suppressed": bool(r.suppressed),
            }
            for r in rows
        ]
        return {
            "use_case": uc.key, "label": uc.label, "group": uc.group,
            "granularity": "atom", "tier": tier, "total": total,
            "core_count": core_count, "review_count": review_count,
            "out_of_scope_count": oos_count, "suppressed_count": suppressed_count,
            "include_out_of_scope": bool(include_out_of_scope),
            "include_suppressed": bool(include_suppressed),
            "limit": lim, "offset": off,
            "sub_topic": sub_topic, "subtopic_counts": facet, "controls": controls,
        }