7aabfbe5b5
Geteilte Schicht für alle Surfaces (Workspace-Anwälte, Cyber-Risiko-Projekt,
Admin): ein Mandant markiert ein Control als "nicht anwendbar" → in seinen
Use-Case-Ansichten (und künftig Repo-Scans) ausgeblendet.
- Migration 156: compliance.control_suppressions (PK tenant_id+control_uuid),
reversibel (active + reverted_*), auditierbar (actor/reason/created_at).
[migration-approved]
- Service control_suppression: suppress/revert/list_suppressions +
suppressed_control_uuids (geteilter Filter).
- Routes: GET/POST /v1/controls/suppressions + POST .../{uuid}/revert (X-Tenant-ID).
- controls_for_use_case: optionaler X-Tenant-ID + include_suppressed; suppressed
per Default versteckt (nie gelöscht), suppressed_count, suppressed-Flag pro
Control. Agenten/CRA ohne Tenant unberührt.
- Tests: Request-Validierung + import-safety (E2E-Zyklus gegen macmini bewiesen).
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
355 lines
16 KiB
Python
355 lines
16 KiB
Python
# mypy: disable-error-code="no-any-return,arg-type"
|
|
"""Use-Case → Controls retrieval — the SHARED layer the document agents AND the
|
|
CRA finding-mapper query to pull the controls that belong to a topic.
|
|
|
|
Read-only over the existing ``mc_use_case_mappings`` seed (no schema change).
|
|
The seed is recall-oriented ("this MC comes from a law about the topic"); the
|
|
ranking here is a deterministic *precision proxy* — is_primary + mapping
|
|
confidence + cluster size, plus a keyword-relevance score derived from the
|
|
use-case registry. The LLM precision pass (Phase B) refines this later; the
|
|
ranking field stays the same so consumers do not change.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from typing import Any, Optional
|
|
|
|
from sqlalchemy import text
|
|
from sqlalchemy.orm import Session
|
|
|
|
from compliance.data.use_case_registry import REGISTRY, is_valid_use_case
|
|
from compliance.domain import NotFoundError
|
|
|
|
|
|
def relevance_score(
    title: Optional[str],
    objective: Optional[str],
    keyword_tokens: tuple[str, ...],
    is_primary: Optional[bool],
    confidence: Optional[float],
) -> float:
    """Deterministic precision proxy in [0, 1]. Pure → unit-testable.

    Combines the recall signals already on the mapping (primary flag, mapping
    confidence) with a content signal: how many of the use-case's registry
    keyword tokens appear in the control's own representative text. The content
    term is what separates "actually about this topic" from "merely from a
    related law" — the core of the precision problem.

    Weights: 0.5 for the primary flag, 0.3 * confidence, 0.2 * keyword score,
    where the keyword score saturates at three matching tokens.

    Args:
        title: Representative control title; ``None`` is treated as empty.
        objective: Representative control objective; ``None`` is treated as empty.
        keyword_tokens: Registry keyword tokens for the use-case. Matching is
            a case-insensitive substring test; empty tokens are ignored.
        is_primary: Whether the mapping is flagged primary (``None`` = not).
        confidence: Mapping confidence; ``None`` counts as 0.0.

    Returns:
        The score rounded to three decimals, clamped to [0.0, 1.0].
    """
    haystack = f"{title or ''} {objective or ''}".lower()
    # The haystack is lower-cased, so the tokens must be too — otherwise a
    # token containing an upper-case character could never match.
    hits = sum(1 for kw in keyword_tokens if kw and kw.lower() in haystack)
    kw_score = min(hits / 3.0, 1.0) if keyword_tokens else 0.0
    score = (
        (0.5 if is_primary else 0.0)
        + 0.3 * float(confidence or 0.0)
        + 0.2 * kw_score
    )
    # Clamp on both sides: an out-of-range confidence must not push the score
    # outside the documented [0, 1] interval.
    return round(max(0.0, min(score, 1.0)), 3)
|
|
|
|
|
|
def tier_label(relevant: bool) -> str:
    """Map the per-atom relevance flag onto a soft tier name.

    Validated obligations become 'core'; everything else becomes 'review' —
    still shown, but flagged for expert curation rather than filtered out. The
    'concrete vs. generic' boundary is genuinely fuzzy; hiding 'review' dropped
    ~25% of the corpus, much of it real (filter validation 2026-06-15).
    """
    if relevant:
        return "core"
    return "review"
|
|
|
|
|
|
def source_type(license_rule: Optional[int]) -> str:
    """Classify a control's provenance from its license rule.

    'own_library' = self-written (license_rule 3, no commercial source);
    'derived' = lifted from a sourced document (license 1/2, the document is
    in source_regulation). Any value other than 3, including ``None``, is
    reported as 'derived'.
    """
    if license_rule == 3:
        return "own_library"
    return "derived"
|
|
|
|
|
|
_OUT_OF_SCOPE_ADDRESSEES = ("aufsichtsbefugnis", "staat_eu", "dritter", "meta")
|
|
|
|
|
|
def addressee_applicable(addressee: Optional[str]) -> bool:
|
|
"""An obligation is applicable to a (potential) customer unless its addressee
|
|
is clearly someone else: a supervisory authority's power, a member state / EU
|
|
institution, a foreign third party, or pure meta. NULL = not yet classified =
|
|
treated as applicable (conservative — nothing hidden by default)."""
|
|
return addressee not in _OUT_OF_SCOPE_ADDRESSEES
|
|
|
|
|
|
def addressee_is_gov(addressee: Optional[str]) -> bool:
    """Flag obligations whose obligor is a public body.

    This is an additive GOV hint (Kommune/Stadt = potential public-sector
    customer). The atom keeps its use-case; the result is only a tag.
    """
    gov_addressee = "oeffentliche_stelle"
    return addressee == gov_addressee
|
|
|
|
|
|
# Master-grain listing for one use-case (:uc), optionally restricted to primary
# mappings (:primary_only) and paginated with :lim / :off.
#
# Representative member (most severe, then lowest control_id) carries the
# human-readable title/objective — master_controls.canonical_name is only the
# merge token, so we surface a real member control per master.
#
# The outer ORDER BY ends in mc.id: LIMIT/OFFSET only yields stable,
# non-overlapping pages when the sort key is unique, and primary flag,
# confidence and cluster size tie frequently.
_LIST_SQL = text("""
    SELECT mc.id, mc.master_control_id, mc.canonical_name, mc.total_controls,
           m.is_primary, m.confidence,
           (SELECT r.source_regulation FROM mc_regulations r
             WHERE r.master_control_uuid = mc.id AND r.is_primary LIMIT 1)
             AS primary_regulation,
           rep.title, rep.objective, rep.severity, rep.category
    FROM master_controls mc
    JOIN mc_use_case_mappings m
      ON m.master_control_uuid = mc.id AND m.use_case = :uc
    LEFT JOIN LATERAL (
        SELECT cc.title, cc.objective, cc.severity, cc.category
        FROM master_control_members mcm
        JOIN canonical_controls cc ON cc.id = mcm.control_uuid
        WHERE mcm.master_control_uuid = mc.id
        ORDER BY CASE cc.severity WHEN 'critical' THEN 0 WHEN 'high' THEN 1
                                  WHEN 'medium' THEN 2 ELSE 3 END, cc.control_id
        LIMIT 1
    ) rep ON true
    WHERE (:primary_only = false OR m.is_primary)
    ORDER BY m.is_primary DESC, m.confidence DESC NULLS LAST,
             mc.total_controls DESC, mc.id
    LIMIT :lim OFFSET :off
""")
|
|
|
|
|
|
# Atom-grain path: the one-time Haiku classification (atom_classification) gives
# per-atom relevance + sub-topic. Far more precise + organized than the master
# seed. Preferred whenever the use-case has been processed.
#
# Bind parameters: :uc, :all (include the non-relevant 'review' tier),
# :incl_oos (include out-of-scope addressees), :tenant + :incl_suppressed
# (tenant suppression filter; a NULL tenant never matches a suppression row),
# :sub (optional sub-topic filter), :lim / :off (pagination).
#
# Determinism: the outer ORDER BY ends in ac.control_uuid so LIMIT/OFFSET pages
# neither overlap nor skip rows when relevance, sub-topic, severity and title
# all tie. The parent-link lateral is ordered as well, so the source shown for
# a control is the same on every request.
_ATOM_LIST_SQL = text("""
    SELECT ac.control_uuid, ac.sub_topic, ac.canonical_obligation, ac.relevant,
           ac.addressee, (cs.control_uuid IS NOT NULL) AS suppressed,
           cc.control_id, cc.title, cc.objective, cc.severity, cc.license_rule,
           cpl.source_regulation, cpl.source_article
    FROM atom_classification ac
    JOIN canonical_controls cc ON cc.id = ac.control_uuid
    LEFT JOIN control_suppressions cs
      ON cs.control_uuid = ac.control_uuid AND cs.tenant_id = :tenant AND cs.active
    LEFT JOIN LATERAL (
        SELECT cpl.source_regulation, cpl.source_article
        FROM control_parent_links cpl
        WHERE cpl.control_uuid = ac.control_uuid
        ORDER BY cpl.source_regulation, cpl.source_article
        LIMIT 1
    ) cpl ON true
    WHERE ac.use_case = :uc AND (:all = true OR ac.relevant = true)
      AND (:incl_oos = true OR ac.addressee IS NULL
           OR ac.addressee NOT IN ('aufsichtsbefugnis','staat_eu','dritter','meta'))
      AND (:incl_suppressed = true OR cs.control_uuid IS NULL)
      AND (:sub IS NULL OR ac.sub_topic = :sub)
    ORDER BY ac.relevant DESC, ac.sub_topic NULLS LAST,
             CASE cc.severity WHEN 'critical' THEN 0 WHEN 'high' THEN 1
                              WHEN 'medium' THEN 2 ELSE 3 END, cc.title,
             ac.control_uuid
    LIMIT :lim OFFSET :off
""")
|
|
|
|
|
|
# Row count for the atom-grain listing. It applies the same WHERE as the list
# (tier + addressee + suppression + sub-topic), so `total` for pagination is
# exact instead of being derived by arithmetic over overlapping filters.
# NOTE(review): unlike the list query this does not join canonical_controls;
# the two agree only if every classified atom has a canonical control — confirm.
_ATOM_COUNT_SQL = text("""
    SELECT count(*)
    FROM atom_classification ac
    LEFT JOIN control_suppressions cs
      ON cs.control_uuid = ac.control_uuid AND cs.tenant_id = :tenant AND cs.active
    WHERE ac.use_case = :uc AND (:all = true OR ac.relevant = true)
      AND (:incl_oos = true OR ac.addressee IS NULL
           OR ac.addressee NOT IN ('aufsichtsbefugnis','staat_eu','dritter','meta'))
      AND (:incl_suppressed = true OR cs.control_uuid IS NULL)
      AND (:sub IS NULL OR ac.sub_topic = :sub)
""")
|
|
|
|
|
|
class UseCaseControlsService:
    """Topic → controls retrieval over the seeded use-case mappings."""

    def __init__(self, db: Session) -> None:
        """Keep the SQLAlchemy session every query of this service runs on."""
        self.db = db

    def _atom_table_exists(self) -> bool:
        """Return True when ``compliance.atom_classification`` exists.

        ``to_regclass`` yields NULL for an unknown relation, so the atom-grain
        queries can be skipped on databases that do not have the table.
        """
        return self.db.execute(
            text("SELECT to_regclass('compliance.atom_classification')")
        ).scalar() is not None

    def list_use_cases(self) -> list[dict[str, Any]]:
        """Registry use-cases with live counts — atom-grain (Haiku classification)
        plus the legacy master seed. Backs the coverage overview so every topic is
        visible with how many obligations it actually carries.

        Returns:
            One dict per enabled registry use-case, sorted descending by
            (atom_relevant, mapped_controls). Use-cases without rows report 0.
        """
        counts = {
            row[0]: int(row[1])
            for row in self.db.execute(text(
                "SELECT use_case, count(*) FROM mc_use_case_mappings "
                "GROUP BY use_case"
            )).fetchall()
        }
        # use_case -> (total atoms, relevant atoms); stays empty without the table.
        atom: dict[str, tuple[int, int]] = {}
        if self._atom_table_exists():
            atom = {
                row[0]: (int(row[1]), int(row[2]))
                for row in self.db.execute(text(
                    "SELECT use_case, count(*), count(*) FILTER (WHERE relevant) "
                    "FROM atom_classification GROUP BY use_case"
                )).fetchall()
            }
        out: list[dict[str, Any]] = []
        for uc in REGISTRY.values():
            if not uc.enabled:
                continue
            atom_total, atom_relevant = atom.get(uc.key, (0, 0))
            out.append({
                "key": uc.key,
                "label": uc.label,
                "group": uc.group,
                "regulations": list(uc.regulations),
                "verification_methods": list(uc.verification_methods),
                "mapped_controls": counts.get(uc.key, 0),
                "atom_total": atom_total,
                "atom_relevant": atom_relevant,
            })
        out.sort(key=lambda x: (x["atom_relevant"], x["mapped_controls"]),
                 reverse=True)
        return out

    def controls_for_use_case(
        self,
        use_case: str,
        primary_only: bool = False,
        limit: int = 50,
        offset: int = 0,
        sub_topic: Optional[str] = None,
        tier: str = "core",
        include_out_of_scope: bool = False,
        tenant_id: Optional[str] = None,
        include_suppressed: bool = False,
    ) -> dict[str, Any]:
        """Controls for ``use_case``. Prefers the atom-grain Haiku classification
        (precise + sub-topic-organized) when present; falls back to the
        master-grain seed otherwise.

        ``primary_only`` is applied by the master-grain fallback only.
        ``limit`` is clamped to 1..200 and ``offset`` to >= 0.
        ``tier`` (atom-grain only): 'core' = validated obligations only (default,
        keeps the agent/CRA callers precise); 'all' = everything incl. the
        'review' tier (shown, flagged) so the human browse view loses nothing.
        Any other value is treated as 'core'.
        ``include_out_of_scope``: by default out-of-scope addressees (authority
        power / member-state / foreign / meta) are hidden (advisory, never
        deleted); set true to surface them.
        ``tenant_id`` (+ ``include_suppressed``): controls the tenant marked
        not-applicable are hidden by default (reversible, audited); None tenant
        (agent/CRA path) = no suppression filter.

        Raises:
            NotFoundError: ``use_case`` is not a valid registry key.
        """
        if not is_valid_use_case(use_case):
            raise NotFoundError(f"Unknown use_case '{use_case}'")
        uc = REGISTRY[use_case]
        lim = min(max(int(limit), 1), 200)
        off = max(int(offset), 0)
        tier = tier if tier in ("core", "all") else "core"

        if self._has_atom_grain(use_case):
            return self._atom_grain(uc, lim, off, sub_topic, tier,
                                    bool(include_out_of_scope),
                                    tenant_id, bool(include_suppressed))

        # --- master-grain fallback (recall seed) ---
        count_sql = (
            "SELECT count(*) FROM mc_use_case_mappings WHERE use_case = :uc"
            + (" AND is_primary" if primary_only else "")
        )
        total = self.db.execute(text(count_sql), {"uc": use_case}).scalar() or 0
        rows = self.db.execute(_LIST_SQL, {
            "uc": use_case, "primary_only": bool(primary_only), "lim": lim, "off": off,
        }).fetchall()
        controls = [
            {
                "id": str(r.id),
                "master_control_id": r.master_control_id,
                # A master without a representative member falls back to the
                # merge token so the title is not empty.
                "title": r.title or r.canonical_name,
                "objective": r.objective,
                "severity": r.severity,
                "category": r.category,
                "member_count": r.total_controls,
                "is_primary": bool(r.is_primary),
                "confidence": float(r.confidence) if r.confidence is not None else None,
                "primary_regulation": r.primary_regulation,
                "relevance": relevance_score(
                    r.title, r.objective, uc.keyword_tokens, r.is_primary, r.confidence,
                ),
            }
            for r in rows
        ]
        return {
            "use_case": uc.key, "label": uc.label, "group": uc.group,
            "granularity": "master", "total": int(total), "limit": lim, "offset": off,
            "primary_only": bool(primary_only), "controls": controls,
        }

    def _has_atom_grain(self, use_case: str) -> bool:
        """Return True when ``use_case`` has at least one atom classification row.

        Uses EXISTS rather than count(*): only non-emptiness matters, so the
        database can stop at the first matching row.
        """
        if not self._atom_table_exists():
            return False
        return bool(self.db.execute(
            text("SELECT EXISTS ("
                 "SELECT 1 FROM atom_classification WHERE use_case = :uc)"),
            {"uc": use_case},
        ).scalar())

    def _atom_grain(
        self, uc: Any, lim: int, off: int, sub_topic: Optional[str], tier: str = "core",
        include_out_of_scope: bool = False, tenant_id: Optional[str] = None,
        include_suppressed: bool = False,
    ) -> dict[str, Any]:
        """Build the atom-grain response for one registry use-case.

        Args:
            uc: Registry entry; ``key``, ``label`` and ``group`` are read.
            lim: Page size, already clamped by the caller.
            off: Page offset, already clamped by the caller.
            sub_topic: Optional sub-topic filter for the list and the counts.
            tier: 'all' includes non-relevant atoms; anything else = relevant only.
            include_out_of_scope: Keep atoms with an out-of-scope addressee.
            tenant_id: Tenant whose active suppressions are applied.
            include_suppressed: Keep atoms the tenant has suppressed.

        Returns:
            The page of controls plus ``total`` (same filters as the list),
            per-tier / out-of-scope / suppressed counts, and a per-sub-topic
            facet. The facet is not narrowed by ``sub_topic``.
        """
        all_flag = tier == "all"
        p = {"uc": uc.key, "all": all_flag, "incl_oos": include_out_of_scope,
             "incl_suppressed": include_suppressed, "tenant": tenant_id,
             "sub": sub_topic}
        # Tier counts ignore the tier filter; the out-of-scope count honours it.
        counts = self.db.execute(text(
            "SELECT count(*) FILTER (WHERE relevant), "
            "count(*) FILTER (WHERE NOT relevant), "
            "count(*) FILTER (WHERE addressee IN "
            " ('aufsichtsbefugnis','staat_eu','dritter','meta') "
            " AND (:all = true OR relevant = true)) "
            "FROM atom_classification "
            "WHERE use_case = :uc AND (:sub IS NULL OR sub_topic = :sub)"
        ), {"uc": uc.key, "all": all_flag, "sub": sub_topic}).first()
        core_count = int((counts[0] if counts else 0) or 0)
        review_count = int((counts[1] if counts else 0) or 0)
        oos_count = int((counts[2] if counts else 0) or 0)
        # Without a tenant there is no suppression, so the count stays 0.
        suppressed_count = 0
        if tenant_id:
            suppressed_count = int(self.db.execute(text(
                "SELECT count(*) FROM atom_classification ac "
                "JOIN control_suppressions cs ON cs.control_uuid = ac.control_uuid "
                " AND cs.tenant_id = :tenant AND cs.active "
                "WHERE ac.use_case = :uc AND (:all = true OR ac.relevant = true) "
                "AND (:sub IS NULL OR ac.sub_topic = :sub)"
            ), {"uc": uc.key, "all": all_flag, "tenant": tenant_id,
                "sub": sub_topic}).scalar() or 0)
        total = int(self.db.execute(_ATOM_COUNT_SQL, p).scalar() or 0)
        # Sub-topic facet: same tier / addressee / suppression filters as the
        # list, but deliberately no :sub filter in the statement.
        facet = {
            row[0]: int(row[1])
            for row in self.db.execute(text(
                "SELECT COALESCE(ac.sub_topic, '(none)'), count(*) "
                "FROM atom_classification ac "
                "LEFT JOIN control_suppressions cs ON cs.control_uuid = ac.control_uuid "
                " AND cs.tenant_id = :tenant AND cs.active "
                "WHERE ac.use_case = :uc AND (:all = true OR ac.relevant = true) "
                "AND (:incl_oos = true OR ac.addressee IS NULL OR ac.addressee NOT IN "
                " ('aufsichtsbefugnis','staat_eu','dritter','meta')) "
                "AND (:incl_suppressed = true OR cs.control_uuid IS NULL) "
                "GROUP BY 1 ORDER BY 2 DESC"
            ), p).fetchall()
        }
        rows = self.db.execute(
            _ATOM_LIST_SQL, {**p, "lim": lim, "off": off}).fetchall()
        controls = [
            {
                "id": str(r.control_uuid),
                "control_id": r.control_id,
                "title": r.title,
                "objective": r.objective,
                "severity": r.severity,
                "sub_topic": r.sub_topic,
                "canonical_obligation": r.canonical_obligation,
                "source_regulation": r.source_regulation,
                "source_article": r.source_article,
                "relevant": bool(r.relevant),
                "tier": tier_label(r.relevant),
                "source_type": source_type(r.license_rule),
                "addressee": r.addressee,
                "applicable": addressee_applicable(r.addressee),
                "is_gov": addressee_is_gov(r.addressee),
                "suppressed": bool(r.suppressed),
            }
            for r in rows
        ]
        return {
            "use_case": uc.key, "label": uc.label, "group": uc.group,
            "granularity": "atom", "tier": tier, "total": total,
            "core_count": core_count, "review_count": review_count,
            "out_of_scope_count": oos_count, "suppressed_count": suppressed_count,
            "include_out_of_scope": bool(include_out_of_scope),
            "include_suppressed": bool(include_suppressed),
            "limit": lim, "offset": off,
            "sub_topic": sub_topic, "subtopic_counts": facet, "controls": controls,
        }
|