Files
breakpilot-compliance/backend-compliance/compliance/services/use_case_controls.py
T
Benjamin Bönisch 72093e5501
CI / detect-changes (push) Successful in 17s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / build-sha-integrity (push) Successful in 13s
CI / validate-canonical-controls (push) Successful in 12s
CI / loc-budget (push) Successful in 25s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Has been skipped
CI / test-go (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-backend (push) Successful in 30s
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
fix(cra): Scanner-Findings vollstaendig mappen + assess-from-scanner-Latenz senken
Punkt 2 (Coverage): semgrep/gdpr-Findings ohne CWE blieben unmapped (~21%).
Der Mapper nutzt jetzt den scanner rule_id + gezielte Keywords (gdpr ->
Datenminimierung CRA-AI-17, path-traversal/prototype-pollution -> CRA-AI-20,
nginx-header/Docker-Hardening -> CRA-AI-1/4, insecure-websocket -> CRA-AI-15).
Reale Scanner-Daten: unmapped 19/92 -> 0/92 (Coverage 100%).

Punkt 3 (Latenz): enrich_findings_with_breadth lief ~6 Aggregat-Queries je
(use_case,sub_topic)-Paar, nutzte aber nur die Liste. Jetzt EINE batched Query
(breadth_controls_batch) fuer alle Paare + Prozess-Cache (TTL 1800s). macmini:
cold 0,23s / warm 0,000s. Prod-Root-Cause: atom_classification ohne
(use_case,sub_topic)-Index nach DB-Swap -> Index dem DB-Owner empfohlen.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-17 13:17:51 +02:00

425 lines
19 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# mypy: disable-error-code="no-any-return,arg-type"
"""Use-Case → Controls retrieval — the SHARED layer the document agents AND the
CRA finding-mapper query to pull the controls that belong to a topic.
Read-only over the existing ``mc_use_case_mappings`` seed (no schema change).
The seed is recall-oriented ("this MC comes from a law about the topic"); the
ranking here is a deterministic *precision proxy* — is_primary + mapping
confidence + cluster size, plus a keyword-relevance score derived from the
use-case registry. The LLM precision pass (Phase B) refines this later; the
ranking field stays the same so consumers do not change.
"""
from __future__ import annotations

import logging
from collections.abc import Iterable
from typing import Any, Optional

from sqlalchemy import bindparam, text
from sqlalchemy.orm import Session

from compliance.data.use_case_registry import REGISTRY, is_valid_use_case
from compliance.domain import NotFoundError
def relevance_score(
    title: Optional[str],
    objective: Optional[str],
    keyword_tokens: tuple[str, ...],
    is_primary: Optional[bool],
    confidence: Optional[float],
) -> float:
    """Deterministic precision proxy in [0, 1]. Pure → unit-testable.

    Combines the recall signals already on the mapping (primary flag, mapping
    confidence) with a content signal: how many of the use-case's registry
    keyword tokens appear in the control's own representative text. The content
    term is what separates "actually about this topic" from "merely from a
    related law" — the core of the precision problem.

    Weights: 0.5 for a primary mapping, 0.3 × confidence, 0.2 × keyword score
    (three or more keyword hits saturate the keyword score at 1.0).

    Args:
        title: Control title; ``None`` is treated as empty text.
        objective: Control objective; ``None`` is treated as empty text.
        keyword_tokens: Registry keywords of the use-case. Matched as
            case-insensitive substrings; empty tokens are ignored.
        is_primary: Whether the mapping is flagged primary (``None`` = no).
        confidence: Mapping confidence (``None`` = 0.0).

    Returns:
        The score rounded to three decimals, clamped to [0, 1].
    """
    haystack = f"{title or ''} {objective or ''}".lower()
    # The haystack is lowercased, so the tokens must be too — otherwise a
    # mixed-case registry keyword could never match.
    hits = sum(1 for kw in keyword_tokens if kw and kw.lower() in haystack)
    kw_score = min(hits / 3.0, 1.0)
    score = (
        (0.5 if is_primary else 0.0)
        + 0.3 * float(confidence or 0.0)
        + 0.2 * kw_score
    )
    # Clamp on both sides so an out-of-range confidence cannot break the
    # documented [0, 1] contract.
    return round(min(max(score, 0.0), 1.0), 3)
def tier_label(relevant: bool) -> str:
    """Map the relevance flag to a soft tier name.

    Validated obligations are 'core'; everything else is 'review' — still
    shown, but flagged for expert curation rather than filtered out. The
    'concrete vs. generic' boundary is genuinely fuzzy; hiding 'review' dropped
    ~25% of the corpus, much of it real (filter validation 2026-06-15).
    """
    if relevant:
        return "core"
    return "review"
def source_type(license_rule: Optional[int]) -> str:
    """Return the provenance tag for a control.

    'own_library' = self-written (license_rule 3, no commercial source).
    'derived' = lifted from a sourced document (license 1/2, the document is
    in source_regulation). Any value other than 3 — including ``None`` — is
    reported as 'derived'.
    """
    if license_rule == 3:
        return "own_library"
    return "derived"
# Addressee codes whose obligations fall on someone other than the customer.
_OUT_OF_SCOPE_ADDRESSEES = ("aufsichtsbefugnis", "staat_eu", "dritter", "meta")


def addressee_applicable(addressee: Optional[str]) -> bool:
    """Tell whether an obligation applies to a (potential) customer.

    It does unless the addressee is clearly someone else: a supervisory
    authority's power, a member state / EU institution, a foreign third party,
    or pure meta. ``None`` (not yet classified) counts as applicable — the
    conservative choice, so nothing is hidden by default.
    """
    if addressee is None:
        return True
    out_of_scope = addressee in _OUT_OF_SCOPE_ADDRESSEES
    return not out_of_scope
def addressee_is_gov(addressee: Optional[str]) -> bool:
    """Flag obligations whose obligor is a public body.

    This is an additive GOV hint (Kommune/Stadt = potential public-sector
    customer). The atom keeps its use-case; the result is only a tag.
    """
    gov_code = "oeffentliche_stelle"
    return addressee == gov_code
# Representative member (most severe, then lowest control_id) carries the
# human-readable title/objective — master_controls.canonical_name is only the
# merge token, so we surface a real member control per master.
#
# Binds: :uc, :primary_only, :lim, :off.
#
# The outer ORDER BY ends in mc.id on purpose: is_primary / confidence /
# total_controls tie frequently, and LIMIT/OFFSET paging over a non-unique
# order lets the database return tied rows in a different order per request,
# so pages could repeat or skip controls. (Unique as long as there is at most
# one mapping row per master control and use_case — presumably enforced by the
# seed; verify against the schema.)
_LIST_SQL = text("""
    SELECT mc.id, mc.master_control_id, mc.canonical_name, mc.total_controls,
           m.is_primary, m.confidence,
           (SELECT r.source_regulation FROM mc_regulations r
             WHERE r.master_control_uuid = mc.id AND r.is_primary LIMIT 1)
             AS primary_regulation,
           rep.title, rep.objective, rep.severity, rep.category
    FROM master_controls mc
    JOIN mc_use_case_mappings m
      ON m.master_control_uuid = mc.id AND m.use_case = :uc
    LEFT JOIN LATERAL (
        SELECT cc.title, cc.objective, cc.severity, cc.category
        FROM master_control_members mcm
        JOIN canonical_controls cc ON cc.id = mcm.control_uuid
        WHERE mcm.master_control_uuid = mc.id
        ORDER BY CASE cc.severity WHEN 'critical' THEN 0 WHEN 'high' THEN 1
                                  WHEN 'medium' THEN 2 ELSE 3 END, cc.control_id
        LIMIT 1
    ) rep ON true
    WHERE (:primary_only = false OR m.is_primary)
    ORDER BY m.is_primary DESC, m.confidence DESC NULLS LAST,
             mc.total_controls DESC, mc.id
    LIMIT :lim OFFSET :off
""")
# Atom-grain path: the one-time Haiku classification (atom_classification) gives
# per-atom relevance + sub-topic. Far more precise + organized than the master
# seed. Preferred whenever the use-case has been processed.
#
# Binds: :uc, :all, :incl_oos, :incl_suppressed, :tenant, :sub, :lim, :off.
#
# The ORDER BY ends in cc.control_id, cc.id so that controls sharing relevance,
# sub-topic, severity and title still have a fixed order — LIMIT/OFFSET paging
# over a non-unique order could otherwise repeat or skip rows between pages.
_ATOM_LIST_SQL = text("""
    SELECT ac.control_uuid, ac.sub_topic, ac.canonical_obligation, ac.relevant,
           ac.addressee, (cs.control_uuid IS NOT NULL) AS suppressed,
           cc.control_id, cc.title, cc.objective, cc.severity, cc.license_rule,
           cpl.source_regulation, cpl.source_article
    FROM atom_classification ac
    JOIN canonical_controls cc ON cc.id = ac.control_uuid
    LEFT JOIN control_suppressions cs
      ON cs.control_uuid = ac.control_uuid AND cs.tenant_id = :tenant AND cs.active
    LEFT JOIN LATERAL (
        SELECT cpl.source_regulation, cpl.source_article
        FROM control_parent_links cpl
        WHERE cpl.control_uuid = ac.control_uuid LIMIT 1
    ) cpl ON true
    WHERE ac.use_case = :uc AND (:all = true OR ac.relevant = true)
      AND (:incl_oos = true OR ac.addressee IS NULL
           OR ac.addressee NOT IN ('aufsichtsbefugnis','staat_eu','dritter','meta'))
      AND (:incl_suppressed = true OR cs.control_uuid IS NULL)
      AND (:sub IS NULL OR ac.sub_topic = :sub)
    ORDER BY ac.relevant DESC, ac.sub_topic NULLS LAST,
             CASE cc.severity WHEN 'critical' THEN 0 WHEN 'high' THEN 1
                              WHEN 'medium' THEN 2 ELSE 3 END,
             cc.title, cc.control_id, cc.id
    LIMIT :lim OFFSET :off
""")
# Exact `total` for pagination of the atom list: repeats the WHERE clause of
# _ATOM_LIST_SQL (tier + addressee + suppression + sub-topic filters) instead of
# doing arithmetic over the separate, overlapping per-filter counts.
# Binds: :uc, :all, :incl_oos, :incl_suppressed, :tenant, :sub.
# Keep the addressee list in sync with _OUT_OF_SCOPE_ADDRESSEES.
_ATOM_COUNT_SQL = text("""
SELECT count(*)
FROM atom_classification ac
LEFT JOIN control_suppressions cs
ON cs.control_uuid = ac.control_uuid AND cs.tenant_id = :tenant AND cs.active
WHERE ac.use_case = :uc AND (:all = true OR ac.relevant = true)
AND (:incl_oos = true OR ac.addressee IS NULL
OR ac.addressee NOT IN ('aufsichtsbefugnis','staat_eu','dritter','meta'))
AND (:incl_suppressed = true OR cs.control_uuid IS NULL)
AND (:sub IS NULL OR ac.sub_topic = :sub)
""")
# Breadth fast-path: top-N atom controls for MANY (use_case, sub_topic) pairs in
# ONE query. The CRA enrichment only needs this list — NOT the counts/facets/total
# that controls_for_use_case also computes (those are 5 extra aggregate scans per
# call, discarded by the caller). On prod (atom_classification currently lacks the
# (use_case, sub_topic) index after the DB swap) collapsing ~6 queries × N pairs
# into one scan is the difference between ~38s and a few seconds.
#
# Binds: :pairs (expanding list of (use_case, sub_topic) tuples), :per.
#
# Determinism: the window order ends in cc.control_id, cc.id so the top-N cut is
# stable when severity and title tie, and the outer ORDER BY fixes the order in
# which rows (and therefore each pair's list) reach the caller.
_ATOM_BREADTH_BATCH_SQL = text("""
    SELECT q.use_case, q.sub_topic, q.control_id, q.title, q.severity,
           q.source_regulation, q.source_article
    FROM (
        SELECT ac.use_case, ac.sub_topic, cc.control_id, cc.title, cc.severity,
               cpl.source_regulation, cpl.source_article,
               row_number() OVER (
                   PARTITION BY ac.use_case, ac.sub_topic
                   ORDER BY CASE cc.severity WHEN 'critical' THEN 0 WHEN 'high' THEN 1
                                             WHEN 'medium' THEN 2 ELSE 3 END,
                            cc.title, cc.control_id, cc.id
               ) AS rn
        FROM atom_classification ac
        JOIN canonical_controls cc ON cc.id = ac.control_uuid
        LEFT JOIN LATERAL (
            SELECT cpl.source_regulation, cpl.source_article
            FROM control_parent_links cpl
            WHERE cpl.control_uuid = ac.control_uuid LIMIT 1
        ) cpl ON true
        WHERE ac.relevant = true
          AND (ac.addressee IS NULL OR ac.addressee NOT IN
               ('aufsichtsbefugnis','staat_eu','dritter','meta'))
          AND (ac.use_case, ac.sub_topic) IN :pairs
    ) q
    WHERE q.rn <= :per
    ORDER BY q.use_case, q.sub_topic, q.rn
""").bindparams(bindparam("pairs", expanding=True))
# Process-level memo for the "does compliance.atom_classification exist?" probe,
# shared by all UseCaseControlsService instances in this process.
# "v" is None until the probe has produced a cached answer; a dict (instead of a
# bare global) lets methods update it without a `global` statement.
_ATOM_TABLE_EXISTS: dict[str, Optional[bool]] = {"v": None}
class UseCaseControlsService:
    """Topic → controls retrieval over the seeded use-case mappings.

    Read-only: every method here only issues SELECT statements through the
    injected session.
    """

    def __init__(self, db: Session) -> None:
        self.db = db

    def _atom_table_exists(self) -> bool:
        """Return whether ``compliance.atom_classification`` exists.

        A cached answer in ``_ATOM_TABLE_EXISTS`` is honored as-is. This method
        itself only caches a *positive* probe: a process that started before
        the table was created (migration, DB swap) must not stay pinned to
        "absent" until restart, while a present table needs no re-probing.
        """
        cached = _ATOM_TABLE_EXISTS["v"]
        if cached is not None:
            return bool(cached)
        exists = self.db.execute(
            text("SELECT to_regclass('compliance.atom_classification')")
        ).scalar() is not None
        if exists:
            _ATOM_TABLE_EXISTS["v"] = True
        return exists

    def breadth_controls_batch(
        self,
        pairs: Iterable[tuple[Optional[str], Optional[str]]],
        per: int = 3,
    ) -> dict[tuple[str, str], list[dict[str, Any]]]:
        """Top-``per`` atom controls for each (use_case, sub_topic) pair, in ONE
        query.

        Args:
            pairs: (use_case, sub_topic) pairs; duplicates and pairs with an
                empty/None member are dropped.
            per: Controls per pair, clamped to 1..50.

        Returns:
            ``{(use_case, sub_topic): [control dicts]}``. Best-effort: an empty
            dict on any database error or when the atom table is absent (the
            caller then leaves breadth empty — never breaks the assessment).
        """
        uniq = sorted({(uc, st) for uc, st in pairs if uc and st})
        if not uniq:
            return {}
        try:
            # The existence probe sits inside the guard as well, so a failing
            # probe honors the same best-effort contract as the main query.
            if not self._atom_table_exists():
                return {}
            rows = self.db.execute(
                _ATOM_BREADTH_BATCH_SQL,
                {"pairs": uniq, "per": min(max(int(per), 1), 50)},
            ).fetchall()
        except Exception:
            # Deliberate best-effort, but not silent: without a log line a
            # permanently failing query looks like "no breadth available".
            # NOTE(review): on PostgreSQL a failed statement leaves the
            # surrounding transaction aborted until it is rolled back — confirm
            # the caller recovers if it keeps using this session.
            logging.getLogger(__name__).warning(
                "breadth_controls_batch failed for %d pair(s); returning no breadth",
                len(uniq), exc_info=True,
            )
            return {}
        out: dict[tuple[str, str], list[dict[str, Any]]] = {}
        for r in rows:
            out.setdefault((r.use_case, r.sub_topic), []).append({
                "control_id": r.control_id, "title": r.title,
                "source_regulation": r.source_regulation,
                "source_article": r.source_article,
                "severity": r.severity, "use_case": r.use_case,
            })
        return out

    def list_use_cases(self) -> list[dict[str, Any]]:
        """Registry use-cases with live counts — atom-grain (Haiku classification)
        plus the legacy master seed. Backs the coverage overview so every topic is
        visible with how many obligations it actually carries.

        Only enabled registry entries are returned, sorted by relevant atom
        count and then by mapped master controls, both descending.
        """
        counts = {
            row[0]: int(row[1])
            for row in self.db.execute(text(
                "SELECT use_case, count(*) FROM mc_use_case_mappings "
                "GROUP BY use_case"
            )).fetchall()
        }
        # use_case -> (all atoms, relevant atoms); stays empty without the table.
        atom: dict[str, tuple[int, int]] = {}
        if self._atom_table_exists():
            atom = {
                row[0]: (int(row[1]), int(row[2]))
                for row in self.db.execute(text(
                    "SELECT use_case, count(*), count(*) FILTER (WHERE relevant) "
                    "FROM atom_classification GROUP BY use_case"
                )).fetchall()
            }
        out: list[dict[str, Any]] = []
        for uc in REGISTRY.values():
            if not uc.enabled:
                continue
            atom_total, atom_relevant = atom.get(uc.key, (0, 0))
            out.append({
                "key": uc.key,
                "label": uc.label,
                "group": uc.group,
                "regulations": list(uc.regulations),
                "verification_methods": list(uc.verification_methods),
                "mapped_controls": counts.get(uc.key, 0),
                "atom_total": atom_total,
                "atom_relevant": atom_relevant,
            })
        out.sort(key=lambda x: (x["atom_relevant"], x["mapped_controls"]),
                 reverse=True)
        return out

    def controls_for_use_case(
        self,
        use_case: str,
        primary_only: bool = False,
        limit: int = 50,
        offset: int = 0,
        sub_topic: Optional[str] = None,
        tier: str = "core",
        include_out_of_scope: bool = False,
        tenant_id: Optional[str] = None,
        include_suppressed: bool = False,
    ) -> dict[str, Any]:
        """Controls for ``use_case``. Prefers the atom-grain Haiku classification
        (precise + sub-topic-organized) when present; falls back to the
        master-grain seed otherwise.

        ``primary_only`` applies to the master-grain fallback only.

        ``limit`` is clamped to 1..200 and ``offset`` to >= 0.

        ``tier`` (atom-grain only): 'core' = validated obligations only (default,
        keeps the agent/CRA callers precise); 'all' = everything incl. the
        'review' tier (shown, flagged) so the human browse view loses nothing.
        Any other value is treated as 'core'.

        ``include_out_of_scope``: by default out-of-scope addressees (authority
        power / member-state / foreign / meta) are hidden (advisory, never
        deleted); set true to surface them.

        ``tenant_id`` (+ ``include_suppressed``): controls the tenant marked
        not-applicable are hidden by default (reversible, audited); None tenant
        (agent/CRA path) = no suppression filter.

        Raises:
            NotFoundError: ``use_case`` is not a valid registry key.
        """
        if not is_valid_use_case(use_case):
            raise NotFoundError(f"Unknown use_case '{use_case}'")
        uc = REGISTRY[use_case]
        lim = min(max(int(limit), 1), 200)
        off = max(int(offset), 0)
        tier = tier if tier in ("core", "all") else "core"

        if self._has_atom_grain(use_case):
            return self._atom_grain(uc, lim, off, sub_topic, tier,
                                    bool(include_out_of_scope),
                                    tenant_id, bool(include_suppressed))

        # --- master-grain fallback (recall seed) ---
        count_sql = (
            "SELECT count(*) FROM mc_use_case_mappings WHERE use_case = :uc"
            + (" AND is_primary" if primary_only else "")
        )
        total = self.db.execute(text(count_sql), {"uc": use_case}).scalar() or 0
        rows = self.db.execute(_LIST_SQL, {
            "uc": use_case, "primary_only": bool(primary_only), "lim": lim, "off": off,
        }).fetchall()
        controls = [
            {
                "id": str(r.id),
                "master_control_id": r.master_control_id,
                # No representative member → fall back to the merge token.
                "title": r.title or r.canonical_name,
                "objective": r.objective,
                "severity": r.severity,
                "category": r.category,
                "member_count": r.total_controls,
                "is_primary": bool(r.is_primary),
                "confidence": float(r.confidence) if r.confidence is not None else None,
                "primary_regulation": r.primary_regulation,
                "relevance": relevance_score(
                    r.title, r.objective, uc.keyword_tokens, r.is_primary, r.confidence,
                ),
            }
            for r in rows
        ]
        return {
            "use_case": uc.key, "label": uc.label, "group": uc.group,
            "granularity": "master", "total": int(total), "limit": lim, "offset": off,
            "primary_only": bool(primary_only), "controls": controls,
        }

    def _has_atom_grain(self, use_case: str) -> bool:
        """Return whether ``use_case`` has any row in ``atom_classification``.

        Uses EXISTS rather than count(*): only "at least one row" matters, so
        the scan can stop at the first match instead of counting them all.
        """
        if not self._atom_table_exists():
            return False
        return bool(self.db.execute(
            text("SELECT EXISTS (SELECT 1 FROM atom_classification "
                 "WHERE use_case = :uc)"),
            {"uc": use_case},
        ).scalar())

    def _atom_grain(
        self, uc, lim: int, off: int, sub_topic: Optional[str], tier: str = "core",
        include_out_of_scope: bool = False, tenant_id: Optional[str] = None,
        include_suppressed: bool = False,
    ) -> dict[str, Any]:
        """Build the atom-grain response for one registry use-case.

        ``uc`` is the registry entry (its ``key``, ``label`` and ``group`` are
        read); ``lim``/``off`` are the already-clamped page window. Runs the
        tier/out-of-scope counts, the tenant's suppression count (only when a
        tenant is given), the exact filtered total, the per-sub-topic facet and
        finally the page of controls.
        """
        all_flag = tier == "all"
        p = {"uc": uc.key, "all": all_flag, "incl_oos": include_out_of_scope,
             "incl_suppressed": include_suppressed, "tenant": tenant_id,
             "sub": sub_topic}

        # core / review / out-of-scope counts within the (optional) sub-topic.
        counts = self.db.execute(text(
            "SELECT count(*) FILTER (WHERE relevant), "
            "count(*) FILTER (WHERE NOT relevant), "
            "count(*) FILTER (WHERE addressee IN "
            " ('aufsichtsbefugnis','staat_eu','dritter','meta') "
            " AND (:all = true OR relevant = true)) "
            "FROM atom_classification "
            "WHERE use_case = :uc AND (:sub IS NULL OR sub_topic = :sub)"
        ), {"uc": uc.key, "all": all_flag, "sub": sub_topic}).first()
        core_count = int((counts[0] if counts else 0) or 0)
        review_count = int((counts[1] if counts else 0) or 0)
        oos_count = int((counts[2] if counts else 0) or 0)

        suppressed_count = 0
        if tenant_id:
            suppressed_count = int(self.db.execute(text(
                "SELECT count(*) FROM atom_classification ac "
                "JOIN control_suppressions cs ON cs.control_uuid = ac.control_uuid "
                " AND cs.tenant_id = :tenant AND cs.active "
                "WHERE ac.use_case = :uc AND (:all = true OR ac.relevant = true) "
                "AND (:sub IS NULL OR ac.sub_topic = :sub)"
            ), {"uc": uc.key, "all": all_flag, "tenant": tenant_id,
                "sub": sub_topic}).scalar() or 0)

        total = int(self.db.execute(_ATOM_COUNT_SQL, p).scalar() or 0)

        # Facet over ALL sub-topics: this query has no :sub filter, so the
        # sub-topic counts stay complete while one sub-topic is selected.
        facet = {
            row[0]: int(row[1])
            for row in self.db.execute(text(
                "SELECT COALESCE(ac.sub_topic, '(none)'), count(*) "
                "FROM atom_classification ac "
                "LEFT JOIN control_suppressions cs ON cs.control_uuid = ac.control_uuid "
                " AND cs.tenant_id = :tenant AND cs.active "
                "WHERE ac.use_case = :uc AND (:all = true OR ac.relevant = true) "
                "AND (:incl_oos = true OR ac.addressee IS NULL OR ac.addressee NOT IN "
                " ('aufsichtsbefugnis','staat_eu','dritter','meta')) "
                "AND (:incl_suppressed = true OR cs.control_uuid IS NULL) "
                "GROUP BY 1 ORDER BY 2 DESC"
            ), p).fetchall()
        }

        rows = self.db.execute(
            _ATOM_LIST_SQL, {**p, "lim": lim, "off": off}).fetchall()
        controls = [
            {
                "id": str(r.control_uuid),
                "control_id": r.control_id,
                "title": r.title,
                "objective": r.objective,
                "severity": r.severity,
                "sub_topic": r.sub_topic,
                "canonical_obligation": r.canonical_obligation,
                "source_regulation": r.source_regulation,
                "source_article": r.source_article,
                "relevant": bool(r.relevant),
                "tier": tier_label(r.relevant),
                "source_type": source_type(r.license_rule),
                "addressee": r.addressee,
                "applicable": addressee_applicable(r.addressee),
                "is_gov": addressee_is_gov(r.addressee),
                "suppressed": bool(r.suppressed),
            }
            for r in rows
        ]
        return {
            "use_case": uc.key, "label": uc.label, "group": uc.group,
            "granularity": "atom", "tier": tier, "total": total,
            "core_count": core_count, "review_count": review_count,
            "out_of_scope_count": oos_count, "suppressed_count": suppressed_count,
            "include_out_of_scope": bool(include_out_of_scope),
            "include_suppressed": bool(include_suppressed),
            "limit": lim, "offset": off,
            "sub_topic": sub_topic, "subtopic_counts": facet, "controls": controls,
        }