Files
breakpilot-compliance/backend-compliance/compliance/services/use_case_controls.py
T
Benjamin Admin 00f304fed9
CI / detect-changes (push) Successful in 14s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / build-sha-integrity (push) Successful in 11s
CI / validate-canonical-controls (push) Failing after 5s
CI / loc-budget (push) Successful in 22s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / test-go (push) Successful in 1m11s
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Successful in 3m41s
CI / iace-gt-coverage (push) Failing after 5s
CI / test-python-backend (push) Failing after 5s
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
feat(controls): 5 neue Use Cases + Machinery-Fix + Korpus-/Lizenz-Übersicht
- Registry: arbeitsrecht, gesellschaftsrecht, insolvenzrecht, csrd, bafin_it
  + Mapper-Regeln für zuvor ungemappte Quell-Gesetze, Machinery-Guide 2006/42
  -> maschinen. Jetzt 43 Use Cases (Achse 1 / license 1+2 vollständig).
- corpus_overview Service + GET /v1/controls/corpus: Quell-Dokumente mit
  Lizenz-Tier + atom-Count + Use-Case + kuratiertem Lizenz-Katalog.
- list_use_cases trägt atom_classification-Counts (atom_total/atom_relevant).
- Frontend /sdk/coverage: Use-Case-Übersicht + Korpus-Dokumente + Lizenz-Katalog.
- Tests: registry-Mappings (neue Domänen), corpus tier-labels, coverage-helpers.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-14 21:49:22 +02:00

239 lines
9.9 KiB
Python

# mypy: disable-error-code="no-any-return,arg-type"
"""Use-Case → Controls retrieval — the SHARED layer the document agents AND the
CRA finding-mapper query to pull the controls that belong to a topic.
Read-only over the existing ``mc_use_case_mappings`` seed (no schema change).
The seed is recall-oriented ("this MC comes from a law about the topic"); the
ranking here is a deterministic *precision proxy* — is_primary + mapping
confidence + cluster size, plus a keyword-relevance score derived from the
use-case registry. The LLM precision pass (Phase B) refines this later; the
ranking field stays the same so consumers do not change.
"""
from __future__ import annotations
from typing import Any, Optional
from sqlalchemy import text
from sqlalchemy.orm import Session
from compliance.data.use_case_registry import REGISTRY, is_valid_use_case
from compliance.domain import NotFoundError
def relevance_score(
    title: Optional[str],
    objective: Optional[str],
    keyword_tokens: tuple[str, ...],
    is_primary: Optional[bool],
    confidence: Optional[float],
) -> float:
    """Return a deterministic precision proxy in [0, 1]. Pure → unit-testable.

    Combines the recall signals already on the mapping (primary flag, mapping
    confidence) with a content signal: how many of the use-case's registry
    keyword tokens appear in the control's own representative text. The content
    term is what separates "actually about this topic" from "merely from a
    related law" — the core of the precision problem.

    Weights: 0.5 for a primary mapping, 0.3 * confidence, 0.2 * keyword score.
    The keyword score saturates at three matching tokens.

    Args:
        title: Representative control title; ``None`` is treated as empty.
        objective: Representative control objective; ``None`` is treated as empty.
        keyword_tokens: Registry keyword tokens of the use-case. Matching is a
            case-insensitive substring test; empty tokens are ignored.
        is_primary: Whether the mapping is flagged primary (``None`` → False).
        confidence: Mapping confidence (``None`` → 0.0).

    Returns:
        The score rounded to three decimals, clamped to the closed range [0, 1].
    """
    haystack = f"{title or ''} {objective or ''}".lower()
    # The haystack is lowercased, so tokens must be lowercased too; otherwise a
    # token containing an upper-case letter could never match.
    hits = sum(
        1 for kw in (keyword_tokens or ()) if kw and kw.lower() in haystack
    )
    kw_score = min(hits / 3.0, 1.0)
    score = (
        (0.5 if is_primary else 0.0)
        + 0.3 * float(confidence or 0.0)
        + 0.2 * kw_score
    )
    # Clamp on both sides: an out-of-range confidence must not push the result
    # outside the documented [0, 1] contract.
    return round(min(max(score, 0.0), 1.0), 3)
# Representative member (most severe, then lowest control_id) carries the
# human-readable title/objective — master_controls.canonical_name is only the
# merge token, so we surface a real member control per master.
#
# The outer ORDER BY ends in mc.id: the leading sort keys (is_primary,
# confidence, total_controls) are not unique, and LIMIT/OFFSET paging over a
# non-unique order may repeat or skip rows between pages. The scalar
# primary_regulation subquery is ordered for the same reason — LIMIT 1 without
# ORDER BY returns an arbitrary row when several primary regulations exist.
_LIST_SQL = text("""
    SELECT mc.id, mc.master_control_id, mc.canonical_name, mc.total_controls,
           m.is_primary, m.confidence,
           (SELECT r.source_regulation FROM mc_regulations r
             WHERE r.master_control_uuid = mc.id AND r.is_primary
             ORDER BY r.source_regulation
             LIMIT 1)
             AS primary_regulation,
           rep.title, rep.objective, rep.severity, rep.category
      FROM master_controls mc
      JOIN mc_use_case_mappings m
        ON m.master_control_uuid = mc.id AND m.use_case = :uc
      LEFT JOIN LATERAL (
          SELECT cc.title, cc.objective, cc.severity, cc.category
            FROM master_control_members mcm
            JOIN canonical_controls cc ON cc.id = mcm.control_uuid
           WHERE mcm.master_control_uuid = mc.id
           ORDER BY CASE cc.severity WHEN 'critical' THEN 0 WHEN 'high' THEN 1
                    WHEN 'medium' THEN 2 ELSE 3 END, cc.control_id
           LIMIT 1
      ) rep ON true
     WHERE (:primary_only = false OR m.is_primary)
     ORDER BY m.is_primary DESC, m.confidence DESC NULLS LAST,
              mc.total_controls DESC, mc.id
     LIMIT :lim OFFSET :off
""")
# Atom-grain path: the one-time Haiku classification (atom_classification) gives
# per-atom relevance + sub-topic. Far more precise + organized than the master
# seed. Preferred whenever the use-case has been processed.
#
# The ORDER BY ends in cc.id because sub_topic/severity/title are not unique
# and LIMIT/OFFSET paging needs a total order to stay stable across pages.
# The source_regulation subquery is ordered (ascending, which in PostgreSQL
# sorts NULLs last) so a control with several parent links always reports the
# same regulation and prefers a non-NULL one.
_ATOM_LIST_SQL = text("""
    SELECT ac.control_uuid, ac.sub_topic, ac.canonical_obligation,
           cc.control_id, cc.title, cc.objective, cc.severity,
           (SELECT cpl.source_regulation FROM control_parent_links cpl
             WHERE cpl.control_uuid = ac.control_uuid
             ORDER BY cpl.source_regulation
             LIMIT 1) AS source_regulation
      FROM atom_classification ac
      JOIN canonical_controls cc ON cc.id = ac.control_uuid
     WHERE ac.use_case = :uc AND ac.relevant = true
       AND (:sub IS NULL OR ac.sub_topic = :sub)
     ORDER BY ac.sub_topic NULLS LAST,
              CASE cc.severity WHEN 'critical' THEN 0 WHEN 'high' THEN 1
                   WHEN 'medium' THEN 2 ELSE 3 END, cc.title, cc.id
     LIMIT :lim OFFSET :off
""")
class UseCaseControlsService:
    """Topic → controls retrieval over the seeded use-case mappings.

    Read-only: every method only issues SELECT statements through the
    SQLAlchemy session handed to the constructor.
    """

    def __init__(self, db: Session) -> None:
        """Store the session used for all queries."""
        self.db = db

    def list_use_cases(self) -> list[dict[str, Any]]:
        """Registry use-cases with live counts — atom-grain (Haiku classification)
        plus the legacy master seed. Backs the coverage overview so every topic is
        visible with how many obligations it actually carries.

        Returns:
            One dict per *enabled* registry entry with the keys ``key``,
            ``label``, ``group``, ``regulations``, ``verification_methods``,
            ``mapped_controls``, ``atom_total`` and ``atom_relevant``, sorted
            descending by ``(atom_relevant, mapped_controls)``. Counts default
            to 0 for use-cases with no rows.
        """
        mapped = {
            row[0]: int(row[1])
            for row in self.db.execute(text(
                "SELECT use_case, count(*) FROM mc_use_case_mappings "
                "GROUP BY use_case"
            )).fetchall()
        }
        # use_case -> (total atoms, relevant atoms); stays empty when the
        # classification table has not been created.
        atom: dict[str, tuple[int, int]] = {}
        if self._atom_table_exists():
            atom = {
                row[0]: (int(row[1]), int(row[2]))
                for row in self.db.execute(text(
                    "SELECT use_case, count(*), count(*) FILTER (WHERE relevant) "
                    "FROM atom_classification GROUP BY use_case"
                )).fetchall()
            }
        out: list[dict[str, Any]] = []
        for uc in REGISTRY.values():
            if not uc.enabled:
                continue
            atom_total, atom_relevant = atom.get(uc.key, (0, 0))
            out.append({
                "key": uc.key,
                "label": uc.label,
                "group": uc.group,
                "regulations": list(uc.regulations),
                "verification_methods": list(uc.verification_methods),
                "mapped_controls": mapped.get(uc.key, 0),
                "atom_total": atom_total,
                "atom_relevant": atom_relevant,
            })
        out.sort(
            key=lambda x: (x["atom_relevant"], x["mapped_controls"]),
            reverse=True,
        )
        return out

    def controls_for_use_case(
        self,
        use_case: str,
        primary_only: bool = False,
        limit: int = 50,
        offset: int = 0,
        sub_topic: Optional[str] = None,
    ) -> dict[str, Any]:
        """Controls for ``use_case``. Prefers the atom-grain Haiku classification
        (precise + sub-topic-organized) when present; falls back to the
        master-grain seed otherwise.

        Args:
            use_case: Registry key of the topic.
            primary_only: Restrict to primary mappings. Only applied on the
                master-grain fallback; the atom-grain path does not use it.
            limit: Page size, clamped to the range 1..200.
            offset: Page offset, clamped to >= 0.
            sub_topic: Optional sub-topic filter. Only applied on the
                atom-grain path; the master-grain fallback does not use it.

        Returns:
            A dict whose ``granularity`` is ``"atom"`` or ``"master"`` and whose
            ``controls`` list holds the page; ``total`` is the unpaged count.

        Raises:
            NotFoundError: If ``use_case`` is not a valid registry key.
        """
        if not is_valid_use_case(use_case):
            raise NotFoundError(f"Unknown use_case '{use_case}'")
        uc = REGISTRY[use_case]
        lim = min(max(int(limit), 1), 200)
        off = max(int(offset), 0)
        if self._has_atom_grain(use_case):
            return self._atom_grain(uc, lim, off, sub_topic)

        # --- master-grain fallback (recall seed) ---
        # Only a fixed SQL fragment is appended here; the use-case value
        # itself is always passed as a bound parameter.
        count_sql = (
            "SELECT count(*) FROM mc_use_case_mappings WHERE use_case = :uc"
            + (" AND is_primary" if primary_only else "")
        )
        total = self.db.execute(text(count_sql), {"uc": use_case}).scalar() or 0
        rows = self.db.execute(_LIST_SQL, {
            "uc": use_case, "primary_only": bool(primary_only),
            "lim": lim, "off": off,
        }).fetchall()
        controls = [
            {
                "id": str(r.id),
                "master_control_id": r.master_control_id,
                # Fall back to the merge token when no representative member
                # title is available.
                "title": r.title or r.canonical_name,
                "objective": r.objective,
                "severity": r.severity,
                "category": r.category,
                "member_count": r.total_controls,
                "is_primary": bool(r.is_primary),
                "confidence": (
                    float(r.confidence) if r.confidence is not None else None
                ),
                "primary_regulation": r.primary_regulation,
                "relevance": relevance_score(
                    r.title, r.objective, uc.keyword_tokens,
                    r.is_primary, r.confidence,
                ),
            }
            for r in rows
        ]
        return {
            "use_case": uc.key, "label": uc.label, "group": uc.group,
            "granularity": "master", "total": int(total),
            "limit": lim, "offset": off,
            "primary_only": bool(primary_only), "controls": controls,
        }

    def _atom_table_exists(self) -> bool:
        """Return True when ``compliance.atom_classification`` resolves to a relation."""
        # to_regclass yields NULL instead of raising when the name is unknown.
        return self.db.execute(
            text("SELECT to_regclass('compliance.atom_classification')")
        ).scalar() is not None

    def _has_atom_grain(self, use_case: str) -> bool:
        """Return True when at least one classification row exists for ``use_case``."""
        if not self._atom_table_exists():
            return False
        # EXISTS lets the database stop at the first matching row; only
        # presence matters here, not the number of rows.
        return bool(self.db.execute(
            text(
                "SELECT EXISTS (SELECT 1 FROM atom_classification "
                "WHERE use_case = :uc)"
            ),
            {"uc": use_case},
        ).scalar())

    def _atom_grain(
        self, uc: Any, lim: int, off: int, sub_topic: Optional[str],
    ) -> dict[str, Any]:
        """Build the atom-grain response page for the registry entry ``uc``.

        ``total`` honours the ``sub_topic`` filter, while ``subtopic_counts``
        is computed over all relevant atoms of the use-case regardless of the
        filter. Rows with a NULL sub-topic are counted under ``'(none)'``.
        """
        total = self.db.execute(text(
            "SELECT count(*) FROM atom_classification "
            "WHERE use_case = :uc AND relevant = true "
            "AND (:sub IS NULL OR sub_topic = :sub)"
        ), {"uc": uc.key, "sub": sub_topic}).scalar() or 0
        facet = {
            row[0]: int(row[1])
            for row in self.db.execute(text(
                "SELECT COALESCE(sub_topic, '(none)'), count(*) "
                "FROM atom_classification WHERE use_case = :uc AND relevant = true "
                "GROUP BY 1 ORDER BY 2 DESC"
            ), {"uc": uc.key}).fetchall()
        }
        rows = self.db.execute(_ATOM_LIST_SQL, {
            "uc": uc.key, "sub": sub_topic, "lim": lim, "off": off,
        }).fetchall()
        controls = [
            {
                "id": str(r.control_uuid),
                "control_id": r.control_id,
                "title": r.title,
                "objective": r.objective,
                "severity": r.severity,
                "sub_topic": r.sub_topic,
                "canonical_obligation": r.canonical_obligation,
                "source_regulation": r.source_regulation,
            }
            for r in rows
        ]
        return {
            "use_case": uc.key, "label": uc.label, "group": uc.group,
            "granularity": "atom", "total": int(total),
            "limit": lim, "offset": off,
            "sub_topic": sub_topic, "subtopic_counts": facet,
            "controls": controls,
        }