# breakpilot-compliance/backend-compliance/compliance/api/canonical_control_routes.py
# Last commit: c43d9da6d0 "merge: sync with origin/main, take upstream on conflicts"
# Conflicts:
#   admin-compliance/lib/sdk/types.ts
#   admin-compliance/lib/sdk/vendor-compliance/types.ts
# 2026-04-16 16:26:48 +02:00 — 1982 lines, 77 KiB, Python
"""
FastAPI routes for the Canonical Control Library.
Independently authored security controls anchored in open-source frameworks
(OWASP, NIST, ENISA). No proprietary nomenclature.
Endpoints:
GET /v1/canonical/frameworks — All frameworks
GET /v1/canonical/frameworks/{framework_id} — Framework details
GET /v1/canonical/frameworks/{framework_id}/controls — Controls of a framework
GET /v1/canonical/controls — All controls (filterable)
GET /v1/canonical/controls/{control_id} — Single control
GET /v1/canonical/controls/{control_id}/traceability — Traceability chain
GET /v1/canonical/controls/{control_id}/similar — Find similar controls
POST /v1/canonical/controls — Create a control
PUT /v1/canonical/controls/{control_id} — Update a control
DELETE /v1/canonical/controls/{control_id} — Delete a control
GET /v1/canonical/categories — Category list
GET /v1/canonical/sources — Source registry
GET /v1/canonical/licenses — License matrix
POST /v1/canonical/controls/{control_id}/similarity-check — Too-close check
"""
from __future__ import annotations

import json
import logging
from typing import Any, Optional

from fastapi import APIRouter, Depends, HTTPException, Query
from pydantic import BaseModel
from sqlalchemy import text
from sqlalchemy.orm import Session

from classroom_engine.database import get_db
from compliance.api._http_errors import translate_domain_errors
from compliance.schemas.canonical_control import (
    ControlCreateRequest,
    ControlResponse,
    ControlUpdateRequest,
    FrameworkResponse,
    SimilarityCheckRequest,
    SimilarityCheckResponse,
)
from compliance.services.canonical_control_service import (
    CanonicalControlService,
    _control_row,  # re-exported for legacy test imports
)
# Router for every Canonical Control Library endpoint; mounted under /v1/canonical.
router = APIRouter(prefix="/v1/canonical", tags=["canonical-controls"])
# =============================================================================
# RESPONSE MODELS
# =============================================================================
class FrameworkResponse(BaseModel):
    """API representation of a control framework.

    NOTE(review): this shadows the ``FrameworkResponse`` imported from
    ``compliance.schemas.canonical_control`` above — likely merge-conflict
    residue; confirm which definition is intended. ``BaseModel`` is not
    imported in this module's visible header.
    """

    id: str  # row UUID (stringified)
    framework_id: str  # human-readable key, e.g. 'bp_security_v1'
    name: str
    version: str
    description: Optional[str] = None
    owner: Optional[str] = None
    policy_version: Optional[str] = None
    release_state: str
    created_at: str  # ISO-8601 timestamp string
    updated_at: str  # ISO-8601 timestamp string
class ControlResponse(BaseModel):
    """Full API representation of a canonical control.

    Mirrors the columns selected in ``_CONTROL_COLS``.

    NOTE(review): this shadows the ``ControlResponse`` imported from
    ``compliance.schemas.canonical_control`` — likely merge-conflict residue.
    """

    id: str  # row UUID (stringified)
    framework_id: str  # owning framework's UUID (compared against fw.id in queries)
    control_id: str  # human-readable ID, e.g. 'AUTH-003'; domain prefix before '-'
    title: str
    objective: str
    rationale: str
    scope: dict
    requirements: list
    test_procedure: list
    evidence: list
    severity: str
    risk_score: Optional[float] = None
    implementation_effort: Optional[str] = None
    evidence_confidence: Optional[float] = None
    open_anchors: list
    release_state: str  # e.g. 'draft', 'duplicate', 'deprecated', 'rejected'
    tags: list
    license_rule: Optional[int] = None
    source_original_text: Optional[str] = None
    source_citation: Optional[dict] = None  # JSON; 'source' key is used for filtering
    customer_visible: Optional[bool] = None
    verification_method: Optional[str] = None
    category: Optional[str] = None
    evidence_type: Optional[str] = None  # code, process or hybrid (per query docs)
    target_audience: Optional[str] = None
    generation_metadata: Optional[dict] = None
    generation_strategy: Optional[str] = "ungrouped"
    applicable_industries: Optional[list] = None
    applicable_company_size: Optional[list] = None
    scope_conditions: Optional[dict] = None
    created_at: str  # ISO-8601 timestamp string
    updated_at: str  # ISO-8601 timestamp string
class ControlCreateRequest(BaseModel):
    """Payload for POST /v1/canonical/controls.

    The mutable ``{}``/``[]`` defaults are safe on a Pydantic model
    (defaults are copied per instance, unlike plain function defaults).

    NOTE(review): shadows the ``ControlCreateRequest`` imported from
    ``compliance.schemas.canonical_control`` — likely merge-conflict residue.
    """

    framework_id: str  # e.g. 'bp_security_v1'
    control_id: str  # e.g. 'AUTH-003'
    title: str
    objective: str
    rationale: str
    scope: dict = {}
    requirements: list = []
    test_procedure: list = []
    evidence: list = []
    severity: str = "medium"
    risk_score: Optional[float] = None
    implementation_effort: Optional[str] = None
    evidence_confidence: Optional[float] = None
    open_anchors: list = []
    release_state: str = "draft"
    tags: list = []
    license_rule: Optional[int] = None
    source_original_text: Optional[str] = None
    source_citation: Optional[dict] = None
    customer_visible: Optional[bool] = True
    verification_method: Optional[str] = None
    category: Optional[str] = None
    evidence_type: Optional[str] = None
    target_audience: Optional[str] = None
    generation_metadata: Optional[dict] = None
    applicable_industries: Optional[list] = None
    applicable_company_size: Optional[list] = None
    scope_conditions: Optional[dict] = None
class ControlUpdateRequest(BaseModel):
    """Partial-update payload for PUT /v1/canonical/controls/{control_id}.

    Every field is optional; presumably only explicitly provided fields
    are applied — confirm against the service layer.

    NOTE(review): shadows the ``ControlUpdateRequest`` imported from
    ``compliance.schemas.canonical_control`` — likely merge-conflict residue.
    """

    title: Optional[str] = None
    objective: Optional[str] = None
    rationale: Optional[str] = None
    scope: Optional[dict] = None
    requirements: Optional[list] = None
    test_procedure: Optional[list] = None
    evidence: Optional[list] = None
    severity: Optional[str] = None
    risk_score: Optional[float] = None
    implementation_effort: Optional[str] = None
    evidence_confidence: Optional[float] = None
    open_anchors: Optional[list] = None
    release_state: Optional[str] = None
    tags: Optional[list] = None
    license_rule: Optional[int] = None
    source_original_text: Optional[str] = None
    source_citation: Optional[dict] = None
    customer_visible: Optional[bool] = None
    verification_method: Optional[str] = None
    category: Optional[str] = None
    evidence_type: Optional[str] = None
    target_audience: Optional[str] = None
    generation_metadata: Optional[dict] = None
    applicable_industries: Optional[list] = None
    applicable_company_size: Optional[list] = None
    scope_conditions: Optional[dict] = None
class SimilarityCheckRequest(BaseModel):
    """Payload for the 'too-close' similarity check between two texts."""

    source_text: str
    candidate_text: str
class SimilarityCheckResponse(BaseModel):
    """Similarity metrics returned by the too-close check.

    Field semantics (exact metric definitions live in the service layer —
    confirm there): exact-match run length, token overlap, n-gram Jaccard,
    embedding cosine similarity, and LCS ratio, plus an overall status.
    """

    max_exact_run: int
    token_overlap: float
    ngram_jaccard: float
    embedding_cosine: float
    lcs_ratio: float
    status: str
    details: dict
# =============================================================================
# HELPERS
# =============================================================================
_CONTROL_COLS = """id, framework_id, control_id, title, objective, rationale,
scope, requirements, test_procedure, evidence,
severity, risk_score, implementation_effort,
evidence_confidence, open_anchors, release_state, tags,
license_rule, source_original_text, source_citation,
customer_visible, verification_method, category, evidence_type,
target_audience, generation_metadata, generation_strategy,
applicable_industries, applicable_company_size, scope_conditions,
parent_control_uuid, decomposition_method, pipeline_version,
(SELECT p.control_id FROM canonical_controls p WHERE p.id = canonical_controls.parent_control_uuid) AS parent_control_id,
(SELECT p.title FROM canonical_controls p WHERE p.id = canonical_controls.parent_control_uuid) AS parent_control_title,
created_at, updated_at"""
def _row_to_dict(row, columns: list[str]) -> dict[str, Any]:
    """Convert a row object into a plain dict, ISO-formatting datetimes.

    Args:
        row: any object exposing the requested columns as attributes
            (e.g. a SQLAlchemy ``Row``).
        columns: attribute names to extract.

    Returns:
        Mapping of column name to value.  ``datetime``/``date`` values
        (anything with an ``isoformat`` method) are serialized to strings.
        Missing attributes yield ``None`` — the original fetched each
        attribute twice and would raise ``AttributeError`` on the second,
        default-less lookup when the attribute was absent.
    """
    result: dict[str, Any] = {}
    for col in columns:
        value = getattr(row, col, None)  # fetch once; tolerate missing attrs
        result[col] = value.isoformat() if hasattr(value, "isoformat") else value
    return result
# =============================================================================
# FRAMEWORKS
# =============================================================================
@router.get("/frameworks")
async def list_frameworks(
    service: CanonicalControlService = Depends(get_canonical_service),
) -> list[dict[str, Any]]:
    """Return every registered control framework as a list of dicts."""
    # Domain-level errors raised by the service are mapped to HTTP errors.
    with translate_domain_errors():
        frameworks = service.list_frameworks()
    return frameworks
@router.get("/frameworks/{framework_id}")
async def get_framework(
    framework_id: str,
    service: CanonicalControlService = Depends(get_canonical_service),
) -> dict[str, Any]:
    """Look up a single framework by its human-readable framework_id."""
    # Domain-level errors raised by the service are mapped to HTTP errors.
    with translate_domain_errors():
        framework = service.get_framework(framework_id)
    return framework
@router.get("/frameworks/{framework_id}/controls")
async def list_framework_controls(
    framework_id: str,
    severity: Optional[str] = Query(None),
    release_state: Optional[str] = Query(None),
    verification_method: Optional[str] = Query(None),
    category: Optional[str] = Query(None),
    target_audience: Optional[str] = Query(None),
):
    """List controls belonging to a framework."""
    with SessionLocal() as db:
        # Resolve the framework's UUID from its human-readable key.
        fw = db.execute(
            text("SELECT id FROM canonical_control_frameworks WHERE framework_id = :fid"),
            {"fid": framework_id},
        ).fetchone()
        if not fw:
            raise HTTPException(status_code=404, detail="Framework not found")
        # Assemble the WHERE clause from the optional filters; every user
        # value is bound as a parameter, never interpolated.
        conditions = ["framework_id = :fw_id"]
        params: dict[str, Any] = {"fw_id": str(fw.id)}
        if severity:
            conditions.append("severity = :sev")
            params["sev"] = severity
        if release_state:
            conditions.append("release_state = :rs")
            params["rs"] = release_state
        if verification_method:
            conditions.append("verification_method = :vm")
            params["vm"] = verification_method
        if category:
            conditions.append("category = :cat")
            params["cat"] = category
        if target_audience:
            # JSONB containment: the column holds a JSON array of audiences.
            conditions.append("target_audience::jsonb @> (:ta)::jsonb")
            params["ta"] = json.dumps([target_audience])
        sql = (
            f"SELECT {_CONTROL_COLS} FROM canonical_controls WHERE "
            + " AND ".join(conditions)
            + " ORDER BY control_id"
        )
        rows = db.execute(text(sql), params).fetchall()
        return [_control_row(r) for r in rows]
# =============================================================================
# CONTROLS
# =============================================================================
@router.get("/controls")
async def list_controls(
    severity: Optional[str] = Query(None),
    domain: Optional[str] = Query(None),
    release_state: Optional[str] = Query(None),
    verification_method: Optional[str] = Query(None),
    category: Optional[str] = Query(None),
    evidence_type: Optional[str] = Query(None, description="Filter: code, process, hybrid"),
    target_audience: Optional[str] = Query(None),
    source: Optional[str] = Query(None, description="Filter by source_citation->source"),
    search: Optional[str] = Query(None, description="Full-text search in control_id, title, objective"),
    control_type: Optional[str] = Query(None, description="Filter: atomic, rich, or all"),
    exclude_duplicates: bool = Query(False, description="Exclude controls with release_state='duplicate'"),
    sort: Optional[str] = Query("control_id", description="Sort field: control_id, created_at, severity"),
    order: Optional[str] = Query("asc", description="Sort order: asc or desc"),
    limit: Optional[int] = Query(None, ge=1, le=5000, description="Max results"),
    offset: Optional[int] = Query(None, ge=0, description="Offset for pagination"),
):
    """List canonical controls with filters, search, sorting and pagination.

    The WHERE clause is assembled incrementally; every user value is bound
    as a SQL parameter (never interpolated) and the sort column is checked
    against a whitelist below, so the f-string assembly is injection-safe.
    The sentinel value ``"__none__"`` selects rows where the respective
    column IS NULL (used by the UI facet dropdowns).

    NOTE(review): ``SessionLocal``, ``text`` and ``HTTPException`` do not
    appear in this module's visible imports — confirm they are provided
    elsewhere.  Filter logic here must stay in sync with ``count_controls``
    and ``controls_meta``.
    """
    query = f"""
        SELECT {_CONTROL_COLS}
        FROM canonical_controls
        WHERE 1=1
    """
    params: dict[str, Any] = {}
    if exclude_duplicates:
        query += " AND release_state != 'duplicate'"
    if severity:
        query += " AND severity = :sev"
        params["sev"] = severity
    if domain:
        # Domain is encoded as the control_id prefix (e.g. 'AUTH-003' -> AUTH).
        query += " AND LEFT(control_id, LENGTH(:dom)) = :dom"
        params["dom"] = domain.upper()
    if release_state:
        query += " AND release_state = :rs"
        params["rs"] = release_state
    if verification_method:
        if verification_method == "__none__":
            query += " AND verification_method IS NULL"
        else:
            query += " AND verification_method = :vm"
            params["vm"] = verification_method
    if category:
        if category == "__none__":
            query += " AND category IS NULL"
        else:
            query += " AND category = :cat"
            params["cat"] = category
    if evidence_type:
        if evidence_type == "__none__":
            query += " AND evidence_type IS NULL"
        else:
            query += " AND evidence_type = :et"
            params["et"] = evidence_type
    if target_audience:
        # The column holds a JSON-encoded list; match the quoted element
        # textually (cheaper than a jsonb cast, same intent).
        query += " AND target_audience LIKE :ta_pattern"
        params["ta_pattern"] = f'%"{target_audience}"%'
    if source:
        if source == "__none__":
            # "No source" means NULL citation, missing 'source' key, or empty string.
            query += " AND (source_citation IS NULL OR source_citation->>'source' IS NULL OR source_citation->>'source' = '')"
        else:
            query += " AND source_citation->>'source' = :src"
            params["src"] = source
    # Control-type taxonomy: 'atomic' = pass0b decomposition output,
    # 'rich' = everything else, 'eigenentwicklung' = independently authored
    # v1 controls (no citation, no parent, pipeline v1 or unset).
    if control_type == "atomic":
        query += " AND decomposition_method = 'pass0b'"
    elif control_type == "rich":
        query += " AND (decomposition_method IS NULL OR decomposition_method != 'pass0b')"
    elif control_type == "eigenentwicklung":
        query += """ AND generation_strategy = 'ungrouped'
            AND (pipeline_version = '1' OR pipeline_version IS NULL)
            AND source_citation IS NULL
            AND parent_control_uuid IS NULL"""
    if search:
        query += " AND (control_id ILIKE :q OR title ILIKE :q OR objective ILIKE :q)"
        params["q"] = f"%{search}%"
    # Sorting — whitelist the column name before interpolating it.
    sort_col = "control_id"
    if sort in ("created_at", "updated_at", "severity", "control_id"):
        sort_col = sort
    elif sort == "source":
        sort_col = "source_citation->>'source'"
    sort_dir = "DESC" if order and order.lower() == "desc" else "ASC"
    if sort == "source":
        # Group by source first, then by control_id within each source
        query += f" ORDER BY {sort_col} {sort_dir} NULLS LAST, control_id ASC"
    else:
        query += f" ORDER BY {sort_col} {sort_dir}"
    if limit is not None:
        query += " LIMIT :lim"
        params["lim"] = limit
    if offset is not None:
        query += " OFFSET :off"
        params["off"] = offset
    with SessionLocal() as db:
        rows = db.execute(text(query), params).fetchall()
        return [_control_row(r) for r in rows]
@router.get("/controls-count")
async def count_controls(
    severity: Optional[str] = Query(None),
    domain: Optional[str] = Query(None),
    release_state: Optional[str] = Query(None),
    verification_method: Optional[str] = Query(None),
    category: Optional[str] = Query(None),
    evidence_type: Optional[str] = Query(None),
    target_audience: Optional[str] = Query(None),
    source: Optional[str] = Query(None),
    search: Optional[str] = Query(None),
    control_type: Optional[str] = Query(None),
    exclude_duplicates: bool = Query(False, description="Exclude controls with release_state='duplicate'"),
):
    """Count controls matching filters (for pagination).

    Duplicates the filter assembly of ``list_controls`` (without sorting
    and paging) so the UI can compute total page counts — the two must be
    kept in sync.  All values are bound as SQL parameters; ``"__none__"``
    selects NULL columns.
    """
    query = "SELECT count(*) FROM canonical_controls WHERE 1=1"
    params: dict[str, Any] = {}
    if exclude_duplicates:
        query += " AND release_state != 'duplicate'"
    if severity:
        query += " AND severity = :sev"
        params["sev"] = severity
    if domain:
        # Domain = control_id prefix, e.g. 'AUTH-003' -> AUTH.
        query += " AND LEFT(control_id, LENGTH(:dom)) = :dom"
        params["dom"] = domain.upper()
    if release_state:
        query += " AND release_state = :rs"
        params["rs"] = release_state
    if verification_method:
        if verification_method == "__none__":
            query += " AND verification_method IS NULL"
        else:
            query += " AND verification_method = :vm"
            params["vm"] = verification_method
    if category:
        if category == "__none__":
            query += " AND category IS NULL"
        else:
            query += " AND category = :cat"
            params["cat"] = category
    if evidence_type:
        if evidence_type == "__none__":
            query += " AND evidence_type IS NULL"
        else:
            query += " AND evidence_type = :et"
            params["et"] = evidence_type
    if target_audience:
        # Textual match against the JSON-encoded audience list.
        query += " AND target_audience LIKE :ta_pattern"
        params["ta_pattern"] = f'%"{target_audience}"%'
    if source:
        if source == "__none__":
            query += " AND (source_citation IS NULL OR source_citation->>'source' IS NULL OR source_citation->>'source' = '')"
        else:
            query += " AND source_citation->>'source' = :src"
            params["src"] = source
    # Same control-type taxonomy as list_controls.
    if control_type == "atomic":
        query += " AND decomposition_method = 'pass0b'"
    elif control_type == "rich":
        query += " AND (decomposition_method IS NULL OR decomposition_method != 'pass0b')"
    elif control_type == "eigenentwicklung":
        query += """ AND generation_strategy = 'ungrouped'
            AND (pipeline_version = '1' OR pipeline_version IS NULL)
            AND source_citation IS NULL
            AND parent_control_uuid IS NULL"""
    if search:
        query += " AND (control_id ILIKE :q OR title ILIKE :q OR objective ILIKE :q)"
        params["q"] = f"%{search}%"
    with SessionLocal() as db:
        total = db.execute(text(query), params).scalar()
        return {"total": total}
@router.get("/controls-meta")
async def controls_meta(
    severity: Optional[str] = Query(None),
    domain: Optional[str] = Query(None),
    release_state: Optional[str] = Query(None),
    verification_method: Optional[str] = Query(None),
    category: Optional[str] = Query(None),
    evidence_type: Optional[str] = Query(None),
    target_audience: Optional[str] = Query(None),
    source: Optional[str] = Query(None),
    search: Optional[str] = Query(None),
    control_type: Optional[str] = Query(None),
    exclude_duplicates: bool = Query(False),
):
    """Return faceted metadata for filter dropdowns.

    Each facet's counts respect ALL active filters EXCEPT the facet's own,
    so dropdowns always show how many items each option would yield.
    Filter semantics mirror ``list_controls`` and must stay in sync with it.
    """
    def _build_where(skip: Optional[str] = None) -> tuple[str, dict[str, Any]]:
        # Rebuild the WHERE clause from this endpoint's filter arguments,
        # optionally omitting one filter (the facet currently being counted).
        # Returns (SQL conditions joined with AND, bound parameters).
        clauses = ["1=1"]
        p: dict[str, Any] = {}
        if exclude_duplicates:
            clauses.append("release_state != 'duplicate'")
        if severity and skip != "severity":
            clauses.append("severity = :sev")
            p["sev"] = severity
        if domain and skip != "domain":
            # Domain = control_id prefix, e.g. 'AUTH-003' -> AUTH.
            clauses.append("LEFT(control_id, LENGTH(:dom)) = :dom")
            p["dom"] = domain.upper()
        if release_state and skip != "release_state":
            clauses.append("release_state = :rs")
            p["rs"] = release_state
        if verification_method and skip != "verification_method":
            if verification_method == "__none__":
                clauses.append("verification_method IS NULL")
            else:
                clauses.append("verification_method = :vm")
                p["vm"] = verification_method
        if category and skip != "category":
            if category == "__none__":
                clauses.append("category IS NULL")
            else:
                clauses.append("category = :cat")
                p["cat"] = category
        if evidence_type and skip != "evidence_type":
            if evidence_type == "__none__":
                clauses.append("evidence_type IS NULL")
            else:
                clauses.append("evidence_type = :et")
                p["et"] = evidence_type
        if target_audience and skip != "target_audience":
            clauses.append("target_audience LIKE :ta_pattern")
            p["ta_pattern"] = f'%"{target_audience}"%'
        if source and skip != "source":
            if source == "__none__":
                clauses.append("(source_citation IS NULL OR source_citation->>'source' IS NULL OR source_citation->>'source' = '')")
            else:
                clauses.append("source_citation->>'source' = :src")
                p["src"] = source
        if control_type and skip != "control_type":
            # Same control-type taxonomy as list_controls.
            if control_type == "atomic":
                clauses.append("decomposition_method = 'pass0b'")
            elif control_type == "rich":
                clauses.append("(decomposition_method IS NULL OR decomposition_method != 'pass0b')")
            elif control_type == "eigenentwicklung":
                clauses.append("""generation_strategy = 'ungrouped'
                    AND (pipeline_version = '1' OR pipeline_version IS NULL)
                    AND source_citation IS NULL
                    AND parent_control_uuid IS NULL""")
        if search and skip != "search":
            clauses.append("(control_id ILIKE :q OR title ILIKE :q OR objective ILIKE :q)")
            p["q"] = f"%{search}%"
        return " AND ".join(clauses), p
    with SessionLocal() as db:
        # Total with ALL filters
        w_all, p_all = _build_where()
        total = db.execute(text(f"SELECT count(*) FROM canonical_controls WHERE {w_all}"), p_all).scalar()
        # Domain facet (skip domain filter so user sees all domains)
        w_dom, p_dom = _build_where(skip="domain")
        domains = db.execute(text(f"""
            SELECT UPPER(SPLIT_PART(control_id, '-', 1)) as domain, count(*) as cnt
            FROM canonical_controls WHERE {w_dom}
            GROUP BY domain ORDER BY domain
        """), p_dom).fetchall()
        # Source facet (skip source filter)
        w_src, p_src = _build_where(skip="source")
        sources = db.execute(text(f"""
            SELECT source_citation->>'source' as src, count(*) as cnt
            FROM canonical_controls
            WHERE {w_src}
            AND source_citation->>'source' IS NOT NULL AND source_citation->>'source' != ''
            GROUP BY src ORDER BY cnt DESC
        """), p_src).fetchall()
        # Separate count for rows without any usable source.
        no_source = db.execute(text(f"""
            SELECT count(*) FROM canonical_controls
            WHERE {w_src}
            AND (source_citation IS NULL OR source_citation->>'source' IS NULL OR source_citation->>'source' = '')
        """), p_src).scalar()
        # Type facet (skip control_type filter)
        w_typ, p_typ = _build_where(skip="control_type")
        atomic_count = db.execute(text(f"""
            SELECT count(*) FROM canonical_controls
            WHERE {w_typ} AND decomposition_method = 'pass0b'
        """), p_typ).scalar() or 0
        eigenentwicklung_count = db.execute(text(f"""
            SELECT count(*) FROM canonical_controls
            WHERE {w_typ}
            AND generation_strategy = 'ungrouped'
            AND (pipeline_version = '1' OR pipeline_version IS NULL)
            AND source_citation IS NULL
            AND parent_control_uuid IS NULL
        """), p_typ).scalar() or 0
        rich_count = db.execute(text(f"""
            SELECT count(*) FROM canonical_controls
            WHERE {w_typ}
            AND (decomposition_method IS NULL OR decomposition_method != 'pass0b')
        """), p_typ).scalar() or 0
        # Severity facet (skip severity filter)
        w_sev, p_sev = _build_where(skip="severity")
        severity_counts = db.execute(text(f"""
            SELECT severity, count(*) as cnt
            FROM canonical_controls WHERE {w_sev}
            GROUP BY severity ORDER BY severity
        """), p_sev).fetchall()
        # Verification method facet (include NULLs as __none__)
        w_vm, p_vm = _build_where(skip="verification_method")
        vm_counts = db.execute(text(f"""
            SELECT COALESCE(verification_method, '__none__') as vm, count(*) as cnt
            FROM canonical_controls WHERE {w_vm}
            GROUP BY vm ORDER BY vm
        """), p_vm).fetchall()
        # Category facet (include NULLs as __none__)
        w_cat, p_cat = _build_where(skip="category")
        cat_counts = db.execute(text(f"""
            SELECT COALESCE(category, '__none__') as cat, count(*) as cnt
            FROM canonical_controls WHERE {w_cat}
            GROUP BY cat ORDER BY cnt DESC
        """), p_cat).fetchall()
        # Evidence type facet (include NULLs as __none__)
        w_et, p_et = _build_where(skip="evidence_type")
        et_counts = db.execute(text(f"""
            SELECT COALESCE(evidence_type, '__none__') as et, count(*) as cnt
            FROM canonical_controls WHERE {w_et}
            GROUP BY et ORDER BY et
        """), p_et).fetchall()
        # Release state facet
        w_rs, p_rs = _build_where(skip="release_state")
        rs_counts = db.execute(text(f"""
            SELECT release_state, count(*) as cnt
            FROM canonical_controls WHERE {w_rs}
            GROUP BY release_state ORDER BY release_state
        """), p_rs).fetchall()
        return {
            "total": total,
            "domains": [{"domain": r[0], "count": r[1]} for r in domains],
            "sources": [{"source": r[0], "count": r[1]} for r in sources],
            "no_source_count": no_source,
            "type_counts": {
                "rich": rich_count,
                "atomic": atomic_count,
                "eigenentwicklung": eigenentwicklung_count,
            },
            "severity_counts": {r[0]: r[1] for r in severity_counts},
            "verification_method_counts": {r[0]: r[1] for r in vm_counts},
            "category_counts": {r[0]: r[1] for r in cat_counts},
            "evidence_type_counts": {r[0]: r[1] for r in et_counts},
            "release_state_counts": {r[0]: r[1] for r in rs_counts},
        }
@router.get("/controls/atomic-stats")
async def atomic_stats():
    """Return aggregated statistics for atomic controls (masters only).

    'Atomic' means decomposition_method = 'pass0b'; 'active' excludes the
    duplicate/deprecated/rejected release states.  Note this static path
    is registered before ``/controls/{control_id}`` so it is matched first.
    """
    with SessionLocal() as db:
        # Active atomic masters.
        total_active = db.execute(text("""
            SELECT count(*) FROM canonical_controls
            WHERE decomposition_method = 'pass0b'
            AND release_state NOT IN ('duplicate', 'deprecated', 'rejected')
        """)).scalar() or 0
        # Atomic controls merged away as duplicates.
        total_duplicate = db.execute(text("""
            SELECT count(*) FROM canonical_controls
            WHERE decomposition_method = 'pass0b'
            AND release_state = 'duplicate'
        """)).scalar() or 0
        # Active masters grouped by the control_id domain prefix.
        by_domain = db.execute(text("""
            SELECT UPPER(SPLIT_PART(control_id, '-', 1)) AS domain, count(*) AS cnt
            FROM canonical_controls
            WHERE decomposition_method = 'pass0b'
            AND release_state NOT IN ('duplicate', 'deprecated', 'rejected')
            GROUP BY domain ORDER BY cnt DESC
        """)).fetchall()
        # Active masters grouped by the regulations they trace back to
        # (via the M:N control_parent_links table).
        by_regulation = db.execute(text("""
            SELECT cpl.source_regulation AS regulation, count(DISTINCT cc.id) AS cnt
            FROM canonical_controls cc
            JOIN control_parent_links cpl ON cpl.control_uuid = cc.id
            WHERE cc.decomposition_method = 'pass0b'
            AND cc.release_state NOT IN ('duplicate', 'deprecated', 'rejected')
            AND cpl.source_regulation IS NOT NULL
            GROUP BY cpl.source_regulation ORDER BY cnt DESC
        """)).fetchall()
        # Average number of distinct regulations each active master covers
        # (LEFT JOIN so controls without links count as 0).
        avg_coverage = db.execute(text("""
            SELECT COALESCE(AVG(reg_count), 0)
            FROM (
                SELECT cc.id, count(DISTINCT cpl.source_regulation) AS reg_count
                FROM canonical_controls cc
                LEFT JOIN control_parent_links cpl ON cpl.control_uuid = cc.id
                WHERE cc.decomposition_method = 'pass0b'
                AND cc.release_state NOT IN ('duplicate', 'deprecated', 'rejected')
                GROUP BY cc.id
            ) sub
        """)).scalar() or 0
        return {
            "total_active": total_active,
            "total_duplicate": total_duplicate,
            "by_domain": [{"domain": r[0], "count": r[1]} for r in by_domain],
            "by_regulation": [{"regulation": r[0], "count": r[1]} for r in by_regulation],
            "avg_regulation_coverage": round(float(avg_coverage), 1),
        }
@router.get("/controls/v1-enrichment-stats")
async def v1_enrichment_stats_endpoint():
    """Overview: how many v1 controls have regulatory coverage?

    Delegates entirely to the v1-enrichment service.
    """
    # Lazy import — presumably to avoid an import cycle or startup cost;
    # TODO confirm before hoisting to module level.
    from compliance.services.v1_enrichment import get_v1_enrichment_stats
    return await get_v1_enrichment_stats()
@router.get("/controls/{control_id}")
async def get_control(
    control_id: str,
    service: CanonicalControlService = Depends(get_canonical_service),
) -> dict[str, Any]:
    """Get a single canonical control by its control_id (e.g. AUTH-001).

    The path parameter is uppercased before the lookup, matching how
    control IDs are stored.

    NOTE(review): the injected ``service`` is never used — this handler
    queries via ``SessionLocal`` directly; consider dropping the
    dependency or routing through the service layer like the framework
    endpoints above.
    """
    with SessionLocal() as db:
        row = db.execute(
            text(f"""
                SELECT {_CONTROL_COLS}
                FROM canonical_controls
                WHERE control_id = :cid
            """),
            {"cid": control_id.upper()},
        ).fetchone()
        if not row:
            raise HTTPException(status_code=404, detail="Control not found")
        return _control_row(row)
@router.get("/controls/{control_id}/traceability")
async def get_control_traceability(control_id: str):
    """Get the full traceability chain for a control.

    For atomic controls: shows all parent links with source regulations,
    articles, and the obligation chain.
    For rich controls: shows child atomic controls derived from them.

    Response keys: control_id, title, is_atomic, parent_links, children,
    source_count (number of distinct source regulations).
    """
    with SessionLocal() as db:
        # Get control UUID
        ctrl = db.execute(
            text("""
                SELECT id, control_id, title, parent_control_uuid,
                       decomposition_method, source_citation
                FROM canonical_controls WHERE control_id = :cid
            """),
            {"cid": control_id.upper()},
        ).fetchone()
        if not ctrl:
            raise HTTPException(status_code=404, detail="Control not found")
        result: dict[str, Any] = {
            "control_id": ctrl.control_id,
            "title": ctrl.title,
            # 'pass0b' decomposition marks atomic controls throughout this module.
            "is_atomic": ctrl.decomposition_method == "pass0b",
        }
        ctrl_uuid = str(ctrl.id)
        # Parent links (M:N) — for atomic controls.  Each link may carry the
        # obligation candidate it was derived from (LEFT JOIN: optional).
        parent_links = db.execute(
            text("""
                SELECT cpl.parent_control_uuid, cpl.link_type,
                       cpl.confidence, cpl.source_regulation,
                       cpl.source_article, cpl.obligation_candidate_id,
                       cc.control_id AS parent_control_id,
                       cc.title AS parent_title,
                       cc.source_citation AS parent_citation,
                       oc.obligation_text, oc.action, oc.object,
                       oc.normative_strength
                FROM control_parent_links cpl
                JOIN canonical_controls cc ON cc.id = cpl.parent_control_uuid
                LEFT JOIN obligation_candidates oc ON oc.id = cpl.obligation_candidate_id
                WHERE cpl.control_uuid = CAST(:uid AS uuid)
                ORDER BY cpl.source_regulation, cpl.source_article
            """),
            {"uid": ctrl_uuid},
        ).fetchall()
        result["parent_links"] = [
            {
                "parent_control_id": pl.parent_control_id,
                "parent_title": pl.parent_title,
                "link_type": pl.link_type,
                # Missing confidence defaults to full confidence.
                "confidence": float(pl.confidence) if pl.confidence else 1.0,
                "source_regulation": pl.source_regulation,
                "source_article": pl.source_article,
                "parent_citation": pl.parent_citation,
                "obligation": {
                    "text": pl.obligation_text,
                    "action": pl.action,
                    "object": pl.object,
                    "normative_strength": pl.normative_strength,
                } if pl.obligation_text else None,
            }
            for pl in parent_links
        ]
        # Also include the 1:1 parent (backwards compat) if not already in links
        if ctrl.parent_control_uuid:
            parent_uuids_in_links = {
                str(pl.parent_control_uuid) for pl in parent_links
            }
            parent_uuid_str = str(ctrl.parent_control_uuid)
            if parent_uuid_str not in parent_uuids_in_links:
                legacy = db.execute(
                    text("""
                        SELECT control_id, title, source_citation
                        FROM canonical_controls WHERE id = CAST(:uid AS uuid)
                    """),
                    {"uid": parent_uuid_str},
                ).fetchone()
                if legacy:
                    # Legacy parent is shown first, as a plain decomposition link.
                    result["parent_links"].insert(0, {
                        "parent_control_id": legacy.control_id,
                        "parent_title": legacy.title,
                        "link_type": "decomposition",
                        "confidence": 1.0,
                        "source_regulation": None,
                        "source_article": None,
                        "parent_citation": legacy.source_citation,
                        "obligation": None,
                    })
        # Child controls — for rich controls
        children = db.execute(
            text("""
                SELECT control_id, title, category, severity,
                       decomposition_method
                FROM canonical_controls
                WHERE parent_control_uuid = CAST(:uid AS uuid)
                ORDER BY control_id
            """),
            {"uid": ctrl_uuid},
        ).fetchall()
        result["children"] = [
            {
                "control_id": ch.control_id,
                "title": ch.title,
                "category": ch.category,
                "severity": ch.severity,
                "decomposition_method": ch.decomposition_method,
            }
            for ch in children
        ]
        # Unique source regulations count
        regs = set()
        for pl in result["parent_links"]:
            if pl.get("source_regulation"):
                regs.add(pl["source_regulation"])
        result["source_count"] = len(regs)
        return result
@router.get("/controls/{control_id}/provenance")
async def get_control_provenance(control_id: str):
    """Get full provenance chain for a control — extends traceability with
    obligations, document references, merged duplicates, and regulations summary.

    Response keys: everything ``get_control_traceability`` returns plus
    obligations, obligation_count, document_references, merged_duplicates,
    merged_duplicates_count, and regulations_summary.

    NOTE(review): the parent-links/children/source-count sections duplicate
    ``get_control_traceability`` verbatim — consider extracting a shared
    helper to keep the two endpoints in sync.
    """
    with SessionLocal() as db:
        ctrl = db.execute(
            text("""
                SELECT id, control_id, title, parent_control_uuid,
                       decomposition_method, source_citation
                FROM canonical_controls WHERE control_id = :cid
            """),
            {"cid": control_id.upper()},
        ).fetchone()
        if not ctrl:
            raise HTTPException(status_code=404, detail="Control not found")
        ctrl_uuid = str(ctrl.id)
        # 'pass0b' decomposition marks atomic controls throughout this module.
        is_atomic = ctrl.decomposition_method == "pass0b"
        result: dict[str, Any] = {
            "control_id": ctrl.control_id,
            "title": ctrl.title,
            "is_atomic": is_atomic,
        }
        # --- Parent links (same as traceability) ---
        parent_links = db.execute(
            text("""
                SELECT cpl.parent_control_uuid, cpl.link_type,
                       cpl.confidence, cpl.source_regulation,
                       cpl.source_article, cpl.obligation_candidate_id,
                       cc.control_id AS parent_control_id,
                       cc.title AS parent_title,
                       cc.source_citation AS parent_citation,
                       oc.obligation_text, oc.action, oc.object,
                       oc.normative_strength
                FROM control_parent_links cpl
                JOIN canonical_controls cc ON cc.id = cpl.parent_control_uuid
                LEFT JOIN obligation_candidates oc ON oc.id = cpl.obligation_candidate_id
                WHERE cpl.control_uuid = CAST(:uid AS uuid)
                ORDER BY cpl.source_regulation, cpl.source_article
            """),
            {"uid": ctrl_uuid},
        ).fetchall()
        result["parent_links"] = [
            {
                "parent_control_id": pl.parent_control_id,
                "parent_title": pl.parent_title,
                "link_type": pl.link_type,
                # Missing confidence defaults to full confidence.
                "confidence": float(pl.confidence) if pl.confidence else 1.0,
                "source_regulation": pl.source_regulation,
                "source_article": pl.source_article,
                "parent_citation": pl.parent_citation,
                "obligation": {
                    "text": pl.obligation_text,
                    "action": pl.action,
                    "object": pl.object,
                    "normative_strength": pl.normative_strength,
                } if pl.obligation_text else None,
            }
            for pl in parent_links
        ]
        # Legacy 1:1 parent (backwards compat)
        if ctrl.parent_control_uuid:
            parent_uuids_in_links = {
                str(pl.parent_control_uuid) for pl in parent_links
            }
            parent_uuid_str = str(ctrl.parent_control_uuid)
            if parent_uuid_str not in parent_uuids_in_links:
                legacy = db.execute(
                    text("""
                        SELECT control_id, title, source_citation
                        FROM canonical_controls WHERE id = CAST(:uid AS uuid)
                    """),
                    {"uid": parent_uuid_str},
                ).fetchone()
                if legacy:
                    result["parent_links"].insert(0, {
                        "parent_control_id": legacy.control_id,
                        "parent_title": legacy.title,
                        "link_type": "decomposition",
                        "confidence": 1.0,
                        "source_regulation": None,
                        "source_article": None,
                        "parent_citation": legacy.source_citation,
                        "obligation": None,
                    })
        # --- Children ---
        children = db.execute(
            text("""
                SELECT control_id, title, category, severity,
                       decomposition_method
                FROM canonical_controls
                WHERE parent_control_uuid = CAST(:uid AS uuid)
                ORDER BY control_id
            """),
            {"uid": ctrl_uuid},
        ).fetchall()
        result["children"] = [
            {
                "control_id": ch.control_id,
                "title": ch.title,
                "category": ch.category,
                "severity": ch.severity,
                "decomposition_method": ch.decomposition_method,
            }
            for ch in children
        ]
        # Source count
        regs = set()
        for pl in result["parent_links"]:
            if pl.get("source_regulation"):
                regs.add(pl["source_regulation"])
        result["source_count"] = len(regs)
        # --- Obligations (for Rich Controls) ---
        # Only live candidates: rejected/merged/duplicate states are excluded.
        obligations = db.execute(
            text("""
                SELECT candidate_id, obligation_text, action, object,
                       normative_strength, release_state
                FROM obligation_candidates
                WHERE parent_control_uuid = CAST(:uid AS uuid)
                AND release_state NOT IN ('rejected', 'merged', 'duplicate')
                ORDER BY candidate_id
            """),
            {"uid": ctrl_uuid},
        ).fetchall()
        result["obligations"] = [
            {
                "candidate_id": ob.candidate_id,
                "obligation_text": ob.obligation_text,
                "action": ob.action,
                "object": ob.object,
                "normative_strength": ob.normative_strength,
                "release_state": ob.release_state,
            }
            for ob in obligations
        ]
        result["obligation_count"] = len(obligations)
        # --- Document References ---
        # Extractions tied to the control directly, or indirectly via an
        # obligation candidate referenced by one of its parent links.
        doc_refs = db.execute(
            text("""
                SELECT DISTINCT oe.regulation_code, oe.article, oe.paragraph,
                       oe.extraction_method, oe.confidence
                FROM obligation_extractions oe
                WHERE oe.control_uuid = CAST(:uid AS uuid)
                OR oe.obligation_id IN (
                    SELECT oc.candidate_id FROM obligation_candidates oc
                    JOIN control_parent_links cpl ON cpl.obligation_candidate_id = oc.id
                    WHERE cpl.control_uuid = CAST(:uid AS uuid)
                )
                ORDER BY oe.regulation_code, oe.article
            """),
            {"uid": ctrl_uuid},
        ).fetchall()
        result["document_references"] = [
            {
                "regulation_code": dr.regulation_code,
                "article": dr.article,
                "paragraph": dr.paragraph,
                "extraction_method": dr.extraction_method,
                "confidence": float(dr.confidence) if dr.confidence else None,
            }
            for dr in doc_refs
        ]
        # --- Merged Duplicates ---
        # Controls folded into this one; each carries (at most) one sample
        # source regulation pulled via a correlated subquery.
        merged = db.execute(
            text("""
                SELECT cc.control_id, cc.title,
                       (SELECT cpl.source_regulation FROM control_parent_links cpl
                        WHERE cpl.control_uuid = cc.id LIMIT 1) AS source_regulation
                FROM canonical_controls cc
                WHERE cc.merged_into_uuid = CAST(:uid AS uuid)
                AND cc.release_state = 'duplicate'
                ORDER BY cc.control_id
            """),
            {"uid": ctrl_uuid},
        ).fetchall()
        result["merged_duplicates"] = [
            {
                "control_id": m.control_id,
                "title": m.title,
                "source_regulation": m.source_regulation,
            }
            for m in merged
        ]
        result["merged_duplicates_count"] = len(merged)
        # --- Regulations Summary (aggregated from parent_links + doc_refs) ---
        # Per regulation: the set of cited articles and the link types seen.
        reg_map: dict[str, dict[str, Any]] = {}
        for pl in result["parent_links"]:
            reg = pl.get("source_regulation")
            if not reg:
                continue
            if reg not in reg_map:
                reg_map[reg] = {"articles": set(), "link_types": set()}
            if pl.get("source_article"):
                reg_map[reg]["articles"].add(pl["source_article"])
            reg_map[reg]["link_types"].add(pl.get("link_type", "decomposition"))
        for dr in result["document_references"]:
            reg = dr.get("regulation_code")
            if not reg:
                continue
            if reg not in reg_map:
                reg_map[reg] = {"articles": set(), "link_types": set()}
            if dr.get("article"):
                reg_map[reg]["articles"].add(dr["article"])
        # Sets are sorted into lists so the payload is JSON-serializable.
        result["regulations_summary"] = [
            {
                "regulation_code": reg,
                "articles": sorted(info["articles"]),
                "link_types": sorted(info["link_types"]),
            }
            for reg, info in sorted(reg_map.items())
        ]
        return result
# =============================================================================
# NORMATIVE STRENGTH BACKFILL
# =============================================================================
@router.post("/controls/backfill-normative-strength")
async def backfill_normative_strength(
    dry_run: bool = Query(True, description="Nur zaehlen, nicht aendern"),
):
    """
    Correct normative_strength on obligation_candidates based on the
    source_type of the originating regulation.

    Three-tier model (from the original German docstring):
      - law (Gesetz): normative_strength stays unchanged
      - guideline (Leitlinie): capped at 'should'
      - framework: capped at 'can'

    For controls with several parent links the strongest source_type wins.

    NOTE(review): the tier description caps frameworks at 'can', but the
    stats below count ``new_strength == "may"`` — cap_normative_strength
    is defined elsewhere, so confirm which enum value it actually returns.
    """
    from compliance.data.source_type_classification import (
        classify_source_regulation,
        cap_normative_strength,
    )
    with SessionLocal() as db:
        # 1. Load all live obligations together with the parent control's
        #    source name (the regulation name drives the classification).
        obligations = db.execute(text("""
            SELECT oc.id, oc.candidate_id, oc.normative_strength,
                   cc.source_citation->>'source' AS parent_source
            FROM obligation_candidates oc
            JOIN canonical_controls cc ON cc.id = oc.parent_control_uuid
            WHERE oc.release_state NOT IN ('rejected', 'merged', 'duplicate')
              AND oc.normative_strength IS NOT NULL
            ORDER BY oc.candidate_id
        """)).fetchall()
        # 2. Cap normative strength according to the classified source type.
        changes = []
        stats = {"total": len(obligations), "unchanged": 0, "capped_to_should": 0, "capped_to_may": 0, "no_source": 0}
        for obl in obligations:
            if not obl.parent_source:
                stats["no_source"] += 1
                continue
            source_type = classify_source_regulation(obl.parent_source)
            new_strength = cap_normative_strength(obl.normative_strength, source_type)
            if new_strength != obl.normative_strength:
                changes.append({
                    "id": str(obl.id),
                    "candidate_id": obl.candidate_id,
                    "old_strength": obl.normative_strength,
                    "new_strength": new_strength,
                    "source_type": source_type,
                    "source_regulation": obl.parent_source,
                })
                if new_strength == "should":
                    stats["capped_to_should"] += 1
                elif new_strength == "may":
                    stats["capped_to_may"] += 1
            else:
                stats["unchanged"] += 1
        # 3. Apply the changes (skipped entirely in dry-run mode).
        if not dry_run and changes:
            for change in changes:
                db.execute(text("""
                    UPDATE obligation_candidates
                    SET normative_strength = :new_strength
                    WHERE id = CAST(:oid AS uuid)
                """), {"new_strength": change["new_strength"], "oid": change["id"]})
            db.commit()
        return {
            "dry_run": dry_run,
            "stats": stats,
            "total_changes": len(changes),
            "sample_changes": changes[:20],
        }
# =============================================================================
# OBLIGATION DEDUPLICATION
# =============================================================================
@router.post("/obligations/dedup")
async def dedup_obligations(
    dry_run: bool = Query(True, description="Nur zaehlen, nicht aendern"),
    batch_size: int = Query(0, description="0 = alle auf einmal"),
    offset: int = Query(0, description="Offset fuer Batch-Verarbeitung"),
):
    """
    Mark duplicate obligation_candidates as 'duplicate'.

    Duplicates are multiple rows sharing the same candidate_id. Per
    candidate_id the oldest row (MIN(created_at)) is kept; every other row
    gets release_state='duplicate' and merged_into_id pointing at the kept
    row.

    Fix: LIMIT/OFFSET are now passed as bound parameters instead of being
    interpolated into the SQL string via an f-string.
    """
    with SessionLocal() as db:
        # 1. Find all candidate_ids that still have more than one live row
        #    (rows already rejected/merged/deduplicated are ignored).
        dup_query = """
            SELECT candidate_id, count(*) as cnt
            FROM obligation_candidates
            WHERE release_state NOT IN ('rejected', 'merged', 'duplicate')
            GROUP BY candidate_id
            HAVING count(*) > 1
            ORDER BY candidate_id
        """
        dup_params: dict[str, Any] = {}
        if batch_size > 0:
            # Bound parameters keep the statement free of value interpolation.
            dup_query += " LIMIT :lim OFFSET :off"
            dup_params = {"lim": batch_size, "off": offset}
        dup_groups = db.execute(text(dup_query), dup_params).fetchall()
        total_groups = db.execute(text("""
            SELECT count(*) FROM (
                SELECT candidate_id
                FROM obligation_candidates
                WHERE release_state NOT IN ('rejected', 'merged', 'duplicate')
                GROUP BY candidate_id
                HAVING count(*) > 1
            ) sub
        """)).scalar()
        # 2. Per group: keep the oldest row, mark the rest as duplicates.
        kept_count = 0
        duplicate_count = 0
        sample_changes: list[dict[str, Any]] = []
        for grp in dup_groups:
            cid = grp.candidate_id
            # Fetch all live rows for this candidate_id, oldest first
            # (id as tie-breaker for identical timestamps).
            entries = db.execute(text("""
                SELECT id, candidate_id, obligation_text, release_state, created_at
                FROM obligation_candidates
                WHERE candidate_id = :cid
                  AND release_state NOT IN ('rejected', 'merged', 'duplicate')
                ORDER BY created_at ASC, id ASC
            """), {"cid": cid}).fetchall()
            if len(entries) < 2:
                # Group shrank since the first query — nothing left to do.
                continue
            keeper = entries[0]  # oldest row wins
            duplicates = entries[1:]
            kept_count += 1
            duplicate_count += len(duplicates)
            if len(sample_changes) < 20:
                sample_changes.append({
                    "candidate_id": cid,
                    "kept_id": str(keeper.id),
                    "kept_text": keeper.obligation_text[:100],
                    "duplicate_count": len(duplicates),
                    "duplicate_ids": [str(d.id) for d in duplicates],
                })
            if not dry_run:
                for dup in duplicates:
                    # Record the dedup decision in quality_flags for audit.
                    db.execute(text("""
                        UPDATE obligation_candidates
                        SET release_state = 'duplicate',
                            merged_into_id = CAST(:keeper_id AS uuid),
                            quality_flags = COALESCE(quality_flags, '{}'::jsonb)
                                || jsonb_build_object(
                                    'dedup_reason', 'duplicate of ' || :keeper_cid,
                                    'dedup_kept_id', :keeper_id_str,
                                    'dedup_at', NOW()::text
                                )
                        WHERE id = CAST(:dup_id AS uuid)
                    """), {
                        "keeper_id": str(keeper.id),
                        "keeper_cid": cid,
                        "keeper_id_str": str(keeper.id),
                        "dup_id": str(dup.id),
                    })
        if not dry_run and duplicate_count > 0:
            db.commit()
        return {
            "dry_run": dry_run,
            "stats": {
                "total_duplicate_groups": total_groups,
                "processed_groups": len(dup_groups),
                "kept": kept_count,
                "marked_duplicate": duplicate_count,
            },
            "sample_changes": sample_changes,
        }
@router.get("/obligations/dedup-stats")
async def dedup_obligations_stats():
    """Report the current deduplication status of obligation candidates."""
    with SessionLocal() as db:
        total_count = db.execute(text(
            "SELECT count(*) FROM obligation_candidates"
        )).scalar()
        state_rows = db.execute(text("""
            SELECT release_state, count(*) as cnt
            FROM obligation_candidates
            GROUP BY release_state
            ORDER BY release_state
        """)).fetchall()
        # Groups that still contain more than one live row.
        pending_groups = db.execute(text("""
            SELECT count(*) FROM (
                SELECT candidate_id
                FROM obligation_candidates
                WHERE release_state NOT IN ('rejected', 'merged', 'duplicate')
                GROUP BY candidate_id
                HAVING count(*) > 1
            ) sub
        """)).scalar()
        # One row per group is kept, so cnt-1 rows per group are removable.
        removable_count = db.execute(text("""
            SELECT COALESCE(sum(cnt - 1), 0) FROM (
                SELECT candidate_id, count(*) as cnt
                FROM obligation_candidates
                WHERE release_state NOT IN ('rejected', 'merged', 'duplicate')
                GROUP BY candidate_id
                HAVING count(*) > 1
            ) sub
        """)).scalar()
    state_counts = {}
    for state_row in state_rows:
        state_counts[state_row.release_state] = state_row.cnt
    return {
        "total_obligations": total_count,
        "by_state": state_counts,
        "pending_duplicate_groups": pending_groups,
        "pending_removable_duplicates": removable_count,
    }
# =============================================================================
# EVIDENCE TYPE BACKFILL
# =============================================================================
# Domains that are primarily technical (code-verifiable)
_CODE_DOMAINS = frozenset({
"SEC", "AUTH", "CRYPT", "CRYP", "CRY", "NET", "LOG", "ACC", "APP", "SYS",
"CI", "CONT", "API", "CLOUD", "IAC", "SAST", "DAST", "DEP", "SBOM",
"WEB", "DEV", "SDL", "PKI", "HSM", "TEE", "TPM", "CRX", "CRF",
"FWU", "STO", "RUN", "VUL", "MAL", "PLT", "AUT",
})
# Domains that are primarily process-based (document-verifiable)
_PROCESS_DOMAINS = frozenset({
"GOV", "ORG", "COMP", "LEGAL", "HR", "TRAIN", "AML", "FIN",
"RISK", "AUDIT", "AUD", "PROC", "DOC", "PHYS", "PHY", "PRIV", "DPO",
"BCDR", "BCP", "VENDOR", "SUPPLY", "SUP", "CERT", "POLICY",
"ENV", "HLT", "TRD", "LAB", "PER", "REL", "ISM", "COM",
"GAM", "RIS", "PCA", "GNT", "HCA", "RES", "ISS",
})
# Domains that are typically hybrid
_HYBRID_DOMAINS = frozenset({
"DATA", "AI", "INC", "ID", "IAM", "IDF", "IDP", "IDA", "IDN",
"OPS", "MNT", "INT", "BCK",
})
def _classify_evidence_type(control_id: str, category: str | None) -> str:
"""Heuristic: classify a control as code/process/hybrid based on domain prefix."""
domain = control_id.split("-")[0].upper() if control_id else ""
if domain in _CODE_DOMAINS:
return "code"
if domain in _PROCESS_DOMAINS:
return "process"
if domain in _HYBRID_DOMAINS:
return "hybrid"
# Fallback: use category if available
code_categories = {"encryption", "authentication", "network", "application", "system", "identity"}
process_categories = {"compliance", "personnel", "physical", "governance", "risk"}
if category in code_categories:
return "code"
if category in process_categories:
return "process"
return "process" # Conservative default
@router.post("/controls/backfill-evidence-type")
async def backfill_evidence_type(
    dry_run: bool = Query(True, description="Nur zaehlen, nicht aendern"),
):
    """
    Classify controls as code/process/hybrid based on their domain prefix.

    Heuristic:
      - SEC, AUTH, CRYPT, NET, LOG, ... -> code
      - GOV, ORG, COMP, LEGAL, HR, ...  -> process
      - DATA, AI, INC                   -> hybrid

    Only rows whose evidence_type is still NULL are touched; with
    dry_run=true the proposed changes are only counted, not written.
    """
    with SessionLocal() as db:
        candidates = db.execute(text("""
            SELECT id, control_id, category, evidence_type
            FROM canonical_controls
            WHERE release_state NOT IN ('rejected', 'merged')
            ORDER BY control_id
        """)).fetchall()
        stats = {"total": len(candidates), "already_set": 0, "code": 0, "process": 0, "hybrid": 0}
        pending = []
        for ctrl in candidates:
            # Never overwrite an evidence_type that was already assigned.
            if ctrl.evidence_type is not None:
                stats["already_set"] += 1
                continue
            classified = _classify_evidence_type(ctrl.control_id, ctrl.category)
            stats[classified] += 1
            pending.append({
                "id": str(ctrl.id),
                "control_id": ctrl.control_id,
                "evidence_type": classified,
            })
        if pending and not dry_run:
            # Prepare the statement once and reuse it for every row.
            update_stmt = text("""
                    UPDATE canonical_controls
                    SET evidence_type = :et
                    WHERE id = CAST(:cid AS uuid)
                """)
            for item in pending:
                db.execute(update_stmt, {"et": item["evidence_type"], "cid": item["id"]})
            db.commit()
        return {
            "dry_run": dry_run,
            "stats": stats,
            "total_changes": len(pending),
            "sample_changes": pending[:20],
        }
# =============================================================================
# RATIONALE BACKFILL (LLM)
# =============================================================================
@router.post("/controls/backfill-rationale")
async def backfill_rationale(
    dry_run: bool = Query(True, description="Nur zaehlen, nicht aendern"),
    batch_size: int = Query(50, description="Parent-Controls pro Durchlauf"),
    offset: int = Query(0, description="Offset fuer Paginierung (Parent-Index)"),
):
    """
    Generate meaningful rationales for atomic controls via LLM.

    Optimization: work is grouped by parent control (~7k parents instead of
    ~86k per-control calls). One LLM call per parent group yields a shared
    rationale for all of its children.

    Workflow:
      1. dry_run=true                          -> show statistics only
      2. dry_run=false&batch_size=50&offset=0  -> process the first 50 parents
      3. repeat with offset=50, 100, ...       until done

    Fix: each parent's UPDATE is committed immediately after it succeeds.
    Previously all updates stayed pending until one final commit, so the
    db.rollback() issued after an LLM/DB error silently discarded every
    update already made in the batch while the counters still reported them
    as applied.
    """
    from compliance.services.llm_provider import get_llm_provider
    with SessionLocal() as db:
        # 1. Load parent controls whose children still carry the placeholder
        #    rationale 'Aus Obligation abgeleitet.'.
        parents = db.execute(text("""
            SELECT p.id AS parent_uuid, p.control_id, p.title, p.category,
                   p.source_citation->>'source' AS source_name,
                   COUNT(c.id) AS child_count
            FROM canonical_controls p
            JOIN canonical_controls c ON c.parent_control_uuid = p.id
            WHERE c.rationale = 'Aus Obligation abgeleitet.'
              AND c.release_state NOT IN ('rejected', 'merged')
            GROUP BY p.id, p.control_id, p.title, p.category,
                     p.source_citation->>'source'
            ORDER BY p.control_id
        """)).fetchall()
        total_parents = len(parents)
        total_children = sum(p.child_count for p in parents)
        if dry_run:
            return {
                "dry_run": True,
                "total_parents": total_parents,
                "total_children": total_children,
                "estimated_llm_calls": total_parents,
                "sample_parents": [
                    {
                        "control_id": p.control_id,
                        "title": p.title,
                        "source": p.source_name,
                        "child_count": p.child_count,
                    }
                    for p in parents[:10]
                ],
            }
        # 2. Select the requested batch window.
        batch = parents[offset : offset + batch_size]
        if not batch:
            return {
                "dry_run": False,
                "message": "Kein weiterer Batch — alle Parents verarbeitet.",
                "total_parents": total_parents,
                "offset": offset,
                "processed": 0,
            }
        provider = get_llm_provider()
        processed = 0
        children_updated = 0
        errors = []
        sample_rationales = []
        for parent in batch:
            parent_uuid = str(parent.parent_uuid)
            source = parent.source_name or "Regulierung"
            # German prompt — the generated rationale is stored verbatim.
            prompt = (
                f"Du bist Compliance-Experte. Erklaere in 1-2 Saetzen auf Deutsch, "
                f"WARUM aus dem uebergeordneten Control atomare Teilmassnahmen "
                f"abgeleitet wurden.\n\n"
                f"Uebergeordnetes Control: {parent.control_id}{parent.title}\n"
                f"Regulierung: {source}\n"
                f"Kategorie: {parent.category or 'k.A.'}\n"
                f"Anzahl atomarer Controls: {parent.child_count}\n\n"
                f"Schreibe NUR die Begruendung (1-2 Saetze). Kein Markdown, "
                f"keine Aufzaehlung, kein Praefix. "
                f"Erklaere den regulatorischen Hintergrund und warum die "
                f"Zerlegung in atomare, testbare Massnahmen notwendig ist."
            )
            try:
                response = await provider.complete(
                    prompt=prompt,
                    max_tokens=256,
                    temperature=0.3,
                )
                rationale = response.content.strip()
                # Clean up: strip surrounding quotes and a leading label.
                rationale = rationale.strip('"').strip("'").strip()
                if rationale.startswith("Begründung:") or rationale.startswith("Begruendung:"):
                    rationale = rationale.split(":", 1)[1].strip()
                # Cap the stored rationale at 500 characters.
                if len(rationale) > 500:
                    rationale = rationale[:497] + "..."
                if not rationale or len(rationale) < 10:
                    errors.append({
                        "control_id": parent.control_id,
                        "error": "LLM-Antwort zu kurz oder leer",
                    })
                    continue
                # Update all children of this parent that still carry the
                # placeholder, then commit right away so a later failure in
                # this batch cannot roll this work back.
                result = db.execute(
                    text("""
                        UPDATE canonical_controls
                        SET rationale = :rationale
                        WHERE parent_control_uuid = CAST(:pid AS uuid)
                          AND rationale = 'Aus Obligation abgeleitet.'
                          AND release_state NOT IN ('rejected', 'merged')
                    """),
                    {"rationale": rationale, "pid": parent_uuid},
                )
                db.commit()
                children_updated += result.rowcount
                processed += 1
                if len(sample_rationales) < 5:
                    sample_rationales.append({
                        "parent": parent.control_id,
                        "title": parent.title,
                        "rationale": rationale,
                        "children_updated": result.rowcount,
                    })
            except Exception as e:
                logger.error(f"LLM error for {parent.control_id}: {e}")
                errors.append({
                    "control_id": parent.control_id,
                    "error": str(e)[:200],
                })
                # Roll back only the failed statement (earlier parents are
                # already committed) and keep the session usable.
                try:
                    db.rollback()
                except Exception:
                    pass
        return {
            "dry_run": False,
            "offset": offset,
            "batch_size": batch_size,
            "next_offset": offset + batch_size if offset + batch_size < total_parents else None,
            "processed_parents": processed,
            "children_updated": children_updated,
            "total_parents": total_parents,
            "total_children": total_children,
            "errors": errors[:10],
            "sample_rationales": sample_rationales,
        }
# =============================================================================
# CONTROL CRUD (CREATE / UPDATE / DELETE)
# =============================================================================
@router.post("/controls", status_code=201)
async def create_control(
    body: ControlCreateRequest,
    service: CanonicalControlService = Depends(get_canonical_service),
) -> dict[str, Any]:
    """Create a new canonical control.

    Validates the ID format, severity and risk score, resolves the target
    framework, rejects duplicates within that framework, then inserts the
    row (JSON fields are serialized and cast to jsonb) and returns the
    serialized control.

    Raises (via HTTPException): 400 on validation failure, 404 for an
    unknown framework, 409 when the control_id already exists.
    """
    import json as _json
    import re
    # Validate control_id format: 2-6 capital letters, dash, 3 digits.
    if not re.match(r"^[A-Z]{2,6}-[0-9]{3}$", body.control_id):
        raise HTTPException(status_code=400, detail="control_id must match DOMAIN-NNN (e.g. AUTH-001)")
    if body.severity not in ("low", "medium", "high", "critical"):
        raise HTTPException(status_code=400, detail="severity must be low/medium/high/critical")
    if body.risk_score is not None and not (0 <= body.risk_score <= 10):
        raise HTTPException(status_code=400, detail="risk_score must be 0..10")
    with SessionLocal() as db:
        # Resolve the human-readable framework_id to its database row.
        fw = db.execute(
            text("SELECT id FROM canonical_control_frameworks WHERE framework_id = :fid"),
            {"fid": body.framework_id},
        ).fetchone()
        if not fw:
            raise HTTPException(status_code=404, detail=f"Framework '{body.framework_id}' not found")
        # Reject duplicates within the same framework.
        existing = db.execute(
            text("SELECT id FROM canonical_controls WHERE framework_id = :fid AND control_id = :cid"),
            {"fid": str(fw.id), "cid": body.control_id},
        ).fetchone()
        if existing:
            raise HTTPException(status_code=409, detail=f"Control '{body.control_id}' already exists")
        # Insert. Only _CONTROL_COLS (a module-level column list) is
        # interpolated into the SQL text — all values are bound parameters.
        row = db.execute(
            text(f"""
                INSERT INTO canonical_controls (
                    framework_id, control_id, title, objective, rationale,
                    scope, requirements, test_procedure, evidence,
                    severity, risk_score, implementation_effort, evidence_confidence,
                    open_anchors, release_state, tags,
                    license_rule, source_original_text, source_citation,
                    customer_visible, verification_method, category, evidence_type,
                    target_audience, generation_metadata,
                    applicable_industries, applicable_company_size, scope_conditions
                ) VALUES (
                    :fw_id, :cid, :title, :objective, :rationale,
                    CAST(:scope AS jsonb), CAST(:requirements AS jsonb),
                    CAST(:test_procedure AS jsonb), CAST(:evidence AS jsonb),
                    :severity, :risk_score, :effort, :confidence,
                    CAST(:anchors AS jsonb), :release_state, CAST(:tags AS jsonb),
                    :license_rule, :source_original_text,
                    CAST(:source_citation AS jsonb),
                    :customer_visible, :verification_method, :category, :evidence_type,
                    :target_audience, CAST(:generation_metadata AS jsonb),
                    CAST(:applicable_industries AS jsonb),
                    CAST(:applicable_company_size AS jsonb),
                    CAST(:scope_conditions AS jsonb)
                )
                RETURNING {_CONTROL_COLS}
            """),
            {
                "fw_id": str(fw.id),
                "cid": body.control_id,
                "title": body.title,
                "objective": body.objective,
                "rationale": body.rationale,
                # Mandatory JSON columns: always serialized.
                "scope": _json.dumps(body.scope),
                "requirements": _json.dumps(body.requirements),
                "test_procedure": _json.dumps(body.test_procedure),
                "evidence": _json.dumps(body.evidence),
                "severity": body.severity,
                "risk_score": body.risk_score,
                "effort": body.implementation_effort,
                "confidence": body.evidence_confidence,
                "anchors": _json.dumps(body.open_anchors),
                "release_state": body.release_state,
                "tags": _json.dumps(body.tags),
                "license_rule": body.license_rule,
                "source_original_text": body.source_original_text,
                # Optional JSON columns: stored as NULL when absent.
                "source_citation": _json.dumps(body.source_citation) if body.source_citation else None,
                "customer_visible": body.customer_visible,
                "verification_method": body.verification_method,
                "category": body.category,
                "evidence_type": body.evidence_type,
                "target_audience": body.target_audience,
                "generation_metadata": _json.dumps(body.generation_metadata) if body.generation_metadata else None,
                "applicable_industries": _json.dumps(body.applicable_industries) if body.applicable_industries else None,
                "applicable_company_size": _json.dumps(body.applicable_company_size) if body.applicable_company_size else None,
                "scope_conditions": _json.dumps(body.scope_conditions) if body.scope_conditions else None,
            },
        ).fetchone()
        db.commit()
        return _control_row(row)
@router.put("/controls/{control_id}")
async def update_control(
    control_id: str,
    body: ControlUpdateRequest,
    service: CanonicalControlService = Depends(get_canonical_service),
) -> dict[str, Any]:
    """Update an existing canonical control (partial update).

    Only fields present (and non-None) in the request body are written;
    because of exclude_none a field cannot be reset to NULL through this
    endpoint. JSON-typed fields are serialized and cast to jsonb, and
    updated_at is always bumped.
    """
    import json as _json
    # exclude_none: only explicitly provided fields take part in the update.
    updates = body.dict(exclude_none=True)
    if not updates:
        raise HTTPException(status_code=400, detail="No fields to update")
    if "severity" in updates and updates["severity"] not in ("low", "medium", "high", "critical"):
        raise HTTPException(status_code=400, detail="severity must be low/medium/high/critical")
    if "risk_score" in updates and updates["risk_score"] is not None and not (0 <= updates["risk_score"] <= 10):
        raise HTTPException(status_code=400, detail="risk_score must be 0..10")
    # Build the dynamic SET clause. Column names come from the Pydantic
    # model's field names (not raw client strings), so interpolating them
    # into the SQL text is safe as long as the schema stays that way.
    set_parts = []
    params: dict[str, Any] = {"cid": control_id.upper()}
    json_fields = {"scope", "requirements", "test_procedure", "evidence", "open_anchors", "tags",
                   "source_citation", "generation_metadata"}
    for key, val in updates.items():
        col = key
        if key in json_fields:
            set_parts.append(f"{col} = CAST(:{key} AS jsonb)")
            params[key] = _json.dumps(val)
        else:
            set_parts.append(f"{col} = :{key}")
            params[key] = val
    set_parts.append("updated_at = NOW()")
    with SessionLocal() as db:
        row = db.execute(
            text(f"""
                UPDATE canonical_controls
                SET {', '.join(set_parts)}
                WHERE control_id = :cid
                RETURNING {_CONTROL_COLS}
            """),
            params,
        ).fetchone()
        if not row:
            raise HTTPException(status_code=404, detail="Control not found")
        db.commit()
        return _control_row(row)
@router.delete("/controls/{control_id}", status_code=204)
async def delete_control(
    control_id: str,
    service: CanonicalControlService = Depends(get_canonical_service),
) -> None:
    """Delete a canonical control.

    Delegates to the service layer; domain errors (e.g. unknown control)
    are translated to HTTP errors by translate_domain_errors.
    """
    with translate_domain_errors():
        service.delete_control(control_id)
# =============================================================================
# SIMILARITY CHECK
# =============================================================================
@router.post("/controls/{control_id}/similarity-check")
async def similarity_check(
    control_id: str,
    body: SimilarityCheckRequest,
    service: CanonicalControlService = Depends(get_canonical_service),
) -> dict[str, Any]:
    """Run the too-close detector against a source/candidate text pair.

    Delegates to the (async) service layer; domain errors are translated
    to HTTP errors by translate_domain_errors.
    """
    with translate_domain_errors():
        return await service.similarity_check(control_id, body)
# =============================================================================
# CATEGORIES
# =============================================================================
@router.get("/categories")
async def list_categories():
    """Return all canonical control categories, ordered by sort_order."""
    query = text("SELECT category_id, label_de, label_en, sort_order FROM canonical_control_categories ORDER BY sort_order")
    with SessionLocal() as db:
        category_rows = db.execute(query).fetchall()
    payload = []
    for cat in category_rows:
        payload.append({
            "category_id": cat.category_id,
            "label_de": cat.label_de,
            "label_en": cat.label_en,
            "sort_order": cat.sort_order,
        })
    return payload
# =============================================================================
# SIMILAR CONTROLS (Embedding-based dedup)
# =============================================================================
@router.get("/controls/{control_id}/similar")
async def find_similar_controls(
    control_id: str,
    threshold: float = Query(0.85, ge=0.5, le=1.0),
    limit: int = Query(20, ge=1, le=100),
):
    """Find controls similar to the given one using embedding cosine similarity.

    Uses the pgvector cosine-distance operator (<=>); similarity is
    computed as 1 - distance. Deprecated controls and rows without an
    embedding are excluded. If the similarity query fails (e.g. no
    embedding column in this deployment), a warning is logged and an
    empty list is returned.
    """
    with SessionLocal() as db:
        # Ensure the target control exists (404 otherwise).
        target = db.execute(
            text("""
                SELECT id, control_id, title, objective
                FROM canonical_controls
                WHERE control_id = :cid
            """),
            {"cid": control_id.upper()},
        ).fetchone()
        if not target:
            raise HTTPException(status_code=404, detail="Control not found")
        # Similarity via pgvector cosine distance; on any failure log a
        # warning and return [] (there is no text-based fallback).
        try:
            rows = db.execute(
                text("""
                    SELECT c.control_id, c.title, c.severity, c.release_state,
                           c.tags, c.license_rule, c.verification_method, c.category,
                           1 - (c.embedding <=> t.embedding) AS similarity
                    FROM canonical_controls c, canonical_controls t
                    WHERE t.control_id = :cid
                      AND c.control_id != :cid
                      AND c.release_state != 'deprecated'
                      AND c.embedding IS NOT NULL
                      AND t.embedding IS NOT NULL
                      AND 1 - (c.embedding <=> t.embedding) >= :threshold
                    ORDER BY similarity DESC
                    LIMIT :lim
                """),
                {"cid": control_id.upper(), "threshold": threshold, "lim": limit},
            ).fetchall()
            return [
                {
                    "control_id": r.control_id,
                    "title": r.title,
                    "severity": r.severity,
                    "release_state": r.release_state,
                    "tags": r.tags or [],
                    "license_rule": r.license_rule,
                    "verification_method": r.verification_method,
                    "category": r.category,
                    "similarity": round(float(r.similarity), 4),
                }
                for r in rows
            ]
        except Exception as e:
            logger.warning("Embedding similarity query failed (no embedding column?): %s", e)
            return []
# =============================================================================
# SOURCES & LICENSES
# =============================================================================
@router.get("/sources")
async def list_sources(
    service: CanonicalControlService = Depends(get_canonical_service),
) -> Any:
    """List all registered sources with permission flags.

    Delegates to the service layer; domain errors are translated to HTTP
    errors by translate_domain_errors.
    """
    with translate_domain_errors():
        return service.list_sources()
@router.get("/licenses")
async def list_licenses(
    service: CanonicalControlService = Depends(get_canonical_service),
) -> Any:
    """Return the license matrix.

    Delegates to the service layer; domain errors are translated to HTTP
    errors by translate_domain_errors.
    """
    with translate_domain_errors():
        return service.list_licenses()
# =============================================================================
# V1 ENRICHMENT (Eigenentwicklung → Regulatorische Abdeckung)
# =============================================================================
@router.post("/controls/enrich-v1-matches")
async def enrich_v1_matches_endpoint(
    dry_run: bool = Query(True, description="Nur zaehlen, nicht schreiben"),
    batch_size: int = Query(100, description="Controls pro Durchlauf"),
    offset: int = Query(0, description="Offset fuer Paginierung"),
):
    """
    Find regulatory coverage for v1 "Eigenentwicklung" (in-house) controls.

    Eigenentwicklung controls are identified by:
    generation_strategy='ungrouped', pipeline_version=1,
    source_citation IS NULL, parent_control_uuid IS NULL.

    Workflow:
      1. dry_run=true                           -> show statistics only
      2. dry_run=false&batch_size=100&offset=0  -> process the first 100
      3. repeat with next_offset until done

    Delegates entirely to compliance.services.v1_enrichment.
    """
    from compliance.services.v1_enrichment import enrich_v1_matches
    return await enrich_v1_matches(
        dry_run=dry_run,
        batch_size=batch_size,
        offset=offset,
    )
@router.get("/controls/{control_id}/v1-matches")
async def get_v1_matches_endpoint(control_id: str):
    """
    Return regulatory matches for a v1 control.

    Returns:
        List of matches with control details, source and score.

    Raises (via HTTPException): 404 when the control_id is unknown.
    """
    from compliance.services.v1_enrichment import get_v1_matches
    # Resolve the human-readable control_id to the row UUID.
    with SessionLocal() as db:
        row = db.execute(text("""
            SELECT id FROM canonical_controls WHERE control_id = :cid
        """), {"cid": control_id}).fetchone()
        if not row and control_id != control_id.upper():
            # Consistency fix: the other lookup endpoints normalize the ID to
            # upper case (canonical IDs match ^[A-Z]{2,6}-[0-9]{3}$), so retry
            # with the normalized form before giving up. Backward-compatible:
            # exact-case lookups behave exactly as before.
            row = db.execute(text("""
                SELECT id FROM canonical_controls WHERE control_id = :cid
            """), {"cid": control_id.upper()}).fetchone()
        if not row:
            raise HTTPException(status_code=404, detail=f"Control {control_id} not found")
    return await get_v1_matches(str(row.id))
# =============================================================================
# INTERNAL HELPERS
# =============================================================================
def _control_row(r) -> dict:
return {
"id": str(r.id),
"framework_id": str(r.framework_id),
"control_id": r.control_id,
"title": r.title,
"objective": r.objective,
"rationale": r.rationale,
"scope": r.scope,
"requirements": r.requirements,
"test_procedure": r.test_procedure,
"evidence": r.evidence,
"severity": r.severity,
"risk_score": float(r.risk_score) if r.risk_score is not None else None,
"implementation_effort": r.implementation_effort,
"evidence_confidence": float(r.evidence_confidence) if r.evidence_confidence is not None else None,
"open_anchors": r.open_anchors,
"release_state": r.release_state,
"tags": r.tags or [],
"license_rule": r.license_rule,
"source_original_text": r.source_original_text,
"source_citation": r.source_citation,
"customer_visible": r.customer_visible,
"verification_method": r.verification_method,
"category": r.category,
"evidence_type": getattr(r, "evidence_type", None),
"target_audience": r.target_audience,
"generation_metadata": r.generation_metadata,
"generation_strategy": getattr(r, "generation_strategy", "ungrouped"),
"applicable_industries": getattr(r, "applicable_industries", None),
"applicable_company_size": getattr(r, "applicable_company_size", None),
"scope_conditions": getattr(r, "scope_conditions", None),
"parent_control_uuid": str(r.parent_control_uuid) if getattr(r, "parent_control_uuid", None) else None,
"parent_control_id": getattr(r, "parent_control_id", None),
"parent_control_title": getattr(r, "parent_control_title", None),
"decomposition_method": getattr(r, "decomposition_method", None),
"pipeline_version": getattr(r, "pipeline_version", None),
"created_at": r.created_at.isoformat() if r.created_at else None,
"updated_at": r.updated_at.isoformat() if r.updated_at else None,
}