Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 31s
CI/CD / test-python-backend-compliance (push) Successful in 1m35s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 17s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
- Control Library: parent control display, ObligationTypeBadge, GenerationStrategyBadge variants, evidence string fallback - API: expose parent_control_uuid/id/title in canonical controls - Fix: DSFA SQLAlchemy 2.0 Row._mapping compatibility - Migration 074: control_parent_links + control_dedup_reviews tables - QA scripts: benchmark, gap analysis, OSCAL import, OWASP cleanup, phase5 normalize, phase74 gap fill, sync_db, run_job - Docs: dedup engine, RAG benchmark, lessons learned, pipeline docs Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
290 lines
9.3 KiB
Python
290 lines
9.3 KiB
Python
"""Import 776 missing NIST SP 800-53 Rev 5 controls from OSCAL into canonical_controls."""
|
|
import os
|
|
import re
|
|
import json
|
|
import uuid
|
|
import psycopg2
|
|
import urllib.parse
|
|
|
|
# Directory (below the user's home) that holds the downloaded OSCAL files.
OSCAL_DIR = os.path.expanduser("~/rag-ingestion/nist-oscal")

# JSON text is UTF-8 by specification; decode it explicitly instead of relying
# on the platform's default encoding, which is not UTF-8 on every system.
with open(
    os.path.join(OSCAL_DIR, "sp800-53-rev5-catalog.json"), encoding="utf-8"
) as f:
    # Only the "catalog" object of the document is used by the rest of the script.
    sp853 = json.load(f)["catalog"]
|
|
|
|
# ── Extract all OSCAL controls ──
|
|
def extract_controls(catalog):
    """Collect every non-withdrawn control and enhancement of an OSCAL catalog.

    Walks ``catalog["groups"]`` (one group per control family).  Each control
    of a family and each entry of that control's own ``controls`` list (its
    enhancements, one level deep) is converted with ``extract_single``.
    Entries for which ``extract_single`` returns a falsy value are left out;
    enhancements are still visited when their parent control was left out.

    Args:
        catalog: The ``"catalog"`` object of a parsed OSCAL catalog document.

    Returns:
        A list of the dicts returned by ``extract_single``, in catalog order,
        each control immediately followed by its enhancements.
    """
    controls = []
    for fam in catalog.get("groups", []):
        # Only the family title is needed downstream; the group id is not read,
        # so a missing or null id must not abort the extraction.
        fam_title = fam.get("title", "")
        for ctrl in fam.get("controls", []):
            # The control itself first, then its direct enhancements.
            for candidate in (ctrl, *ctrl.get("controls", [])):
                result = extract_single(candidate, fam_title)
                if result:
                    controls.append(result)
    return controls
|
|
|
|
def _last_label(props):
    """Return the value of the last ``label`` prop in *props* ('' if none)."""
    label = ""
    for prop in props:
        if prop["name"] == "label":
            label = prop.get("value", "")
    return label


def _append_statement_items(parts, depth, lines):
    """Append one text line per nested statement item to *lines*, depth-first.

    Items directly below the statement (depth 0) are not indented; every
    further nesting level adds one leading space.
    """
    indent = " " * depth
    for item in parts:
        prose = item.get("prose", "")
        label = _last_label(item.get("props", []))
        if label:
            lines.append(f"{indent}{label} {prose}")
        elif prose:
            lines.append(f"{indent}{prose}")
        # Recurse so that items nested deeper than two levels are kept too.
        _append_statement_items(item.get("parts", []), depth + 1, lines)


def extract_single(ctrl, family_title):
    """Convert one OSCAL control (or enhancement) object into a flat dict.

    Args:
        ctrl: A control object from an OSCAL catalog (a dict).
        family_title: Title of the family the control belongs to; copied
            unchanged into the result.

    Returns:
        ``None`` when the control's ``status`` prop is ``"withdrawn"``.
        Otherwise a dict with the keys:

        * ``label`` - value of the control's ``label`` prop (the last one if
          several exist), or the upper-cased ``id`` when there is no such prop.
        * ``title`` - the control title ('' if missing).
        * ``family`` - *family_title*.
        * ``statement`` - prose of the ``statement`` part followed by one line
          per nested item at any depth ("<label> <prose>"), stripped.
        * ``guidance`` - prose of the ``guidance`` part, stripped.
        * ``related`` - hrefs of links with ``rel == "related"``, with leading
          ``#`` characters removed.
        * ``params`` - list of ``{"id", "label", "guidelines", "choices"}``.
        * ``is_enhancement`` - ``True`` when the label contains ``"("``.

    Raises:
        KeyError: if the control has neither a ``label`` prop nor an ``id``.
    """
    props = {p["name"]: p.get("value", "") for p in ctrl.get("props", [])}
    if props.get("status") == "withdrawn":
        return None

    # Look up the id only when it is needed as a fallback, so a control that
    # carries a label but no id does not fail.
    if "label" in props:
        label = props["label"]
    else:
        label = ctrl["id"].upper()
    title = ctrl.get("title", "")

    # Statement (main requirement text); if several statement parts exist the
    # last one wins.
    statement = ""
    for part in ctrl.get("parts", []):
        if part.get("name") == "statement":
            lines = [part.get("prose", "")]
            _append_statement_items(part.get("parts", []), 0, lines)
            statement = "\n".join(lines)

    # Guidance; again the last matching part wins.
    guidance = ""
    for part in ctrl.get("parts", []):
        if part.get("name") == "guidance":
            guidance = part.get("prose", "")

    # Cross-references to other controls.
    related = [
        link["href"].lstrip("#")
        for link in ctrl.get("links", [])
        if link.get("rel") == "related"
    ]

    # Parameters, with guideline prose concatenated and selection choices copied.
    params = []
    for param in ctrl.get("params", []):
        choices = list(param["select"].get("choice", [])) if "select" in param else []
        params.append({
            "id": param.get("id", ""),
            "label": param.get("label", ""),
            "guidelines": "".join(
                g.get("prose", "") for g in param.get("guidelines", [])
            ),
            "choices": choices,
        })

    return {
        "label": label,
        "title": title,
        "family": family_title,
        "statement": statement.strip(),
        "guidance": guidance.strip(),
        "related": related,
        "params": params,
        "is_enhancement": "(" in label,
    }
|
|
|
|
# Flatten the loaded catalog into one list of control dicts and report its size.
all_oscal = extract_controls(sp853)
print(f"Total OSCAL active controls: {len(all_oscal)}")
|
|
|
|
# ── Normalize label for comparison ──
|
|
def normalize_label(label):
    """Return *label* upper-cased with zero padding removed from its numbers.

    Two rewrites are applied in order: zeros directly after a hyphen that are
    followed by another digit are dropped ("AC-01" -> "AC-1"), and leading
    zeros of a parenthesised number are dropped ("(01)" -> "(1)").
    """
    rewrites = (
        (r'-0+(\d)', r'-\1'),
        (r'\(0+(\d+)\)', r'(\1)'),
    )
    normalized = label
    for pattern, replacement in rewrites:
        normalized = re.sub(pattern, replacement, normalized)
    return normalized.upper()
|
|
|
|
# ── DB connection ──
|
|
db_url = os.environ['DATABASE_URL']
parsed = urllib.parse.urlparse(db_url)

# urlparse() returns URL components still percent-encoded, so credentials that
# contain escaped characters (e.g. "%40" for "@") must be decoded before they
# are handed to the driver as separate keyword arguments.
conn = psycopg2.connect(
    host=parsed.hostname,
    port=parsed.port or 5432,
    user=urllib.parse.unquote(parsed.username) if parsed.username else parsed.username,
    password=urllib.parse.unquote(parsed.password) if parsed.password else parsed.password,
    dbname=urllib.parse.unquote(parsed.path.lstrip('/')),
    options="-c search_path=compliance,public",
)
cur = conn.cursor()

# Get existing labels: every distinct 'article' already stored for this source,
# normalized so that zero-padded and unpadded spellings compare equal.
cur.execute("""
    SELECT DISTINCT source_citation->>'article' as article
    FROM compliance.canonical_controls
    WHERE source_citation->>'source' = 'NIST SP 800-53 Rev. 5'
    AND source_citation->>'article' IS NOT NULL
""")
existing_labels = {normalize_label(r[0]) for r in cur.fetchall()}
print(f"Existing DB labels (normalized): {len(existing_labels)}")

# Get all control_ids of the form PREFIX-NUMBER; they are used to find the
# highest number already taken per prefix.
cur.execute("""
    SELECT control_id FROM compliance.canonical_controls
    WHERE control_id ~ '^[A-Z]+-[0-9]+$'
    ORDER BY control_id
""")
existing_ids = {r[0] for r in cur.fetchall()}
|
|
|
|
# Find next available ID per prefix
|
|
def next_control_id(prefix, existing):
    """Return the highest number already used for *prefix* in *existing*.

    Despite its name this returns the current maximum, not the next free
    number; the caller adds 1 before formatting a new control_id.

    Args:
        prefix: Control-id prefix such as ``"SEC"``.  It is matched literally.
        existing: Iterable of control_id strings such as ``"SEC-1234"``.

    Returns:
        The largest ``N`` among ids of the exact form ``<prefix>-<N>``, or 0
        when no id matches.
    """
    # Escape the prefix so regex metacharacters in it cannot match other ids.
    pattern = re.compile(rf'{re.escape(prefix)}-(\d+)')
    matches = (pattern.fullmatch(eid) for eid in existing)
    return max((int(m.group(1)) for m in matches if m), default=0)
|
|
|
|
# Map NIST families to our control_id prefixes
|
|
# NIST family title -> control_id prefix used for newly created controls.
# Several families share one prefix.
FAMILY_PREFIX = dict((
    ("Access Control", "ACC"),
    ("Awareness and Training", "GOV"),
    ("Audit and Accountability", "LOG"),
    ("Assessment, Authorization, and Monitoring", "GOV"),
    ("Configuration Management", "COMP"),
    ("Contingency Planning", "INC"),
    ("Identification and Authentication", "AUTH"),
    ("Incident Response", "INC"),
    ("Maintenance", "COMP"),
    ("Media Protection", "DATA"),
    ("Physical and Environmental Protection", "SEC"),
    ("Planning", "GOV"),
    ("Program Management", "GOV"),
    ("Personnel Security", "GOV"),
    ("Personally Identifiable Information Processing and Transparency", "DATA"),
    ("Risk Assessment", "GOV"),
    ("System and Services Acquisition", "COMP"),
    ("System and Communications Protection", "NET"),
    ("System and Information Integrity", "SEC"),
    ("Supply Chain Risk Management", "COMP"),
))
|
|
|
|
# Track next IDs
|
|
# Highest number already taken per prefix; the import loop increments the
# counter before it formats a new control_id.
prefix_counters = {
    prefix: next_control_id(prefix, existing_ids)
    for prefix in set(FAMILY_PREFIX.values())
}
print(f"Starting counters: {prefix_counters}")

# ── Filter to only new controls ──
# Keep the OSCAL controls whose normalized label is not in the database yet.
to_import = [
    ctrl
    for ctrl in all_oscal
    if normalize_label(ctrl["label"]) not in existing_labels
]

print(f"\nControls to import: {len(to_import)}")
|
|
|
|
# ── Import ──
|
|
# Value written to framework_id for every imported row (loop-invariant).
FRAMEWORK_ID = '14b1bdd2-abc7-4a43-adae-14471ee5c7cf'

# Statement text is the same for every row, so build it once.
INSERT_CONTROL_SQL = """
    INSERT INTO compliance.canonical_controls
    (id, framework_id, control_id, title, objective, rationale,
    severity, source_original_text,
    source_citation, pipeline_version, release_state,
    generation_strategy, category)
    VALUES (%s, %s, %s, %s, %s, '', 'medium', %s, %s, 4, 'draft', 'oscal_import', %s)
"""

imported = 0
try:
    for ctrl in to_import:
        # Allocate the next free number for this family's prefix; families
        # without a mapping fall back to "COMP".
        prefix = FAMILY_PREFIX.get(ctrl["family"], "COMP")
        prefix_counters[prefix] += 1
        control_id = f"{prefix}-{prefix_counters[prefix]:04d}"

        # Build title: "NIST {label}: {title}"
        title = f"NIST {ctrl['label']}: {ctrl['title']}"

        # source_original_text = statement (the official requirement text);
        # fall back to the first 500 characters of the guidance, then the title.
        source_text = ctrl["statement"] or ctrl["guidance"][:500] or ctrl["title"]

        # objective = guidance text, capped at 2000 characters.
        objective = ctrl["guidance"][:2000]

        # source_citation (stored as JSON); optional lists are capped in size.
        citation = {
            "source": "NIST SP 800-53 Rev. 5",
            "article": ctrl["label"],
            "article_type": "control",
            "source_type": "standard",
            "oscal_import": True,
        }
        if ctrl["related"]:
            citation["related_controls"] = ctrl["related"][:20]
        if ctrl["params"]:
            citation["parameters"] = [
                {"id": p["id"], "label": p["label"]} for p in ctrl["params"][:10]
            ]

        new_id = str(uuid.uuid4())
        cur.execute(INSERT_CONTROL_SQL, (
            new_id,
            FRAMEWORK_ID,
            control_id,
            title[:500],
            objective,
            source_text[:10000],
            json.dumps(citation, ensure_ascii=False),
            ctrl["family"],
        ))
        imported += 1

    # All rows are committed together, in a single transaction.
    conn.commit()
except Exception:
    # Discard the partially built transaction explicitly before re-raising.
    conn.rollback()
    raise

print(f"\nImported: {imported} new controls")
|
|
|
|
# ── Verify ──
|
|
# Row counts for this source: all rows, and rows whose release_state is
# neither 'duplicate' nor 'too_close'.
cur.execute("""
    SELECT count(*),
    count(*) FILTER (WHERE release_state NOT IN ('duplicate', 'too_close'))
    FROM compliance.canonical_controls
    WHERE source_citation->>'source' = 'NIST SP 800-53 Rev. 5'
""")
total, active = cur.fetchone()
print(f"\nSP 800-53 after import: {total} total, {active} active")

# Distribution of release_state over the whole table, largest group first.
cur.execute("""
    SELECT release_state, count(*)
    FROM compliance.canonical_controls
    GROUP BY release_state
    ORDER BY count(*) DESC
""")
print("\nDB release_state gesamt:")
for row in cur.fetchall():
    # str() keeps the report working if release_state is NULL for some rows
    # (None does not accept the "15s" format spec).
    print(f" {str(row[0]):15s}: {row[1]:5d}")

# Same "active" filter as above, applied to the whole table.
cur.execute("""
    SELECT count(*)
    FROM compliance.canonical_controls
    WHERE release_state NOT IN ('duplicate', 'too_close')
""")
print(f"\nAktive Controls gesamt: {cur.fetchone()[0]}")

# ── Import stats by family ──
fam_counts = {}
for ctrl in to_import:
    fam = ctrl["family"]
    fam_counts[fam] = fam_counts.get(fam, 0) + 1

print("\nImportiert nach Family:")
for fam in sorted(fam_counts):
    # Family titles are cut to 45 characters to keep the columns aligned.
    print(f" {fam[:45]:45s}: {fam_counts[fam]:3d}")

# Release the cursor and the connection now that all reporting is done.
cur.close()
conn.close()
|