feat: Control Library UI, dedup migration, QA tooling, docs
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 31s
CI/CD / test-python-backend-compliance (push) Successful in 1m35s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 17s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 31s
CI/CD / test-python-backend-compliance (push) Successful in 1m35s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 17s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
- Control Library: parent control display, ObligationTypeBadge, GenerationStrategyBadge variants, evidence string fallback - API: expose parent_control_uuid/id/title in canonical controls - Fix: DSFA SQLAlchemy 2.0 Row._mapping compatibility - Migration 074: control_parent_links + control_dedup_reviews tables - QA scripts: benchmark, gap analysis, OSCAL import, OWASP cleanup, phase5 normalize, phase74 gap fill, sync_db, run_job - Docs: dedup engine, RAG benchmark, lessons learned, pipeline docs Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
289
scripts/qa/oscal_import.py
Normal file
289
scripts/qa/oscal_import.py
Normal file
@@ -0,0 +1,289 @@
|
||||
"""Import 776 missing NIST SP 800-53 Rev 5 controls from OSCAL into canonical_controls."""
|
||||
import os
|
||||
import re
|
||||
import json
|
||||
import uuid
|
||||
import psycopg2
|
||||
import urllib.parse
|
||||
|
||||
# Directory that holds the locally downloaded NIST OSCAL catalog files.
OSCAL_DIR = os.path.expanduser("~/rag-ingestion/nist-oscal")

# JSON interchange text is UTF-8; pass the encoding explicitly so parsing does
# not depend on the platform's default locale encoding.
with open(os.path.join(OSCAL_DIR, "sp800-53-rev5-catalog.json"), encoding="utf-8") as f:
    # Only the object under the top-level "catalog" key is used below.
    sp853 = json.load(f)["catalog"]
|
||||
|
||||
# ── Extract all OSCAL controls ──
|
||||
def extract_controls(catalog):
    """Flatten an OSCAL catalog dict into a list of active control records.

    Walks every entry of ``catalog["groups"]`` (one per control family) and,
    for each control in a group, the control itself followed by the controls
    nested directly under it (its enhancements). Every entry is converted
    with ``extract_single``; entries for which that returns a falsy value
    (withdrawn controls) are skipped.

    Args:
        catalog: Parsed OSCAL catalog object (a dict). Missing "groups" or
            "controls" keys are treated as empty lists.

    Returns:
        A list of the dicts produced by ``extract_single``, in catalog order.
    """
    controls = []
    for fam in catalog.get("groups", []):
        fam_title = fam.get("title", "")
        for ctrl in fam.get("controls", []):
            # The base control comes first, then its enhancements; both are
            # filed under the family title of the enclosing group.
            for entry in (ctrl, *ctrl.get("controls", [])):
                result = extract_single(entry, fam_title)
                if result:
                    controls.append(result)
    return controls
|
||||
|
||||
def _part_label(part):
    """Return the value of the last "label" prop on *part*, or "" if none."""
    label = ""
    for prop in part.get("props", []):
        if prop["name"] == "label":
            label = prop.get("value", "")
    return label


def _statement_line(part, indent=""):
    """Render one statement sub-item as a newline-prefixed line.

    Returns "" when the item has neither a label nor prose.
    """
    prose = part.get("prose", "")
    label = _part_label(part)
    if label:
        return f"\n{indent}{label} {prose}"
    if prose:
        return f"\n{indent}{prose}"
    return ""


def extract_single(ctrl, family_title):
    """Convert one OSCAL control (or enhancement) dict into a flat record.

    Args:
        ctrl: OSCAL control object. Reads "props", "id", "title", "parts",
            "links" and "params"; all but "id" default to empty when missing.
        family_title: Title of the family the control belongs to; stored
            unchanged under "family".

    Returns:
        None when the control's "status" prop is "withdrawn". Otherwise a
        dict with the keys "label", "title", "family", "statement",
        "guidance", "related", "params" and "is_enhancement".

    Raises:
        KeyError: if the control has neither a "label" prop nor an "id", if a
            prop lacks "name", or if a related link lacks "href".
    """
    # When several props share a name, the last one wins.
    props = {p["name"]: p.get("value", "") for p in ctrl.get("props", [])}
    if props.get("status") == "withdrawn":
        return None

    # Only fall back to the id when there is no label prop, so a labelled
    # control without an "id" key does not raise.
    label = props["label"] if "label" in props else ctrl["id"].upper()
    title = ctrl.get("title", "")

    statement = ""
    guidance = ""
    for part in ctrl.get("parts", []):
        part_name = part.get("name")
        if part_name == "statement":
            # Main requirement text followed by its sub-items (a., b., ...)
            # and their nested items, one per line. Only two levels of
            # nesting are rendered. A later "statement" part replaces an
            # earlier one.
            pieces = [part.get("prose", "")]
            for sub in part.get("parts", []):
                pieces.append(_statement_line(sub))
                pieces.extend(
                    _statement_line(subsub, indent=" ")
                    for subsub in sub.get("parts", [])
                )
            statement = "".join(pieces)
        elif part_name == "guidance":
            guidance = part.get("prose", "")

    # Cross-references: hrefs of "related" links with leading "#" removed.
    related = [
        link["href"].lstrip("#")
        for link in ctrl.get("links", [])
        if link.get("rel") == "related"
    ]

    # Parameters: guideline prose is concatenated without a separator.
    params = [
        {
            "id": param.get("id", ""),
            "label": param.get("label", ""),
            "guidelines": "".join(g.get("prose", "") for g in param.get("guidelines", [])),
            "choices": list(param["select"].get("choice", [])) if "select" in param else [],
        }
        for param in ctrl.get("params", [])
    ]

    return {
        "label": label,
        "title": title,
        "family": family_title,
        "statement": statement.strip(),
        "guidance": guidance.strip(),
        "related": related,
        "params": params,
        # Labels containing "(" are treated as enhancements.
        "is_enhancement": "(" in label,
    }
|
||||
|
||||
# Flatten the parsed catalog once; the comparison and import steps below all
# work on this list of active (non-withdrawn) control records.
all_oscal = extract_controls(sp853)
print(f"Total OSCAL active controls: {len(all_oscal)}")
|
||||
|
||||
# ── Normalize label for comparison ──
|
||||
def normalize_label(label):
    """Return *label* upper-cased with zero padding removed, for comparison.

    Leading zeros are dropped from a number that directly follows a hyphen
    and from a number enclosed in parentheses, e.g. "ac-02(01)" -> "AC-2(1)".
    """
    unpadded = re.sub(
        r'\(0+(\d+)\)', r'(\1)',
        re.sub(r'-0+(\d)', r'-\1', label),
    )
    return unpadded.upper()
|
||||
|
||||
# ── DB connection ──
|
||||
# DATABASE_URL is required; a missing variable raises KeyError right here.
db_url = os.environ['DATABASE_URL']
parsed = urllib.parse.urlparse(db_url)
# urlparse() hands back the user, password and path exactly as written in the
# URL, so percent-encoded characters (e.g. "%40" for "@") have to be decoded
# before the values are used as credentials / database name.
db_user = urllib.parse.unquote(parsed.username) if parsed.username else parsed.username
db_password = urllib.parse.unquote(parsed.password) if parsed.password else parsed.password
conn = psycopg2.connect(
    host=parsed.hostname, port=parsed.port or 5432,
    user=db_user, password=db_password,
    dbname=urllib.parse.unquote(parsed.path.lstrip('/')),
    # Resolve unqualified table names against the compliance schema first.
    options="-c search_path=compliance,public"
)
cur = conn.cursor()
|
||||
|
||||
# Article labels of the NIST SP 800-53 Rev. 5 controls already stored,
# normalized so they can be compared with the OSCAL labels.
cur.execute("""
    SELECT DISTINCT source_citation->>'article' as article
    FROM compliance.canonical_controls
    WHERE source_citation->>'source' = 'NIST SP 800-53 Rev. 5'
    AND source_citation->>'article' IS NOT NULL
""")
existing_labels = {normalize_label(db_row[0]) for db_row in cur.fetchall()}
print(f"Existing DB labels (normalized): {len(existing_labels)}")

# Every control_id of the form PREFIX-NUMBER, used to seed the per-prefix
# counters for newly generated ids.
cur.execute("""
    SELECT control_id FROM compliance.canonical_controls
    WHERE control_id ~ '^[A-Z]+-[0-9]+$'
    ORDER BY control_id
""")
existing_ids = {db_row[0] for db_row in cur.fetchall()}
|
||||
|
||||
# Find next available ID per prefix
|
||||
def next_control_id(prefix, existing):
    """Return the highest number already used for *prefix* in *existing*.

    Despite its name this does not return the next free id: it returns the
    largest N among ids of the exact form "<prefix>-<N>" (e.g. 1234 for
    "SEC-1234"), or 0 when no id matches. The caller adds 1 before
    formatting a new id.

    Args:
        prefix: Control id prefix such as "SEC"; matched literally.
        existing: Iterable of existing control id strings.

    Returns:
        The largest numeric suffix found for the prefix, as an int.
    """
    # Escape the prefix so regex metacharacters in it cannot widen the match.
    pattern = re.compile(rf'^{re.escape(prefix)}-(\d+)$')
    matches = (pattern.match(eid) for eid in existing)
    return max((int(m.group(1)) for m in matches if m), default=0)
|
||||
|
||||
# Map NIST families to our control_id prefixes
|
||||
# Family title (as it appears in the catalog's group "title") -> prefix of the
# control_id generated for controls of that family. Several families share a
# prefix; titles missing from this table fall back to "COMP" at import time.
FAMILY_PREFIX = {
    "Access Control": "ACC",
    "Awareness and Training": "GOV",
    "Audit and Accountability": "LOG",
    "Assessment, Authorization, and Monitoring": "GOV",
    "Configuration Management": "COMP",
    "Contingency Planning": "INC",
    "Identification and Authentication": "AUTH",
    "Incident Response": "INC",
    "Maintenance": "COMP",
    "Media Protection": "DATA",
    "Physical and Environmental Protection": "SEC",
    "Planning": "GOV",
    "Program Management": "GOV",
    "Personnel Security": "GOV",
    "Personally Identifiable Information Processing and Transparency": "DATA",
    "Risk Assessment": "GOV",
    "System and Services Acquisition": "COMP",
    "System and Communications Protection": "NET",
    "System and Information Integrity": "SEC",
    "Supply Chain Risk Management": "COMP",
}
|
||||
|
||||
# Running counter per prefix, seeded with the highest number already present
# in the database; it is incremented before each new id is formatted.
prefix_counters = {
    prefix: next_control_id(prefix, existing_ids)
    for prefix in set(FAMILY_PREFIX.values())
}
print(f"Starting counters: {prefix_counters}")

# ── Filter to only new controls ──
# Keep the catalog controls whose normalized label is not yet in the database.
to_import = [
    candidate
    for candidate in all_oscal
    if normalize_label(candidate["label"]) not in existing_labels
]

print(f"\nControls to import: {len(to_import)}")
|
||||
|
||||
# ── Import ──
# Framework row every imported control is attached to; constant for the run,
# so it is defined once instead of inside the loop.
FRAMEWORK_ID = '14b1bdd2-abc7-4a43-adae-14471ee5c7cf'

# rationale, severity, pipeline_version, release_state and generation_strategy
# are fixed literals for this import; everything else is bound per control.
INSERT_CONTROL_SQL = """
    INSERT INTO compliance.canonical_controls
    (id, framework_id, control_id, title, objective, rationale,
    severity, source_original_text,
    source_citation, pipeline_version, release_state,
    generation_strategy, category)
    VALUES (%s, %s, %s, %s, %s, '', 'medium', %s, %s, 4, 'draft', 'oscal_import', %s)
"""

imported = 0
try:
    for ctrl in to_import:
        # Families without a mapping are filed under "COMP".
        prefix = FAMILY_PREFIX.get(ctrl["family"], "COMP")
        prefix_counters[prefix] += 1
        control_id = f"{prefix}-{prefix_counters[prefix]:04d}"

        # Build title: "NIST {label}: {title}"
        title = f"NIST {ctrl['label']}: {ctrl['title']}"

        # source_original_text = statement (the official requirement text);
        # fall back to the first 500 characters of the guidance, then the title.
        source_text = ctrl["statement"] or ctrl["guidance"][:500] or ctrl["title"]

        # objective = guidance text, capped at 2000 characters
        objective = ctrl["guidance"][:2000]

        # source_citation
        citation = {
            "source": "NIST SP 800-53 Rev. 5",
            "article": ctrl["label"],
            "article_type": "control",
            "source_type": "standard",
            "oscal_import": True,
        }
        if ctrl["related"]:
            citation["related_controls"] = ctrl["related"][:20]
        if ctrl["params"]:
            citation["parameters"] = [
                {"id": p["id"], "label": p["label"]} for p in ctrl["params"][:10]
            ]

        new_id = str(uuid.uuid4())
        cur.execute(INSERT_CONTROL_SQL, (
            new_id,
            FRAMEWORK_ID,
            control_id,
            title[:500],
            objective,
            source_text[:10000],
            json.dumps(citation, ensure_ascii=False),
            ctrl["family"],
        ))
        imported += 1

    # All inserts are committed together as one transaction.
    conn.commit()
except Exception:
    # Discard the partial batch explicitly, then let the error propagate.
    conn.rollback()
    raise

print(f"\nImported: {imported} new controls")
|
||||
|
||||
# ── Verify ──
# Row counts for the NIST SP 800-53 Rev. 5 source: all rows, and rows whose
# release_state is neither 'duplicate' nor 'too_close'.
cur.execute("""
    SELECT count(*),
    count(*) FILTER (WHERE release_state NOT IN ('duplicate', 'too_close'))
    FROM compliance.canonical_controls
    WHERE source_citation->>'source' = 'NIST SP 800-53 Rev. 5'
""")
nist_counts = cur.fetchone()
total, active = nist_counts
print(f"\nSP 800-53 after import: {total} total, {active} active")

# Whole-table breakdown by release_state, most frequent first.
cur.execute("""
    SELECT release_state, count(*)
    FROM compliance.canonical_controls
    GROUP BY release_state
    ORDER BY count(*) DESC
""")
state_rows = cur.fetchall()
print(f"\nDB release_state gesamt:")
# NOTE(review): the ":15s" format spec raises TypeError for a NULL
# release_state — confirm the column is NOT NULL.
for row in state_rows:
    print(f" {row[0]:15s}: {row[1]:5d}")

# Whole-table count of rows that are neither 'duplicate' nor 'too_close'.
cur.execute("""
    SELECT count(*)
    FROM compliance.canonical_controls
    WHERE release_state NOT IN ('duplicate', 'too_close')
""")
print(f"\nAktive Controls gesamt: {cur.fetchone()[0]}")
|
||||
|
||||
# ── Import stats by family ──
# Tally the controls selected for import per family title.
fam_counts = {}
for ctrl in to_import:
    fam = ctrl["family"]
    if fam not in fam_counts:
        fam_counts[fam] = 0
    fam_counts[fam] += 1

print(f"\nImportiert nach Family:")
# Alphabetical by family title; titles are cut/padded to 45 characters.
for fam in sorted(fam_counts):
    print(f" {fam[:45]:45s}: {fam_counts[fam]:3d}")

conn.close()
|
||||
Reference in New Issue
Block a user