Files
breakpilot-compliance/scripts/qa/oscal_import.py
Benjamin Admin 643b26618f
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 31s
CI/CD / test-python-backend-compliance (push) Successful in 1m35s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 17s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
feat: Control Library UI, dedup migration, QA tooling, docs
- Control Library: parent control display, ObligationTypeBadge,
  GenerationStrategyBadge variants, evidence string fallback
- API: expose parent_control_uuid/id/title in canonical controls
- Fix: DSFA SQLAlchemy 2.0 Row._mapping compatibility
- Migration 074: control_parent_links + control_dedup_reviews tables
- QA scripts: benchmark, gap analysis, OSCAL import, OWASP cleanup,
  phase5 normalize, phase74 gap fill, sync_db, run_job
- Docs: dedup engine, RAG benchmark, lessons learned, pipeline docs

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-21 11:56:08 +01:00

290 lines
9.3 KiB
Python

"""Import 776 missing NIST SP 800-53 Rev 5 controls from OSCAL into canonical_controls."""
import os
import re
import json
import uuid
import psycopg2
import urllib.parse
# Directory the SP 800-53 Rev 5 OSCAL catalog JSON is read from.
OSCAL_DIR = os.path.expanduser("~/rag-ingestion/nist-oscal")

# JSON text is UTF-8; name the encoding explicitly so decoding does not depend
# on the platform's locale default.
with open(
    os.path.join(OSCAL_DIR, "sp800-53-rev5-catalog.json"), encoding="utf-8"
) as f:
    # Only the top-level "catalog" object is used by the rest of the script.
    sp853 = json.load(f)["catalog"]
# ── Extract all OSCAL controls ──
def extract_controls(catalog):
    """Collect every non-withdrawn control and enhancement from a catalog.

    Walks ``catalog["groups"]``; for each group visits its ``controls`` and,
    one level down, each control's own nested ``controls`` (the enhancements).
    Enhancements are visited even when their base control was skipped.

    Args:
        catalog: The ``catalog`` object of an OSCAL catalog JSON document.

    Returns:
        A list of the dicts returned by ``extract_single``, in document order:
        each base control is followed by its enhancements.
    """
    controls = []
    for family in catalog.get("groups", []):
        family_title = family.get("title", "")
        for base in family.get("controls", []):
            # Base control first, then its enhancements, in catalog order.
            for node in (base, *base.get("controls", [])):
                record = extract_single(node, family_title)
                if record:
                    controls.append(record)
    return controls
def _part_label(part):
    """Return the value of the last ``label`` prop on *part*, or "" if none."""
    label = ""
    for prop in part.get("props", []):
        if prop["name"] == "label":
            label = prop.get("value", "")
    return label


def _collect_sub_items(part, depth, lines):
    """Append one line per part nested under *part* to *lines*, depth-first.

    Direct children are not indented; every further nesting level adds one
    leading space.  A child with a label always yields a line (even when its
    prose is empty); an unlabelled child yields a line only if it has prose.
    """
    indent = " " * depth
    for child in part.get("parts", []):
        prose = child.get("prose", "")
        label = _part_label(child)
        if label:
            lines.append(f"{indent}{label} {prose}")
        elif prose:
            lines.append(f"{indent}{prose}")
        # Recurse so items nested deeper than two levels are not dropped.
        _collect_sub_items(child, depth + 1, lines)


def extract_single(ctrl, family_title):
    """Flatten one OSCAL control or enhancement into a plain dict.

    Args:
        ctrl: A control object from the catalog (base control or enhancement).
        family_title: Title of the family the control belongs to; copied
            verbatim into the result.

    Returns:
        ``None`` when the control's ``status`` prop is ``"withdrawn"``.
        Otherwise a dict with the keys ``label``, ``title``, ``family``,
        ``statement``, ``guidance``, ``related``, ``params`` and
        ``is_enhancement``.

    Raises:
        KeyError: If ``ctrl`` has no ``id``, or a prop has no ``name``, or a
            related link has no ``href``.
    """
    # When a prop name occurs more than once the last occurrence wins.
    props = {p["name"]: p.get("value", "") for p in ctrl.get("props", [])}
    if props.get("status") == "withdrawn":
        return None

    label = props.get("label", ctrl["id"].upper())

    statement = ""
    guidance = ""
    for part in ctrl.get("parts", []):
        name = part.get("name")
        if name == "statement":
            # Main requirement text followed by its labelled sub-items
            # (a., b., 1., ...), one per line.
            lines = [part.get("prose", "")]
            _collect_sub_items(part, 0, lines)
            statement = "\n".join(lines)
        elif name == "guidance":
            guidance = part.get("prose", "")

    # Cross-references: hrefs of "related" links with the leading '#' removed.
    related = [
        link["href"].lstrip("#")
        for link in ctrl.get("links", [])
        if link.get("rel") == "related"
    ]

    params = [
        {
            "id": param.get("id", ""),
            "label": param.get("label", ""),
            "guidelines": "".join(
                g.get("prose", "") for g in param.get("guidelines", [])
            ),
            "choices": (
                list(param["select"].get("choice", []))
                if "select" in param
                else []
            ),
        }
        for param in ctrl.get("params", [])
    ]

    return {
        "label": label,
        "title": ctrl.get("title", ""),
        "family": family_title,
        "statement": statement.strip(),
        "guidance": guidance.strip(),
        "related": related,
        "params": params,
        # Labels containing a parenthesis, e.g. "AC-2(1)", are treated as
        # enhancements.
        "is_enhancement": "(" in label,
    }
# Flatten the loaded catalog into one list of control/enhancement dicts;
# withdrawn entries are already dropped by extract_single (it returns None).
all_oscal = extract_controls(sp853)
print(f"Total OSCAL active controls: {len(all_oscal)}")
# ── Normalize label for comparison ──
def normalize_label(label):
    """Return *label* upper-cased with zero padding removed.

    Leading zeros are stripped from the number after a dash and from a number
    that fills a pair of parentheses, so ``"ac-02(01)"`` and ``"AC-2(1)"``
    normalize to the same string.
    """
    rewrites = (
        (r'-0+(\d)', r'-\1'),          # "-02"  -> "-2"
        (r'\(0+(\d+)\)', r'(\1)'),     # "(01)" -> "(1)"
    )
    for pattern, replacement in rewrites:
        label = re.sub(pattern, replacement, label)
    return label.upper()
# ── DB connection ──
db_url = os.environ['DATABASE_URL']
parsed = urllib.parse.urlparse(db_url)
# urlparse leaves the userinfo and path components percent-encoded, so decode
# them before handing them to the driver; otherwise credentials containing
# escaped characters (e.g. "%40" for "@") would be sent literally.
conn = psycopg2.connect(
    host=parsed.hostname,
    port=parsed.port or 5432,
    user=(
        urllib.parse.unquote(parsed.username)
        if parsed.username is not None
        else None
    ),
    password=(
        urllib.parse.unquote(parsed.password)
        if parsed.password is not None
        else None
    ),
    dbname=urllib.parse.unquote(parsed.path.lstrip('/')),
    options="-c search_path=compliance,public",
)
cur = conn.cursor()

# Labels of the SP 800-53 controls already stored, normalized for comparison.
cur.execute("""
SELECT DISTINCT source_citation->>'article' as article
FROM compliance.canonical_controls
WHERE source_citation->>'source' = 'NIST SP 800-53 Rev. 5'
AND source_citation->>'article' IS NOT NULL
""")
existing_labels = {normalize_label(article) for (article,) in cur.fetchall()}
print(f"Existing DB labels (normalized): {len(existing_labels)}")

# Every control_id of the form PREFIX-NUMBER currently in the table; the
# per-prefix maximum is derived from this set later.
cur.execute("""
SELECT control_id FROM compliance.canonical_controls
WHERE control_id ~ '^[A-Z]+-[0-9]+$'
ORDER BY control_id
""")
existing_ids = {control_id for (control_id,) in cur.fetchall()}
# Find next available ID per prefix
def next_control_id(prefix, existing):
    """Return the highest number already used with *prefix*, or 0 if none.

    Despite its name this does not return the next free number: the caller is
    expected to add 1 before formatting a new id such as ``SEC-1234``.

    Args:
        prefix: Control-id prefix, e.g. ``"SEC"``.  Matched literally.
        existing: Iterable of existing control-id strings.

    Returns:
        The largest integer ``N`` for which ``"<prefix>-<N>"`` occurs in
        *existing* (leading zeros allowed), or 0 when there is no such id.
    """
    # Escape the prefix so regex metacharacters in it cannot widen the match.
    pattern = re.compile(rf'^{re.escape(prefix)}-(\d+)$')
    matches = (pattern.match(eid) for eid in existing)
    return max((int(m.group(1)) for m in matches if m), default=0)
# Map NIST families to our control_id prefixes
# NIST family titles grouped by the control_id prefix they are filed under.
_FAMILIES_BY_PREFIX = {
    "ACC": (
        "Access Control",
    ),
    "AUTH": (
        "Identification and Authentication",
    ),
    "COMP": (
        "Configuration Management",
        "Maintenance",
        "System and Services Acquisition",
        "Supply Chain Risk Management",
    ),
    "DATA": (
        "Media Protection",
        "Personally Identifiable Information Processing and Transparency",
    ),
    "GOV": (
        "Awareness and Training",
        "Assessment, Authorization, and Monitoring",
        "Planning",
        "Program Management",
        "Personnel Security",
        "Risk Assessment",
    ),
    "INC": (
        "Contingency Planning",
        "Incident Response",
    ),
    "LOG": (
        "Audit and Accountability",
    ),
    "NET": (
        "System and Communications Protection",
    ),
    "SEC": (
        "Physical and Environmental Protection",
        "System and Information Integrity",
    ),
}

# Lookup used by the import: family title -> control_id prefix.
FAMILY_PREFIX = {
    family: prefix
    for prefix, families in _FAMILIES_BY_PREFIX.items()
    for family in families
}
# Track next IDs
# One running counter per distinct prefix, seeded with the value that
# next_control_id reports for the ids already in the database.
prefix_counters = {
    code: next_control_id(code, existing_ids)
    for code in set(FAMILY_PREFIX.values())
}
print(f"Starting counters: {prefix_counters}")
# ── Filter to only new controls ──
# Keep only the OSCAL controls whose normalized label is not in the DB yet.
to_import = [
    candidate
    for candidate in all_oscal
    if normalize_label(candidate["label"]) not in existing_labels
]
print(f"\nControls to import: {len(to_import)}")
# ── Import ──
# framework_id value written on every imported row.
FRAMEWORK_ID = '14b1bdd2-abc7-4a43-adae-14471ee5c7cf'

# rationale, severity, pipeline_version, release_state and generation_strategy
# are fixed in the statement; everything else is bound per control.
INSERT_CONTROL_SQL = """
INSERT INTO compliance.canonical_controls
(id, framework_id, control_id, title, objective, rationale,
severity, source_original_text,
source_citation, pipeline_version, release_state,
generation_strategy, category)
VALUES (%s, %s, %s, %s, %s, '', 'medium', %s, %s, 4, 'draft', 'oscal_import', %s)
"""

imported = 0
for imported, ctrl in enumerate(to_import, start=1):
    # Allocate the next number for this family's prefix (unmapped -> COMP).
    prefix = FAMILY_PREFIX.get(ctrl["family"], "COMP")
    sequence = prefix_counters[prefix] + 1
    prefix_counters[prefix] = sequence
    control_id = f"{prefix}-{sequence:04d}"

    # Title format: "NIST {label}: {title}"
    title = f"NIST {ctrl['label']}: {ctrl['title']}"

    guidance = ctrl["guidance"]

    # source_original_text: the statement, else the first 500 characters of
    # the guidance, else the bare title.
    if ctrl["statement"]:
        source_text = ctrl["statement"]
    elif guidance:
        source_text = guidance[:500]
    else:
        source_text = ctrl["title"]

    # objective: the guidance text, capped at 2000 characters.
    objective = guidance[:2000] if guidance else ""

    citation = {
        "source": "NIST SP 800-53 Rev. 5",
        "article": ctrl["label"],
        "article_type": "control",
        "source_type": "standard",
        "oscal_import": True,
    }
    related = ctrl["related"]
    if related:
        citation["related_controls"] = related[:20]
    parameters = ctrl["params"]
    if parameters:
        citation["parameters"] = [
            {"id": param["id"], "label": param["label"]}
            for param in parameters[:10]
        ]

    row = (
        str(uuid.uuid4()),
        FRAMEWORK_ID,
        control_id,
        title[:500],
        objective[:5000],
        source_text[:10000],
        json.dumps(citation, ensure_ascii=False),
        ctrl["family"],
    )
    cur.execute(INSERT_CONTROL_SQL, row)

# All inserts are committed together in one transaction.
conn.commit()
print(f"\nImported: {imported} new controls")
# ── Verify ──
# SP 800-53 rows: overall count and count excluding duplicate/too_close.
cur.execute("""
SELECT count(*),
count(*) FILTER (WHERE release_state NOT IN ('duplicate', 'too_close'))
FROM compliance.canonical_controls
WHERE source_citation->>'source' = 'NIST SP 800-53 Rev. 5'
""")
nist_counts = cur.fetchone()
total = nist_counts[0]
active = nist_counts[1]
print(f"\nSP 800-53 after import: {total} total, {active} active")

# Whole table: row count per release_state, largest group first.
cur.execute("""
SELECT release_state, count(*)
FROM compliance.canonical_controls
GROUP BY release_state
ORDER BY count(*) DESC
""")
print(f"\nDB release_state gesamt:")
for state, state_count in cur.fetchall():
    print(f" {state:15s}: {state_count:5d}")

# Whole table: rows that are neither duplicate nor too_close.
cur.execute("""
SELECT count(*)
FROM compliance.canonical_controls
WHERE release_state NOT IN ('duplicate', 'too_close')
""")
(active_overall,) = cur.fetchone()
print(f"\nAktive Controls gesamt: {active_overall}")
# ── Import stats by family ──
# Tally the controls selected for import by family title.
fam_counts = {}
for entry in to_import:
    family_name = entry["family"]
    fam_counts.setdefault(family_name, 0)
    fam_counts[family_name] += 1

print(f"\nImportiert nach Family:")
# Keys are unique, so sorting the items orders the report by family title.
for family_name, family_total in sorted(fam_counts.items()):
    print(f" {family_name[:45]:45s}: {family_total:3d}")

conn.close()