Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 31s
CI/CD / test-python-backend-compliance (push) Successful in 1m35s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 17s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
- Control Library: parent control display, ObligationTypeBadge, GenerationStrategyBadge variants, evidence string fallback - API: expose parent_control_uuid/id/title in canonical controls - Fix: DSFA SQLAlchemy 2.0 Row._mapping compatibility - Migration 074: control_parent_links + control_dedup_reviews tables - QA scripts: benchmark, gap analysis, OSCAL import, OWASP cleanup, phase5 normalize, phase74 gap fill, sync_db, run_job - Docs: dedup engine, RAG benchmark, lessons learned, pipeline docs Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
358 lines
15 KiB
Python
358 lines
15 KiB
Python
"""Phase 5: Source Normalization + Duplicate Hard Delete.
|
|
|
|
Steps:
|
|
1. OSCAL controls: add source_regulation to generation_metadata
|
|
2. Fix 20 v3 controls with NULL source (mark as needs_review and set missing_source flag)
|
|
3. Fix empty-string source (DATA-631 → Telekommunikationsgesetz Oesterreich)
|
|
4. Fix OWASP cross-source misattributions (regulation_code vs actual source)
|
|
5. Hard delete duplicate/too_close controls (3,301 controls, 0 FK refs)
|
|
6. Clean up canonical_processed_chunks generated_control_ids
|
|
|
|
Usage:
|
|
export DATABASE_URL='postgresql://...'
|
|
python3 scripts/qa/phase5_normalize_and_cleanup.py [--dry-run] [--step N]
|
|
"""
|
|
import os
|
|
import sys
|
|
import json
|
|
import psycopg2
|
|
import urllib.parse
|
|
|
|
def parse_step_arg(argv):
    """Return the step number requested via ``--step N`` or ``--step=N``.

    Args:
        argv: Command-line argument list (normally ``sys.argv``).

    Returns:
        The requested step as an int, or None when no ``--step`` option is
        present (meaning: run every step).

    Raises:
        SystemExit: If ``--step`` has no value or the value is not an
            integer. Silently falling back to "run all steps" would include
            the hard delete of step 5, so a malformed option is fatal.
    """
    step = None
    for idx, arg in enumerate(argv):
        if arg == "--step":
            if idx + 1 >= len(argv):
                sys.exit("error: --step requires a step number")
            raw = argv[idx + 1]
        elif arg.startswith("--step="):
            raw = arg.split("=", 1)[1]
        else:
            continue
        try:
            step = int(raw)
        except ValueError:
            sys.exit(f"error: invalid --step value: {raw!r}")
    return step


# --dry-run: report what would change without committing anything.
DRY_RUN = "--dry-run" in sys.argv
# --step N: restrict the run to a single step; None means all steps.
STEP_ONLY = parse_step_arg(sys.argv)
|
|
|
|
# Open the database connection described by the DATABASE_URL environment
# variable (postgresql://user:password@host:port/dbname).
db_url = os.environ.get('DATABASE_URL')
if not db_url:
    sys.exit("error: DATABASE_URL is not set (export DATABASE_URL='postgresql://...')")

parsed = urllib.parse.urlparse(db_url)


def _unquote_or_none(value):
    """Percent-decode a URL component, passing None through unchanged."""
    return urllib.parse.unquote(value) if value is not None else None


conn = psycopg2.connect(
    host=parsed.hostname,
    port=parsed.port or 5432,
    # urlparse() leaves the userinfo percent-encoded; decode it so that
    # credentials containing escaped characters (e.g. %40 for '@') work.
    user=_unquote_or_none(parsed.username),
    password=_unquote_or_none(parsed.password),
    dbname=parsed.path.lstrip('/'),
    # Resolve unqualified table names in the compliance schema first.
    options="-c search_path=compliance,public",
)
cur = conn.cursor()
|
|
|
|
def should_run(step):
    """Tell whether the given step number is selected for this run.

    Every step is selected when no step filter is active (``STEP_ONLY`` is
    None); otherwise only the step equal to ``STEP_ONLY`` is selected.
    """
    if STEP_ONLY is None:
        return True
    return STEP_ONLY == step
|
|
|
|
|
|
# ══════════════════════════════════════════════════════════════════
# Step 1: OSCAL controls — add source_regulation to generation_metadata
# ══════════════════════════════════════════════════════════════════
if should_run(1):
    step1_rule = "=" * 70
    print(step1_rule)
    print("STEP 1: OSCAL controls — source_regulation in generation_metadata")
    print(step1_rule)

    # Count the affected rows first so a dry run can report them.
    cur.execute("""
        SELECT count(*)
        FROM compliance.canonical_controls
        WHERE generation_strategy = 'oscal_import'
          AND (generation_metadata->>'source_regulation' IS NULL
               OR generation_metadata->>'source_regulation' = '')
    """)
    (count,) = cur.fetchone()
    print(f" OSCAL controls without source_regulation: {count}")

    if count > 0 and DRY_RUN:
        print(f" [DRY RUN] Would update {count} controls")
    elif count > 0:
        # Merge the key into the existing metadata; a NULL metadata column
        # is treated as an empty object.
        cur.execute("""
            UPDATE compliance.canonical_controls
            SET generation_metadata = COALESCE(generation_metadata, '{}'::jsonb)
                || '{"source_regulation": "nist_sp800_53r5"}'::jsonb
            WHERE generation_strategy = 'oscal_import'
              AND (generation_metadata->>'source_regulation' IS NULL
                   OR generation_metadata->>'source_regulation' = '')
        """)
        print(f" Updated: {cur.rowcount}")
    print()
|
|
|
|
|
|
# ══════════════════════════════════════════════════════════════════
# Step 2: v3 controls with NULL source — mark for review
# ══════════════════════════════════════════════════════════════════
if should_run(2):
    step2_rule = "=" * 70
    print(step2_rule)
    print("STEP 2: Fix v3 controls with NULL source")
    print(step2_rule)

    # Pipeline-version-3 controls whose source_citation has no 'source'.
    # No source is guessed for them: each one is set to 'needs_review' and
    # gets a missing_source flag in generation_metadata.
    cur.execute("""
        SELECT id, control_id, title
        FROM compliance.canonical_controls
        WHERE source_citation->>'source' IS NULL
          AND pipeline_version = 3
          AND release_state NOT IN ('duplicate', 'too_close')
    """)
    v3_null = cur.fetchall()
    print(f" v3 controls with NULL source: {len(v3_null)}")

    if v3_null and DRY_RUN:
        print(f" [DRY RUN] Would mark {len(v3_null)} as needs_review")
    elif v3_null:
        mark_for_review_sql = """
            UPDATE compliance.canonical_controls
            SET release_state = 'needs_review',
                generation_metadata = COALESCE(generation_metadata, '{}'::jsonb)
                    || '{"missing_source": true}'::jsonb
            WHERE id = %s
        """
        # Only the primary key (first column) is needed for the update.
        for row in v3_null:
            cur.execute(mark_for_review_sql, (row[0],))
        print(f" Marked {len(v3_null)} as needs_review with missing_source flag")
    print()
|
|
|
|
|
|
# ══════════════════════════════════════════════════════════════════
# Step 3: Fix empty-string source (DATA-631)
# ══════════════════════════════════════════════════════════════════
if should_run(3):
    step3_rule = "=" * 70
    print(step3_rule)
    print("STEP 3: Fix empty-string source")
    print(step3_rule)

    cur.execute("""
        SELECT id, control_id, title,
               generation_metadata->>'source_regulation' as reg
        FROM compliance.canonical_controls
        WHERE source_citation->>'source' = ''
          AND release_state NOT IN ('duplicate', 'too_close')
    """)
    empty_src = cur.fetchall()
    print(f" Controls with empty source: {len(empty_src)}")

    # Regulation codes with a known source name; any other code gets an
    # "Unbekannt (<code>)" placeholder.
    known_sources = {'at_tkg': 'Telekommunikationsgesetz Oesterreich'}
    set_source_sql = """
        UPDATE compliance.canonical_controls
        SET source_citation = jsonb_set(
            source_citation, '{source}', %s::jsonb
        )
        WHERE id = %s
    """

    for ctrl_id_uuid, control_id, title, reg in empty_src:
        print(f" {control_id} | reg={reg} | {title[:60]}")
        new_source = known_sources.get(reg, f"Unbekannt ({reg})")

        if DRY_RUN:
            print(f" [DRY RUN] Would set source='{new_source}'")
            continue

        # json.dumps turns the name into a JSON string literal for jsonb_set.
        cur.execute(set_source_sql, (json.dumps(new_source), ctrl_id_uuid))
        print(f" Set source='{new_source}'")
    print()
|
|
|
|
|
|
# ══════════════════════════════════════════════════════════════════
# Step 4: Fix OWASP cross-source misattributions
# ══════════════════════════════════════════════════════════════════
if should_run(4):
    step4_rule = "=" * 70
    print(step4_rule)
    print("STEP 4: Fix OWASP cross-source misattributions")
    print(step4_rule)

    # Expected source_citation.source for each OWASP regulation code.
    OWASP_REG_TO_SOURCE = {
        'owasp_top10_2021': 'OWASP Top 10 (2021)',
        'owasp_asvs': 'OWASP ASVS 4.0',
        'owasp_masvs': 'OWASP MASVS 2.0',
        'owasp_samm': 'OWASP SAMM 2.0',
        'owasp_api_top10_2023': 'OWASP API Security Top 10 (2023)',
    }

    # Reverse lookup. The cited source is treated as the truth: a control
    # citing 'OWASP ASVS 4.0' but filed under 'owasp_top10_2021' is moved to
    # 'owasp_asvs'.
    SOURCE_TO_REG = {source: reg for reg, source in OWASP_REG_TO_SOURCE.items()}

    find_mismatches_sql = """
        SELECT id, control_id, source_citation->>'source' as src
        FROM compliance.canonical_controls
        WHERE generation_metadata->>'source_regulation' = %s
          AND source_citation->>'source' <> %s
          AND release_state NOT IN ('duplicate', 'too_close')
    """
    reassign_sql = """
        UPDATE compliance.canonical_controls
        SET generation_metadata = jsonb_set(
            generation_metadata, '{source_regulation}', %s::jsonb
        )
        WHERE id = %s
    """

    total_fixed = 0
    for reg_code, expected_source in OWASP_REG_TO_SOURCE.items():
        cur.execute(find_mismatches_sql, (reg_code, expected_source))
        mismatches = cur.fetchall()
        if not mismatches:
            continue

        print(f"\n {reg_code} → {len(mismatches)} Mismatches:")
        for ctrl_id_uuid, control_id, actual_source in mismatches:
            correct_reg = SOURCE_TO_REG.get(actual_source)
            if not correct_reg:
                print(f" {control_id} | {actual_source} → no mapping found")
                continue

            print(f" {control_id} | {actual_source} → reg={correct_reg}")
            if not DRY_RUN:
                cur.execute(reassign_sql, (json.dumps(correct_reg), ctrl_id_uuid))
            # Counted in dry runs as well so the summary can report it.
            total_fixed += 1

    if DRY_RUN:
        print(f"\n [DRY RUN] Would fix {total_fixed} misattributions")
    else:
        print(f"\n Fixed: {total_fixed} misattributions")
    print()
|
|
|
|
|
|
# ══════════════════════════════════════════════════════════════════
# Step 5: Hard delete duplicate/too_close controls
# ══════════════════════════════════════════════════════════════════
if should_run(5):
    step5_rule = "=" * 70
    print(step5_rule)
    print("STEP 5: Hard delete duplicate/too_close controls")
    print(step5_rule)

    # Safety check: abort unless no row in the referencing tables points at
    # a control that is about to be deleted.
    referencing_columns = (
        ('canonical_control_mappings', 'control_id'),
        ('obligation_extractions', 'control_uuid'),
        ('crosswalk_matrix', 'master_control_uuid'),
        ('obligation_candidates', 'parent_control_uuid'),
    )
    for table, col in referencing_columns:
        # table/col come from the hard-coded tuple above, never from input.
        cur.execute(f"""
            SELECT count(*)
            FROM compliance.{table} t
            JOIN compliance.canonical_controls cc ON cc.id = t.{col}
            WHERE cc.release_state IN ('duplicate', 'too_close')
        """)
        fk_count = cur.fetchone()[0]
        if fk_count > 0:
            print(f" WARNING: {table}.{col} has {fk_count} refs to dup/too_close!")
            print(" ABORTING Step 5 — clean FK refs first!")
            sys.exit(1)
        print(f" {table}.{col}: 0 refs ✓")

    # Same check for controls whose parent is about to be deleted.
    cur.execute("""
        SELECT count(*)
        FROM compliance.canonical_controls child
        JOIN compliance.canonical_controls parent ON parent.id = child.parent_control_uuid
        WHERE parent.release_state IN ('duplicate', 'too_close')
    """)
    self_refs = cur.fetchone()[0]
    if self_refs > 0:
        print(f" WARNING: {self_refs} child controls reference dup/too_close parents!")
        print(" ABORTING Step 5!")
        sys.exit(1)
    print(" Self-references: 0 ✓")

    # Per-state breakdown of what will be removed.
    cur.execute("""
        SELECT release_state, count(*)
        FROM compliance.canonical_controls
        WHERE release_state IN ('duplicate', 'too_close')
        GROUP BY 1
    """)
    to_delete = dict(cur.fetchall())
    for state, cnt in to_delete.items():
        print(f"\n {state}: {cnt}")

    total = sum(to_delete.values())
    print(f"\n TOTAL to delete: {total}")

    if DRY_RUN:
        print(f" [DRY RUN] Would delete {total} controls")
    else:
        cur.execute("""
            DELETE FROM compliance.canonical_controls
            WHERE release_state IN ('duplicate', 'too_close')
        """)
        print(f" Deleted: {cur.rowcount} controls")
    print()
|
|
|
|
|
|
# ══════════════════════════════════════════════════════════════════
# Step 6: Clean up canonical_processed_chunks generated_control_ids
# ══════════════════════════════════════════════════════════════════
if should_run(6):
    print("=" * 70)
    print("STEP 6: Clean up processed chunks (remove deleted control IDs)")
    print("=" * 70)

    if DRY_RUN and should_run(5):
        # Step 5 deleted nothing in a dry run, so no reference is stale yet.
        print(" [DRY RUN] Skipping — depends on Step 5 deletion")
    else:
        # Chunks that list at least one generated control ID.
        cur.execute("""
            SELECT id, generated_control_ids
            FROM compliance.canonical_processed_chunks
            WHERE generated_control_ids IS NOT NULL
              AND generated_control_ids <> '[]'::jsonb
        """)
        chunks = cur.fetchall()
        print(f" Chunks with generated_control_ids: {len(chunks)}")

        # IDs of all controls that still exist, as text for comparison with
        # the JSON array entries.
        cur.execute("SELECT id::text FROM compliance.canonical_controls")
        existing_ids = {row[0] for row in cur.fetchall()}
        print(f" Existing controls: {len(existing_ids)}")

        cleaned = 0
        for chunk_id, control_ids in chunks:
            # The column may arrive as raw JSON text or as a decoded list.
            if isinstance(control_ids, str):
                control_ids = json.loads(control_ids)
            if not isinstance(control_ids, list):
                continue

            valid_ids = [cid for cid in control_ids if cid in existing_ids]
            if len(valid_ids) == len(control_ids):
                continue  # nothing stale in this chunk

            # Like the other steps, a dry run only counts and never writes.
            if not DRY_RUN:
                cur.execute("""
                    UPDATE compliance.canonical_processed_chunks
                    SET generated_control_ids = %s::jsonb
                    WHERE id = %s
                """, (json.dumps(valid_ids), chunk_id))
            cleaned += 1

        if DRY_RUN:
            print(f" [DRY RUN] Would clean {cleaned} chunks")
        else:
            print(f" Chunks cleaned: {cleaned}")
    print()
|
|
|
|
|
|
# ══════════════════════════════════════════════════════════════════
# Final summary
# ══════════════════════════════════════════════════════════════════
try:
    if not DRY_RUN:
        conn.commit()
        print("=" * 70)
        print("COMMITTED. Final state:")
        print("=" * 70)
    else:
        print("=" * 70)
        print("[DRY RUN] No changes committed. Current state:")
        print("=" * 70)

    cur.execute("""
        SELECT release_state, count(*)
        FROM compliance.canonical_controls
        GROUP BY 1
        ORDER BY count(*) DESC
    """)
    total = 0
    active = 0
    for state, cnt in cur.fetchall():
        total += cnt
        # Everything except duplicate/too_close counts as active.
        if state not in ('duplicate', 'too_close'):
            active += cnt
        # str() keeps the 's' format spec working should release_state be
        # NULL (None has no string format spec support).
        print(f" {str(state):15s}: {cnt:5d}")

    print(f"\n TOTAL: {total}")
    print(f" AKTIV: {active}")
finally:
    # Always release the connection, even if the summary query fails.
    conn.close()
|