feat: Control Library UI, dedup migration, QA tooling, docs
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 31s
CI/CD / test-python-backend-compliance (push) Successful in 1m35s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 17s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 31s
CI/CD / test-python-backend-compliance (push) Successful in 1m35s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 17s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
- Control Library: parent control display, ObligationTypeBadge, GenerationStrategyBadge variants, evidence string fallback - API: expose parent_control_uuid/id/title in canonical controls - Fix: DSFA SQLAlchemy 2.0 Row._mapping compatibility - Migration 074: control_parent_links + control_dedup_reviews tables - QA scripts: benchmark, gap analysis, OSCAL import, OWASP cleanup, phase5 normalize, phase74 gap fill, sync_db, run_job - Docs: dedup engine, RAG benchmark, lessons learned, pipeline docs Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
357
scripts/qa/phase5_normalize_and_cleanup.py
Normal file
357
scripts/qa/phase5_normalize_and_cleanup.py
Normal file
@@ -0,0 +1,357 @@
|
||||
"""Phase 5: Source Normalization + Duplicate Hard Delete.
|
||||
|
||||
Steps:
|
||||
1. OSCAL controls: add source_regulation to generation_metadata
|
||||
2. Fix 20 v3 controls with NULL source (mark as needs_review, flag missing_source)
|
||||
3. Fix empty-string source (DATA-631 → Telekommunikationsgesetz Oesterreich)
|
||||
4. Fix OWASP cross-source misattributions (regulation_code vs actual source)
|
||||
5. Hard delete duplicate/too_close controls (3,301 controls, 0 FK refs)
|
||||
6. Clean up canonical_processed_chunks generated_control_ids
|
||||
|
||||
Usage:
|
||||
export DATABASE_URL='postgresql://...'
|
||||
python3 scripts/qa/phase5_normalize_and_cleanup.py [--dry-run] [--step N]
|
||||
"""
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import psycopg2
|
||||
import urllib.parse
|
||||
|
||||
def parse_step_arg(argv):
    """Return the step number selected via ``--step N`` or ``--step=N``.

    Args:
        argv: Argument vector in ``sys.argv`` layout (program name first).

    Returns:
        The requested step as an int, or None when no ``--step`` option is
        present (meaning: run every step). The first ``--step`` option wins.

    Raises:
        SystemExit: If ``--step`` has no value or the value is not an
            integer. Silently falling back to "run every step" would also
            run the hard delete in step 5, so a malformed option is fatal.
    """
    args = list(argv[1:])
    for pos, arg in enumerate(args):
        if arg == "--step":
            value = args[pos + 1] if pos + 1 < len(args) else None
        elif arg.startswith("--step="):
            value = arg.partition("=")[2]
        else:
            continue
        try:
            return int(value)
        except (TypeError, ValueError):
            raise SystemExit(
                f"--step expects an integer step number, got: {value!r}"
            ) from None
    return None


# --dry-run: report what would change; the script never commits in this mode.
DRY_RUN = "--dry-run" in sys.argv
# None means "run all steps"; otherwise only the step with this number runs.
STEP_ONLY = parse_step_arg(sys.argv)
|
||||
|
||||
# Connection settings come from the DATABASE_URL environment variable.
db_url = os.environ.get('DATABASE_URL')
if not db_url:
    sys.exit("DATABASE_URL is not set (export DATABASE_URL='postgresql://...')")

parsed = urllib.parse.urlparse(db_url)
# urlparse() does not decode percent-escapes, so credentials containing
# reserved characters (e.g. '@' written as '%40') must be unquoted before
# they are handed to the driver as separate keyword arguments.
conn = psycopg2.connect(
    host=parsed.hostname,
    port=parsed.port or 5432,
    user=urllib.parse.unquote(parsed.username) if parsed.username else parsed.username,
    password=urllib.parse.unquote(parsed.password) if parsed.password else parsed.password,
    dbname=urllib.parse.unquote(parsed.path.lstrip('/')),
    # Unqualified table names resolve against the compliance schema first.
    options="-c search_path=compliance,public",
)
# Single cursor shared by every step below; all steps run in one transaction.
cur = conn.cursor()
|
||||
|
||||
def should_run(step):
    """Tell whether the numbered step is selected for this invocation.

    Every step is selected when no single step was requested
    (``STEP_ONLY`` is None); otherwise only the matching step number is.
    """
    if STEP_ONLY is None:
        return True
    return step == STEP_ONLY
|
||||
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
# Step 1: OSCAL controls — add source_regulation to generation_metadata
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
if should_run(1):
    print("=" * 70)
    print("STEP 1: OSCAL controls — source_regulation in generation_metadata")
    print("=" * 70)

    # How many OSCAL-imported controls have no (or an empty) source_regulation
    # entry in their generation_metadata?
    cur.execute("""
        SELECT count(*)
        FROM compliance.canonical_controls
        WHERE generation_strategy = 'oscal_import'
          AND (generation_metadata->>'source_regulation' IS NULL
               OR generation_metadata->>'source_regulation' = '')
    """)
    (count,) = cur.fetchone()
    print(f" OSCAL controls without source_regulation: {count}")

    if count > 0 and DRY_RUN:
        print(f" [DRY RUN] Would update {count} controls")
    elif count > 0:
        # Merge the key into the existing metadata; COALESCE covers rows
        # whose generation_metadata is NULL.
        cur.execute("""
            UPDATE compliance.canonical_controls
            SET generation_metadata = COALESCE(generation_metadata, '{}'::jsonb)
                || '{"source_regulation": "nist_sp800_53r5"}'::jsonb
            WHERE generation_strategy = 'oscal_import'
              AND (generation_metadata->>'source_regulation' IS NULL
                   OR generation_metadata->>'source_regulation' = '')
        """)
        print(f" Updated: {cur.rowcount}")
    print()
|
||||
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
# Step 2: v3 controls with NULL source — mark as needs_review (missing_source flag)
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
if should_run(2):
    print("=" * 70)
    print("STEP 2: Fix v3 controls with NULL source")
    print("=" * 70)

    # Pipeline-v3 controls that carry no source in their citation. Per the
    # original author's notes, title analysis puts them near DSGVO, OWASP/NIST
    # and OWASP MASVS topics. No source is guessed here: the rows are only
    # moved to 'needs_review' and flagged so a human can assign one.
    cur.execute("""
        SELECT id, control_id, title
        FROM compliance.canonical_controls
        WHERE source_citation->>'source' IS NULL
          AND pipeline_version = 3
          AND release_state NOT IN ('duplicate', 'too_close')
    """)
    v3_null = cur.fetchall()
    print(f" v3 controls with NULL source: {len(v3_null)}")

    if v3_null and DRY_RUN:
        print(f" [DRY RUN] Would mark {len(v3_null)} as needs_review")
    elif v3_null:
        for row in v3_null:
            # row[0] is the primary key (id); control_id and title are unused here.
            cur.execute("""
                UPDATE compliance.canonical_controls
                SET release_state = 'needs_review',
                    generation_metadata = COALESCE(generation_metadata, '{}'::jsonb)
                        || '{"missing_source": true}'::jsonb
                WHERE id = %s
            """, (row[0],))
        print(f" Marked {len(v3_null)} as needs_review with missing_source flag")
    print()
|
||||
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
# Step 3: Fix empty-string source (DATA-631)
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
if should_run(3):
    print("=" * 70)
    print("STEP 3: Fix empty-string source")
    print("=" * 70)

    # Active controls whose citation has a source key holding an empty string.
    cur.execute("""
        SELECT id, control_id, title,
               generation_metadata->>'source_regulation' as reg
        FROM compliance.canonical_controls
        WHERE source_citation->>'source' = ''
          AND release_state NOT IN ('duplicate', 'too_close')
    """)
    empty_src = cur.fetchall()
    print(f" Controls with empty source: {len(empty_src)}")

    for ctrl_id_uuid, control_id, title, reg in empty_src:
        # A NULL title must not abort the run half-way through this step,
        # so fall back to an empty string before slicing.
        print(f" {control_id} | reg={reg} | {(title or '')[:60]}")

        # Only the Austrian telecom act has a known display name; anything
        # else is labelled as unknown together with its regulation code.
        if reg == 'at_tkg':
            new_source = 'Telekommunikationsgesetz Oesterreich'
        else:
            new_source = f"Unbekannt ({reg})"

        if DRY_RUN:
            print(f" [DRY RUN] Would set source='{new_source}'")
        else:
            # json.dumps turns the Python string into a JSON string literal
            # that the ::jsonb cast accepts.
            cur.execute("""
                UPDATE compliance.canonical_controls
                SET source_citation = jsonb_set(
                    source_citation, '{source}', %s::jsonb
                )
                WHERE id = %s
            """, (json.dumps(new_source), ctrl_id_uuid))
            print(f" Set source='{new_source}'")
    print()
|
||||
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
# Step 4: Fix OWASP cross-source misattributions
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
if should_run(4):
    print("=" * 70)
    print("STEP 4: Fix OWASP cross-source misattributions")
    print("=" * 70)

    # Expected citation source for each OWASP regulation code.
    OWASP_REG_TO_SOURCE = {
        'owasp_top10_2021': 'OWASP Top 10 (2021)',
        'owasp_asvs': 'OWASP ASVS 4.0',
        'owasp_masvs': 'OWASP MASVS 2.0',
        'owasp_samm': 'OWASP SAMM 2.0',
        'owasp_api_top10_2023': 'OWASP API Security Top 10 (2023)',
    }

    # Reverse lookup: the citation source is treated as the truth and the
    # regulation code is moved to match it (e.g. source 'OWASP ASVS 4.0'
    # filed under 'owasp_top10_2021' is re-filed under 'owasp_asvs').
    SOURCE_TO_REG = dict(
        zip(OWASP_REG_TO_SOURCE.values(), OWASP_REG_TO_SOURCE.keys())
    )

    total_fixed = 0
    for reg_code, expected_source in OWASP_REG_TO_SOURCE.items():
        # Active controls filed under this code whose cited source differs.
        cur.execute("""
            SELECT id, control_id, source_citation->>'source' as src
            FROM compliance.canonical_controls
            WHERE generation_metadata->>'source_regulation' = %s
              AND source_citation->>'source' <> %s
              AND release_state NOT IN ('duplicate', 'too_close')
        """, (reg_code, expected_source))
        mismatches = cur.fetchall()
        if not mismatches:
            continue

        print(f"\n {reg_code} → {len(mismatches)} Mismatches:")
        for row_uuid, row_control_id, actual_source in mismatches:
            correct_reg = SOURCE_TO_REG.get(actual_source)
            if not correct_reg:
                # Source is not one of the known OWASP sources; leave the row alone.
                print(f" {row_control_id} | {actual_source} → no mapping found")
                continue

            print(f" {row_control_id} | {actual_source} → reg={correct_reg}")
            if not DRY_RUN:
                cur.execute("""
                    UPDATE compliance.canonical_controls
                    SET generation_metadata = jsonb_set(
                        generation_metadata, '{source_regulation}', %s::jsonb
                    )
                    WHERE id = %s
                """, (json.dumps(correct_reg), row_uuid))
            # Counted in dry-run mode too, so the summary below is meaningful.
            total_fixed += 1

    if DRY_RUN:
        print(f"\n [DRY RUN] Would fix {total_fixed} misattributions")
    else:
        print(f"\n Fixed: {total_fixed} misattributions")
    print()
|
||||
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
# Step 5: Hard delete duplicate/too_close controls
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
if should_run(5):
    print("=" * 70)
    print("STEP 5: Hard delete duplicate/too_close controls")
    print("=" * 70)

    # Every reference that still points at a control about to be deleted is
    # collected first, so one run reports all blockers instead of stopping
    # at the first one.
    blockers = []

    # (table, column) pairs that may reference canonical_controls.id.
    # These are hard-coded identifiers, never user input, so interpolating
    # them into the SQL text below is safe.
    for table, col in [
        ('canonical_control_mappings', 'control_id'),
        ('obligation_extractions', 'control_uuid'),
        ('crosswalk_matrix', 'master_control_uuid'),
        ('obligation_candidates', 'parent_control_uuid'),
    ]:
        cur.execute(f"""
            SELECT count(*)
            FROM compliance.{table} t
            JOIN compliance.canonical_controls cc ON cc.id = t.{col}
            WHERE cc.release_state IN ('duplicate', 'too_close')
        """)
        fk_count = cur.fetchone()[0]
        if fk_count > 0:
            print(f" WARNING: {table}.{col} has {fk_count} refs to dup/too_close!")
            blockers.append(f"{table}.{col}")
        else:
            print(f" {table}.{col}: 0 refs ✓")

    # Controls whose parent is itself scheduled for deletion.
    cur.execute("""
        SELECT count(*)
        FROM compliance.canonical_controls child
        JOIN compliance.canonical_controls parent ON parent.id = child.parent_control_uuid
        WHERE parent.release_state IN ('duplicate', 'too_close')
    """)
    self_refs = cur.fetchone()[0]
    if self_refs > 0:
        print(f" WARNING: {self_refs} child controls reference dup/too_close parents!")
        blockers.append("canonical_controls.parent_control_uuid")
    else:
        print(f" Self-references: 0 ✓")

    if blockers:
        print(f" ABORTING Step 5 — clean FK refs first!")
        # Nothing has been committed yet (the only commit happens after all
        # steps), so this also discards changes made by earlier steps in
        # this run. Make that explicit and release the connection.
        conn.rollback()
        conn.close()
        print(" Rolled back all uncommitted changes from this run.")
        sys.exit(1)

    # Per-state breakdown of what is about to be removed.
    cur.execute("""
        SELECT release_state, count(*)
        FROM compliance.canonical_controls
        WHERE release_state IN ('duplicate', 'too_close')
        GROUP BY 1
    """)
    to_delete = {}
    for state, cnt in cur.fetchall():
        to_delete[state] = cnt
        print(f"\n {state}: {cnt}")

    total = sum(to_delete.values())
    print(f"\n TOTAL to delete: {total}")

    if DRY_RUN:
        print(f" [DRY RUN] Would delete {total} controls")
    else:
        cur.execute("""
            DELETE FROM compliance.canonical_controls
            WHERE release_state IN ('duplicate', 'too_close')
        """)
        print(f" Deleted: {cur.rowcount} controls")
    print()
|
||||
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
# Step 6: Clean up canonical_processed_chunks generated_control_ids
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
if should_run(6):
    print("=" * 70)
    print("STEP 6: Clean up processed chunks (remove deleted control IDs)")
    print("=" * 70)

    if DRY_RUN and should_run(5):
        # Step 5 deleted nothing in a dry run, so there is nothing to find yet.
        print(" [DRY RUN] Skipping — depends on Step 5 deletion")
    else:
        # Chunks that list at least one generated control ID.
        cur.execute("""
            SELECT id, generated_control_ids
            FROM compliance.canonical_processed_chunks
            WHERE generated_control_ids IS NOT NULL
              AND generated_control_ids <> '[]'::jsonb
        """)
        chunks = cur.fetchall()
        print(f" Chunks with generated_control_ids: {len(chunks)}")

        # IDs of all controls that still exist, as text for comparison with
        # the JSON array entries.
        cur.execute("SELECT id::text FROM compliance.canonical_controls")
        existing_ids = {r[0] for r in cur.fetchall()}
        print(f" Existing controls: {len(existing_ids)}")

        cleaned = 0
        for chunk_id, control_ids in chunks:
            # The driver may hand the jsonb value back as raw JSON text.
            if isinstance(control_ids, str):
                control_ids = json.loads(control_ids)
            if not isinstance(control_ids, list):
                continue

            valid_ids = [cid for cid in control_ids if cid in existing_ids]
            if len(valid_ids) == len(control_ids):
                continue  # every referenced control still exists

            # In dry-run mode (reachable via --dry-run --step 6) only count
            # the affected chunks; do not issue any UPDATE.
            if not DRY_RUN:
                cur.execute("""
                    UPDATE compliance.canonical_processed_chunks
                    SET generated_control_ids = %s::jsonb
                    WHERE id = %s
                """, (json.dumps(valid_ids), chunk_id))
            cleaned += 1

        if DRY_RUN:
            print(f" [DRY RUN] Would clean {cleaned} chunks")
        else:
            print(f" Chunks cleaned: {cleaned}")
    print()
|
||||
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
# Final summary
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
# Persist everything in one commit, unless this is a dry run, then report
# the resulting distribution of release states.
if DRY_RUN:
    headline = "[DRY RUN] No changes committed. Current state:"
else:
    conn.commit()
    headline = "COMMITTED. Final state:"
print("=" * 70)
print(headline)
print("=" * 70)

cur.execute("""
    SELECT release_state, count(*)
    FROM compliance.canonical_controls
    GROUP BY 1
    ORDER BY count(*) DESC
""")
state_counts = cur.fetchall()
for state, cnt in state_counts:
    print(f" {state:15s}: {cnt:5d}")

# "Active" means every release state except duplicate / too_close.
total = sum(cnt for _, cnt in state_counts)
active = sum(
    cnt for state, cnt in state_counts
    if state not in ('duplicate', 'too_close')
)

print(f"\n TOTAL: {total}")
print(f" AKTIV: {active}")

conn.close()
|
||||
Reference in New Issue
Block a user