"""Phase 5: Source Normalization + Duplicate Hard Delete.

Steps:
  1. OSCAL controls: add source_regulation to generation_metadata
  2. Fix v3 controls with NULL source (mark as needs_review and flag
     them with missing_source)
  3. Fix empty-string source (DATA-631 → Telekommunikationsgesetz Oesterreich)
  4. Fix OWASP cross-source misattributions (regulation_code vs actual source)
  5. Hard delete duplicate/too_close controls (aborts if any FK still
     references them)
  6. Clean up canonical_processed_chunks generated_control_ids

All steps run in a single transaction that is committed once at the end.
With --dry-run nothing is written and nothing is committed.

Usage:
    export DATABASE_URL='postgresql://...'
    python3 scripts/qa/phase5_normalize_and_cleanup.py [--dry-run] [--step N]

``--step N`` and ``--step=N`` are both accepted.
"""
import os
import sys
import json
import psycopg2
import urllib.parse


def _parse_step(argv):
    """Return the step number given via ``--step N`` / ``--step=N``, or None.

    Exits with an error message when the flag is present without a valid
    integer. Silently falling back to "run every step" would also run the
    hard delete in step 5.
    """
    for idx, arg in enumerate(argv):
        if arg == "--step":
            if idx + 1 >= len(argv):
                sys.exit("--step requires a step number (1-6)")
            raw = argv[idx + 1]
        elif arg.startswith("--step="):
            raw = arg.split("=", 1)[1]
        else:
            continue
        try:
            return int(raw)
        except ValueError:
            sys.exit(f"Invalid --step value: {raw!r}")
    return None


def _decode(component):
    """Percent-decode one URL component; None and '' pass through unchanged."""
    return urllib.parse.unquote(component) if component else component


DRY_RUN = "--dry-run" in sys.argv
STEP_ONLY = _parse_step(sys.argv)

db_url = os.environ.get('DATABASE_URL')
if not db_url:
    sys.exit("DATABASE_URL is not set (expected postgresql://user:pass@host:port/dbname)")

parsed = urllib.parse.urlparse(db_url)
conn = psycopg2.connect(
    host=parsed.hostname,
    port=parsed.port or 5432,
    # urlparse leaves percent-encoding in place; decode it so credentials
    # containing reserved characters (e.g. '@' written as %40) work.
    user=_decode(parsed.username),
    password=_decode(parsed.password),
    dbname=_decode(parsed.path.lstrip('/')),
    options="-c search_path=compliance,public"
)
cur = conn.cursor()


def should_run(step):
    """Return True when *step* is selected (no --step given, or it matches)."""
    return STEP_ONLY is None or STEP_ONLY == step


# Everything below runs inside try/finally so the connection is closed on
# every exit path, including sys.exit() in step 5 and unexpected errors.
# Closing without commit discards the open transaction.
try:
    # ══════════════════════════════════════════════════════════════════
    # Step 1: OSCAL controls — add source_regulation to generation_metadata
    # ══════════════════════════════════════════════════════════════════
    if should_run(1):
        print("=" * 70)
        print("STEP 1: OSCAL controls — source_regulation in generation_metadata")
        print("=" * 70)

        cur.execute("""
            SELECT count(*)
            FROM compliance.canonical_controls
            WHERE generation_strategy = 'oscal_import'
              AND (generation_metadata->>'source_regulation' IS NULL
                   OR generation_metadata->>'source_regulation' = '')
        """)
        count = cur.fetchone()[0]
        print(f" OSCAL controls without source_regulation: {count}")

        if count > 0:
            if DRY_RUN:
                print(f" [DRY RUN] Would update {count} controls")
            else:
                # COALESCE keeps the merge working when generation_metadata
                # itself is NULL.
                cur.execute("""
                    UPDATE compliance.canonical_controls
                    SET generation_metadata = COALESCE(generation_metadata, '{}'::jsonb)
                        || '{"source_regulation": "nist_sp800_53r5"}'::jsonb
                    WHERE generation_strategy = 'oscal_import'
                      AND (generation_metadata->>'source_regulation' IS NULL
                           OR generation_metadata->>'source_regulation' = '')
                """)
                print(f" Updated: {cur.rowcount}")
        print()

    # ══════════════════════════════════════════════════════════════════
    # Step 2: v3 controls with NULL source — mark for review
    # ══════════════════════════════════════════════════════════════════
    if should_run(2):
        print("=" * 70)
        print("STEP 2: Fix v3 controls with NULL source")
        print("=" * 70)

        # These controls are v3/document_grouped with no source or regulation.
        # Based on title analysis, they cover:
        # - Data protection/privacy topics (DSGVO-adjacent)
        # - Software security (OWASP/NIST-adjacent)
        # - Mobile security (OWASP MASVS-adjacent)
        # Mark them as 'needs_review' and add a flag.
        cur.execute("""
            SELECT id, control_id, title
            FROM compliance.canonical_controls
            WHERE source_citation->>'source' IS NULL
              AND pipeline_version = 3
              AND release_state NOT IN ('duplicate', 'too_close')
        """)
        v3_null = cur.fetchall()
        print(f" v3 controls with NULL source: {len(v3_null)}")

        if v3_null:
            if DRY_RUN:
                print(f" [DRY RUN] Would mark {len(v3_null)} as needs_review")
            else:
                for ctrl_id_uuid, _control_id, _title in v3_null:
                    cur.execute("""
                        UPDATE compliance.canonical_controls
                        SET release_state = 'needs_review',
                            generation_metadata = COALESCE(generation_metadata, '{}'::jsonb)
                                || '{"missing_source": true}'::jsonb
                        WHERE id = %s
                    """, (ctrl_id_uuid,))
                print(f" Marked {len(v3_null)} as needs_review with missing_source flag")
        print()

    # ══════════════════════════════════════════════════════════════════
    # Step 3: Fix empty-string source (DATA-631)
    # ══════════════════════════════════════════════════════════════════
    if should_run(3):
        print("=" * 70)
        print("STEP 3: Fix empty-string source")
        print("=" * 70)

        cur.execute("""
            SELECT id, control_id, title,
                   generation_metadata->>'source_regulation' as reg
            FROM compliance.canonical_controls
            WHERE source_citation->>'source' = ''
              AND release_state NOT IN ('duplicate', 'too_close')
        """)
        empty_src = cur.fetchall()
        print(f" Controls with empty source: {len(empty_src)}")

        for ctrl_id_uuid, control_id, title, reg in empty_src:
            # title may be NULL in the database; guard the slice.
            print(f" {control_id} | reg={reg} | {(title or '')[:60]}")
            if reg == 'at_tkg':
                new_source = 'Telekommunikationsgesetz Oesterreich'
            else:
                new_source = f"Unbekannt ({reg})"

            if DRY_RUN:
                print(f" [DRY RUN] Would set source='{new_source}'")
            else:
                # json.dumps produces a valid JSON string literal for the
                # ::jsonb cast.
                cur.execute("""
                    UPDATE compliance.canonical_controls
                    SET source_citation = jsonb_set(
                        source_citation, '{source}', %s::jsonb
                    )
                    WHERE id = %s
                """, (json.dumps(new_source), ctrl_id_uuid))
                print(f" Set source='{new_source}'")
        print()

    # ══════════════════════════════════════════════════════════════════
    # Step 4: Fix OWASP cross-source misattributions
    # ══════════════════════════════════════════════════════════════════
    if should_run(4):
        print("=" * 70)
        print("STEP 4: Fix OWASP cross-source misattributions")
        print("=" * 70)

        # Controls where source_citation.source doesn't match the regulation_code
        OWASP_REG_TO_SOURCE = {
            'owasp_top10_2021': 'OWASP Top 10 (2021)',
            'owasp_asvs': 'OWASP ASVS 4.0',
            'owasp_masvs': 'OWASP MASVS 2.0',
            'owasp_samm': 'OWASP SAMM 2.0',
            'owasp_api_top10_2023': 'OWASP API Security Top 10 (2023)',
        }

        # Strategy: move controls to the regulation_code that matches their
        # actual source, i.e. if a control has source='OWASP ASVS 4.0' but
        # reg='owasp_top10_2021', update the reg to 'owasp_asvs'.
        SOURCE_TO_REG = {v: k for k, v in OWASP_REG_TO_SOURCE.items()}

        total_fixed = 0
        for reg_code, expected_source in OWASP_REG_TO_SOURCE.items():
            cur.execute("""
                SELECT id, control_id, source_citation->>'source' as src
                FROM compliance.canonical_controls
                WHERE generation_metadata->>'source_regulation' = %s
                  AND source_citation->>'source' <> %s
                  AND release_state NOT IN ('duplicate', 'too_close')
            """, (reg_code, expected_source))
            mismatches = cur.fetchall()

            if mismatches:
                print(f"\n {reg_code} → {len(mismatches)} Mismatches:")
                for ctrl_id_uuid, control_id, actual_source in mismatches:
                    correct_reg = SOURCE_TO_REG.get(actual_source)
                    if correct_reg:
                        print(f" {control_id} | {actual_source} → reg={correct_reg}")
                        if not DRY_RUN:
                            cur.execute("""
                                UPDATE compliance.canonical_controls
                                SET generation_metadata = jsonb_set(
                                    generation_metadata, '{source_regulation}', %s::jsonb
                                )
                                WHERE id = %s
                            """, (json.dumps(correct_reg), ctrl_id_uuid))
                        # Counted in dry-run too so "Would fix N" is meaningful.
                        total_fixed += 1
                    else:
                        print(f" {control_id} | {actual_source} → no mapping found")

        if DRY_RUN:
            print(f"\n [DRY RUN] Would fix {total_fixed} misattributions")
        else:
            print(f"\n Fixed: {total_fixed} misattributions")
        print()

    # ══════════════════════════════════════════════════════════════════
    # Step 5: Hard delete duplicate/too_close controls
    # ══════════════════════════════════════════════════════════════════
    if should_run(5):
        print("=" * 70)
        print("STEP 5: Hard delete duplicate/too_close controls")
        print("=" * 70)

        # Verify no FK references. Table/column names come from this
        # hard-coded list only, so interpolating them into the SQL is safe.
        for table, col in [
            ('canonical_control_mappings', 'control_id'),
            ('obligation_extractions', 'control_uuid'),
            ('crosswalk_matrix', 'master_control_uuid'),
            ('obligation_candidates', 'parent_control_uuid'),
        ]:
            cur.execute(f"""
                SELECT count(*)
                FROM compliance.{table} t
                JOIN compliance.canonical_controls cc ON cc.id = t.{col}
                WHERE cc.release_state IN ('duplicate', 'too_close')
            """)
            fk_count = cur.fetchone()[0]
            if fk_count > 0:
                print(f" WARNING: {table}.{col} has {fk_count} refs to dup/too_close!")
                print(" ABORTING Step 5 — clean FK refs first!")
                # Nothing has been committed yet; the finally block closes
                # the connection, discarding changes from earlier steps.
                sys.exit(1)
            else:
                print(f" {table}.{col}: 0 refs ✓")

        # Check self-references
        cur.execute("""
            SELECT count(*)
            FROM compliance.canonical_controls child
            JOIN compliance.canonical_controls parent
              ON parent.id = child.parent_control_uuid
            WHERE parent.release_state IN ('duplicate', 'too_close')
        """)
        self_refs = cur.fetchone()[0]
        if self_refs > 0:
            print(f" WARNING: {self_refs} child controls reference dup/too_close parents!")
            print(" ABORTING Step 5!")
            sys.exit(1)
        print(" Self-references: 0 ✓")

        cur.execute("""
            SELECT release_state, count(*)
            FROM compliance.canonical_controls
            WHERE release_state IN ('duplicate', 'too_close')
            GROUP BY 1
        """)
        to_delete = {}
        for state, cnt in cur.fetchall():
            to_delete[state] = cnt
            print(f"\n {state}: {cnt}")

        total = sum(to_delete.values())
        print(f"\n TOTAL to delete: {total}")

        if DRY_RUN:
            print(f" [DRY RUN] Would delete {total} controls")
        else:
            cur.execute("""
                DELETE FROM compliance.canonical_controls
                WHERE release_state IN ('duplicate', 'too_close')
            """)
            print(f" Deleted: {cur.rowcount} controls")
        print()

    # ══════════════════════════════════════════════════════════════════
    # Step 6: Clean up canonical_processed_chunks generated_control_ids
    # ══════════════════════════════════════════════════════════════════
    if should_run(6):
        print("=" * 70)
        print("STEP 6: Clean up processed chunks (remove deleted control IDs)")
        print("=" * 70)

        if DRY_RUN and should_run(5):
            # Step 5 deleted nothing in dry-run mode, so there is nothing
            # meaningful to measure here.
            print(" [DRY RUN] Skipping — depends on Step 5 deletion")
        else:
            # Find chunks that reference non-existent controls
            cur.execute("""
                SELECT id, generated_control_ids
                FROM compliance.canonical_processed_chunks
                WHERE generated_control_ids IS NOT NULL
                  AND generated_control_ids <> '[]'::jsonb
            """)
            chunks = cur.fetchall()
            print(f" Chunks with generated_control_ids: {len(chunks)}")

            # Get all existing control IDs
            cur.execute("SELECT id::text FROM compliance.canonical_controls")
            existing_ids = {row[0] for row in cur.fetchall()}
            print(f" Existing controls: {len(existing_ids)}")

            cleaned = 0
            for chunk_id, control_ids in chunks:
                # The driver may hand the column back as raw JSON text.
                if isinstance(control_ids, str):
                    control_ids = json.loads(control_ids)
                if not isinstance(control_ids, list):
                    continue
                valid_ids = [cid for cid in control_ids if cid in existing_ids]
                if len(valid_ids) == len(control_ids):
                    continue
                # Like the other steps, dry-run only counts and never writes.
                if not DRY_RUN:
                    cur.execute("""
                        UPDATE compliance.canonical_processed_chunks
                        SET generated_control_ids = %s::jsonb
                        WHERE id = %s
                    """, (json.dumps(valid_ids), chunk_id))
                cleaned += 1

            if DRY_RUN:
                print(f" [DRY RUN] Would clean {cleaned} chunks")
            else:
                print(f" Chunks cleaned: {cleaned}")
        print()

    # ══════════════════════════════════════════════════════════════════
    # Final summary
    # ══════════════════════════════════════════════════════════════════
    if not DRY_RUN:
        conn.commit()
        print("=" * 70)
        print("COMMITTED. Final state:")
        print("=" * 70)
    else:
        print("=" * 70)
        print("[DRY RUN] No changes committed. Current state:")
        print("=" * 70)

    cur.execute("""
        SELECT release_state, count(*)
        FROM compliance.canonical_controls
        GROUP BY 1
        ORDER BY count(*) DESC
    """)
    total = 0
    active = 0
    for state, cnt in cur.fetchall():
        total += cnt
        if state not in ('duplicate', 'too_close'):
            active += cnt
        # str() so a NULL release_state cannot break the format spec.
        print(f" {str(state):15s}: {cnt:5d}")

    print(f"\n TOTAL: {total}")
    print(f" AKTIV: {active}")
finally:
    conn.close()