# breakpilot-compliance/scripts/qa/phase5_normalize_and_cleanup.py

"""Phase 5: Source Normalization + Duplicate Hard Delete.

Steps:
  1. OSCAL controls: add source_regulation to generation_metadata
  2. Flag 20 v3 controls with NULL source (set needs_review + missing_source flag)
  3. Fix empty-string source (DATA-631 → Telekommunikationsgesetz Oesterreich)
  4. Fix OWASP cross-source misattributions (regulation_code vs actual source)
  5. Hard delete duplicate/too_close controls (3,301 controls, 0 FK refs)
  6. Clean up canonical_processed_chunks generated_control_ids

Usage:
  export DATABASE_URL='postgresql://...'
  python3 scripts/qa/phase5_normalize_and_cleanup.py [--dry-run] [--step N]
"""
import os
import sys
import json
import psycopg2
import urllib.parse

def parse_step_arg(argv):
    """Extract the step number requested via ``--step N`` or ``--step=N``.

    Args:
        argv: Full command-line argument list (normally ``sys.argv``);
            element 0 (the program name) is ignored.

    Returns:
        The requested step as an int, or None when no ``--step`` option is
        present, which means "run every step". If the option is repeated,
        the last occurrence wins.

    Raises:
        SystemExit: If ``--step`` has no value or the value is not an
            integer. Failing loudly matters here: silently falling back to
            None would run every step, including the hard delete.
    """
    step = None
    args = iter(argv[1:])
    for arg in args:
        if arg == "--step":
            value = next(args, None)
        elif arg.startswith("--step="):
            value = arg.partition("=")[2]
        else:
            # Exact matching only, so unrelated flags that merely start with
            # "--step" are not mistaken for this option.
            continue
        try:
            step = int(value)
        except (TypeError, ValueError):
            raise SystemExit(
                f"--step requires an integer step number, got {value!r}"
            ) from None
    return step


DRY_RUN = "--dry-run" in sys.argv
STEP_ONLY = parse_step_arg(sys.argv)

# Read the connection URL from the environment and fail with a readable
# message (instead of a bare KeyError traceback) when it is missing.
db_url = os.environ.get('DATABASE_URL')
if not db_url:
    sys.exit("DATABASE_URL is not set (expected e.g. 'postgresql://user:pass@host:5432/dbname')")
parsed = urllib.parse.urlparse(db_url)


def _decode_url_part(component):
    """Percent-decode one URL component; None and '' pass through unchanged."""
    return urllib.parse.unquote(component) if component else component


# urlparse() leaves %-escapes in place, so credentials containing reserved
# characters (e.g. '@' written as '%40') must be decoded before they are
# handed to psycopg2 as separate keyword arguments.
conn = psycopg2.connect(
    host=parsed.hostname, port=parsed.port or 5432,
    user=_decode_url_part(parsed.username),
    password=_decode_url_part(parsed.password),
    dbname=_decode_url_part(parsed.path.lstrip('/')),
    # Resolve unqualified table names in the compliance schema first.
    options="-c search_path=compliance,public"
)
cur = conn.cursor()

def should_run(step):
    """Tell whether the numbered step is selected for this run.

    Every step is selected when STEP_ONLY is None; otherwise only the step
    equal to STEP_ONLY is.
    """
    if STEP_ONLY is None:
        return True
    return STEP_ONLY == step


# ══════════════════════════════════════════════════════════════════
# Step 1: OSCAL controls — add source_regulation to generation_metadata
# ══════════════════════════════════════════════════════════════════
if should_run(1):
    rule = "=" * 70
    print(rule)
    print("STEP 1: OSCAL controls — source_regulation in generation_metadata")
    print(rule)

    # Count OSCAL-imported controls whose metadata has no (or an empty)
    # source_regulation entry.
    cur.execute("""
        SELECT count(*)
        FROM compliance.canonical_controls
        WHERE generation_strategy = 'oscal_import'
        AND (generation_metadata->>'source_regulation' IS NULL
             OR generation_metadata->>'source_regulation' = '')
    """)
    (missing_count,) = cur.fetchone()
    print(f"  OSCAL controls without source_regulation: {missing_count}")

    if missing_count > 0:
        if not DRY_RUN:
            # Merge the key into the existing metadata object; COALESCE
            # substitutes an empty object when the column is NULL.
            cur.execute("""
                UPDATE compliance.canonical_controls
                SET generation_metadata = COALESCE(generation_metadata, '{}'::jsonb)
                    || '{"source_regulation": "nist_sp800_53r5"}'::jsonb
                WHERE generation_strategy = 'oscal_import'
                AND (generation_metadata->>'source_regulation' IS NULL
                     OR generation_metadata->>'source_regulation' = '')
            """)
            print(f"  Updated: {cur.rowcount}")
        else:
            print(f"  [DRY RUN] Would update {missing_count} controls")
    print()


# ══════════════════════════════════════════════════════════════════
# Step 2: v3 controls with NULL source — mark as needs_review, flag missing_source
# ══════════════════════════════════════════════════════════════════
if should_run(2):
    rule = "=" * 70
    print(rule)
    print("STEP 2: Fix v3 controls with NULL source")
    print(rule)

    # Pipeline-v3 controls that carry no source in source_citation and are
    # not already marked duplicate/too_close. No source is guessed for them:
    # they are sent back to review and tagged with a missing_source flag.
    cur.execute("""
        SELECT id, control_id, title
        FROM compliance.canonical_controls
        WHERE source_citation->>'source' IS NULL
        AND pipeline_version = 3
        AND release_state NOT IN ('duplicate', 'too_close')
    """)
    v3_null = cur.fetchall()
    print(f"  v3 controls with NULL source: {len(v3_null)}")

    if v3_null:
        if not DRY_RUN:
            # Only the primary key of each selected row is needed for the update.
            id_params = [(row[0],) for row in v3_null]
            cur.executemany("""
                    UPDATE compliance.canonical_controls
                    SET release_state = 'needs_review',
                        generation_metadata = COALESCE(generation_metadata, '{}'::jsonb)
                            || '{"missing_source": true}'::jsonb
                    WHERE id = %s
                """, id_params)
            print(f"  Marked {len(v3_null)} as needs_review with missing_source flag")
        else:
            print(f"  [DRY RUN] Would mark {len(v3_null)} as needs_review")
    print()


# ══════════════════════════════════════════════════════════════════
# Step 3: Fix empty-string source (DATA-631)
# ══════════════════════════════════════════════════════════════════
if should_run(3):
    rule = "=" * 70
    print(rule)
    print("STEP 3: Fix empty-string source")
    print(rule)

    # Controls whose source_citation has a source key holding an empty string.
    cur.execute("""
        SELECT id, control_id, title,
               generation_metadata->>'source_regulation' as reg
        FROM compliance.canonical_controls
        WHERE source_citation->>'source' = ''
        AND release_state NOT IN ('duplicate', 'too_close')
    """)
    empty_src = cur.fetchall()
    print(f"  Controls with empty source: {len(empty_src)}")

    for row_uuid, control_id, title, reg in empty_src:
        print(f"    {control_id} | reg={reg} | {title[:60]}")
        # Only the 'at_tkg' regulation code has a known source name; every
        # other code gets a placeholder that embeds the code itself.
        new_source = (
            'Telekommunikationsgesetz Oesterreich'
            if reg == 'at_tkg'
            else f"Unbekannt ({reg})"
        )

        if not DRY_RUN:
            # json.dumps turns the name into a JSON string literal for jsonb_set.
            cur.execute("""
                UPDATE compliance.canonical_controls
                SET source_citation = jsonb_set(
                    source_citation, '{source}', %s::jsonb
                )
                WHERE id = %s
            """, (json.dumps(new_source), row_uuid))
            print(f"    Set source='{new_source}'")
        else:
            print(f"    [DRY RUN] Would set source='{new_source}'")
    print()


# ══════════════════════════════════════════════════════════════════
# Step 4: Fix OWASP cross-source misattributions
# ══════════════════════════════════════════════════════════════════
if should_run(4):
    rule = "=" * 70
    print(rule)
    print("STEP 4: Fix OWASP cross-source misattributions")
    print(rule)

    # Expected source_citation.source for each OWASP regulation code.
    OWASP_REG_TO_SOURCE = {
        'owasp_top10_2021': 'OWASP Top 10 (2021)',
        'owasp_asvs': 'OWASP ASVS 4.0',
        'owasp_masvs': 'OWASP MASVS 2.0',
        'owasp_samm': 'OWASP SAMM 2.0',
        'owasp_api_top10_2023': 'OWASP API Security Top 10 (2023)',
    }

    # Reverse lookup. The source text is treated as authoritative: a control
    # filed under the wrong regulation code is moved to the code matching its
    # actual source (e.g. source 'OWASP ASVS 4.0' → reg 'owasp_asvs').
    SOURCE_TO_REG = {
        source_name: reg_key
        for reg_key, source_name in OWASP_REG_TO_SOURCE.items()
    }

    total_fixed = 0
    for reg_code, expected_source in OWASP_REG_TO_SOURCE.items():
        # Active controls filed under this code whose source text differs.
        cur.execute("""
            SELECT id, control_id, source_citation->>'source' as src
            FROM compliance.canonical_controls
            WHERE generation_metadata->>'source_regulation' = %s
            AND source_citation->>'source' <> %s
            AND release_state NOT IN ('duplicate', 'too_close')
        """, (reg_code, expected_source))
        mismatches = cur.fetchall()

        if mismatches:
            print(f"\n  {reg_code} → {len(mismatches)} Mismatches:")
            for row_uuid, control_id, found_source in mismatches:
                correct_reg = SOURCE_TO_REG.get(found_source)
                if not correct_reg:
                    # Source is not one of the known OWASP names: report only.
                    print(f"    {control_id} | {found_source} → no mapping found")
                else:
                    print(f"    {control_id} | {found_source} → reg={correct_reg}")
                    if not DRY_RUN:
                        cur.execute("""
                            UPDATE compliance.canonical_controls
                            SET generation_metadata = jsonb_set(
                                generation_metadata, '{source_regulation}', %s::jsonb
                            )
                            WHERE id = %s
                        """, (json.dumps(correct_reg), row_uuid))
                    # Counted in dry-run mode too, so the summary is meaningful.
                    total_fixed += 1

    summary = (
        f"\n  [DRY RUN] Would fix {total_fixed} misattributions"
        if DRY_RUN
        else f"\n  Fixed: {total_fixed} misattributions"
    )
    print(summary)
    print()


# ══════════════════════════════════════════════════════════════════
# Step 5: Hard delete duplicate/too_close controls
# ══════════════════════════════════════════════════════════════════
if should_run(5):
    print("=" * 70)
    print("STEP 5: Hard delete duplicate/too_close controls")
    print("=" * 70)

    # Safety check 1: make sure no row in any dependent table still points at
    # a control that is about to be deleted. Each (table, column) pair is
    # joined against canonical_controls and the matching rows are counted.
    # The table/column names are interpolated into the SQL text, which is
    # acceptable only because they come from this hard-coded list.
    # These checks run in dry-run mode as well.
    for table, col in [
        ('canonical_control_mappings', 'control_id'),
        ('obligation_extractions', 'control_uuid'),
        ('crosswalk_matrix', 'master_control_uuid'),
        ('obligation_candidates', 'parent_control_uuid'),
    ]:
        cur.execute(f"""
            SELECT count(*)
            FROM compliance.{table} t
            JOIN compliance.canonical_controls cc ON cc.id = t.{col}
            WHERE cc.release_state IN ('duplicate', 'too_close')
        """)
        fk_count = cur.fetchone()[0]
        if fk_count > 0:
            print(f"  WARNING: {table}.{col} has {fk_count} refs to dup/too_close!")
            print(f"  ABORTING Step 5 — clean FK refs first!")
            # NOTE: this terminates the whole script, not just this step. The
            # conn.commit() at the end of the script is never reached, so
            # changes made by earlier steps in this run are not committed.
            sys.exit(1)
        else:
            print(f"  {table}.{col}: 0 refs ✓")

    # Safety check 2: controls whose parent_control_uuid points at a control
    # in duplicate/too_close state. The child's own release_state is not
    # considered, so a child that is itself scheduled for deletion still
    # triggers the abort.
    cur.execute("""
        SELECT count(*)
        FROM compliance.canonical_controls child
        JOIN compliance.canonical_controls parent ON parent.id = child.parent_control_uuid
        WHERE parent.release_state IN ('duplicate', 'too_close')
    """)
    self_refs = cur.fetchone()[0]
    if self_refs > 0:
        print(f"  WARNING: {self_refs} child controls reference dup/too_close parents!")
        print(f"  ABORTING Step 5!")
        # Same as above: exits the script before anything is committed.
        sys.exit(1)
    print(f"  Self-references: 0 ✓")

    # Report how many rows fall into each of the two states to be removed.
    cur.execute("""
        SELECT release_state, count(*)
        FROM compliance.canonical_controls
        WHERE release_state IN ('duplicate', 'too_close')
        GROUP BY 1
    """)
    to_delete = {}
    for state, cnt in cur.fetchall():
        to_delete[state] = cnt
        print(f"\n  {state}: {cnt}")

    total = sum(to_delete.values())
    print(f"\n  TOTAL to delete: {total}")

    if DRY_RUN:
        print(f"  [DRY RUN] Would delete {total} controls")
    else:
        # The delete is executed here but only becomes permanent when the
        # script reaches its final conn.commit().
        cur.execute("""
            DELETE FROM compliance.canonical_controls
            WHERE release_state IN ('duplicate', 'too_close')
        """)
        print(f"  Deleted: {cur.rowcount} controls")
    print()


# ══════════════════════════════════════════════════════════════════
# Step 6: Clean up canonical_processed_chunks generated_control_ids
# ══════════════════════════════════════════════════════════════════
if should_run(6):
    print("=" * 70)
    print("STEP 6: Clean up processed chunks (remove deleted control IDs)")
    print("=" * 70)

    if DRY_RUN and should_run(5):
        # Step 5 only simulated its delete in this run, so the controls it
        # would remove still exist and no stale references can be found yet.
        print("  [DRY RUN] Skipping — depends on Step 5 deletion")
    else:
        # Chunks that list at least one generated control ID.
        cur.execute("""
            SELECT id, generated_control_ids
            FROM compliance.canonical_processed_chunks
            WHERE generated_control_ids IS NOT NULL
            AND generated_control_ids <> '[]'::jsonb
        """)
        chunks = cur.fetchall()
        print(f"  Chunks with generated_control_ids: {len(chunks)}")

        # IDs of all controls that currently exist, as text, for O(1)
        # membership tests against the ID lists stored on the chunks.
        cur.execute("SELECT id::text FROM compliance.canonical_controls")
        existing_ids = {row[0] for row in cur.fetchall()}
        print(f"  Existing controls: {len(existing_ids)}")

        cleaned = 0
        for chunk_id, control_ids in chunks:
            # The column value may arrive as a JSON string or as an
            # already-decoded Python object; normalise to the latter.
            if isinstance(control_ids, str):
                control_ids = json.loads(control_ids)
            if not isinstance(control_ids, list):
                continue

            valid_ids = [cid for cid in control_ids if cid in existing_ids]
            if len(valid_ids) == len(control_ids):
                continue  # nothing stale in this chunk

            # In dry-run mode (reachable via --dry-run --step 6) only count
            # the affected chunks; do not issue any writes.
            if not DRY_RUN:
                cur.execute("""
                    UPDATE compliance.canonical_processed_chunks
                    SET generated_control_ids = %s::jsonb
                    WHERE id = %s
                """, (json.dumps(valid_ids), chunk_id))
            cleaned += 1

        if DRY_RUN:
            print(f"  [DRY RUN] Would clean {cleaned} chunks")
        else:
            print(f"  Chunks cleaned: {cleaned}")
    print()


# ══════════════════════════════════════════════════════════════════
# Final summary
# ══════════════════════════════════════════════════════════════════
# Commit everything in one go (real runs only), then print a per-state
# overview of the controls table.
if DRY_RUN:
    headline = "[DRY RUN] No changes committed. Current state:"
else:
    conn.commit()
    headline = "COMMITTED. Final state:"
print("=" * 70)
print(headline)
print("=" * 70)

cur.execute("""
    SELECT release_state, count(*)
    FROM compliance.canonical_controls
    GROUP BY 1
    ORDER BY count(*) DESC
""")
state_counts = cur.fetchall()
for state, cnt in state_counts:
    print(f"  {state:15s}: {cnt:5d}")

# "Active" means every release state except the two slated for deletion.
total = sum(cnt for _, cnt in state_counts)
active = sum(
    cnt for state, cnt in state_counts
    if state not in ('duplicate', 'too_close')
)

print(f"\n  TOTAL:  {total}")
print(f"  AKTIV:  {active}")

conn.close()