Files
breakpilot-compliance/scripts/qa/phase5_normalize_and_cleanup.py
Benjamin Admin 643b26618f
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 31s
CI/CD / test-python-backend-compliance (push) Successful in 1m35s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 17s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
feat: Control Library UI, dedup migration, QA tooling, docs
- Control Library: parent control display, ObligationTypeBadge,
  GenerationStrategyBadge variants, evidence string fallback
- API: expose parent_control_uuid/id/title in canonical controls
- Fix: DSFA SQLAlchemy 2.0 Row._mapping compatibility
- Migration 074: control_parent_links + control_dedup_reviews tables
- QA scripts: benchmark, gap analysis, OSCAL import, OWASP cleanup,
  phase5 normalize, phase74 gap fill, sync_db, run_job
- Docs: dedup engine, RAG benchmark, lessons learned, pipeline docs

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-21 11:56:08 +01:00

358 lines
15 KiB
Python

"""Phase 5: Source Normalization + Duplicate Hard Delete.
Steps:
1. OSCAL controls: add source_regulation to generation_metadata
2. Fix 20 v3 controls with NULL source (mark as needs_review, flag missing_source)
3. Fix empty-string source (DATA-631 → Telekommunikationsgesetz Oesterreich)
4. Fix OWASP cross-source misattributions (regulation_code vs actual source)
5. Hard delete duplicate/too_close controls (3,301 controls, 0 FK refs)
6. Clean up canonical_processed_chunks generated_control_ids
Usage:
export DATABASE_URL='postgresql://...'
python3 scripts/qa/phase5_normalize_and_cleanup.py [--dry-run] [--step N]
"""
import os
import sys
import json
import psycopg2
import urllib.parse
def parse_step_arg(argv):
    """Return the step number requested on the command line, or None.

    Recognizes ``--step N`` and ``--step=N`` among ``argv[1:]``. Returns
    None when no step option is present, meaning "run every step".

    Exits the process (SystemExit) when a step option is present but
    malformed: missing value, non-integer value, or an unknown spelling such
    as ``--steps``. Falling back to "run every step" in those cases would
    silently execute the destructive steps the caller meant to exclude.
    """
    for pos in range(1, len(argv)):
        arg = argv[pos]
        if not arg.startswith("--step"):
            continue
        if arg == "--step":
            if pos + 1 >= len(argv):
                sys.exit("error: --step requires a step number (e.g. --step 5)")
            raw = argv[pos + 1]
        elif arg.startswith("--step="):
            raw = arg[len("--step="):]
        else:
            sys.exit(f"error: unrecognized option {arg!r} (expected --step N)")
        try:
            return int(raw)
        except ValueError:
            sys.exit(f"error: --step expects an integer, got {raw!r}")
    return None


# --dry-run: report what would change without writing or committing.
DRY_RUN = "--dry-run" in sys.argv
# Step number to run exclusively, or None to run all steps.
STEP_ONLY = parse_step_arg(sys.argv)
def _decoded(component):
    """Percent-decode a URL component; a missing (None) component stays None."""
    return None if component is None else urllib.parse.unquote(component)


# The connection target is taken from the DATABASE_URL environment variable.
db_url = os.environ.get('DATABASE_URL')
if not db_url:
    sys.exit("error: DATABASE_URL is not set (export DATABASE_URL='postgresql://...')")
parsed = urllib.parse.urlparse(db_url)
conn = psycopg2.connect(
    host=parsed.hostname,
    port=parsed.port or 5432,
    # urlparse does not expand %-escapes, so credentials containing reserved
    # characters (e.g. '@' written as %40) must be decoded before use.
    user=_decoded(parsed.username),
    password=_decoded(parsed.password),
    dbname=_decoded(parsed.path.lstrip('/')),
    # Resolve unqualified table names in the compliance schema first.
    options="-c search_path=compliance,public",
)
cur = conn.cursor()
def should_run(step):
    """Tell whether the numbered step is selected for this invocation.

    Every step is selected when no step filter is active (STEP_ONLY is
    None); otherwise only the step whose number equals STEP_ONLY.
    """
    if STEP_ONLY is None:
        return True
    return STEP_ONLY == step
# ══════════════════════════════════════════════════════════════════
# Step 1: OSCAL controls — add source_regulation to generation_metadata
# ══════════════════════════════════════════════════════════════════
if should_run(1):
    rule = "=" * 70
    print(rule)
    print("STEP 1: OSCAL controls — source_regulation in generation_metadata")
    print(rule)

    # Shared filter: OSCAL-imported controls whose generation_metadata has
    # no source_regulation entry, or an empty one.
    missing_reg_filter = (
        "WHERE generation_strategy = 'oscal_import'\n"
        "AND (generation_metadata->>'source_regulation' IS NULL\n"
        "OR generation_metadata->>'source_regulation' = '')\n"
    )

    cur.execute(
        "\n"
        "SELECT count(*)\n"
        "FROM compliance.canonical_controls\n"
        + missing_reg_filter
    )
    (missing,) = cur.fetchone()
    print(f" OSCAL controls without source_regulation: {missing}")

    if missing > 0 and DRY_RUN:
        print(f" [DRY RUN] Would update {missing} controls")
    elif missing > 0:
        # Merge the regulation key into the existing metadata object
        # (or into an empty object when the column is NULL).
        cur.execute(
            "\n"
            "UPDATE compliance.canonical_controls\n"
            "SET generation_metadata = COALESCE(generation_metadata, '{}'::jsonb)\n"
            "|| '{\"source_regulation\": \"nist_sp800_53r5\"}'::jsonb\n"
            + missing_reg_filter
        )
        print(f" Updated: {cur.rowcount}")
    print()
# ══════════════════════════════════════════════════════════════════
# Step 2: v3 controls with NULL source — mark as needs_review
# ══════════════════════════════════════════════════════════════════
if should_run(2):
    rule = "=" * 70
    print(rule)
    print("STEP 2: Fix v3 controls with NULL source")
    print(rule)

    # Pipeline-version-3 controls that carry no source in source_citation
    # and are not already marked duplicate/too_close. They are not assigned
    # a source here; they are sent back to review with a missing_source flag.
    cur.execute(
        "\n"
        "SELECT id, control_id, title\n"
        "FROM compliance.canonical_controls\n"
        "WHERE source_citation->>'source' IS NULL\n"
        "AND pipeline_version = 3\n"
        "AND release_state NOT IN ('duplicate', 'too_close')\n"
    )
    sourceless = cur.fetchall()
    print(f" v3 controls with NULL source: {len(sourceless)}")

    if sourceless and DRY_RUN:
        print(f" [DRY RUN] Would mark {len(sourceless)} as needs_review")
    elif sourceless:
        flag_sql = (
            "\n"
            "UPDATE compliance.canonical_controls\n"
            "SET release_state = 'needs_review',\n"
            "generation_metadata = COALESCE(generation_metadata, '{}'::jsonb)\n"
            "|| '{\"missing_source\": true}'::jsonb\n"
            "WHERE id = %s\n"
        )
        # Only the primary key (first column) is needed for the update.
        for row in sourceless:
            cur.execute(flag_sql, (row[0],))
        print(f" Marked {len(sourceless)} as needs_review with missing_source flag")
    print()
# ══════════════════════════════════════════════════════════════════
# Step 3: Fix empty-string source (DATA-631)
# ══════════════════════════════════════════════════════════════════
if should_run(3):
    rule = "=" * 70
    print(rule)
    print("STEP 3: Fix empty-string source")
    print(rule)

    # Controls whose source_citation.source is the empty string, together
    # with the regulation code recorded in their generation metadata.
    cur.execute(
        "\n"
        "SELECT id, control_id, title,\n"
        "generation_metadata->>'source_regulation' as reg\n"
        "FROM compliance.canonical_controls\n"
        "WHERE source_citation->>'source' = ''\n"
        "AND release_state NOT IN ('duplicate', 'too_close')\n"
    )
    blank_rows = cur.fetchall()
    print(f" Controls with empty source: {len(blank_rows)}")

    set_source_sql = (
        "\n"
        "UPDATE compliance.canonical_controls\n"
        "SET source_citation = jsonb_set(\n"
        "source_citation, '{source}', %s::jsonb\n"
        ")\n"
        "WHERE id = %s\n"
    )
    for row_uuid, control_id, title, reg in blank_rows:
        print(f" {control_id} | reg={reg} | {title[:60]}")
        # Only the 'at_tkg' regulation code has a known source name; any
        # other code is recorded as unknown, keeping the code visible.
        new_source = (
            'Telekommunikationsgesetz Oesterreich'
            if reg == 'at_tkg'
            else f"Unbekannt ({reg})"
        )
        if DRY_RUN:
            print(f" [DRY RUN] Would set source='{new_source}'")
            continue
        # json.dumps yields a JSON string literal that is cast to jsonb.
        cur.execute(set_source_sql, (json.dumps(new_source), row_uuid))
        print(f" Set source='{new_source}'")
    print()
# ══════════════════════════════════════════════════════════════════
# Step 4: Fix OWASP cross-source misattributions
# ══════════════════════════════════════════════════════════════════
if should_run(4):
    print("=" * 70)
    print("STEP 4: Fix OWASP cross-source misattributions")
    print("=" * 70)

    # Expected source_citation.source for each OWASP regulation code.
    OWASP_REG_TO_SOURCE = {
        'owasp_top10_2021': 'OWASP Top 10 (2021)',
        'owasp_asvs': 'OWASP ASVS 4.0',
        'owasp_masvs': 'OWASP MASVS 2.0',
        'owasp_samm': 'OWASP SAMM 2.0',
        'owasp_api_top10_2023': 'OWASP API Security Top 10 (2023)',
    }
    # Strategy: the cited source is treated as authoritative. A control with
    # source='OWASP ASVS 4.0' but reg='owasp_top10_2021' gets its regulation
    # code rewritten to 'owasp_asvs'; the source itself is never changed.
    SOURCE_TO_REG = {v: k for k, v in OWASP_REG_TO_SOURCE.items()}

    find_mismatches_sql = (
        "\n"
        "SELECT id, control_id, source_citation->>'source' as src\n"
        "FROM compliance.canonical_controls\n"
        "WHERE generation_metadata->>'source_regulation' = %s\n"
        "AND source_citation->>'source' <> %s\n"
        "AND release_state NOT IN ('duplicate', 'too_close')\n"
    )
    set_regulation_sql = (
        "\n"
        "UPDATE compliance.canonical_controls\n"
        "SET generation_metadata = jsonb_set(\n"
        "generation_metadata, '{source_regulation}', %s::jsonb\n"
        ")\n"
        "WHERE id = %s\n"
    )

    # Counts controls with a known target code; in dry-run mode this is the
    # number that would be fixed.
    total_fixed = 0
    for reg_code, expected_source in OWASP_REG_TO_SOURCE.items():
        cur.execute(find_mismatches_sql, (reg_code, expected_source))
        mismatches = cur.fetchall()
        if not mismatches:
            continue
        # Separator added: the code and the count were previously printed
        # back to back (e.g. "owasp_top10_202112 Mismatches").
        print(f"\n {reg_code}: {len(mismatches)} Mismatches:")
        for ctrl_id_uuid, control_id, actual_source in mismatches:
            correct_reg = SOURCE_TO_REG.get(actual_source)
            if correct_reg:
                print(f" {control_id} | {actual_source} → reg={correct_reg}")
                if not DRY_RUN:
                    cur.execute(
                        set_regulation_sql,
                        (json.dumps(correct_reg), ctrl_id_uuid),
                    )
                total_fixed += 1
            else:
                # Source is not one of the known OWASP documents; leave it.
                print(f" {control_id} | {actual_source} → no mapping found")

    if DRY_RUN:
        print(f"\n [DRY RUN] Would fix {total_fixed} misattributions")
    else:
        print(f"\n Fixed: {total_fixed} misattributions")
    print()
# ══════════════════════════════════════════════════════════════════
# Step 5: Hard delete duplicate/too_close controls
# ══════════════════════════════════════════════════════════════════
if should_run(5):
    rule = "=" * 70
    print(rule)
    print("STEP 5: Hard delete duplicate/too_close controls")
    print(rule)

    # Safety gate: no row in any referencing table may still point at a
    # control that is about to be deleted. Any hit aborts the whole run.
    referencing_columns = [
        ('canonical_control_mappings', 'control_id'),
        ('obligation_extractions', 'control_uuid'),
        ('crosswalk_matrix', 'master_control_uuid'),
        ('obligation_candidates', 'parent_control_uuid'),
    ]
    for table, col in referencing_columns:
        # table/col come from the fixed list above, not from user input.
        cur.execute(
            "\n"
            "SELECT count(*)\n"
            f"FROM compliance.{table} t\n"
            f"JOIN compliance.canonical_controls cc ON cc.id = t.{col}\n"
            "WHERE cc.release_state IN ('duplicate', 'too_close')\n"
        )
        fk_count = cur.fetchone()[0]
        if fk_count > 0:
            print(f" WARNING: {table}.{col} has {fk_count} refs to dup/too_close!")
            print(" ABORTING Step 5 — clean FK refs first!")
            sys.exit(1)
        print(f" {table}.{col}: 0 refs ✓")

    # Same gate for controls whose parent is a control slated for deletion.
    cur.execute(
        "\n"
        "SELECT count(*)\n"
        "FROM compliance.canonical_controls child\n"
        "JOIN compliance.canonical_controls parent ON parent.id = child.parent_control_uuid\n"
        "WHERE parent.release_state IN ('duplicate', 'too_close')\n"
    )
    self_refs = cur.fetchone()[0]
    if self_refs > 0:
        print(f" WARNING: {self_refs} child controls reference dup/too_close parents!")
        print(" ABORTING Step 5!")
        sys.exit(1)
    print(" Self-references: 0 ✓")

    # Report how many rows each state contributes before deleting.
    cur.execute(
        "\n"
        "SELECT release_state, count(*)\n"
        "FROM compliance.canonical_controls\n"
        "WHERE release_state IN ('duplicate', 'too_close')\n"
        "GROUP BY 1\n"
    )
    per_state = dict(cur.fetchall())
    for state, cnt in per_state.items():
        print(f"\n {state}: {cnt}")
    doomed_total = sum(per_state.values())
    print(f"\n TOTAL to delete: {doomed_total}")

    if DRY_RUN:
        print(f" [DRY RUN] Would delete {doomed_total} controls")
    else:
        cur.execute(
            "\n"
            "DELETE FROM compliance.canonical_controls\n"
            "WHERE release_state IN ('duplicate', 'too_close')\n"
        )
        print(f" Deleted: {cur.rowcount} controls")
    print()
# ══════════════════════════════════════════════════════════════════
# Step 6: Clean up canonical_processed_chunks generated_control_ids
# ══════════════════════════════════════════════════════════════════
if should_run(6):
    print("=" * 70)
    print("STEP 6: Clean up processed chunks (remove deleted control IDs)")
    print("=" * 70)

    if DRY_RUN and should_run(5):
        # Step 5 deleted nothing in dry-run mode, so there is nothing
        # meaningful to compare the chunks against.
        print(" [DRY RUN] Skipping — depends on Step 5 deletion")
    else:
        # Chunks that list at least one generated control ID.
        cur.execute(
            "\n"
            "SELECT id, generated_control_ids\n"
            "FROM compliance.canonical_processed_chunks\n"
            "WHERE generated_control_ids IS NOT NULL\n"
            "AND generated_control_ids <> '[]'::jsonb\n"
        )
        chunks = cur.fetchall()
        print(f" Chunks with generated_control_ids: {len(chunks)}")

        # IDs of all controls that still exist, as text for comparison with
        # the IDs stored in the chunk JSON.
        cur.execute("SELECT id::text FROM compliance.canonical_controls")
        existing_ids = {r[0] for r in cur.fetchall()}
        print(f" Existing controls: {len(existing_ids)}")

        prune_sql = (
            "\n"
            "UPDATE compliance.canonical_processed_chunks\n"
            "SET generated_control_ids = %s::jsonb\n"
            "WHERE id = %s\n"
        )
        cleaned = 0
        for chunk_id, control_ids in chunks:
            # The driver may hand the JSON column back as text.
            if isinstance(control_ids, str):
                control_ids = json.loads(control_ids)
            if not isinstance(control_ids, list):
                continue
            valid_ids = [cid for cid in control_ids if cid in existing_ids]
            if len(valid_ids) == len(control_ids):
                continue  # every referenced control still exists
            cleaned += 1
            # Like the other steps, a dry run reports but never writes.
            if not DRY_RUN:
                cur.execute(prune_sql, (json.dumps(valid_ids), chunk_id))

        if DRY_RUN:
            print(f" [DRY RUN] Would clean {cleaned} chunks")
        else:
            print(f" Chunks cleaned: {cleaned}")
    print()
# ══════════════════════════════════════════════════════════════════
# Final summary
# ══════════════════════════════════════════════════════════════════
# Commit (unless this is a dry run), then print the per-state control counts.
rule = "=" * 70
if DRY_RUN:
    print(rule)
    print("[DRY RUN] No changes committed. Current state:")
    print(rule)
else:
    conn.commit()
    print(rule)
    print("COMMITTED. Final state:")
    print(rule)

cur.execute(
    "\n"
    "SELECT release_state, count(*)\n"
    "FROM compliance.canonical_controls\n"
    "GROUP BY 1\n"
    "ORDER BY count(*) DESC\n"
)
state_rows = cur.fetchall()
for state, cnt in state_rows:
    print(f" {state:15s}: {cnt:5d}")

# "Active" means every control not in the duplicate/too_close states.
total = sum(cnt for _, cnt in state_rows)
active = sum(
    cnt for state, cnt in state_rows
    if state not in ('duplicate', 'too_close')
)
print(f"\n TOTAL: {total}")
print(f" AKTIV: {active}")
conn.close()