"""
Task 3: Normalize source_citation.source names.

Same regulation has different source names from different pipeline runs.
Standardize to one canonical name per regulation.

Usage:
    Pass ``--dry-run`` on the command line to print the planned changes
    without writing anything. The database URL is read from the
    ``DATABASE_URL`` environment variable.
"""
import json
import os
import sys

from sqlalchemy import create_engine, text as sql_text

DB_URL = os.environ['DATABASE_URL']
engine = create_engine(DB_URL, connect_args={"options": "-c search_path=compliance,public"})
DRY_RUN = '--dry-run' in sys.argv

# Canonical source names per regulation
# (keyed by generation_metadata->>'source_regulation').
SOURCE_NAMES = {
    "eu_2023_1230": "Maschinenverordnung (EU) 2023/1230",
    "eu_2024_2847": "Cyber Resilience Act (CRA)",
    "eu_2024_1689": "KI-Verordnung (EU) 2024/1689",
    "eu_2022_2555": "NIS2-Richtlinie (EU) 2022/2555",
    "eu_2016_679": "DSGVO (EU) 2016/679",
    "eu_blue_guide_2022": "EU Blue Guide 2022",
    "nist_sp800_53r5": "NIST SP 800-53 Rev. 5",
    "nist_sp_800_218": "NIST SP 800-218 (SSDF)",
    "nist_csf_2_0": "NIST Cybersecurity Framework 2.0",
    "nist_sp800_63_3": "NIST SP 800-63-3",
    "nist_sp800_207": "NIST SP 800-207 (Zero Trust)",
    "nist_ai_rmf": "NIST AI Risk Management Framework",
    "owasp_top10_2021": "OWASP Top 10 (2021)",
    "owasp_asvs": "OWASP ASVS 4.0",
    "owasp_samm": "OWASP SAMM 2.0",
    "owasp_api_top10_2023": "OWASP API Security Top 10 (2023)",
    "owasp_masvs": "OWASP MASVS 2.0",
    "cisa_secure_by_design": "CISA Secure by Design",
    "enisa_ics_scada_dependencies": "ENISA ICS/SCADA Dependencies",
    "enisa_supply_chain_good_practices": "ENISA Supply Chain Good Practices",
    "enisa_threat_landscape_supply_chain": "ENISA Threat Landscape Supply Chain",
    "enisa_cybersecurity_state_2024": "ENISA Cybersecurity State 2024",
    "oecd_ai_principles": "OECD KI-Empfehlung",
    "gpsr": "Allgemeine Produktsicherheitsverordnung (GPSR)",
    "eu_2023_1542": "Batterieverordnung (EU) 2023/1542",
    "mica": "Markets in Crypto-Assets (MiCA)",
    "eu_2022_868": "Data Governance Act (DGA)",
    "dataact": "Data Act",
    "eucsa": "EU Cybersecurity Act (EUCSA)",
    "eaa": "European Accessibility Act (EAA)",
    "eu_2023_1803": "IFRS-Übernahmeverordnung",
    "amlr": "AML-Verordnung",
    "bdsg_2018_komplett": "Bundesdatenschutzgesetz (BDSG)",
    "bdsg": "Bundesdatenschutzgesetz (BDSG)",
}

print("=" * 60)
print("TASK 3: NORMALIZE SOURCE NAMES")
print("=" * 60)

# One transaction for the whole run: the verification query below reads
# through the same connection, so it sees the updates made in this block.
with engine.begin() as conn:
    # Find all current source_name variants
    r = conn.execute(sql_text("""
        SELECT generation_metadata->>'source_regulation' as reg,
               source_citation->>'source' as current_name,
               count(*) as cnt
        FROM compliance.canonical_controls
        WHERE source_citation IS NOT NULL
          AND generation_metadata->>'source_regulation' IS NOT NULL
        GROUP BY 1, 2
        ORDER BY 1, cnt DESC
    """))

    updates = []        # (regulation, old name, canonical name, row count)
    missing_names = []  # (regulation, row count) where ->>'source' is NULL
    for reg, current, count in r:
        canonical = SOURCE_NAMES.get(reg)
        if not canonical or current == canonical:
            # Unknown regulation, or the name is already canonical.
            continue
        if current is None:
            # source_citation->>'source' yields NULL when the key is absent
            # or JSON null. Such a group cannot be sliced for the report and
            # the UPDATE's "= :old_name" predicate never matches NULL, so it
            # is reported separately instead of crashing the whole run.
            missing_names.append((reg, count))
            continue
        updates.append((reg, current, canonical, count))

    print(f"\n Source names to normalize: {len(updates)}")
    print(f"\n {'Regulation':30s} {'From':45s} → {'To':45s} {'Count':>5s}")
    print(f" {'-' * 130}")

    total_updated = 0
    for reg, old_name, new_name, count in updates:
        print(f" {reg:30s} {old_name[:45]:45s} → {new_name[:45]:45s} {count:5d}")
        total_updated += count

        if not DRY_RUN:
            # jsonb_set needs a jsonb value, so the name is passed as a JSON
            # string literal ("name" including the quotes) and cast.
            name_json = json.dumps(new_name)
            conn.execute(sql_text("""
                UPDATE compliance.canonical_controls
                SET source_citation = jsonb_set(
                    source_citation, '{source}', CAST(:name_json AS jsonb)
                )
                WHERE generation_metadata->>'source_regulation' = :reg
                  AND source_citation->>'source' = :old_name
            """), {"reg": reg, "old_name": old_name, "name_json": name_json})

    if missing_names:
        print(f"\n Skipped (no source name present): {len(missing_names)}")
        for reg, count in missing_names:
            print(f" {reg:30s} {count:5d}")

    # NOTE: the total is the sum of the pre-update group counts from the
    # SELECT above; in dry-run mode nothing has actually been written.
    print(f"\n Total controls updated: {total_updated}")
    print(f" Dry run: {DRY_RUN}")

    # Verify
    if not DRY_RUN:
        r2 = conn.execute(sql_text("""
            SELECT generation_metadata->>'source_regulation' as reg,
                   source_citation->>'source' as name,
                   count(*)
            FROM compliance.canonical_controls
            WHERE source_citation IS NOT NULL
              AND generation_metadata->>'source_regulation' IS NOT NULL
            GROUP BY 1, 2
            HAVING count(*) >= 5
            ORDER BY count(*) DESC
        """))
        print(f"\n === Verified source names (>= 5 controls) ===")
        for row in r2:
            # str() guards against NULL regulation/name values in the report.
            print(f" {str(row[0]):30s} {str(row[1]):50s} {row[2]:5d}")