Files
breakpilot-compliance/scripts/qa/qa_normalize_sources.py
Benjamin Admin 9b0f25c105
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 36s
CI/CD / test-python-backend-compliance (push) Successful in 32s
CI/CD / test-python-document-crawler (push) Successful in 22s
CI/CD / test-python-dsms-gateway (push) Successful in 19s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
chore(qa): add PDF-based control QA scripts and results
QA pipeline that matches control source_original_text directly against
original PDF documents to verify article/paragraph assignments. Covers
backfill, dedup, source normalization, Qdrant cleanup, and prod sync.

Key results (2026-03-20):
- 4,110/7,943 controls matched to PDF (100% for major EU regs)
- 3,366 article corrections, 705 new assignments
- 1,290 controls from Erwägungsgründe (preamble) identified
- 779 controls from Anhänge (annexes) identified

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-20 00:56:13 +01:00

122 lines
4.7 KiB
Python

"""
Task 3: Normalize source_citation.source names.
Same regulation has different source names from different pipeline runs.
Standardize to one canonical name per regulation.
"""
import json
import os
import sys
from sqlalchemy import create_engine, text as sql_text
# Fail fast with a readable message instead of an opaque KeyError when the
# required env var is missing or empty.
DB_URL = os.environ.get('DATABASE_URL')
if not DB_URL:
    sys.exit("error: DATABASE_URL environment variable is required")

# All statements target the compliance schema; setting search_path on the
# connection keeps ad-hoc queries short (the SQL below still qualifies
# table names explicitly for clarity).
engine = create_engine(DB_URL, connect_args={"options": "-c search_path=compliance,public"})

# With --dry-run the script only reports what would change; no UPDATEs run.
DRY_RUN = '--dry-run' in sys.argv
# Canonical source names per regulation.
# Key: the `generation_metadata->>'source_regulation'` identifier stored on
# each control. Value: the single display name every variant of that
# regulation's `source_citation->>'source'` should be normalized to.
# Regulations not listed here are left untouched by this script.
SOURCE_NAMES = {
# --- EU regulations / directives ---
"eu_2023_1230": "Maschinenverordnung (EU) 2023/1230",
"eu_2024_2847": "Cyber Resilience Act (CRA)",
"eu_2024_1689": "KI-Verordnung (EU) 2024/1689",
"eu_2022_2555": "NIS2-Richtlinie (EU) 2022/2555",
"eu_2016_679": "DSGVO (EU) 2016/679",
"eu_blue_guide_2022": "EU Blue Guide 2022",
# --- NIST publications ---
"nist_sp800_53r5": "NIST SP 800-53 Rev. 5",
"nist_sp_800_218": "NIST SP 800-218 (SSDF)",
"nist_csf_2_0": "NIST Cybersecurity Framework 2.0",
"nist_sp800_63_3": "NIST SP 800-63-3",
"nist_sp800_207": "NIST SP 800-207 (Zero Trust)",
"nist_ai_rmf": "NIST AI Risk Management Framework",
# --- OWASP projects ---
"owasp_top10_2021": "OWASP Top 10 (2021)",
"owasp_asvs": "OWASP ASVS 4.0",
"owasp_samm": "OWASP SAMM 2.0",
"owasp_api_top10_2023": "OWASP API Security Top 10 (2023)",
"owasp_masvs": "OWASP MASVS 2.0",
# --- Agency guidance (CISA / ENISA / OECD) ---
"cisa_secure_by_design": "CISA Secure by Design",
"enisa_ics_scada_dependencies": "ENISA ICS/SCADA Dependencies",
"enisa_supply_chain_good_practices": "ENISA Supply Chain Good Practices",
"enisa_threat_landscape_supply_chain": "ENISA Threat Landscape Supply Chain",
"enisa_cybersecurity_state_2024": "ENISA Cybersecurity State 2024",
"oecd_ai_principles": "OECD KI-Empfehlung",
# --- Further EU acts and German law ---
"gpsr": "Allgemeine Produktsicherheitsverordnung (GPSR)",
"eu_2023_1542": "Batterieverordnung (EU) 2023/1542",
"mica": "Markets in Crypto-Assets (MiCA)",
"eu_2022_868": "Data Governance Act (DGA)",
"dataact": "Data Act",
"eucsa": "EU Cybersecurity Act (EUCSA)",
"eaa": "European Accessibility Act (EAA)",
"eu_2023_1803": "IFRS-Übernahmeverordnung",
"amlr": "AML-Verordnung",
# Both identifiers occur in the data for the BDSG; map them to one name.
"bdsg_2018_komplett": "Bundesdatenschutzgesetz (BDSG)",
"bdsg": "Bundesdatenschutzgesetz (BDSG)",
}
# Banner so runs are easy to spot in captured pipeline logs.
print("=" * 60)
print("TASK 3: NORMALIZE SOURCE NAMES")
print("=" * 60)

# engine.begin() opens a single transaction: every UPDATE below either
# commits together at the end of the block or rolls back on error.
with engine.begin() as conn:
    # Collect every (regulation, current source name, count) combination
    # currently present, so each variant can be diffed against its
    # canonical spelling.
    result = conn.execute(sql_text("""
        SELECT generation_metadata->>'source_regulation' as reg,
               source_citation->>'source' as current_name,
               count(*) as cnt
        FROM compliance.canonical_controls
        WHERE source_citation IS NOT NULL
          AND generation_metadata->>'source_regulation' IS NOT NULL
        GROUP BY 1, 2
        ORDER BY 1, cnt DESC
    """))

    # (reg, current_name, canonical_name, affected_count) for every variant
    # that differs from the canonical name. Rows where the 'source' key is
    # absent (current is None) are skipped: the UPDATE's equality predicate
    # could never match SQL NULL, and slicing None would crash the report.
    updates = [
        (reg, current, SOURCE_NAMES[reg], cnt)
        for reg, current, cnt in result
        if current is not None
        and reg in SOURCE_NAMES
        and current != SOURCE_NAMES[reg]
    ]

    print(f"\n Source names to normalize: {len(updates)}")
    print(f"\n {'Regulation':30s} {'From':45s}{'To':45s} {'Count':>5s}")
    print(f" {'-' * 130}")

    total_updated = 0
    for reg, old_name, new_name, count in updates:
        print(f" {reg:30s} {old_name[:45]:45s}{new_name[:45]:45s} {count:5d}")
        total_updated += count
        if not DRY_RUN:
            # jsonb_set wants a JSON value, not raw text: json.dumps adds
            # the surrounding quotes and escaping so CAST(:name_json AS
            # jsonb) yields a JSON string.
            name_json = json.dumps(new_name)
            conn.execute(sql_text("""
                UPDATE compliance.canonical_controls
                SET source_citation = jsonb_set(
                    source_citation,
                    '{source}',
                    CAST(:name_json AS jsonb)
                )
                WHERE generation_metadata->>'source_regulation' = :reg
                  AND source_citation->>'source' = :old_name
            """), {"reg": reg, "old_name": old_name, "name_json": name_json})

    print(f"\n Total controls updated: {total_updated}")
    print(f" Dry run: {DRY_RUN}")

    # Verify: re-read inside the same transaction (so our own updates are
    # visible) and list the surviving names carrying at least 5 controls.
    if not DRY_RUN:
        verify = conn.execute(sql_text("""
            SELECT generation_metadata->>'source_regulation' as reg,
                   source_citation->>'source' as name,
                   count(*)
            FROM compliance.canonical_controls
            WHERE source_citation IS NOT NULL
              AND generation_metadata->>'source_regulation' IS NOT NULL
            GROUP BY 1, 2
            HAVING count(*) >= 5
            ORDER BY count(*) DESC
        """))
        print("\n === Verified source names (>= 5 controls) ===")
        for reg, name, cnt in verify:
            print(f" {str(reg):30s} {str(name):50s} {cnt:5d}")