Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 31s
CI/CD / test-python-backend-compliance (push) Successful in 1m35s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 17s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
- Control Library: parent control display, ObligationTypeBadge, GenerationStrategyBadge variants, evidence string fallback - API: expose parent_control_uuid/id/title in canonical controls - Fix: DSFA SQLAlchemy 2.0 Row._mapping compatibility - Migration 074: control_parent_links + control_dedup_reviews tables - QA scripts: benchmark, gap analysis, OSCAL import, OWASP cleanup, phase5 normalize, phase74 gap fill, sync_db, run_job - Docs: dedup engine, RAG benchmark, lessons learned, pipeline docs Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
656 lines
29 KiB
Python
656 lines
29 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Phase 7.4: Generate new controls for gap articles via Anthropic Claude Sonnet.
|
|
|
|
Reads gap_analysis_results.json, extracts article text from PDFs,
|
|
calls Claude Sonnet to generate controls, inserts into DB.
|
|
|
|
Usage:
|
|
python3 phase74_generate_gap_controls.py --dry-run # show what would be generated
|
|
python3 phase74_generate_gap_controls.py # generate and insert
|
|
python3 phase74_generate_gap_controls.py --source "DSGVO" # filter by source
|
|
python3 phase74_generate_gap_controls.py --resume # skip already-generated articles
|
|
"""
|
|
import os
|
|
import sys
|
|
import json
|
|
import re
|
|
import time
|
|
import hashlib
|
|
import argparse
|
|
import psycopg2
|
|
import urllib.parse
|
|
import requests
|
|
from pathlib import Path
|
|
from collections import Counter
|
|
|
|
sys.path.insert(0, os.path.dirname(__file__))
|
|
from pdf_qa_all import (
|
|
SOURCE_FILE_MAP, read_file, classify_doc, normalize,
|
|
build_eu_article_index, build_de_law_index, build_nist_index,
|
|
build_owasp_index, build_generic_index, MAX_ARTICLES,
|
|
)
|
|
|
|
# ── Config ──────────────────────────────────────────────────────────
# Anthropic Messages API endpoint used by call_anthropic().
ANTHROPIC_URL = "https://api.anthropic.com/v1/messages"
# Model is overridable via env var; defaults to Claude Sonnet.
ANTHROPIC_MODEL = os.environ.get("CONTROL_GEN_ANTHROPIC_MODEL", "claude-sonnet-4-6")
# Required at runtime; main() exits early when empty.
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
# Written into the canonical_controls.pipeline_version column on insert.
PIPELINE_VERSION = 5
# Default path for --results (output of the gap-analysis phase).
GAP_RESULTS_FILE = "/tmp/gap_analysis_results.json"
# NOTE(review): PDF_DIR is not referenced anywhere in this script —
# presumably consumed by the pdf_qa_all helpers; confirm before removing.
PDF_DIR = Path(os.path.expanduser("~/rag-ingestion/pdfs"))

# PyMuPDF is optional; fall back to None when it is not installed.
# NOTE(review): fitz is not used directly in this file — likely needed
# indirectly by read_file(); verify.
try:
    import fitz
except ImportError:
    fitz = None
|
|
|
|
# ── Source name → regulation_code reverse map ────────────────────────
# Built from REGULATION_LICENSE_MAP in control_generator.py
# Maps the human-readable source names found in gap_analysis_results.json
# to the internal regulation codes used as keys of LICENSE_MAP.
# Unknown sources fall back to "unknown" in main().
SOURCE_TO_REGCODE = {
    "DSGVO (EU) 2016/679": "eu_2016_679",
    "KI-Verordnung (EU) 2024/1689": "eu_2024_1689",
    "NIS2-Richtlinie (EU) 2022/2555": "eu_2022_2555",
    "Cyber Resilience Act (CRA)": "eu_2024_2847",
    "Maschinenverordnung (EU) 2023/1230": "eu_2023_1230",
    "EU Blue Guide 2022": "eu_blue_guide_2022",
    "Markets in Crypto-Assets (MiCA)": "mica",
    "Batterieverordnung (EU) 2023/1542": "eu_2023_1542",
    "AML-Verordnung": "amlr",
    "Data Governance Act (DGA)": "dga",
    "Data Act": "data_act",
    "GPSR (EU) 2023/988": "gpsr",
    "IFRS-Übernahmeverordnung": "ifrs",
    "NIST SP 800-53 Rev. 5": "nist_sp800_53r5",
    "NIST SP 800-207 (Zero Trust)": "nist_sp800_207",
    "NIST SP 800-63-3": "nist_sp800_63_3",
    "NIST AI Risk Management Framework": "nist_ai_rmf",
    # NOTE(review): naming is inconsistent with the other NIST codes
    # ("nist_sp_800_218" vs "nist_sp800_53r5") — kept as-is because the
    # codes must match REGULATION_LICENSE_MAP.
    "NIST SP 800-218 (SSDF)": "nist_sp_800_218",
    "NIST Cybersecurity Framework 2.0": "nist_csf_2_0",
    "OWASP Top 10 (2021)": "owasp_top10",
    "OWASP ASVS 4.0": "owasp_asvs",
    "OWASP SAMM 2.0": "owasp_samm",
    "OWASP API Security Top 10 (2023)": "owasp_api_top10",
    "OWASP MASVS 2.0": "owasp_masvs",
    "ENISA ICS/SCADA Dependencies": "enisa_ics_scada",
    "ENISA Supply Chain Good Practices": "enisa_supply_chain",
    "CISA Secure by Design": "cisa_sbd",
    "Bundesdatenschutzgesetz (BDSG)": "bdsg",
    "Gewerbeordnung (GewO)": "gewo",
    "Handelsgesetzbuch (HGB)": "hgb",
    "Abgabenordnung (AO)": "ao",
    "OECD KI-Empfehlung": "oecd_ai_principles",
}
|
|
|
|
# License info per regulation code (from REGULATION_LICENSE_MAP)
# Each entry carries: license identifier, licensing "rule" number (stored
# as canonical_controls.license_rule), and a coarse source_type used in
# the source_citation JSON.
# NOTE(review): rule 1 appears to cover law / public-domain sources and
# rule 2 CC-licensed ones — confirm against REGULATION_LICENSE_MAP in
# control_generator.py before relying on that distinction.
LICENSE_MAP = {
    "eu_2016_679": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "eu_2024_1689": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "eu_2022_2555": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "eu_2024_2847": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "eu_2023_1230": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "eu_blue_guide_2022": {"license": "EU_PUBLIC", "rule": 1, "source_type": "guideline"},
    "mica": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "eu_2023_1542": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "amlr": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "dga": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "data_act": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "gpsr": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "ifrs": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "nist_sp800_53r5": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
    "nist_sp800_207": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
    "nist_sp800_63_3": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
    "nist_ai_rmf": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
    "nist_sp_800_218": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
    "nist_csf_2_0": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
    "owasp_top10": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
    "owasp_asvs": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
    "owasp_samm": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
    "owasp_api_top10": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
    "owasp_masvs": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
    "enisa_ics_scada": {"license": "CC-BY-4.0", "rule": 2, "source_type": "guideline"},
    "enisa_supply_chain": {"license": "CC-BY-4.0", "rule": 2, "source_type": "guideline"},
    "cisa_sbd": {"license": "US_GOV_PUBLIC", "rule": 1, "source_type": "guideline"},
    "bdsg": {"license": "DE_LAW", "rule": 1, "source_type": "law"},
    "gewo": {"license": "DE_LAW", "rule": 1, "source_type": "law"},
    "hgb": {"license": "DE_LAW", "rule": 1, "source_type": "law"},
    "ao": {"license": "DE_LAW", "rule": 1, "source_type": "law"},
    "oecd_ai_principles": {"license": "OECD_PUBLIC", "rule": 2, "source_type": "standard"},
}
|
|
|
|
# Domain detection keywords
# Lowercase substrings (mixed German/English) matched against the
# lowercased article text by detect_domain(); the domain with the most
# keyword hits wins, defaulting to "SEC". The 2-4 letter keys become the
# control_id prefix (see generate_control_id()).
DOMAIN_KEYWORDS = {
    "AUTH": ["authentifizierung", "anmeldung", "login", "passwort", "identit", "identity", "credential"],
    "CRYP": ["verschlüsselung", "kryptogra", "encrypt", "cipher", "hash", "tls", "ssl", "signatur"],
    "NET": ["netzwerk", "network", "firewall", "router", "dns", "ip-adress"],
    "DATA": ["daten", "data", "personenbezogen", "datenschutz", "privacy", "gdpr", "dsgvo", "verarbeitung"],
    "LOG": ["protokoll", "logging", "audit", "nachvollzieh", "aufzeichn"],
    "ACC": ["zugriff", "access", "berechtigung", "autorisierung", "authorization", "rolle"],
    "SEC": ["sicherheit", "security", "schutz", "protect", "schwachstell", "vulnerab"],
    "INC": ["vorfall", "incident", "breach", "meldung", "reaktion", "response", "notfall"],
    "AI": ["künstliche intelligenz", "ki-system", "ai system", "machine learning", "algorithm", "hochrisiko-ki"],
    "COMP": ["compliance", "konformität", "audit", "zertifizierung", "regulier", "vorschrift"],
    "GOV": ["behörde", "aufsicht", "governance", "marktüberwachung", "authority"],
    "FIN": ["finanz", "zahlungs", "payment", "crypto", "krypto-", "geldwäsche", "aml"],
    "ENV": ["umwelt", "environment", "batterie", "recycling", "entsorgu", "nachhaltig"],
}
|
|
|
|
# ── Prompt (same as control_generator.py) ────────────────────────────

# System prompt (German, because the generated controls are German):
# act as a compliance expert, answer with JSON only, and use a JSON
# array when producing several controls.
SYSTEM_PROMPT = """Du bist ein Security-Compliance-Experte. Strukturiere den gegebenen Text
als praxisorientiertes Security Control. Erstelle eine verständliche, umsetzbare Formulierung.
Antworte NUR mit validem JSON. Bei mehreren Controls antworte mit einem JSON-Array."""

# Shared prompt fragment describing the applicability fields
# (applicable_industries, applicable_company_size, scope_conditions);
# appended to the user prompt by build_prompt().
APPLICABILITY_PROMPT = """- applicable_industries: Liste der Branchen fuer die dieses Control relevant ist.
Verwende ["all"] wenn der Control branchenuebergreifend gilt.
Moegliche Werte: "all", "Technologie / IT", "IT Dienstleistungen", "E-Commerce / Handel",
"Finanzdienstleistungen", "Versicherungen", "Gesundheitswesen", "Pharma", "Bildung",
"Beratung / Consulting", "Marketing / Agentur", "Produktion / Industrie",
"Logistik / Transport", "Immobilien", "Bau", "Energie", "Automobil",
"Luft- / Raumfahrt", "Maschinenbau", "Anlagenbau", "Automatisierung", "Robotik",
"Messtechnik", "Agrar", "Chemie", "Minen / Bergbau", "Telekommunikation",
"Medien / Verlage", "Gastronomie / Hotellerie", "Recht / Kanzlei",
"Oeffentlicher Dienst", "Verteidigung / Ruestung", "Wasser- / Abwasserwirtschaft",
"Lebensmittel", "Digitale Infrastruktur", "Weltraum", "Post / Kurierdienste",
"Abfallwirtschaft", "Forschung"
- applicable_company_size: Ab welcher Unternehmensgroesse gilt dieses Control?
Verwende ["all"] wenn keine Groessenbeschraenkung.
Moegliche Werte: "all", "micro", "small", "medium", "large", "enterprise"
- scope_conditions: null wenn keine besonderen Bedingungen, sonst:
{"requires_any": ["signal"], "description": "Erklaerung"}
Moegliche Signale: "uses_ai", "third_country_transfer", "processes_health_data",
"processes_minors_data", "automated_decisions", "employee_monitoring",
"video_surveillance", "financial_data", "is_kritis_operator", "payment_services" """

# Allowed values for the generated control's "category" field; rendered
# into the user prompt via CATEGORY_LIST_STR.
CATEGORY_LIST = [
    "Datenschutz-Grundlagen", "Betroffenenrechte", "Technische Massnahmen",
    "Organisatorische Massnahmen", "Auftragsverarbeitung", "Datentransfer",
    "Risikomanagement", "Incident Response", "KI-Regulierung", "Cybersicherheit",
    "Zugriffskontrolle", "Kryptographie", "Netzwerksicherheit", "Compliance-Management",
    "Produktsicherheit", "Marktüberwachung", "Supply Chain Security",
    "Finanzregulierung", "Arbeitsrecht", "Gewerberecht", "Handelsrecht",
    "Umwelt / Nachhaltigkeit", "Dokumentation", "Schulung / Awareness",
]
# Comma-separated, double-quoted category list for prompt interpolation.
CATEGORY_LIST_STR = ", ".join(f'"{c}"' for c in CATEGORY_LIST)
|
|
|
|
|
|
def build_prompt(source_name, article_label, article_text, license_type):
    """Build the German user prompt asking Claude to structure one article.

    Args:
        source_name: Human-readable regulation/source name.
        article_label: Article/paragraph label, e.g. "Artikel 10" or "§ 42".
        article_text: Extracted article text; truncated to 3000 chars here.
        license_type: License identifier shown to the model (the license
            permits reuse of the original wording).

    Returns:
        The complete user prompt string, including the shared
        APPLICABILITY_PROMPT fragment and the allowed category list.
    """
    return f"""Strukturiere den folgenden Gesetzestext als Security/Compliance Control.
Du DARFST den Originaltext verwenden (Quelle: {source_name}, {license_type}).

WICHTIG: Erstelle eine verständliche, praxisorientierte Formulierung.
Der Originaltext wird separat gespeichert — deine Formulierung soll klar und umsetzbar sein.

Gib JSON zurück mit diesen Feldern:
- title: Kurzer prägnanter Titel (max 100 Zeichen)
- objective: Was soll erreicht werden? (1-3 Sätze)
- rationale: Warum ist das wichtig? (1-2 Sätze)
- requirements: Liste von konkreten Anforderungen (Strings)
- test_procedure: Liste von Prüfschritten (Strings)
- evidence: Liste von Nachweisdokumenten (Strings)
- severity: low/medium/high/critical
- tags: Liste von Tags
- domain: Fachgebiet als Kuerzel (AUTH=Authentifizierung, CRYP=Kryptographie, NET=Netzwerk, DATA=Datenschutz, LOG=Logging, ACC=Zugriffskontrolle, SEC=IT-Sicherheit, INC=Vorfallmanagement, AI=KI, COMP=Compliance, GOV=Behoerden/Verwaltung, LAB=Arbeitsrecht, FIN=Finanzregulierung, TRD=Gewerbe/Handelsrecht, ENV=Umwelt, HLT=Gesundheit)
- category: Inhaltliche Kategorie. Moegliche Werte: {CATEGORY_LIST_STR}
- target_audience: Liste der Zielgruppen (z.B. "unternehmen", "behoerden", "entwickler", "datenschutzbeauftragte", "geschaeftsfuehrung", "it-abteilung", "rechtsabteilung", "compliance-officer")
- source_article: Artikel-/Paragraphen-Referenz (z.B. "Artikel 10", "§ 42")
- source_paragraph: Absatz-Referenz (z.B. "Absatz 5", "Nr. 2")
{APPLICABILITY_PROMPT}

Text: {article_text[:3000]}
Quelle: {source_name}, {article_label}"""
|
|
|
|
|
|
# ── PDF article extraction ───────────────────────────────────────────
|
|
|
|
def extract_article_text(pdf_file, article_label, doc_type, full_text=None):
    """Extract the text of a specific article from a PDF.

    Args:
        pdf_file: Path handed to read_file() when full_text is not supplied.
        article_label: Heading label, e.g. "Artikel 10", "§ 42", "AC-2".
        doc_type: "eu_regulation", "de_law", "nist", or anything else
            (generic / OWASP / ENISA handling).
        full_text: Optional pre-read document text (per-source cache in
            main()) to avoid re-reading the PDF.

    Returns:
        Up to 3000 characters of article text, or "" when the article
        cannot be located (or the label carries no number where one is
        required).
    """
    if full_text is None:
        full_text = read_file(pdf_file)
    if not full_text:
        return ""

    if doc_type == "eu_regulation":
        art_num_match = re.search(r'\d+', article_label)
        if not art_num_match:
            return ""
        num = int(art_num_match.group())
        match = re.search(rf'\nArtikel\s+{num}\s*\n', full_text)
        if not match:
            return ""
        start = match.start()
        # Fix: search for the NEXT article only AFTER the current heading.
        # A stray earlier occurrence of "Artikel {num+1}" (e.g. a cross
        # reference) would otherwise yield end < start and an empty slice.
        next_match = re.compile(rf'\nArtikel\s+{num + 1}\s*\n').search(full_text, start + 1)
        end = next_match.start() if next_match else min(start + 5000, len(full_text))
        return full_text[start:end].strip()[:3000]

    elif doc_type == "de_law":
        para_match = re.search(r'\d+', article_label)
        if not para_match:
            return ""
        num = int(para_match.group())
        match = re.search(rf'\n§\s+{num}\b', full_text)
        if not match:
            return ""
        start = match.start()
        # Same fix as above: anchor the next-paragraph search after start.
        next_match = re.compile(rf'\n§\s+{num + 1}\b').search(full_text, start + 1)
        end = next_match.start() if next_match else min(start + 5000, len(full_text))
        return full_text[start:end].strip()[:3000]

    elif doc_type == "nist":
        # NIST control IDs (e.g. "AC-2") are matched literally at a line start.
        escaped = re.escape(article_label)
        match = re.search(rf'(?:^|\n)\s*{escaped}\b', full_text)
        if not match:
            return ""
        start = match.start()
        # No reliable "next section" marker; take a fixed-size window.
        return full_text[start:start + 3000].strip()

    else:
        # Generic / OWASP / ENISA
        # Label may appear mid-line (e.g. "A01:2021 – Broken Access Control"),
        # so allow arbitrary prefix text on the matched line.
        escaped = re.escape(article_label)
        match = re.search(rf'(?:^|\n).*{escaped}\b', full_text)
        if not match:
            return ""
        start = match.start()
        return full_text[start:start + 3000].strip()
|
|
|
|
|
|
# ── Anthropic API ────────────────────────────────────────────────────
|
|
|
|
def call_anthropic(prompt, system_prompt):
    """Call Anthropic API. Returns (parsed_data, raw_text, usage, error).

    On success `error` is None; on HTTP or transport failure the first
    three slots are (None, "", {}) and `error` carries a short message.
    """
    request_headers = {
        "x-api-key": ANTHROPIC_API_KEY,
        "anthropic-version": "2023-06-01",
        "content-type": "application/json",
    }
    request_body = {
        "model": ANTHROPIC_MODEL,
        "max_tokens": 4096,
        "system": system_prompt,
        "messages": [{"role": "user", "content": prompt}],
    }

    try:
        response = requests.post(ANTHROPIC_URL, headers=request_headers, json=request_body, timeout=120)
        if response.status_code != 200:
            return None, "", {}, f"HTTP {response.status_code}: {response.text[:200]}"
        body = response.json()
        raw_text = body["content"][0]["text"] if body.get("content") else ""
        token_usage = body.get("usage", {})
        # The model is instructed to reply with JSON; parse it here so
        # callers get structured data alongside the raw text.
        return parse_json(raw_text), raw_text, token_usage, None
    except Exception as exc:
        return None, "", {}, str(exc)
|
|
|
|
|
|
def parse_json(text):
    """Parse JSON from an LLM response, handling markdown code fences.

    The system prompt allows the model to answer with a JSON array when
    it produces several controls; this pipeline consumes one control per
    article, so arrays are unwrapped to their first element.

    Args:
        text: Raw model output, possibly fenced (```...```) or embedded
            in surrounding prose.

    Returns:
        A dict (first element of an array answer), or None when nothing
        parseable is found or the array is empty.
    """
    text = text.strip()
    if text.startswith("```"):
        # Drop the opening fence line, and the closing fence when present.
        lines = text.split("\n")
        text = "\n".join(lines[1:-1] if lines[-1].strip().startswith("```") else lines[1:])
        text = text.strip()

    def _unwrap(obj):
        # One control per article: unwrap arrays to their first element.
        if isinstance(obj, list):
            return obj[0] if obj else None
        return obj

    try:
        return _unwrap(json.loads(text))
    except json.JSONDecodeError:
        # Fix: the salvage path now also recovers embedded arrays and
        # applies the same array-unwrap rule as the primary path
        # (previously only a bare object was recovered).
        match = re.search(r'\[[\s\S]*\]|\{[\s\S]*\}', text)
        if match:
            try:
                return _unwrap(json.loads(match.group()))
            except json.JSONDecodeError:
                return None
        return None
|
|
|
|
|
|
# ── Domain detection ─────────────────────────────────────────────────
|
|
|
|
def detect_domain(text):
    """Pick the DOMAIN_KEYWORDS entry with the most substring hits.

    Matching is case-insensitive; ties keep the first-declared domain.
    Falls back to "SEC" when no keyword matches at all.
    """
    haystack = text.lower()
    best_domain = "SEC"
    best_hits = 0
    for code, terms in DOMAIN_KEYWORDS.items():
        hits = sum(term in haystack for term in terms)
        if hits > best_hits:
            best_domain = code
            best_hits = hits
    return best_domain
|
|
|
|
|
|
# ── Control ID generation ────────────────────────────────────────────
|
|
|
|
def generate_control_id(domain, cur):
    """Generate next available control_id for domain prefix.

    Uses MAX(numeric suffix) to find the true highest number,
    avoiding gaps from string-sorted IDs (e.g. COMP-99 > COMP-1000 in text sort).

    Args:
        domain: Domain code; its first 4 chars (upper-cased) form the prefix.
        cur: Open DB cursor on a connection with the compliance schema.

    Returns:
        The next id, zero-padded to at least 3 digits (e.g. "COMP-042"),
        or "<PREFIX>-001" when no numerically-suffixed id exists yet.
    """
    prefix = domain.upper()[:4]
    cur.execute("""
        SELECT MAX(CAST(SPLIT_PART(control_id, '-', 2) AS INTEGER))
        FROM compliance.canonical_controls
        WHERE control_id LIKE %s
          AND SPLIT_PART(control_id, '-', 2) ~ '^[0-9]+$'
    """, (f"{prefix}-%",))
    row = cur.fetchone()
    if row and row[0] is not None:
        # Fix: zero-pad to 3 digits for consistency with the "-001" seed
        # below (previously produced unpadded ids like "COMP-42", breaking
        # lexicographic ordering of new ids).
        return f"{prefix}-{row[0] + 1:03d}"
    return f"{prefix}-001"
|
|
|
|
|
|
# ── Main ─────────────────────────────────────────────────────────────
|
|
|
|
def main():
    """Phase 7.4 entry point.

    Loads gap_analysis_results.json, extracts each gap article's text
    from its source PDF, asks Claude to structure it as a control, and
    inserts the result into compliance.canonical_controls. Supports
    --dry-run, --source filtering, and --resume (skip articles that
    already have a control).
    """
    parser = argparse.ArgumentParser(description="Phase 7.4: Generate controls for gap articles")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be generated")
    parser.add_argument("--source", type=str, help="Filter by source name substring")
    parser.add_argument("--resume", action="store_true", help="Skip articles that already have controls")
    parser.add_argument("--results", default=GAP_RESULTS_FILE, help="Path to gap_analysis_results.json")
    args = parser.parse_args()

    # API key is mandatory even for --dry-run.
    if not ANTHROPIC_API_KEY:
        print("ERROR: Set ANTHROPIC_API_KEY")
        sys.exit(1)

    # Load gap results
    with open(args.results) as f:
        gaps = json.load(f)
    total_gaps = sum(len(g["gap_articles"]) for g in gaps)
    print(f"Loaded {len(gaps)} sources with {total_gaps} gap articles")

    # Optional case-insensitive substring filter on the source name.
    if args.source:
        gaps = [g for g in gaps if args.source.lower() in g["source"].lower()]
        total_gaps = sum(len(g["gap_articles"]) for g in gaps)
        print(f"Filtered to {len(gaps)} sources, {total_gaps} gaps")

    # DB connection with keepalive + reconnect helper
    db_url = os.environ['DATABASE_URL']
    parsed = urllib.parse.urlparse(db_url)

    def connect_db():
        """Create DB connection with TCP keepalive."""
        c = psycopg2.connect(
            host=parsed.hostname, port=parsed.port or 5432,
            user=parsed.username, password=parsed.password,
            dbname=parsed.path.lstrip('/'),
            options="-c search_path=compliance,public",
            # Keepalives guard against idle disconnects during long
            # API-call stretches between DB writes.
            keepalives=1, keepalives_idle=30,
            keepalives_interval=10, keepalives_count=5,
        )
        return c, c.cursor()

    conn, cur = connect_db()

    def ensure_db():
        """Reconnect if connection is dead."""
        nonlocal conn, cur
        try:
            # Cheap liveness probe before each write.
            cur.execute("SELECT 1")
        except Exception:
            print(" [RECONNECT] DB connection lost, reconnecting...")
            try:
                conn.close()
            except Exception:
                pass
            conn, cur = connect_db()
            return True
        return False

    # Get framework UUID
    cur.execute("SELECT id FROM compliance.canonical_control_frameworks WHERE framework_id = 'bp_security_v1' LIMIT 1")
    fw_row = cur.fetchone()
    if not fw_row:
        print("ERROR: Framework bp_security_v1 not found")
        sys.exit(1)
    framework_uuid = fw_row[0]

    # If resuming, load existing articles per source
    existing_articles = {}
    if args.resume:
        cur.execute("""
            SELECT source_citation->>'source', source_citation->>'article'
            FROM compliance.canonical_controls
            WHERE source_citation->>'article' IS NOT NULL
        """)
        # Map: source name -> set of article labels already covered.
        for src, art in cur.fetchall():
            existing_articles.setdefault(src, set()).add(art)
        print(f"Resume mode: {sum(len(v) for v in existing_articles.values())} existing article-control pairs")

    # Stats
    stats = Counter()
    total_input_tokens = 0
    total_output_tokens = 0
    generated_ids = []
    errors = []
    t_start = time.time()

    # Pre-read PDFs (cache full text per source)
    pdf_cache = {}

    # Process sources with the most gaps first.
    for gap_source in sorted(gaps, key=lambda g: -len(g["gap_articles"])):
        source_name = gap_source["source"]
        gap_articles = gap_source["gap_articles"]
        filename = SOURCE_FILE_MAP.get(source_name)
        reg_code = SOURCE_TO_REGCODE.get(source_name, "unknown")
        license_info = LICENSE_MAP.get(reg_code, {"license": "UNKNOWN", "rule": 1, "source_type": "unknown"})
        doc_type = classify_doc(source_name)

        if not filename:
            stats["skipped_no_pdf"] += len(gap_articles)
            continue

        # Read PDF once per source
        if source_name not in pdf_cache:
            pdf_cache[source_name] = read_file(filename)
        full_text = pdf_cache[source_name]
        if not full_text:
            stats["skipped_no_pdf"] += len(gap_articles)
            continue

        print(f"\n{'='*70}")
        print(f"{source_name} — {len(gap_articles)} gaps (rule {license_info['rule']}, {doc_type})")
        print(f"{'='*70}")

        for gap in gap_articles:
            article_label = gap["label"]
            article_type = gap["type"]

            # Skip if already has controls (resume mode)
            if args.resume and article_label in existing_articles.get(source_name, set()):
                stats["skipped_exists"] += 1
                continue

            # Skip non-substantive NIST sections (intro chapters)
            if doc_type == "nist" and article_type == "section":
                section_match = re.match(r'Section (\d+)', article_label)
                if section_match and int(section_match.group(1)) <= 3:
                    stats["skipped_intro"] += 1
                    continue

            # Extract article text
            article_text = extract_article_text(filename, article_label, doc_type, full_text)
            if not article_text or len(article_text) < 30:
                stats["skipped_short_text"] += 1
                print(f" SKIP {article_label}: text too short ({len(article_text)} chars)")
                continue

            if args.dry_run:
                print(f" [DRY] {article_label} ({len(article_text)} chars)")
                stats["would_generate"] += 1
                continue

            # Call Anthropic
            prompt = build_prompt(source_name, article_label, article_text, license_info["license"])
            data, raw, usage, error = call_anthropic(prompt, SYSTEM_PROMPT)

            total_input_tokens += usage.get("input_tokens", 0)
            total_output_tokens += usage.get("output_tokens", 0)

            if error:
                stats["api_error"] += 1
                errors.append(f"{source_name} {article_label}: {error}")
                print(f" ERROR {article_label}: {error}")
                # Back off a bit after an API error before the next article.
                time.sleep(5)
                continue

            if not data:
                stats["parse_error"] += 1
                print(f" PARSE ERROR {article_label}")
                continue

            # Ensure DB is alive before writing
            ensure_db()

            # Build control — coerce every model-provided field defensively,
            # since LLM output types are not guaranteed.
            title = str(data.get("title", ""))[:200]
            objective = str(data.get("objective", ""))
            rationale = str(data.get("rationale", ""))
            domain = str(data.get("domain", detect_domain(article_text))).upper()[:4]
            if not domain or len(domain) < 2:
                # Model gave an unusable domain code; fall back to keyword detection.
                domain = detect_domain(article_text)

            control_id = generate_control_id(domain, cur)
            severity = str(data.get("severity", "medium")).lower()
            if severity not in ("low", "medium", "high", "critical"):
                severity = "medium"

            requirements = data.get("requirements", [])
            if not isinstance(requirements, list):
                requirements = [str(requirements)]
            test_procedure = data.get("test_procedure", [])
            if not isinstance(test_procedure, list):
                test_procedure = [str(test_procedure)]
            evidence = data.get("evidence", [])
            if not isinstance(evidence, list):
                evidence = [str(evidence)]
            tags = data.get("tags", [])
            if not isinstance(tags, list):
                tags = []
            target_audience = data.get("target_audience", [])
            if not isinstance(target_audience, list):
                target_audience = []
            applicable_industries = data.get("applicable_industries", ["all"])
            if not isinstance(applicable_industries, list):
                applicable_industries = ["all"]
            applicable_company_size = data.get("applicable_company_size", ["all"])
            if not isinstance(applicable_company_size, list):
                applicable_company_size = ["all"]
            scope_conditions = data.get("scope_conditions")

            # Provenance stored as JSON alongside the control.
            source_citation = {
                "source": source_name,
                "article": data.get("source_article", article_label),
                "paragraph": data.get("source_paragraph", ""),
                "article_type": article_type,
                "license": license_info["license"],
                "source_type": license_info["source_type"],
            }

            generation_metadata = {
                "processing_path": "phase74_gap_fill",
                "license_rule": license_info["rule"],
                "source_regulation": reg_code,
                "source_article": article_label,
                "gap_fill": True,
            }

            category = str(data.get("category", "")) or None

            # Insert into DB
            try:
                cur.execute("""
                    INSERT INTO compliance.canonical_controls (
                        framework_id, control_id, title, objective, rationale,
                        scope, requirements, test_procedure, evidence,
                        severity, risk_score, implementation_effort,
                        open_anchors, release_state, tags,
                        license_rule, source_original_text, source_citation,
                        customer_visible, generation_metadata,
                        verification_method, category, generation_strategy,
                        target_audience, pipeline_version,
                        applicable_industries, applicable_company_size, scope_conditions
                    ) VALUES (
                        %s, %s, %s, %s, %s,
                        %s, %s, %s, %s,
                        %s, %s, %s,
                        %s, %s, %s,
                        %s, %s, %s,
                        %s, %s,
                        %s, %s, %s,
                        %s, %s,
                        %s, %s, %s
                    )
                    ON CONFLICT (framework_id, control_id) DO NOTHING
                    RETURNING id
                """, (
                    framework_uuid, control_id, title, objective, rationale,
                    json.dumps({}), json.dumps(requirements), json.dumps(test_procedure), json.dumps(evidence),
                    severity, 5, "m",
                    json.dumps([]), "draft", json.dumps(tags),
                    license_info["rule"], article_text, json.dumps(source_citation),
                    True, json.dumps(generation_metadata),
                    "document", category, "phase74_gap_fill",
                    json.dumps(target_audience), PIPELINE_VERSION,
                    json.dumps(applicable_industries), json.dumps(applicable_company_size),
                    json.dumps(scope_conditions) if scope_conditions else None,
                ))
                conn.commit()
                # NOTE(review): fetchone() after commit() relies on psycopg2
                # buffering results client-side — works, but fetching before
                # committing would be clearer; confirm before reordering.
                row = cur.fetchone()
                if row:
                    # RETURNING id yielded a row → the insert happened.
                    generated_ids.append(str(row[0]))
                    stats["generated"] += 1
                    print(f" OK {control_id}: {title[:60]}")
                else:
                    # ON CONFLICT DO NOTHING suppressed the insert.
                    stats["conflict"] += 1
                    print(f" CONFLICT {control_id} (already exists)")
            except Exception as e:
                conn.rollback()
                stats["db_error"] += 1
                errors.append(f"DB {control_id}: {str(e)[:100]}")
                print(f" DB ERROR {control_id}: {str(e)[:100]}")

            # Rate limit: ~0.5s between calls
            time.sleep(0.5)

    # ── Summary ──────────────────────────────────────────────────────
    elapsed = time.time() - t_start
    # Cost estimate: $3 / $15 per million input/output tokens
    # (matches Claude Sonnet list pricing — update if the model changes).
    cost = (total_input_tokens * 3 + total_output_tokens * 15) / 1_000_000

    print(f"\n\n{'='*70}")
    print(f"PHASE 7.4 — {'DRY-RUN' if args.dry_run else 'ERGEBNIS'}")
    print(f"{'='*70}")
    print(f" Laufzeit: {elapsed/60:.1f} min")
    print(f" API-Kosten: ${cost:.2f}")
    print(f" Input Tokens: {total_input_tokens:,}")
    print(f" Output Tokens: {total_output_tokens:,}")
    print()
    for key in sorted(stats.keys()):
        print(f" {key:<25s}: {stats[key]:5d}")
    print()

    if generated_ids:
        print(f" Neue Control-IDs: {len(generated_ids)}")
        # Save generated IDs
        with open("/tmp/phase74_generated_ids.json", 'w') as f:
            json.dump(generated_ids, f)
        print(f" IDs gespeichert: /tmp/phase74_generated_ids.json")

    if errors:
        # Show at most the first 20 errors to keep the summary readable.
        print(f"\n Fehler ({len(errors)}):")
        for e in errors[:20]:
            print(f" {e}")
        if len(errors) > 20:
            print(f" ... und {len(errors)-20} weitere")

    conn.close()
|
|
|
|
|
|
# Script entry point: run only when executed directly, not on import.
if __name__ == "__main__":
    main()
|