feat: Control Library UI, dedup migration, QA tooling, docs
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 31s
CI/CD / test-python-backend-compliance (push) Successful in 1m35s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 17s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 31s
CI/CD / test-python-backend-compliance (push) Successful in 1m35s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 17s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
- Control Library: parent control display, ObligationTypeBadge, GenerationStrategyBadge variants, evidence string fallback - API: expose parent_control_uuid/id/title in canonical controls - Fix: DSFA SQLAlchemy 2.0 Row._mapping compatibility - Migration 074: control_parent_links + control_dedup_reviews tables - QA scripts: benchmark, gap analysis, OSCAL import, OWASP cleanup, phase5 normalize, phase74 gap fill, sync_db, run_job - Docs: dedup engine, RAG benchmark, lessons learned, pipeline docs Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
655
scripts/qa/phase74_generate_gap_controls.py
Normal file
655
scripts/qa/phase74_generate_gap_controls.py
Normal file
@@ -0,0 +1,655 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Phase 7.4: Generate new controls for gap articles via Anthropic Claude Sonnet.
|
||||
|
||||
Reads gap_analysis_results.json, extracts article text from PDFs,
|
||||
calls Claude Sonnet to generate controls, inserts into DB.
|
||||
|
||||
Usage:
|
||||
python3 phase74_generate_gap_controls.py --dry-run # show what would be generated
|
||||
python3 phase74_generate_gap_controls.py # generate and insert
|
||||
python3 phase74_generate_gap_controls.py --source "DSGVO" # filter by source
|
||||
python3 phase74_generate_gap_controls.py --resume # skip already-generated articles
|
||||
"""
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import re
|
||||
import time
|
||||
import hashlib
|
||||
import argparse
|
||||
import psycopg2
|
||||
import urllib.parse
|
||||
import requests
|
||||
from pathlib import Path
|
||||
from collections import Counter
|
||||
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
from pdf_qa_all import (
|
||||
SOURCE_FILE_MAP, read_file, classify_doc, normalize,
|
||||
build_eu_article_index, build_de_law_index, build_nist_index,
|
||||
build_owasp_index, build_generic_index, MAX_ARTICLES,
|
||||
)
|
||||
|
||||
# ── Config ──────────────────────────────────────────────────────────
# Anthropic Messages API endpoint.
ANTHROPIC_URL = "https://api.anthropic.com/v1/messages"
# Model id; overridable via env var so runs can pin a different snapshot.
ANTHROPIC_MODEL = os.environ.get("CONTROL_GEN_ANTHROPIC_MODEL", "claude-sonnet-4-6")
# Required for real runs; main() exits early when empty.
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
# Stamped into the pipeline_version column of each inserted control row.
PIPELINE_VERSION = 5
# Default output of the gap-analysis phase; overridable via --results.
GAP_RESULTS_FILE = "/tmp/gap_analysis_results.json"
# Base directory of the source PDFs. NOTE(review): not referenced directly
# in this script — presumably consumed by the pdf_qa_all helpers; confirm.
PDF_DIR = Path(os.path.expanduser("~/rag-ingestion/pdfs"))
|
||||
|
||||
# Optional PyMuPDF import, kept as a soft dependency so the script still
# loads without it. NOTE(review): `fitz` is not used directly in this file;
# presumably required by the pdf_qa_all PDF helpers — confirm before removal.
try:
    import fitz
except ImportError:
    fitz = None
|
||||
|
||||
# ── Source name → regulation_code reverse map ────────────────────────
# Built from REGULATION_LICENSE_MAP in control_generator.py
# Maps the human-readable source names appearing in gap_analysis_results.json
# to internal regulation codes; those codes key LICENSE_MAP below. Unknown
# sources fall back to "unknown" in main().
SOURCE_TO_REGCODE = {
    "DSGVO (EU) 2016/679": "eu_2016_679",
    "KI-Verordnung (EU) 2024/1689": "eu_2024_1689",
    "NIS2-Richtlinie (EU) 2022/2555": "eu_2022_2555",
    "Cyber Resilience Act (CRA)": "eu_2024_2847",
    "Maschinenverordnung (EU) 2023/1230": "eu_2023_1230",
    "EU Blue Guide 2022": "eu_blue_guide_2022",
    "Markets in Crypto-Assets (MiCA)": "mica",
    "Batterieverordnung (EU) 2023/1542": "eu_2023_1542",
    "AML-Verordnung": "amlr",
    "Data Governance Act (DGA)": "dga",
    "Data Act": "data_act",
    "GPSR (EU) 2023/988": "gpsr",
    "IFRS-Übernahmeverordnung": "ifrs",
    "NIST SP 800-53 Rev. 5": "nist_sp800_53r5",
    "NIST SP 800-207 (Zero Trust)": "nist_sp800_207",
    "NIST SP 800-63-3": "nist_sp800_63_3",
    "NIST AI Risk Management Framework": "nist_ai_rmf",
    "NIST SP 800-218 (SSDF)": "nist_sp_800_218",
    "NIST Cybersecurity Framework 2.0": "nist_csf_2_0",
    "OWASP Top 10 (2021)": "owasp_top10",
    "OWASP ASVS 4.0": "owasp_asvs",
    "OWASP SAMM 2.0": "owasp_samm",
    "OWASP API Security Top 10 (2023)": "owasp_api_top10",
    "OWASP MASVS 2.0": "owasp_masvs",
    "ENISA ICS/SCADA Dependencies": "enisa_ics_scada",
    "ENISA Supply Chain Good Practices": "enisa_supply_chain",
    "CISA Secure by Design": "cisa_sbd",
    "Bundesdatenschutzgesetz (BDSG)": "bdsg",
    "Gewerbeordnung (GewO)": "gewo",
    "Handelsgesetzbuch (HGB)": "hgb",
    "Abgabenordnung (AO)": "ao",
    "OECD KI-Empfehlung": "oecd_ai_principles",
}
|
||||
|
||||
# License info per regulation code (from REGULATION_LICENSE_MAP)
# Fields per entry:
#   "license"     → license identifier stored in source_citation.
#   "rule"        → license rule id written to the license_rule column.
#                   NOTE(review): 1 appears on public-law / public-domain
#                   sources, 2 on CC-licensed ones; exact semantics live in
#                   control_generator.py — confirm there.
#   "source_type" → law / standard / guideline classification for citations.
LICENSE_MAP = {
    "eu_2016_679": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "eu_2024_1689": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "eu_2022_2555": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "eu_2024_2847": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "eu_2023_1230": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "eu_blue_guide_2022": {"license": "EU_PUBLIC", "rule": 1, "source_type": "guideline"},
    "mica": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "eu_2023_1542": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "amlr": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "dga": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "data_act": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "gpsr": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "ifrs": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "nist_sp800_53r5": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
    "nist_sp800_207": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
    "nist_sp800_63_3": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
    "nist_ai_rmf": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
    "nist_sp_800_218": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
    "nist_csf_2_0": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
    "owasp_top10": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
    "owasp_asvs": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
    "owasp_samm": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
    "owasp_api_top10": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
    "owasp_masvs": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
    "enisa_ics_scada": {"license": "CC-BY-4.0", "rule": 2, "source_type": "guideline"},
    "enisa_supply_chain": {"license": "CC-BY-4.0", "rule": 2, "source_type": "guideline"},
    "cisa_sbd": {"license": "US_GOV_PUBLIC", "rule": 1, "source_type": "guideline"},
    "bdsg": {"license": "DE_LAW", "rule": 1, "source_type": "law"},
    "gewo": {"license": "DE_LAW", "rule": 1, "source_type": "law"},
    "hgb": {"license": "DE_LAW", "rule": 1, "source_type": "law"},
    "ao": {"license": "DE_LAW", "rule": 1, "source_type": "law"},
    "oecd_ai_principles": {"license": "OECD_PUBLIC", "rule": 2, "source_type": "standard"},
}
|
||||
|
||||
# Domain detection keywords
# Lowercase substrings matched against the (lowercased) article text by
# detect_domain(); the domain with the most keyword hits wins, first-listed
# domain breaking ties. Only a fallback — the model's own "domain" answer is
# preferred in main(). Note: build_prompt offers additional codes (LAB, TRD,
# HLT) that have no keyword list here, so they can never be detected locally.
DOMAIN_KEYWORDS = {
    "AUTH": ["authentifizierung", "anmeldung", "login", "passwort", "identit", "identity", "credential"],
    "CRYP": ["verschlüsselung", "kryptogra", "encrypt", "cipher", "hash", "tls", "ssl", "signatur"],
    "NET": ["netzwerk", "network", "firewall", "router", "dns", "ip-adress"],
    "DATA": ["daten", "data", "personenbezogen", "datenschutz", "privacy", "gdpr", "dsgvo", "verarbeitung"],
    "LOG": ["protokoll", "logging", "audit", "nachvollzieh", "aufzeichn"],
    "ACC": ["zugriff", "access", "berechtigung", "autorisierung", "authorization", "rolle"],
    "SEC": ["sicherheit", "security", "schutz", "protect", "schwachstell", "vulnerab"],
    "INC": ["vorfall", "incident", "breach", "meldung", "reaktion", "response", "notfall"],
    "AI": ["künstliche intelligenz", "ki-system", "ai system", "machine learning", "algorithm", "hochrisiko-ki"],
    "COMP": ["compliance", "konformität", "audit", "zertifizierung", "regulier", "vorschrift"],
    "GOV": ["behörde", "aufsicht", "governance", "marktüberwachung", "authority"],
    "FIN": ["finanz", "zahlungs", "payment", "crypto", "krypto-", "geldwäsche", "aml"],
    "ENV": ["umwelt", "environment", "batterie", "recycling", "entsorgu", "nachhaltig"],
}
|
||||
|
||||
# ── Prompt (same as control_generator.py) ────────────────────────────

# System prompt (German): positions the model as a security-compliance expert
# and demands a raw-JSON answer (a JSON array when multiple controls result).
# Runtime string — do not edit the German wording casually.
SYSTEM_PROMPT = """Du bist ein Security-Compliance-Experte. Strukturiere den gegebenen Text
als praxisorientiertes Security Control. Erstelle eine verständliche, umsetzbare Formulierung.
Antworte NUR mit validem JSON. Bei mehreren Controls antworte mit einem JSON-Array."""
|
||||
|
||||
APPLICABILITY_PROMPT = """- applicable_industries: Liste der Branchen fuer die dieses Control relevant ist.
|
||||
Verwende ["all"] wenn der Control branchenuebergreifend gilt.
|
||||
Moegliche Werte: "all", "Technologie / IT", "IT Dienstleistungen", "E-Commerce / Handel",
|
||||
"Finanzdienstleistungen", "Versicherungen", "Gesundheitswesen", "Pharma", "Bildung",
|
||||
"Beratung / Consulting", "Marketing / Agentur", "Produktion / Industrie",
|
||||
"Logistik / Transport", "Immobilien", "Bau", "Energie", "Automobil",
|
||||
"Luft- / Raumfahrt", "Maschinenbau", "Anlagenbau", "Automatisierung", "Robotik",
|
||||
"Messtechnik", "Agrar", "Chemie", "Minen / Bergbau", "Telekommunikation",
|
||||
"Medien / Verlage", "Gastronomie / Hotellerie", "Recht / Kanzlei",
|
||||
"Oeffentlicher Dienst", "Verteidigung / Ruestung", "Wasser- / Abwasserwirtschaft",
|
||||
"Lebensmittel", "Digitale Infrastruktur", "Weltraum", "Post / Kurierdienste",
|
||||
"Abfallwirtschaft", "Forschung"
|
||||
- applicable_company_size: Ab welcher Unternehmensgroesse gilt dieses Control?
|
||||
Verwende ["all"] wenn keine Groessenbeschraenkung.
|
||||
Moegliche Werte: "all", "micro", "small", "medium", "large", "enterprise"
|
||||
- scope_conditions: null wenn keine besonderen Bedingungen, sonst:
|
||||
{"requires_any": ["signal"], "description": "Erklaerung"}
|
||||
Moegliche Signale: "uses_ai", "third_country_transfer", "processes_health_data",
|
||||
"processes_minors_data", "automated_decisions", "employee_monitoring",
|
||||
"video_surveillance", "financial_data", "is_kritis_operator", "payment_services" """
|
||||
|
||||
# Closed vocabulary of control categories offered to the model; rendered
# into the prompt via CATEGORY_LIST_STR below.
CATEGORY_LIST = [
    "Datenschutz-Grundlagen", "Betroffenenrechte", "Technische Massnahmen",
    "Organisatorische Massnahmen", "Auftragsverarbeitung", "Datentransfer",
    "Risikomanagement", "Incident Response", "KI-Regulierung", "Cybersicherheit",
    "Zugriffskontrolle", "Kryptographie", "Netzwerksicherheit", "Compliance-Management",
    "Produktsicherheit", "Marktüberwachung", "Supply Chain Security",
    "Finanzregulierung", "Arbeitsrecht", "Gewerberecht", "Handelsrecht",
    "Umwelt / Nachhaltigkeit", "Dokumentation", "Schulung / Awareness",
]
# Pre-rendered comma-separated, double-quoted list for prompt interpolation.
CATEGORY_LIST_STR = ", ".join(f'"{c}"' for c in CATEGORY_LIST)
|
||||
|
||||
|
||||
def build_prompt(source_name, article_label, article_text, license_type):
    """Build the (German) user prompt asking Claude to structure one article.

    Args:
        source_name: Human-readable regulation/standard name.
        article_label: Article/paragraph label, e.g. "Artikel 10" or "§ 42".
        article_text: Extracted article text; truncated to 3000 chars below.
        license_type: License identifier quoted to the model as permission
            to reuse the original wording.

    Returns:
        The fully interpolated prompt string (includes APPLICABILITY_PROMPT
        and CATEGORY_LIST_STR).
    """
    # Runtime f-string: the German wording is the model contract — keep intact.
    return f"""Strukturiere den folgenden Gesetzestext als Security/Compliance Control.
Du DARFST den Originaltext verwenden (Quelle: {source_name}, {license_type}).

WICHTIG: Erstelle eine verständliche, praxisorientierte Formulierung.
Der Originaltext wird separat gespeichert — deine Formulierung soll klar und umsetzbar sein.

Gib JSON zurück mit diesen Feldern:
- title: Kurzer prägnanter Titel (max 100 Zeichen)
- objective: Was soll erreicht werden? (1-3 Sätze)
- rationale: Warum ist das wichtig? (1-2 Sätze)
- requirements: Liste von konkreten Anforderungen (Strings)
- test_procedure: Liste von Prüfschritten (Strings)
- evidence: Liste von Nachweisdokumenten (Strings)
- severity: low/medium/high/critical
- tags: Liste von Tags
- domain: Fachgebiet als Kuerzel (AUTH=Authentifizierung, CRYP=Kryptographie, NET=Netzwerk, DATA=Datenschutz, LOG=Logging, ACC=Zugriffskontrolle, SEC=IT-Sicherheit, INC=Vorfallmanagement, AI=KI, COMP=Compliance, GOV=Behoerden/Verwaltung, LAB=Arbeitsrecht, FIN=Finanzregulierung, TRD=Gewerbe/Handelsrecht, ENV=Umwelt, HLT=Gesundheit)
- category: Inhaltliche Kategorie. Moegliche Werte: {CATEGORY_LIST_STR}
- target_audience: Liste der Zielgruppen (z.B. "unternehmen", "behoerden", "entwickler", "datenschutzbeauftragte", "geschaeftsfuehrung", "it-abteilung", "rechtsabteilung", "compliance-officer")
- source_article: Artikel-/Paragraphen-Referenz (z.B. "Artikel 10", "§ 42")
- source_paragraph: Absatz-Referenz (z.B. "Absatz 5", "Nr. 2")
{APPLICABILITY_PROMPT}

Text: {article_text[:3000]}
Quelle: {source_name}, {article_label}"""
|
||||
|
||||
|
||||
# ── PDF article extraction ───────────────────────────────────────────

def extract_article_text(pdf_file, article_label, doc_type, full_text=None):
    """Extract the text of a specific article/section from a source document.

    Args:
        pdf_file: Path handed to ``read_file`` when *full_text* is not given.
        article_label: Label such as "Artikel 10", "§ 42", or a NIST id.
        doc_type: "eu_regulation", "de_law", "nist", or anything else
            (generic / OWASP / ENISA handling).
        full_text: Optional pre-read document text (avoids re-reading).

    Returns:
        The article text capped at 3000 characters, or "" when not found.
    """
    if full_text is None:
        full_text = read_file(pdf_file)
    if not full_text:
        return ""

    def _numbered_slice(marker_fmt):
        """Slice from a numbered heading up to the next-numbered heading.

        marker_fmt is a regex template taking the article number, e.g.
        producing "\\nArtikel 5\\n" or "\\n§ 5".
        """
        num_match = re.search(r'\d+', article_label)
        if not num_match:
            return ""
        num = int(num_match.group())
        match = re.search(marker_fmt.format(num), full_text)
        if not match:
            return ""
        start = match.start()
        # BUGFIX: search for the NEXT heading only *after* the current one.
        # Searching the whole text could hit an earlier cross-reference to
        # article N+1, giving end < start and an empty extract.
        next_match = re.search(marker_fmt.format(num + 1), full_text[match.end():])
        if next_match:
            end = match.end() + next_match.start()
        else:
            end = min(start + 5000, len(full_text))
        return full_text[start:end].strip()[:3000]

    if doc_type == "eu_regulation":
        # EU regulations: "Artikel N" headings on their own line.
        return _numbered_slice(r'\nArtikel\s+{}\s*\n')

    elif doc_type == "de_law":
        # German laws: "§ N" headings.
        return _numbered_slice(r'\n§\s+{}\b')

    elif doc_type == "nist":
        # NIST docs: match the literal control/section label at line start.
        escaped = re.escape(article_label)
        match = re.search(rf'(?:^|\n)\s*{escaped}\b', full_text)
        if not match:
            return ""
        start = match.start()
        return full_text[start:start + 3000].strip()

    else:
        # Generic / OWASP / ENISA: label may appear anywhere on a line.
        escaped = re.escape(article_label)
        match = re.search(rf'(?:^|\n).*{escaped}\b', full_text)
        if not match:
            return ""
        start = match.start()
        return full_text[start:start + 3000].strip()
|
||||
|
||||
|
||||
# ── Anthropic API ────────────────────────────────────────────────────

def call_anthropic(prompt, system_prompt):
    """POST one message to the Anthropic Messages API.

    Returns a 4-tuple ``(parsed_data, raw_text, usage, error)``: on success
    ``error`` is None; on any failure ``parsed_data`` is None, ``raw_text``
    is "", ``usage`` is {}, and ``error`` carries a short description.
    """
    request_headers = {
        "x-api-key": ANTHROPIC_API_KEY,
        "anthropic-version": "2023-06-01",
        "content-type": "application/json",
    }
    request_body = {
        "model": ANTHROPIC_MODEL,
        "max_tokens": 4096,
        "system": system_prompt,
        "messages": [{"role": "user", "content": prompt}],
    }

    try:
        response = requests.post(
            ANTHROPIC_URL, headers=request_headers, json=request_body, timeout=120
        )
        if response.status_code != 200:
            return None, "", {}, f"HTTP {response.status_code}: {response.text[:200]}"
        body = response.json()
        blocks = body.get("content")
        raw_text = blocks[0]["text"] if blocks else ""
        # Best-effort JSON extraction; raw text is returned alongside.
        return parse_json(raw_text), raw_text, body.get("usage", {}), None
    except Exception as exc:
        # Network errors, timeouts, and malformed responses all surface here.
        return None, "", {}, str(exc)
|
||||
|
||||
|
||||
def parse_json(text):
    """Parse JSON from an LLM response, tolerating markdown code fences.

    Returns a dict (the first element when the response is a JSON array),
    or None when nothing parseable is found.
    """
    text = text.strip()
    if text.startswith("```"):
        # Drop the opening fence line, and the closing fence line if present.
        lines = text.split("\n")
        text = "\n".join(lines[1:-1] if lines[-1].strip().startswith("```") else lines[1:])
        text = text.strip()

    def _unwrap(data):
        """Mirror the contract: arrays collapse to their first element."""
        if isinstance(data, list):
            return data[0] if data else None
        return data

    try:
        return _unwrap(json.loads(text))
    except json.JSONDecodeError:
        # Fallback: extract the first embedded JSON object — or array, which
        # the old code missed, returning None for "noise [ ... ] noise".
        match = re.search(r'\{[\s\S]*\}|\[[\s\S]*\]', text)
        if match:
            try:
                return _unwrap(json.loads(match.group()))
            except json.JSONDecodeError:
                return None
        return None
|
||||
|
||||
|
||||
# ── Domain detection ─────────────────────────────────────────────────

def detect_domain(text):
    """Return the best-matching domain code for *text* by keyword counting.

    Counts how many DOMAIN_KEYWORDS substrings occur in the lowercased text
    per domain; the first domain (in dict order) with the highest count wins.
    Falls back to "SEC" when no keyword matches at all.
    """
    lowered = text.lower()
    best_domain, best_hits = "SEC", 0
    for domain, keywords in DOMAIN_KEYWORDS.items():
        hits = sum(1 for kw in keywords if kw in lowered)
        # Strict '>' keeps the earliest domain on ties, matching dict order.
        if hits > best_hits:
            best_domain, best_hits = domain, hits
    return best_domain
|
||||
|
||||
|
||||
# ── Control ID generation ────────────────────────────────────────────

def generate_control_id(domain, cur):
    """Generate the next available control_id for a domain prefix.

    Uses MAX(numeric suffix) on the DB side to find the true highest number,
    avoiding gaps from string-sorted IDs (e.g. COMP-99 > COMP-1000 in text
    sort).

    Args:
        domain: Domain code; uppercased and truncated to 4 chars as prefix.
        cur: Open DB cursor (psycopg2-style, %s placeholders).

    Returns:
        An id like "COMP-43"; "COMP-1" when the prefix is unused.
    """
    prefix = domain.upper()[:4]
    cur.execute("""
        SELECT MAX(CAST(SPLIT_PART(control_id, '-', 2) AS INTEGER))
        FROM compliance.canonical_controls
        WHERE control_id LIKE %s
          AND SPLIT_PART(control_id, '-', 2) ~ '^[0-9]+$'
    """, (f"{prefix}-%",))
    row = cur.fetchone()
    if row and row[0] is not None:
        return f"{prefix}-{row[0] + 1}"
    # BUGFIX: start unpadded ("COMP-1"), matching the unpadded increments
    # above; the old zero-padded "COMP-001" base case produced two different
    # ID formats under the same prefix.
    return f"{prefix}-1"
|
||||
|
||||
|
||||
# ── Main ─────────────────────────────────────────────────────────────

def main():
    """Run the phase-7.4 gap fill end to end.

    Steps: parse CLI flags → load the gap-analysis results → connect to
    Postgres (with TCP keepalive and an auto-reconnect helper) → for each
    gap article: extract its text from the cached PDF, ask Claude to
    structure it as a control, and insert it into
    compliance.canonical_controls. Ends with a token/cost/stats summary and
    writes new control UUIDs to /tmp/phase74_generated_ids.json.
    """
    parser = argparse.ArgumentParser(description="Phase 7.4: Generate controls for gap articles")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be generated")
    parser.add_argument("--source", type=str, help="Filter by source name substring")
    parser.add_argument("--resume", action="store_true", help="Skip articles that already have controls")
    parser.add_argument("--results", default=GAP_RESULTS_FILE, help="Path to gap_analysis_results.json")
    args = parser.parse_args()

    # Fail fast without credentials (checked even for --dry-run).
    if not ANTHROPIC_API_KEY:
        print("ERROR: Set ANTHROPIC_API_KEY")
        sys.exit(1)

    # Load gap results produced by the gap-analysis phase.
    with open(args.results) as f:
        gaps = json.load(f)
    total_gaps = sum(len(g["gap_articles"]) for g in gaps)
    print(f"Loaded {len(gaps)} sources with {total_gaps} gap articles")

    # Optional case-insensitive substring filter on the source name.
    if args.source:
        gaps = [g for g in gaps if args.source.lower() in g["source"].lower()]
        total_gaps = sum(len(g["gap_articles"]) for g in gaps)
        print(f"Filtered to {len(gaps)} sources, {total_gaps} gaps")

    # DB connection with keepalive + reconnect helper
    db_url = os.environ['DATABASE_URL']
    parsed = urllib.parse.urlparse(db_url)

    def connect_db():
        """Create DB connection with TCP keepalive; returns (conn, cursor)."""
        c = psycopg2.connect(
            host=parsed.hostname, port=parsed.port or 5432,
            user=parsed.username, password=parsed.password,
            dbname=parsed.path.lstrip('/'),
            options="-c search_path=compliance,public",
            keepalives=1, keepalives_idle=30,
            keepalives_interval=10, keepalives_count=5,
        )
        return c, c.cursor()

    conn, cur = connect_db()

    def ensure_db():
        """Reconnect if connection is dead; True when a reconnect happened."""
        nonlocal conn, cur
        try:
            # Cheap liveness probe.
            cur.execute("SELECT 1")
        except Exception:
            print(" [RECONNECT] DB connection lost, reconnecting...")
            try:
                conn.close()
            except Exception:
                pass
            conn, cur = connect_db()
            return True
        return False

    # Get framework UUID — all inserted controls hang off bp_security_v1.
    cur.execute("SELECT id FROM compliance.canonical_control_frameworks WHERE framework_id = 'bp_security_v1' LIMIT 1")
    fw_row = cur.fetchone()
    if not fw_row:
        print("ERROR: Framework bp_security_v1 not found")
        sys.exit(1)
    framework_uuid = fw_row[0]

    # If resuming, load existing (source, article) pairs to skip.
    # NOTE(review): resume matching compares the gap label against the stored
    # citation 'article', which is the model's source_article when given and
    # may differ from the label — confirm this matches in practice.
    existing_articles = {}
    if args.resume:
        cur.execute("""
            SELECT source_citation->>'source', source_citation->>'article'
            FROM compliance.canonical_controls
            WHERE source_citation->>'article' IS NOT NULL
        """)
        for src, art in cur.fetchall():
            existing_articles.setdefault(src, set)().add(art) if False else existing_articles.setdefault(src, set()).add(art)
        print(f"Resume mode: {sum(len(v) for v in existing_articles.values())} existing article-control pairs")

    # Stats
    stats = Counter()
    total_input_tokens = 0
    total_output_tokens = 0
    generated_ids = []
    errors = []
    t_start = time.time()

    # Pre-read PDFs (cache full text per source)
    pdf_cache = {}

    # Process sources with the most gap articles first.
    for gap_source in sorted(gaps, key=lambda g: -len(g["gap_articles"])):
        source_name = gap_source["source"]
        gap_articles = gap_source["gap_articles"]
        filename = SOURCE_FILE_MAP.get(source_name)
        reg_code = SOURCE_TO_REGCODE.get(source_name, "unknown")
        license_info = LICENSE_MAP.get(reg_code, {"license": "UNKNOWN", "rule": 1, "source_type": "unknown"})
        doc_type = classify_doc(source_name)

        # No mapped PDF file → cannot extract anything for this source.
        if not filename:
            stats["skipped_no_pdf"] += len(gap_articles)
            continue

        # Read PDF once per source
        if source_name not in pdf_cache:
            pdf_cache[source_name] = read_file(filename)
        full_text = pdf_cache[source_name]
        if not full_text:
            stats["skipped_no_pdf"] += len(gap_articles)
            continue

        print(f"\n{'='*70}")
        print(f"{source_name} — {len(gap_articles)} gaps (rule {license_info['rule']}, {doc_type})")
        print(f"{'='*70}")

        for gap in gap_articles:
            article_label = gap["label"]
            article_type = gap["type"]

            # Skip if already has controls (resume mode)
            if args.resume and article_label in existing_articles.get(source_name, set()):
                stats["skipped_exists"] += 1
                continue

            # Skip non-substantive NIST sections (intro chapters)
            if doc_type == "nist" and article_type == "section":
                section_match = re.match(r'Section (\d+)', article_label)
                if section_match and int(section_match.group(1)) <= 3:
                    stats["skipped_intro"] += 1
                    continue

            # Extract article text; very short extracts are treated as misses.
            article_text = extract_article_text(filename, article_label, doc_type, full_text)
            if not article_text or len(article_text) < 30:
                stats["skipped_short_text"] += 1
                print(f" SKIP {article_label}: text too short ({len(article_text)} chars)")
                continue

            if args.dry_run:
                print(f" [DRY] {article_label} ({len(article_text)} chars)")
                stats["would_generate"] += 1
                continue

            # Call Anthropic
            prompt = build_prompt(source_name, article_label, article_text, license_info["license"])
            data, raw, usage, error = call_anthropic(prompt, SYSTEM_PROMPT)

            # Token totals feed the cost estimate in the summary.
            total_input_tokens += usage.get("input_tokens", 0)
            total_output_tokens += usage.get("output_tokens", 0)

            if error:
                stats["api_error"] += 1
                errors.append(f"{source_name} {article_label}: {error}")
                print(f" ERROR {article_label}: {error}")
                # Back off harder after an API error before continuing.
                time.sleep(5)
                continue

            if not data:
                stats["parse_error"] += 1
                print(f" PARSE ERROR {article_label}")
                continue

            # Ensure DB is alive before writing
            ensure_db()

            # Build control fields; coerce defensively, as the LLM may return
            # scalars where lists are expected.
            title = str(data.get("title", ""))[:200]
            objective = str(data.get("objective", ""))
            rationale = str(data.get("rationale", ""))
            domain = str(data.get("domain", detect_domain(article_text))).upper()[:4]
            if not domain or len(domain) < 2:
                # Model gave no usable domain code — fall back to keywords.
                domain = detect_domain(article_text)

            control_id = generate_control_id(domain, cur)
            severity = str(data.get("severity", "medium")).lower()
            if severity not in ("low", "medium", "high", "critical"):
                severity = "medium"

            requirements = data.get("requirements", [])
            if not isinstance(requirements, list):
                requirements = [str(requirements)]
            test_procedure = data.get("test_procedure", [])
            if not isinstance(test_procedure, list):
                test_procedure = [str(test_procedure)]
            evidence = data.get("evidence", [])
            if not isinstance(evidence, list):
                evidence = [str(evidence)]
            tags = data.get("tags", [])
            if not isinstance(tags, list):
                tags = []
            target_audience = data.get("target_audience", [])
            if not isinstance(target_audience, list):
                target_audience = []
            applicable_industries = data.get("applicable_industries", ["all"])
            if not isinstance(applicable_industries, list):
                applicable_industries = ["all"]
            applicable_company_size = data.get("applicable_company_size", ["all"])
            if not isinstance(applicable_company_size, list):
                applicable_company_size = ["all"]
            scope_conditions = data.get("scope_conditions")

            # Citation stored as JSONB; resume mode reads 'article' back.
            source_citation = {
                "source": source_name,
                "article": data.get("source_article", article_label),
                "paragraph": data.get("source_paragraph", ""),
                "article_type": article_type,
                "license": license_info["license"],
                "source_type": license_info["source_type"],
            }

            generation_metadata = {
                "processing_path": "phase74_gap_fill",
                "license_rule": license_info["rule"],
                "source_regulation": reg_code,
                "source_article": article_label,
                "gap_fill": True,
            }

            category = str(data.get("category", "")) or None

            # Insert into DB; ON CONFLICT keeps reruns idempotent per
            # (framework_id, control_id).
            try:
                cur.execute("""
                    INSERT INTO compliance.canonical_controls (
                        framework_id, control_id, title, objective, rationale,
                        scope, requirements, test_procedure, evidence,
                        severity, risk_score, implementation_effort,
                        open_anchors, release_state, tags,
                        license_rule, source_original_text, source_citation,
                        customer_visible, generation_metadata,
                        verification_method, category, generation_strategy,
                        target_audience, pipeline_version,
                        applicable_industries, applicable_company_size, scope_conditions
                    ) VALUES (
                        %s, %s, %s, %s, %s,
                        %s, %s, %s, %s,
                        %s, %s, %s,
                        %s, %s, %s,
                        %s, %s, %s,
                        %s, %s,
                        %s, %s, %s,
                        %s, %s,
                        %s, %s, %s
                    )
                    ON CONFLICT (framework_id, control_id) DO NOTHING
                    RETURNING id
                """, (
                    framework_uuid, control_id, title, objective, rationale,
                    json.dumps({}), json.dumps(requirements), json.dumps(test_procedure), json.dumps(evidence),
                    severity, 5, "m",
                    json.dumps([]), "draft", json.dumps(tags),
                    license_info["rule"], article_text, json.dumps(source_citation),
                    True, json.dumps(generation_metadata),
                    "document", category, "phase74_gap_fill",
                    json.dumps(target_audience), PIPELINE_VERSION,
                    json.dumps(applicable_industries), json.dumps(applicable_company_size),
                    json.dumps(scope_conditions) if scope_conditions else None,
                ))
                conn.commit()
                # NOTE(review): fetchone() after commit() relies on psycopg2
                # buffering the RETURNING row client-side; fetching before
                # commit would be the conventional order — confirm intended.
                row = cur.fetchone()
                if row:
                    generated_ids.append(str(row[0]))
                    stats["generated"] += 1
                    print(f" OK {control_id}: {title[:60]}")
                else:
                    # No RETURNING row → the ON CONFLICT clause fired.
                    stats["conflict"] += 1
                    print(f" CONFLICT {control_id} (already exists)")
            except Exception as e:
                conn.rollback()
                stats["db_error"] += 1
                errors.append(f"DB {control_id}: {str(e)[:100]}")
                print(f" DB ERROR {control_id}: {str(e)[:100]}")

            # Rate limit: ~0.5s between calls
            time.sleep(0.5)

    # ── Summary ──────────────────────────────────────────────────────
    elapsed = time.time() - t_start
    # Hard-coded rates: $3 per 1M input tokens, $15 per 1M output tokens.
    cost = (total_input_tokens * 3 + total_output_tokens * 15) / 1_000_000

    print(f"\n\n{'='*70}")
    print(f"PHASE 7.4 — {'DRY-RUN' if args.dry_run else 'ERGEBNIS'}")
    print(f"{'='*70}")
    print(f" Laufzeit: {elapsed/60:.1f} min")
    print(f" API-Kosten: ${cost:.2f}")
    print(f" Input Tokens: {total_input_tokens:,}")
    print(f" Output Tokens: {total_output_tokens:,}")
    print()
    for key in sorted(stats.keys()):
        print(f" {key:<25s}: {stats[key]:5d}")
    print()

    if generated_ids:
        print(f" Neue Control-IDs: {len(generated_ids)}")
        # Save generated IDs for downstream phases (e.g. dedup review).
        with open("/tmp/phase74_generated_ids.json", 'w') as f:
            json.dump(generated_ids, f)
        print(f" IDs gespeichert: /tmp/phase74_generated_ids.json")

    if errors:
        print(f"\n Fehler ({len(errors)}):")
        for e in errors[:20]:
            print(f" {e}")
        if len(errors) > 20:
            print(f" ... und {len(errors)-20} weitere")

    conn.close()
|
||||
|
||||
|
||||
# Script entry point.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user