Files
breakpilot-compliance/scripts/qa/phase74_generate_gap_controls.py
Benjamin Admin 643b26618f
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 31s
CI/CD / test-python-backend-compliance (push) Successful in 1m35s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 17s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
feat: Control Library UI, dedup migration, QA tooling, docs
- Control Library: parent control display, ObligationTypeBadge,
  GenerationStrategyBadge variants, evidence string fallback
- API: expose parent_control_uuid/id/title in canonical controls
- Fix: DSFA SQLAlchemy 2.0 Row._mapping compatibility
- Migration 074: control_parent_links + control_dedup_reviews tables
- QA scripts: benchmark, gap analysis, OSCAL import, OWASP cleanup,
  phase5 normalize, phase74 gap fill, sync_db, run_job
- Docs: dedup engine, RAG benchmark, lessons learned, pipeline docs

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-21 11:56:08 +01:00

656 lines
29 KiB
Python

#!/usr/bin/env python3
"""
Phase 7.4: Generate new controls for gap articles via Anthropic Claude Sonnet.
Reads gap_analysis_results.json, extracts article text from PDFs,
calls Claude Sonnet to generate controls, inserts into DB.
Usage:
python3 phase74_generate_gap_controls.py --dry-run # show what would be generated
python3 phase74_generate_gap_controls.py # generate and insert
python3 phase74_generate_gap_controls.py --source "DSGVO" # filter by source
python3 phase74_generate_gap_controls.py --resume # skip already-generated articles
"""
import os
import sys
import json
import re
import time
import hashlib
import argparse
import psycopg2
import urllib.parse
import requests
from pathlib import Path
from collections import Counter
sys.path.insert(0, os.path.dirname(__file__))
from pdf_qa_all import (
SOURCE_FILE_MAP, read_file, classify_doc, normalize,
build_eu_article_index, build_de_law_index, build_nist_index,
build_owasp_index, build_generic_index, MAX_ARTICLES,
)
# ── Config ──────────────────────────────────────────────────────────
ANTHROPIC_URL = "https://api.anthropic.com/v1/messages"
# Model is overridable via env so a different Claude model can be swapped in
# without a code change.
ANTHROPIC_MODEL = os.environ.get("CONTROL_GEN_ANTHROPIC_MODEL", "claude-sonnet-4-6")
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
# Stamped into each inserted control row (pipeline_version column).
PIPELINE_VERSION = 5
# Default output of the preceding gap-analysis phase; overridable via --results.
GAP_RESULTS_FILE = "/tmp/gap_analysis_results.json"
PDF_DIR = Path(os.path.expanduser("~/rag-ingestion/pdfs"))
try:
    import fitz  # PyMuPDF, optional. NOTE(review): not referenced in this file's visible code — confirm it is still needed.
except ImportError:
    fitz = None
# ── Source name → regulation_code reverse map ────────────────────────
# Built from REGULATION_LICENSE_MAP in control_generator.py.
# Keys are the human-readable source names used in gap_analysis_results.json;
# values are the regulation codes used as keys into LICENSE_MAP below and
# stored in generation_metadata["source_regulation"].
SOURCE_TO_REGCODE = {
    "DSGVO (EU) 2016/679": "eu_2016_679",
    "KI-Verordnung (EU) 2024/1689": "eu_2024_1689",
    "NIS2-Richtlinie (EU) 2022/2555": "eu_2022_2555",
    "Cyber Resilience Act (CRA)": "eu_2024_2847",
    "Maschinenverordnung (EU) 2023/1230": "eu_2023_1230",
    "EU Blue Guide 2022": "eu_blue_guide_2022",
    "Markets in Crypto-Assets (MiCA)": "mica",
    "Batterieverordnung (EU) 2023/1542": "eu_2023_1542",
    "AML-Verordnung": "amlr",
    "Data Governance Act (DGA)": "dga",
    "Data Act": "data_act",
    "GPSR (EU) 2023/988": "gpsr",
    "IFRS-Übernahmeverordnung": "ifrs",
    "NIST SP 800-53 Rev. 5": "nist_sp800_53r5",
    "NIST SP 800-207 (Zero Trust)": "nist_sp800_207",
    "NIST SP 800-63-3": "nist_sp800_63_3",
    "NIST AI Risk Management Framework": "nist_ai_rmf",
    "NIST SP 800-218 (SSDF)": "nist_sp_800_218",
    "NIST Cybersecurity Framework 2.0": "nist_csf_2_0",
    "OWASP Top 10 (2021)": "owasp_top10",
    "OWASP ASVS 4.0": "owasp_asvs",
    "OWASP SAMM 2.0": "owasp_samm",
    "OWASP API Security Top 10 (2023)": "owasp_api_top10",
    "OWASP MASVS 2.0": "owasp_masvs",
    "ENISA ICS/SCADA Dependencies": "enisa_ics_scada",
    "ENISA Supply Chain Good Practices": "enisa_supply_chain",
    "CISA Secure by Design": "cisa_sbd",
    "Bundesdatenschutzgesetz (BDSG)": "bdsg",
    "Gewerbeordnung (GewO)": "gewo",
    "Handelsgesetzbuch (HGB)": "hgb",
    "Abgabenordnung (AO)": "ao",
    "OECD KI-Empfehlung": "oecd_ai_principles",
}
# License info per regulation code (from REGULATION_LICENSE_MAP).
# - license: license identifier copied into each control's source_citation.
# - rule: license-handling rule code stored as license_rule on the row.
#   NOTE(review): exact semantics of 1 vs 2 are defined in
#   REGULATION_LICENSE_MAP in control_generator.py — confirm there.
# - source_type: law / standard / guideline, copied into source_citation.
LICENSE_MAP = {
    "eu_2016_679": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "eu_2024_1689": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "eu_2022_2555": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "eu_2024_2847": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "eu_2023_1230": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "eu_blue_guide_2022": {"license": "EU_PUBLIC", "rule": 1, "source_type": "guideline"},
    "mica": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "eu_2023_1542": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "amlr": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "dga": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "data_act": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "gpsr": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "ifrs": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "nist_sp800_53r5": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
    "nist_sp800_207": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
    "nist_sp800_63_3": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
    "nist_ai_rmf": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
    "nist_sp_800_218": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
    "nist_csf_2_0": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
    "owasp_top10": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
    "owasp_asvs": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
    "owasp_samm": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
    "owasp_api_top10": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
    "owasp_masvs": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
    "enisa_ics_scada": {"license": "CC-BY-4.0", "rule": 2, "source_type": "guideline"},
    "enisa_supply_chain": {"license": "CC-BY-4.0", "rule": 2, "source_type": "guideline"},
    "cisa_sbd": {"license": "US_GOV_PUBLIC", "rule": 1, "source_type": "guideline"},
    "bdsg": {"license": "DE_LAW", "rule": 1, "source_type": "law"},
    "gewo": {"license": "DE_LAW", "rule": 1, "source_type": "law"},
    "hgb": {"license": "DE_LAW", "rule": 1, "source_type": "law"},
    "ao": {"license": "DE_LAW", "rule": 1, "source_type": "law"},
    "oecd_ai_principles": {"license": "OECD_PUBLIC", "rule": 2, "source_type": "standard"},
}
# Domain detection keywords (fallback when the LLM returns no usable domain).
# Each value is a list of lowercase substrings matched against the lowercased
# article text by detect_domain(); the domain with the most hits wins, with
# ties going to the domain listed first here.
DOMAIN_KEYWORDS = {
    "AUTH": ["authentifizierung", "anmeldung", "login", "passwort", "identit", "identity", "credential"],
    "CRYP": ["verschlüsselung", "kryptogra", "encrypt", "cipher", "hash", "tls", "ssl", "signatur"],
    "NET": ["netzwerk", "network", "firewall", "router", "dns", "ip-adress"],
    "DATA": ["daten", "data", "personenbezogen", "datenschutz", "privacy", "gdpr", "dsgvo", "verarbeitung"],
    "LOG": ["protokoll", "logging", "audit", "nachvollzieh", "aufzeichn"],
    "ACC": ["zugriff", "access", "berechtigung", "autorisierung", "authorization", "rolle"],
    "SEC": ["sicherheit", "security", "schutz", "protect", "schwachstell", "vulnerab"],
    "INC": ["vorfall", "incident", "breach", "meldung", "reaktion", "response", "notfall"],
    "AI": ["künstliche intelligenz", "ki-system", "ai system", "machine learning", "algorithm", "hochrisiko-ki"],
    "COMP": ["compliance", "konformität", "audit", "zertifizierung", "regulier", "vorschrift"],
    "GOV": ["behörde", "aufsicht", "governance", "marktüberwachung", "authority"],
    "FIN": ["finanz", "zahlungs", "payment", "crypto", "krypto-", "geldwäsche", "aml"],
    "ENV": ["umwelt", "environment", "batterie", "recycling", "entsorgu", "nachhaltig"],
}
# ── Prompt (same as control_generator.py) ────────────────────────────
# System prompt: forces a JSON-only answer; a JSON array is tolerated
# (parse_json() keeps only the first element).
SYSTEM_PROMPT = """Du bist ein Security-Compliance-Experte. Strukturiere den gegebenen Text
als praxisorientiertes Security Control. Erstelle eine verständliche, umsetzbare Formulierung.
Antworte NUR mit validem JSON. Bei mehreren Controls antworte mit einem JSON-Array."""
# Prompt fragment describing the applicability fields the model must emit
# (industries, company size, scope-condition signals). Appended verbatim to
# every user prompt by build_prompt().
APPLICABILITY_PROMPT = """- applicable_industries: Liste der Branchen fuer die dieses Control relevant ist.
Verwende ["all"] wenn der Control branchenuebergreifend gilt.
Moegliche Werte: "all", "Technologie / IT", "IT Dienstleistungen", "E-Commerce / Handel",
"Finanzdienstleistungen", "Versicherungen", "Gesundheitswesen", "Pharma", "Bildung",
"Beratung / Consulting", "Marketing / Agentur", "Produktion / Industrie",
"Logistik / Transport", "Immobilien", "Bau", "Energie", "Automobil",
"Luft- / Raumfahrt", "Maschinenbau", "Anlagenbau", "Automatisierung", "Robotik",
"Messtechnik", "Agrar", "Chemie", "Minen / Bergbau", "Telekommunikation",
"Medien / Verlage", "Gastronomie / Hotellerie", "Recht / Kanzlei",
"Oeffentlicher Dienst", "Verteidigung / Ruestung", "Wasser- / Abwasserwirtschaft",
"Lebensmittel", "Digitale Infrastruktur", "Weltraum", "Post / Kurierdienste",
"Abfallwirtschaft", "Forschung"
- applicable_company_size: Ab welcher Unternehmensgroesse gilt dieses Control?
Verwende ["all"] wenn keine Groessenbeschraenkung.
Moegliche Werte: "all", "micro", "small", "medium", "large", "enterprise"
- scope_conditions: null wenn keine besonderen Bedingungen, sonst:
{"requires_any": ["signal"], "description": "Erklaerung"}
Moegliche Signale: "uses_ai", "third_country_transfer", "processes_health_data",
"processes_minors_data", "automated_decisions", "employee_monitoring",
"video_surveillance", "financial_data", "is_kritis_operator", "payment_services" """
# Closed list of allowed control categories; rendered into the prompt so the
# model chooses from known values (stored in the controls' category column).
CATEGORY_LIST = [
    "Datenschutz-Grundlagen", "Betroffenenrechte", "Technische Massnahmen",
    "Organisatorische Massnahmen", "Auftragsverarbeitung", "Datentransfer",
    "Risikomanagement", "Incident Response", "KI-Regulierung", "Cybersicherheit",
    "Zugriffskontrolle", "Kryptographie", "Netzwerksicherheit", "Compliance-Management",
    "Produktsicherheit", "Marktüberwachung", "Supply Chain Security",
    "Finanzregulierung", "Arbeitsrecht", "Gewerberecht", "Handelsrecht",
    "Umwelt / Nachhaltigkeit", "Dokumentation", "Schulung / Awareness",
]
CATEGORY_LIST_STR = ", ".join(f'"{c}"' for c in CATEGORY_LIST)
def build_prompt(source_name, article_label, article_text, license_type):
    """Assemble the user prompt for one gap article.

    article_text is truncated to 3000 characters to bound token usage;
    license_type is surfaced so the model knows verbatim reuse is allowed.
    Returns the complete prompt string sent to call_anthropic().
    """
    return f"""Strukturiere den folgenden Gesetzestext als Security/Compliance Control.
Du DARFST den Originaltext verwenden (Quelle: {source_name}, {license_type}).
WICHTIG: Erstelle eine verständliche, praxisorientierte Formulierung.
Der Originaltext wird separat gespeichert — deine Formulierung soll klar und umsetzbar sein.
Gib JSON zurück mit diesen Feldern:
- title: Kurzer prägnanter Titel (max 100 Zeichen)
- objective: Was soll erreicht werden? (1-3 Sätze)
- rationale: Warum ist das wichtig? (1-2 Sätze)
- requirements: Liste von konkreten Anforderungen (Strings)
- test_procedure: Liste von Prüfschritten (Strings)
- evidence: Liste von Nachweisdokumenten (Strings)
- severity: low/medium/high/critical
- tags: Liste von Tags
- domain: Fachgebiet als Kuerzel (AUTH=Authentifizierung, CRYP=Kryptographie, NET=Netzwerk, DATA=Datenschutz, LOG=Logging, ACC=Zugriffskontrolle, SEC=IT-Sicherheit, INC=Vorfallmanagement, AI=KI, COMP=Compliance, GOV=Behoerden/Verwaltung, LAB=Arbeitsrecht, FIN=Finanzregulierung, TRD=Gewerbe/Handelsrecht, ENV=Umwelt, HLT=Gesundheit)
- category: Inhaltliche Kategorie. Moegliche Werte: {CATEGORY_LIST_STR}
- target_audience: Liste der Zielgruppen (z.B. "unternehmen", "behoerden", "entwickler", "datenschutzbeauftragte", "geschaeftsfuehrung", "it-abteilung", "rechtsabteilung", "compliance-officer")
- source_article: Artikel-/Paragraphen-Referenz (z.B. "Artikel 10", "§ 42")
- source_paragraph: Absatz-Referenz (z.B. "Absatz 5", "Nr. 2")
{APPLICABILITY_PROMPT}
Text: {article_text[:3000]}
Quelle: {source_name}, {article_label}"""
# ── PDF article extraction ───────────────────────────────────────────
def extract_article_text(pdf_file, article_label, doc_type, full_text=None):
    """Extract the text of a specific article/section from a document.

    Args:
        pdf_file: path handed to read_file() when full_text is not supplied.
        article_label: human label such as "Artikel 10", "§ 42" or a NIST
            control ID; for law documents only its first number is used.
        doc_type: "eu_regulation", "de_law", "nist", or anything else
            (generic / OWASP / ENISA handling).
        full_text: optional pre-read document text (per-source cache).

    Returns:
        The article body, stripped and capped at 3000 characters, or ""
        when the article cannot be located.
    """
    if full_text is None:
        full_text = read_file(pdf_file)
    if not full_text:
        return ""
    if doc_type == "eu_regulation":
        art_num_match = re.search(r'\d+', article_label)
        if not art_num_match:
            return ""
        num = int(art_num_match.group())
        match = re.search(rf'\nArtikel\s+{num}\s*\n', full_text)
        if not match:
            return ""
        start = match.start()
        # Look for the NEXT article only after the current one, so an earlier
        # occurrence (e.g. in a table of contents) cannot produce an empty or
        # inverted slice.
        next_match = re.search(rf'\nArtikel\s+{num + 1}\s*\n', full_text[start + 1:])
        end = start + 1 + next_match.start() if next_match else min(start + 5000, len(full_text))
        return full_text[start:end].strip()[:3000]
    elif doc_type == "de_law":
        para_match = re.search(r'\d+', article_label)
        if not para_match:
            return ""
        num = int(para_match.group())
        # BUGFIX: the pattern previously searched for a literal backslash
        # followed by "s" (rf'\\s+{num}'), which never occurs in statute text,
        # so German-law extraction always returned "". German statutes mark
        # sections with the section sign: "§ <num>".
        match = re.search(rf'§\s*{num}\b', full_text)
        if not match:
            return ""
        start = match.start()
        # Same forward-only search as above for the following section.
        next_match = re.search(rf'§\s*{num + 1}\b', full_text[start + 1:])
        end = start + 1 + next_match.start() if next_match else min(start + 5000, len(full_text))
        return full_text[start:end].strip()[:3000]
    elif doc_type == "nist":
        # NIST labels are exact identifiers; anchor at a line start.
        escaped = re.escape(article_label)
        match = re.search(rf'(?:^|\n)\s*{escaped}\b', full_text)
        if not match:
            return ""
        start = match.start()
        return full_text[start:start + 3000].strip()
    else:
        # Generic / OWASP / ENISA: take text from the first line that
        # mentions the label anywhere.
        escaped = re.escape(article_label)
        match = re.search(rf'(?:^|\n).*{escaped}\b', full_text)
        if not match:
            return ""
        start = match.start()
        return full_text[start:start + 3000].strip()
# ── Anthropic API ────────────────────────────────────────────────────
def call_anthropic(prompt, system_prompt):
    """Send one message to the Anthropic Messages API.

    Returns a 4-tuple (parsed_data, raw_text, usage, error). On success the
    last element is None; on any HTTP, network, or decoding failure the first
    element is None and error carries a short description.
    """
    request_headers = {
        "x-api-key": ANTHROPIC_API_KEY,
        "anthropic-version": "2023-06-01",
        "content-type": "application/json",
    }
    request_body = {
        "model": ANTHROPIC_MODEL,
        "max_tokens": 4096,
        "system": system_prompt,
        "messages": [{"role": "user", "content": prompt}],
    }
    try:
        response = requests.post(ANTHROPIC_URL, headers=request_headers, json=request_body, timeout=120)
        if response.status_code != 200:
            # Truncate the body so a long HTML error page stays readable.
            return None, "", {}, f"HTTP {response.status_code}: {response.text[:200]}"
        body = response.json()
        raw_text = body["content"][0]["text"] if body.get("content") else ""
        return parse_json(raw_text), raw_text, body.get("usage", {}), None
    except Exception as exc:
        # Network errors, JSON decode failures and unexpected payload shapes
        # all surface as an error tuple rather than an exception.
        return None, "", {}, str(exc)
def parse_json(text):
    """Parse a JSON object out of an LLM response.

    Strips a surrounding markdown code fence when present, then attempts a
    strict parse. A JSON array yields its first element (or None when empty).
    As a last resort the first {...} span in the text is extracted and parsed.
    Returns None when nothing parses.
    """
    text = text.strip()
    if text.startswith("```"):
        fence_lines = text.split("\n")
        # Drop the opening fence line, and the closing one if it exists.
        if fence_lines[-1].strip().startswith("```"):
            text = "\n".join(fence_lines[1:-1])
        else:
            text = "\n".join(fence_lines[1:])
        text = text.strip()
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        # Salvage attempt: grab the outermost brace-delimited span.
        brace_span = re.search(r'\{[\s\S]*\}', text)
        if brace_span is None:
            return None
        try:
            return json.loads(brace_span.group())
        except json.JSONDecodeError:
            return None
    if isinstance(parsed, list):
        return parsed[0] if parsed else None
    return parsed
# ── Domain detection ─────────────────────────────────────────────────
def detect_domain(text):
    """Pick the domain code whose keyword list has the most hits in *text*.

    Matching is case-insensitive substring containment against
    DOMAIN_KEYWORDS. Ties go to the domain listed first; when no keyword
    matches at all, "SEC" is returned as the fallback.
    """
    haystack = text.lower()
    best_domain, best_hits = "SEC", 0
    for code, keywords in DOMAIN_KEYWORDS.items():
        hits = sum(1 for kw in keywords if kw in haystack)
        # Strict '>' keeps the first domain on a tie, matching dict order.
        if hits > best_hits:
            best_domain, best_hits = code, hits
    return best_domain
# ── Control ID generation ────────────────────────────────────────────
def generate_control_id(domain, cur):
    """Allocate the next free control_id for a domain prefix.

    The numeric suffix is computed as MAX(int(suffix)) over existing rows so
    that string-sorted gaps don't matter (e.g. 'COMP-99' sorts after
    'COMP-1000' as text). When the prefix has no numeric IDs yet, the series
    starts at "<PREFIX>-001".
    """
    prefix = domain.upper()[:4]
    cur.execute("""
        SELECT MAX(CAST(SPLIT_PART(control_id, '-', 2) AS INTEGER))
        FROM compliance.canonical_controls
        WHERE control_id LIKE %s
        AND SPLIT_PART(control_id, '-', 2) ~ '^[0-9]+$'
    """, (f"{prefix}-%",))
    row = cur.fetchone()
    highest = row[0] if row else None
    if highest is None:
        return f"{prefix}-001"
    return f"{prefix}-{highest + 1}"
# ── Main ─────────────────────────────────────────────────────────────
def main():
    """Phase 7.4 driver: fill control gaps found by the gap-analysis phase.

    Loads gap_analysis_results.json and, for every gap article: extracts the
    article text from the cached source document, asks Claude to draft a
    control, and inserts it into compliance.canonical_controls. Supports
    --dry-run (no API/DB writes), --source substring filtering, --resume
    (skip articles that already have controls) and --results (input path).
    Exits with status 1 when the API key or target framework is missing.
    """
    parser = argparse.ArgumentParser(description="Phase 7.4: Generate controls for gap articles")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be generated")
    parser.add_argument("--source", type=str, help="Filter by source name substring")
    parser.add_argument("--resume", action="store_true", help="Skip articles that already have controls")
    parser.add_argument("--results", default=GAP_RESULTS_FILE, help="Path to gap_analysis_results.json")
    args = parser.parse_args()
    if not ANTHROPIC_API_KEY:
        print("ERROR: Set ANTHROPIC_API_KEY")
        sys.exit(1)
    # Load gap results
    with open(args.results) as f:
        gaps = json.load(f)
    total_gaps = sum(len(g["gap_articles"]) for g in gaps)
    print(f"Loaded {len(gaps)} sources with {total_gaps} gap articles")
    if args.source:
        # Case-insensitive substring filter on the source name.
        gaps = [g for g in gaps if args.source.lower() in g["source"].lower()]
        total_gaps = sum(len(g["gap_articles"]) for g in gaps)
        print(f"Filtered to {len(gaps)} sources, {total_gaps} gaps")
    # DB connection with keepalive + reconnect helper
    db_url = os.environ['DATABASE_URL']
    parsed = urllib.parse.urlparse(db_url)
    def connect_db():
        """Create DB connection with TCP keepalive."""
        # Keepalives guard against the connection silently dying during the
        # long-running generation loop (each article costs an API round-trip).
        c = psycopg2.connect(
            host=parsed.hostname, port=parsed.port or 5432,
            user=parsed.username, password=parsed.password,
            dbname=parsed.path.lstrip('/'),
            options="-c search_path=compliance,public",
            keepalives=1, keepalives_idle=30,
            keepalives_interval=10, keepalives_count=5,
        )
        return c, c.cursor()
    conn, cur = connect_db()
    def ensure_db():
        """Reconnect if connection is dead. Returns True when it reconnected."""
        nonlocal conn, cur
        try:
            cur.execute("SELECT 1")
        except Exception:
            print(" [RECONNECT] DB connection lost, reconnecting...")
            try:
                conn.close()
            except Exception:
                pass
            conn, cur = connect_db()
            return True
        return False
    # Get framework UUID — all generated controls hang off this framework.
    cur.execute("SELECT id FROM compliance.canonical_control_frameworks WHERE framework_id = 'bp_security_v1' LIMIT 1")
    fw_row = cur.fetchone()
    if not fw_row:
        print("ERROR: Framework bp_security_v1 not found")
        sys.exit(1)
    framework_uuid = fw_row[0]
    # If resuming, load existing (source, article) pairs so those articles
    # can be skipped below.
    existing_articles = {}
    if args.resume:
        cur.execute("""
            SELECT source_citation->>'source', source_citation->>'article'
            FROM compliance.canonical_controls
            WHERE source_citation->>'article' IS NOT NULL
        """)
        for src, art in cur.fetchall():
            existing_articles.setdefault(src, set()).add(art)
        print(f"Resume mode: {sum(len(v) for v in existing_articles.values())} existing article-control pairs")
    # Stats
    stats = Counter()
    total_input_tokens = 0
    total_output_tokens = 0
    generated_ids = []
    errors = []
    t_start = time.time()
    # Pre-read PDFs (cache full text per source). Sources with the most
    # gaps are processed first.
    pdf_cache = {}
    for gap_source in sorted(gaps, key=lambda g: -len(g["gap_articles"])):
        source_name = gap_source["source"]
        gap_articles = gap_source["gap_articles"]
        filename = SOURCE_FILE_MAP.get(source_name)
        reg_code = SOURCE_TO_REGCODE.get(source_name, "unknown")
        license_info = LICENSE_MAP.get(reg_code, {"license": "UNKNOWN", "rule": 1, "source_type": "unknown"})
        doc_type = classify_doc(source_name)
        if not filename:
            stats["skipped_no_pdf"] += len(gap_articles)
            continue
        # Read PDF once per source
        if source_name not in pdf_cache:
            pdf_cache[source_name] = read_file(filename)
        full_text = pdf_cache[source_name]
        if not full_text:
            stats["skipped_no_pdf"] += len(gap_articles)
            continue
        print(f"\n{'='*70}")
        # NOTE(review): the f-string below has no separator between the
        # source name and the gap count — possibly a character lost in
        # editing; output reads e.g. "DSGVO (EU) 2016/67937 gaps".
        print(f"{source_name}{len(gap_articles)} gaps (rule {license_info['rule']}, {doc_type})")
        print(f"{'='*70}")
        for gap in gap_articles:
            article_label = gap["label"]
            article_type = gap["type"]
            # Skip if already has controls (resume mode)
            if args.resume and article_label in existing_articles.get(source_name, set()):
                stats["skipped_exists"] += 1
                continue
            # Skip non-substantive NIST sections (intro chapters 1-3)
            if doc_type == "nist" and article_type == "section":
                section_match = re.match(r'Section (\d+)', article_label)
                if section_match and int(section_match.group(1)) <= 3:
                    stats["skipped_intro"] += 1
                    continue
            # Extract article text from the cached document
            article_text = extract_article_text(filename, article_label, doc_type, full_text)
            if not article_text or len(article_text) < 30:
                stats["skipped_short_text"] += 1
                print(f" SKIP {article_label}: text too short ({len(article_text)} chars)")
                continue
            if args.dry_run:
                print(f" [DRY] {article_label} ({len(article_text)} chars)")
                stats["would_generate"] += 1
                continue
            # Call Anthropic (raw response text is currently unused here)
            prompt = build_prompt(source_name, article_label, article_text, license_info["license"])
            data, raw, usage, error = call_anthropic(prompt, SYSTEM_PROMPT)
            total_input_tokens += usage.get("input_tokens", 0)
            total_output_tokens += usage.get("output_tokens", 0)
            if error:
                stats["api_error"] += 1
                errors.append(f"{source_name} {article_label}: {error}")
                print(f" ERROR {article_label}: {error}")
                # Back off longer after an API failure before continuing.
                time.sleep(5)
                continue
            if not data:
                stats["parse_error"] += 1
                print(f" PARSE ERROR {article_label}")
                continue
            # Ensure DB is alive before writing (the API call may have
            # taken long enough for the connection to drop).
            ensure_db()
            # Build control row from the model output, with defensive
            # type-coercion for every field the model may get wrong.
            title = str(data.get("title", ""))[:200]
            objective = str(data.get("objective", ""))
            rationale = str(data.get("rationale", ""))
            # Domain: model answer wins, keyword heuristic is the fallback.
            domain = str(data.get("domain", detect_domain(article_text))).upper()[:4]
            if not domain or len(domain) < 2:
                domain = detect_domain(article_text)
            control_id = generate_control_id(domain, cur)
            severity = str(data.get("severity", "medium")).lower()
            if severity not in ("low", "medium", "high", "critical"):
                severity = "medium"
            requirements = data.get("requirements", [])
            if not isinstance(requirements, list):
                requirements = [str(requirements)]
            test_procedure = data.get("test_procedure", [])
            if not isinstance(test_procedure, list):
                test_procedure = [str(test_procedure)]
            evidence = data.get("evidence", [])
            if not isinstance(evidence, list):
                evidence = [str(evidence)]
            tags = data.get("tags", [])
            if not isinstance(tags, list):
                tags = []
            target_audience = data.get("target_audience", [])
            if not isinstance(target_audience, list):
                target_audience = []
            applicable_industries = data.get("applicable_industries", ["all"])
            if not isinstance(applicable_industries, list):
                applicable_industries = ["all"]
            applicable_company_size = data.get("applicable_company_size", ["all"])
            if not isinstance(applicable_company_size, list):
                applicable_company_size = ["all"]
            scope_conditions = data.get("scope_conditions")
            source_citation = {
                "source": source_name,
                "article": data.get("source_article", article_label),
                "paragraph": data.get("source_paragraph", ""),
                "article_type": article_type,
                "license": license_info["license"],
                "source_type": license_info["source_type"],
            }
            generation_metadata = {
                "processing_path": "phase74_gap_fill",
                "license_rule": license_info["rule"],
                "source_regulation": reg_code,
                "source_article": article_label,
                "gap_fill": True,
            }
            category = str(data.get("category", "")) or None
            # Insert into DB; ON CONFLICT DO NOTHING makes re-runs safe.
            try:
                cur.execute("""
                    INSERT INTO compliance.canonical_controls (
                        framework_id, control_id, title, objective, rationale,
                        scope, requirements, test_procedure, evidence,
                        severity, risk_score, implementation_effort,
                        open_anchors, release_state, tags,
                        license_rule, source_original_text, source_citation,
                        customer_visible, generation_metadata,
                        verification_method, category, generation_strategy,
                        target_audience, pipeline_version,
                        applicable_industries, applicable_company_size, scope_conditions
                    ) VALUES (
                        %s, %s, %s, %s, %s,
                        %s, %s, %s, %s,
                        %s, %s, %s,
                        %s, %s, %s,
                        %s, %s, %s,
                        %s, %s,
                        %s, %s, %s,
                        %s, %s,
                        %s, %s, %s
                    )
                    ON CONFLICT (framework_id, control_id) DO NOTHING
                    RETURNING id
                """, (
                    framework_uuid, control_id, title, objective, rationale,
                    json.dumps({}), json.dumps(requirements), json.dumps(test_procedure), json.dumps(evidence),
                    severity, 5, "m",
                    json.dumps([]), "draft", json.dumps(tags),
                    license_info["rule"], article_text, json.dumps(source_citation),
                    True, json.dumps(generation_metadata),
                    "document", category, "phase74_gap_fill",
                    json.dumps(target_audience), PIPELINE_VERSION,
                    json.dumps(applicable_industries), json.dumps(applicable_company_size),
                    json.dumps(scope_conditions) if scope_conditions else None,
                ))
                conn.commit()
                # fetchone() after commit is fine with psycopg2's default
                # client-side cursor: the RETURNING row was already fetched
                # at execute time. RETURNING yields no row on conflict.
                row = cur.fetchone()
                if row:
                    generated_ids.append(str(row[0]))
                    stats["generated"] += 1
                    print(f" OK {control_id}: {title[:60]}")
                else:
                    stats["conflict"] += 1
                    print(f" CONFLICT {control_id} (already exists)")
            except Exception as e:
                conn.rollback()
                stats["db_error"] += 1
                errors.append(f"DB {control_id}: {str(e)[:100]}")
                print(f" DB ERROR {control_id}: {str(e)[:100]}")
            # Rate limit: ~0.5s between calls
            time.sleep(0.5)
    # ── Summary ──────────────────────────────────────────────────────
    elapsed = time.time() - t_start
    # Cost estimate: $3 / 1M input tokens, $15 / 1M output tokens.
    cost = (total_input_tokens * 3 + total_output_tokens * 15) / 1_000_000
    print(f"\n\n{'='*70}")
    print(f"PHASE 7.4 — {'DRY-RUN' if args.dry_run else 'ERGEBNIS'}")
    print(f"{'='*70}")
    print(f" Laufzeit: {elapsed/60:.1f} min")
    print(f" API-Kosten: ${cost:.2f}")
    print(f" Input Tokens: {total_input_tokens:,}")
    print(f" Output Tokens: {total_output_tokens:,}")
    print()
    for key in sorted(stats.keys()):
        print(f" {key:<25s}: {stats[key]:5d}")
    print()
    if generated_ids:
        print(f" Neue Control-IDs: {len(generated_ids)}")
        # Save generated IDs for downstream phases.
        with open("/tmp/phase74_generated_ids.json", 'w') as f:
            json.dump(generated_ids, f)
        print(f" IDs gespeichert: /tmp/phase74_generated_ids.json")
    if errors:
        print(f"\n Fehler ({len(errors)}):")
        for e in errors[:20]:
            print(f" {e}")
        if len(errors) > 20:
            print(f" ... und {len(errors)-20} weitere")
    conn.close()
# Script entry point.
if __name__ == "__main__":
    main()