#!/usr/bin/env python3
"""
Phase 7.4: Generate new controls for gap articles via Anthropic Claude Sonnet.

Reads gap_analysis_results.json, extracts article text from PDFs, calls Claude
Sonnet to generate controls, inserts into DB.

Usage:
    python3 phase74_generate_gap_controls.py --dry-run           # show what would be generated
    python3 phase74_generate_gap_controls.py                     # generate and insert
    python3 phase74_generate_gap_controls.py --source "DSGVO"    # filter by source
    python3 phase74_generate_gap_controls.py --resume            # skip already-generated articles
"""
import os
import sys
import json
import re
import time
import hashlib
import argparse
import psycopg2
import urllib.parse
import requests
from pathlib import Path
from collections import Counter

# Make the sibling pdf_qa_all module importable regardless of CWD.
sys.path.insert(0, os.path.dirname(__file__))
from pdf_qa_all import (
    SOURCE_FILE_MAP,
    read_file,
    classify_doc,
    normalize,
    build_eu_article_index,
    build_de_law_index,
    build_nist_index,
    build_owasp_index,
    build_generic_index,
    MAX_ARTICLES,
)

# ── Config ──────────────────────────────────────────────────────────
ANTHROPIC_URL = "https://api.anthropic.com/v1/messages"
ANTHROPIC_MODEL = os.environ.get("CONTROL_GEN_ANTHROPIC_MODEL", "claude-sonnet-4-6")
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
PIPELINE_VERSION = 5
GAP_RESULTS_FILE = "/tmp/gap_analysis_results.json"
PDF_DIR = Path(os.path.expanduser("~/rag-ingestion/pdfs"))

# PyMuPDF is optional; read_file presumably degrades without it — confirm.
try:
    import fitz
except ImportError:
    fitz = None

# ── Source name → regulation_code reverse map ───────────────────────
# Built from REGULATION_LICENSE_MAP in control_generator.py
SOURCE_TO_REGCODE = {
    "DSGVO (EU) 2016/679": "eu_2016_679",
    "KI-Verordnung (EU) 2024/1689": "eu_2024_1689",
    "NIS2-Richtlinie (EU) 2022/2555": "eu_2022_2555",
    "Cyber Resilience Act (CRA)": "eu_2024_2847",
    "Maschinenverordnung (EU) 2023/1230": "eu_2023_1230",
    "EU Blue Guide 2022": "eu_blue_guide_2022",
    "Markets in Crypto-Assets (MiCA)": "mica",
    "Batterieverordnung (EU) 2023/1542": "eu_2023_1542",
    "AML-Verordnung": "amlr",
    "Data Governance Act (DGA)": "dga",
    "Data Act": "data_act",
    "GPSR (EU) 2023/988": "gpsr",
    "IFRS-Übernahmeverordnung": "ifrs",
    "NIST SP 800-53 Rev. 5": "nist_sp800_53r5",
    "NIST SP 800-207 (Zero Trust)": "nist_sp800_207",
    "NIST SP 800-63-3": "nist_sp800_63_3",
    "NIST AI Risk Management Framework": "nist_ai_rmf",
    "NIST SP 800-218 (SSDF)": "nist_sp_800_218",
    "NIST Cybersecurity Framework 2.0": "nist_csf_2_0",
    "OWASP Top 10 (2021)": "owasp_top10",
    "OWASP ASVS 4.0": "owasp_asvs",
    "OWASP SAMM 2.0": "owasp_samm",
    "OWASP API Security Top 10 (2023)": "owasp_api_top10",
    "OWASP MASVS 2.0": "owasp_masvs",
    "ENISA ICS/SCADA Dependencies": "enisa_ics_scada",
    "ENISA Supply Chain Good Practices": "enisa_supply_chain",
    "CISA Secure by Design": "cisa_sbd",
    "Bundesdatenschutzgesetz (BDSG)": "bdsg",
    "Gewerbeordnung (GewO)": "gewo",
    "Handelsgesetzbuch (HGB)": "hgb",
    "Abgabenordnung (AO)": "ao",
    "OECD KI-Empfehlung": "oecd_ai_principles",
}

# License info per regulation code (from REGULATION_LICENSE_MAP).
# Stored compactly as (license, rule, source_type) tuples and expanded into
# one fresh dict per code, matching the shape consumers expect.
LICENSE_MAP = {
    code: {"license": lic, "rule": rule, "source_type": stype}
    for code, (lic, rule, stype) in {
        "eu_2016_679": ("EU_LAW", 1, "law"),
        "eu_2024_1689": ("EU_LAW", 1, "law"),
        "eu_2022_2555": ("EU_LAW", 1, "law"),
        "eu_2024_2847": ("EU_LAW", 1, "law"),
        "eu_2023_1230": ("EU_LAW", 1, "law"),
        "eu_blue_guide_2022": ("EU_PUBLIC", 1, "guideline"),
        "mica": ("EU_LAW", 1, "law"),
        "eu_2023_1542": ("EU_LAW", 1, "law"),
        "amlr": ("EU_LAW", 1, "law"),
        "dga": ("EU_LAW", 1, "law"),
        "data_act": ("EU_LAW", 1, "law"),
        "gpsr": ("EU_LAW", 1, "law"),
        "ifrs": ("EU_LAW", 1, "law"),
        "nist_sp800_53r5": ("NIST_PUBLIC_DOMAIN", 1, "standard"),
        "nist_sp800_207": ("NIST_PUBLIC_DOMAIN", 1, "standard"),
        "nist_sp800_63_3": ("NIST_PUBLIC_DOMAIN", 1, "standard"),
        "nist_ai_rmf": ("NIST_PUBLIC_DOMAIN", 1, "standard"),
        "nist_sp_800_218": ("NIST_PUBLIC_DOMAIN", 1, "standard"),
        "nist_csf_2_0": ("NIST_PUBLIC_DOMAIN", 1, "standard"),
        "owasp_top10": ("CC-BY-SA-4.0", 2, "standard"),
        "owasp_asvs": ("CC-BY-SA-4.0", 2, "standard"),
        "owasp_samm": ("CC-BY-SA-4.0", 2, "standard"),
        "owasp_api_top10": ("CC-BY-SA-4.0", 2, "standard"),
        "owasp_masvs": ("CC-BY-SA-4.0", 2, "standard"),
        "enisa_ics_scada": ("CC-BY-4.0", 2, "guideline"),
        "enisa_supply_chain": ("CC-BY-4.0", 2, "guideline"),
        "cisa_sbd": ("US_GOV_PUBLIC", 1, "guideline"),
        "bdsg": ("DE_LAW", 1, "law"),
        "gewo": ("DE_LAW", 1, "law"),
        "hgb": ("DE_LAW", 1, "law"),
        "ao": ("DE_LAW", 1, "law"),
        "oecd_ai_principles": ("OECD_PUBLIC", 2, "standard"),
    }.items()
}

# Domain detection keywords (substring matches, German + English).
DOMAIN_KEYWORDS = {
    "AUTH": ["authentifizierung", "anmeldung", "login", "passwort", "identit", "identity", "credential"],
    "CRYP": ["verschlüsselung", "kryptogra", "encrypt", "cipher", "hash", "tls", "ssl", "signatur"],
    "NET": ["netzwerk", "network", "firewall", "router", "dns", "ip-adress"],
    "DATA": ["daten", "data", "personenbezogen", "datenschutz", "privacy", "gdpr", "dsgvo", "verarbeitung"],
    "LOG": ["protokoll", "logging", "audit", "nachvollzieh", "aufzeichn"],
    "ACC": ["zugriff", "access", "berechtigung", "autorisierung", "authorization", "rolle"],
    "SEC": ["sicherheit", "security", "schutz", "protect", "schwachstell", "vulnerab"],
    "INC": ["vorfall", "incident", "breach", "meldung", "reaktion", "response", "notfall"],
    "AI": ["künstliche intelligenz", "ki-system", "ai system", "machine learning", "algorithm", "hochrisiko-ki"],
    "COMP": ["compliance", "konformität", "audit", "zertifizierung", "regulier", "vorschrift"],
    "GOV": ["behörde", "aufsicht", "governance", "marktüberwachung", "authority"],
    "FIN": ["finanz", "zahlungs", "payment", "crypto", "krypto-", "geldwäsche", "aml"],
    "ENV": ["umwelt", "environment", "batterie", "recycling", "entsorgu", "nachhaltig"],
}

# ── Prompt (same as control_generator.py) ───────────────────────────
SYSTEM_PROMPT = """Du bist ein Security-Compliance-Experte. Strukturiere den gegebenen Text als praxisorientiertes Security Control. Erstelle eine verständliche, umsetzbare Formulierung. Antworte NUR mit validem JSON. Bei mehreren Controls antworte mit einem JSON-Array."""

APPLICABILITY_PROMPT = """- applicable_industries: Liste der Branchen fuer die dieses Control relevant ist. Verwende ["all"] wenn der Control branchenuebergreifend gilt.
Moegliche Werte: "all", "Technologie / IT", "IT Dienstleistungen", "E-Commerce / Handel", "Finanzdienstleistungen", "Versicherungen", "Gesundheitswesen", "Pharma", "Bildung", "Beratung / Consulting", "Marketing / Agentur", "Produktion / Industrie", "Logistik / Transport", "Immobilien", "Bau", "Energie", "Automobil", "Luft- / Raumfahrt", "Maschinenbau", "Anlagenbau", "Automatisierung", "Robotik", "Messtechnik", "Agrar", "Chemie", "Minen / Bergbau", "Telekommunikation", "Medien / Verlage", "Gastronomie / Hotellerie", "Recht / Kanzlei", "Oeffentlicher Dienst", "Verteidigung / Ruestung", "Wasser- / Abwasserwirtschaft", "Lebensmittel", "Digitale Infrastruktur", "Weltraum", "Post / Kurierdienste", "Abfallwirtschaft", "Forschung"
- applicable_company_size: Ab welcher Unternehmensgroesse gilt dieses Control? Verwende ["all"] wenn keine Groessenbeschraenkung. Moegliche Werte: "all", "micro", "small", "medium", "large", "enterprise"
- scope_conditions: null wenn keine besonderen Bedingungen, sonst: {"requires_any": ["signal"], "description": "Erklaerung"} Moegliche Signale: "uses_ai", "third_country_transfer", "processes_health_data", "processes_minors_data", "automated_decisions", "employee_monitoring", "video_surveillance", "financial_data", "is_kritis_operator", "payment_services"
"""

CATEGORY_LIST = [
    "Datenschutz-Grundlagen",
    "Betroffenenrechte",
    "Technische Massnahmen",
    "Organisatorische Massnahmen",
    "Auftragsverarbeitung",
    "Datentransfer",
    "Risikomanagement",
    "Incident Response",
    "KI-Regulierung",
    "Cybersicherheit",
    "Zugriffskontrolle",
    "Kryptographie",
    "Netzwerksicherheit",
    "Compliance-Management",
    "Produktsicherheit",
    "Marktüberwachung",
    "Supply Chain Security",
    "Finanzregulierung",
    "Arbeitsrecht",
    "Gewerberecht",
    "Handelsrecht",
    "Umwelt / Nachhaltigkeit",
    "Dokumentation",
    "Schulung / Awareness",
]
CATEGORY_LIST_STR = ", ".join(f'"{c}"' for c in CATEGORY_LIST)
def build_prompt(source_name, article_label, article_text, license_type):
    """Build the user prompt asking Claude to structure one article as a control.

    Args:
        source_name: Human-readable regulation/standard name.
        article_label: Article/paragraph label (e.g. "Artikel 10").
        article_text: Extracted article body; truncated to 3000 chars in the prompt.
        license_type: License string shown to the model (permits verbatim reuse).
    """
    return f"""Strukturiere den folgenden Gesetzestext als Security/Compliance Control. Du DARFST den Originaltext verwenden (Quelle: {source_name}, {license_type}).
WICHTIG: Erstelle eine verständliche, praxisorientierte Formulierung. Der Originaltext wird separat gespeichert — deine Formulierung soll klar und umsetzbar sein.
Gib JSON zurück mit diesen Feldern:
- title: Kurzer prägnanter Titel (max 100 Zeichen)
- objective: Was soll erreicht werden? (1-3 Sätze)
- rationale: Warum ist das wichtig? (1-2 Sätze)
- requirements: Liste von konkreten Anforderungen (Strings)
- test_procedure: Liste von Prüfschritten (Strings)
- evidence: Liste von Nachweisdokumenten (Strings)
- severity: low/medium/high/critical
- tags: Liste von Tags
- domain: Fachgebiet als Kuerzel (AUTH=Authentifizierung, CRYP=Kryptographie, NET=Netzwerk, DATA=Datenschutz, LOG=Logging, ACC=Zugriffskontrolle, SEC=IT-Sicherheit, INC=Vorfallmanagement, AI=KI, COMP=Compliance, GOV=Behoerden/Verwaltung, LAB=Arbeitsrecht, FIN=Finanzregulierung, TRD=Gewerbe/Handelsrecht, ENV=Umwelt, HLT=Gesundheit)
- category: Inhaltliche Kategorie. Moegliche Werte: {CATEGORY_LIST_STR}
- target_audience: Liste der Zielgruppen (z.B. "unternehmen", "behoerden", "entwickler", "datenschutzbeauftragte", "geschaeftsfuehrung", "it-abteilung", "rechtsabteilung", "compliance-officer")
- source_article: Artikel-/Paragraphen-Referenz (z.B. "Artikel 10", "§ 42")
- source_paragraph: Absatz-Referenz (z.B. "Absatz 5", "Nr. 2")
{APPLICABILITY_PROMPT}
Text: {article_text[:3000]}
Quelle: {source_name}, {article_label}"""


# ── PDF article extraction ──────────────────────────────────────────
def _slice_numbered_unit(full_text, article_label, heading_tpl):
    """Slice out one numbered unit (EU article or DE paragraph) from full_text.

    heading_tpl is a regex template with a `{n}` placeholder for the unit
    number; the same template locates both the current heading (n) and the
    next one (n + 1), which bounds the slice.

    Returns the unit text (stripped, max 3000 chars), or "" if not found.
    """
    num_match = re.search(r'\d+', article_label)
    if not num_match:
        return ""
    num = int(num_match.group())
    heading = re.search(heading_tpl.format(n=num), full_text)
    if not heading:
        return ""
    start = heading.start()
    # BUGFIX: search for the next heading only AFTER the current one.
    # Searching the whole text could match an earlier occurrence (e.g. a
    # table-of-contents line), yielding end < start and an empty slice.
    tail = full_text[heading.end():]
    nxt = re.search(heading_tpl.format(n=num + 1), tail)
    if nxt:
        end = heading.end() + nxt.start()
    else:
        end = min(start + 5000, len(full_text))
    return full_text[start:end].strip()[:3000]


def extract_article_text(pdf_file, article_label, doc_type, full_text=None):
    """Extract the text of a specific article from a PDF.

    Args:
        pdf_file: Path passed to read_file() when full_text is not supplied.
        article_label: e.g. "Artikel 10", "§ 42", "AC-2", "Section 4".
        doc_type: One of "eu_regulation", "de_law", "nist", or anything else
            (generic / OWASP / ENISA fallback).
        full_text: Pre-read document text (caching); read lazily if None.

    Returns the article text (max 3000 chars), or "" if not found.
    """
    if full_text is None:
        full_text = read_file(pdf_file)
    if not full_text:
        return ""
    if doc_type == "eu_regulation":
        # Heading must stand alone on its own line ("Artikel N").
        return _slice_numbered_unit(full_text, article_label, r'\nArtikel\s+{n}\s*\n')
    if doc_type == "de_law":
        return _slice_numbered_unit(full_text, article_label, r'\n§\s+{n}\b')
    escaped = re.escape(article_label)
    if doc_type == "nist":
        # NIST labels (e.g. "AC-2") start a line, possibly indented.
        match = re.search(rf'(?:^|\n)\s*{escaped}\b', full_text)
    else:
        # Generic / OWASP / ENISA: label may appear anywhere in a line.
        match = re.search(rf'(?:^|\n).*{escaped}\b', full_text)
    if not match:
        return ""
    start = match.start()
    # Fixed-size window: these formats have no reliable "next unit" marker.
    return full_text[start:start + 3000].strip()
def call_anthropic(prompt, system_prompt):
    """Send one message to the Anthropic API.

    Returns a 4-tuple (parsed_data, raw_text, usage, error); on failure the
    first three are (None, "", {}) and error holds a short description.
    """
    request_headers = {
        "x-api-key": ANTHROPIC_API_KEY,
        "anthropic-version": "2023-06-01",
        "content-type": "application/json",
    }
    body = {
        "model": ANTHROPIC_MODEL,
        "max_tokens": 4096,
        "system": system_prompt,
        "messages": [{"role": "user", "content": prompt}],
    }
    try:
        response = requests.post(ANTHROPIC_URL, headers=request_headers, json=body, timeout=120)
        if response.status_code != 200:
            return None, "", {}, f"HTTP {response.status_code}: {response.text[:200]}"
        reply = response.json()
        raw = reply["content"][0]["text"] if reply.get("content") else ""
        return parse_json(raw), raw, reply.get("usage", {}), None
    except Exception as exc:
        return None, "", {}, str(exc)


def parse_json(text):
    """Parse JSON from an LLM response, tolerating markdown code fences.

    A top-level JSON array collapses to its first element; as a last resort
    the first {...} span embedded in surrounding noise is tried. Returns a
    dict, or None when nothing parseable is found.
    """
    text = text.strip()
    if text.startswith("```"):
        rows = text.split("\n")
        # Drop the opening fence line; drop the closing fence too if present.
        kept = rows[1:-1] if rows[-1].strip().startswith("```") else rows[1:]
        text = "\n".join(kept).strip()
    try:
        loaded = json.loads(text)
    except json.JSONDecodeError:
        embedded = re.search(r'\{[\s\S]*\}', text)
        if embedded is None:
            return None
        try:
            return json.loads(embedded.group())
        except json.JSONDecodeError:
            return None
    if isinstance(loaded, list):
        return loaded[0] if loaded else None
    return loaded


# ── Domain detection ────────────────────────────────────────────────
def detect_domain(text):
    """Pick the domain whose keywords occur most often in text ("SEC" default).

    Ties keep the first-scoring domain in DOMAIN_KEYWORDS insertion order,
    matching the original max()-over-dict behavior.
    """
    lowered = text.lower()
    best, best_hits = "SEC", 0
    for domain, keywords in DOMAIN_KEYWORDS.items():
        hits = sum(1 for kw in keywords if kw in lowered)
        if hits > best_hits:
            best, best_hits = domain, hits
    return best


# ── Control ID generation ───────────────────────────────────────────
def generate_control_id(domain, cur):
    """Return the next free control_id for a domain prefix (e.g. "COMP-42").

    Takes MAX of the numeric suffix rather than a string sort, since text
    ordering would rank e.g. COMP-99 above COMP-1000.
    """
    prefix = domain.upper()[:4]
    cur.execute("""
        SELECT MAX(CAST(SPLIT_PART(control_id, '-', 2) AS INTEGER))
        FROM compliance.canonical_controls
        WHERE control_id LIKE %s
          AND SPLIT_PART(control_id, '-', 2) ~ '^[0-9]+$'
    """, (f"{prefix}-%",))
    row = cur.fetchone()
    highest = row[0] if row else None
    if highest is None:
        return f"{prefix}-001"
    return f"{prefix}-{highest + 1}"
""" prefix = domain.upper()[:4] cur.execute(""" SELECT MAX(CAST(SPLIT_PART(control_id, '-', 2) AS INTEGER)) FROM compliance.canonical_controls WHERE control_id LIKE %s AND SPLIT_PART(control_id, '-', 2) ~ '^[0-9]+$' """, (f"{prefix}-%",)) row = cur.fetchone() if row and row[0] is not None: return f"{prefix}-{row[0] + 1}" return f"{prefix}-001" # ── Main ───────────────────────────────────────────────────────────── def main(): parser = argparse.ArgumentParser(description="Phase 7.4: Generate controls for gap articles") parser.add_argument("--dry-run", action="store_true", help="Show what would be generated") parser.add_argument("--source", type=str, help="Filter by source name substring") parser.add_argument("--resume", action="store_true", help="Skip articles that already have controls") parser.add_argument("--results", default=GAP_RESULTS_FILE, help="Path to gap_analysis_results.json") args = parser.parse_args() if not ANTHROPIC_API_KEY: print("ERROR: Set ANTHROPIC_API_KEY") sys.exit(1) # Load gap results with open(args.results) as f: gaps = json.load(f) total_gaps = sum(len(g["gap_articles"]) for g in gaps) print(f"Loaded {len(gaps)} sources with {total_gaps} gap articles") if args.source: gaps = [g for g in gaps if args.source.lower() in g["source"].lower()] total_gaps = sum(len(g["gap_articles"]) for g in gaps) print(f"Filtered to {len(gaps)} sources, {total_gaps} gaps") # DB connection with keepalive + reconnect helper db_url = os.environ['DATABASE_URL'] parsed = urllib.parse.urlparse(db_url) def connect_db(): """Create DB connection with TCP keepalive.""" c = psycopg2.connect( host=parsed.hostname, port=parsed.port or 5432, user=parsed.username, password=parsed.password, dbname=parsed.path.lstrip('/'), options="-c search_path=compliance,public", keepalives=1, keepalives_idle=30, keepalives_interval=10, keepalives_count=5, ) return c, c.cursor() conn, cur = connect_db() def ensure_db(): """Reconnect if connection is dead.""" nonlocal conn, cur try: 
cur.execute("SELECT 1") except Exception: print(" [RECONNECT] DB connection lost, reconnecting...") try: conn.close() except Exception: pass conn, cur = connect_db() return True return False # Get framework UUID cur.execute("SELECT id FROM compliance.canonical_control_frameworks WHERE framework_id = 'bp_security_v1' LIMIT 1") fw_row = cur.fetchone() if not fw_row: print("ERROR: Framework bp_security_v1 not found") sys.exit(1) framework_uuid = fw_row[0] # If resuming, load existing articles per source existing_articles = {} if args.resume: cur.execute(""" SELECT source_citation->>'source', source_citation->>'article' FROM compliance.canonical_controls WHERE source_citation->>'article' IS NOT NULL """) for src, art in cur.fetchall(): existing_articles.setdefault(src, set()).add(art) print(f"Resume mode: {sum(len(v) for v in existing_articles.values())} existing article-control pairs") # Stats stats = Counter() total_input_tokens = 0 total_output_tokens = 0 generated_ids = [] errors = [] t_start = time.time() # Pre-read PDFs (cache full text per source) pdf_cache = {} for gap_source in sorted(gaps, key=lambda g: -len(g["gap_articles"])): source_name = gap_source["source"] gap_articles = gap_source["gap_articles"] filename = SOURCE_FILE_MAP.get(source_name) reg_code = SOURCE_TO_REGCODE.get(source_name, "unknown") license_info = LICENSE_MAP.get(reg_code, {"license": "UNKNOWN", "rule": 1, "source_type": "unknown"}) doc_type = classify_doc(source_name) if not filename: stats["skipped_no_pdf"] += len(gap_articles) continue # Read PDF once per source if source_name not in pdf_cache: pdf_cache[source_name] = read_file(filename) full_text = pdf_cache[source_name] if not full_text: stats["skipped_no_pdf"] += len(gap_articles) continue print(f"\n{'='*70}") print(f"{source_name} — {len(gap_articles)} gaps (rule {license_info['rule']}, {doc_type})") print(f"{'='*70}") for gap in gap_articles: article_label = gap["label"] article_type = gap["type"] # Skip if already has controls 
(resume mode) if args.resume and article_label in existing_articles.get(source_name, set()): stats["skipped_exists"] += 1 continue # Skip non-substantive NIST sections (intro chapters) if doc_type == "nist" and article_type == "section": section_match = re.match(r'Section (\d+)', article_label) if section_match and int(section_match.group(1)) <= 3: stats["skipped_intro"] += 1 continue # Extract article text article_text = extract_article_text(filename, article_label, doc_type, full_text) if not article_text or len(article_text) < 30: stats["skipped_short_text"] += 1 print(f" SKIP {article_label}: text too short ({len(article_text)} chars)") continue if args.dry_run: print(f" [DRY] {article_label} ({len(article_text)} chars)") stats["would_generate"] += 1 continue # Call Anthropic prompt = build_prompt(source_name, article_label, article_text, license_info["license"]) data, raw, usage, error = call_anthropic(prompt, SYSTEM_PROMPT) total_input_tokens += usage.get("input_tokens", 0) total_output_tokens += usage.get("output_tokens", 0) if error: stats["api_error"] += 1 errors.append(f"{source_name} {article_label}: {error}") print(f" ERROR {article_label}: {error}") time.sleep(5) continue if not data: stats["parse_error"] += 1 print(f" PARSE ERROR {article_label}") continue # Ensure DB is alive before writing ensure_db() # Build control title = str(data.get("title", ""))[:200] objective = str(data.get("objective", "")) rationale = str(data.get("rationale", "")) domain = str(data.get("domain", detect_domain(article_text))).upper()[:4] if not domain or len(domain) < 2: domain = detect_domain(article_text) control_id = generate_control_id(domain, cur) severity = str(data.get("severity", "medium")).lower() if severity not in ("low", "medium", "high", "critical"): severity = "medium" requirements = data.get("requirements", []) if not isinstance(requirements, list): requirements = [str(requirements)] test_procedure = data.get("test_procedure", []) if not 
isinstance(test_procedure, list): test_procedure = [str(test_procedure)] evidence = data.get("evidence", []) if not isinstance(evidence, list): evidence = [str(evidence)] tags = data.get("tags", []) if not isinstance(tags, list): tags = [] target_audience = data.get("target_audience", []) if not isinstance(target_audience, list): target_audience = [] applicable_industries = data.get("applicable_industries", ["all"]) if not isinstance(applicable_industries, list): applicable_industries = ["all"] applicable_company_size = data.get("applicable_company_size", ["all"]) if not isinstance(applicable_company_size, list): applicable_company_size = ["all"] scope_conditions = data.get("scope_conditions") source_citation = { "source": source_name, "article": data.get("source_article", article_label), "paragraph": data.get("source_paragraph", ""), "article_type": article_type, "license": license_info["license"], "source_type": license_info["source_type"], } generation_metadata = { "processing_path": "phase74_gap_fill", "license_rule": license_info["rule"], "source_regulation": reg_code, "source_article": article_label, "gap_fill": True, } category = str(data.get("category", "")) or None # Insert into DB try: cur.execute(""" INSERT INTO compliance.canonical_controls ( framework_id, control_id, title, objective, rationale, scope, requirements, test_procedure, evidence, severity, risk_score, implementation_effort, open_anchors, release_state, tags, license_rule, source_original_text, source_citation, customer_visible, generation_metadata, verification_method, category, generation_strategy, target_audience, pipeline_version, applicable_industries, applicable_company_size, scope_conditions ) VALUES ( %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s ) ON CONFLICT (framework_id, control_id) DO NOTHING RETURNING id """, ( framework_uuid, control_id, title, objective, rationale, json.dumps({}), json.dumps(requirements), 
json.dumps(test_procedure), json.dumps(evidence), severity, 5, "m", json.dumps([]), "draft", json.dumps(tags), license_info["rule"], article_text, json.dumps(source_citation), True, json.dumps(generation_metadata), "document", category, "phase74_gap_fill", json.dumps(target_audience), PIPELINE_VERSION, json.dumps(applicable_industries), json.dumps(applicable_company_size), json.dumps(scope_conditions) if scope_conditions else None, )) conn.commit() row = cur.fetchone() if row: generated_ids.append(str(row[0])) stats["generated"] += 1 print(f" OK {control_id}: {title[:60]}") else: stats["conflict"] += 1 print(f" CONFLICT {control_id} (already exists)") except Exception as e: conn.rollback() stats["db_error"] += 1 errors.append(f"DB {control_id}: {str(e)[:100]}") print(f" DB ERROR {control_id}: {str(e)[:100]}") # Rate limit: ~0.5s between calls time.sleep(0.5) # ── Summary ────────────────────────────────────────────────────── elapsed = time.time() - t_start cost = (total_input_tokens * 3 + total_output_tokens * 15) / 1_000_000 print(f"\n\n{'='*70}") print(f"PHASE 7.4 — {'DRY-RUN' if args.dry_run else 'ERGEBNIS'}") print(f"{'='*70}") print(f" Laufzeit: {elapsed/60:.1f} min") print(f" API-Kosten: ${cost:.2f}") print(f" Input Tokens: {total_input_tokens:,}") print(f" Output Tokens: {total_output_tokens:,}") print() for key in sorted(stats.keys()): print(f" {key:<25s}: {stats[key]:5d}") print() if generated_ids: print(f" Neue Control-IDs: {len(generated_ids)}") # Save generated IDs with open("/tmp/phase74_generated_ids.json", 'w') as f: json.dump(generated_ids, f) print(f" IDs gespeichert: /tmp/phase74_generated_ids.json") if errors: print(f"\n Fehler ({len(errors)}):") for e in errors[:20]: print(f" {e}") if len(errors) > 20: print(f" ... und {len(errors)-20} weitere") conn.close() if __name__ == "__main__": main()