Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 31s
CI/CD / test-python-backend-compliance (push) Successful in 1m35s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 17s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
- Control Library: parent control display, ObligationTypeBadge, GenerationStrategyBadge variants, evidence string fallback - API: expose parent_control_uuid/id/title in canonical controls - Fix: DSFA SQLAlchemy 2.0 Row._mapping compatibility - Migration 074: control_parent_links + control_dedup_reviews tables - QA scripts: benchmark, gap analysis, OSCAL import, OWASP cleanup, phase5 normalize, phase74 gap fill, sync_db, run_job - Docs: dedup engine, RAG benchmark, lessons learned, pipeline docs Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
656 lines
29 KiB
Python
656 lines
29 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Phase 7.4: Generate new controls for gap articles via Anthropic Claude Sonnet.
|
|
|
|
Reads gap_analysis_results.json, extracts article text from PDFs,
|
|
calls Claude Sonnet to generate controls, inserts into DB.
|
|
|
|
Usage:
|
|
python3 phase74_generate_gap_controls.py --dry-run # show what would be generated
|
|
python3 phase74_generate_gap_controls.py # generate and insert
|
|
python3 phase74_generate_gap_controls.py --source "DSGVO" # filter by source
|
|
python3 phase74_generate_gap_controls.py --resume # skip already-generated articles
|
|
"""
|
|
import os
|
|
import sys
|
|
import json
|
|
import re
|
|
import time
|
|
import hashlib
|
|
import argparse
|
|
import psycopg2
|
|
import urllib.parse
|
|
import requests
|
|
from pathlib import Path
|
|
from collections import Counter
|
|
|
|
sys.path.insert(0, os.path.dirname(__file__))
|
|
from pdf_qa_all import (
|
|
SOURCE_FILE_MAP, read_file, classify_doc, normalize,
|
|
build_eu_article_index, build_de_law_index, build_nist_index,
|
|
build_owasp_index, build_generic_index, MAX_ARTICLES,
|
|
)
|
|
|
|
# ── Config ──────────────────────────────────────────────────────────
# Anthropic Messages API endpoint used by call_anthropic().
ANTHROPIC_URL = "https://api.anthropic.com/v1/messages"
# Model is overridable via env var; defaults to Claude Sonnet.
ANTHROPIC_MODEL = os.environ.get("CONTROL_GEN_ANTHROPIC_MODEL", "claude-sonnet-4-6")
# Required at runtime; main() exits early when empty.
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
# Written into the canonical_controls.pipeline_version column on insert.
PIPELINE_VERSION = 5
# Default path for --results (output of the gap-analysis phase).
GAP_RESULTS_FILE = "/tmp/gap_analysis_results.json"
# NOTE(review): PDF_DIR is not referenced anywhere in this script —
# presumably consumed by the pdf_qa_all helpers; confirm before removing.
PDF_DIR = Path(os.path.expanduser("~/rag-ingestion/pdfs"))

# PyMuPDF is optional; fall back to None when it is not installed.
# NOTE(review): fitz is not used directly in this file — likely needed
# indirectly by read_file(); verify.
try:
    import fitz
except ImportError:
    fitz = None
|
|
|
|
# ── Source name → regulation_code reverse map ────────────────────────
# Built from REGULATION_LICENSE_MAP in control_generator.py
# Maps the human-readable source names found in gap_analysis_results.json
# to the internal regulation codes used as keys of LICENSE_MAP.
# Unknown sources fall back to "unknown" in main().
SOURCE_TO_REGCODE = {
    "DSGVO (EU) 2016/679": "eu_2016_679",
    "KI-Verordnung (EU) 2024/1689": "eu_2024_1689",
    "NIS2-Richtlinie (EU) 2022/2555": "eu_2022_2555",
    "Cyber Resilience Act (CRA)": "eu_2024_2847",
    "Maschinenverordnung (EU) 2023/1230": "eu_2023_1230",
    "EU Blue Guide 2022": "eu_blue_guide_2022",
    "Markets in Crypto-Assets (MiCA)": "mica",
    "Batterieverordnung (EU) 2023/1542": "eu_2023_1542",
    "AML-Verordnung": "amlr",
    "Data Governance Act (DGA)": "dga",
    "Data Act": "data_act",
    "GPSR (EU) 2023/988": "gpsr",
    "IFRS-Übernahmeverordnung": "ifrs",
    "NIST SP 800-53 Rev. 5": "nist_sp800_53r5",
    "NIST SP 800-207 (Zero Trust)": "nist_sp800_207",
    "NIST SP 800-63-3": "nist_sp800_63_3",
    "NIST AI Risk Management Framework": "nist_ai_rmf",
    # NOTE(review): naming is inconsistent with the other NIST codes
    # ("nist_sp_800_218" vs "nist_sp800_53r5") — kept as-is because the
    # codes must match REGULATION_LICENSE_MAP.
    "NIST SP 800-218 (SSDF)": "nist_sp_800_218",
    "NIST Cybersecurity Framework 2.0": "nist_csf_2_0",
    "OWASP Top 10 (2021)": "owasp_top10",
    "OWASP ASVS 4.0": "owasp_asvs",
    "OWASP SAMM 2.0": "owasp_samm",
    "OWASP API Security Top 10 (2023)": "owasp_api_top10",
    "OWASP MASVS 2.0": "owasp_masvs",
    "ENISA ICS/SCADA Dependencies": "enisa_ics_scada",
    "ENISA Supply Chain Good Practices": "enisa_supply_chain",
    "CISA Secure by Design": "cisa_sbd",
    "Bundesdatenschutzgesetz (BDSG)": "bdsg",
    "Gewerbeordnung (GewO)": "gewo",
    "Handelsgesetzbuch (HGB)": "hgb",
    "Abgabenordnung (AO)": "ao",
    "OECD KI-Empfehlung": "oecd_ai_principles",
}
|
|
|
|
# License info per regulation code (from REGULATION_LICENSE_MAP)
# Each entry carries: license identifier, licensing "rule" number (stored
# as canonical_controls.license_rule), and a coarse source_type used in
# the source_citation JSON.
# NOTE(review): rule 1 appears to cover law / public-domain sources and
# rule 2 CC-licensed ones — confirm against REGULATION_LICENSE_MAP in
# control_generator.py before relying on that distinction.
LICENSE_MAP = {
    "eu_2016_679": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "eu_2024_1689": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "eu_2022_2555": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "eu_2024_2847": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "eu_2023_1230": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "eu_blue_guide_2022": {"license": "EU_PUBLIC", "rule": 1, "source_type": "guideline"},
    "mica": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "eu_2023_1542": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "amlr": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "dga": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "data_act": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "gpsr": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "ifrs": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "nist_sp800_53r5": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
    "nist_sp800_207": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
    "nist_sp800_63_3": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
    "nist_ai_rmf": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
    "nist_sp_800_218": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
    "nist_csf_2_0": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
    "owasp_top10": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
    "owasp_asvs": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
    "owasp_samm": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
    "owasp_api_top10": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
    "owasp_masvs": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
    "enisa_ics_scada": {"license": "CC-BY-4.0", "rule": 2, "source_type": "guideline"},
    "enisa_supply_chain": {"license": "CC-BY-4.0", "rule": 2, "source_type": "guideline"},
    "cisa_sbd": {"license": "US_GOV_PUBLIC", "rule": 1, "source_type": "guideline"},
    "bdsg": {"license": "DE_LAW", "rule": 1, "source_type": "law"},
    "gewo": {"license": "DE_LAW", "rule": 1, "source_type": "law"},
    "hgb": {"license": "DE_LAW", "rule": 1, "source_type": "law"},
    "ao": {"license": "DE_LAW", "rule": 1, "source_type": "law"},
    "oecd_ai_principles": {"license": "OECD_PUBLIC", "rule": 2, "source_type": "standard"},
}
|
|
|
|
# Domain detection keywords
# Lowercase substrings (mixed German/English) matched against the
# lowercased article text by detect_domain(); the domain with the most
# keyword hits wins, defaulting to "SEC". The 2-4 letter keys become the
# control_id prefix (see generate_control_id()).
DOMAIN_KEYWORDS = {
    "AUTH": ["authentifizierung", "anmeldung", "login", "passwort", "identit", "identity", "credential"],
    "CRYP": ["verschlüsselung", "kryptogra", "encrypt", "cipher", "hash", "tls", "ssl", "signatur"],
    "NET": ["netzwerk", "network", "firewall", "router", "dns", "ip-adress"],
    "DATA": ["daten", "data", "personenbezogen", "datenschutz", "privacy", "gdpr", "dsgvo", "verarbeitung"],
    "LOG": ["protokoll", "logging", "audit", "nachvollzieh", "aufzeichn"],
    "ACC": ["zugriff", "access", "berechtigung", "autorisierung", "authorization", "rolle"],
    "SEC": ["sicherheit", "security", "schutz", "protect", "schwachstell", "vulnerab"],
    "INC": ["vorfall", "incident", "breach", "meldung", "reaktion", "response", "notfall"],
    "AI": ["künstliche intelligenz", "ki-system", "ai system", "machine learning", "algorithm", "hochrisiko-ki"],
    "COMP": ["compliance", "konformität", "audit", "zertifizierung", "regulier", "vorschrift"],
    "GOV": ["behörde", "aufsicht", "governance", "marktüberwachung", "authority"],
    "FIN": ["finanz", "zahlungs", "payment", "crypto", "krypto-", "geldwäsche", "aml"],
    "ENV": ["umwelt", "environment", "batterie", "recycling", "entsorgu", "nachhaltig"],
}
|
|
|
|
# ── Prompt (same as control_generator.py) ────────────────────────────

# System prompt (German, because the generated controls are German):
# act as a compliance expert, answer with JSON only, and use a JSON
# array when producing several controls.
SYSTEM_PROMPT = """Du bist ein Security-Compliance-Experte. Strukturiere den gegebenen Text
als praxisorientiertes Security Control. Erstelle eine verständliche, umsetzbare Formulierung.
Antworte NUR mit validem JSON. Bei mehreren Controls antworte mit einem JSON-Array."""

# Shared prompt fragment describing the applicability fields
# (applicable_industries, applicable_company_size, scope_conditions);
# appended to the user prompt by build_prompt().
APPLICABILITY_PROMPT = """- applicable_industries: Liste der Branchen fuer die dieses Control relevant ist.
Verwende ["all"] wenn der Control branchenuebergreifend gilt.
Moegliche Werte: "all", "Technologie / IT", "IT Dienstleistungen", "E-Commerce / Handel",
"Finanzdienstleistungen", "Versicherungen", "Gesundheitswesen", "Pharma", "Bildung",
"Beratung / Consulting", "Marketing / Agentur", "Produktion / Industrie",
"Logistik / Transport", "Immobilien", "Bau", "Energie", "Automobil",
"Luft- / Raumfahrt", "Maschinenbau", "Anlagenbau", "Automatisierung", "Robotik",
"Messtechnik", "Agrar", "Chemie", "Minen / Bergbau", "Telekommunikation",
"Medien / Verlage", "Gastronomie / Hotellerie", "Recht / Kanzlei",
"Oeffentlicher Dienst", "Verteidigung / Ruestung", "Wasser- / Abwasserwirtschaft",
"Lebensmittel", "Digitale Infrastruktur", "Weltraum", "Post / Kurierdienste",
"Abfallwirtschaft", "Forschung"
- applicable_company_size: Ab welcher Unternehmensgroesse gilt dieses Control?
Verwende ["all"] wenn keine Groessenbeschraenkung.
Moegliche Werte: "all", "micro", "small", "medium", "large", "enterprise"
- scope_conditions: null wenn keine besonderen Bedingungen, sonst:
{"requires_any": ["signal"], "description": "Erklaerung"}
Moegliche Signale: "uses_ai", "third_country_transfer", "processes_health_data",
"processes_minors_data", "automated_decisions", "employee_monitoring",
"video_surveillance", "financial_data", "is_kritis_operator", "payment_services" """

# Allowed values for the generated control's "category" field; rendered
# into the user prompt via CATEGORY_LIST_STR.
CATEGORY_LIST = [
    "Datenschutz-Grundlagen", "Betroffenenrechte", "Technische Massnahmen",
    "Organisatorische Massnahmen", "Auftragsverarbeitung", "Datentransfer",
    "Risikomanagement", "Incident Response", "KI-Regulierung", "Cybersicherheit",
    "Zugriffskontrolle", "Kryptographie", "Netzwerksicherheit", "Compliance-Management",
    "Produktsicherheit", "Marktüberwachung", "Supply Chain Security",
    "Finanzregulierung", "Arbeitsrecht", "Gewerberecht", "Handelsrecht",
    "Umwelt / Nachhaltigkeit", "Dokumentation", "Schulung / Awareness",
]
# Comma-separated, double-quoted category list for prompt interpolation.
CATEGORY_LIST_STR = ", ".join(f'"{c}"' for c in CATEGORY_LIST)
|
|
|
|
|
|
def build_prompt(source_name, article_label, article_text, license_type):
    """Build the German user prompt asking Claude to structure one article.

    Args:
        source_name: Human-readable regulation/source name.
        article_label: Article/paragraph label, e.g. "Artikel 10" or "§ 42".
        article_text: Extracted article text; truncated to 3000 chars here.
        license_type: License identifier shown to the model (the license
            permits reuse of the original wording).

    Returns:
        The complete user prompt string, including the shared
        APPLICABILITY_PROMPT fragment and the allowed category list.
    """
    return f"""Strukturiere den folgenden Gesetzestext als Security/Compliance Control.
Du DARFST den Originaltext verwenden (Quelle: {source_name}, {license_type}).

WICHTIG: Erstelle eine verständliche, praxisorientierte Formulierung.
Der Originaltext wird separat gespeichert — deine Formulierung soll klar und umsetzbar sein.

Gib JSON zurück mit diesen Feldern:
- title: Kurzer prägnanter Titel (max 100 Zeichen)
- objective: Was soll erreicht werden? (1-3 Sätze)
- rationale: Warum ist das wichtig? (1-2 Sätze)
- requirements: Liste von konkreten Anforderungen (Strings)
- test_procedure: Liste von Prüfschritten (Strings)
- evidence: Liste von Nachweisdokumenten (Strings)
- severity: low/medium/high/critical
- tags: Liste von Tags
- domain: Fachgebiet als Kuerzel (AUTH=Authentifizierung, CRYP=Kryptographie, NET=Netzwerk, DATA=Datenschutz, LOG=Logging, ACC=Zugriffskontrolle, SEC=IT-Sicherheit, INC=Vorfallmanagement, AI=KI, COMP=Compliance, GOV=Behoerden/Verwaltung, LAB=Arbeitsrecht, FIN=Finanzregulierung, TRD=Gewerbe/Handelsrecht, ENV=Umwelt, HLT=Gesundheit)
- category: Inhaltliche Kategorie. Moegliche Werte: {CATEGORY_LIST_STR}
- target_audience: Liste der Zielgruppen (z.B. "unternehmen", "behoerden", "entwickler", "datenschutzbeauftragte", "geschaeftsfuehrung", "it-abteilung", "rechtsabteilung", "compliance-officer")
- source_article: Artikel-/Paragraphen-Referenz (z.B. "Artikel 10", "§ 42")
- source_paragraph: Absatz-Referenz (z.B. "Absatz 5", "Nr. 2")
{APPLICABILITY_PROMPT}

Text: {article_text[:3000]}
Quelle: {source_name}, {article_label}"""
|
|
|
|
|
|
# ── PDF article extraction ───────────────────────────────────────────
|
|
|
|
def extract_article_text(pdf_file, article_label, doc_type, full_text=None):
    """Extract the text of a specific article from a PDF.

    Args:
        pdf_file: Path handed to read_file() when full_text is not supplied.
        article_label: Heading label, e.g. "Artikel 10", "§ 42", "AC-2".
        doc_type: "eu_regulation", "de_law", "nist", or anything else
            (generic / OWASP / ENISA handling).
        full_text: Optional pre-read document text (per-source cache in
            main()) to avoid re-reading the PDF.

    Returns:
        Up to 3000 characters of article text, or "" when the article
        cannot be located (or the label carries no number where one is
        required).
    """
    if full_text is None:
        full_text = read_file(pdf_file)
    if not full_text:
        return ""

    if doc_type == "eu_regulation":
        art_num_match = re.search(r'\d+', article_label)
        if not art_num_match:
            return ""
        num = int(art_num_match.group())
        match = re.search(rf'\nArtikel\s+{num}\s*\n', full_text)
        if not match:
            return ""
        start = match.start()
        # Fix: search for the NEXT article only AFTER the current heading.
        # A stray earlier occurrence of "Artikel {num+1}" (e.g. a cross
        # reference) would otherwise yield end < start and an empty slice.
        next_match = re.compile(rf'\nArtikel\s+{num + 1}\s*\n').search(full_text, start + 1)
        end = next_match.start() if next_match else min(start + 5000, len(full_text))
        return full_text[start:end].strip()[:3000]

    elif doc_type == "de_law":
        para_match = re.search(r'\d+', article_label)
        if not para_match:
            return ""
        num = int(para_match.group())
        match = re.search(rf'\n§\s+{num}\b', full_text)
        if not match:
            return ""
        start = match.start()
        # Same fix as above: anchor the next-paragraph search after start.
        next_match = re.compile(rf'\n§\s+{num + 1}\b').search(full_text, start + 1)
        end = next_match.start() if next_match else min(start + 5000, len(full_text))
        return full_text[start:end].strip()[:3000]

    elif doc_type == "nist":
        # NIST control IDs (e.g. "AC-2") are matched literally at a line start.
        escaped = re.escape(article_label)
        match = re.search(rf'(?:^|\n)\s*{escaped}\b', full_text)
        if not match:
            return ""
        start = match.start()
        # No reliable "next section" marker; take a fixed-size window.
        return full_text[start:start + 3000].strip()

    else:
        # Generic / OWASP / ENISA
        # Label may appear mid-line (e.g. "A01:2021 – Broken Access Control"),
        # so allow arbitrary prefix text on the matched line.
        escaped = re.escape(article_label)
        match = re.search(rf'(?:^|\n).*{escaped}\b', full_text)
        if not match:
            return ""
        start = match.start()
        return full_text[start:start + 3000].strip()
|
|
|
|
|
|
# ── Anthropic API ────────────────────────────────────────────────────
|
|
|
|
def call_anthropic(prompt, system_prompt):
    """Call Anthropic API. Returns (parsed_data, raw_text, usage, error).

    On success `error` is None; on HTTP or transport failure the first
    three slots are (None, "", {}) and `error` carries a short message.
    """
    request_headers = {
        "x-api-key": ANTHROPIC_API_KEY,
        "anthropic-version": "2023-06-01",
        "content-type": "application/json",
    }
    request_body = {
        "model": ANTHROPIC_MODEL,
        "max_tokens": 4096,
        "system": system_prompt,
        "messages": [{"role": "user", "content": prompt}],
    }

    try:
        response = requests.post(ANTHROPIC_URL, headers=request_headers, json=request_body, timeout=120)
        if response.status_code != 200:
            return None, "", {}, f"HTTP {response.status_code}: {response.text[:200]}"
        body = response.json()
        raw_text = body["content"][0]["text"] if body.get("content") else ""
        token_usage = body.get("usage", {})
        # The model is instructed to reply with JSON; parse it here so
        # callers get structured data alongside the raw text.
        return parse_json(raw_text), raw_text, token_usage, None
    except Exception as exc:
        return None, "", {}, str(exc)
|
|
|
|
|
|
def parse_json(text):
    """Parse JSON from an LLM response, handling markdown code fences.

    The system prompt allows the model to answer with a JSON array when
    it produces several controls; this pipeline consumes one control per
    article, so arrays are unwrapped to their first element.

    Args:
        text: Raw model output, possibly fenced (```...```) or embedded
            in surrounding prose.

    Returns:
        A dict (first element of an array answer), or None when nothing
        parseable is found or the array is empty.
    """
    text = text.strip()
    if text.startswith("```"):
        # Drop the opening fence line, and the closing fence when present.
        lines = text.split("\n")
        text = "\n".join(lines[1:-1] if lines[-1].strip().startswith("```") else lines[1:])
        text = text.strip()

    def _unwrap(obj):
        # One control per article: unwrap arrays to their first element.
        if isinstance(obj, list):
            return obj[0] if obj else None
        return obj

    try:
        return _unwrap(json.loads(text))
    except json.JSONDecodeError:
        # Fix: the salvage path now also recovers embedded arrays and
        # applies the same array-unwrap rule as the primary path
        # (previously only a bare object was recovered).
        match = re.search(r'\[[\s\S]*\]|\{[\s\S]*\}', text)
        if match:
            try:
                return _unwrap(json.loads(match.group()))
            except json.JSONDecodeError:
                return None
        return None
|
|
|
|
|
|
# ── Domain detection ─────────────────────────────────────────────────
|
|
|
|
def detect_domain(text):
    """Pick the DOMAIN_KEYWORDS entry with the most substring hits.

    Matching is case-insensitive; ties keep the first-declared domain.
    Falls back to "SEC" when no keyword matches at all.
    """
    haystack = text.lower()
    best_domain = "SEC"
    best_hits = 0
    for code, terms in DOMAIN_KEYWORDS.items():
        hits = sum(term in haystack for term in terms)
        if hits > best_hits:
            best_domain = code
            best_hits = hits
    return best_domain
|
|
|
|
|
|
# ── Control ID generation ────────────────────────────────────────────
|
|
|
|
def generate_control_id(domain, cur):
    """Generate next available control_id for domain prefix.

    Uses MAX(numeric suffix) to find the true highest number,
    avoiding gaps from string-sorted IDs (e.g. COMP-99 > COMP-1000 in text sort).

    Args:
        domain: Domain code; its first 4 chars (upper-cased) form the prefix.
        cur: Open DB cursor on a connection with the compliance schema.

    Returns:
        The next id, zero-padded to at least 3 digits (e.g. "COMP-042"),
        or "<PREFIX>-001" when no numerically-suffixed id exists yet.
    """
    prefix = domain.upper()[:4]
    cur.execute("""
        SELECT MAX(CAST(SPLIT_PART(control_id, '-', 2) AS INTEGER))
        FROM compliance.canonical_controls
        WHERE control_id LIKE %s
          AND SPLIT_PART(control_id, '-', 2) ~ '^[0-9]+$'
    """, (f"{prefix}-%",))
    row = cur.fetchone()
    if row and row[0] is not None:
        # Fix: zero-pad to 3 digits for consistency with the "-001" seed
        # below (previously produced unpadded ids like "COMP-42", breaking
        # lexicographic ordering of new ids).
        return f"{prefix}-{row[0] + 1:03d}"
    return f"{prefix}-001"
|
|
|
|
|
|
# ── Main ─────────────────────────────────────────────────────────────
|
|
|
|
def main():
    """Phase 7.4 entry point.

    Loads gap_analysis_results.json, extracts each gap article's text
    from its source PDF, asks Claude to structure it as a control, and
    inserts the result into compliance.canonical_controls. Supports
    --dry-run, --source filtering, and --resume (skip articles that
    already have a control).
    """
    parser = argparse.ArgumentParser(description="Phase 7.4: Generate controls for gap articles")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be generated")
    parser.add_argument("--source", type=str, help="Filter by source name substring")
    parser.add_argument("--resume", action="store_true", help="Skip articles that already have controls")
    parser.add_argument("--results", default=GAP_RESULTS_FILE, help="Path to gap_analysis_results.json")
    args = parser.parse_args()

    # API key is mandatory even for --dry-run.
    if not ANTHROPIC_API_KEY:
        print("ERROR: Set ANTHROPIC_API_KEY")
        sys.exit(1)

    # Load gap results
    with open(args.results) as f:
        gaps = json.load(f)
    total_gaps = sum(len(g["gap_articles"]) for g in gaps)
    print(f"Loaded {len(gaps)} sources with {total_gaps} gap articles")

    # Optional case-insensitive substring filter on the source name.
    if args.source:
        gaps = [g for g in gaps if args.source.lower() in g["source"].lower()]
        total_gaps = sum(len(g["gap_articles"]) for g in gaps)
        print(f"Filtered to {len(gaps)} sources, {total_gaps} gaps")

    # DB connection with keepalive + reconnect helper
    db_url = os.environ['DATABASE_URL']
    parsed = urllib.parse.urlparse(db_url)

    def connect_db():
        """Create DB connection with TCP keepalive."""
        c = psycopg2.connect(
            host=parsed.hostname, port=parsed.port or 5432,
            user=parsed.username, password=parsed.password,
            dbname=parsed.path.lstrip('/'),
            options="-c search_path=compliance,public",
            # Keepalives guard against idle disconnects during long
            # API-call stretches between DB writes.
            keepalives=1, keepalives_idle=30,
            keepalives_interval=10, keepalives_count=5,
        )
        return c, c.cursor()

    conn, cur = connect_db()

    def ensure_db():
        """Reconnect if connection is dead."""
        nonlocal conn, cur
        try:
            # Cheap liveness probe before each write.
            cur.execute("SELECT 1")
        except Exception:
            print(" [RECONNECT] DB connection lost, reconnecting...")
            try:
                conn.close()
            except Exception:
                pass
            conn, cur = connect_db()
            return True
        return False

    # Get framework UUID
    cur.execute("SELECT id FROM compliance.canonical_control_frameworks WHERE framework_id = 'bp_security_v1' LIMIT 1")
    fw_row = cur.fetchone()
    if not fw_row:
        print("ERROR: Framework bp_security_v1 not found")
        sys.exit(1)
    framework_uuid = fw_row[0]

    # If resuming, load existing articles per source
    existing_articles = {}
    if args.resume:
        cur.execute("""
            SELECT source_citation->>'source', source_citation->>'article'
            FROM compliance.canonical_controls
            WHERE source_citation->>'article' IS NOT NULL
        """)
        # Map: source name -> set of article labels already covered.
        for src, art in cur.fetchall():
            existing_articles.setdefault(src, set()).add(art)
        print(f"Resume mode: {sum(len(v) for v in existing_articles.values())} existing article-control pairs")

    # Stats
    stats = Counter()
    total_input_tokens = 0
    total_output_tokens = 0
    generated_ids = []
    errors = []
    t_start = time.time()

    # Pre-read PDFs (cache full text per source)
    pdf_cache = {}

    # Process sources with the most gaps first.
    for gap_source in sorted(gaps, key=lambda g: -len(g["gap_articles"])):
        source_name = gap_source["source"]
        gap_articles = gap_source["gap_articles"]
        filename = SOURCE_FILE_MAP.get(source_name)
        reg_code = SOURCE_TO_REGCODE.get(source_name, "unknown")
        license_info = LICENSE_MAP.get(reg_code, {"license": "UNKNOWN", "rule": 1, "source_type": "unknown"})
        doc_type = classify_doc(source_name)

        if not filename:
            stats["skipped_no_pdf"] += len(gap_articles)
            continue

        # Read PDF once per source
        if source_name not in pdf_cache:
            pdf_cache[source_name] = read_file(filename)
        full_text = pdf_cache[source_name]
        if not full_text:
            stats["skipped_no_pdf"] += len(gap_articles)
            continue

        print(f"\n{'='*70}")
        print(f"{source_name} — {len(gap_articles)} gaps (rule {license_info['rule']}, {doc_type})")
        print(f"{'='*70}")

        for gap in gap_articles:
            article_label = gap["label"]
            article_type = gap["type"]

            # Skip if already has controls (resume mode)
            if args.resume and article_label in existing_articles.get(source_name, set()):
                stats["skipped_exists"] += 1
                continue

            # Skip non-substantive NIST sections (intro chapters)
            if doc_type == "nist" and article_type == "section":
                section_match = re.match(r'Section (\d+)', article_label)
                if section_match and int(section_match.group(1)) <= 3:
                    stats["skipped_intro"] += 1
                    continue

            # Extract article text
            article_text = extract_article_text(filename, article_label, doc_type, full_text)
            if not article_text or len(article_text) < 30:
                stats["skipped_short_text"] += 1
                print(f" SKIP {article_label}: text too short ({len(article_text)} chars)")
                continue

            if args.dry_run:
                print(f" [DRY] {article_label} ({len(article_text)} chars)")
                stats["would_generate"] += 1
                continue

            # Call Anthropic
            prompt = build_prompt(source_name, article_label, article_text, license_info["license"])
            data, raw, usage, error = call_anthropic(prompt, SYSTEM_PROMPT)

            total_input_tokens += usage.get("input_tokens", 0)
            total_output_tokens += usage.get("output_tokens", 0)

            if error:
                stats["api_error"] += 1
                errors.append(f"{source_name} {article_label}: {error}")
                print(f" ERROR {article_label}: {error}")
                # Back off a bit after an API error before the next article.
                time.sleep(5)
                continue

            if not data:
                stats["parse_error"] += 1
                print(f" PARSE ERROR {article_label}")
                continue

            # Ensure DB is alive before writing
            ensure_db()

            # Build control — coerce every model-provided field defensively,
            # since LLM output types are not guaranteed.
            title = str(data.get("title", ""))[:200]
            objective = str(data.get("objective", ""))
            rationale = str(data.get("rationale", ""))
            domain = str(data.get("domain", detect_domain(article_text))).upper()[:4]
            if not domain or len(domain) < 2:
                # Model gave an unusable domain code; fall back to keyword detection.
                domain = detect_domain(article_text)

            control_id = generate_control_id(domain, cur)
            severity = str(data.get("severity", "medium")).lower()
            if severity not in ("low", "medium", "high", "critical"):
                severity = "medium"

            requirements = data.get("requirements", [])
            if not isinstance(requirements, list):
                requirements = [str(requirements)]
            test_procedure = data.get("test_procedure", [])
            if not isinstance(test_procedure, list):
                test_procedure = [str(test_procedure)]
            evidence = data.get("evidence", [])
            if not isinstance(evidence, list):
                evidence = [str(evidence)]
            tags = data.get("tags", [])
            if not isinstance(tags, list):
                tags = []
            target_audience = data.get("target_audience", [])
            if not isinstance(target_audience, list):
                target_audience = []
            applicable_industries = data.get("applicable_industries", ["all"])
            if not isinstance(applicable_industries, list):
                applicable_industries = ["all"]
            applicable_company_size = data.get("applicable_company_size", ["all"])
            if not isinstance(applicable_company_size, list):
                applicable_company_size = ["all"]
            scope_conditions = data.get("scope_conditions")

            # Provenance stored as JSON alongside the control.
            source_citation = {
                "source": source_name,
                "article": data.get("source_article", article_label),
                "paragraph": data.get("source_paragraph", ""),
                "article_type": article_type,
                "license": license_info["license"],
                "source_type": license_info["source_type"],
            }

            generation_metadata = {
                "processing_path": "phase74_gap_fill",
                "license_rule": license_info["rule"],
                "source_regulation": reg_code,
                "source_article": article_label,
                "gap_fill": True,
            }

            category = str(data.get("category", "")) or None

            # Insert into DB
            try:
                cur.execute("""
                    INSERT INTO compliance.canonical_controls (
                        framework_id, control_id, title, objective, rationale,
                        scope, requirements, test_procedure, evidence,
                        severity, risk_score, implementation_effort,
                        open_anchors, release_state, tags,
                        license_rule, source_original_text, source_citation,
                        customer_visible, generation_metadata,
                        verification_method, category, generation_strategy,
                        target_audience, pipeline_version,
                        applicable_industries, applicable_company_size, scope_conditions
                    ) VALUES (
                        %s, %s, %s, %s, %s,
                        %s, %s, %s, %s,
                        %s, %s, %s,
                        %s, %s, %s,
                        %s, %s, %s,
                        %s, %s,
                        %s, %s, %s,
                        %s, %s,
                        %s, %s, %s
                    )
                    ON CONFLICT (framework_id, control_id) DO NOTHING
                    RETURNING id
                """, (
                    framework_uuid, control_id, title, objective, rationale,
                    json.dumps({}), json.dumps(requirements), json.dumps(test_procedure), json.dumps(evidence),
                    severity, 5, "m",
                    json.dumps([]), "draft", json.dumps(tags),
                    license_info["rule"], article_text, json.dumps(source_citation),
                    True, json.dumps(generation_metadata),
                    "document", category, "phase74_gap_fill",
                    json.dumps(target_audience), PIPELINE_VERSION,
                    json.dumps(applicable_industries), json.dumps(applicable_company_size),
                    json.dumps(scope_conditions) if scope_conditions else None,
                ))
                conn.commit()
                # NOTE(review): fetchone() after commit() relies on psycopg2
                # buffering results client-side — works, but fetching before
                # committing would be clearer; confirm before reordering.
                row = cur.fetchone()
                if row:
                    # RETURNING id yielded a row → the insert happened.
                    generated_ids.append(str(row[0]))
                    stats["generated"] += 1
                    print(f" OK {control_id}: {title[:60]}")
                else:
                    # ON CONFLICT DO NOTHING suppressed the insert.
                    stats["conflict"] += 1
                    print(f" CONFLICT {control_id} (already exists)")
            except Exception as e:
                conn.rollback()
                stats["db_error"] += 1
                errors.append(f"DB {control_id}: {str(e)[:100]}")
                print(f" DB ERROR {control_id}: {str(e)[:100]}")

            # Rate limit: ~0.5s between calls
            time.sleep(0.5)

    # ── Summary ──────────────────────────────────────────────────────
    elapsed = time.time() - t_start
    # Cost estimate: $3 / $15 per million input/output tokens
    # (matches Claude Sonnet list pricing — update if the model changes).
    cost = (total_input_tokens * 3 + total_output_tokens * 15) / 1_000_000

    print(f"\n\n{'='*70}")
    print(f"PHASE 7.4 — {'DRY-RUN' if args.dry_run else 'ERGEBNIS'}")
    print(f"{'='*70}")
    print(f" Laufzeit: {elapsed/60:.1f} min")
    print(f" API-Kosten: ${cost:.2f}")
    print(f" Input Tokens: {total_input_tokens:,}")
    print(f" Output Tokens: {total_output_tokens:,}")
    print()
    for key in sorted(stats.keys()):
        print(f" {key:<25s}: {stats[key]:5d}")
    print()

    if generated_ids:
        print(f" Neue Control-IDs: {len(generated_ids)}")
        # Save generated IDs
        with open("/tmp/phase74_generated_ids.json", 'w') as f:
            json.dump(generated_ids, f)
        print(f" IDs gespeichert: /tmp/phase74_generated_ids.json")

    if errors:
        # Show at most the first 20 errors to keep the summary readable.
        print(f"\n Fehler ({len(errors)}):")
        for e in errors[:20]:
            print(f" {e}")
        if len(errors) > 20:
            print(f" ... und {len(errors)-20} weitere")

    conn.close()
|
|
|
|
|
|
# Script entry point: run only when executed directly, not on import.
if __name__ == "__main__":
    main()
|