feat: Control Library UI, dedup migration, QA tooling, docs
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 31s
CI/CD / test-python-backend-compliance (push) Successful in 1m35s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 17s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 31s
CI/CD / test-python-backend-compliance (push) Successful in 1m35s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 17s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
- Control Library: parent control display, ObligationTypeBadge, GenerationStrategyBadge variants, evidence string fallback - API: expose parent_control_uuid/id/title in canonical controls - Fix: DSFA SQLAlchemy 2.0 Row._mapping compatibility - Migration 074: control_parent_links + control_dedup_reviews tables - QA scripts: benchmark, gap analysis, OSCAL import, OWASP cleanup, phase5 normalize, phase74 gap fill, sync_db, run_job - Docs: dedup engine, RAG benchmark, lessons learned, pipeline docs Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
655
scripts/qa/phase74_generate_gap_controls.py
Normal file
655
scripts/qa/phase74_generate_gap_controls.py
Normal file
@@ -0,0 +1,655 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Phase 7.4: Generate new controls for gap articles via Anthropic Claude Sonnet.
|
||||
|
||||
Reads gap_analysis_results.json, extracts article text from PDFs,
|
||||
calls Claude Sonnet to generate controls, inserts into DB.
|
||||
|
||||
Usage:
|
||||
python3 phase74_generate_gap_controls.py --dry-run # show what would be generated
|
||||
python3 phase74_generate_gap_controls.py # generate and insert
|
||||
python3 phase74_generate_gap_controls.py --source "DSGVO" # filter by source
|
||||
python3 phase74_generate_gap_controls.py --resume # skip already-generated articles
|
||||
"""
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import re
|
||||
import time
|
||||
import hashlib
|
||||
import argparse
|
||||
import psycopg2
|
||||
import urllib.parse
|
||||
import requests
|
||||
from pathlib import Path
|
||||
from collections import Counter
|
||||
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
from pdf_qa_all import (
|
||||
SOURCE_FILE_MAP, read_file, classify_doc, normalize,
|
||||
build_eu_article_index, build_de_law_index, build_nist_index,
|
||||
build_owasp_index, build_generic_index, MAX_ARTICLES,
|
||||
)
|
||||
|
||||
# ── Config ──────────────────────────────────────────────────────────
# Anthropic Messages API endpoint.
ANTHROPIC_URL = "https://api.anthropic.com/v1/messages"
# Model id; overridable via env var so runs can pin a different snapshot.
ANTHROPIC_MODEL = os.environ.get("CONTROL_GEN_ANTHROPIC_MODEL", "claude-sonnet-4-6")
# Required for real runs; main() exits early when empty.
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
# Stamped into the pipeline_version column of each inserted control row.
PIPELINE_VERSION = 5
# Default output of the gap-analysis phase; overridable via --results.
GAP_RESULTS_FILE = "/tmp/gap_analysis_results.json"
# Base directory of the source PDFs. NOTE(review): not referenced directly
# in this script — presumably consumed by the pdf_qa_all helpers; confirm.
PDF_DIR = Path(os.path.expanduser("~/rag-ingestion/pdfs"))
|
||||
|
||||
# Optional PyMuPDF import, kept as a soft dependency so the script still
# loads without it. NOTE(review): `fitz` is not used directly in this file;
# presumably required by the pdf_qa_all PDF helpers — confirm before removal.
try:
    import fitz
except ImportError:
    fitz = None
|
||||
|
||||
# ── Source name → regulation_code reverse map ────────────────────────
# Built from REGULATION_LICENSE_MAP in control_generator.py
# Maps the human-readable source names appearing in gap_analysis_results.json
# to internal regulation codes; those codes key LICENSE_MAP below. Unknown
# sources fall back to "unknown" in main().
SOURCE_TO_REGCODE = {
    "DSGVO (EU) 2016/679": "eu_2016_679",
    "KI-Verordnung (EU) 2024/1689": "eu_2024_1689",
    "NIS2-Richtlinie (EU) 2022/2555": "eu_2022_2555",
    "Cyber Resilience Act (CRA)": "eu_2024_2847",
    "Maschinenverordnung (EU) 2023/1230": "eu_2023_1230",
    "EU Blue Guide 2022": "eu_blue_guide_2022",
    "Markets in Crypto-Assets (MiCA)": "mica",
    "Batterieverordnung (EU) 2023/1542": "eu_2023_1542",
    "AML-Verordnung": "amlr",
    "Data Governance Act (DGA)": "dga",
    "Data Act": "data_act",
    "GPSR (EU) 2023/988": "gpsr",
    "IFRS-Übernahmeverordnung": "ifrs",
    "NIST SP 800-53 Rev. 5": "nist_sp800_53r5",
    "NIST SP 800-207 (Zero Trust)": "nist_sp800_207",
    "NIST SP 800-63-3": "nist_sp800_63_3",
    "NIST AI Risk Management Framework": "nist_ai_rmf",
    "NIST SP 800-218 (SSDF)": "nist_sp_800_218",
    "NIST Cybersecurity Framework 2.0": "nist_csf_2_0",
    "OWASP Top 10 (2021)": "owasp_top10",
    "OWASP ASVS 4.0": "owasp_asvs",
    "OWASP SAMM 2.0": "owasp_samm",
    "OWASP API Security Top 10 (2023)": "owasp_api_top10",
    "OWASP MASVS 2.0": "owasp_masvs",
    "ENISA ICS/SCADA Dependencies": "enisa_ics_scada",
    "ENISA Supply Chain Good Practices": "enisa_supply_chain",
    "CISA Secure by Design": "cisa_sbd",
    "Bundesdatenschutzgesetz (BDSG)": "bdsg",
    "Gewerbeordnung (GewO)": "gewo",
    "Handelsgesetzbuch (HGB)": "hgb",
    "Abgabenordnung (AO)": "ao",
    "OECD KI-Empfehlung": "oecd_ai_principles",
}
|
||||
|
||||
# License info per regulation code (from REGULATION_LICENSE_MAP)
# Fields per entry:
#   "license"     → license identifier stored in source_citation.
#   "rule"        → license rule id written to the license_rule column.
#                   NOTE(review): 1 appears on public-law / public-domain
#                   sources, 2 on CC-licensed ones; exact semantics live in
#                   control_generator.py — confirm there.
#   "source_type" → law / standard / guideline classification for citations.
LICENSE_MAP = {
    "eu_2016_679": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "eu_2024_1689": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "eu_2022_2555": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "eu_2024_2847": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "eu_2023_1230": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "eu_blue_guide_2022": {"license": "EU_PUBLIC", "rule": 1, "source_type": "guideline"},
    "mica": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "eu_2023_1542": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "amlr": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "dga": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "data_act": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "gpsr": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "ifrs": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "nist_sp800_53r5": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
    "nist_sp800_207": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
    "nist_sp800_63_3": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
    "nist_ai_rmf": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
    "nist_sp_800_218": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
    "nist_csf_2_0": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
    "owasp_top10": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
    "owasp_asvs": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
    "owasp_samm": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
    "owasp_api_top10": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
    "owasp_masvs": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
    "enisa_ics_scada": {"license": "CC-BY-4.0", "rule": 2, "source_type": "guideline"},
    "enisa_supply_chain": {"license": "CC-BY-4.0", "rule": 2, "source_type": "guideline"},
    "cisa_sbd": {"license": "US_GOV_PUBLIC", "rule": 1, "source_type": "guideline"},
    "bdsg": {"license": "DE_LAW", "rule": 1, "source_type": "law"},
    "gewo": {"license": "DE_LAW", "rule": 1, "source_type": "law"},
    "hgb": {"license": "DE_LAW", "rule": 1, "source_type": "law"},
    "ao": {"license": "DE_LAW", "rule": 1, "source_type": "law"},
    "oecd_ai_principles": {"license": "OECD_PUBLIC", "rule": 2, "source_type": "standard"},
}
|
||||
|
||||
# Domain detection keywords
# Lowercase substrings matched against the (lowercased) article text by
# detect_domain(); the domain with the most keyword hits wins, first-listed
# domain breaking ties. Only a fallback — the model's own "domain" answer is
# preferred in main(). Note: build_prompt offers additional codes (LAB, TRD,
# HLT) that have no keyword list here, so they can never be detected locally.
DOMAIN_KEYWORDS = {
    "AUTH": ["authentifizierung", "anmeldung", "login", "passwort", "identit", "identity", "credential"],
    "CRYP": ["verschlüsselung", "kryptogra", "encrypt", "cipher", "hash", "tls", "ssl", "signatur"],
    "NET": ["netzwerk", "network", "firewall", "router", "dns", "ip-adress"],
    "DATA": ["daten", "data", "personenbezogen", "datenschutz", "privacy", "gdpr", "dsgvo", "verarbeitung"],
    "LOG": ["protokoll", "logging", "audit", "nachvollzieh", "aufzeichn"],
    "ACC": ["zugriff", "access", "berechtigung", "autorisierung", "authorization", "rolle"],
    "SEC": ["sicherheit", "security", "schutz", "protect", "schwachstell", "vulnerab"],
    "INC": ["vorfall", "incident", "breach", "meldung", "reaktion", "response", "notfall"],
    "AI": ["künstliche intelligenz", "ki-system", "ai system", "machine learning", "algorithm", "hochrisiko-ki"],
    "COMP": ["compliance", "konformität", "audit", "zertifizierung", "regulier", "vorschrift"],
    "GOV": ["behörde", "aufsicht", "governance", "marktüberwachung", "authority"],
    "FIN": ["finanz", "zahlungs", "payment", "crypto", "krypto-", "geldwäsche", "aml"],
    "ENV": ["umwelt", "environment", "batterie", "recycling", "entsorgu", "nachhaltig"],
}
|
||||
|
||||
# ── Prompt (same as control_generator.py) ────────────────────────────

# System prompt (German): positions the model as a security-compliance expert
# and demands a raw-JSON answer (a JSON array when multiple controls result).
# Runtime string — do not edit the German wording casually.
SYSTEM_PROMPT = """Du bist ein Security-Compliance-Experte. Strukturiere den gegebenen Text
als praxisorientiertes Security Control. Erstelle eine verständliche, umsetzbare Formulierung.
Antworte NUR mit validem JSON. Bei mehreren Controls antworte mit einem JSON-Array."""
|
||||
|
||||
APPLICABILITY_PROMPT = """- applicable_industries: Liste der Branchen fuer die dieses Control relevant ist.
|
||||
Verwende ["all"] wenn der Control branchenuebergreifend gilt.
|
||||
Moegliche Werte: "all", "Technologie / IT", "IT Dienstleistungen", "E-Commerce / Handel",
|
||||
"Finanzdienstleistungen", "Versicherungen", "Gesundheitswesen", "Pharma", "Bildung",
|
||||
"Beratung / Consulting", "Marketing / Agentur", "Produktion / Industrie",
|
||||
"Logistik / Transport", "Immobilien", "Bau", "Energie", "Automobil",
|
||||
"Luft- / Raumfahrt", "Maschinenbau", "Anlagenbau", "Automatisierung", "Robotik",
|
||||
"Messtechnik", "Agrar", "Chemie", "Minen / Bergbau", "Telekommunikation",
|
||||
"Medien / Verlage", "Gastronomie / Hotellerie", "Recht / Kanzlei",
|
||||
"Oeffentlicher Dienst", "Verteidigung / Ruestung", "Wasser- / Abwasserwirtschaft",
|
||||
"Lebensmittel", "Digitale Infrastruktur", "Weltraum", "Post / Kurierdienste",
|
||||
"Abfallwirtschaft", "Forschung"
|
||||
- applicable_company_size: Ab welcher Unternehmensgroesse gilt dieses Control?
|
||||
Verwende ["all"] wenn keine Groessenbeschraenkung.
|
||||
Moegliche Werte: "all", "micro", "small", "medium", "large", "enterprise"
|
||||
- scope_conditions: null wenn keine besonderen Bedingungen, sonst:
|
||||
{"requires_any": ["signal"], "description": "Erklaerung"}
|
||||
Moegliche Signale: "uses_ai", "third_country_transfer", "processes_health_data",
|
||||
"processes_minors_data", "automated_decisions", "employee_monitoring",
|
||||
"video_surveillance", "financial_data", "is_kritis_operator", "payment_services" """
|
||||
|
||||
# Closed vocabulary of control categories offered to the model; rendered
# into the prompt via CATEGORY_LIST_STR below.
CATEGORY_LIST = [
    "Datenschutz-Grundlagen", "Betroffenenrechte", "Technische Massnahmen",
    "Organisatorische Massnahmen", "Auftragsverarbeitung", "Datentransfer",
    "Risikomanagement", "Incident Response", "KI-Regulierung", "Cybersicherheit",
    "Zugriffskontrolle", "Kryptographie", "Netzwerksicherheit", "Compliance-Management",
    "Produktsicherheit", "Marktüberwachung", "Supply Chain Security",
    "Finanzregulierung", "Arbeitsrecht", "Gewerberecht", "Handelsrecht",
    "Umwelt / Nachhaltigkeit", "Dokumentation", "Schulung / Awareness",
]
# Pre-rendered comma-separated, double-quoted list for prompt interpolation.
CATEGORY_LIST_STR = ", ".join(f'"{c}"' for c in CATEGORY_LIST)
|
||||
|
||||
|
||||
def build_prompt(source_name, article_label, article_text, license_type):
    """Build the (German) user prompt asking Claude to structure one article.

    Args:
        source_name: Human-readable regulation/standard name.
        article_label: Article/paragraph label, e.g. "Artikel 10" or "§ 42".
        article_text: Extracted article text; truncated to 3000 chars below.
        license_type: License identifier quoted to the model as permission
            to reuse the original wording.

    Returns:
        The fully interpolated prompt string (includes APPLICABILITY_PROMPT
        and CATEGORY_LIST_STR).
    """
    # Runtime f-string: the German wording is the model contract — keep intact.
    return f"""Strukturiere den folgenden Gesetzestext als Security/Compliance Control.
Du DARFST den Originaltext verwenden (Quelle: {source_name}, {license_type}).

WICHTIG: Erstelle eine verständliche, praxisorientierte Formulierung.
Der Originaltext wird separat gespeichert — deine Formulierung soll klar und umsetzbar sein.

Gib JSON zurück mit diesen Feldern:
- title: Kurzer prägnanter Titel (max 100 Zeichen)
- objective: Was soll erreicht werden? (1-3 Sätze)
- rationale: Warum ist das wichtig? (1-2 Sätze)
- requirements: Liste von konkreten Anforderungen (Strings)
- test_procedure: Liste von Prüfschritten (Strings)
- evidence: Liste von Nachweisdokumenten (Strings)
- severity: low/medium/high/critical
- tags: Liste von Tags
- domain: Fachgebiet als Kuerzel (AUTH=Authentifizierung, CRYP=Kryptographie, NET=Netzwerk, DATA=Datenschutz, LOG=Logging, ACC=Zugriffskontrolle, SEC=IT-Sicherheit, INC=Vorfallmanagement, AI=KI, COMP=Compliance, GOV=Behoerden/Verwaltung, LAB=Arbeitsrecht, FIN=Finanzregulierung, TRD=Gewerbe/Handelsrecht, ENV=Umwelt, HLT=Gesundheit)
- category: Inhaltliche Kategorie. Moegliche Werte: {CATEGORY_LIST_STR}
- target_audience: Liste der Zielgruppen (z.B. "unternehmen", "behoerden", "entwickler", "datenschutzbeauftragte", "geschaeftsfuehrung", "it-abteilung", "rechtsabteilung", "compliance-officer")
- source_article: Artikel-/Paragraphen-Referenz (z.B. "Artikel 10", "§ 42")
- source_paragraph: Absatz-Referenz (z.B. "Absatz 5", "Nr. 2")
{APPLICABILITY_PROMPT}

Text: {article_text[:3000]}
Quelle: {source_name}, {article_label}"""
|
||||
|
||||
|
||||
# ── PDF article extraction ───────────────────────────────────────────

def extract_article_text(pdf_file, article_label, doc_type, full_text=None):
    """Extract the text of a specific article/section from a source document.

    Args:
        pdf_file: Path handed to ``read_file`` when *full_text* is not given.
        article_label: Label such as "Artikel 10", "§ 42", or a NIST id.
        doc_type: "eu_regulation", "de_law", "nist", or anything else
            (generic / OWASP / ENISA handling).
        full_text: Optional pre-read document text (avoids re-reading).

    Returns:
        The article text capped at 3000 characters, or "" when not found.
    """
    if full_text is None:
        full_text = read_file(pdf_file)
    if not full_text:
        return ""

    def _numbered_slice(marker_fmt):
        """Slice from a numbered heading up to the next-numbered heading.

        marker_fmt is a regex template taking the article number, e.g.
        producing "\\nArtikel 5\\n" or "\\n§ 5".
        """
        num_match = re.search(r'\d+', article_label)
        if not num_match:
            return ""
        num = int(num_match.group())
        match = re.search(marker_fmt.format(num), full_text)
        if not match:
            return ""
        start = match.start()
        # BUGFIX: search for the NEXT heading only *after* the current one.
        # Searching the whole text could hit an earlier cross-reference to
        # article N+1, giving end < start and an empty extract.
        next_match = re.search(marker_fmt.format(num + 1), full_text[match.end():])
        if next_match:
            end = match.end() + next_match.start()
        else:
            end = min(start + 5000, len(full_text))
        return full_text[start:end].strip()[:3000]

    if doc_type == "eu_regulation":
        # EU regulations: "Artikel N" headings on their own line.
        return _numbered_slice(r'\nArtikel\s+{}\s*\n')

    elif doc_type == "de_law":
        # German laws: "§ N" headings.
        return _numbered_slice(r'\n§\s+{}\b')

    elif doc_type == "nist":
        # NIST docs: match the literal control/section label at line start.
        escaped = re.escape(article_label)
        match = re.search(rf'(?:^|\n)\s*{escaped}\b', full_text)
        if not match:
            return ""
        start = match.start()
        return full_text[start:start + 3000].strip()

    else:
        # Generic / OWASP / ENISA: label may appear anywhere on a line.
        escaped = re.escape(article_label)
        match = re.search(rf'(?:^|\n).*{escaped}\b', full_text)
        if not match:
            return ""
        start = match.start()
        return full_text[start:start + 3000].strip()
|
||||
|
||||
|
||||
# ── Anthropic API ────────────────────────────────────────────────────

def call_anthropic(prompt, system_prompt):
    """POST one message to the Anthropic Messages API.

    Returns a 4-tuple ``(parsed_data, raw_text, usage, error)``: on success
    ``error`` is None; on any failure ``parsed_data`` is None, ``raw_text``
    is "", ``usage`` is {}, and ``error`` carries a short description.
    """
    request_headers = {
        "x-api-key": ANTHROPIC_API_KEY,
        "anthropic-version": "2023-06-01",
        "content-type": "application/json",
    }
    request_body = {
        "model": ANTHROPIC_MODEL,
        "max_tokens": 4096,
        "system": system_prompt,
        "messages": [{"role": "user", "content": prompt}],
    }

    try:
        response = requests.post(
            ANTHROPIC_URL, headers=request_headers, json=request_body, timeout=120
        )
        if response.status_code != 200:
            return None, "", {}, f"HTTP {response.status_code}: {response.text[:200]}"
        body = response.json()
        blocks = body.get("content")
        raw_text = blocks[0]["text"] if blocks else ""
        # Best-effort JSON extraction; raw text is returned alongside.
        return parse_json(raw_text), raw_text, body.get("usage", {}), None
    except Exception as exc:
        # Network errors, timeouts, and malformed responses all surface here.
        return None, "", {}, str(exc)
|
||||
|
||||
|
||||
def parse_json(text):
    """Parse JSON from an LLM response, tolerating markdown code fences.

    Returns a dict (the first element when the response is a JSON array),
    or None when nothing parseable is found.
    """
    text = text.strip()
    if text.startswith("```"):
        # Drop the opening fence line, and the closing fence line if present.
        lines = text.split("\n")
        text = "\n".join(lines[1:-1] if lines[-1].strip().startswith("```") else lines[1:])
        text = text.strip()

    def _unwrap(data):
        """Mirror the contract: arrays collapse to their first element."""
        if isinstance(data, list):
            return data[0] if data else None
        return data

    try:
        return _unwrap(json.loads(text))
    except json.JSONDecodeError:
        # Fallback: extract the first embedded JSON object — or array, which
        # the old code missed, returning None for "noise [ ... ] noise".
        match = re.search(r'\{[\s\S]*\}|\[[\s\S]*\]', text)
        if match:
            try:
                return _unwrap(json.loads(match.group()))
            except json.JSONDecodeError:
                return None
        return None
|
||||
|
||||
|
||||
# ── Domain detection ─────────────────────────────────────────────────

def detect_domain(text):
    """Return the best-matching domain code for *text* by keyword counting.

    Counts how many DOMAIN_KEYWORDS substrings occur in the lowercased text
    per domain; the first domain (in dict order) with the highest count wins.
    Falls back to "SEC" when no keyword matches at all.
    """
    lowered = text.lower()
    best_domain, best_hits = "SEC", 0
    for domain, keywords in DOMAIN_KEYWORDS.items():
        hits = sum(1 for kw in keywords if kw in lowered)
        # Strict '>' keeps the earliest domain on ties, matching dict order.
        if hits > best_hits:
            best_domain, best_hits = domain, hits
    return best_domain
|
||||
|
||||
|
||||
# ── Control ID generation ────────────────────────────────────────────

def generate_control_id(domain, cur):
    """Generate the next available control_id for a domain prefix.

    Uses MAX(numeric suffix) on the DB side to find the true highest number,
    avoiding gaps from string-sorted IDs (e.g. COMP-99 > COMP-1000 in text
    sort).

    Args:
        domain: Domain code; uppercased and truncated to 4 chars as prefix.
        cur: Open DB cursor (psycopg2-style, %s placeholders).

    Returns:
        An id like "COMP-43"; "COMP-1" when the prefix is unused.
    """
    prefix = domain.upper()[:4]
    cur.execute("""
        SELECT MAX(CAST(SPLIT_PART(control_id, '-', 2) AS INTEGER))
        FROM compliance.canonical_controls
        WHERE control_id LIKE %s
          AND SPLIT_PART(control_id, '-', 2) ~ '^[0-9]+$'
    """, (f"{prefix}-%",))
    row = cur.fetchone()
    if row and row[0] is not None:
        return f"{prefix}-{row[0] + 1}"
    # BUGFIX: start unpadded ("COMP-1"), matching the unpadded increments
    # above; the old zero-padded "COMP-001" base case produced two different
    # ID formats under the same prefix.
    return f"{prefix}-1"
|
||||
|
||||
|
||||
# ── Main ─────────────────────────────────────────────────────────────

def main():
    """Run the phase-7.4 gap fill end to end.

    Steps: parse CLI flags → load the gap-analysis results → connect to
    Postgres (with TCP keepalive and an auto-reconnect helper) → for each
    gap article: extract its text from the cached PDF, ask Claude to
    structure it as a control, and insert it into
    compliance.canonical_controls. Ends with a token/cost/stats summary and
    writes new control UUIDs to /tmp/phase74_generated_ids.json.
    """
    parser = argparse.ArgumentParser(description="Phase 7.4: Generate controls for gap articles")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be generated")
    parser.add_argument("--source", type=str, help="Filter by source name substring")
    parser.add_argument("--resume", action="store_true", help="Skip articles that already have controls")
    parser.add_argument("--results", default=GAP_RESULTS_FILE, help="Path to gap_analysis_results.json")
    args = parser.parse_args()

    # Fail fast without credentials (checked even for --dry-run).
    if not ANTHROPIC_API_KEY:
        print("ERROR: Set ANTHROPIC_API_KEY")
        sys.exit(1)

    # Load gap results produced by the gap-analysis phase.
    with open(args.results) as f:
        gaps = json.load(f)
    total_gaps = sum(len(g["gap_articles"]) for g in gaps)
    print(f"Loaded {len(gaps)} sources with {total_gaps} gap articles")

    # Optional case-insensitive substring filter on the source name.
    if args.source:
        gaps = [g for g in gaps if args.source.lower() in g["source"].lower()]
        total_gaps = sum(len(g["gap_articles"]) for g in gaps)
        print(f"Filtered to {len(gaps)} sources, {total_gaps} gaps")

    # DB connection with keepalive + reconnect helper
    db_url = os.environ['DATABASE_URL']
    parsed = urllib.parse.urlparse(db_url)

    def connect_db():
        """Create DB connection with TCP keepalive; returns (conn, cursor)."""
        c = psycopg2.connect(
            host=parsed.hostname, port=parsed.port or 5432,
            user=parsed.username, password=parsed.password,
            dbname=parsed.path.lstrip('/'),
            options="-c search_path=compliance,public",
            keepalives=1, keepalives_idle=30,
            keepalives_interval=10, keepalives_count=5,
        )
        return c, c.cursor()

    conn, cur = connect_db()

    def ensure_db():
        """Reconnect if connection is dead; True when a reconnect happened."""
        nonlocal conn, cur
        try:
            # Cheap liveness probe.
            cur.execute("SELECT 1")
        except Exception:
            print(" [RECONNECT] DB connection lost, reconnecting...")
            try:
                conn.close()
            except Exception:
                pass
            conn, cur = connect_db()
            return True
        return False

    # Get framework UUID — all inserted controls hang off bp_security_v1.
    cur.execute("SELECT id FROM compliance.canonical_control_frameworks WHERE framework_id = 'bp_security_v1' LIMIT 1")
    fw_row = cur.fetchone()
    if not fw_row:
        print("ERROR: Framework bp_security_v1 not found")
        sys.exit(1)
    framework_uuid = fw_row[0]

    # If resuming, load existing (source, article) pairs to skip.
    # NOTE(review): resume matching compares the gap label against the stored
    # citation 'article', which is the model's source_article when given and
    # may differ from the label — confirm this matches in practice.
    existing_articles = {}
    if args.resume:
        cur.execute("""
            SELECT source_citation->>'source', source_citation->>'article'
            FROM compliance.canonical_controls
            WHERE source_citation->>'article' IS NOT NULL
        """)
        for src, art in cur.fetchall():
            existing_articles.setdefault(src, set)().add(art) if False else existing_articles.setdefault(src, set()).add(art)
        print(f"Resume mode: {sum(len(v) for v in existing_articles.values())} existing article-control pairs")

    # Stats
    stats = Counter()
    total_input_tokens = 0
    total_output_tokens = 0
    generated_ids = []
    errors = []
    t_start = time.time()

    # Pre-read PDFs (cache full text per source)
    pdf_cache = {}

    # Process sources with the most gap articles first.
    for gap_source in sorted(gaps, key=lambda g: -len(g["gap_articles"])):
        source_name = gap_source["source"]
        gap_articles = gap_source["gap_articles"]
        filename = SOURCE_FILE_MAP.get(source_name)
        reg_code = SOURCE_TO_REGCODE.get(source_name, "unknown")
        license_info = LICENSE_MAP.get(reg_code, {"license": "UNKNOWN", "rule": 1, "source_type": "unknown"})
        doc_type = classify_doc(source_name)

        # No mapped PDF file → cannot extract anything for this source.
        if not filename:
            stats["skipped_no_pdf"] += len(gap_articles)
            continue

        # Read PDF once per source
        if source_name not in pdf_cache:
            pdf_cache[source_name] = read_file(filename)
        full_text = pdf_cache[source_name]
        if not full_text:
            stats["skipped_no_pdf"] += len(gap_articles)
            continue

        print(f"\n{'='*70}")
        print(f"{source_name} — {len(gap_articles)} gaps (rule {license_info['rule']}, {doc_type})")
        print(f"{'='*70}")

        for gap in gap_articles:
            article_label = gap["label"]
            article_type = gap["type"]

            # Skip if already has controls (resume mode)
            if args.resume and article_label in existing_articles.get(source_name, set()):
                stats["skipped_exists"] += 1
                continue

            # Skip non-substantive NIST sections (intro chapters)
            if doc_type == "nist" and article_type == "section":
                section_match = re.match(r'Section (\d+)', article_label)
                if section_match and int(section_match.group(1)) <= 3:
                    stats["skipped_intro"] += 1
                    continue

            # Extract article text; very short extracts are treated as misses.
            article_text = extract_article_text(filename, article_label, doc_type, full_text)
            if not article_text or len(article_text) < 30:
                stats["skipped_short_text"] += 1
                print(f" SKIP {article_label}: text too short ({len(article_text)} chars)")
                continue

            if args.dry_run:
                print(f" [DRY] {article_label} ({len(article_text)} chars)")
                stats["would_generate"] += 1
                continue

            # Call Anthropic
            prompt = build_prompt(source_name, article_label, article_text, license_info["license"])
            data, raw, usage, error = call_anthropic(prompt, SYSTEM_PROMPT)

            # Token totals feed the cost estimate in the summary.
            total_input_tokens += usage.get("input_tokens", 0)
            total_output_tokens += usage.get("output_tokens", 0)

            if error:
                stats["api_error"] += 1
                errors.append(f"{source_name} {article_label}: {error}")
                print(f" ERROR {article_label}: {error}")
                # Back off harder after an API error before continuing.
                time.sleep(5)
                continue

            if not data:
                stats["parse_error"] += 1
                print(f" PARSE ERROR {article_label}")
                continue

            # Ensure DB is alive before writing
            ensure_db()

            # Build control fields; coerce defensively, as the LLM may return
            # scalars where lists are expected.
            title = str(data.get("title", ""))[:200]
            objective = str(data.get("objective", ""))
            rationale = str(data.get("rationale", ""))
            domain = str(data.get("domain", detect_domain(article_text))).upper()[:4]
            if not domain or len(domain) < 2:
                # Model gave no usable domain code — fall back to keywords.
                domain = detect_domain(article_text)

            control_id = generate_control_id(domain, cur)
            severity = str(data.get("severity", "medium")).lower()
            if severity not in ("low", "medium", "high", "critical"):
                severity = "medium"

            requirements = data.get("requirements", [])
            if not isinstance(requirements, list):
                requirements = [str(requirements)]
            test_procedure = data.get("test_procedure", [])
            if not isinstance(test_procedure, list):
                test_procedure = [str(test_procedure)]
            evidence = data.get("evidence", [])
            if not isinstance(evidence, list):
                evidence = [str(evidence)]
            tags = data.get("tags", [])
            if not isinstance(tags, list):
                tags = []
            target_audience = data.get("target_audience", [])
            if not isinstance(target_audience, list):
                target_audience = []
            applicable_industries = data.get("applicable_industries", ["all"])
            if not isinstance(applicable_industries, list):
                applicable_industries = ["all"]
            applicable_company_size = data.get("applicable_company_size", ["all"])
            if not isinstance(applicable_company_size, list):
                applicable_company_size = ["all"]
            scope_conditions = data.get("scope_conditions")

            # Citation stored as JSONB; resume mode reads 'article' back.
            source_citation = {
                "source": source_name,
                "article": data.get("source_article", article_label),
                "paragraph": data.get("source_paragraph", ""),
                "article_type": article_type,
                "license": license_info["license"],
                "source_type": license_info["source_type"],
            }

            generation_metadata = {
                "processing_path": "phase74_gap_fill",
                "license_rule": license_info["rule"],
                "source_regulation": reg_code,
                "source_article": article_label,
                "gap_fill": True,
            }

            category = str(data.get("category", "")) or None

            # Insert into DB; ON CONFLICT keeps reruns idempotent per
            # (framework_id, control_id).
            try:
                cur.execute("""
                    INSERT INTO compliance.canonical_controls (
                        framework_id, control_id, title, objective, rationale,
                        scope, requirements, test_procedure, evidence,
                        severity, risk_score, implementation_effort,
                        open_anchors, release_state, tags,
                        license_rule, source_original_text, source_citation,
                        customer_visible, generation_metadata,
                        verification_method, category, generation_strategy,
                        target_audience, pipeline_version,
                        applicable_industries, applicable_company_size, scope_conditions
                    ) VALUES (
                        %s, %s, %s, %s, %s,
                        %s, %s, %s, %s,
                        %s, %s, %s,
                        %s, %s, %s,
                        %s, %s, %s,
                        %s, %s,
                        %s, %s, %s,
                        %s, %s,
                        %s, %s, %s
                    )
                    ON CONFLICT (framework_id, control_id) DO NOTHING
                    RETURNING id
                """, (
                    framework_uuid, control_id, title, objective, rationale,
                    json.dumps({}), json.dumps(requirements), json.dumps(test_procedure), json.dumps(evidence),
                    severity, 5, "m",
                    json.dumps([]), "draft", json.dumps(tags),
                    license_info["rule"], article_text, json.dumps(source_citation),
                    True, json.dumps(generation_metadata),
                    "document", category, "phase74_gap_fill",
                    json.dumps(target_audience), PIPELINE_VERSION,
                    json.dumps(applicable_industries), json.dumps(applicable_company_size),
                    json.dumps(scope_conditions) if scope_conditions else None,
                ))
                conn.commit()
                # NOTE(review): fetchone() after commit() relies on psycopg2
                # buffering the RETURNING row client-side; fetching before
                # commit would be the conventional order — confirm intended.
                row = cur.fetchone()
                if row:
                    generated_ids.append(str(row[0]))
                    stats["generated"] += 1
                    print(f" OK {control_id}: {title[:60]}")
                else:
                    # No RETURNING row → the ON CONFLICT clause fired.
                    stats["conflict"] += 1
                    print(f" CONFLICT {control_id} (already exists)")
            except Exception as e:
                conn.rollback()
                stats["db_error"] += 1
                errors.append(f"DB {control_id}: {str(e)[:100]}")
                print(f" DB ERROR {control_id}: {str(e)[:100]}")

            # Rate limit: ~0.5s between calls
            time.sleep(0.5)

    # ── Summary ──────────────────────────────────────────────────────
    elapsed = time.time() - t_start
    # Hard-coded rates: $3 per 1M input tokens, $15 per 1M output tokens.
    cost = (total_input_tokens * 3 + total_output_tokens * 15) / 1_000_000

    print(f"\n\n{'='*70}")
    print(f"PHASE 7.4 — {'DRY-RUN' if args.dry_run else 'ERGEBNIS'}")
    print(f"{'='*70}")
    print(f" Laufzeit: {elapsed/60:.1f} min")
    print(f" API-Kosten: ${cost:.2f}")
    print(f" Input Tokens: {total_input_tokens:,}")
    print(f" Output Tokens: {total_output_tokens:,}")
    print()
    for key in sorted(stats.keys()):
        print(f" {key:<25s}: {stats[key]:5d}")
    print()

    if generated_ids:
        print(f" Neue Control-IDs: {len(generated_ids)}")
        # Save generated IDs for downstream phases (e.g. dedup review).
        with open("/tmp/phase74_generated_ids.json", 'w') as f:
            json.dump(generated_ids, f)
        print(f" IDs gespeichert: /tmp/phase74_generated_ids.json")

    if errors:
        print(f"\n Fehler ({len(errors)}):")
        for e in errors[:20]:
            print(f" {e}")
        if len(errors) > 20:
            print(f" ... und {len(errors)-20} weitere")

    conn.close()
|
||||
|
||||
|
||||
# Script entry point.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user