feat: Control Library UI, dedup migration, QA tooling, docs
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 31s
CI/CD / test-python-backend-compliance (push) Successful in 1m35s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 17s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 31s
CI/CD / test-python-backend-compliance (push) Successful in 1m35s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 17s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
- Control Library: parent control display, ObligationTypeBadge, GenerationStrategyBadge variants, evidence string fallback - API: expose parent_control_uuid/id/title in canonical controls - Fix: DSFA SQLAlchemy 2.0 Row._mapping compatibility - Migration 074: control_parent_links + control_dedup_reviews tables - QA scripts: benchmark, gap analysis, OSCAL import, OWASP cleanup, phase5 normalize, phase74 gap fill, sync_db, run_job - Docs: dedup engine, RAG benchmark, lessons learned, pipeline docs Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,11 +1,29 @@
|
||||
"""Apply PDF QA results: update source_citation with correct article + article_type."""
|
||||
"""
|
||||
Apply PDF QA results: update source_citation with correct article_type + article.
|
||||
|
||||
Safety modes:
|
||||
--safe (default): Only set article_type. Set article only when empty. Mark preamble as recital_suspect.
|
||||
--force-article: Also overwrite existing articles (CAREFUL: NIST substring matching is unreliable).
|
||||
--dry-run: Show what would change without writing.
|
||||
|
||||
Usage:
|
||||
python3 apply_pdf_qa_results.py # safe mode (apply article_type + empty articles)
|
||||
python3 apply_pdf_qa_results.py --dry-run # show changes without writing
|
||||
python3 apply_pdf_qa_results.py --force-article # also overwrite existing articles
|
||||
"""
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import psycopg2
|
||||
import urllib.parse
|
||||
from collections import Counter
|
||||
|
||||
RESULTS_FILE = "/tmp/pdf_qa_results.json"
|
||||
|
||||
# Parse args
|
||||
dry_run = "--dry-run" in sys.argv
|
||||
force_article = "--force-article" in sys.argv
|
||||
|
||||
# Load results
|
||||
with open(RESULTS_FILE) as f:
|
||||
results = json.load(f)
|
||||
@@ -21,35 +39,101 @@ conn = psycopg2.connect(
|
||||
options="-c search_path=compliance,public"
|
||||
)
|
||||
|
||||
# Update in batches
|
||||
# Load current DB state for all affected controls
|
||||
cur = conn.cursor()
|
||||
updated = 0
|
||||
ctrl_ids = [r["ctrl_id"] for r in results]
|
||||
cur.execute("""
|
||||
SELECT id,
|
||||
source_citation->>'article' as article,
|
||||
source_citation->>'article_type' as article_type,
|
||||
source_citation->>'source' as source
|
||||
FROM compliance.canonical_controls
|
||||
WHERE id = ANY(%s::uuid[])
|
||||
""", (ctrl_ids,))
|
||||
db_state = {}
|
||||
for row in cur.fetchall():
|
||||
db_state[str(row[0])] = {"article": row[1] or "", "article_type": row[2], "source": row[3]}
|
||||
|
||||
# Counters
|
||||
stats = Counter()
|
||||
updated_type = 0
|
||||
updated_article = 0
|
||||
updated_recital = 0
|
||||
errors = 0
|
||||
unchanged = 0
|
||||
|
||||
for i, r in enumerate(results):
|
||||
ctrl_id = r["ctrl_id"]
|
||||
article_label = r["article_label"]
|
||||
article_type = r["article_type"] # preamble, article, annex, section, unknown
|
||||
new_article = r["article_label"]
|
||||
new_type = r["article_type"]
|
||||
db = db_state.get(ctrl_id, {})
|
||||
|
||||
if not db:
|
||||
stats["missing_in_db"] += 1
|
||||
continue
|
||||
|
||||
old_type = db.get("article_type")
|
||||
old_article = db.get("article", "").strip()
|
||||
|
||||
# Decide what to update
|
||||
set_type = (old_type != new_type)
|
||||
set_article = (not old_article) or (force_article and old_article != new_article)
|
||||
set_recital = (new_type == "preamble")
|
||||
|
||||
if set_type:
|
||||
stats["type_" + ("new" if not old_type else "changed")] += 1
|
||||
else:
|
||||
stats["type_unchanged"] += 1
|
||||
|
||||
if not old_article and set_article:
|
||||
stats["article_new"] += 1
|
||||
elif old_article and old_article != new_article:
|
||||
if force_article:
|
||||
stats["article_force_changed"] += 1
|
||||
else:
|
||||
stats["article_skipped"] += 1
|
||||
else:
|
||||
stats["article_unchanged"] += 1
|
||||
|
||||
if set_recital:
|
||||
stats["recital"] += 1
|
||||
|
||||
if dry_run:
|
||||
continue
|
||||
|
||||
try:
|
||||
# Update source_citation: set article and article_type
|
||||
cur.execute("""
|
||||
UPDATE compliance.canonical_controls
|
||||
SET source_citation = source_citation
|
||||
|| jsonb_build_object('article', %s, 'article_type', %s),
|
||||
updated_at = now()
|
||||
WHERE id = %s::uuid
|
||||
AND (
|
||||
source_citation->>'article' IS DISTINCT FROM %s
|
||||
OR source_citation->>'article_type' IS DISTINCT FROM %s
|
||||
)
|
||||
""", (article_label, article_type, ctrl_id, article_label, article_type))
|
||||
# Build JSONB update
|
||||
updates = {}
|
||||
if set_type:
|
||||
updates["article_type"] = new_type
|
||||
if set_article:
|
||||
updates["article"] = new_article
|
||||
|
||||
if cur.rowcount > 0:
|
||||
updated += 1
|
||||
else:
|
||||
unchanged += 1
|
||||
if updates:
|
||||
# Merge into source_citation
|
||||
cur.execute("""
|
||||
UPDATE compliance.canonical_controls
|
||||
SET source_citation = COALESCE(source_citation, '{}'::jsonb) || %s::jsonb,
|
||||
updated_at = now()
|
||||
WHERE id = %s::uuid
|
||||
""", (json.dumps(updates), ctrl_id))
|
||||
if set_type:
|
||||
updated_type += 1
|
||||
if set_article:
|
||||
updated_article += 1
|
||||
|
||||
# Mark preamble as recital_suspect
|
||||
if set_recital:
|
||||
cur.execute("""
|
||||
UPDATE compliance.canonical_controls
|
||||
SET generation_metadata = jsonb_set(
|
||||
COALESCE(generation_metadata, '{}'::jsonb),
|
||||
'{recital_suspect}',
|
||||
'true'::jsonb
|
||||
),
|
||||
updated_at = now()
|
||||
WHERE id = %s::uuid
|
||||
""", (ctrl_id,))
|
||||
updated_recital += 1
|
||||
|
||||
except Exception as e:
|
||||
errors += 1
|
||||
@@ -58,12 +142,37 @@ for i, r in enumerate(results):
|
||||
conn.rollback()
|
||||
continue
|
||||
|
||||
if (i + 1) % 500 == 0:
|
||||
if (i + 1) % 1000 == 0:
|
||||
conn.commit()
|
||||
print(f" Progress: {i+1}/{len(results)} (updated: {updated}, unchanged: {unchanged}, errors: {errors})")
|
||||
print(f" Progress: {i+1}/{len(results)}")
|
||||
|
||||
conn.commit()
|
||||
print(f"\nDone: {updated} updated, {unchanged} unchanged, {errors} errors out of {len(results)}")
|
||||
if not dry_run:
|
||||
conn.commit()
|
||||
|
||||
mode = "DRY-RUN" if dry_run else "APPLIED"
|
||||
print(f"\n{'='*60}")
|
||||
print(f" Mode: {mode}")
|
||||
print(f"{'='*60}")
|
||||
print(f"\n article_type:")
|
||||
print(f" New (was NULL): {stats['type_new']:5d}")
|
||||
print(f" Changed: {stats['type_changed']:5d}")
|
||||
print(f" Unchanged: {stats['type_unchanged']:5d}")
|
||||
print(f"\n article:")
|
||||
print(f" New (was empty): {stats['article_new']:5d}")
|
||||
if force_article:
|
||||
print(f" Force-changed: {stats['article_force_changed']:5d}")
|
||||
else:
|
||||
print(f" Differs (SKIPPED): {stats['article_skipped']:5d}")
|
||||
print(f" Unchanged: {stats['article_unchanged']:5d}")
|
||||
print(f"\n Preamble/Recital: {stats['recital']:5d}")
|
||||
print(f" Missing in DB: {stats['missing_in_db']:5d}")
|
||||
|
||||
if not dry_run:
|
||||
print(f"\n Updates written:")
|
||||
print(f" article_type: {updated_type:5d}")
|
||||
print(f" article: {updated_article:5d}")
|
||||
print(f" recital_suspect: {updated_recital:5d}")
|
||||
print(f" Errors: {errors:5d}")
|
||||
|
||||
# Verify: count by article_type
|
||||
cur.execute("""
|
||||
|
||||
524
scripts/qa/benchmark_llm_controls.py
Normal file
524
scripts/qa/benchmark_llm_controls.py
Normal file
@@ -0,0 +1,524 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Phase 7.4 Benchmark: Compare gpt-oss-120b vs Claude Sonnet for Control Generation.
|
||||
|
||||
Tests 5 representative gap articles from different sources.
|
||||
Measures: quality (JSON valid, fields complete), response time, cost estimate.
|
||||
|
||||
Usage:
|
||||
python3 benchmark_llm_controls.py
|
||||
"""
|
||||
import json
|
||||
import time
|
||||
import sys
|
||||
import os
|
||||
import requests
|
||||
from pathlib import Path
|
||||
|
||||
# ── Config ──────────────────────────────────────────────────────────
LITELLM_URL = "https://llm-dev.meghsakha.com"
LITELLM_MODEL = "gpt-oss-120b"
# SECURITY FIX: the key was hard-coded here (a leaked credential). Read it
# from the environment instead, mirroring ANTHROPIC_API_KEY below.
LITELLM_API_KEY = os.environ.get("LITELLM_API_KEY", "")

ANTHROPIC_URL = "https://api.anthropic.com/v1/messages"
ANTHROPIC_MODEL = "claude-sonnet-4-6"
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")

# Directory holding the source-law PDFs to extract article text from.
PDF_DIR = Path(os.path.expanduser("~/rag-ingestion/pdfs"))

try:
    import fitz  # PyMuPDF — optional; extraction degrades gracefully without it
except ImportError:
    print("PyMuPDF not available, using pre-extracted texts")
    fitz = None
|
||||
|
||||
# ── Prompts (identical to control_generator.py) ─────────────────────

SYSTEM_PROMPT = """Du bist ein Security-Compliance-Experte. Strukturiere den gegebenen Text
als praxisorientiertes Security Control. Erstelle eine verständliche, umsetzbare Formulierung.
Antworte NUR mit validem JSON. Bei mehreren Controls antworte mit einem JSON-Array."""

APPLICABILITY_PROMPT = """- applicable_industries: Liste der Branchen fuer die dieses Control relevant ist.
Verwende ["all"] wenn der Control branchenuebergreifend gilt.
Moegliche Werte: "all", "Technologie / IT", "Finanzdienstleistungen", "Gesundheitswesen",
"Produktion / Industrie", "Energie", "Telekommunikation", "Oeffentlicher Dienst"
- applicable_company_size: Ab welcher Unternehmensgroesse gilt dieses Control?
Verwende ["all"] wenn keine Groessenbeschraenkung.
Moegliche Werte: "all", "micro", "small", "medium", "large", "enterprise"
- scope_conditions: null wenn keine besonderen Bedingungen, sonst:
{"requires_any": ["signal"], "description": "Erklaerung"}"""


def build_prompt(source_name: str, article_label: str, article_text: str, license_type: str) -> str:
    """Assemble the user prompt asking the model to structure a legal text
    excerpt as a security/compliance control (German instructions)."""
    # Cap the excerpt so the prompt stays within a predictable token budget.
    excerpt = article_text[:3000]
    return f"""Strukturiere den folgenden Gesetzestext als Security/Compliance Control.
Du DARFST den Originaltext verwenden (Quelle: {source_name}, {license_type}).

WICHTIG: Erstelle eine verständliche, praxisorientierte Formulierung.
Der Originaltext wird separat gespeichert — deine Formulierung soll klar und umsetzbar sein.

Gib JSON zurück mit diesen Feldern:
- title: Kurzer prägnanter Titel (max 100 Zeichen)
- objective: Was soll erreicht werden? (1-3 Sätze)
- rationale: Warum ist das wichtig? (1-2 Sätze)
- requirements: Liste von konkreten Anforderungen (Strings)
- test_procedure: Liste von Prüfschritten (Strings)
- evidence: Liste von Nachweisdokumenten (Strings)
- severity: low/medium/high/critical
- tags: Liste von Tags
- domain: Fachgebiet (AUTH/CRYP/NET/DATA/LOG/ACC/SEC/INC/AI/COMP/GOV)
- category: Inhaltliche Kategorie
- target_audience: Liste der Zielgruppen
- source_article: Artikel-Referenz (z.B. "Artikel 10", "§ 42")
- source_paragraph: Absatz-Referenz (z.B. "Absatz 5")
{APPLICABILITY_PROMPT}

Text: {excerpt}
Quelle: {source_name}, {article_label}"""
|
||||
|
||||
|
||||
# ── PDF Text Extraction ─────────────────────────────────────────────

def extract_article_text(pdf_file: str, article_label: str, doc_type: str) -> str:
    """Extract the text of a specific article from a PDF.

    Returns "" when the PDF/extractor is unavailable or the label carries
    no number; returns a bracketed "[... nicht im PDF gefunden]" marker
    when the heading cannot be located in the extracted text.
    """
    import re

    path = PDF_DIR / pdf_file
    if not path.exists() or fitz is None:
        return ""

    pdf = fitz.open(str(path))
    # Concatenate page texts, one newline after each page (matches how the
    # heading regexes below anchor on "\n").
    full_text = "".join(page.get_text() + "\n" for page in pdf)
    pdf.close()

    def _slice_numbered(make_pattern, make_missing):
        """Shared logic for numbered headings (EU articles, German §§):
        slice from this heading up to the next one (or +5000 chars)."""
        digits = re.search(r'\d+', article_label)
        if not digits:
            return ""
        n = int(digits.group())
        hit = re.search(make_pattern(n), full_text)
        if not hit:
            return make_missing(n)
        begin = hit.start()
        nxt = re.search(make_pattern(n + 1), full_text)
        finish = nxt.start() if nxt else begin + 5000
        return full_text[begin:finish].strip()[:3000]

    if doc_type == "eu_regulation":
        # "Artikel N" headings on their own line.
        return _slice_numbered(
            lambda n: rf'\nArtikel\s+{n}\s*\n',
            lambda n: f"[Artikel {n} nicht im PDF gefunden]",
        )

    if doc_type == "de_law":
        # "§ N" headings.
        return _slice_numbered(
            lambda n: rf'\n§\s+{n}\b',
            lambda n: f"[§ {n} nicht im PDF gefunden]",
        )

    if doc_type == "nist":
        # NIST control family label at start of a line.
        hit = re.search(rf'(?:^|\n)\s*{re.escape(article_label)}\b', full_text)
        if not hit:
            return f"[{article_label} nicht im PDF gefunden]"
        return full_text[hit.start():hit.start() + 3000].strip()

    # Generic fallback: first line containing the label anywhere.
    hit = re.search(rf'(?:^|\n).*{re.escape(article_label)}\b', full_text)
    if not hit:
        return f"[{article_label} nicht im PDF gefunden]"
    return full_text[hit.start():hit.start() + 3000].strip()
|
||||
|
||||
|
||||
# ── API Calls ────────────────────────────────────────────────────────

def call_litellm(prompt: str, system_prompt: str) -> tuple:
    """Call the LiteLLM OpenAI-compatible chat-completions endpoint.

    Returns a 4-tuple (response_text, duration_seconds, error, usage):
    `error` is None on success, a message string on failure; `usage` is
    the provider's token-usage dict ({} on any error path).
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {LITELLM_API_KEY}",
    }
    payload = {
        "model": LITELLM_MODEL,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
        "temperature": 0.3,
        "max_tokens": 4096,
        "stream": False,
    }

    t0 = time.time()
    try:
        resp = requests.post(
            f"{LITELLM_URL}/v1/chat/completions",
            headers=headers,
            json=payload,
            timeout=180,
        )
        duration = time.time() - t0
        if resp.status_code != 200:
            # BUG FIX: this branch previously returned a 3-tuple while every
            # other path returns 4 values — callers unpack four and crashed
            # with ValueError on any HTTP error. Return the empty usage dict.
            return "", duration, f"HTTP {resp.status_code}: {resp.text[:200]}", {}
        data = resp.json()
        content = data["choices"][0]["message"]["content"]
        usage = data.get("usage", {})
        return content, duration, None, usage
    except Exception as e:
        return "", time.time() - t0, str(e), {}
|
||||
|
||||
|
||||
def call_anthropic(prompt: str, system_prompt: str) -> tuple:
    """Call the Anthropic Messages API.

    Returns a 4-tuple (response_text, duration_seconds, error, usage) —
    the previous docstring advertised a 3-tuple, but every path returns
    four values. `error` is None on success; `usage` is the provider's
    token-usage dict ({} on any error path).
    """
    headers = {
        "x-api-key": ANTHROPIC_API_KEY,
        "anthropic-version": "2023-06-01",
        "content-type": "application/json",
    }
    payload = {
        "model": ANTHROPIC_MODEL,
        "max_tokens": 4096,
        "system": system_prompt,
        "messages": [{"role": "user", "content": prompt}],
    }

    t0 = time.time()
    try:
        resp = requests.post(ANTHROPIC_URL, headers=headers, json=payload, timeout=180)
        duration = time.time() - t0
        if resp.status_code != 200:
            return "", duration, f"HTTP {resp.status_code}: {resp.text[:200]}", {}
        data = resp.json()
        # The Messages API returns a list of content blocks; take the first
        # text block (this script sends text-only prompts).
        content = data["content"][0]["text"] if data.get("content") else ""
        usage = data.get("usage", {})
        return content, duration, None, usage
    except Exception as e:
        return "", time.time() - t0, str(e), {}
|
||||
|
||||
|
||||
# ── Quality Assessment ───────────────────────────────────────────────

REQUIRED_FIELDS = [
    "title", "objective", "rationale", "requirements",
    "test_procedure", "evidence", "severity", "domain",
]

BONUS_FIELDS = [
    "tags", "category", "target_audience", "source_article",
    "applicable_industries", "applicable_company_size",
]


def _is_filled(val, min_str_len):
    """A field counts as filled when it is a string longer than
    min_str_len or a non-empty list; any other type counts as empty."""
    if not val:
        return False
    if isinstance(val, str):
        return len(val) > min_str_len
    return isinstance(val, list) and len(val) > 0


def _strip_fences(text):
    """Remove a surrounding markdown code fence, if present."""
    if not text.startswith("```"):
        return text
    lines = text.split("\n")
    body = lines[1:-1] if lines[-1].startswith("```") else lines[1:]
    return "\n".join(body)


def _parse_json_lenient(text):
    """Parse JSON; on failure, retry with the first {...} span found.
    Arrays collapse to their first element ({} when empty). Returns
    None when nothing parseable is found."""
    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        import re
        m = re.search(r'\{[\s\S]*\}', text)
        if not m:
            return None
        try:
            return json.loads(m.group())
        except json.JSONDecodeError:
            return None
    if isinstance(data, list):
        data = data[0] if data else {}
    return data


def assess_quality(raw_text: str) -> dict:
    """Assess the quality of a control generation response.

    Produces a 0-100 score from JSON validity, required/bonus field
    completeness and depth metrics; on valid JSON the parsed payload is
    attached under result["parsed_data"].
    """
    result = {
        "json_valid": False,
        "required_fields": 0,
        "required_total": len(REQUIRED_FIELDS),
        "bonus_fields": 0,
        "bonus_total": len(BONUS_FIELDS),
        "requirements_count": 0,
        "test_procedure_count": 0,
        "evidence_count": 0,
        "title_length": 0,
        "objective_length": 0,
        "score": 0,
    }

    data = _parse_json_lenient(_strip_fences(raw_text.strip()))
    if data is None:
        return result

    result["json_valid"] = True

    # Field completeness: required strings must exceed 2 chars, bonus
    # strings merely need to be non-empty; lists must be non-empty.
    result["required_fields"] = sum(1 for f in REQUIRED_FIELDS if _is_filled(data.get(f), 2))
    result["bonus_fields"] = sum(1 for f in BONUS_FIELDS if _is_filled(data.get(f), 0))

    # Depth metrics
    reqs = data.get("requirements", [])
    result["requirements_count"] = len(reqs) if isinstance(reqs, list) else 0
    tp = data.get("test_procedure", [])
    result["test_procedure_count"] = len(tp) if isinstance(tp, list) else 0
    ev = data.get("evidence", [])
    result["evidence_count"] = len(ev) if isinstance(ev, list) else 0
    result["title_length"] = len(data.get("title", ""))
    result["objective_length"] = len(data.get("objective", ""))

    # Score 0-100: 20 validity + 40 required + 15 bonus + depth bonuses.
    score = 20
    score += (result["required_fields"] / result["required_total"]) * 40
    score += (result["bonus_fields"] / result["bonus_total"]) * 15
    score += min(result["requirements_count"], 5) * 3   # max 15 for 5+ requirements
    score += min(result["test_procedure_count"], 3) * 3  # max 9 for 3+ tests
    if result["objective_length"] > 50:
        score += 1
    result["score"] = round(score, 1)

    result["parsed_data"] = data
    return result
|
||||
|
||||
|
||||
# ── Test Cases ───────────────────────────────────────────────────────

def _case(source, article, pdf, doc_type, license_type, description):
    """Build one benchmark test-case record."""
    return {
        "source": source,
        "article": article,
        "pdf": pdf,
        "doc_type": doc_type,
        "license": license_type,
        "description": description,
    }


# Five representative gap articles, one per source regulation.
TEST_CASES = [
    _case("DSGVO (EU) 2016/679", "Artikel 32", "dsgvo_2016_679.pdf",
          "eu_regulation", "EU_LAW",
          "Sicherheit der Verarbeitung — Kernthema Datenschutz"),
    _case("KI-Verordnung (EU) 2024/1689", "Artikel 9", "ai_act_2024_1689.pdf",
          "eu_regulation", "EU_LAW",
          "Risikomanagement für Hochrisiko-KI"),
    _case("NIS2-Richtlinie (EU) 2022/2555", "Artikel 21", "nis2_2022_2555.pdf",
          "eu_regulation", "EU_LAW",
          "Cybersicherheitsrisikomanagement — NIS2 Kernpflicht"),
    _case("Cyber Resilience Act (CRA)", "Artikel 13", "cra_2024_2847.pdf",
          "eu_regulation", "EU_LAW",
          "Pflichten der Hersteller"),
    _case("Bundesdatenschutzgesetz (BDSG)", "§ 26", "bdsg.pdf",
          "de_law", "DE_LAW",
          "Datenverarbeitung im Beschäftigungskontext"),
]
|
||||
|
||||
|
||||
# ── Main ─────────────────────────────────────────────────────────────

def _report_model(raw, duration, err, usage):
    """Print the per-model quality report and return the quality dict.

    Extracted because the original had this ~20-line block copy-pasted
    for both models.
    """
    if err:
        print(f" ERROR: {err}")
        return {"json_valid": False, "score": 0}
    print(f" Time: {duration:.1f}s")
    print(f" Tokens: {usage}")
    quality = assess_quality(raw)
    print(f" JSON valid: {quality['json_valid']}")
    print(f" Score: {quality['score']}/100")
    print(f" Required fields: {quality['required_fields']}/{quality['required_total']}")
    print(f" Requirements: {quality['requirements_count']}, "
          f"Tests: {quality['test_procedure_count']}, "
          f"Evidence: {quality['evidence_count']}")
    if quality.get("parsed_data"):
        d = quality["parsed_data"]
        print(f" Title: {d.get('title', 'N/A')}")
    return quality


def _result_entry(raw, duration, quality, usage):
    """Build the per-model slice of a benchmark result record (deduped
    from two identical dict literals in the original)."""
    return {
        "time": round(duration, 1),
        "score": quality.get("score", 0),
        "json_valid": quality.get("json_valid", False),
        "requirements": quality.get("requirements_count", 0),
        "tests": quality.get("test_procedure_count", 0),
        "usage": usage,
        "raw": raw[:500] if raw else "",
    }


def main():
    """Run the gpt-oss-120b vs Claude Sonnet benchmark over TEST_CASES
    and print per-test reports, a summary table and a cost estimate."""
    if not ANTHROPIC_API_KEY:
        print("ERROR: Set ANTHROPIC_API_KEY environment variable")
        sys.exit(1)

    print("=" * 80)
    print("LLM BENCHMARK: gpt-oss-120b vs Claude Sonnet 4.6")
    print("=" * 80)
    print(f" LiteLLM: {LITELLM_URL} / {LITELLM_MODEL}")
    print(f" Anthropic: {ANTHROPIC_MODEL}")
    print(f" Tests: {len(TEST_CASES)}")
    print()

    # Pre-check that the LiteLLM endpoint is reachable before burning time.
    try:
        r = requests.get(f"{LITELLM_URL}/v1/models",
                         headers={"Authorization": f"Bearer {LITELLM_API_KEY}"}, timeout=10)
        print(f" LiteLLM OK: {r.status_code}")
    except Exception as e:
        print(f" LiteLLM ERROR: {e}")
        sys.exit(1)

    results = []

    for i, tc in enumerate(TEST_CASES):
        print(f"\n{'='*80}")
        print(f"TEST {i+1}/{len(TEST_CASES)}: {tc['source']} — {tc['article']}")
        print(f" {tc['description']}")
        print(f"{'='*80}")

        # Extract article text from PDF; a "[...]" marker means not found.
        article_text = extract_article_text(tc["pdf"], tc["article"], tc["doc_type"])
        if not article_text or article_text.startswith("["):
            print(f" WARNING: {article_text or 'Empty text'}")
            continue

        print(f" Text extracted: {len(article_text)} chars")
        print(f" First 120 chars: {article_text[:120].replace(chr(10), ' ')}...")

        prompt = build_prompt(tc["source"], tc["article"], article_text, tc["license"])

        # ── Call LiteLLM ──
        print("\n --- gpt-oss-120b ---")
        litellm_raw, litellm_time, litellm_err, litellm_usage = call_litellm(prompt, SYSTEM_PROMPT)
        litellm_quality = _report_model(litellm_raw, litellm_time, litellm_err, litellm_usage)

        # ── Call Anthropic ──
        print("\n --- Claude Sonnet 4.6 ---")
        anthropic_raw, anthropic_time, anthropic_err, anthropic_usage = call_anthropic(prompt, SYSTEM_PROMPT)
        anthropic_quality = _report_model(anthropic_raw, anthropic_time, anthropic_err, anthropic_usage)

        # Compare
        print("\n --- VERGLEICH ---")
        speed_ratio = litellm_time / anthropic_time if anthropic_time > 0 else 0
        print(f" Speed: 120b {litellm_time:.1f}s vs Sonnet {anthropic_time:.1f}s "
              f"({'120b ' + str(round(speed_ratio, 1)) + 'x langsamer' if speed_ratio > 1 else '120b schneller'})")
        print(f" Score: 120b {litellm_quality.get('score', 0)}/100 vs "
              f"Sonnet {anthropic_quality.get('score', 0)}/100")

        results.append({
            "test": f"{tc['source']} — {tc['article']}",
            "litellm": _result_entry(litellm_raw, litellm_time, litellm_quality, litellm_usage),
            "anthropic": _result_entry(anthropic_raw, anthropic_time, anthropic_quality, anthropic_usage),
        })

    # ── Summary ──────────────────────────────────────────────────────
    print(f"\n\n{'='*80}")
    print("ZUSAMMENFASSUNG")
    print(f"{'='*80}")

    if not results:
        print(" Keine Ergebnisse.")
        return

    litellm_scores = [r["litellm"]["score"] for r in results]
    anthropic_scores = [r["anthropic"]["score"] for r in results]
    litellm_times = [r["litellm"]["time"] for r in results]
    anthropic_times = [r["anthropic"]["time"] for r in results]

    print(f"\n {'Metrik':<30s} {'gpt-oss-120b':>15s} {'Claude Sonnet':>15s}")
    print(f" {'-'*30} {'-'*15} {'-'*15}")
    print(f" {'Avg Score (0-100)':<30s} {sum(litellm_scores)/len(litellm_scores):>13.1f} "
          f"{sum(anthropic_scores)/len(anthropic_scores):>13.1f}")
    print(f" {'Avg Time (s)':<30s} {sum(litellm_times)/len(litellm_times):>13.1f} "
          f"{sum(anthropic_times)/len(anthropic_times):>13.1f}")
    print(f" {'JSON Valid':<30s} {sum(1 for r in results if r['litellm']['json_valid']):>12d}/{len(results)} "
          f"{sum(1 for r in results if r['anthropic']['json_valid']):>12d}/{len(results)}")
    print(f" {'Avg Requirements':<30s} "
          f"{sum(r['litellm']['requirements'] for r in results)/len(results):>13.1f} "
          f"{sum(r['anthropic']['requirements'] for r in results)/len(results):>13.1f}")
    print(f" {'Avg Test Procedures':<30s} "
          f"{sum(r['litellm']['tests'] for r in results)/len(results):>13.1f} "
          f"{sum(r['anthropic']['tests'] for r in results)/len(results):>13.1f}")

    # Cost estimate — Claude Sonnet: ~$3/M input, ~$15/M output tokens;
    # gpt-oss-120b is self-hosted, so $0 API cost (compute only).
    total_anthropic_input = sum(r["anthropic"]["usage"].get("input_tokens", 0) for r in results)
    total_anthropic_output = sum(r["anthropic"]["usage"].get("output_tokens", 0) for r in results)
    anthropic_cost = (total_anthropic_input * 3 + total_anthropic_output * 15) / 1_000_000

    print(f"\n Kostenvergleich (fuer {len(results)} Controls):")
    print(" gpt-oss-120b: $0.00 (self-hosted)")
    print(f" Claude Sonnet: ${anthropic_cost:.4f} "
          f"({total_anthropic_input} input + {total_anthropic_output} output tokens)")

    # Extrapolate for the 494 gap articles. (The original wrapped this in
    # `if results:` — redundant after the early return above.)
    cost_per_control = anthropic_cost / len(results)
    print("\n Hochrechnung fuer 494 Luecken-Artikel:")
    print(" gpt-oss-120b: $0.00")
    print(f" Claude Sonnet: ${cost_per_control * 494:.2f}")
    avg_time_120b = sum(litellm_times) / len(litellm_times)
    avg_time_sonnet = sum(anthropic_times) / len(anthropic_times)
    print(f" Zeit 120b: {avg_time_120b * 494 / 60:.0f} min ({avg_time_120b * 494 / 3600:.1f}h)")
    print(f" Zeit Sonnet: {avg_time_sonnet * 494 / 60:.0f} min ({avg_time_sonnet * 494 / 3600:.1f}h)")

    # Save full results for later inspection.
    out_path = "/tmp/benchmark_llm_results.json"
    with open(out_path, 'w') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    print(f"\n Detaillierte Ergebnisse: {out_path}")


if __name__ == "__main__":
    main()
|
||||
200
scripts/qa/blue_guide_en_match.py
Normal file
200
scripts/qa/blue_guide_en_match.py
Normal file
@@ -0,0 +1,200 @@
|
||||
"""Match unmatched Blue Guide controls against the English PDF."""
|
||||
import os
|
||||
import re
|
||||
import json
|
||||
import unicodedata
|
||||
import psycopg2
|
||||
import urllib.parse
|
||||
|
||||
try:
|
||||
import fitz
|
||||
except ImportError:
|
||||
print("ERROR: PyMuPDF (fitz) not installed")
|
||||
exit(1)
|
||||
|
||||
PDF_PATH = os.path.expanduser("~/rag-ingestion/pdfs/blue_guide_2022_en.pdf")
|
||||
|
||||
# Single-pass character cleanup table: soft hyphen and zero-width space
# dropped, NBSP -> plain space, common Latin ligatures expanded, typographic
# quotes/dashes/bullets mapped to ASCII. NOTE: the original chained 14
# str.replace calls and replaced U+00AD twice ('\u00ad' and '\xad' are the
# same code point); one translate table does the same work in one pass.
_CHAR_TABLE = str.maketrans({
    '\u00ad': '',              # soft hyphen
    '\u200b': '',              # zero-width space
    '\u00a0': ' ',             # no-break space
    '\ufb01': 'fi', '\ufb02': 'fl',
    '\ufb00': 'ff', '\ufb03': 'ffi', '\ufb04': 'ffl',
    '\u2019': "'", '\u2018': "'",
    '\u201c': '"', '\u201d': '"',
    '\u2013': '-', '\u2014': '-',
    '\u2022': '-', '\u00b7': '-',
})
# Control chars except \t (\x09), \n (\x0a), \r (\x0d): those survive here
# and are collapsed by the whitespace pass below.
_CTRL_RE = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f]')
_WS_RE = re.compile(r'\s+')


def normalize(s):
    """Normalize PDF-extracted text for robust substring matching.

    Applies the character table above, strips residual control characters,
    NFC-normalizes, collapses whitespace runs to single spaces, and strips
    leading/trailing whitespace.
    """
    s = s.translate(_CHAR_TABLE)
    s = _CTRL_RE.sub('', s)
    s = unicodedata.normalize('NFC', s)
    return _WS_RE.sub(' ', s).strip()
|
||||
|
||||
# Read EN PDF: flatten every page of the English Blue Guide into one string.
print(f"Reading {PDF_PATH}...")
doc = fitz.open(PDF_PATH)
text = ""
for page in doc:
    text += page.get_text() + "\n"
doc.close()
print(f" {len(text):,} chars")

# Normalized copy used for all substring searches below.
text_norm = normalize(text)

# Build article index for EN Blue Guide
# EN Blue Guide uses "Article N" headings (not "Artikel N")
items = []

# Find where "Article 1" starts — content before is preamble/intro
art1_match = re.search(r'\nArticle\s+1\s*\n', text)
if not art1_match:
    # Try section-based structure instead
    print(" No 'Article N' headings found, trying section-based index...")
    for m in re.finditer(r'(?:^|\n)\s*(\d+(?:\.\d+)*)\.\s+[A-Z]', text, re.MULTILINE):
        items.append((m.start(), f"Section {m.group(1)}", "section"))
else:
    art1_pos = art1_match.start()  # NOTE(review): assigned but never read
    # Article headings
    for m in re.finditer(r'(?:^|\n)\s*Article\s+(\d+[a-z]?)\s*\n', text, re.MULTILINE):
        art_num = int(re.match(r'(\d+)', m.group(1)).group(1))  # NOTE(review): unused
        items.append((m.start(), f"Article {m.group(1)}", "article"))

    # Annex markers
    for m in re.finditer(r'(?:^|\n)\s*ANNEX\s+([IVXLC]+[a-z]?)\b', text, re.MULTILINE):
        items.append((m.start(), f"Annex {m.group(1)}", "annex"))

    # Also try numbered section headings as fallback
    for m in re.finditer(r'(?:^|\n)\s*(\d+\.\d+(?:\.\d+)?)\s+[A-Z]', text, re.MULTILINE):
        items.append((m.start(), f"Section {m.group(1)}", "section"))

# Sort by position in the raw text; keep first occurrence of each label.
items.sort(key=lambda x: x[0])
seen = set()
unique = []
for pos, label, typ in items:
    if label not in seen:
        seen.add(label)
        unique.append((pos, label, typ))

print(f" Index: {len(unique)} sections")
if unique[:5]:
    for pos, label, typ in unique[:5]:
        print(f" {label} [{typ}] @ pos {pos}")

# Precompute normalized positions: a heading's offset in text_norm is the
# normalized length of everything before it in the raw text.
index_norm = []
for pos, label, typ in unique:
    norm_pos = len(normalize(text[:pos]))
    index_norm.append((norm_pos, label, typ))
|
||||
|
||||
# Connect to DB (DATABASE_URL parsed manually; search_path pinned to the
# compliance schema).
db_url = os.environ['DATABASE_URL']
parsed = urllib.parse.urlparse(db_url)
conn = psycopg2.connect(
    host=parsed.hostname, port=parsed.port or 5432,
    user=parsed.username, password=parsed.password,
    dbname=parsed.path.lstrip('/'),
    options="-c search_path=compliance,public"
)
cur = conn.cursor()

# Get Blue Guide controls without article_type (unmatched). Controls with
# very short original text (<= 50 chars) are excluded — too little signal
# for substring matching.
cur.execute("""
    SELECT id, control_id, title, source_original_text,
           source_citation->>'article' as existing_article,
           source_citation->>'article_type' as existing_type,
           release_state
    FROM compliance.canonical_controls
    WHERE source_citation->>'source' = 'EU Blue Guide 2022'
      AND source_original_text IS NOT NULL
      AND length(source_original_text) > 50
      AND (source_citation->>'article_type' IS NULL)
    ORDER BY control_id
""")
controls = cur.fetchall()
print(f"\nUnmatched Blue Guide controls: {len(controls)}")
|
||||
|
||||
# Match each control: try snippets taken at several relative offsets and
# lengths from the control's normalized original text until one is found
# verbatim in the normalized PDF text. The first hit wins.
results = []
found = 0
not_found = 0

for ctrl in controls:
    # existing_art / existing_type are fetched but not used here.
    ctrl_id, control_id, title, orig_text, existing_art, existing_type, state = ctrl
    orig_norm = normalize(orig_text)
    if len(orig_norm) < 30:
        # Too short after normalization to match reliably.
        not_found += 1
        continue

    matched = False
    # Offsets ordered by empirical hit likelihood; lengths longest-first
    # so the most specific snippet is tried before shorter (riskier) ones.
    for start_frac in [0.25, 0.1, 0.5, 0.0, 0.75]:
        for length in [80, 60, 40, 30, 20]:
            start = max(0, int(len(orig_norm) * start_frac))
            snippet = orig_norm[start:start+length]
            if not snippet or len(snippet) < 15:
                continue
            pos = text_norm.find(snippet)
            if pos >= 0:
                # Find section: nearest heading at or before the hit position.
                label = "Unknown"
                typ = "unknown"
                for h_pos, h_label, h_type in reversed(index_norm):
                    if h_pos <= pos:
                        label = h_label
                        typ = h_type
                        break
                results.append({
                    "ctrl_id": str(ctrl_id),
                    "control_id": control_id,
                    "source": "EU Blue Guide 2022",
                    "article_label": label,
                    "article_type": typ,
                })
                found += 1
                is_active = "" if state not in ('duplicate', 'too_close') else " [DUP]"
                print(f" {control_id:10s}: {label:25s} [{typ:8s}]{is_active}")
                matched = True
                break  # stop trying lengths for this offset
        if matched:
            break  # stop trying offsets for this control

    if not matched:
        not_found += 1
        print(f" {control_id:10s}: NOT FOUND {title[:50]}")

print(f"\n{'='*50}")
print(f"Results: {found} matched, {not_found} not found out of {len(controls)}")
|
||||
|
||||
# Save results (JSON audit trail of everything that will be applied).
out_path = "/tmp/blue_guide_en_results.json"
with open(out_path, 'w') as f:
    json.dump(results, f, indent=2, ensure_ascii=False)
print(f"Saved to {out_path}")

# Apply results to DB: merge article + article_type into source_citation.
# The WHERE guard makes the update idempotent — rows already carrying the
# same values are left untouched (rowcount 0).
if results:
    print(f"\nApplying {len(results)} results to DB...")
    applied = 0
    for r in results:
        cur.execute("""
            UPDATE compliance.canonical_controls
            SET source_citation = source_citation ||
                jsonb_build_object('article', %s, 'article_type', %s)
            WHERE id = %s::uuid
              AND (source_citation->>'article' IS DISTINCT FROM %s
                   OR source_citation->>'article_type' IS DISTINCT FROM %s)
        """, (r["article_label"], r["article_type"],
              r["ctrl_id"], r["article_label"], r["article_type"]))
        if cur.rowcount > 0:
            applied += 1
    conn.commit()
    print(f" Applied: {applied} controls updated")

# Show type distribution (empty when nothing was matched).
type_counts = {}
for r in results:
    t = r["article_type"]
    type_counts[t] = type_counts.get(t, 0) + 1
if type_counts:
    print(f"\nArticle type distribution:")
    for t, c in sorted(type_counts.items(), key=lambda x: -x[1]):
        print(f" {t:12s}: {c:5d}")

conn.close()
|
||||
188
scripts/qa/gap_analysis.py
Normal file
188
scripts/qa/gap_analysis.py
Normal file
@@ -0,0 +1,188 @@
|
||||
"""
|
||||
Phase 7.3: Gap Analysis — Identify articles/sections WITHOUT controls.
|
||||
|
||||
For each regulation PDF:
|
||||
1. Extract all articles/sections from the PDF
|
||||
2. Compare with controls in the DB that reference this article
|
||||
3. Report gaps (articles with no controls)
|
||||
|
||||
Usage:
|
||||
python3 gap_analysis.py # show all gaps
|
||||
python3 gap_analysis.py --source "DSGVO" # filter by source
|
||||
"""
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import re
|
||||
import psycopg2
|
||||
import urllib.parse
|
||||
from pathlib import Path
|
||||
from collections import defaultdict
|
||||
|
||||
# Import from pdf_qa_all
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
from pdf_qa_all import (
|
||||
SOURCE_FILE_MAP, read_file, classify_doc, normalize,
|
||||
build_eu_article_index, build_de_law_index, build_nist_index,
|
||||
build_owasp_index, build_generic_index, MAX_ARTICLES
|
||||
)
|
||||
|
||||
# Only analyze sources with significant control counts (skip sources with <5 controls)
|
||||
MIN_CONTROLS = 5
|
||||
|
||||
|
||||
def main():
    """Run the gap analysis and print a per-source coverage report.

    Loads (source, article) pairs for all active controls from the DB,
    rebuilds each regulation's article index from its PDF via the
    pdf_qa_all helpers, and reports substantive articles that have no
    controls. The full report is written to
    /tmp/gap_analysis_results.json.
    """
    # Optional "--source <substring>" filter from argv.
    source_filter = None
    if "--source" in sys.argv:
        idx = sys.argv.index("--source")
        if idx + 1 < len(sys.argv):
            source_filter = sys.argv[idx + 1]

    # DB connection (search_path pinned to the compliance schema).
    db_url = os.environ['DATABASE_URL']
    parsed = urllib.parse.urlparse(db_url)
    conn = psycopg2.connect(
        host=parsed.hostname, port=parsed.port or 5432,
        user=parsed.username, password=parsed.password,
        dbname=parsed.path.lstrip('/'),
        options="-c search_path=compliance,public"
    )
    cur = conn.cursor()

    # Controls per (source, article, article_type); duplicate/too_close
    # release states are excluded.
    cur.execute("""
        SELECT source_citation->>'source' as source,
               source_citation->>'article' as article,
               source_citation->>'article_type' as article_type,
               count(*) as cnt
        FROM compliance.canonical_controls
        WHERE source_citation->>'source' IS NOT NULL
          AND release_state NOT IN ('duplicate', 'too_close')
        GROUP BY 1, 2, 3
        ORDER BY 1, 2
    """)

    # Build: source -> {article -> (type, count)}
    controls_by_source = defaultdict(dict)
    for source, article, art_type, cnt in cur.fetchall():
        if article:
            controls_by_source[source][article] = (art_type or "unknown", cnt)

    total_gaps = 0
    total_articles_checked = 0
    total_covered = 0
    gap_report = []

    sources_to_check = sorted(SOURCE_FILE_MAP.keys())
    if source_filter:
        sources_to_check = [s for s in sources_to_check if source_filter.lower() in s.lower()]

    for source_name in sources_to_check:
        filename = SOURCE_FILE_MAP.get(source_name)
        if filename is None:
            continue

        controls = controls_by_source.get(source_name, {})
        # Skip low-signal sources unless the user filtered explicitly.
        if len(controls) < MIN_CONTROLS and not source_filter:
            continue

        # Read PDF and build article index with the doc-type-specific builder.
        text = read_file(filename)
        if text is None:
            continue

        doc_type = classify_doc(source_name)
        max_art = MAX_ARTICLES.get(source_name)

        if doc_type == "eu_regulation":
            index = build_eu_article_index(text, max_article=max_art)
        elif doc_type == "de_law":
            index = build_de_law_index(text)
        elif doc_type == "nist":
            index = build_nist_index(text)
        elif doc_type == "owasp":
            index = build_owasp_index(text, source_name)
        else:
            index = build_generic_index(text)

        if not index:
            continue

        # Only substantive articles count toward gaps; preamble and annex
        # entries are reported separately.
        substantive_types = {"article", "section", "control", "requirement", "category"}
        substantive_articles = [(pos, label, typ) for pos, label, typ in index if typ in substantive_types]

        preamble_articles = [(pos, label, typ) for pos, label, typ in index if typ == "preamble"]
        annex_articles = [(pos, label, typ) for pos, label, typ in index if typ == "annex"]

        # Check which articles have controls.
        covered = []
        gaps = []
        for pos, label, typ in substantive_articles:
            if label in controls:
                covered.append(label)
            else:
                gaps.append((label, typ))

        total_articles_checked += len(substantive_articles)
        total_covered += len(covered)
        total_gaps += len(gaps)

        # Count preamble/annex controls on the DB side.
        preamble_controls = sum(1 for a in controls if controls[a][0] == "preamble")
        annex_controls = sum(1 for a in controls if controls[a][0] == "annex")

        coverage_pct = len(covered) / len(substantive_articles) * 100 if substantive_articles else 0

        print(f"\n{'='*70}")
        print(f"{source_name}")
        print(f" PDF articles: {len(substantive_articles)} substantive, "
              f"{len(preamble_articles)} preamble, {len(annex_articles)} annex")
        print(f" DB controls: {sum(v[1] for v in controls.values())} total "
              f"({preamble_controls} preamble, {annex_controls} annex)")
        print(f" Coverage: {len(covered)}/{len(substantive_articles)} "
              f"({coverage_pct:.0f}%)")

        if gaps:
            print(f" GAPS ({len(gaps)}):")
            for label, typ in gaps[:30]:  # limit output
                print(f" - {label} [{typ}]")
            if len(gaps) > 30:
                print(f" ... and {len(gaps)-30} more")

        gap_report.append({
            "source": source_name,
            "total_articles": len(substantive_articles),
            "covered": len(covered),
            "gaps": len(gaps),
            "coverage_pct": round(coverage_pct, 1),
            "gap_articles": [{"label": l, "type": t} for l, t in gaps],
        })

    # Summary
    print(f"\n{'='*70}")
    print("GAP ANALYSIS SUMMARY")
    print(f"{'='*70}")
    # BUG FIX: this line previously printed
    #   len([r for r in gap_report]) + len([s for s in sources_to_check if SOURCE_FILE_MAP.get(s)])
    # which double-counted every analyzed source (each analyzed source
    # appends exactly one gap_report entry). len(gap_report) is the count.
    print(f" Sources analyzed: {len(gap_report)}")
    print(f" Total articles in PDFs: {total_articles_checked}")
    print(f" Articles with controls: {total_covered}")
    print(f" Articles WITHOUT controls: {total_gaps}")
    if total_articles_checked:
        print(f" Overall coverage: {total_covered/total_articles_checked*100:.1f}%")

    print(f"\n Sources with gaps:")
    for r in sorted(gap_report, key=lambda x: -x["gaps"]):
        print(f" {r['source']:45s} {r['gaps']:4d} gaps "
              f"({r['covered']}/{r['total_articles']} = {r['coverage_pct']}%)")

    # Save report
    out_path = "/tmp/gap_analysis_results.json"
    with open(out_path, 'w') as f:
        json.dump(gap_report, f, indent=2, ensure_ascii=False)
    print(f"\n Full report saved to {out_path}")

    conn.close()


if __name__ == "__main__":
    main()
|
||||
288
scripts/qa/oscal_analysis.py
Normal file
288
scripts/qa/oscal_analysis.py
Normal file
@@ -0,0 +1,288 @@
|
||||
"""Analyze NIST OSCAL data and compare with existing controls in DB."""
|
||||
import os
|
||||
import re
|
||||
import json
|
||||
import psycopg2
|
||||
import urllib.parse
|
||||
from collections import defaultdict
|
||||
|
||||
# Directory holding the downloaded NIST OSCAL catalog JSON files.
OSCAL_DIR = os.path.expanduser("~/rag-ingestion/nist-oscal")

# ── Load SP 800-53 Rev 5 ──
with open(os.path.join(OSCAL_DIR, "sp800-53-rev5-catalog.json")) as f:
    sp853 = json.load(f)["catalog"]

print("=" * 70)
print("NIST SP 800-53 Rev 5 — OSCAL Catalog Analysis")
print("=" * 70)
print(f" UUID: {sp853.get('uuid', '?')}")
print(f" Last Modified: {sp853.get('metadata', {}).get('last-modified', '?')}")

# Count controls: top-level "groups" are families; each control may nest
# enhancement controls under its own "controls" key.
families = sp853.get("groups", [])
total_base = 0
total_enhancements = 0
total_withdrawn = 0
total_active = 0
family_stats = []

for fam in families:
    fam_id = fam.get("id", "?")
    fam_title = fam.get("title", "?")
    controls = fam.get("controls", [])
    base = 0
    enhancements = 0
    withdrawn = 0

    for ctrl in controls:
        # Check if withdrawn (props carry a "status" entry when so).
        props = {p["name"]: p.get("value", "") for p in ctrl.get("props", [])}
        is_withdrawn = props.get("status") == "withdrawn"
        if is_withdrawn:
            withdrawn += 1
        else:
            base += 1

        # Count enhancements (withdrawn ones counted as withdrawn, too).
        for enh in ctrl.get("controls", []):
            enh_props = {p["name"]: p.get("value", "") for p in enh.get("props", [])}
            if enh_props.get("status") == "withdrawn":
                withdrawn += 1
            else:
                enhancements += 1

    family_stats.append((fam_id, fam_title, base, enhancements, withdrawn))
    total_base += base
    total_enhancements += enhancements
    total_withdrawn += withdrawn

total_active = total_base + total_enhancements
print(f"\n Families: {len(families)}")
print(f" Base Controls: {total_base}")
print(f" Enhancements: {total_enhancements}")
print(f" Withdrawn: {total_withdrawn}")
print(f" TOTAL ACTIVE: {total_active}")

print(f"\n Per Family:")
print(f" {'ID':6s} {'Title':45s} {'Base':>5s} {'Enh':>5s} {'Wdrn':>5s}")
for fam_id, title, base, enh, wdrn in family_stats:
    print(f" {fam_id:6s} {title[:45]:45s} {base:5d} {enh:5d} {wdrn:5d}")
|
||||
|
||||
# Show example control structure (AC-6 is used as a representative sample).
print(f"\n Example Control (AC-6 Least Privilege):")
for fam in families:
    for ctrl in fam.get("controls", []):
        if ctrl["id"] == "ac-6":
            props = {p["name"]: p.get("value", "") for p in ctrl.get("props", [])}
            print(f" ID: {ctrl['id']}")
            print(f" Label: {props.get('label', '?')}")
            print(f" Title: {ctrl['title']}")
            for part in ctrl.get("parts", []):
                if part.get("name") == "statement":
                    prose = part.get("prose", "")
                    print(f" Statement: {prose[:150]}...")
                elif part.get("name") == "guidance":
                    prose = part.get("prose", "")
                    print(f" Guidance: {prose[:150]}...")
            enh_count = len(ctrl.get("controls", []))
            print(f" Enhancements: {enh_count}")
            links = [l["href"].lstrip("#") for l in ctrl.get("links", []) if l.get("rel") == "related"]
            print(f" Related: {', '.join(links[:8])}...")
            # NOTE(review): break only exits the inner per-family loop;
            # outer loop keeps scanning remaining families (harmless, since
            # "ac-6" occurs once).
            break

# ── Load CSF 2.0 ──
print(f"\n{'='*70}")
print("NIST CSF 2.0 — OSCAL Catalog Analysis")
print("=" * 70)

with open(os.path.join(OSCAL_DIR, "csf-2.0-catalog.json")) as f:
    csf = json.load(f)["catalog"]

# CSF nesting: function group -> category groups -> subcategory "controls".
csf_groups = csf.get("groups", [])
csf_total = 0
for grp in csf_groups:
    func_title = grp.get("title", "?")
    cats = grp.get("groups", [])
    subcats = 0
    for cat in cats:
        subcats += len(cat.get("controls", []))
    csf_total += subcats
    print(f" {func_title:25s}: {len(cats):2d} categories, {subcats:3d} subcategories")

print(f" TOTAL: {csf_total} subcategories")
|
||||
|
||||
# ── Compare with existing DB controls ──
print(f"\n{'='*70}")
print("VERGLEICH: OSCAL vs. bestehende Controls in DB")
print("=" * 70)

db_url = os.environ['DATABASE_URL']
parsed = urllib.parse.urlparse(db_url)
conn = psycopg2.connect(
    host=parsed.hostname, port=parsed.port or 5432,
    user=parsed.username, password=parsed.password,
    dbname=parsed.path.lstrip('/'),
    options="-c search_path=compliance,public"
)
cur = conn.cursor()

# Get existing NIST controls (%% escapes the literal % for psycopg2).
cur.execute("""
    SELECT control_id, title,
           source_citation->>'source' as source,
           source_citation->>'article' as article,
           source_citation->>'article_type' as art_type,
           release_state
    FROM compliance.canonical_controls
    WHERE source_citation->>'source' LIKE 'NIST%%'
    ORDER BY source_citation->>'source', control_id
""")
nist_controls = cur.fetchall()

# Group by source (row index 2 = source column).
by_source = defaultdict(list)
for ctrl in nist_controls:
    by_source[ctrl[2]].append(ctrl)

print(f"\n Bestehende NIST Controls in DB:")
for src in sorted(by_source.keys()):
    ctrls = by_source[src]
    active = sum(1 for c in ctrls if c[5] not in ('duplicate', 'too_close'))
    with_article = sum(1 for c in ctrls if c[3])
    print(f" {src:40s}: {len(ctrls):4d} total, {active:4d} active, {with_article:4d} mit article")

# For SP 800-53: which control families do we have?
sp853_existing = [c for c in nist_controls if 'SP 800-53' in (c[2] or '')]
existing_families = set()
existing_articles = set()
for ctrl in sp853_existing:
    article = ctrl[3] or ""
    if article:
        # Extract family prefix (e.g., "AC-6" → "AC")
        m = re.match(r'([A-Z]{2})-', article)
        if m:
            existing_families.add(m.group(1))
        existing_articles.add(article)

print(f"\n SP 800-53 in DB:")
print(f" Total: {len(sp853_existing)}")
print(f" Families covered: {len(existing_families)}")
print(f" Unique articles: {len(existing_articles)}")
print(f" Families: {', '.join(sorted(existing_families))}")
|
||||
|
||||
# Compare: which OSCAL controls are NOT in our DB?
# label → (title, statement[:500], guidance[:500]); withdrawn skipped.
oscal_controls = {}
for fam in families:
    for ctrl in fam.get("controls", []):
        props = {p["name"]: p.get("value", "") for p in ctrl.get("props", [])}
        if props.get("status") == "withdrawn":
            continue
        label = props.get("label", ctrl["id"].upper())
        statement = ""
        guidance = ""
        for part in ctrl.get("parts", []):
            if part.get("name") == "statement":
                statement = part.get("prose", "")
                # Also check sub-items (a., b., ... appended space-joined).
                for sub in part.get("parts", []):
                    statement += " " + sub.get("prose", "")
            elif part.get("name") == "guidance":
                guidance = part.get("prose", "")

        oscal_controls[label] = (ctrl["title"], statement[:500], guidance[:500])

        # Enhancements (same extraction, one level down).
        for enh in ctrl.get("controls", []):
            enh_props = {p["name"]: p.get("value", "") for p in enh.get("props", [])}
            if enh_props.get("status") == "withdrawn":
                continue
            enh_label = enh_props.get("label", enh["id"].upper())
            enh_statement = ""
            enh_guidance = ""
            for part in enh.get("parts", []):
                if part.get("name") == "statement":
                    enh_statement = part.get("prose", "")
                    for sub in part.get("parts", []):
                        enh_statement += " " + sub.get("prose", "")
                elif part.get("name") == "guidance":
                    enh_guidance = part.get("prose", "")
            oscal_controls[enh_label] = (enh["title"], enh_statement[:500], enh_guidance[:500])

print(f"\n OSCAL SP 800-53 aktive Controls: {len(oscal_controls)}")

# Find missing: in OSCAL but not in DB.
# NOTE(review): OSCAL labels vs. DB articles are compared verbatim here —
# zero-padding differences would count as missing; confirm against
# oscal_import.py's normalize_label.
missing = []
covered = []
for label in sorted(oscal_controls.keys()):
    if label in existing_articles:
        covered.append(label)
    else:
        missing.append(label)

print(f" In DB vorhanden: {len(covered)}")
print(f" FEHLEND in DB: {len(missing)}")

# Missing by family (prefix before the first dash).
missing_by_fam = defaultdict(list)
for label in missing:
    fam = label.split("-")[0]
    missing_by_fam[fam].append(label)

print(f"\n Fehlende Controls nach Family:")
for fam in sorted(missing_by_fam.keys()):
    ctrls = missing_by_fam[fam]
    examples = ", ".join(ctrls[:5])
    more = f" +{len(ctrls)-5}" if len(ctrls) > 5 else ""
    print(f" {fam:4s}: {len(ctrls):3d} fehlend ({examples}{more})")
|
||||
|
||||
# Also check CSF 2.0: total vs. active control counts in the DB.
print(f"\n{'='*70}")
print("NIST CSF 2.0 — Vergleich mit DB")
print("=" * 70)

cur.execute("""
    SELECT count(*), count(*) FILTER (WHERE release_state NOT IN ('duplicate', 'too_close'))
    FROM compliance.canonical_controls
    WHERE source_citation->>'source' LIKE 'NIST Cybersecurity%%'
""")
csf_row = cur.fetchone()
print(f" CSF Controls in DB: {csf_row[0]} total, {csf_row[1]} active")

# Collect all CSF subcategory labels from the OSCAL catalog.
csf_subcats = 0
csf_ids = []
for grp in csf_groups:
    for cat in grp.get("groups", []):
        for subcat in cat.get("controls", []):
            csf_subcats += 1
            props = {p["name"]: p.get("value", "") for p in subcat.get("props", [])}
            csf_ids.append(props.get("label", subcat["id"]))

print(f" CSF 2.0 OSCAL Subcategories: {csf_subcats}")
print(f" Beispiele: {', '.join(csf_ids[:10])}")

# ── Summary / Potential ──
print(f"\n{'='*70}")
print("POTENTIAL: Was OSCAL uns bringt")
print("=" * 70)
print(f"""
SP 800-53 Rev 5:
- {len(missing)} neue Controls möglich (aktuell {len(covered)} in DB)
- Jeder Control hat: Statement + Guidance + Assessment-Methoden
- Cross-References zwischen Controls (für Mapping)
- Maschinenlesbare Parameter (ODP)
- Public Domain — keine Lizenzprobleme

CSF 2.0:
- {csf_subcats} Subcategories als Compliance-Controls
- 6 Functions (Govern, Identify, Protect, Detect, Respond, Recover)
- Direkte Mappings zu SP 800-53 Controls

Nächste Schritte:
1. Fehlende SP 800-53 Controls importieren ({len(missing)} Controls)
2. Statement-Text als source_original_text verwenden
3. article_type='control', article=Label (z.B. 'AC-6')
4. CSF 2.0 als eigene Regulation importieren
5. Cross-References als Grundlage für Control-Mappings nutzen
""")

conn.close()
|
||||
289
scripts/qa/oscal_import.py
Normal file
289
scripts/qa/oscal_import.py
Normal file
@@ -0,0 +1,289 @@
|
||||
"""Import 776 missing NIST SP 800-53 Rev 5 controls from OSCAL into canonical_controls."""
|
||||
import os
|
||||
import re
|
||||
import json
|
||||
import uuid
|
||||
import psycopg2
|
||||
import urllib.parse
|
||||
|
||||
# Directory holding the downloaded NIST OSCAL catalog JSON files.
OSCAL_DIR = os.path.expanduser("~/rag-ingestion/nist-oscal")

# Load the SP 800-53 Rev 5 catalog; OSCAL JSON roots at the "catalog" key.
with open(os.path.join(OSCAL_DIR, "sp800-53-rev5-catalog.json")) as f:
    sp853 = json.load(f)["catalog"]
|
||||
|
||||
# ── Extract all OSCAL controls ──
|
||||
def extract_controls(catalog):
    """Extract all active controls with full data.

    Walks every family group in the OSCAL catalog and flattens each base
    control followed by its enhancements (nested under the control's own
    "controls" key) into a single list of dicts as produced by
    ``extract_single``. Withdrawn entries yield None and are skipped.
    """
    controls = []
    for fam in catalog.get("groups", []):
        # FIX: the original also computed fam_id = fam.get("id", "").upper()
        # on every family but never used it — removed.
        fam_title = fam.get("title", "")

        for ctrl in fam.get("controls", []):
            # Base control first, then its enhancements, preserving order.
            for node in [ctrl] + ctrl.get("controls", []):
                result = extract_single(node, fam_title)
                if result:
                    controls.append(result)
    return controls
|
||||
|
||||
def extract_single(ctrl, family_title):
    """Flatten one OSCAL control (or enhancement) node into a plain dict.

    Returns None for withdrawn controls. The statement text inlines any
    labelled sub-items on their own lines; sub-sub-items are indented by
    two spaces. Guidance prose, "related" link targets, and ODP parameters
    are carried along verbatim (prose fields stripped).
    """
    props = {p["name"]: p.get("value", "") for p in ctrl.get("props", [])}
    if props.get("status") == "withdrawn":
        return None

    label = props.get("label", ctrl["id"].upper())

    def part_label(node):
        # Last "label" prop on the node, "" when absent.
        found = ""
        for pr in node.get("props", []):
            if pr["name"] == "label":
                found = pr.get("value", "")
        return found

    statement = ""
    guidance = ""
    for part in ctrl.get("parts", []):
        part_name = part.get("name")
        if part_name == "statement":
            statement = part.get("prose", "")
            # Sub-items (a., b., ...), each on its own line.
            for sub in part.get("parts", []):
                sub_prose = sub.get("prose", "")
                sub_lbl = part_label(sub)
                if sub_lbl:
                    statement += f"\n{sub_lbl} {sub_prose}"
                elif sub_prose:
                    statement += f"\n{sub_prose}"
                # Nested sub-sub-items, indented two spaces.
                for subsub in sub.get("parts", []):
                    ss_prose = subsub.get("prose", "")
                    ss_lbl = part_label(subsub)
                    if ss_lbl:
                        statement += f"\n  {ss_lbl} {ss_prose}"
                    elif ss_prose:
                        statement += f"\n  {ss_prose}"
        elif part_name == "guidance":
            guidance = part.get("prose", "")

    # Cross-references: "related" link href fragments without the '#'.
    related = [lnk["href"].lstrip("#")
               for lnk in ctrl.get("links", []) if lnk.get("rel") == "related"]

    # ODP parameters: id / label / concatenated guideline prose / choices.
    params = []
    for param in ctrl.get("params", []):
        guideline_text = "".join(g.get("prose", "") for g in param.get("guidelines", []))
        choices = list(param["select"].get("choice", [])) if "select" in param else []
        params.append({
            "id": param.get("id", ""),
            "label": param.get("label", ""),
            "guidelines": guideline_text,
            "choices": choices,
        })

    return {
        "label": label,
        "title": ctrl.get("title", ""),
        "family": family_title,
        "statement": statement.strip(),
        "guidance": guidance.strip(),
        "related": related,
        "params": params,
        "is_enhancement": "(" in label,  # enhancement labels look like "AC-6(1)"
    }
|
||||
|
||||
# Flatten the catalog once: base controls plus enhancements, withdrawn skipped.
all_oscal = extract_controls(sp853)
print(f"Total OSCAL active controls: {len(all_oscal)}")
|
||||
|
||||
# ── Normalize label for comparison ──
|
||||
# Leading zeros after the dash ("ac-06") and inside enhancement parens
# ("(01)") vary between data sources; strip them before comparing.
_ZERO_AFTER_DASH = re.compile(r'-0+(\d)')
_ZERO_IN_PARENS = re.compile(r'\(0+(\d+)\)')


def normalize_label(label):
    """Canonicalize an SP 800-53 label for set comparison.

    Drops zero padding after the dash and inside the enhancement
    parentheses, then uppercases: "ac-06(01)" -> "AC-6(1)".
    """
    without_dash_pad = _ZERO_AFTER_DASH.sub(r'-\1', label)
    without_paren_pad = _ZERO_IN_PARENS.sub(r'(\1)', without_dash_pad)
    return without_paren_pad.upper()
|
||||
|
||||
# ── DB connection ── (search_path pinned to the compliance schema)
db_url = os.environ['DATABASE_URL']
parsed = urllib.parse.urlparse(db_url)
conn = psycopg2.connect(
    host=parsed.hostname, port=parsed.port or 5432,
    user=parsed.username, password=parsed.password,
    dbname=parsed.path.lstrip('/'),
    options="-c search_path=compliance,public"
)
cur = conn.cursor()

# Get existing labels for SP 800-53 so already-imported controls are skipped.
cur.execute("""
    SELECT DISTINCT source_citation->>'article' as article
    FROM compliance.canonical_controls
    WHERE source_citation->>'source' = 'NIST SP 800-53 Rev. 5'
      AND source_citation->>'article' IS NOT NULL
""")
existing_labels = set(normalize_label(r[0]) for r in cur.fetchall())
print(f"Existing DB labels (normalized): {len(existing_labels)}")

# Get highest control_id numbers per prefix (only simple "ABC-123" ids).
cur.execute("""
    SELECT control_id FROM compliance.canonical_controls
    WHERE control_id ~ '^[A-Z]+-[0-9]+$'
    ORDER BY control_id
""")
existing_ids = set(r[0] for r in cur.fetchall())
|
||||
|
||||
# Find next available ID per prefix
|
||||
def next_control_id(prefix, existing):
    """Return the highest numeric suffix already used for *prefix* (0 if none).

    Despite the name, this returns the current maximum for ids like
    "SEC-1234"; callers increment the counter themselves before allocating.
    """
    pattern = re.compile(rf'^{prefix}-(\d+)$')
    hits = (pattern.match(candidate) for candidate in existing)
    return max((int(hit.group(1)) for hit in hits if hit), default=0)
|
||||
|
||||
# Map NIST families to our control_id prefixes
# Several NIST families deliberately collapse onto one internal prefix
# (e.g. training, planning and risk all land in GOV). Unlisted families
# fall back to "COMP" in the import loop below.
FAMILY_PREFIX = {
    "Access Control": "ACC",
    "Awareness and Training": "GOV",
    "Audit and Accountability": "LOG",
    "Assessment, Authorization, and Monitoring": "GOV",
    "Configuration Management": "COMP",
    "Contingency Planning": "INC",
    "Identification and Authentication": "AUTH",
    "Incident Response": "INC",
    "Maintenance": "COMP",
    "Media Protection": "DATA",
    "Physical and Environmental Protection": "SEC",
    "Planning": "GOV",
    "Program Management": "GOV",
    "Personnel Security": "GOV",
    "Personally Identifiable Information Processing and Transparency": "DATA",
    "Risk Assessment": "GOV",
    "System and Services Acquisition": "COMP",
    "System and Communications Protection": "NET",
    "System and Information Integrity": "SEC",
    "Supply Chain Risk Management": "COMP",
}

# Track next IDs
# Seed each prefix counter with the highest number already present in the DB;
# the import loop pre-increments before building each new control_id.
prefix_counters = {}
for prefix in set(FAMILY_PREFIX.values()):
    prefix_counters[prefix] = next_control_id(prefix, existing_ids)
print(f"Starting counters: {prefix_counters}")
|
||||
|
||||
# ── Filter to only new controls ──
# Keep only OSCAL controls whose normalized label is not already in the DB.
to_import = [ctrl for ctrl in all_oscal
             if normalize_label(ctrl["label"]) not in existing_labels]

print(f"\nControls to import: {len(to_import)}")
|
||||
|
||||
# ── Import ──
# Insert each new control with a freshly allocated PREFIX-NNNN id.
imported = 0
for ctrl in to_import:
    # Unknown families fall back to the generic COMP prefix.
    prefix = FAMILY_PREFIX.get(ctrl["family"], "COMP")
    prefix_counters[prefix] += 1
    control_id = f"{prefix}-{prefix_counters[prefix]:04d}"

    # Build title: "NIST {label}: {title}"
    title = f"NIST {ctrl['label']}: {ctrl['title']}"

    # source_original_text = statement (the official requirement text)
    # Fallback chain: statement -> first 500 chars of guidance -> title.
    source_text = ctrl["statement"]
    if not source_text:
        source_text = ctrl["guidance"][:500] if ctrl["guidance"] else ctrl["title"]

    # objective = guidance text
    objective = ctrl["guidance"][:2000] if ctrl["guidance"] else ""

    # source_citation
    citation = {
        "source": "NIST SP 800-53 Rev. 5",
        "article": ctrl["label"],
        "article_type": "control",
        "source_type": "standard",
        "oscal_import": True,
    }
    # Cap related controls / parameters so the JSONB stays small.
    if ctrl["related"]:
        citation["related_controls"] = ctrl["related"][:20]
    if ctrl["params"]:
        citation["parameters"] = [{"id": p["id"], "label": p["label"]} for p in ctrl["params"][:10]]

    # NOTE(review): loop-invariant constant; could be hoisted above the loop.
    FRAMEWORK_ID = '14b1bdd2-abc7-4a43-adae-14471ee5c7cf'
    new_id = str(uuid.uuid4())
    # All imports land as severity 'medium', release_state 'draft',
    # pipeline_version 4, generation_strategy 'oscal_import'.
    cur.execute("""
        INSERT INTO compliance.canonical_controls
        (id, framework_id, control_id, title, objective, rationale,
        severity, source_original_text,
        source_citation, pipeline_version, release_state,
        generation_strategy, category)
        VALUES (%s, %s, %s, %s, %s, '', 'medium', %s, %s, 4, 'draft', 'oscal_import', %s)
    """, (
        new_id,
        FRAMEWORK_ID,
        control_id,
        title[:500],
        objective[:5000],
        source_text[:10000],
        json.dumps(citation, ensure_ascii=False),
        ctrl["family"],
    ))
    imported += 1

# Single commit for the whole batch.
conn.commit()
print(f"\nImported: {imported} new controls")
|
||||
|
||||
# ── Verify ──
# Post-import sanity counts for SP 800-53 (total vs non-duplicate "active").
cur.execute("""
    SELECT count(*),
    count(*) FILTER (WHERE release_state NOT IN ('duplicate', 'too_close'))
    FROM compliance.canonical_controls
    WHERE source_citation->>'source' = 'NIST SP 800-53 Rev. 5'
""")
total, active = cur.fetchone()
print(f"\nSP 800-53 after import: {total} total, {active} active")

# Release-state distribution over the entire table.
cur.execute("""
    SELECT release_state, count(*)
    FROM compliance.canonical_controls
    GROUP BY release_state
    ORDER BY count(*) DESC
""")
print(f"\nDB release_state gesamt:")
for row in cur.fetchall():
    print(f" {row[0]:15s}: {row[1]:5d}")

# Overall active-control count (excludes dedup states).
cur.execute("""
    SELECT count(*)
    FROM compliance.canonical_controls
    WHERE release_state NOT IN ('duplicate', 'too_close')
""")
print(f"\nAktive Controls gesamt: {cur.fetchone()[0]}")

# ── Import stats by family ──
# In-memory tally of what was just imported, grouped by NIST family.
fam_counts = {}
for ctrl in to_import:
    fam = ctrl["family"]
    fam_counts[fam] = fam_counts.get(fam, 0) + 1

print(f"\nImportiert nach Family:")
for fam in sorted(fam_counts.keys()):
    print(f" {fam[:45]:45s}: {fam_counts[fam]:3d}")

conn.close()
|
||||
274
scripts/qa/owasp_cleanup.py
Normal file
274
scripts/qa/owasp_cleanup.py
Normal file
@@ -0,0 +1,274 @@
|
||||
"""OWASP Cleanup:
|
||||
1. Mark 324 OWASP Top 10 multilingual controls as 'duplicate'
|
||||
2. Fix 47 wrong source attributions (found in different OWASP PDF)
|
||||
"""
|
||||
import os
|
||||
import re
|
||||
import json
|
||||
import unicodedata
|
||||
import psycopg2
|
||||
import urllib.parse
|
||||
|
||||
# PyMuPDF is required for PDF text extraction; fail fast with a clear message.
try:
    import fitz
except ImportError:
    print("ERROR: PyMuPDF not installed")
    # raise SystemExit is the robust form of exit(): the exit() helper comes
    # from the site module and is not guaranteed outside interactive use.
    raise SystemExit(1)

# Directory holding the downloaded OWASP PDFs.
PDF_DIR = os.path.expanduser("~/rag-ingestion/pdfs")
|
||||
|
||||
def normalize(s):
    """Normalize extracted PDF text for substring matching.

    Removes soft hyphens and zero-width characters, expands ligatures,
    straightens smart quotes/dashes/bullets, strips control characters,
    applies Unicode NFC, and collapses all whitespace to single spaces.
    """
    char_fixups = [
        ('\u00ad', ''), ('\xad', ''),
        ('\u200b', ''), ('\u00a0', ' '),
        ('\ufb01', 'fi'), ('\ufb02', 'fl'),
        ('\ufb00', 'ff'), ('\ufb03', 'ffi'), ('\ufb04', 'ffl'),
        ('\u2019', "'"), ('\u2018', "'"),
        ('\u201c', '"'), ('\u201d', '"'),
        ('\u2013', '-'), ('\u2014', '-'),
        ('\u2022', '-'), ('\u00b7', '-'),
    ]
    for old, new in char_fixups:
        s = s.replace(old, new)
    s = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', s)
    s = unicodedata.normalize('NFC', s)
    return re.sub(r'\s+', ' ', s).strip()
|
||||
|
||||
# Load OWASP PDFs
# Source name (as stored in source_citation->>'source') -> local PDF filename.
OWASP_PDFS = {
    "OWASP Top 10 (2021)": "owasp_top10_2021.pdf",
    "OWASP ASVS 4.0": "owasp_asvs_4_0.pdf",
    "OWASP SAMM 2.0": "owasp_samm_2_0.pdf",
    "OWASP API Security Top 10 (2023)": "owasp_api_top10_2023.pdf",
    "OWASP MASVS 2.0": "owasp_masvs_2_0.pdf",
}

# Extract and normalize the full text of every PDF that is present on disk;
# missing files are silently skipped (so matching degrades gracefully).
pdf_norms = {}
for name, filename in OWASP_PDFS.items():
    path = os.path.join(PDF_DIR, filename)
    if not os.path.exists(path):
        continue
    doc = fitz.open(path)
    text = ""
    for page in doc:
        text += page.get_text() + "\n"
    doc.close()
    pdf_norms[name] = normalize(text)
|
||||
|
||||
def build_owasp_index(text_norm, source_name):
    """Build a position index of OWASP identifiers in normalized PDF text.

    Returns [(position, label, type)] sorted by position, keeping only the
    first occurrence of each label. The identifier pattern is chosen from
    *source_name*; unknown sources yield an empty index.
    """
    # Pick the identifier pattern for this source.
    # BUGFIX: "MASVS" must be tested before "ASVS" — "ASVS" is a substring
    # of "MASVS", so the old ordering sent MASVS sources down the ASVS branch.
    if "Top 10" in source_name and "API" not in source_name:
        pattern, kind = r'(A\d{2}:\d{4})', "category"
    elif "API" in source_name:
        pattern, kind = r'(API\d+:\d{4})', "category"
    elif "MASVS" in source_name:
        pattern, kind = r'(MASVS-[A-Z]+-\d+)', "requirement"
    elif "ASVS" in source_name:
        pattern, kind = r'(V\d+\.\d+(?:\.\d+)?)\b', "requirement"
    else:
        return []

    items = [(m.start(), m.group(1), kind) for m in re.finditer(pattern, text_norm)]
    items.sort(key=lambda x: x[0])

    # First occurrence of each label wins.
    seen = set()
    unique = []
    for pos, label, typ in items:
        if label not in seen:
            seen.add(label)
            unique.append((pos, label, typ))
    return unique
|
||||
|
||||
# Pre-build a label index for every loaded PDF, keyed by source name.
pdf_indexes = {name: build_owasp_index(norm, name) for name, norm in pdf_norms.items()}
|
||||
|
||||
def find_in_pdf(orig_text, source_name):
    """Find control text in a specific PDF. Returns (label, type) or None.

    Probes several snippets of the normalized control text (different start
    offsets and lengths) against the normalized PDF text; on a hit, reports
    the nearest preceding identifier from that PDF's index.
    """
    haystack = pdf_norms.get(source_name)
    if not haystack:
        return None
    needle = normalize(orig_text)
    if len(needle) < 20:
        return None
    index = pdf_indexes.get(source_name, [])

    def label_before(pos):
        # Closest index entry at or before the match position.
        for h_pos, h_label, h_type in reversed(index):
            if h_pos <= pos:
                return (h_label, h_type)
        return ("Unknown", "unknown")

    for frac in [0.25, 0.1, 0.5, 0.0, 0.75]:
        begin = max(0, int(len(needle) * frac))
        for size in [80, 60, 40, 30, 20]:
            probe = needle[begin:begin + size]
            if len(probe) < 15:
                continue
            hit = haystack.find(probe)
            if hit >= 0:
                return label_before(hit)
    return None
|
||||
|
||||
# DB
# Connect via DATABASE_URL; search_path pins unqualified names to the
# compliance schema.
db_url = os.environ['DATABASE_URL']
parsed = urllib.parse.urlparse(db_url)
conn = psycopg2.connect(
    host=parsed.hostname, port=parsed.port or 5432,
    user=parsed.username, password=parsed.password,
    dbname=parsed.path.lstrip('/'),
    options="-c search_path=compliance,public"
)
cur = conn.cursor()
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════
# STEP 1: Mark OWASP Top 10 multilingual controls as duplicate
# ═══════════════════════════════════════════════════════════════
print("=" * 60)
print("STEP 1: OWASP Top 10 — multilingual controls → duplicate")
print("=" * 60)

# Active Top 10 controls that have no article_type yet (i.e. were never
# matched to the English PDF during earlier QA passes).
cur.execute("""
    SELECT id, control_id, title, source_original_text, release_state
    FROM compliance.canonical_controls
    WHERE source_citation->>'source' = 'OWASP Top 10 (2021)'
    AND source_citation->>'article_type' IS NULL
    AND source_original_text IS NOT NULL
    AND release_state NOT IN ('duplicate', 'too_close')
    ORDER BY control_id
""")
top10_unmatched = cur.fetchall()
print(f" Unmatched active OWASP Top 10: {len(top10_unmatched)}")

# Separate: found in other OWASP PDF vs not found anywhere
to_mark_dup = []
to_fix_source = []

for ctrl in top10_unmatched:
    uid, cid, title, text, state = ctrl

    # Check if found in another OWASP PDF (first hit wins).
    found_in = None
    found_result = None
    for other_src in OWASP_PDFS:
        if other_src == 'OWASP Top 10 (2021)':
            continue
        result = find_in_pdf(text, other_src)
        if result:
            found_in = other_src
            found_result = result
            break

    if found_in:
        to_fix_source.append((uid, cid, found_in, found_result[0], found_result[1]))
    else:
        # Not in any English PDF — treated as a multilingual duplicate.
        to_mark_dup.append((uid, cid))

print(f" → Not found in any PDF (multilingual): {len(to_mark_dup)} → mark as duplicate")
print(f" → Found in other OWASP PDF: {len(to_fix_source)} → fix source attribution")

# Mark as duplicate
# The release_state guard in the WHERE makes this idempotent on re-runs.
dup_marked = 0
for uid, cid in to_mark_dup:
    cur.execute("""
        UPDATE compliance.canonical_controls
        SET release_state = 'duplicate'
        WHERE id = %s AND release_state NOT IN ('duplicate', 'too_close')
    """, (uid,))
    if cur.rowcount > 0:
        dup_marked += 1

print(f" Marked as duplicate: {dup_marked}")
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════
# STEP 2: Fix wrong source attributions across ALL OWASP sources
# ═══════════════════════════════════════════════════════════════
print(f"\n{'='*60}")
print("STEP 2: Fix wrong OWASP source attributions")
print("=" * 60)

all_fixes = list(to_fix_source)  # Start with Top 10 fixes

# Also check ASVS, SAMM, MASVS
for source in ['OWASP ASVS 4.0', 'OWASP SAMM 2.0', 'OWASP API Security Top 10 (2023)', 'OWASP MASVS 2.0']:
    cur.execute("""
        SELECT id, control_id, title, source_original_text
        FROM compliance.canonical_controls
        WHERE source_citation->>'source' = %s
        AND source_citation->>'article_type' IS NULL
        AND source_original_text IS NOT NULL
        AND release_state NOT IN ('duplicate', 'too_close')
    """, (source,))
    controls = cur.fetchall()

    for ctrl in controls:
        uid, cid, title, text = ctrl
        # Try own PDF first
        result = find_in_pdf(text, source)
        if result:
            # Found in own PDF: merge article/article_type into the citation.
            # The IS DISTINCT FROM guard avoids no-op row rewrites.
            # NOTE(review): these own-PDF fixes are applied immediately and
            # are not counted in the `fixed` total printed below.
            cur.execute("""
                UPDATE compliance.canonical_controls
                SET source_citation = source_citation ||
                jsonb_build_object('article', %s, 'article_type', %s)
                WHERE id = %s
                AND (source_citation->>'article' IS DISTINCT FROM %s
                OR source_citation->>'article_type' IS DISTINCT FROM %s)
            """, (result[0], result[1], uid, result[0], result[1]))
            continue

        # Try other OWASP PDFs; first PDF that contains the text wins.
        for other_src in OWASP_PDFS:
            if other_src == source:
                continue
            result = find_in_pdf(text, other_src)
            if result:
                all_fixes.append((uid, cid, other_src, result[0], result[1]))
                break

print(f" Total wrong-source controls found: {len(all_fixes)}")

# Apply source fixes
# Cross-matched controls get source, article AND article_type rewritten.
fixed = 0
for uid, cid, correct_source, label, typ in all_fixes:
    cur.execute("""
        UPDATE compliance.canonical_controls
        SET source_citation = source_citation ||
        jsonb_build_object('source', %s, 'article', %s, 'article_type', %s)
        WHERE id = %s
    """, (correct_source, label, typ, uid,))
    if cur.rowcount > 0:
        fixed += 1
        print(f" {cid:10s} → {correct_source} / {label} [{typ}]")

print(f" Fixed: {fixed} controls")

# Single commit covers both Step 1 and Step 2 changes.
conn.commit()
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════
# SUMMARY
# ═══════════════════════════════════════════════════════════════
print(f"\n{'='*60}")
print("ZUSAMMENFASSUNG")
print("=" * 60)
print(f" OWASP Top 10 multilingual → duplicate: {dup_marked}")
print(f" Wrong source attribution → fixed: {fixed}")

# Final counts
# Release-state distribution across the whole table after the cleanup.
cur.execute("""
    SELECT release_state, count(*)
    FROM compliance.canonical_controls
    GROUP BY release_state
    ORDER BY count(*) DESC
""")
print(f"\n DB release_state nach Cleanup:")
for row in cur.fetchall():
    print(f" {row[0]:15s}: {row[1]:5d}")

# Remaining active controls (everything not flagged by dedup).
cur.execute("""
    SELECT count(*)
    FROM compliance.canonical_controls
    WHERE release_state NOT IN ('duplicate', 'too_close')
""")
active = cur.fetchone()[0]
print(f"\n Aktive Controls: {active}")

conn.close()
|
||||
316
scripts/qa/owasp_github_match.py
Normal file
316
scripts/qa/owasp_github_match.py
Normal file
@@ -0,0 +1,316 @@
|
||||
"""Match unmatched OWASP ASVS/SAMM/MASVS controls against GitHub Markdown sources."""
|
||||
import os
|
||||
import re
|
||||
import unicodedata
|
||||
import psycopg2
|
||||
import urllib.parse
|
||||
from pathlib import Path
|
||||
|
||||
GITHUB_DIR = Path(os.path.expanduser("~/rag-ingestion/owasp-github"))
|
||||
|
||||
def normalize(s):
    """Normalize text for substring matching against GitHub sources.

    Same pipeline as the PDF variant: drop soft hyphens / zero-width chars,
    expand ligatures, straighten quotes/dashes/bullets, strip control
    characters, NFC-normalize, and collapse whitespace.
    """
    table = [
        ('\u00ad', ''), ('\xad', ''),
        ('\u200b', ''), ('\u00a0', ' '),
        ('\ufb01', 'fi'), ('\ufb02', 'fl'),
        ('\ufb00', 'ff'), ('\ufb03', 'ffi'), ('\ufb04', 'ffl'),
        ('\u2019', "'"), ('\u2018', "'"),
        ('\u201c', '"'), ('\u201d', '"'),
        ('\u2013', '-'), ('\u2014', '-'),
        ('\u2022', '-'), ('\u00b7', '-'),
    ]
    for bad, good in table:
        s = s.replace(bad, good)
    s = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', s)
    s = unicodedata.normalize('NFC', s)
    s = re.sub(r'\s+', ' ', s)
    return s.strip()
|
||||
|
||||
# ── Load Markdown sources ──
|
||||
def load_markdown_dir(path, pattern="*.md"):
    """Load all files matching *pattern* directly under *path*.

    Returns a dict {filename: text}. Decoding errors are replaced rather
    than raised; files that cannot be read at all are skipped (best-effort
    load — a single unreadable file must not abort the whole ingestion).
    """
    texts = {}
    for f in sorted(path.glob(pattern)):
        try:
            texts[f.name] = f.read_text(encoding='utf-8', errors='replace')
        except OSError:
            # Narrowed from a bare except: only skip genuine I/O failures,
            # never hide programming errors.
            pass
    return texts
|
||||
|
||||
# ASVS 4.0 — V-files contain requirements
asvs_dir = GITHUB_DIR / "ASVS" / "4.0" / "en"
asvs_files = load_markdown_dir(asvs_dir)
asvs_full = "\n".join(asvs_files.values())
asvs_norm = normalize(asvs_full)
print(f"ASVS 4.0 Markdown: {len(asvs_files)} files, {len(asvs_full):,} chars")

# SAMM core — YAML + Markdown
# NOTE(review): the bare excepts below swallow ALL errors, not just I/O —
# consider narrowing to OSError.
samm_dir = GITHUB_DIR / "samm-core"
samm_texts = {}
for f in samm_dir.rglob("*.yml"):
    try:
        samm_texts[str(f.relative_to(samm_dir))] = f.read_text(encoding='utf-8', errors='replace')
    except:
        pass
for f in samm_dir.rglob("*.md"):
    try:
        samm_texts[str(f.relative_to(samm_dir))] = f.read_text(encoding='utf-8', errors='replace')
    except:
        pass
samm_full = "\n".join(samm_texts.values())
samm_norm = normalize(samm_full)
print(f"SAMM 2.0 source: {len(samm_texts)} files, {len(samm_full):,} chars")

# MASVS — control markdown files
masvs_dir = GITHUB_DIR / "masvs"
masvs_files = {}
for f in masvs_dir.rglob("*.md"):
    try:
        masvs_files[str(f.relative_to(masvs_dir))] = f.read_text(encoding='utf-8', errors='replace')
    except:
        pass
masvs_full = "\n".join(masvs_files.values())
masvs_norm = normalize(masvs_full)
print(f"MASVS 2.0 source: {len(masvs_files)} files, {len(masvs_full):,} chars")

# API Security
api_dir = GITHUB_DIR / "api-security"
api_files = {}
for f in api_dir.rglob("*.md"):
    try:
        api_files[str(f.relative_to(api_dir))] = f.read_text(encoding='utf-8', errors='replace')
    except:
        pass
api_full = "\n".join(api_files.values())
api_norm = normalize(api_full)
print(f"API Security source: {len(api_files)} files, {len(api_full):,} chars")

# Source name → normalized full text of that GitHub source.
# (Index builders are kept separately in SOURCE_INDEX_BUILDERS below.)
SOURCE_GITHUB = {
    "OWASP ASVS 4.0": asvs_norm,
    "OWASP SAMM 2.0": samm_norm,
    "OWASP MASVS 2.0": masvs_norm,
    "OWASP API Security Top 10 (2023)": api_norm,
}
|
||||
|
||||
# Build indexes for each source
|
||||
def build_asvs_index(text):
    """Index ASVS requirement ids (e.g. "V1.2.3") in *text*.

    Returns [(position, label, "requirement")] sorted by position, keeping
    only the first occurrence of each label.
    """
    items = [(m.start(), m.group(1), "requirement")
             for m in re.finditer(r'(V\d+\.\d+(?:\.\d+)?)\b', text)]
    items.sort(key=lambda x: x[0])
    # Explicit dedup loop — the old one-liner relied on the side effect of
    # seen.add() inside a comprehension condition, which is hard to read.
    seen = set()
    unique = []
    for pos, label, typ in items:
        if label not in seen:
            seen.add(label)
            unique.append((pos, label, typ))
    return unique
|
||||
|
||||
def build_samm_index(text):
    """Index SAMM section numbers and practice names in *text*.

    Two heuristics: numbered section headings ("1.2 Title") and well-known
    practice names with up to 30 trailing chars of context. Returns
    [(position, label, "section")] sorted by position, first occurrence of
    each label only.
    """
    items = []
    # SAMM practices have names like "Strategy & Metrics", sections numbered
    for m in re.finditer(r'(?:^|\s)(\d+\.\d+(?:\.\d+)?)\s+[A-Z]', text):
        items.append((m.start(), f"Section {m.group(1)}", "section"))
    # Also find practice identifiers
    for m in re.finditer(r'((?:Strategy|Education|Policy|Threat|Security Requirements|Secure Architecture|'
                         r'Secure Build|Secure Deployment|Defect Management|Environment Management|'
                         r'Incident Management|Requirements Testing|Security Testing|'
                         r'Design Review|Implementation Review|Operations Management)'
                         r'[^.\n]{0,30})', text):
        items.append((m.start(), m.group(1)[:50], "section"))
    items.sort(key=lambda x: x[0])
    # Explicit dedup loop instead of a side-effecting comprehension.
    seen = set()
    unique = []
    for pos, label, typ in items:
        if label not in seen:
            seen.add(label)
            unique.append((pos, label, typ))
    return unique
|
||||
|
||||
def build_masvs_index(text):
    """Index MASVS requirement ids (e.g. "MASVS-AUTH-1") in *text*.

    Returns [(position, label, "requirement")] sorted by position, keeping
    only the first occurrence of each label.
    """
    items = [(m.start(), m.group(1), "requirement")
             for m in re.finditer(r'(MASVS-[A-Z]+-\d+)', text)]
    items.sort(key=lambda x: x[0])
    # Explicit dedup loop instead of a side-effecting comprehension.
    seen = set()
    unique = []
    for pos, label, typ in items:
        if label not in seen:
            seen.add(label)
            unique.append((pos, label, typ))
    return unique
|
||||
|
||||
def build_api_index(text):
    """Index API Security Top 10 ids (e.g. "API1:2023") in *text*.

    Returns [(position, label, "category")] sorted by position, keeping
    only the first occurrence of each label.
    """
    items = [(m.start(), m.group(1), "category")
             for m in re.finditer(r'(API\d+:\d{4})', text)]
    items.sort(key=lambda x: x[0])
    # Explicit dedup loop instead of a side-effecting comprehension.
    seen = set()
    unique = []
    for pos, label, typ in items:
        if label not in seen:
            seen.add(label)
            unique.append((pos, label, typ))
    return unique
|
||||
|
||||
# Source name → the index builder for that source's identifier scheme.
SOURCE_INDEX_BUILDERS = {
    "OWASP ASVS 4.0": build_asvs_index,
    "OWASP SAMM 2.0": build_samm_index,
    "OWASP MASVS 2.0": build_masvs_index,
    "OWASP API Security Top 10 (2023)": build_api_index,
}

# Build all indexes on normalized text
# source_indexes[name] holds [(position, label, type)] for each source.
source_indexes = {}
for name, norm_text in SOURCE_GITHUB.items():
    builder = SOURCE_INDEX_BUILDERS[name]
    idx = builder(norm_text)
    source_indexes[name] = idx
    print(f" {name}: {len(idx)} index entries")
|
||||
|
||||
def find_text(orig_text, source_name):
    """Locate control text in a GitHub source. Returns (label, type) or None.

    Probes snippets of the normalized control text (varying start offsets and
    lengths) against the source's normalized full text; on a hit, reports the
    nearest preceding identifier from that source's index.
    """
    haystack = SOURCE_GITHUB.get(source_name)
    if not haystack:
        return None
    index = source_indexes.get(source_name, [])
    needle = normalize(orig_text)
    if len(needle) < 20:
        return None

    def nearest_label(pos):
        # Last index entry at or before pos, else the Unknown sentinel.
        for h_pos, h_label, h_type in reversed(index):
            if h_pos <= pos:
                return (h_label, h_type)
        return ("Unknown", "unknown")

    for frac in [0.25, 0.1, 0.5, 0.0, 0.75]:
        begin = max(0, int(len(needle) * frac))
        for size in [80, 60, 40, 30, 20]:
            probe = needle[begin:begin + size]
            if len(probe) < 15:
                continue
            hit = haystack.find(probe)
            if hit >= 0:
                return nearest_label(hit)
    return None
|
||||
|
||||
def find_in_any_github(orig_text, exclude_source=None):
    """Try all GitHub sources; return (source, label, type) or None."""
    candidates = (name for name in SOURCE_GITHUB if name != exclude_source)
    for name in candidates:
        hit = find_text(orig_text, name)
        if hit:
            return (name, hit[0], hit[1])
    return None
|
||||
|
||||
# ── DB ──
# Connect via DATABASE_URL; search_path pins unqualified names to the
# compliance schema.
db_url = os.environ['DATABASE_URL']
parsed = urllib.parse.urlparse(db_url)
conn = psycopg2.connect(
    host=parsed.hostname, port=parsed.port or 5432,
    user=parsed.username, password=parsed.password,
    dbname=parsed.path.lstrip('/'),
    options="-c search_path=compliance,public"
)
cur = conn.cursor()
|
||||
|
||||
# ── Process each OWASP source ──
total_matched = 0
total_cross = 0
total_not_found = 0
all_updates = []  # (uuid, control_id, source, label, article_type) tuples

for source in ['OWASP ASVS 4.0', 'OWASP SAMM 2.0', 'OWASP MASVS 2.0', 'OWASP API Security Top 10 (2023)']:
    # Active controls of this source that earlier passes never matched
    # (article_type still NULL).
    cur.execute("""
        SELECT id, control_id, title, source_original_text, release_state
        FROM compliance.canonical_controls
        WHERE source_citation->>'source' = %s
        AND source_citation->>'article_type' IS NULL
        AND source_original_text IS NOT NULL
        AND release_state NOT IN ('duplicate', 'too_close')
        ORDER BY control_id
    """, (source,))
    controls = cur.fetchall()

    if not controls:
        continue

    print(f"\n{'='*60}")
    print(f"{source} — {len(controls)} unmatched active")
    print(f"{'='*60}")

    matched = 0
    cross_matched = 0
    not_found = 0

    for ctrl in controls:
        uid, cid, title, text, state = ctrl

        # Try own GitHub source
        result = find_text(text, source)
        if result:
            matched += 1
            total_matched += 1
            all_updates.append((uid, cid, source, result[0], result[1]))
            print(f" {cid:10s} → {result[0]:30s} [{result[1]}]")
            continue

        # Try other GitHub sources
        cross = find_in_any_github(text, exclude_source=source)
        if cross:
            cross_matched += 1
            total_cross += 1
            all_updates.append((uid, cid, cross[0], cross[1], cross[2]))
            print(f" {cid:10s} → [{cross[0]}] {cross[1]:20s} [{cross[2]}] (CROSS)")
            continue

        not_found += 1
        total_not_found += 1

    print(f"\n Own source matched: {matched}")
    print(f" Cross-source: {cross_matched}")
    print(f" Not found: {not_found}")
|
||||
|
||||
# ── Also try OWASP Top 10 remaining unmatched (34 active left after dup marking) ──
cur.execute("""
    SELECT id, control_id, title, source_original_text, release_state
    FROM compliance.canonical_controls
    WHERE source_citation->>'source' = 'OWASP Top 10 (2021)'
    AND source_citation->>'article_type' IS NULL
    AND source_original_text IS NOT NULL
    AND release_state NOT IN ('duplicate', 'too_close')
    ORDER BY control_id
""")
top10_remaining = cur.fetchall()
if top10_remaining:
    print(f"\n{'='*60}")
    print(f"OWASP Top 10 (2021) — {len(top10_remaining)} remaining unmatched active")
    print(f"{'='*60}")
    # Top 10 has no GitHub source of its own, so every hit is cross-source.
    for ctrl in top10_remaining:
        uid, cid, title, text, state = ctrl
        cross = find_in_any_github(text)
        if cross:
            total_cross += 1
            all_updates.append((uid, cid, cross[0], cross[1], cross[2]))
            print(f" {cid:10s} → [{cross[0]}] {cross[1]:20s} [{cross[2]}]")
        else:
            total_not_found += 1
|
||||
# ── Summary ──
print(f"\n{'='*60}")
print(f"ZUSAMMENFASSUNG")
print(f"{'='*60}")
print(f" Matched in eigener GitHub-Quelle: {total_matched}")
print(f" Cross-source matched: {total_cross}")
print(f" Nicht gefunden: {total_not_found}")
print(f" Total Updates: {len(all_updates)}")

# ── Apply updates ──
if all_updates:
    print(f"\nApplying {len(all_updates)} updates to DB...")
    applied = 0
    for uid, cid, correct_source, label, typ in all_updates:
        # NOTE(review): each tuple carries correct_source, but this UPDATE
        # only writes article/article_type — the 'source' key is never
        # changed here, so cross-matches keep their original attribution.
        # The IS DISTINCT FROM guard skips rows that already carry the
        # same values (idempotent on re-runs).
        cur.execute("""
            UPDATE compliance.canonical_controls
            SET source_citation = source_citation ||
            jsonb_build_object('article', %s, 'article_type', %s)
            WHERE id = %s
            AND (source_citation->>'article' IS DISTINCT FROM %s
            OR source_citation->>'article_type' IS DISTINCT FROM %s)
        """, (label, typ, uid, label, typ))
        if cur.rowcount > 0:
            applied += 1

    conn.commit()
    print(f" Applied: {applied} controls updated")

    # Type distribution
    # Tally article_type over the proposed updates (not only the applied ones).
    type_counts = {}
    for _, _, _, _, typ in all_updates:
        type_counts[typ] = type_counts.get(typ, 0) + 1
    print(f"\n Article type distribution:")
    for t, c in sorted(type_counts.items(), key=lambda x: -x[1]):
        print(f" {t:12s}: {c:5d}")

conn.close()
|
||||
357
scripts/qa/phase5_normalize_and_cleanup.py
Normal file
357
scripts/qa/phase5_normalize_and_cleanup.py
Normal file
@@ -0,0 +1,357 @@
|
||||
"""Phase 5: Source Normalization + Duplicate Hard Delete.
|
||||
|
||||
Steps:
|
||||
1. OSCAL controls: add source_regulation to generation_metadata
|
||||
2. Fix 20 v3 controls with NULL source (tag as manually_reviewed)
|
||||
3. Fix empty-string source (DATA-631 → Telekommunikationsgesetz Oesterreich)
|
||||
4. Fix OWASP cross-source misattributions (regulation_code vs actual source)
|
||||
5. Hard delete duplicate/too_close controls (3,301 controls, 0 FK refs)
|
||||
6. Clean up canonical_processed_chunks generated_control_ids
|
||||
|
||||
Usage:
|
||||
export DATABASE_URL='postgresql://...'
|
||||
python3 scripts/qa/phase5_normalize_and_cleanup.py [--dry-run] [--step N]
|
||||
"""
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import psycopg2
|
||||
import urllib.parse
|
||||
|
||||
# --dry-run: report what would change without touching the DB.
DRY_RUN = "--dry-run" in sys.argv
# --step N: run only the numbered step; None means run all steps.
STEP_ONLY = None
for arg in sys.argv:
    if arg.startswith("--step"):
        idx = sys.argv.index(arg)
        if idx + 1 < len(sys.argv):
            # NOTE(review): fragile — int() raises if the next argv entry is
            # not a number (e.g. "--step --dry-run"), and the "--step=N" form
            # is matched by startswith but parses the WRONG argv entry.
            STEP_ONLY = int(sys.argv[idx + 1])

# DATABASE_URL is required; search_path pins unqualified names to the
# compliance schema.
db_url = os.environ['DATABASE_URL']
parsed = urllib.parse.urlparse(db_url)
conn = psycopg2.connect(
    host=parsed.hostname, port=parsed.port or 5432,
    user=parsed.username, password=parsed.password,
    dbname=parsed.path.lstrip('/'),
    options="-c search_path=compliance,public"
)
cur = conn.cursor()
|
||||
|
||||
def should_run(step):
|
||||
return STEP_ONLY is None or STEP_ONLY == step
|
||||
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
# Step 1: OSCAL controls — add source_regulation to generation_metadata
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
if should_run(1):
|
||||
print("=" * 70)
|
||||
print("STEP 1: OSCAL controls — source_regulation in generation_metadata")
|
||||
print("=" * 70)
|
||||
|
||||
cur.execute("""
|
||||
SELECT count(*)
|
||||
FROM compliance.canonical_controls
|
||||
WHERE generation_strategy = 'oscal_import'
|
||||
AND (generation_metadata->>'source_regulation' IS NULL
|
||||
OR generation_metadata->>'source_regulation' = '')
|
||||
""")
|
||||
count = cur.fetchone()[0]
|
||||
print(f" OSCAL controls without source_regulation: {count}")
|
||||
|
||||
if count > 0:
|
||||
if DRY_RUN:
|
||||
print(f" [DRY RUN] Would update {count} controls")
|
||||
else:
|
||||
cur.execute("""
|
||||
UPDATE compliance.canonical_controls
|
||||
SET generation_metadata = COALESCE(generation_metadata, '{}'::jsonb)
|
||||
|| '{"source_regulation": "nist_sp800_53r5"}'::jsonb
|
||||
WHERE generation_strategy = 'oscal_import'
|
||||
AND (generation_metadata->>'source_regulation' IS NULL
|
||||
OR generation_metadata->>'source_regulation' = '')
|
||||
""")
|
||||
print(f" Updated: {cur.rowcount}")
|
||||
print()
|
||||
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
# Step 2: v3 controls with NULL source — tag source as best guess
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
if should_run(2):
|
||||
print("=" * 70)
|
||||
print("STEP 2: Fix v3 controls with NULL source")
|
||||
print("=" * 70)
|
||||
|
||||
# These 20 controls are v3/document_grouped with no source or regulation.
|
||||
# Based on title analysis, they cover:
|
||||
# - Data protection/privacy topics (DSGVO-adjacent)
|
||||
# - Software security (OWASP/NIST-adjacent)
|
||||
# - Mobile security (OWASP MASVS-adjacent)
|
||||
# Mark them as 'needs_review' and add a flag.
|
||||
cur.execute("""
|
||||
SELECT id, control_id, title
|
||||
FROM compliance.canonical_controls
|
||||
WHERE source_citation->>'source' IS NULL
|
||||
AND pipeline_version = 3
|
||||
AND release_state NOT IN ('duplicate', 'too_close')
|
||||
""")
|
||||
v3_null = cur.fetchall()
|
||||
print(f" v3 controls with NULL source: {len(v3_null)}")
|
||||
|
||||
if v3_null:
|
||||
if DRY_RUN:
|
||||
print(f" [DRY RUN] Would mark {len(v3_null)} as needs_review")
|
||||
else:
|
||||
for ctrl_id_uuid, control_id, title in v3_null:
|
||||
cur.execute("""
|
||||
UPDATE compliance.canonical_controls
|
||||
SET release_state = 'needs_review',
|
||||
generation_metadata = COALESCE(generation_metadata, '{}'::jsonb)
|
||||
|| '{"missing_source": true}'::jsonb
|
||||
WHERE id = %s
|
||||
""", (ctrl_id_uuid,))
|
||||
print(f" Marked {len(v3_null)} as needs_review with missing_source flag")
|
||||
print()
|
||||
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
# Step 3: Fix empty-string source (DATA-631)
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
if should_run(3):
|
||||
print("=" * 70)
|
||||
print("STEP 3: Fix empty-string source")
|
||||
print("=" * 70)
|
||||
|
||||
cur.execute("""
|
||||
SELECT id, control_id, title,
|
||||
generation_metadata->>'source_regulation' as reg
|
||||
FROM compliance.canonical_controls
|
||||
WHERE source_citation->>'source' = ''
|
||||
AND release_state NOT IN ('duplicate', 'too_close')
|
||||
""")
|
||||
empty_src = cur.fetchall()
|
||||
print(f" Controls with empty source: {len(empty_src)}")
|
||||
|
||||
for ctrl_id_uuid, control_id, title, reg in empty_src:
|
||||
print(f" {control_id} | reg={reg} | {title[:60]}")
|
||||
if reg == 'at_tkg':
|
||||
new_source = 'Telekommunikationsgesetz Oesterreich'
|
||||
else:
|
||||
new_source = f"Unbekannt ({reg})"
|
||||
|
||||
if DRY_RUN:
|
||||
print(f" [DRY RUN] Would set source='{new_source}'")
|
||||
else:
|
||||
cur.execute("""
|
||||
UPDATE compliance.canonical_controls
|
||||
SET source_citation = jsonb_set(
|
||||
source_citation, '{source}', %s::jsonb
|
||||
)
|
||||
WHERE id = %s
|
||||
""", (json.dumps(new_source), ctrl_id_uuid))
|
||||
print(f" Set source='{new_source}'")
|
||||
print()
|
||||
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
# Step 4: Fix OWASP cross-source misattributions
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
if should_run(4):
|
||||
print("=" * 70)
|
||||
print("STEP 4: Fix OWASP cross-source misattributions")
|
||||
print("=" * 70)
|
||||
|
||||
# Controls where source_citation.source doesn't match the regulation_code
|
||||
OWASP_REG_TO_SOURCE = {
|
||||
'owasp_top10_2021': 'OWASP Top 10 (2021)',
|
||||
'owasp_asvs': 'OWASP ASVS 4.0',
|
||||
'owasp_masvs': 'OWASP MASVS 2.0',
|
||||
'owasp_samm': 'OWASP SAMM 2.0',
|
||||
'owasp_api_top10_2023': 'OWASP API Security Top 10 (2023)',
|
||||
}
|
||||
|
||||
# Strategy: Move controls to the regulation_code that matches their actual source
|
||||
# i.e., if a control has source='OWASP ASVS 4.0' but reg='owasp_top10_2021',
|
||||
# update the reg to 'owasp_asvs'
|
||||
SOURCE_TO_REG = {v: k for k, v in OWASP_REG_TO_SOURCE.items()}
|
||||
|
||||
total_fixed = 0
|
||||
for reg_code, expected_source in OWASP_REG_TO_SOURCE.items():
|
||||
cur.execute("""
|
||||
SELECT id, control_id, source_citation->>'source' as src
|
||||
FROM compliance.canonical_controls
|
||||
WHERE generation_metadata->>'source_regulation' = %s
|
||||
AND source_citation->>'source' <> %s
|
||||
AND release_state NOT IN ('duplicate', 'too_close')
|
||||
""", (reg_code, expected_source))
|
||||
mismatches = cur.fetchall()
|
||||
|
||||
if mismatches:
|
||||
print(f"\n {reg_code} → {len(mismatches)} Mismatches:")
|
||||
for ctrl_id_uuid, control_id, actual_source in mismatches:
|
||||
correct_reg = SOURCE_TO_REG.get(actual_source)
|
||||
if correct_reg:
|
||||
print(f" {control_id} | {actual_source} → reg={correct_reg}")
|
||||
if not DRY_RUN:
|
||||
cur.execute("""
|
||||
UPDATE compliance.canonical_controls
|
||||
SET generation_metadata = jsonb_set(
|
||||
generation_metadata, '{source_regulation}', %s::jsonb
|
||||
)
|
||||
WHERE id = %s
|
||||
""", (json.dumps(correct_reg), ctrl_id_uuid))
|
||||
total_fixed += 1
|
||||
else:
|
||||
print(f" {control_id} | {actual_source} → no mapping found")
|
||||
|
||||
if DRY_RUN:
|
||||
print(f"\n [DRY RUN] Would fix {total_fixed} misattributions")
|
||||
else:
|
||||
print(f"\n Fixed: {total_fixed} misattributions")
|
||||
print()
|
||||
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
# Step 5: Hard delete duplicate/too_close controls
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
if should_run(5):
|
||||
print("=" * 70)
|
||||
print("STEP 5: Hard delete duplicate/too_close controls")
|
||||
print("=" * 70)
|
||||
|
||||
# Verify no FK references
|
||||
for table, col in [
|
||||
('canonical_control_mappings', 'control_id'),
|
||||
('obligation_extractions', 'control_uuid'),
|
||||
('crosswalk_matrix', 'master_control_uuid'),
|
||||
('obligation_candidates', 'parent_control_uuid'),
|
||||
]:
|
||||
cur.execute(f"""
|
||||
SELECT count(*)
|
||||
FROM compliance.{table} t
|
||||
JOIN compliance.canonical_controls cc ON cc.id = t.{col}
|
||||
WHERE cc.release_state IN ('duplicate', 'too_close')
|
||||
""")
|
||||
fk_count = cur.fetchone()[0]
|
||||
if fk_count > 0:
|
||||
print(f" WARNING: {table}.{col} has {fk_count} refs to dup/too_close!")
|
||||
print(f" ABORTING Step 5 — clean FK refs first!")
|
||||
sys.exit(1)
|
||||
else:
|
||||
print(f" {table}.{col}: 0 refs ✓")
|
||||
|
||||
# Check self-references
|
||||
cur.execute("""
|
||||
SELECT count(*)
|
||||
FROM compliance.canonical_controls child
|
||||
JOIN compliance.canonical_controls parent ON parent.id = child.parent_control_uuid
|
||||
WHERE parent.release_state IN ('duplicate', 'too_close')
|
||||
""")
|
||||
self_refs = cur.fetchone()[0]
|
||||
if self_refs > 0:
|
||||
print(f" WARNING: {self_refs} child controls reference dup/too_close parents!")
|
||||
print(f" ABORTING Step 5!")
|
||||
sys.exit(1)
|
||||
print(f" Self-references: 0 ✓")
|
||||
|
||||
cur.execute("""
|
||||
SELECT release_state, count(*)
|
||||
FROM compliance.canonical_controls
|
||||
WHERE release_state IN ('duplicate', 'too_close')
|
||||
GROUP BY 1
|
||||
""")
|
||||
to_delete = {}
|
||||
for state, cnt in cur.fetchall():
|
||||
to_delete[state] = cnt
|
||||
print(f"\n {state}: {cnt}")
|
||||
|
||||
total = sum(to_delete.values())
|
||||
print(f"\n TOTAL to delete: {total}")
|
||||
|
||||
if DRY_RUN:
|
||||
print(f" [DRY RUN] Would delete {total} controls")
|
||||
else:
|
||||
cur.execute("""
|
||||
DELETE FROM compliance.canonical_controls
|
||||
WHERE release_state IN ('duplicate', 'too_close')
|
||||
""")
|
||||
print(f" Deleted: {cur.rowcount} controls")
|
||||
print()
|
||||
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
# Step 6: Clean up canonical_processed_chunks generated_control_ids
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
if should_run(6):
|
||||
print("=" * 70)
|
||||
print("STEP 6: Clean up processed chunks (remove deleted control IDs)")
|
||||
print("=" * 70)
|
||||
|
||||
if DRY_RUN and should_run(5):
|
||||
print(" [DRY RUN] Skipping — depends on Step 5 deletion")
|
||||
else:
|
||||
# Find chunks that reference non-existent controls
|
||||
cur.execute("""
|
||||
SELECT id, generated_control_ids
|
||||
FROM compliance.canonical_processed_chunks
|
||||
WHERE generated_control_ids IS NOT NULL
|
||||
AND generated_control_ids <> '[]'::jsonb
|
||||
""")
|
||||
chunks = cur.fetchall()
|
||||
print(f" Chunks with generated_control_ids: {len(chunks)}")
|
||||
|
||||
# Get all existing control IDs
|
||||
cur.execute("SELECT id::text FROM compliance.canonical_controls")
|
||||
existing_ids = set(r[0] for r in cur.fetchall())
|
||||
print(f" Existing controls: {len(existing_ids)}")
|
||||
|
||||
cleaned = 0
|
||||
for chunk_id, control_ids in chunks:
|
||||
if isinstance(control_ids, str):
|
||||
control_ids = json.loads(control_ids)
|
||||
if isinstance(control_ids, list):
|
||||
valid_ids = [cid for cid in control_ids if cid in existing_ids]
|
||||
if len(valid_ids) < len(control_ids):
|
||||
removed = len(control_ids) - len(valid_ids)
|
||||
cur.execute("""
|
||||
UPDATE compliance.canonical_processed_chunks
|
||||
SET generated_control_ids = %s::jsonb
|
||||
WHERE id = %s
|
||||
""", (json.dumps(valid_ids), chunk_id))
|
||||
cleaned += 1
|
||||
|
||||
print(f" Chunks cleaned: {cleaned}")
|
||||
print()
|
||||
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
# Final summary
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
if not DRY_RUN:
|
||||
conn.commit()
|
||||
print("=" * 70)
|
||||
print("COMMITTED. Final state:")
|
||||
print("=" * 70)
|
||||
else:
|
||||
print("=" * 70)
|
||||
print("[DRY RUN] No changes committed. Current state:")
|
||||
print("=" * 70)
|
||||
|
||||
cur.execute("""
|
||||
SELECT release_state, count(*)
|
||||
FROM compliance.canonical_controls
|
||||
GROUP BY 1
|
||||
ORDER BY count(*) DESC
|
||||
""")
|
||||
total = 0
|
||||
active = 0
|
||||
for state, cnt in cur.fetchall():
|
||||
total += cnt
|
||||
if state not in ('duplicate', 'too_close'):
|
||||
active += cnt
|
||||
print(f" {state:15s}: {cnt:5d}")
|
||||
|
||||
print(f"\n TOTAL: {total}")
|
||||
print(f" AKTIV: {active}")
|
||||
|
||||
conn.close()
|
||||
655
scripts/qa/phase74_generate_gap_controls.py
Normal file
655
scripts/qa/phase74_generate_gap_controls.py
Normal file
@@ -0,0 +1,655 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Phase 7.4: Generate new controls for gap articles via Anthropic Claude Sonnet.
|
||||
|
||||
Reads gap_analysis_results.json, extracts article text from PDFs,
|
||||
calls Claude Sonnet to generate controls, inserts into DB.
|
||||
|
||||
Usage:
|
||||
python3 phase74_generate_gap_controls.py --dry-run # show what would be generated
|
||||
python3 phase74_generate_gap_controls.py # generate and insert
|
||||
python3 phase74_generate_gap_controls.py --source "DSGVO" # filter by source
|
||||
python3 phase74_generate_gap_controls.py --resume # skip already-generated articles
|
||||
"""
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import re
|
||||
import time
|
||||
import hashlib
|
||||
import argparse
|
||||
import psycopg2
|
||||
import urllib.parse
|
||||
import requests
|
||||
from pathlib import Path
|
||||
from collections import Counter
|
||||
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
from pdf_qa_all import (
|
||||
SOURCE_FILE_MAP, read_file, classify_doc, normalize,
|
||||
build_eu_article_index, build_de_law_index, build_nist_index,
|
||||
build_owasp_index, build_generic_index, MAX_ARTICLES,
|
||||
)
|
||||
|
||||
# ── Config ──────────────────────────────────────────────────────────
|
||||
ANTHROPIC_URL = "https://api.anthropic.com/v1/messages"
|
||||
ANTHROPIC_MODEL = os.environ.get("CONTROL_GEN_ANTHROPIC_MODEL", "claude-sonnet-4-6")
|
||||
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
|
||||
PIPELINE_VERSION = 5
|
||||
GAP_RESULTS_FILE = "/tmp/gap_analysis_results.json"
|
||||
PDF_DIR = Path(os.path.expanduser("~/rag-ingestion/pdfs"))
|
||||
|
||||
try:
|
||||
import fitz
|
||||
except ImportError:
|
||||
fitz = None
|
||||
|
||||
# ── Source name → regulation_code reverse map ────────────────────────
|
||||
# Built from REGULATION_LICENSE_MAP in control_generator.py
|
||||
SOURCE_TO_REGCODE = {
|
||||
"DSGVO (EU) 2016/679": "eu_2016_679",
|
||||
"KI-Verordnung (EU) 2024/1689": "eu_2024_1689",
|
||||
"NIS2-Richtlinie (EU) 2022/2555": "eu_2022_2555",
|
||||
"Cyber Resilience Act (CRA)": "eu_2024_2847",
|
||||
"Maschinenverordnung (EU) 2023/1230": "eu_2023_1230",
|
||||
"EU Blue Guide 2022": "eu_blue_guide_2022",
|
||||
"Markets in Crypto-Assets (MiCA)": "mica",
|
||||
"Batterieverordnung (EU) 2023/1542": "eu_2023_1542",
|
||||
"AML-Verordnung": "amlr",
|
||||
"Data Governance Act (DGA)": "dga",
|
||||
"Data Act": "data_act",
|
||||
"GPSR (EU) 2023/988": "gpsr",
|
||||
"IFRS-Übernahmeverordnung": "ifrs",
|
||||
"NIST SP 800-53 Rev. 5": "nist_sp800_53r5",
|
||||
"NIST SP 800-207 (Zero Trust)": "nist_sp800_207",
|
||||
"NIST SP 800-63-3": "nist_sp800_63_3",
|
||||
"NIST AI Risk Management Framework": "nist_ai_rmf",
|
||||
"NIST SP 800-218 (SSDF)": "nist_sp_800_218",
|
||||
"NIST Cybersecurity Framework 2.0": "nist_csf_2_0",
|
||||
"OWASP Top 10 (2021)": "owasp_top10",
|
||||
"OWASP ASVS 4.0": "owasp_asvs",
|
||||
"OWASP SAMM 2.0": "owasp_samm",
|
||||
"OWASP API Security Top 10 (2023)": "owasp_api_top10",
|
||||
"OWASP MASVS 2.0": "owasp_masvs",
|
||||
"ENISA ICS/SCADA Dependencies": "enisa_ics_scada",
|
||||
"ENISA Supply Chain Good Practices": "enisa_supply_chain",
|
||||
"CISA Secure by Design": "cisa_sbd",
|
||||
"Bundesdatenschutzgesetz (BDSG)": "bdsg",
|
||||
"Gewerbeordnung (GewO)": "gewo",
|
||||
"Handelsgesetzbuch (HGB)": "hgb",
|
||||
"Abgabenordnung (AO)": "ao",
|
||||
"OECD KI-Empfehlung": "oecd_ai_principles",
|
||||
}
|
||||
|
||||
# License info per regulation code (from REGULATION_LICENSE_MAP)
|
||||
LICENSE_MAP = {
|
||||
"eu_2016_679": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
|
||||
"eu_2024_1689": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
|
||||
"eu_2022_2555": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
|
||||
"eu_2024_2847": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
|
||||
"eu_2023_1230": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
|
||||
"eu_blue_guide_2022": {"license": "EU_PUBLIC", "rule": 1, "source_type": "guideline"},
|
||||
"mica": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
|
||||
"eu_2023_1542": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
|
||||
"amlr": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
|
||||
"dga": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
|
||||
"data_act": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
|
||||
"gpsr": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
|
||||
"ifrs": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
|
||||
"nist_sp800_53r5": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
|
||||
"nist_sp800_207": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
|
||||
"nist_sp800_63_3": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
|
||||
"nist_ai_rmf": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
|
||||
"nist_sp_800_218": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
|
||||
"nist_csf_2_0": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
|
||||
"owasp_top10": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
|
||||
"owasp_asvs": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
|
||||
"owasp_samm": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
|
||||
"owasp_api_top10": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
|
||||
"owasp_masvs": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
|
||||
"enisa_ics_scada": {"license": "CC-BY-4.0", "rule": 2, "source_type": "guideline"},
|
||||
"enisa_supply_chain": {"license": "CC-BY-4.0", "rule": 2, "source_type": "guideline"},
|
||||
"cisa_sbd": {"license": "US_GOV_PUBLIC", "rule": 1, "source_type": "guideline"},
|
||||
"bdsg": {"license": "DE_LAW", "rule": 1, "source_type": "law"},
|
||||
"gewo": {"license": "DE_LAW", "rule": 1, "source_type": "law"},
|
||||
"hgb": {"license": "DE_LAW", "rule": 1, "source_type": "law"},
|
||||
"ao": {"license": "DE_LAW", "rule": 1, "source_type": "law"},
|
||||
"oecd_ai_principles": {"license": "OECD_PUBLIC", "rule": 2, "source_type": "standard"},
|
||||
}
|
||||
|
||||
# Domain detection keywords
|
||||
DOMAIN_KEYWORDS = {
|
||||
"AUTH": ["authentifizierung", "anmeldung", "login", "passwort", "identit", "identity", "credential"],
|
||||
"CRYP": ["verschlüsselung", "kryptogra", "encrypt", "cipher", "hash", "tls", "ssl", "signatur"],
|
||||
"NET": ["netzwerk", "network", "firewall", "router", "dns", "ip-adress"],
|
||||
"DATA": ["daten", "data", "personenbezogen", "datenschutz", "privacy", "gdpr", "dsgvo", "verarbeitung"],
|
||||
"LOG": ["protokoll", "logging", "audit", "nachvollzieh", "aufzeichn"],
|
||||
"ACC": ["zugriff", "access", "berechtigung", "autorisierung", "authorization", "rolle"],
|
||||
"SEC": ["sicherheit", "security", "schutz", "protect", "schwachstell", "vulnerab"],
|
||||
"INC": ["vorfall", "incident", "breach", "meldung", "reaktion", "response", "notfall"],
|
||||
"AI": ["künstliche intelligenz", "ki-system", "ai system", "machine learning", "algorithm", "hochrisiko-ki"],
|
||||
"COMP": ["compliance", "konformität", "audit", "zertifizierung", "regulier", "vorschrift"],
|
||||
"GOV": ["behörde", "aufsicht", "governance", "marktüberwachung", "authority"],
|
||||
"FIN": ["finanz", "zahlungs", "payment", "crypto", "krypto-", "geldwäsche", "aml"],
|
||||
"ENV": ["umwelt", "environment", "batterie", "recycling", "entsorgu", "nachhaltig"],
|
||||
}
|
||||
|
||||
# ── Prompt (same as control_generator.py) ────────────────────────────
|
||||
|
||||
SYSTEM_PROMPT = """Du bist ein Security-Compliance-Experte. Strukturiere den gegebenen Text
|
||||
als praxisorientiertes Security Control. Erstelle eine verständliche, umsetzbare Formulierung.
|
||||
Antworte NUR mit validem JSON. Bei mehreren Controls antworte mit einem JSON-Array."""
|
||||
|
||||
APPLICABILITY_PROMPT = """- applicable_industries: Liste der Branchen fuer die dieses Control relevant ist.
|
||||
Verwende ["all"] wenn der Control branchenuebergreifend gilt.
|
||||
Moegliche Werte: "all", "Technologie / IT", "IT Dienstleistungen", "E-Commerce / Handel",
|
||||
"Finanzdienstleistungen", "Versicherungen", "Gesundheitswesen", "Pharma", "Bildung",
|
||||
"Beratung / Consulting", "Marketing / Agentur", "Produktion / Industrie",
|
||||
"Logistik / Transport", "Immobilien", "Bau", "Energie", "Automobil",
|
||||
"Luft- / Raumfahrt", "Maschinenbau", "Anlagenbau", "Automatisierung", "Robotik",
|
||||
"Messtechnik", "Agrar", "Chemie", "Minen / Bergbau", "Telekommunikation",
|
||||
"Medien / Verlage", "Gastronomie / Hotellerie", "Recht / Kanzlei",
|
||||
"Oeffentlicher Dienst", "Verteidigung / Ruestung", "Wasser- / Abwasserwirtschaft",
|
||||
"Lebensmittel", "Digitale Infrastruktur", "Weltraum", "Post / Kurierdienste",
|
||||
"Abfallwirtschaft", "Forschung"
|
||||
- applicable_company_size: Ab welcher Unternehmensgroesse gilt dieses Control?
|
||||
Verwende ["all"] wenn keine Groessenbeschraenkung.
|
||||
Moegliche Werte: "all", "micro", "small", "medium", "large", "enterprise"
|
||||
- scope_conditions: null wenn keine besonderen Bedingungen, sonst:
|
||||
{"requires_any": ["signal"], "description": "Erklaerung"}
|
||||
Moegliche Signale: "uses_ai", "third_country_transfer", "processes_health_data",
|
||||
"processes_minors_data", "automated_decisions", "employee_monitoring",
|
||||
"video_surveillance", "financial_data", "is_kritis_operator", "payment_services" """
|
||||
|
||||
CATEGORY_LIST = [
|
||||
"Datenschutz-Grundlagen", "Betroffenenrechte", "Technische Massnahmen",
|
||||
"Organisatorische Massnahmen", "Auftragsverarbeitung", "Datentransfer",
|
||||
"Risikomanagement", "Incident Response", "KI-Regulierung", "Cybersicherheit",
|
||||
"Zugriffskontrolle", "Kryptographie", "Netzwerksicherheit", "Compliance-Management",
|
||||
"Produktsicherheit", "Marktüberwachung", "Supply Chain Security",
|
||||
"Finanzregulierung", "Arbeitsrecht", "Gewerberecht", "Handelsrecht",
|
||||
"Umwelt / Nachhaltigkeit", "Dokumentation", "Schulung / Awareness",
|
||||
]
|
||||
CATEGORY_LIST_STR = ", ".join(f'"{c}"' for c in CATEGORY_LIST)
|
||||
|
||||
|
||||
def build_prompt(source_name, article_label, article_text, license_type):
|
||||
return f"""Strukturiere den folgenden Gesetzestext als Security/Compliance Control.
|
||||
Du DARFST den Originaltext verwenden (Quelle: {source_name}, {license_type}).
|
||||
|
||||
WICHTIG: Erstelle eine verständliche, praxisorientierte Formulierung.
|
||||
Der Originaltext wird separat gespeichert — deine Formulierung soll klar und umsetzbar sein.
|
||||
|
||||
Gib JSON zurück mit diesen Feldern:
|
||||
- title: Kurzer prägnanter Titel (max 100 Zeichen)
|
||||
- objective: Was soll erreicht werden? (1-3 Sätze)
|
||||
- rationale: Warum ist das wichtig? (1-2 Sätze)
|
||||
- requirements: Liste von konkreten Anforderungen (Strings)
|
||||
- test_procedure: Liste von Prüfschritten (Strings)
|
||||
- evidence: Liste von Nachweisdokumenten (Strings)
|
||||
- severity: low/medium/high/critical
|
||||
- tags: Liste von Tags
|
||||
- domain: Fachgebiet als Kuerzel (AUTH=Authentifizierung, CRYP=Kryptographie, NET=Netzwerk, DATA=Datenschutz, LOG=Logging, ACC=Zugriffskontrolle, SEC=IT-Sicherheit, INC=Vorfallmanagement, AI=KI, COMP=Compliance, GOV=Behoerden/Verwaltung, LAB=Arbeitsrecht, FIN=Finanzregulierung, TRD=Gewerbe/Handelsrecht, ENV=Umwelt, HLT=Gesundheit)
|
||||
- category: Inhaltliche Kategorie. Moegliche Werte: {CATEGORY_LIST_STR}
|
||||
- target_audience: Liste der Zielgruppen (z.B. "unternehmen", "behoerden", "entwickler", "datenschutzbeauftragte", "geschaeftsfuehrung", "it-abteilung", "rechtsabteilung", "compliance-officer")
|
||||
- source_article: Artikel-/Paragraphen-Referenz (z.B. "Artikel 10", "§ 42")
|
||||
- source_paragraph: Absatz-Referenz (z.B. "Absatz 5", "Nr. 2")
|
||||
{APPLICABILITY_PROMPT}
|
||||
|
||||
Text: {article_text[:3000]}
|
||||
Quelle: {source_name}, {article_label}"""
|
||||
|
||||
|
||||
# ── PDF article extraction ───────────────────────────────────────────
|
||||
|
||||
def extract_article_text(pdf_file, article_label, doc_type, full_text=None):
|
||||
"""Extract the text of a specific article from a PDF."""
|
||||
if full_text is None:
|
||||
full_text = read_file(pdf_file)
|
||||
if not full_text:
|
||||
return ""
|
||||
|
||||
if doc_type == "eu_regulation":
|
||||
art_num_match = re.search(r'\d+', article_label)
|
||||
if not art_num_match:
|
||||
return ""
|
||||
num = int(art_num_match.group())
|
||||
pattern = rf'\nArtikel\s+{num}\s*\n'
|
||||
match = re.search(pattern, full_text)
|
||||
if not match:
|
||||
return ""
|
||||
start = match.start()
|
||||
next_pattern = rf'\nArtikel\s+{num + 1}\s*\n'
|
||||
next_match = re.search(next_pattern, full_text)
|
||||
end = next_match.start() if next_match else min(start + 5000, len(full_text))
|
||||
return full_text[start:end].strip()[:3000]
|
||||
|
||||
elif doc_type == "de_law":
|
||||
para_match = re.search(r'\d+', article_label)
|
||||
if not para_match:
|
||||
return ""
|
||||
num = int(para_match.group())
|
||||
pattern = rf'\n§\s+{num}\b'
|
||||
match = re.search(pattern, full_text)
|
||||
if not match:
|
||||
return ""
|
||||
start = match.start()
|
||||
next_pattern = rf'\n§\s+{num + 1}\b'
|
||||
next_match = re.search(next_pattern, full_text)
|
||||
end = next_match.start() if next_match else min(start + 5000, len(full_text))
|
||||
return full_text[start:end].strip()[:3000]
|
||||
|
||||
elif doc_type == "nist":
|
||||
escaped = re.escape(article_label)
|
||||
match = re.search(rf'(?:^|\n)\s*{escaped}\b', full_text)
|
||||
if not match:
|
||||
return ""
|
||||
start = match.start()
|
||||
return full_text[start:start + 3000].strip()
|
||||
|
||||
else:
|
||||
# Generic / OWASP / ENISA
|
||||
escaped = re.escape(article_label)
|
||||
match = re.search(rf'(?:^|\n).*{escaped}\b', full_text)
|
||||
if not match:
|
||||
return ""
|
||||
start = match.start()
|
||||
return full_text[start:start + 3000].strip()
|
||||
|
||||
|
||||
# ── Anthropic API ────────────────────────────────────────────────────
|
||||
|
||||
def call_anthropic(prompt, system_prompt):
|
||||
"""Call Anthropic API. Returns (parsed_data, raw_text, usage, error)."""
|
||||
headers = {
|
||||
"x-api-key": ANTHROPIC_API_KEY,
|
||||
"anthropic-version": "2023-06-01",
|
||||
"content-type": "application/json",
|
||||
}
|
||||
payload = {
|
||||
"model": ANTHROPIC_MODEL,
|
||||
"max_tokens": 4096,
|
||||
"system": system_prompt,
|
||||
"messages": [{"role": "user", "content": prompt}],
|
||||
}
|
||||
|
||||
try:
|
||||
resp = requests.post(ANTHROPIC_URL, headers=headers, json=payload, timeout=120)
|
||||
if resp.status_code != 200:
|
||||
return None, "", {}, f"HTTP {resp.status_code}: {resp.text[:200]}"
|
||||
data = resp.json()
|
||||
content = data["content"][0]["text"] if data.get("content") else ""
|
||||
usage = data.get("usage", {})
|
||||
parsed = parse_json(content)
|
||||
return parsed, content, usage, None
|
||||
except Exception as e:
|
||||
return None, "", {}, str(e)
|
||||
|
||||
|
||||
def parse_json(text):
|
||||
"""Parse JSON from LLM response, handling markdown fences."""
|
||||
text = text.strip()
|
||||
if text.startswith("```"):
|
||||
lines = text.split("\n")
|
||||
text = "\n".join(lines[1:-1] if lines[-1].strip().startswith("```") else lines[1:])
|
||||
text = text.strip()
|
||||
|
||||
try:
|
||||
data = json.loads(text)
|
||||
if isinstance(data, list):
|
||||
return data[0] if data else None
|
||||
return data
|
||||
except json.JSONDecodeError:
|
||||
match = re.search(r'\{[\s\S]*\}', text)
|
||||
if match:
|
||||
try:
|
||||
return json.loads(match.group())
|
||||
except json.JSONDecodeError:
|
||||
return None
|
||||
return None
|
||||
|
||||
|
||||
# ── Domain detection ─────────────────────────────────────────────────
|
||||
|
||||
def detect_domain(text):
|
||||
text_lower = text.lower()
|
||||
scores = {}
|
||||
for domain, keywords in DOMAIN_KEYWORDS.items():
|
||||
score = sum(1 for kw in keywords if kw in text_lower)
|
||||
if score > 0:
|
||||
scores[domain] = score
|
||||
if scores:
|
||||
return max(scores, key=scores.get)
|
||||
return "SEC"
|
||||
|
||||
|
||||
# ── Control ID generation ────────────────────────────────────────────
|
||||
|
||||
def generate_control_id(domain, cur):
|
||||
"""Generate next available control_id for domain prefix.
|
||||
|
||||
Uses MAX(numeric suffix) to find the true highest number,
|
||||
avoiding gaps from string-sorted IDs (e.g. COMP-99 > COMP-1000 in text sort).
|
||||
"""
|
||||
prefix = domain.upper()[:4]
|
||||
cur.execute("""
|
||||
SELECT MAX(CAST(SPLIT_PART(control_id, '-', 2) AS INTEGER))
|
||||
FROM compliance.canonical_controls
|
||||
WHERE control_id LIKE %s
|
||||
AND SPLIT_PART(control_id, '-', 2) ~ '^[0-9]+$'
|
||||
""", (f"{prefix}-%",))
|
||||
row = cur.fetchone()
|
||||
if row and row[0] is not None:
|
||||
return f"{prefix}-{row[0] + 1}"
|
||||
return f"{prefix}-001"
|
||||
|
||||
|
||||
# ── Main ─────────────────────────────────────────────────────────────
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Phase 7.4: Generate controls for gap articles")
|
||||
parser.add_argument("--dry-run", action="store_true", help="Show what would be generated")
|
||||
parser.add_argument("--source", type=str, help="Filter by source name substring")
|
||||
parser.add_argument("--resume", action="store_true", help="Skip articles that already have controls")
|
||||
parser.add_argument("--results", default=GAP_RESULTS_FILE, help="Path to gap_analysis_results.json")
|
||||
args = parser.parse_args()
|
||||
|
||||
if not ANTHROPIC_API_KEY:
|
||||
print("ERROR: Set ANTHROPIC_API_KEY")
|
||||
sys.exit(1)
|
||||
|
||||
# Load gap results
|
||||
with open(args.results) as f:
|
||||
gaps = json.load(f)
|
||||
total_gaps = sum(len(g["gap_articles"]) for g in gaps)
|
||||
print(f"Loaded {len(gaps)} sources with {total_gaps} gap articles")
|
||||
|
||||
if args.source:
|
||||
gaps = [g for g in gaps if args.source.lower() in g["source"].lower()]
|
||||
total_gaps = sum(len(g["gap_articles"]) for g in gaps)
|
||||
print(f"Filtered to {len(gaps)} sources, {total_gaps} gaps")
|
||||
|
||||
# DB connection with keepalive + reconnect helper
|
||||
db_url = os.environ['DATABASE_URL']
|
||||
parsed = urllib.parse.urlparse(db_url)
|
||||
|
||||
def connect_db():
|
||||
"""Create DB connection with TCP keepalive."""
|
||||
c = psycopg2.connect(
|
||||
host=parsed.hostname, port=parsed.port or 5432,
|
||||
user=parsed.username, password=parsed.password,
|
||||
dbname=parsed.path.lstrip('/'),
|
||||
options="-c search_path=compliance,public",
|
||||
keepalives=1, keepalives_idle=30,
|
||||
keepalives_interval=10, keepalives_count=5,
|
||||
)
|
||||
return c, c.cursor()
|
||||
|
||||
conn, cur = connect_db()
|
||||
|
||||
def ensure_db():
|
||||
"""Reconnect if connection is dead."""
|
||||
nonlocal conn, cur
|
||||
try:
|
||||
cur.execute("SELECT 1")
|
||||
except Exception:
|
||||
print(" [RECONNECT] DB connection lost, reconnecting...")
|
||||
try:
|
||||
conn.close()
|
||||
except Exception:
|
||||
pass
|
||||
conn, cur = connect_db()
|
||||
return True
|
||||
return False
|
||||
|
||||
# Get framework UUID
|
||||
cur.execute("SELECT id FROM compliance.canonical_control_frameworks WHERE framework_id = 'bp_security_v1' LIMIT 1")
|
||||
fw_row = cur.fetchone()
|
||||
if not fw_row:
|
||||
print("ERROR: Framework bp_security_v1 not found")
|
||||
sys.exit(1)
|
||||
framework_uuid = fw_row[0]
|
||||
|
||||
# If resuming, load existing articles per source
|
||||
existing_articles = {}
|
||||
if args.resume:
|
||||
cur.execute("""
|
||||
SELECT source_citation->>'source', source_citation->>'article'
|
||||
FROM compliance.canonical_controls
|
||||
WHERE source_citation->>'article' IS NOT NULL
|
||||
""")
|
||||
for src, art in cur.fetchall():
|
||||
existing_articles.setdefault(src, set()).add(art)
|
||||
print(f"Resume mode: {sum(len(v) for v in existing_articles.values())} existing article-control pairs")
|
||||
|
||||
# Stats
|
||||
stats = Counter()
|
||||
total_input_tokens = 0
|
||||
total_output_tokens = 0
|
||||
generated_ids = []
|
||||
errors = []
|
||||
t_start = time.time()
|
||||
|
||||
# Pre-read PDFs (cache full text per source)
|
||||
pdf_cache = {}
|
||||
|
||||
for gap_source in sorted(gaps, key=lambda g: -len(g["gap_articles"])):
|
||||
source_name = gap_source["source"]
|
||||
gap_articles = gap_source["gap_articles"]
|
||||
filename = SOURCE_FILE_MAP.get(source_name)
|
||||
reg_code = SOURCE_TO_REGCODE.get(source_name, "unknown")
|
||||
license_info = LICENSE_MAP.get(reg_code, {"license": "UNKNOWN", "rule": 1, "source_type": "unknown"})
|
||||
doc_type = classify_doc(source_name)
|
||||
|
||||
if not filename:
|
||||
stats["skipped_no_pdf"] += len(gap_articles)
|
||||
continue
|
||||
|
||||
# Read PDF once per source
|
||||
if source_name not in pdf_cache:
|
||||
pdf_cache[source_name] = read_file(filename)
|
||||
full_text = pdf_cache[source_name]
|
||||
if not full_text:
|
||||
stats["skipped_no_pdf"] += len(gap_articles)
|
||||
continue
|
||||
|
||||
print(f"\n{'='*70}")
|
||||
print(f"{source_name} — {len(gap_articles)} gaps (rule {license_info['rule']}, {doc_type})")
|
||||
print(f"{'='*70}")
|
||||
|
||||
for gap in gap_articles:
|
||||
article_label = gap["label"]
|
||||
article_type = gap["type"]
|
||||
|
||||
# Skip if already has controls (resume mode)
|
||||
if args.resume and article_label in existing_articles.get(source_name, set()):
|
||||
stats["skipped_exists"] += 1
|
||||
continue
|
||||
|
||||
# Skip non-substantive NIST sections (intro chapters)
|
||||
if doc_type == "nist" and article_type == "section":
|
||||
section_match = re.match(r'Section (\d+)', article_label)
|
||||
if section_match and int(section_match.group(1)) <= 3:
|
||||
stats["skipped_intro"] += 1
|
||||
continue
|
||||
|
||||
# Extract article text
|
||||
article_text = extract_article_text(filename, article_label, doc_type, full_text)
|
||||
if not article_text or len(article_text) < 30:
|
||||
stats["skipped_short_text"] += 1
|
||||
print(f" SKIP {article_label}: text too short ({len(article_text)} chars)")
|
||||
continue
|
||||
|
||||
if args.dry_run:
|
||||
print(f" [DRY] {article_label} ({len(article_text)} chars)")
|
||||
stats["would_generate"] += 1
|
||||
continue
|
||||
|
||||
# Call Anthropic
|
||||
prompt = build_prompt(source_name, article_label, article_text, license_info["license"])
|
||||
data, raw, usage, error = call_anthropic(prompt, SYSTEM_PROMPT)
|
||||
|
||||
total_input_tokens += usage.get("input_tokens", 0)
|
||||
total_output_tokens += usage.get("output_tokens", 0)
|
||||
|
||||
if error:
|
||||
stats["api_error"] += 1
|
||||
errors.append(f"{source_name} {article_label}: {error}")
|
||||
print(f" ERROR {article_label}: {error}")
|
||||
time.sleep(5)
|
||||
continue
|
||||
|
||||
if not data:
|
||||
stats["parse_error"] += 1
|
||||
print(f" PARSE ERROR {article_label}")
|
||||
continue
|
||||
|
||||
# Ensure DB is alive before writing
|
||||
ensure_db()
|
||||
|
||||
# Build control
|
||||
title = str(data.get("title", ""))[:200]
|
||||
objective = str(data.get("objective", ""))
|
||||
rationale = str(data.get("rationale", ""))
|
||||
domain = str(data.get("domain", detect_domain(article_text))).upper()[:4]
|
||||
if not domain or len(domain) < 2:
|
||||
domain = detect_domain(article_text)
|
||||
|
||||
control_id = generate_control_id(domain, cur)
|
||||
severity = str(data.get("severity", "medium")).lower()
|
||||
if severity not in ("low", "medium", "high", "critical"):
|
||||
severity = "medium"
|
||||
|
||||
requirements = data.get("requirements", [])
|
||||
if not isinstance(requirements, list):
|
||||
requirements = [str(requirements)]
|
||||
test_procedure = data.get("test_procedure", [])
|
||||
if not isinstance(test_procedure, list):
|
||||
test_procedure = [str(test_procedure)]
|
||||
evidence = data.get("evidence", [])
|
||||
if not isinstance(evidence, list):
|
||||
evidence = [str(evidence)]
|
||||
tags = data.get("tags", [])
|
||||
if not isinstance(tags, list):
|
||||
tags = []
|
||||
target_audience = data.get("target_audience", [])
|
||||
if not isinstance(target_audience, list):
|
||||
target_audience = []
|
||||
applicable_industries = data.get("applicable_industries", ["all"])
|
||||
if not isinstance(applicable_industries, list):
|
||||
applicable_industries = ["all"]
|
||||
applicable_company_size = data.get("applicable_company_size", ["all"])
|
||||
if not isinstance(applicable_company_size, list):
|
||||
applicable_company_size = ["all"]
|
||||
scope_conditions = data.get("scope_conditions")
|
||||
|
||||
source_citation = {
|
||||
"source": source_name,
|
||||
"article": data.get("source_article", article_label),
|
||||
"paragraph": data.get("source_paragraph", ""),
|
||||
"article_type": article_type,
|
||||
"license": license_info["license"],
|
||||
"source_type": license_info["source_type"],
|
||||
}
|
||||
|
||||
generation_metadata = {
|
||||
"processing_path": "phase74_gap_fill",
|
||||
"license_rule": license_info["rule"],
|
||||
"source_regulation": reg_code,
|
||||
"source_article": article_label,
|
||||
"gap_fill": True,
|
||||
}
|
||||
|
||||
category = str(data.get("category", "")) or None
|
||||
|
||||
# Insert into DB
|
||||
try:
|
||||
cur.execute("""
|
||||
INSERT INTO compliance.canonical_controls (
|
||||
framework_id, control_id, title, objective, rationale,
|
||||
scope, requirements, test_procedure, evidence,
|
||||
severity, risk_score, implementation_effort,
|
||||
open_anchors, release_state, tags,
|
||||
license_rule, source_original_text, source_citation,
|
||||
customer_visible, generation_metadata,
|
||||
verification_method, category, generation_strategy,
|
||||
target_audience, pipeline_version,
|
||||
applicable_industries, applicable_company_size, scope_conditions
|
||||
) VALUES (
|
||||
%s, %s, %s, %s, %s,
|
||||
%s, %s, %s, %s,
|
||||
%s, %s, %s,
|
||||
%s, %s, %s,
|
||||
%s, %s, %s,
|
||||
%s, %s,
|
||||
%s, %s, %s,
|
||||
%s, %s,
|
||||
%s, %s, %s
|
||||
)
|
||||
ON CONFLICT (framework_id, control_id) DO NOTHING
|
||||
RETURNING id
|
||||
""", (
|
||||
framework_uuid, control_id, title, objective, rationale,
|
||||
json.dumps({}), json.dumps(requirements), json.dumps(test_procedure), json.dumps(evidence),
|
||||
severity, 5, "m",
|
||||
json.dumps([]), "draft", json.dumps(tags),
|
||||
license_info["rule"], article_text, json.dumps(source_citation),
|
||||
True, json.dumps(generation_metadata),
|
||||
"document", category, "phase74_gap_fill",
|
||||
json.dumps(target_audience), PIPELINE_VERSION,
|
||||
json.dumps(applicable_industries), json.dumps(applicable_company_size),
|
||||
json.dumps(scope_conditions) if scope_conditions else None,
|
||||
))
|
||||
conn.commit()
|
||||
row = cur.fetchone()
|
||||
if row:
|
||||
generated_ids.append(str(row[0]))
|
||||
stats["generated"] += 1
|
||||
print(f" OK {control_id}: {title[:60]}")
|
||||
else:
|
||||
stats["conflict"] += 1
|
||||
print(f" CONFLICT {control_id} (already exists)")
|
||||
except Exception as e:
|
||||
conn.rollback()
|
||||
stats["db_error"] += 1
|
||||
errors.append(f"DB {control_id}: {str(e)[:100]}")
|
||||
print(f" DB ERROR {control_id}: {str(e)[:100]}")
|
||||
|
||||
# Rate limit: ~0.5s between calls
|
||||
time.sleep(0.5)
|
||||
|
||||
# ── Summary ──────────────────────────────────────────────────────
|
||||
elapsed = time.time() - t_start
|
||||
cost = (total_input_tokens * 3 + total_output_tokens * 15) / 1_000_000
|
||||
|
||||
print(f"\n\n{'='*70}")
|
||||
print(f"PHASE 7.4 — {'DRY-RUN' if args.dry_run else 'ERGEBNIS'}")
|
||||
print(f"{'='*70}")
|
||||
print(f" Laufzeit: {elapsed/60:.1f} min")
|
||||
print(f" API-Kosten: ${cost:.2f}")
|
||||
print(f" Input Tokens: {total_input_tokens:,}")
|
||||
print(f" Output Tokens: {total_output_tokens:,}")
|
||||
print()
|
||||
for key in sorted(stats.keys()):
|
||||
print(f" {key:<25s}: {stats[key]:5d}")
|
||||
print()
|
||||
|
||||
if generated_ids:
|
||||
print(f" Neue Control-IDs: {len(generated_ids)}")
|
||||
# Save generated IDs
|
||||
with open("/tmp/phase74_generated_ids.json", 'w') as f:
|
||||
json.dump(generated_ids, f)
|
||||
print(f" IDs gespeichert: /tmp/phase74_generated_ids.json")
|
||||
|
||||
if errors:
|
||||
print(f"\n Fehler ({len(errors)}):")
|
||||
for e in errors[:20]:
|
||||
print(f" {e}")
|
||||
if len(errors) > 20:
|
||||
print(f" ... und {len(errors)-20} weitere")
|
||||
|
||||
conn.close()
|
||||
|
||||
|
||||
# Script entry point: run the Phase 7.4 gap-fill CLI.
if __name__ == "__main__":
    main()
|
||||
218
scripts/qa/run_job.sh
Executable file
218
scripts/qa/run_job.sh
Executable file
@@ -0,0 +1,218 @@
|
||||
#!/usr/bin/env bash
# ─────────────────────────────────────────────────────────────
# Robust job runner for QA scripts on Mac Mini
#
# Usage:
#   ./run_job.sh <script.py> [args...]   # start job
#   ./run_job.sh --status                # show running jobs
#   ./run_job.sh --kill <script.py>      # kill a running job
#   ./run_job.sh --log <script.py>       # tail log
#
# Features:
#   - Loads .env automatically (COMPLIANCE_DATABASE_URL → DATABASE_URL)
#   - PID-file prevents duplicate runs
#   - Unbuffered Python output
#   - Structured log files in /tmp/qa_jobs/
# ─────────────────────────────────────────────────────────────
set -euo pipefail

# Directory of this script and the repository root two levels up.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"
# All PID and log files live here.
JOB_DIR="/tmp/qa_jobs"
mkdir -p "$JOB_DIR"

# ── Load .env ────────────────────────────────────────────────
# Sources the project .env (exporting every variable) and maps
# COMPLIANCE_DATABASE_URL to DATABASE_URL when the latter is unset.
load_env() {
    local envfile="$PROJECT_DIR/.env"
    if [[ -f "$envfile" ]]; then
        # Export all vars from .env
        set -a
        # shellcheck disable=SC1090
        source "$envfile"
        set +a
    fi
    # Map COMPLIANCE_DATABASE_URL → DATABASE_URL if needed
    if [[ -z "${DATABASE_URL:-}" && -n "${COMPLIANCE_DATABASE_URL:-}" ]]; then
        export DATABASE_URL="$COMPLIANCE_DATABASE_URL"
    fi
}
|
||||
|
||||
# ── Job name from script path ─────────────────────────────────
|
||||
job_name() {
    # Strip directory and the .py suffix: scripts/foo.py -> foo
    basename "$1" .py
}

pid_file() {
    # PID file path for a given script.
    printf '%s/%s.pid\n' "$JOB_DIR" "$(job_name "$1")"
}

log_file() {
    # Log file path for a given script.
    printf '%s/%s.log\n' "$JOB_DIR" "$(job_name "$1")"
}
|
||||
|
||||
# ── Status ────────────────────────────────────────────────────
|
||||
show_status() {
    # Print one entry per PID file: RUNNING jobs with log statistics,
    # or STOPPED jobs (whose stale PID files get cleaned up here).
    echo "═══════════════════════════════════════════════════════"
    echo "QA Job Status ($(date '+%Y-%m-%d %H:%M:%S'))"
    echo "═══════════════════════════════════════════════════════"
    local found=0
    for pidfile in "$JOB_DIR"/*.pid; do
        [[ -f "$pidfile" ]] || continue
        found=1
        local name
        name=$(basename "$pidfile" .pid)
        local pid
        pid=$(cat "$pidfile")
        local logf="$JOB_DIR/$name.log"

        if kill -0 "$pid" 2>/dev/null; then
            local lines
            lines=$(wc -l < "$logf" 2>/dev/null || echo 0)
            local errors
            # BUG FIX: `grep -c` prints "0" itself when nothing matches
            # (and exits 1), so the old `|| echo 0` produced "0 0".
            # Default only when grep wrote nothing (e.g. missing file).
            errors=$(grep -c "ERROR" "$logf" 2>/dev/null || true)
            errors=${errors:-0}
            local last_line
            last_line=$(tail -1 "$logf" 2>/dev/null || echo "(empty)")
            echo "  ● $name (PID $pid) — RUNNING"
            echo "    Log:  $logf ($lines lines, $errors errors)"
            echo "    Last: $last_line"
        else
            echo "  ○ $name (PID $pid) — STOPPED"
            echo "    Log: $logf"
            # Process is gone; remove the stale PID file.
            rm -f "$pidfile"
        fi
        echo ""
    done
    if [[ $found -eq 0 ]]; then
        echo "  No jobs running."
    fi
}
|
||||
|
||||
# ── Kill ──────────────────────────────────────────────────────
|
||||
kill_job() {
    # Terminate the job for the given script and remove its PID file.
    local script="$1"
    local pf
    pf=$(pid_file "$script")

    # No PID file means nothing was ever started (or it was cleaned up).
    [[ -f "$pf" ]] || { echo "No PID file for $(job_name "$script")"; return 1; }

    local pid
    pid=$(cat "$pf")
    if kill -0 "$pid" 2>/dev/null; then
        kill "$pid"
        echo "Killed $(job_name "$script") (PID $pid)"
    else
        echo "Process $pid already stopped"
    fi
    rm -f "$pf"
}
|
||||
|
||||
# ── Tail log ──────────────────────────────────────────────────
|
||||
tail_log() {
    # Show the last 50 lines of the job's log file.
    local script="$1"
    local lf
    lf=$(log_file "$script")
    [[ -f "$lf" ]] || { echo "No log file: $lf"; return 1; }
    tail -50 "$lf"
}
|
||||
|
||||
# ── Start job ─────────────────────────────────────────────────
|
||||
start_job() {
    # Launch a QA script in the background with nohup, record its PID,
    # and verify it survives the first few seconds.
    local script="$1"
    shift
    local args=("$@")

    # Resolve script path: as given, else relative to this directory.
    local script_path="$script"
    if [[ ! -f "$script_path" ]]; then
        script_path="$SCRIPT_DIR/$script"
    fi
    if [[ ! -f "$script_path" ]]; then
        echo "ERROR: Script not found: $script"
        return 1
    fi

    local name
    name=$(job_name "$script")
    local pf
    pf=$(pid_file "$script")
    local lf
    lf=$(log_file "$script")

    # Check for already-running instance
    if [[ -f "$pf" ]]; then
        local existing_pid
        existing_pid=$(cat "$pf")
        if kill -0 "$existing_pid" 2>/dev/null; then
            echo "ERROR: $name already running (PID $existing_pid)"
            echo "Use: $0 --kill $script"
            return 1
        fi
        # Stale PID file from a dead process.
        rm -f "$pf"
    fi

    # Load environment
    load_env

    # Verify required env vars
    if [[ -z "${DATABASE_URL:-}" ]]; then
        echo "ERROR: DATABASE_URL not set (checked .env)"
        return 1
    fi

    # Start
    echo "Starting $name..."
    echo "  Script: $script_path"
    echo "  Args: ${args[*]:-none}"
    echo "  Log: $lf"

    # BUG FIX: "${args[@]}" with zero elements is an unbound-variable error
    # under `set -u` on bash 3.x (the macOS default shell); the
    # ${array[@]+...} idiom expands to nothing when the array is empty.
    nohup python3 -u "$script_path" ${args[@]+"${args[@]}"} > "$lf" 2>&1 &
    local pid=$!
    echo "$pid" > "$pf"

    echo "  PID: $pid"
    echo ""

    # Wait a moment and check it started OK
    sleep 3
    if ! kill -0 "$pid" 2>/dev/null; then
        echo "ERROR: Process died immediately. Log output:"
        cat "$lf"
        rm -f "$pf"
        return 1
    fi

    local lines
    lines=$(wc -l < "$lf" 2>/dev/null || echo 0)
    echo "Running OK ($lines log lines so far)"
    echo "Monitor with: $0 --status"
    echo "Tail log: $0 --log $script"
}
|
||||
|
||||
# ── Main ──────────────────────────────────────────────────────
|
||||
# Top-level dispatch: flags are handled explicitly; anything else is
# treated as a script to start.
case "${1:-}" in
    --status|-s)
        show_status
        ;;
    --kill|-k)
        # Second argument (the script) is mandatory for --kill.
        [[ -n "${2:-}" ]] || { echo "Usage: $0 --kill <script.py>"; exit 1; }
        kill_job "$2"
        ;;
    --log|-l)
        # Second argument (the script) is mandatory for --log.
        [[ -n "${2:-}" ]] || { echo "Usage: $0 --log <script.py>"; exit 1; }
        tail_log "$2"
        ;;
    --help|-h|"")
        echo "Usage:"
        echo "  $0 <script.py> [args...] Start a QA job"
        echo "  $0 --status Show running jobs"
        echo "  $0 --kill <script.py> Kill a running job"
        echo "  $0 --log <script.py> Tail job log"
        ;;
    *)
        start_job "$@"
        ;;
esac
|
||||
307
scripts/qa/sync_db.py
Normal file
307
scripts/qa/sync_db.py
Normal file
@@ -0,0 +1,307 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Sync canonical control tables between production and local DB.
|
||||
|
||||
Modes:
|
||||
--pull Production → Local (initial sync, full table copy)
|
||||
--push Local → Production (incremental, only new obligation_candidates)
|
||||
--loop Run --push every N minutes (default 60)
|
||||
|
||||
Usage:
|
||||
python3 sync_db.py --pull # Full sync production → local
|
||||
python3 sync_db.py --push # Push new obligations to production
|
||||
python3 sync_db.py --loop 60 # Push every 60 minutes
|
||||
python3 sync_db.py --pull --tables canonical_controls # Only one table
|
||||
"""
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import urllib.parse
|
||||
|
||||
import io
|
||||
|
||||
import psycopg2
|
||||
import psycopg2.extras
|
||||
import psycopg2.extensions
|
||||
|
||||
# Register JSON adapter so dicts are automatically converted to JSONB
psycopg2.extensions.register_adapter(dict, psycopg2.extras.Json)

# ── DB Config ────────────────────────────────────────────────────────

# SECURITY NOTE(review): a production password is hard-coded below as the
# fallback default and is now committed to the repository. Rotate this
# credential and make PROD_DATABASE_URL a required environment variable
# instead of shipping a secret in source.
PROD_URL = os.environ.get(
    "PROD_DATABASE_URL",
    "postgresql://postgres:GmyFD3wnU1NrKBdpU1nwLdE8MLts0A0eez8L5XXdvUCe05lWnWfVp3C6JJ8Yrmt2"
    "@46.225.100.82:54321/postgres?sslmode=require",
)
# Local development database (non-sensitive default).
LOCAL_URL = os.environ.get(
    "LOCAL_DATABASE_URL",
    "postgresql://breakpilot:breakpilot123@localhost:5432/breakpilot_db",
)

# All synced tables live in this schema on both sides.
SCHEMA = "compliance"

# Tables to sync (production → local)
SYNC_TABLES = [
    "canonical_control_frameworks",
    "canonical_control_licenses",
    "canonical_control_sources",
    "canonical_control_categories",
    "canonical_blocked_sources",
    "canonical_controls",
    "canonical_control_mappings",
    "canonical_processed_chunks",
    "canonical_generation_jobs",
    "control_patterns",
    "crosswalk_matrix",
    "obligation_extractions",
    "obligation_candidates",
]
|
||||
|
||||
|
||||
def connect(url, label="DB"):
    """Open a psycopg2 connection to *url* with TCP keepalives enabled.

    The search_path is pinned to the sync schema, sslmode is taken from
    the URL query string (default "prefer"), and autocommit stays off so
    callers control transaction boundaries. *label* is only used in the
    connected message.
    """
    parsed = urllib.parse.urlparse(url)
    query_params = dict(urllib.parse.parse_qsl(parsed.query))
    port = parsed.port or 5432
    connection = psycopg2.connect(
        host=parsed.hostname,
        port=port,
        user=parsed.username,
        password=parsed.password,
        dbname=parsed.path.lstrip("/"),
        sslmode=query_params.get("sslmode", "prefer"),
        options=f"-c search_path={SCHEMA},public",
        keepalives=1,
        keepalives_idle=30,
        keepalives_interval=10,
        keepalives_count=5,
    )
    connection.autocommit = False
    print(f" Connected to {label} ({parsed.hostname}:{port})")
    return connection
|
||||
|
||||
|
||||
def get_columns(cur, table):
    """Return the column names of ``SCHEMA.table`` in ordinal order.

    Uses a parameterized query instead of interpolating the table name
    into the SQL string (the previous f-string form was injection-prone
    and broke on names containing quotes).
    """
    cur.execute(
        """
        SELECT column_name FROM information_schema.columns
        WHERE table_schema = %s AND table_name = %s
        ORDER BY ordinal_position
        """,
        (SCHEMA, table),
    )
    return [r[0] for r in cur.fetchall()]
|
||||
|
||||
|
||||
def pull_table(prod_conn, local_conn, table):
    """Copy an entire table from production to local via SELECT + INSERT.

    Drops and recreates the local table with a constraint-free schema
    derived from production's information_schema, then bulk-inserts all
    rows. Returns the number of rows copied (0 if the table is missing
    on production).
    """
    prod_cur = prod_conn.cursor()
    local_cur = local_conn.cursor()

    # Check table exists on production (parameterized — no f-string SQL).
    prod_cur.execute(
        "SELECT 1 FROM pg_tables WHERE schemaname = %s AND tablename = %s",
        (SCHEMA, table),
    )
    if not prod_cur.fetchone():
        print(f" SKIP {table} — not found on production")
        return 0

    # Drop local table (table names come from the internal SYNC_TABLES list).
    local_cur.execute(f"DROP TABLE IF EXISTS {SCHEMA}.{table} CASCADE")
    local_conn.commit()

    # Build simple CREATE TABLE (no constraints, no defaults — just for data)
    prod_cur.execute(
        """
        SELECT column_name, data_type, udt_name, character_maximum_length
        FROM information_schema.columns
        WHERE table_schema = %s AND table_name = %s
        ORDER BY ordinal_position
        """,
        (SCHEMA, table),
    )
    col_defs = prod_cur.fetchall()

    parts = []
    col_names = []
    jsonb_cols = set()
    for name, dtype, udt, max_len in col_defs:
        col_names.append(name)
        if dtype == "ARRAY":
            type_map = {
                "_text": "text[]", "_varchar": "varchar[]",
                "_int4": "integer[]", "_uuid": "uuid[]",
                "_jsonb": "jsonb[]", "_float8": "float8[]",
            }
            sql_type = type_map.get(udt, f"{udt.lstrip('_')}[]")
        elif dtype == "USER-DEFINED" and udt == "jsonb":
            sql_type = "jsonb"
            jsonb_cols.add(name)
        elif dtype == "USER-DEFINED":
            sql_type = udt
        elif dtype == "jsonb":
            sql_type = "jsonb"
            jsonb_cols.add(name)
        elif max_len:
            # BUG FIX: was the hard-coded literal "(72,222)", which produced
            # invalid/wrong column types; use the column's actual length.
            sql_type = f"{dtype}({max_len})"
        else:
            sql_type = dtype
        parts.append(f'"{name}" {sql_type}')

    ddl = f"CREATE TABLE {SCHEMA}.{table} ({', '.join(parts)})"
    local_cur.execute(ddl)
    local_conn.commit()

    # Fetch all rows from production
    col_list = ", ".join(f'"{c}"' for c in col_names)
    prod_cur.execute(f"SELECT {col_list} FROM {SCHEMA}.{table}")
    rows = prod_cur.fetchall()

    if rows:
        # Wrap dict/list values in Json for JSONB columns
        adapted_rows = []
        for row in rows:
            adapted = []
            for i, val in enumerate(row):
                if col_names[i] in jsonb_cols and isinstance(val, (dict, list)):
                    adapted.append(psycopg2.extras.Json(val))
                else:
                    adapted.append(val)
            adapted_rows.append(tuple(adapted))

        placeholders = ", ".join(["%s"] * len(col_names))
        insert_sql = f'INSERT INTO {SCHEMA}.{table} ({col_list}) VALUES ({placeholders})'
        psycopg2.extras.execute_batch(local_cur, insert_sql, adapted_rows, page_size=500)
        local_conn.commit()

    print(f" {table}: {len(rows)} rows")
    return len(rows)
|
||||
|
||||
|
||||
def pull(tables=None):
    """Full sync: production → local.

    Copies every table in *tables* (default: SYNC_TABLES) via pull_table,
    rolling back and continuing on per-table failures.
    """
    print("\n=== PULL: Production → Local ===\n")

    prod_conn = connect(PROD_URL, "Production")
    local_conn = connect(LOCAL_URL, "Local")

    # Make sure the target schema exists before copying anything into it.
    schema_cur = local_conn.cursor()
    schema_cur.execute(f"CREATE SCHEMA IF NOT EXISTS {SCHEMA}")
    local_conn.commit()

    grand_total = 0
    for table in (tables or SYNC_TABLES):
        try:
            grand_total += pull_table(prod_conn, local_conn, table)
        except Exception as exc:
            # One broken table must not abort the whole sync.
            print(f" ERROR {table}: {exc}")
            local_conn.rollback()
            prod_conn.rollback()

    print(f"\n Total: {grand_total} rows synced")
    prod_conn.close()
    local_conn.close()
|
||||
|
||||
|
||||
def push():
    """Incremental push: new obligation_candidates local → production.

    Diffs both sides by candidate_id, copies only the rows missing on
    production, and returns the number of rows pushed.
    """
    print(f"\n=== PUSH: Local → Production ({time.strftime('%H:%M:%S')}) ===\n")

    local_conn = connect(LOCAL_URL, "Local")
    prod_conn = connect(PROD_URL, "Production")

    local_cur = local_conn.cursor()
    prod_cur = prod_conn.cursor()

    # Find obligation_candidates in local that don't exist in production.
    # candidate_id is the unique key.
    local_cur.execute(f"""
        SELECT candidate_id FROM {SCHEMA}.obligation_candidates
    """)
    local_ids = {r[0] for r in local_cur.fetchall()}

    if not local_ids:
        print(" No obligation_candidates in local DB")
        local_conn.close()
        prod_conn.close()
        return 0

    # Check which already exist on production
    prod_cur.execute(f"""
        SELECT candidate_id FROM {SCHEMA}.obligation_candidates
    """)
    prod_ids = {r[0] for r in prod_cur.fetchall()}

    new_ids = local_ids - prod_ids
    if not new_ids:
        print(f" All {len(local_ids)} obligations already on production")
        local_conn.close()
        prod_conn.close()
        return 0

    print(f" {len(new_ids)} new obligations to push (local: {len(local_ids)}, prod: {len(prod_ids)})")

    # Get columns
    columns = get_columns(local_cur, "obligation_candidates")
    col_list = ", ".join(columns)
    placeholders = ", ".join(["%s"] * len(columns))

    # BUG FIX: previously the ids were hand-quoted into an IN (...) list,
    # which breaks on embedded quotes and is injection-prone. Pass the id
    # set as a parameterized array instead.
    local_cur.execute(
        f"""
        SELECT {col_list} FROM {SCHEMA}.obligation_candidates
        WHERE candidate_id = ANY(%s)
        """,
        (list(new_ids),),
    )
    rows = local_cur.fetchall()

    # Insert into production
    insert_sql = f"INSERT INTO {SCHEMA}.obligation_candidates ({col_list}) VALUES ({placeholders}) ON CONFLICT DO NOTHING"
    psycopg2.extras.execute_batch(prod_cur, insert_sql, rows, page_size=100)
    prod_conn.commit()

    print(f" Pushed {len(rows)} obligations to production")

    local_conn.close()
    prod_conn.close()
    return len(rows)
|
||||
|
||||
|
||||
def loop(interval_min):
    """Run push() every *interval_min* minutes until interrupted.

    Errors from a single push are printed and swallowed so the loop
    keeps running; Ctrl+C (KeyboardInterrupt) still exits.
    """
    print(f"\n=== SYNC LOOP — Push every {interval_min} min ===")
    print(f" Started at {time.strftime('%Y-%m-%d %H:%M:%S')}")
    print(" Press Ctrl+C to stop\n")

    while True:
        try:
            if push():
                print(f" Next sync in {interval_min} min...")
        except Exception as exc:
            print(f" SYNC ERROR: {exc}")
        time.sleep(interval_min * 60)
|
||||
|
||||
|
||||
def main():
    """CLI entry point: dispatch --pull / --push / --loop modes."""
    parser = argparse.ArgumentParser(description="Sync canonical control tables")
    parser.add_argument("--pull", action="store_true", help="Production → Local (full copy)")
    parser.add_argument("--push", action="store_true", help="Local → Production (new obligations)")
    parser.add_argument("--loop", type=int, metavar="MIN", help="Push every N minutes")
    parser.add_argument("--tables", nargs="+", help="Only sync specific tables (with --pull)")
    args = parser.parse_args()

    # No mode selected → show usage instead of silently doing nothing.
    if not (args.pull or args.push or args.loop):
        parser.print_help()
        return

    if args.pull:
        pull(args.tables)
    if args.push:
        push()
    if args.loop:
        loop(args.loop)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
470
scripts/qa/test_pass0a.py
Normal file
470
scripts/qa/test_pass0a.py
Normal file
@@ -0,0 +1,470 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Test Pass 0a (Obligation Extraction) on 5-10 controls.
|
||||
|
||||
Standalone script — no SQLAlchemy dependency. Uses psycopg2 + requests.
|
||||
Copies prompts and quality gate from decomposition_pass.py.
|
||||
|
||||
Usage:
|
||||
python3 test_pass0a.py # 10 controls, Anthropic
|
||||
python3 test_pass0a.py --limit 5 # 5 controls
|
||||
python3 test_pass0a.py --source "DSGVO" # filter by source
|
||||
python3 test_pass0a.py --dry-run # show controls, no LLM call
|
||||
"""
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
import urllib.parse
|
||||
|
||||
import psycopg2
|
||||
import requests
|
||||
|
||||
# ── Config ────────────────────────────────────────────────────────────
|
||||
# API key is required for live runs; --dry-run works without it.
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
# Model can be overridden to match the production decomposition pass.
ANTHROPIC_MODEL = os.environ.get("DECOMPOSITION_LLM_MODEL", "claude-sonnet-4-6")
ANTHROPIC_API_URL = "https://api.anthropic.com/v1"

# ── Prompts (from decomposition_pass.py) ──────────────────────────────

# German-language system prompt instructing the LLM to decompose a control
# into atomic normative obligations. Kept identical to the production
# prompt so test results are comparable.
SYSTEM_PROMPT = """\
Du bist ein Rechts-Compliance-Experte. Du zerlegst Compliance-Controls \
in einzelne atomare Pflichten.

REGELN (STRIKT EINHALTEN):
1. Nur normative Aussagen extrahieren — erkennbar an: müssen, haben \
sicherzustellen, sind verpflichtet, ist zu dokumentieren, ist zu melden, \
ist zu testen, shall, must, required.
2. Jede Pflicht hat genau EIN Hauptverb / eine Handlung.
3. Testpflichten SEPARAT von operativen Pflichten (is_test_obligation=true).
4. Meldepflichten SEPARAT (is_reporting_obligation=true).
5. NICHT auf Evidence-Ebene zerlegen (z.B. "DR-Plan vorhanden" ist KEIN \
eigenes Control, sondern Evidence).
6. Begründungen, Erläuterungen und Erwägungsgründe sind KEINE Pflichten \
— NICHT extrahieren.

Antworte NUR mit einem JSON-Array. Keine Erklärungen."""
|
||||
|
||||
|
||||
def build_prompt(title, objective, requirements, test_procedure, source_ref):
    """Build the per-control user prompt for obligation extraction.

    All arguments are interpolated verbatim into a German prompt that asks
    the model to answer with a JSON array of obligation objects (the
    doubled braces below are literal JSON braces in the f-string).
    """
    return f"""\
Analysiere das folgende Control und extrahiere alle einzelnen normativen \
Pflichten als JSON-Array.

CONTROL:
Titel: {title}
Ziel: {objective}
Anforderungen: {requirements}
Prüfverfahren: {test_procedure}
Quellreferenz: {source_ref}

Antworte als JSON-Array:
[
  {{
    "obligation_text": "Kurze, präzise Formulierung der Pflicht",
    "action": "Hauptverb/Handlung",
    "object": "Gegenstand der Pflicht",
    "condition": "Auslöser/Bedingung oder null",
    "normative_strength": "must",
    "is_test_obligation": false,
    "is_reporting_obligation": false
  }}
]"""
|
||||
|
||||
|
||||
# ── Quality Gate — 3-Tier Classification (from decomposition_pass.py) ──
|
||||
|
||||
# The following regexes classify German/English normative language by tier.
# They are word-boundary based and case-insensitive; copied from
# decomposition_pass.py so the quality gate matches production behavior.

# Tier 1: Pflicht (mandatory)
_PFLICHT_RE = re.compile(
    r"\bmüssen\b|\bmuss\b|\bhat\s+sicherzustellen\b|\bhaben\s+sicherzustellen\b"
    r"|\bsind\s+verpflichtet\b|\bist\s+verpflichtet\b"
    r"|\bist\s+zu\s+\w+en\b|\bsind\s+zu\s+\w+en\b"
    r"|\bhat\s+zu\s+\w+en\b|\bhaben\s+zu\s+\w+en\b"
    r"|\bist\s+\w+zu\w+en\b|\bsind\s+\w+zu\w+en\b"
    r"|\bist\s+\w+\s+zu\s+\w+en\b|\bsind\s+\w+\s+zu\s+\w+en\b"
    r"|\bhat\s+\w+\s+zu\s+\w+en\b|\bhaben\s+\w+\s+zu\s+\w+en\b"
    r"|\bshall\b|\bmust\b|\brequired\b"
    r"|\b\w+zuteilen\b|\b\w+zuwenden\b|\b\w+zustellen\b|\b\w+zulegen\b"
    r"|\b\w+zunehmen\b|\b\w+zuführen\b|\b\w+zuhalten\b|\b\w+zusetzen\b"
    r"|\b\w+zuweisen\b|\b\w+zuordnen\b|\b\w+zufügen\b|\b\w+zugeben\b"
    r"|\bist\b.{1,80}\bzu\s+\w+en\b|\bsind\b.{1,80}\bzu\s+\w+en\b",
    re.IGNORECASE,
)
# Tier 2: Empfehlung (recommendation)
_EMPFEHLUNG_RE = re.compile(
    r"\bsoll\b|\bsollen\b|\bsollte\b|\bsollten\b"
    r"|\bgewährleisten\b|\bsicherstellen\b"
    r"|\bshould\b|\bensure\b|\brecommend\w*\b"
    r"|\bnachweisen\b|\beinhalten\b|\bunterlassen\b|\bwahren\b"
    r"|\bdokumentieren\b|\bimplementieren\b|\büberprüfen\b|\büberwachen\b"
    r"|\bprüfen,\s+ob\b|\bkontrollieren,\s+ob\b",
    re.IGNORECASE,
)
# Tier 3: Kann (optional/permissive)
_KANN_RE = re.compile(
    r"\bkann\b|\bkönnen\b|\bdarf\b|\bdürfen\b|\bmay\b|\boptional\b",
    re.IGNORECASE,
)
# Union (backward compat) — any normative signal at all.
_NORMATIVE_RE = re.compile(
    _PFLICHT_RE.pattern + "|" + _EMPFEHLUNG_RE.pattern + "|" + _KANN_RE.pattern,
    re.IGNORECASE,
)
# Rationale/justification markers (texts that explain rather than oblige).
_RATIONALE_RE = re.compile(
    r"\bda\s+|\bweil\b|\bgrund\b|\berwägung|\bbecause\b|\breason\b|\brationale\b",
    re.IGNORECASE,
)
# Test-obligation markers.
_TEST_RE = re.compile(
    r"\btesten\b|\btest\b|\bprüfung\b|\bprüfen\b|\bgetestet\b|\bwirksamkeit\b"
    r"|\baudit\b|\bregelmäßig\b.*\b(prüf|test|kontroll)|\beffectiveness\b|\bverif",
    re.IGNORECASE,
)
# Reporting-obligation markers.
_REPORTING_RE = re.compile(
    r"\bmelden\b|\bmeldung\b|\bunterricht|\binformieren\b|\bbenachricht"
    r"|\bnotif|\breport\b|\bbehörd",
    re.IGNORECASE,
)
|
||||
|
||||
|
||||
def classify_obligation_type(txt):
    """Return the obligation tier for *txt*.

    Tiers are checked in strictness order (pflicht > empfehlung > kann);
    text matching no tier defaults to "empfehlung".
    """
    tier_order = (
        (_PFLICHT_RE, "pflicht"),
        (_EMPFEHLUNG_RE, "empfehlung"),
        (_KANN_RE, "kann"),
    )
    for pattern, tier in tier_order:
        if pattern.search(txt):
            return tier
    return "empfehlung"
|
||||
|
||||
|
||||
def quality_gate(obl_text, parent_uuid):
    """Validate + classify obligation. Returns (flags_dict, passed_bool, confidence, obligation_type)."""
    stripped = obl_text.strip()
    obl_type = classify_obligation_type(obl_text)

    # Multiple coordinated verbs suggest the text bundles several actions.
    multi_verb_pat = re.compile(
        r"\b(und|sowie|als auch)\b.*\b(müssen|sicherstellen|implementieren"
        r"|dokumentieren|melden|testen|prüfen|überwachen|gewährleisten)\b",
        re.IGNORECASE,
    )
    # Text that *starts* with an evidence noun is an artifact list, not a duty.
    evidence_only_pat = re.compile(
        r"^(Nachweis|Dokumentation|Screenshot|Protokoll|Bericht|Zertifikat)",
        re.IGNORECASE,
    )

    # Flag insertion order is part of the contract (callers report failed
    # flags in this order), so keep it identical to the legacy layout.
    flags = {
        # informational only — no longer a pass/fail criterion
        "has_normative_signal": _NORMATIVE_RE.search(obl_text) is not None,
        "obligation_type": obl_type,
        "single_action": multi_verb_pat.search(obl_text) is None,
        # more normative than rationale wording → treated as an obligation
        "not_rationale": (
            len(_NORMATIVE_RE.findall(obl_text))
            >= len(_RATIONALE_RE.findall(obl_text))
        ),
        "not_evidence_only": evidence_only_pat.match(stripped) is None,
        "min_length": len(stripped) >= 20,
        "has_parent_link": bool(parent_uuid),
    }

    # Weighted confidence over the boolean flags (obligation_type excluded).
    weights = {
        "has_normative_signal": 0.25, "single_action": 0.20,
        "not_rationale": 0.20, "not_evidence_only": 0.15,
        "min_length": 0.10, "has_parent_link": 0.05,
    }
    confidence = sum(weight for key, weight in weights.items() if flags[key])
    # Small bonus for hard obligations, capped at 1.0.
    if obl_type == "pflicht":
        confidence = min(confidence + 0.05, 1.0)

    # Only structural criteria are critical; normative signal is advisory.
    passed = all(flags[key] for key in ("not_evidence_only", "min_length", "has_parent_link"))

    return flags, passed, confidence, obl_type
|
||||
|
||||
|
||||
# ── JSON parsing ──────────────────────────────────────────────────────
|
||||
|
||||
def parse_json_array(text):
    """Best-effort parse of an LLM reply into a list.

    A top-level JSON array is returned as-is and a lone object is wrapped
    in a one-element list. If the whole text does not parse, the outermost
    bracketed span is retried. Anything else yields [].
    """
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        parsed = None
    if isinstance(parsed, list):
        return parsed
    if isinstance(parsed, dict):
        return [parsed]

    # Fallback: salvage the first "[ ... ]" span from surrounding prose.
    bracketed = re.search(r"\[[\s\S]*\]", text)
    if bracketed is not None:
        try:
            candidate = json.loads(bracketed.group())
        except json.JSONDecodeError:
            return []
        if isinstance(candidate, list):
            return candidate
    return []
|
||||
|
||||
|
||||
# ── API call ──────────────────────────────────────────────────────────
|
||||
|
||||
def call_anthropic(prompt):
    """POST *prompt* to the Anthropic Messages API.

    Returns a (text, usage, error) triple: the first content block's text
    and the usage dict on success (error is None), or (None, {}, message)
    on a non-200 response.
    """
    headers = {
        "x-api-key": ANTHROPIC_API_KEY,
        "anthropic-version": "2023-06-01",
        "content-type": "application/json",
    }
    payload = {
        "model": ANTHROPIC_MODEL,
        "max_tokens": 8192,
        # System prompt marked ephemeral so repeated calls hit the prompt cache.
        "system": [{"type": "text", "text": SYSTEM_PROMPT, "cache_control": {"type": "ephemeral"}}],
        "messages": [{"role": "user", "content": prompt}],
    }
    resp = requests.post(f"{ANTHROPIC_API_URL}/messages", headers=headers, json=payload, timeout=120)
    if resp.status_code != 200:
        # Truncate the body so one bad response can't flood the log.
        return None, {}, f"HTTP {resp.status_code}: {resp.text[:200]}"
    data = resp.json()
    usage = data.get("usage", {})
    content = data.get("content", [])
    # Guarded: the API may return an empty content list.
    text = content[0].get("text", "") if content else ""
    return text, usage, None
|
||||
|
||||
|
||||
# ── Format helpers ────────────────────────────────────────────────────
|
||||
|
||||
def fmt_json(val):
    """Render a JSON-ish column value for prompt embedding.

    None becomes "", JSON-encoded strings are decoded first, lists become
    one "  - item" bullet per line, and anything else is str()'d.
    Non-JSON strings pass through unchanged.
    """
    if val is None:
        return ""
    decoded = val
    if isinstance(decoded, str):
        try:
            decoded = json.loads(decoded)
        except (json.JSONDecodeError, TypeError):
            return decoded
    if not isinstance(decoded, list):
        return str(decoded)
    return "\n".join(f"  - {item}" for item in decoded)
|
||||
|
||||
|
||||
# ── Main ──────────────────────────────────────────────────────────────
|
||||
|
||||
def main():
    """Run Pass 0a on a small sample of canonical controls.

    Selects up to --limit parent controls from the database, asks the LLM to
    extract atomic obligations from each, classifies every obligation into
    the 3-tier scheme (pflicht/empfehlung/kann), prints a cost/classification
    summary, and dumps the raw results to /tmp for later analysis.
    """
    parser = argparse.ArgumentParser(description="Test Pass 0a on small sample")
    parser.add_argument("--limit", type=int, default=10)
    parser.add_argument("--source", type=str)
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    # The key is only needed when we actually call the API.
    if not ANTHROPIC_API_KEY and not args.dry_run:
        print("ERROR: Set ANTHROPIC_API_KEY")
        sys.exit(1)

    db_url = os.environ["DATABASE_URL"]
    p = urllib.parse.urlparse(db_url)
    conn = psycopg2.connect(
        host=p.hostname, port=p.port or 5432,
        user=p.username, password=p.password,
        dbname=p.path.lstrip("/"),
        options="-c search_path=compliance,public",
    )
    cur = conn.cursor()

    # Select a diverse sample: live top-level controls with enough text to
    # be worth decomposing.
    query = """
        SELECT id, control_id, title, objective, requirements,
               test_procedure, source_citation, category
        FROM compliance.canonical_controls
        WHERE release_state NOT IN ('deprecated', 'duplicate', 'too_close')
          AND parent_control_uuid IS NULL
          AND title IS NOT NULL AND objective IS NOT NULL
          AND length(coalesce(objective,'') || coalesce(requirements::text,'')) > 100
    """
    params = []
    if args.source:
        query += " AND source_citation->>'source' ILIKE %s"
        params.append(f"%{args.source}%")

    # Group by source, shuffle within each source.
    query += " ORDER BY source_citation->>'source', random()"
    query += f" LIMIT {args.limit}"

    cur.execute(query, params)
    controls = cur.fetchall()

    if not controls:
        print("No controls found.")
        return

    print(f"{'='*70}")
    print(f"Pass 0a Test — {len(controls)} Controls")
    print(f"Model: {ANTHROPIC_MODEL}")
    print(f"{'='*70}")

    total_in = total_out = total_obls = 0
    type_counts = {"pflicht": 0, "empfehlung": 0, "kann": 0}
    total_rejected = 0  # only evidence-only / too-short / no-parent
    all_results = []
    t_start = time.time()

    for i, row in enumerate(controls, 1):
        ctrl_uuid, ctrl_id, title, objective, reqs, test_proc, src_cit, category = row

        req_str = fmt_json(reqs)
        test_str = fmt_json(test_proc)
        source_str = ""
        if src_cit:
            # source_citation may arrive as dict (jsonb) or as a JSON string.
            sc = src_cit if isinstance(src_cit, dict) else json.loads(src_cit)
            source_str = f"{sc.get('source', '')} {sc.get('article', '')}"

        print(f"\n{'─'*70}")
        print(f"[{i}/{len(controls)}] {ctrl_id}: {title}")
        print(f"  Source: {source_str} | Category: {category or 'N/A'}")
        print(f"  Objective: {(objective or '')[:200]}")

        if args.dry_run:
            print("  [DRY RUN]")
            continue

        prompt = build_prompt(title or "", objective or "", req_str, test_str, source_str)

        t0 = time.time()
        response_text, usage, error = call_anthropic(prompt)
        elapsed = time.time() - t0

        if error:
            print(f"  ERROR: {error}")
            continue

        in_tok = usage.get("input_tokens", 0)
        out_tok = usage.get("output_tokens", 0)
        cached = usage.get("cache_read_input_tokens", 0)
        total_in += in_tok
        total_out += out_tok

        obligations = parse_json_array(response_text)
        total_obls += len(obligations)

        print(f"  API: {elapsed:.1f}s | {in_tok} in / {out_tok} out"
              f"{f' ({cached} cached)' if cached else ''}"
              f" | {len(obligations)} obligation(s)")

        for j, obl in enumerate(obligations, 1):
            obl_text = obl.get("obligation_text", "")
            action = obl.get("action", "")
            obj = obl.get("object", "")
            condition = obl.get("condition")
            strength = obl.get("normative_strength", "must")
            is_test = bool(obl.get("is_test_obligation", False))
            is_report = bool(obl.get("is_reporting_obligation", False))

            # Auto-detect test/reporting obligations the LLM missed.
            if not is_test and _TEST_RE.search(obl_text):
                is_test = True
            if not is_report and _REPORTING_RE.search(obl_text):
                is_report = True

            flags, passed, conf, obl_type = quality_gate(obl_text, str(ctrl_uuid))
            if passed:
                type_counts[obl_type] = type_counts.get(obl_type, 0) + 1
            else:
                total_rejected += 1

            tag = ""
            if is_test:
                tag = " [TEST]"
            elif is_report:
                tag = " [MELDEPFLICHT]"

            # Show the tier instead of PASS/REJECT.
            type_label = {"pflicht": "PFLICHT", "empfehlung": "EMPFEHLUNG", "kann": "KANN"}
            if not passed:
                status = "REJECT"
            else:
                status = type_label.get(obl_type, "EMPFEHLUNG")

            # Only boolean flags can "fail"; obligation_type is a string.
            failed = [k for k, v in flags.items()
                      if isinstance(v, bool) and not v]

            print(f"\n  {j}. [{status}] conf={conf:.0%}{tag} strength={strength}")
            print(f"     {obl_text}")
            print(f"     Handlung: {action} | Gegenstand: {obj}")
            if condition:
                print(f"     Bedingung: {condition}")
            if not passed:
                print(f"     Abgelehnt: {', '.join(failed)}")

            all_results.append({
                "control_id": ctrl_id,
                "obligation_text": obl_text,
                "obligation_type": obl_type if passed else "rejected",
                "action": action,
                "object": obj,
                "condition": condition,
                "confidence": round(conf, 2),
                "is_test": is_test,
                "is_reporting": is_report,
                "passed": passed,
                "flags": dict(flags),
            })

        # Light rate limiting between API calls.
        time.sleep(0.5)

    # ── Summary ──────────────────────────────────────────────────────
    elapsed_total = time.time() - t_start
    # Sonnet-class pricing: $3/M input tokens, $15/M output tokens.
    cost = (total_in * 3 + total_out * 15) / 1_000_000

    print(f"\n\n{'='*70}")
    print(f"ZUSAMMENFASSUNG — 3-Tier-Klassifizierung")
    print(f"{'='*70}")
    print(f"  Controls:    {len(controls)}")
    print(f"  Obligations: {total_obls} ({total_obls/max(len(controls),1):.1f} pro Control)")
    print(f"  ── Klassifizierung ──")
    print(f"  Pflicht:     {type_counts['pflicht']}"
          f" ({type_counts['pflicht']*100/max(total_obls,1):.0f}%)")
    print(f"  Empfehlung:  {type_counts['empfehlung']}"
          f" ({type_counts['empfehlung']*100/max(total_obls,1):.0f}%)")
    print(f"  Kann:        {type_counts['kann']}"
          f" ({type_counts['kann']*100/max(total_obls,1):.0f}%)")
    print(f"  Rejected:    {total_rejected}"
          f" ({total_rejected*100/max(total_obls,1):.0f}%)"
          f" (nur evidence-only/zu kurz/kein parent)")
    print(f"  ── Kosten ──")
    print(f"  Laufzeit: {elapsed_total:.1f}s")
    print(f"  Tokens:   {total_in:,} in / {total_out:,} out")
    print(f"  Kosten:   ${cost:.4f}")

    # Extrapolate sample cost/volume to the full control corpus.
    if len(controls) > 0 and not args.dry_run and total_obls > 0:
        n = 6000
        factor = n / len(controls)
        print(f"\n  --- Hochrechnung auf {n:,} Controls ---")
        print(f"  Tokens: {int(total_in * factor):,} in / {int(total_out * factor):,} out")
        print(f"  Kosten: ${cost * factor:.2f}")
        print(f"  Laufzeit: {elapsed_total * factor / 3600:.1f}h")
        print(f"  Obligations: ~{int(total_obls / len(controls) * n):,}")
        pf = int(type_counts['pflicht'] * factor)
        ef = int(type_counts['empfehlung'] * factor)
        kf = int(type_counts['kann'] * factor)
        print(f"  Pflicht: ~{pf:,}")
        print(f"  Empfehlung: ~{ef:,}")
        print(f"  Kann: ~{kf:,}")

    # Save results JSON for later analysis (input for Pass 0b preview).
    if all_results:
        out_path = f"/tmp/pass0a_results_{len(controls)}controls.json"
        with open(out_path, "w") as f:
            json.dump(all_results, f, ensure_ascii=False, indent=2)
        print(f"\n  Ergebnisse gespeichert: {out_path}")

    conn.close()
|
||||
|
||||
|
||||
# Script entry point; allows importing the module without side effects.
if __name__ == "__main__":
    main()
|
||||
308
scripts/qa/test_pass0b_preview.py
Normal file
308
scripts/qa/test_pass0b_preview.py
Normal file
@@ -0,0 +1,308 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Preview Pass 0b: Turn obligation candidates into atomic controls.
|
||||
|
||||
Picks a few obligations from Pass 0a results, calls LLM to compose
|
||||
atomic controls, and writes them to canonical_controls with parent_control_uuid.
|
||||
|
||||
Usage:
|
||||
python3 test_pass0b_preview.py --input /tmp/pass0a_results_60controls.json --limit 3
|
||||
"""
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
import uuid
|
||||
import urllib.parse
|
||||
|
||||
import psycopg2
|
||||
import psycopg2.extras
|
||||
import requests
|
||||
|
||||
# Register a psycopg2 adapter so plain dicts are serialized as JSON when
# passed as query parameters.
psycopg2.extensions.register_adapter(dict, psycopg2.extras.Json)

# API credentials/model from the environment. The key may be empty; main()
# only tolerates that in --dry-run mode.
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
ANTHROPIC_MODEL = os.environ.get("DECOMPOSITION_LLM_MODEL", "claude-sonnet-4-6")

# German on purpose: the generated controls are German-language artifacts.
SYSTEM_PROMPT = """\
Du bist ein Security-Compliance-Experte. Du erstellst aus einer einzelnen \
normativen Pflicht ein praxisorientiertes, atomares Security Control.

Das Control muss UMSETZBAR sein — keine Gesetzesparaphrase.
Antworte NUR als JSON. Keine Erklärungen."""
|
||||
|
||||
|
||||
def build_pass0b_prompt(obl_text, action, obj, parent_title, category, source_ref):
    """Build the German user prompt that turns one obligation into an atomic control.

    obl_text/action/obj come from a Pass 0a obligation record; parent_title,
    category and source_ref describe the originating control. The prompt
    requests a single JSON object with title, objective, requirements,
    test_procedure, evidence, severity and category.
    """
    return f"""\
Erstelle aus der folgenden Pflicht ein atomares Control.

PFLICHT: {obl_text}
HANDLUNG: {action}
GEGENSTAND: {obj}

KONTEXT (Ursprungs-Control):
Titel: {parent_title}
Kategorie: {category}
Quellreferenz: {source_ref}

Antworte als JSON:
{{
  "title": "Kurzer Titel (max 80 Zeichen, deutsch)",
  "objective": "Was muss erreicht werden? (1-2 Sätze)",
  "requirements": ["Konkrete Anforderung 1", "Anforderung 2"],
  "test_procedure": ["Prüfschritt 1", "Prüfschritt 2"],
  "evidence": ["Nachweis 1", "Nachweis 2"],
  "severity": "critical|high|medium|low",
  "category": "security|privacy|governance|operations|finance|reporting"
}}"""
|
||||
|
||||
|
||||
def call_anthropic(prompt):
    """POST *prompt* to the Anthropic Messages API.

    Returns (text, usage, error): the first content block's text and the
    usage dict on success (error is None), or (None, {}, message) on a
    non-200 response.
    """
    headers = {
        "x-api-key": ANTHROPIC_API_KEY,
        "anthropic-version": "2023-06-01",
        "content-type": "application/json",
    }
    payload = {
        "model": ANTHROPIC_MODEL,
        "max_tokens": 4096,
        # System prompt marked ephemeral so repeated calls hit the prompt cache.
        "system": [{"type": "text", "text": SYSTEM_PROMPT, "cache_control": {"type": "ephemeral"}}],
        "messages": [{"role": "user", "content": prompt}],
    }
    resp = requests.post("https://api.anthropic.com/v1/messages", headers=headers, json=payload, timeout=120)
    if resp.status_code != 200:
        return None, {}, f"HTTP {resp.status_code}: {resp.text[:200]}"
    data = resp.json()
    # Fix: data.get("content", [{}])[0] raised IndexError when the API
    # returned an empty content list; guard explicitly instead.
    content = data.get("content", [])
    text = content[0].get("text", "") if content else ""
    return text, data.get("usage", {}), None
|
||||
|
||||
|
||||
def parse_json_object(text):
    """Best-effort parse of an LLM reply into a JSON value.

    Tries the whole text first; on failure retries the outermost braced
    span. Returns None when neither attempt parses.
    """
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass
    braced = re.search(r"\{[\s\S]*\}", text)
    if braced is None:
        return None
    try:
        return json.loads(braced.group())
    except json.JSONDecodeError:
        return None
|
||||
|
||||
|
||||
def generate_control_id(domain, cur):
    """Allocate the next sequential control_id for *domain*'s prefix.

    The prefix is the first four characters of *domain*, upper-cased.
    Queries the current numeric maximum of existing "<PREFIX>-<n>" ids and
    returns the successor, zero-padded to three digits so new ids match the
    seeded "<PREFIX>-001" convention and sort lexically.
    """
    prefix = domain.upper()[:4]
    cur.execute("""
        SELECT MAX(CAST(SPLIT_PART(control_id, '-', 2) AS INTEGER))
        FROM compliance.canonical_controls
        WHERE control_id LIKE %s
          AND SPLIT_PART(control_id, '-', 2) ~ '^[0-9]+$'
    """, (f"{prefix}-%",))
    row = cur.fetchone()
    if row and row[0] is not None:
        # Fix: previously unpadded (e.g. "PREF-8"), inconsistent with the
        # "PREF-001" seed format.
        return f"{prefix}-{row[0] + 1:03d}"
    return f"{prefix}-001"
|
||||
|
||||
|
||||
def main():
    """Preview Pass 0b: compose atomic controls from Pass 0a obligations.

    Loads Pass 0a results from --input, picks up to --limit obligations
    (favoring one per obligation type), asks the LLM to turn each into an
    atomic control, prints it, and inserts it as a draft row in
    canonical_controls linked to its parent control.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", default="/tmp/pass0a_results_60controls.json")
    parser.add_argument("--limit", type=int, default=3, help="Number of obligations to process")
    parser.add_argument("--control", type=str, help="Pick obligations from this control_id")
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    # API key only required when actually calling the LLM.
    if not ANTHROPIC_API_KEY and not args.dry_run:
        print("ERROR: Set ANTHROPIC_API_KEY")
        sys.exit(1)

    # Load Pass 0a results.
    with open(args.input) as f:
        obligations = json.load(f)

    # Keep only obligations that passed the Pass 0a quality gate.
    obligations = [o for o in obligations if o.get("passed", False)]

    if args.control:
        obligations = [o for o in obligations if o["control_id"] == args.control]

    # Pick a diverse sample: first one obligation per type, ...
    picked = []
    seen_types = set()
    for o in obligations:
        otype = o["obligation_type"]
        if otype not in seen_types and len(picked) < args.limit:
            picked.append(o)
            seen_types.add(otype)
    # ... then fill the remaining slots in input order.
    for o in obligations:
        if o not in picked and len(picked) < args.limit:
            picked.append(o)

    if not picked:
        print("No obligations found.")
        return

    # Connect to the database.
    db_url = os.environ["DATABASE_URL"]
    p = urllib.parse.urlparse(db_url)
    conn = psycopg2.connect(
        host=p.hostname, port=p.port or 5432,
        user=p.username, password=p.password,
        dbname=p.path.lstrip("/"),
        options="-c search_path=compliance,public",
    )
    cur = conn.cursor()

    # Fetch parent-control metadata for every picked obligation.
    ctrl_ids = list(set(o["control_id"] for o in picked))
    cur.execute("""
        SELECT control_id, id, title, category, source_citation
        FROM compliance.canonical_controls
        WHERE control_id = ANY(%s)
    """, (ctrl_ids,))
    ctrl_map = {}
    for row in cur.fetchall():
        # source_citation may arrive as a dict (jsonb), a JSON string, or NULL.
        sc = row[4] if isinstance(row[4], dict) else (json.loads(row[4]) if row[4] else {})
        # Derive domain prefix from control_id (e.g. "DSGV" from "DSGV-001")
        prefix = row[0].split("-")[0] if "-" in row[0] else "COMP"
        ctrl_map[row[0]] = {
            "uuid": str(row[1]), "title": row[2], "category": row[3] or "",
            "source_ref": f"{sc.get('source', '')} {sc.get('article', '')}",
            "domain": prefix,
        }

    print("=" * 70)
    print(f"Pass 0b Preview — {len(picked)} Obligations → Atomic Controls")
    print("=" * 70)

    created = []
    for i, obl in enumerate(picked, 1):
        ctrl = ctrl_map.get(obl["control_id"], {})
        print(f"\n{'─'*70}")
        print(f"[{i}/{len(picked)}] {obl['control_id']}: [{obl['obligation_type'].upper()}]")
        print(f"  Obligation: {obl['obligation_text'][:120]}")
        print(f"  Parent: {ctrl.get('title', 'N/A')}")

        if args.dry_run:
            print("  [DRY RUN]")
            continue

        prompt = build_pass0b_prompt(
            obl["obligation_text"], obl["action"], obl["object"],
            ctrl.get("title", ""), ctrl.get("category", ""),
            ctrl.get("source_ref", ""),
        )

        t0 = time.time()
        resp_text, usage, error = call_anthropic(prompt)
        elapsed = time.time() - t0

        if error:
            print(f"  ERROR: {error}")
            continue

        result = parse_json_object(resp_text)
        if not result:
            print(f"  PARSE ERROR: {resp_text[:200]}")
            continue

        in_tok = usage.get("input_tokens", 0)
        out_tok = usage.get("output_tokens", 0)
        print(f"  LLM: {elapsed:.1f}s | {in_tok} in / {out_tok} out")

        # Allocate the next control_id in the parent's domain prefix.
        domain = ctrl.get("domain", "COMP")
        new_control_id = generate_control_id(domain, cur)

        # Show the composed control before persisting it.
        print(f"\n  === ATOMIC CONTROL: {new_control_id} ===")
        print(f"  Titel: {result.get('title', 'N/A')}")
        print(f"  Ziel: {result.get('objective', 'N/A')}")
        print(f"  Typ: {obl['obligation_type']}")
        reqs = result.get("requirements", [])
        if reqs:
            print(f"  Anforderungen:")
            for r in reqs:
                print(f"    - {r}")
        tests = result.get("test_procedure", [])
        if tests:
            print(f"  Pruefverfahren:")
            for t in tests:
                print(f"    - {t}")
        evidence = result.get("evidence", [])
        if evidence:
            print(f"  Nachweise:")
            for e in evidence:
                print(f"    - {e}")
        print(f"  Severity: {result.get('severity', 'medium')}")
        print(f"  Category: {result.get('category', 'governance')}")

        # Persist as a draft control linked to its parent.
        new_uuid = str(uuid.uuid4())
        parent_uuid = ctrl.get("uuid")
        source_cit = {}
        if ctrl.get("source_ref"):
            # source_ref is "<source> <article>"; split once to rebuild the citation.
            parts = ctrl["source_ref"].strip().split(" ", 1)
            source_cit = {"source": parts[0], "article": parts[1] if len(parts) > 1 else ""}

        cur.execute("""
            INSERT INTO compliance.canonical_controls (
                id, control_id, title, objective, requirements, test_procedure,
                evidence, severity, category, release_state,
                source_citation, generation_metadata, generation_strategy,
                pipeline_version, parent_control_uuid, framework_id
            ) VALUES (
                %s, %s, %s, %s, %s, %s,
                %s, %s, %s, %s,
                %s, %s, %s,
                %s, %s,
                (SELECT id FROM compliance.canonical_control_frameworks LIMIT 1)
            )
        """, (
            new_uuid, new_control_id,
            result.get("title", ""),
            result.get("objective", ""),
            json.dumps(result.get("requirements", []), ensure_ascii=False),
            json.dumps(result.get("test_procedure", []), ensure_ascii=False),
            json.dumps(result.get("evidence", []), ensure_ascii=False),
            result.get("severity", "medium"),
            result.get("category", "governance"),
            "draft",
            psycopg2.extras.Json(source_cit),
            psycopg2.extras.Json({
                "obligation_type": obl["obligation_type"],
                "obligation_text": obl["obligation_text"],
                "pass0b_model": ANTHROPIC_MODEL,
                "decomposition_method": "pass0b_preview",
            }),
            "pass0b_atomic",
            6,  # pipeline_version
            parent_uuid,
        ))
        # Commit per row so earlier results survive a mid-run failure.
        conn.commit()

        created.append({
            "control_id": new_control_id,
            "title": result.get("title", ""),
            "obligation_type": obl["obligation_type"],
            "parent_control_id": obl["control_id"],
        })
        print(f"  ✓ Geschrieben: {new_control_id} (parent: {obl['control_id']})")

        # Light rate limiting between API calls.
        time.sleep(0.5)

    if created:
        print(f"\n{'='*70}")
        print(f"ERGEBNIS: {len(created)} atomare Controls erstellt")
        print(f"{'='*70}")
        for c in created:
            print(f"  {c['control_id']}: {c['title']} [{c['obligation_type']}] (von {c['parent_control_id']})")

    conn.close()
|
||||
|
||||
|
||||
# Script entry point; allows importing the module without side effects.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user