feat: Control Library UI, dedup migration, QA tooling, docs
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 31s
CI/CD / test-python-backend-compliance (push) Successful in 1m35s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 17s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped

- Control Library: parent control display, ObligationTypeBadge,
  GenerationStrategyBadge variants, evidence string fallback
- API: expose parent_control_uuid/id/title in canonical controls
- Fix: DSFA SQLAlchemy 2.0 Row._mapping compatibility
- Migration 074: control_parent_links + control_dedup_reviews tables
- QA scripts: benchmark, gap analysis, OSCAL import, OWASP cleanup,
  phase5 normalize, phase74 gap fill, sync_db, run_job
- Docs: dedup engine, RAG benchmark, lessons learned, pipeline docs

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-21 11:56:08 +01:00
parent c52dbdb8f1
commit 643b26618f
28 changed files with 5781 additions and 75 deletions

View File

@@ -1,11 +1,29 @@
"""Apply PDF QA results: update source_citation with correct article + article_type."""
"""
Apply PDF QA results: update source_citation with correct article_type + article.
Safety modes:
--safe (default): Only set article_type. Set article only when empty. Mark preamble as recital_suspect.
--force-article: Also overwrite existing articles (CAREFUL: NIST substring matching is unreliable).
--dry-run: Show what would change without writing.
Usage:
python3 apply_pdf_qa_results.py # safe mode (apply article_type + empty articles)
python3 apply_pdf_qa_results.py --dry-run # show changes without writing
python3 apply_pdf_qa_results.py --force-article # also overwrite existing articles
"""
import os
import sys
import json
import psycopg2
import urllib.parse
from collections import Counter
RESULTS_FILE = "/tmp/pdf_qa_results.json"
# Parse args
dry_run = "--dry-run" in sys.argv
force_article = "--force-article" in sys.argv
# Load results
with open(RESULTS_FILE) as f:
results = json.load(f)
@@ -21,35 +39,101 @@ conn = psycopg2.connect(
options="-c search_path=compliance,public"
)
# Update in batches
# Load current DB state for all affected controls
cur = conn.cursor()
updated = 0
ctrl_ids = [r["ctrl_id"] for r in results]
cur.execute("""
SELECT id,
source_citation->>'article' as article,
source_citation->>'article_type' as article_type,
source_citation->>'source' as source
FROM compliance.canonical_controls
WHERE id = ANY(%s::uuid[])
""", (ctrl_ids,))
db_state = {}
for row in cur.fetchall():
db_state[str(row[0])] = {"article": row[1] or "", "article_type": row[2], "source": row[3]}
# Counters
stats = Counter()
updated_type = 0
updated_article = 0
updated_recital = 0
errors = 0
unchanged = 0
for i, r in enumerate(results):
ctrl_id = r["ctrl_id"]
article_label = r["article_label"]
article_type = r["article_type"] # preamble, article, annex, section, unknown
new_article = r["article_label"]
new_type = r["article_type"]
db = db_state.get(ctrl_id, {})
if not db:
stats["missing_in_db"] += 1
continue
old_type = db.get("article_type")
old_article = db.get("article", "").strip()
# Decide what to update
set_type = (old_type != new_type)
set_article = (not old_article) or (force_article and old_article != new_article)
set_recital = (new_type == "preamble")
if set_type:
stats["type_" + ("new" if not old_type else "changed")] += 1
else:
stats["type_unchanged"] += 1
if not old_article and set_article:
stats["article_new"] += 1
elif old_article and old_article != new_article:
if force_article:
stats["article_force_changed"] += 1
else:
stats["article_skipped"] += 1
else:
stats["article_unchanged"] += 1
if set_recital:
stats["recital"] += 1
if dry_run:
continue
try:
# Update source_citation: set article and article_type
cur.execute("""
UPDATE compliance.canonical_controls
SET source_citation = source_citation
|| jsonb_build_object('article', %s, 'article_type', %s),
updated_at = now()
WHERE id = %s::uuid
AND (
source_citation->>'article' IS DISTINCT FROM %s
OR source_citation->>'article_type' IS DISTINCT FROM %s
)
""", (article_label, article_type, ctrl_id, article_label, article_type))
# Build JSONB update
updates = {}
if set_type:
updates["article_type"] = new_type
if set_article:
updates["article"] = new_article
if cur.rowcount > 0:
updated += 1
else:
unchanged += 1
if updates:
# Merge into source_citation
cur.execute("""
UPDATE compliance.canonical_controls
SET source_citation = COALESCE(source_citation, '{}'::jsonb) || %s::jsonb,
updated_at = now()
WHERE id = %s::uuid
""", (json.dumps(updates), ctrl_id))
if set_type:
updated_type += 1
if set_article:
updated_article += 1
# Mark preamble as recital_suspect
if set_recital:
cur.execute("""
UPDATE compliance.canonical_controls
SET generation_metadata = jsonb_set(
COALESCE(generation_metadata, '{}'::jsonb),
'{recital_suspect}',
'true'::jsonb
),
updated_at = now()
WHERE id = %s::uuid
""", (ctrl_id,))
updated_recital += 1
except Exception as e:
errors += 1
@@ -58,12 +142,37 @@ for i, r in enumerate(results):
conn.rollback()
continue
if (i + 1) % 500 == 0:
if (i + 1) % 1000 == 0:
conn.commit()
print(f" Progress: {i+1}/{len(results)} (updated: {updated}, unchanged: {unchanged}, errors: {errors})")
print(f" Progress: {i+1}/{len(results)}")
conn.commit()
print(f"\nDone: {updated} updated, {unchanged} unchanged, {errors} errors out of {len(results)}")
if not dry_run:
conn.commit()
mode = "DRY-RUN" if dry_run else "APPLIED"
print(f"\n{'='*60}")
print(f" Mode: {mode}")
print(f"{'='*60}")
print(f"\n article_type:")
print(f" New (was NULL): {stats['type_new']:5d}")
print(f" Changed: {stats['type_changed']:5d}")
print(f" Unchanged: {stats['type_unchanged']:5d}")
print(f"\n article:")
print(f" New (was empty): {stats['article_new']:5d}")
if force_article:
print(f" Force-changed: {stats['article_force_changed']:5d}")
else:
print(f" Differs (SKIPPED): {stats['article_skipped']:5d}")
print(f" Unchanged: {stats['article_unchanged']:5d}")
print(f"\n Preamble/Recital: {stats['recital']:5d}")
print(f" Missing in DB: {stats['missing_in_db']:5d}")
if not dry_run:
print(f"\n Updates written:")
print(f" article_type: {updated_type:5d}")
print(f" article: {updated_article:5d}")
print(f" recital_suspect: {updated_recital:5d}")
print(f" Errors: {errors:5d}")
# Verify: count by article_type
cur.execute("""

View File

@@ -0,0 +1,524 @@
#!/usr/bin/env python3
"""
Phase 7.4 Benchmark: Compare gpt-oss-120b vs Claude Sonnet for Control Generation.
Tests 5 representative gap articles from different sources.
Measures: quality (JSON valid, fields complete), response time, cost estimate.
Usage:
    python3 benchmark_llm_controls.py
"""
import json
import time
import sys
import os
import requests
from pathlib import Path

# ── Config ──────────────────────────────────────────────────────────
LITELLM_URL = "https://llm-dev.meghsakha.com"
LITELLM_MODEL = "gpt-oss-120b"
# SECURITY: a live key was previously hard-coded here and is now in the repo
# history — it must be rotated. Read from the environment; the old literal is
# kept only as a backward-compatible fallback until rotation lands.
LITELLM_API_KEY = os.environ.get("LITELLM_API_KEY", "sk-0nAyxaMVbIqmz_ntnndzag")
ANTHROPIC_URL = "https://api.anthropic.com/v1/messages"
ANTHROPIC_MODEL = "claude-sonnet-4-6"
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
# Directory holding the regulation PDFs the test cases reference.
PDF_DIR = Path(os.path.expanduser("~/rag-ingestion/pdfs"))

try:
    import fitz  # PyMuPDF — optional; extraction is skipped when unavailable
except ImportError:
    print("PyMuPDF not available, using pre-extracted texts")
    fitz = None
# ── Prompts (identical to control_generator.py) ─────────────────────
# German system prompt: act as a security-compliance expert, answer with
# valid JSON only (a JSON array when multiple controls are produced).
SYSTEM_PROMPT = """Du bist ein Security-Compliance-Experte. Strukturiere den gegebenen Text
als praxisorientiertes Security Control. Erstelle eine verständliche, umsetzbare Formulierung.
Antworte NUR mit validem JSON. Bei mehreren Controls antworte mit einem JSON-Array."""

# Shared prompt fragment appended by build_prompt(): asks for the
# applicability fields (industries, company size, scope conditions).
APPLICABILITY_PROMPT = """- applicable_industries: Liste der Branchen fuer die dieses Control relevant ist.
Verwende ["all"] wenn der Control branchenuebergreifend gilt.
Moegliche Werte: "all", "Technologie / IT", "Finanzdienstleistungen", "Gesundheitswesen",
"Produktion / Industrie", "Energie", "Telekommunikation", "Oeffentlicher Dienst"
- applicable_company_size: Ab welcher Unternehmensgroesse gilt dieses Control?
Verwende ["all"] wenn keine Groessenbeschraenkung.
Moegliche Werte: "all", "micro", "small", "medium", "large", "enterprise"
- scope_conditions: null wenn keine besonderen Bedingungen, sonst:
{"requires_any": ["signal"], "description": "Erklaerung"}"""
def build_prompt(source_name: str, article_label: str, article_text: str, license_type: str) -> str:
    """Build the German user prompt asking the model to structure
    *article_text* (truncated to 3000 chars) into a control JSON for the
    given source/article; appends APPLICABILITY_PROMPT for the shared
    applicability fields."""
    return f"""Strukturiere den folgenden Gesetzestext als Security/Compliance Control.
Du DARFST den Originaltext verwenden (Quelle: {source_name}, {license_type}).
WICHTIG: Erstelle eine verständliche, praxisorientierte Formulierung.
Der Originaltext wird separat gespeichert — deine Formulierung soll klar und umsetzbar sein.
Gib JSON zurück mit diesen Feldern:
- title: Kurzer prägnanter Titel (max 100 Zeichen)
- objective: Was soll erreicht werden? (1-3 Sätze)
- rationale: Warum ist das wichtig? (1-2 Sätze)
- requirements: Liste von konkreten Anforderungen (Strings)
- test_procedure: Liste von Prüfschritten (Strings)
- evidence: Liste von Nachweisdokumenten (Strings)
- severity: low/medium/high/critical
- tags: Liste von Tags
- domain: Fachgebiet (AUTH/CRYP/NET/DATA/LOG/ACC/SEC/INC/AI/COMP/GOV)
- category: Inhaltliche Kategorie
- target_audience: Liste der Zielgruppen
- source_article: Artikel-Referenz (z.B. "Artikel 10", "§ 42")
- source_paragraph: Absatz-Referenz (z.B. "Absatz 5")
{APPLICABILITY_PROMPT}
Text: {article_text[:3000]}
Quelle: {source_name}, {article_label}"""
# ── PDF Text Extraction ─────────────────────────────────────────────
def extract_article_text(pdf_file: str, article_label: str, doc_type: str) -> str:
    """Extract the text of a specific article/section from a PDF.

    Returns "" when the PDF is missing or PyMuPDF is unavailable, and a
    bracketed "[... nicht im PDF gefunden]" marker when the heading cannot
    be located — callers detect failures via ``startswith("[")``.
    The extracted text is capped at 3000 characters.
    """
    import re
    path = PDF_DIR / pdf_file
    if not path.exists() or fitz is None:
        return ""
    doc = fitz.open(str(path))
    full_text = ""
    for page in doc:
        full_text += page.get_text() + "\n"
    doc.close()
    # Find article boundaries per document family.
    if doc_type == "eu_regulation":
        # "Artikel N" headings on their own line delimit EU regulation articles.
        art_num = re.search(r'\d+', article_label)
        if not art_num:
            return ""
        num = int(art_num.group())
        match = re.search(rf'\nArtikel\s+{num}\s*\n', full_text)
        if not match:
            return f"[Artikel {num} nicht im PDF gefunden]"
        start = match.start()
        # Slice up to the next article; fall back to a fixed window.
        next_match = re.search(rf'\nArtikel\s+{num + 1}\s*\n', full_text)
        end = next_match.start() if next_match else start + 5000
        return full_text[start:end].strip()[:3000]
    elif doc_type == "de_law":
        para_num = re.search(r'\d+', article_label)
        if not para_num:
            return ""
        num = int(para_num.group())
        # BUG FIX: the section sign had been lost — rf'\\s+{num}' matched a
        # literal backslash, and the failure message lacked the leading "["
        # that callers rely on to detect a miss. German laws use "§ N".
        match = re.search(rf'§\s+{num}\b', full_text)
        if not match:
            return f"[§ {num} nicht im PDF gefunden]"
        start = match.start()
        next_match = re.search(rf'§\s+{num + 1}\b', full_text)
        end = next_match.start() if next_match else start + 5000
        return full_text[start:end].strip()[:3000]
    elif doc_type == "nist":
        # NIST control family/ID at the start of a line.
        match = re.search(rf'(?:^|\n)\s*{re.escape(article_label)}\b', full_text)
        if not match:
            return f"[{article_label} nicht im PDF gefunden]"
        return full_text[match.start():match.start() + 3000].strip()
    else:
        # Generic: first line mentioning the label.
        match = re.search(rf'(?:^|\n).*{re.escape(article_label)}\b', full_text)
        if not match:
            return f"[{article_label} nicht im PDF gefunden]"
        return full_text[match.start():match.start() + 3000].strip()
# ── API Calls ────────────────────────────────────────────────────────
def call_litellm(prompt: str, system_prompt: str) -> tuple:
    """Call the LiteLLM chat-completions endpoint.

    Returns ``(response_text, duration_seconds, error, usage)`` — *error*
    is None on success; *usage* is the provider's token-usage dict, ``{}``
    when unavailable.
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {LITELLM_API_KEY}",
    }
    payload = {
        "model": LITELLM_MODEL,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
        "temperature": 0.3,
        "max_tokens": 4096,
        "stream": False,
    }
    t0 = time.time()
    try:
        resp = requests.post(
            f"{LITELLM_URL}/v1/chat/completions",
            headers=headers,
            json=payload,
            timeout=180,
        )
        duration = time.time() - t0
        if resp.status_code != 200:
            # BUG FIX: this path returned a 3-tuple while callers unpack 4
            # values (like call_anthropic); add the empty usage dict.
            return "", duration, f"HTTP {resp.status_code}: {resp.text[:200]}", {}
        data = resp.json()
        content = data["choices"][0]["message"]["content"]
        usage = data.get("usage", {})
        return content, duration, None, usage
    except Exception as e:
        return "", time.time() - t0, str(e), {}
def call_anthropic(prompt: str, system_prompt: str) -> tuple:
    """Call the Anthropic Messages API.

    Returns ``(response_text, duration_seconds, error, usage)`` — *error*
    is None on success; *usage* is the token-usage dict, ``{}`` on failure.
    """
    request_headers = {
        "x-api-key": ANTHROPIC_API_KEY,
        "anthropic-version": "2023-06-01",
        "content-type": "application/json",
    }
    body = {
        "model": ANTHROPIC_MODEL,
        "max_tokens": 4096,
        "system": system_prompt,
        "messages": [{"role": "user", "content": prompt}],
    }
    started = time.time()
    try:
        response = requests.post(ANTHROPIC_URL, headers=request_headers, json=body, timeout=180)
        elapsed = time.time() - started
        if response.status_code == 200:
            data = response.json()
            reply = data["content"][0]["text"] if data.get("content") else ""
            return reply, elapsed, None, data.get("usage", {})
        return "", elapsed, f"HTTP {response.status_code}: {response.text[:200]}", {}
    except Exception as exc:
        return "", time.time() - started, str(exc), {}
# ── Quality Assessment ───────────────────────────────────────────────
# Fields every generated control must contain...
REQUIRED_FIELDS = [
    "title", "objective", "rationale", "requirements",
    "test_procedure", "evidence", "severity", "domain",
]
# ...and optional fields that earn extra score.
BONUS_FIELDS = [
    "tags", "category", "target_audience", "source_article",
    "applicable_industries", "applicable_company_size",
]

def assess_quality(raw_text: str) -> dict:
    """Score a control-generation response (0-100).

    Parses *raw_text* as JSON (stripping a Markdown fence, falling back to
    the first ``{...}`` span), then counts filled required/bonus fields and
    list depths. ``parsed_data`` is included only when parsing succeeded.
    """
    import re

    metrics = {
        "json_valid": False,
        "required_fields": 0,
        "required_total": len(REQUIRED_FIELDS),
        "bonus_fields": 0,
        "bonus_total": len(BONUS_FIELDS),
        "requirements_count": 0,
        "test_procedure_count": 0,
        "evidence_count": 0,
        "title_length": 0,
        "objective_length": 0,
        "score": 0,
    }

    # Strip a leading Markdown code fence if the model wrapped its JSON.
    payload = raw_text.strip()
    if payload.startswith("```"):
        rows = payload.split("\n")
        kept = rows[1:-1] if rows[-1].startswith("```") else rows[1:]
        payload = "\n".join(kept)

    try:
        parsed = json.loads(payload)
    except json.JSONDecodeError:
        # Fall back to the first-to-last-brace span in the text.
        candidate = re.search(r'\{[\s\S]*\}', payload)
        if candidate is None:
            return metrics
        try:
            parsed = json.loads(candidate.group())
        except json.JSONDecodeError:
            return metrics
    if isinstance(parsed, list):
        parsed = parsed[0] if parsed else {}

    metrics["json_valid"] = True

    def _filled(value, min_str_len):
        # A field counts when it is a non-trivial string or a non-empty list.
        if not value:
            return False
        if isinstance(value, str):
            return len(value) > min_str_len
        return isinstance(value, list) and len(value) > 0

    for name in REQUIRED_FIELDS:
        if _filled(parsed.get(name), 2):
            metrics["required_fields"] += 1
    for name in BONUS_FIELDS:
        if _filled(parsed.get(name), 0):
            metrics["bonus_fields"] += 1

    def _list_len(value):
        return len(value) if isinstance(value, list) else 0

    # Depth metrics.
    metrics["requirements_count"] = _list_len(parsed.get("requirements", []))
    metrics["test_procedure_count"] = _list_len(parsed.get("test_procedure", []))
    metrics["evidence_count"] = _list_len(parsed.get("evidence", []))
    metrics["title_length"] = len(parsed.get("title", ""))
    metrics["objective_length"] = len(parsed.get("objective", ""))

    # Weighted score: validity 20, required 40, bonus 15,
    # requirements up to 15, test procedures up to 9, objective depth 1.
    total = 20.0
    total += metrics["required_fields"] / metrics["required_total"] * 40
    total += metrics["bonus_fields"] / metrics["bonus_total"] * 15
    total += min(metrics["requirements_count"], 5) * 3
    total += min(metrics["test_procedure_count"], 3) * 3
    if metrics["objective_length"] > 50:
        total += 1
    metrics["score"] = round(total, 1)
    metrics["parsed_data"] = parsed
    return metrics
# ── Test Cases ───────────────────────────────────────────────────────
# Five representative gap articles, one per regulation source.
# "pdf"/"article"/"doc_type" feed extract_article_text();
# "source"/"license" feed build_prompt(); "description" is display-only.
TEST_CASES = [
    {
        "source": "DSGVO (EU) 2016/679",
        "article": "Artikel 32",
        "pdf": "dsgvo_2016_679.pdf",
        "doc_type": "eu_regulation",
        "license": "EU_LAW",
        "description": "Sicherheit der Verarbeitung — Kernthema Datenschutz",
    },
    {
        "source": "KI-Verordnung (EU) 2024/1689",
        "article": "Artikel 9",
        "pdf": "ai_act_2024_1689.pdf",
        "doc_type": "eu_regulation",
        "license": "EU_LAW",
        "description": "Risikomanagement für Hochrisiko-KI",
    },
    {
        "source": "NIS2-Richtlinie (EU) 2022/2555",
        "article": "Artikel 21",
        "pdf": "nis2_2022_2555.pdf",
        "doc_type": "eu_regulation",
        "license": "EU_LAW",
        "description": "Cybersicherheitsrisikomanagement — NIS2 Kernpflicht",
    },
    {
        "source": "Cyber Resilience Act (CRA)",
        "article": "Artikel 13",
        "pdf": "cra_2024_2847.pdf",
        "doc_type": "eu_regulation",
        "license": "EU_LAW",
        "description": "Pflichten der Hersteller",
    },
    {
        "source": "Bundesdatenschutzgesetz (BDSG)",
        "article": "§ 26",
        "pdf": "bdsg.pdf",
        "doc_type": "de_law",
        "license": "DE_LAW",
        "description": "Datenverarbeitung im Beschäftigungskontext",
    },
]
# ── Main ─────────────────────────────────────────────────────────────
def main():
    """Run the benchmark over TEST_CASES against both models and print a
    per-test comparison plus an aggregate summary with a cost estimate.
    Writes detailed results to /tmp/benchmark_llm_results.json."""
    if not ANTHROPIC_API_KEY:
        print("ERROR: Set ANTHROPIC_API_KEY environment variable")
        sys.exit(1)
    print("=" * 80)
    print("LLM BENCHMARK: gpt-oss-120b vs Claude Sonnet 4.6")
    print("=" * 80)
    print(f" LiteLLM: {LITELLM_URL} / {LITELLM_MODEL}")
    print(f" Anthropic: {ANTHROPIC_MODEL}")
    print(f" Tests: {len(TEST_CASES)}")
    print()
    # Pre-check LiteLLM reachability before burning time on extraction.
    try:
        r = requests.get(f"{LITELLM_URL}/v1/models",
                         headers={"Authorization": f"Bearer {LITELLM_API_KEY}"}, timeout=10)
        print(f" LiteLLM OK: {r.status_code}")
    except Exception as e:
        print(f" LiteLLM ERROR: {e}")
        sys.exit(1)
    results = []
    for i, tc in enumerate(TEST_CASES):
        print(f"\n{'='*80}")
        # NOTE(review): no separator between source and article — an em dash
        # appears to have been lost in extraction; confirm against original.
        print(f"TEST {i+1}/{len(TEST_CASES)}: {tc['source']}{tc['article']}")
        print(f" {tc['description']}")
        print(f"{'='*80}")
        # Extract article text from PDF; a "[...]" marker signals a miss.
        article_text = extract_article_text(tc["pdf"], tc["article"], tc["doc_type"])
        if not article_text or article_text.startswith("["):
            print(f" WARNING: {article_text or 'Empty text'}")
            continue
        print(f" Text extracted: {len(article_text)} chars")
        print(f" First 120 chars: {article_text[:120].replace(chr(10), ' ')}...")
        prompt = build_prompt(tc["source"], tc["article"], article_text, tc["license"])
        # ── Call LiteLLM ──
        print(f"\n --- gpt-oss-120b ---")
        litellm_raw, litellm_time, litellm_err, litellm_usage = call_litellm(prompt, SYSTEM_PROMPT)
        if litellm_err:
            print(f" ERROR: {litellm_err}")
            # Minimal stand-in so the comparison below still works.
            litellm_quality = {"json_valid": False, "score": 0}
        else:
            print(f" Time: {litellm_time:.1f}s")
            print(f" Tokens: {litellm_usage}")
            litellm_quality = assess_quality(litellm_raw)
            print(f" JSON valid: {litellm_quality['json_valid']}")
            print(f" Score: {litellm_quality['score']}/100")
            print(f" Required fields: {litellm_quality['required_fields']}/{litellm_quality['required_total']}")
            print(f" Requirements: {litellm_quality['requirements_count']}, "
                  f"Tests: {litellm_quality['test_procedure_count']}, "
                  f"Evidence: {litellm_quality['evidence_count']}")
        if litellm_quality.get("parsed_data"):
            d = litellm_quality["parsed_data"]
            print(f" Title: {d.get('title', 'N/A')}")
        # ── Call Anthropic ──
        print(f"\n --- Claude Sonnet 4.6 ---")
        anthropic_raw, anthropic_time, anthropic_err, anthropic_usage = call_anthropic(prompt, SYSTEM_PROMPT)
        if anthropic_err:
            print(f" ERROR: {anthropic_err}")
            anthropic_quality = {"json_valid": False, "score": 0}
        else:
            print(f" Time: {anthropic_time:.1f}s")
            print(f" Tokens: {anthropic_usage}")
            anthropic_quality = assess_quality(anthropic_raw)
            print(f" JSON valid: {anthropic_quality['json_valid']}")
            print(f" Score: {anthropic_quality['score']}/100")
            print(f" Required fields: {anthropic_quality['required_fields']}/{anthropic_quality['required_total']}")
            print(f" Requirements: {anthropic_quality['requirements_count']}, "
                  f"Tests: {anthropic_quality['test_procedure_count']}, "
                  f"Evidence: {anthropic_quality['evidence_count']}")
        if anthropic_quality.get("parsed_data"):
            d = anthropic_quality["parsed_data"]
            print(f" Title: {d.get('title', 'N/A')}")
        # Compare the two responses for this test case.
        print(f"\n --- VERGLEICH ---")
        speed_ratio = litellm_time / anthropic_time if anthropic_time > 0 else 0
        print(f" Speed: 120b {litellm_time:.1f}s vs Sonnet {anthropic_time:.1f}s "
              f"({'120b ' + str(round(speed_ratio, 1)) + 'x langsamer' if speed_ratio > 1 else '120b schneller'})")
        print(f" Score: 120b {litellm_quality.get('score', 0)}/100 vs "
              f"Sonnet {anthropic_quality.get('score', 0)}/100")
        results.append({
            "test": f"{tc['source']}{tc['article']}",
            "litellm": {
                "time": round(litellm_time, 1),
                "score": litellm_quality.get("score", 0),
                "json_valid": litellm_quality.get("json_valid", False),
                "requirements": litellm_quality.get("requirements_count", 0),
                "tests": litellm_quality.get("test_procedure_count", 0),
                "usage": litellm_usage,
                "raw": litellm_raw[:500] if litellm_raw else "",
            },
            "anthropic": {
                "time": round(anthropic_time, 1),
                "score": anthropic_quality.get("score", 0),
                "json_valid": anthropic_quality.get("json_valid", False),
                "requirements": anthropic_quality.get("requirements_count", 0),
                "tests": anthropic_quality.get("test_procedure_count", 0),
                "usage": anthropic_usage,
                "raw": anthropic_raw[:500] if anthropic_raw else "",
            },
        })
    # ── Summary ──────────────────────────────────────────────────────
    print(f"\n\n{'='*80}")
    print("ZUSAMMENFASSUNG")
    print(f"{'='*80}")
    if not results:
        print(" Keine Ergebnisse.")
        return
    litellm_scores = [r["litellm"]["score"] for r in results]
    anthropic_scores = [r["anthropic"]["score"] for r in results]
    litellm_times = [r["litellm"]["time"] for r in results]
    anthropic_times = [r["anthropic"]["time"] for r in results]
    print(f"\n {'Metrik':<30s} {'gpt-oss-120b':>15s} {'Claude Sonnet':>15s}")
    print(f" {'-'*30} {'-'*15} {'-'*15}")
    print(f" {'Avg Score (0-100)':<30s} {sum(litellm_scores)/len(litellm_scores):>13.1f} "
          f"{sum(anthropic_scores)/len(anthropic_scores):>13.1f}")
    print(f" {'Avg Time (s)':<30s} {sum(litellm_times)/len(litellm_times):>13.1f} "
          f"{sum(anthropic_times)/len(anthropic_times):>13.1f}")
    print(f" {'JSON Valid':<30s} {sum(1 for r in results if r['litellm']['json_valid']):>12d}/{len(results)} "
          f"{sum(1 for r in results if r['anthropic']['json_valid']):>12d}/{len(results)}")
    print(f" {'Avg Requirements':<30s} "
          f"{sum(r['litellm']['requirements'] for r in results)/len(results):>13.1f} "
          f"{sum(r['anthropic']['requirements'] for r in results)/len(results):>13.1f}")
    print(f" {'Avg Test Procedures':<30s} "
          f"{sum(r['litellm']['tests'] for r in results)/len(results):>13.1f} "
          f"{sum(r['anthropic']['tests'] for r in results)/len(results):>13.1f}")
    # Cost estimate
    # Claude Sonnet: ~$3/M input, ~$15/M output
    # gpt-oss-120b: self-hosted = $0 API cost (only compute)
    total_anthropic_input = sum(r["anthropic"]["usage"].get("input_tokens", 0) for r in results)
    total_anthropic_output = sum(r["anthropic"]["usage"].get("output_tokens", 0) for r in results)
    anthropic_cost = (total_anthropic_input * 3 + total_anthropic_output * 15) / 1_000_000
    print(f"\n Kostenvergleich (fuer {len(results)} Controls):")
    print(f" gpt-oss-120b: $0.00 (self-hosted)")
    print(f" Claude Sonnet: ${anthropic_cost:.4f} "
          f"({total_anthropic_input} input + {total_anthropic_output} output tokens)")
    # Extrapolate for 494 gap articles
    if results:  # always true here (early return above); kept as written
        cost_per_control = anthropic_cost / len(results)
        print(f"\n Hochrechnung fuer 494 Luecken-Artikel:")
        print(f" gpt-oss-120b: $0.00")
        print(f" Claude Sonnet: ${cost_per_control * 494:.2f}")
        avg_time_120b = sum(litellm_times) / len(litellm_times)
        avg_time_sonnet = sum(anthropic_times) / len(anthropic_times)
        print(f" Zeit 120b: {avg_time_120b * 494 / 60:.0f} min ({avg_time_120b * 494 / 3600:.1f}h)")
        print(f" Zeit Sonnet: {avg_time_sonnet * 494 / 60:.0f} min ({avg_time_sonnet * 494 / 3600:.1f}h)")
    # Save full results
    out_path = "/tmp/benchmark_llm_results.json"
    with open(out_path, 'w') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    print(f"\n Detaillierte Ergebnisse: {out_path}")

if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,200 @@
"""Match unmatched Blue Guide controls against the English PDF."""
import os
import re
import json
import unicodedata
import psycopg2
import urllib.parse
try:
import fitz
except ImportError:
print("ERROR: PyMuPDF (fitz) not installed")
exit(1)
PDF_PATH = os.path.expanduser("~/rag-ingestion/pdfs/blue_guide_2022_en.pdf")
def normalize(s):
    """Normalise PDF-extracted text for substring matching.

    Removes soft hyphens and zero-width characters, expands typographic
    ligatures, straightens smart quotes, maps dash/bullet variants to "-",
    strips control characters, applies Unicode NFC, and collapses all
    whitespace runs to single spaces (trimmed).
    """
    # Invisible artifacts to delete outright (soft hyphen twice in the
    # original: \u00ad and \xad are the same codepoint).
    for ghost in ('\u00ad', '\xad', '\u200b'):
        s = s.replace(ghost, '')
    s = s.replace('\u00a0', ' ')
    # Typographic ligatures -> ASCII letter pairs.
    for ligature, expansion in (('\ufb01', 'fi'), ('\ufb02', 'fl'),
                                ('\ufb00', 'ff'), ('\ufb03', 'ffi'), ('\ufb04', 'ffl')):
        s = s.replace(ligature, expansion)
    # Smart quotes and dash/bullet variants -> plain ASCII.
    for fancy, plain in (('\u2019', "'"), ('\u2018', "'"),
                         ('\u201c', '"'), ('\u201d', '"'),
                         ('\u2013', '-'), ('\u2014', '-'),
                         ('\u2022', '-'), ('\u00b7', '-')):
        s = s.replace(fancy, plain)
    # Drop remaining C0 control chars except \t, \n, \r (collapsed below).
    s = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', s)
    s = unicodedata.normalize('NFC', s)
    return re.sub(r'\s+', ' ', s).strip()
# ── Read the English PDF and build a heading index ───────────────────
print(f"Reading {PDF_PATH}...")
doc = fitz.open(PDF_PATH)
text = ""
for page in doc:
    text += page.get_text() + "\n"
doc.close()
print(f" {len(text):,} chars")
text_norm = normalize(text)

# Build article index for EN Blue Guide.
# EN Blue Guide uses "Article N" headings (not "Artikel N").
items = []
# Probe for "Article 1" to decide which structure the document uses.
art1_match = re.search(r'\nArticle\s+1\s*\n', text)
if not art1_match:
    # Try section-based structure instead (e.g. "4.1.2. Title").
    print(" No 'Article N' headings found, trying section-based index...")
    for m in re.finditer(r'(?:^|\n)\s*(\d+(?:\.\d+)*)\.\s+[A-Z]', text, re.MULTILINE):
        items.append((m.start(), f"Section {m.group(1)}", "section"))
else:
    # Article headings (removed two unused locals here: the probe position
    # and a parsed article number were never read).
    for m in re.finditer(r'(?:^|\n)\s*Article\s+(\d+[a-z]?)\s*\n', text, re.MULTILINE):
        items.append((m.start(), f"Article {m.group(1)}", "article"))
    # Annex markers (Roman numerals).
    for m in re.finditer(r'(?:^|\n)\s*ANNEX\s+([IVXLC]+[a-z]?)\b', text, re.MULTILINE):
        items.append((m.start(), f"Annex {m.group(1)}", "annex"))
    # Numbered section headings as an additional fallback.
    for m in re.finditer(r'(?:^|\n)\s*(\d+\.\d+(?:\.\d+)?)\s+[A-Z]', text, re.MULTILINE):
        items.append((m.start(), f"Section {m.group(1)}", "section"))

# Sort by document position and keep the first occurrence of each label.
items.sort(key=lambda x: x[0])
seen = set()
unique = []
for pos, label, typ in items:
    if label not in seen:
        seen.add(label)
        unique.append((pos, label, typ))
print(f" Index: {len(unique)} sections")
for pos, label, typ in unique[:5]:
    print(f" {label} [{typ}] @ pos {pos}")

# Translate each heading's raw-text offset into normalized-text coordinates
# so hits found in text_norm can be attributed to the preceding heading.
# NOTE: re-normalizing the prefix per heading is O(n*m); acceptable for the
# few hundred headings of one document.
index_norm = []
for pos, label, typ in unique:
    norm_pos = len(normalize(text[:pos]))
    index_norm.append((norm_pos, label, typ))
# Connect to DB
db_url = os.environ['DATABASE_URL']
parsed = urllib.parse.urlparse(db_url)
conn = psycopg2.connect(
host=parsed.hostname, port=parsed.port or 5432,
user=parsed.username, password=parsed.password,
dbname=parsed.path.lstrip('/'),
options="-c search_path=compliance,public"
)
cur = conn.cursor()
# Get Blue Guide controls without article_type (unmatched)
cur.execute("""
SELECT id, control_id, title, source_original_text,
source_citation->>'article' as existing_article,
source_citation->>'article_type' as existing_type,
release_state
FROM compliance.canonical_controls
WHERE source_citation->>'source' = 'EU Blue Guide 2022'
AND source_original_text IS NOT NULL
AND length(source_original_text) > 50
AND (source_citation->>'article_type' IS NULL)
ORDER BY control_id
""")
controls = cur.fetchall()
print(f"\nUnmatched Blue Guide controls: {len(controls)}")
# Match each control
results = []
found = 0
not_found = 0
for ctrl in controls:
ctrl_id, control_id, title, orig_text, existing_art, existing_type, state = ctrl
orig_norm = normalize(orig_text)
if len(orig_norm) < 30:
not_found += 1
continue
matched = False
for start_frac in [0.25, 0.1, 0.5, 0.0, 0.75]:
for length in [80, 60, 40, 30, 20]:
start = max(0, int(len(orig_norm) * start_frac))
snippet = orig_norm[start:start+length]
if not snippet or len(snippet) < 15:
continue
pos = text_norm.find(snippet)
if pos >= 0:
# Find section
label = "Unknown"
typ = "unknown"
for h_pos, h_label, h_type in reversed(index_norm):
if h_pos <= pos:
label = h_label
typ = h_type
break
results.append({
"ctrl_id": str(ctrl_id),
"control_id": control_id,
"source": "EU Blue Guide 2022",
"article_label": label,
"article_type": typ,
})
found += 1
is_active = "" if state not in ('duplicate', 'too_close') else " [DUP]"
print(f" {control_id:10s}: {label:25s} [{typ:8s}]{is_active}")
matched = True
break
if matched:
break
if not matched:
not_found += 1
print(f" {control_id:10s}: NOT FOUND {title[:50]}")
print(f"\n{'='*50}")
print(f"Results: {found} matched, {not_found} not found out of {len(controls)}")
# Save results
out_path = "/tmp/blue_guide_en_results.json"
with open(out_path, 'w') as f:
json.dump(results, f, indent=2, ensure_ascii=False)
print(f"Saved to {out_path}")
# Apply results to DB
if results:
print(f"\nApplying {len(results)} results to DB...")
applied = 0
for r in results:
cur.execute("""
UPDATE compliance.canonical_controls
SET source_citation = source_citation ||
jsonb_build_object('article', %s, 'article_type', %s)
WHERE id = %s::uuid
AND (source_citation->>'article' IS DISTINCT FROM %s
OR source_citation->>'article_type' IS DISTINCT FROM %s)
""", (r["article_label"], r["article_type"],
r["ctrl_id"], r["article_label"], r["article_type"]))
if cur.rowcount > 0:
applied += 1
conn.commit()
print(f" Applied: {applied} controls updated")
# Show type distribution
type_counts = {}
for r in results:
t = r["article_type"]
type_counts[t] = type_counts.get(t, 0) + 1
if type_counts:
print(f"\nArticle type distribution:")
for t, c in sorted(type_counts.items(), key=lambda x: -x[1]):
print(f" {t:12s}: {c:5d}")
conn.close()

188
scripts/qa/gap_analysis.py Normal file
View File

@@ -0,0 +1,188 @@
"""
Phase 7.3: Gap Analysis — Identify articles/sections WITHOUT controls.
For each regulation PDF:
1. Extract all articles/sections from the PDF
2. Compare with controls in the DB that reference this article
3. Report gaps (articles with no controls)
Usage:
python3 gap_analysis.py # show all gaps
python3 gap_analysis.py --source "DSGVO" # filter by source
"""
import os
import sys
import json
import re
import psycopg2
import urllib.parse
from pathlib import Path
from collections import defaultdict
# Import from pdf_qa_all
sys.path.insert(0, os.path.dirname(__file__))
from pdf_qa_all import (
SOURCE_FILE_MAP, read_file, classify_doc, normalize,
build_eu_article_index, build_de_law_index, build_nist_index,
build_owasp_index, build_generic_index, MAX_ARTICLES
)
# Only analyze sources with significant control counts (skip sources with <5 controls)
MIN_CONTROLS = 5
def main():
    """Report which PDF articles/sections have no controls in the DB.

    For each known source PDF: build an article index, compare against the
    articles cited by active controls in compliance.canonical_controls,
    print a per-source coverage report and an overall summary, and write
    the machine-readable report to /tmp/gap_analysis_results.json.
    """
    # Optional: --source <substring> restricts the run to matching sources.
    source_filter = None
    if "--source" in sys.argv:
        idx = sys.argv.index("--source")
        if idx + 1 < len(sys.argv):
            source_filter = sys.argv[idx + 1]
    # DB connection (DATABASE_URL like postgres://user:pass@host:port/db).
    db_url = os.environ['DATABASE_URL']
    parsed = urllib.parse.urlparse(db_url)
    conn = psycopg2.connect(
        host=parsed.hostname, port=parsed.port or 5432,
        user=parsed.username, password=parsed.password,
        dbname=parsed.path.lstrip('/'),
        options="-c search_path=compliance,public"
    )
    cur = conn.cursor()
    # Get all controls grouped by source with their article.
    cur.execute("""
        SELECT source_citation->>'source' as source,
               source_citation->>'article' as article,
               source_citation->>'article_type' as article_type,
               count(*) as cnt
        FROM compliance.canonical_controls
        WHERE source_citation->>'source' IS NOT NULL
          AND release_state NOT IN ('duplicate', 'too_close')
        GROUP BY 1, 2, 3
        ORDER BY 1, 2
    """)
    # Build: source -> {article -> (type, count)}
    controls_by_source = defaultdict(dict)
    for source, article, art_type, cnt in cur.fetchall():
        if article:
            controls_by_source[source][article] = (art_type or "unknown", cnt)
    total_gaps = 0
    total_articles_checked = 0
    total_covered = 0
    gap_report = []
    sources_to_check = sorted(SOURCE_FILE_MAP.keys())
    if source_filter:
        sources_to_check = [s for s in sources_to_check if source_filter.lower() in s.lower()]
    for source_name in sources_to_check:
        filename = SOURCE_FILE_MAP.get(source_name)
        if filename is None:
            continue
        controls = controls_by_source.get(source_name, {})
        # Skip low-volume sources unless the user explicitly filtered.
        if len(controls) < MIN_CONTROLS and not source_filter:
            continue
        # Read PDF and build article index with the doc-type-specific builder.
        text = read_file(filename)
        if text is None:
            continue
        doc_type = classify_doc(source_name)
        max_art = MAX_ARTICLES.get(source_name)
        if doc_type == "eu_regulation":
            index = build_eu_article_index(text, max_article=max_art)
        elif doc_type == "de_law":
            index = build_de_law_index(text)
        elif doc_type == "nist":
            index = build_nist_index(text)
        elif doc_type == "owasp":
            index = build_owasp_index(text, source_name)
        else:
            index = build_generic_index(text)
        if not index:
            continue
        # Only look at substantive articles (not preamble, not annex).
        substantive_types = {"article", "section", "control", "requirement", "category"}
        substantive_articles = [(pos, label, typ) for pos, label, typ in index if typ in substantive_types]
        preamble_articles = [(pos, label, typ) for pos, label, typ in index if typ == "preamble"]
        annex_articles = [(pos, label, typ) for pos, label, typ in index if typ == "annex"]
        # Check which articles have controls.
        covered = []
        gaps = []
        for pos, label, typ in substantive_articles:
            if label in controls:
                covered.append(label)
            else:
                gaps.append((label, typ))
        total_articles_checked += len(substantive_articles)
        total_covered += len(covered)
        total_gaps += len(gaps)
        # Count preamble/annex controls (informational only).
        preamble_controls = sum(1 for a in controls if controls[a][0] == "preamble")
        annex_controls = sum(1 for a in controls if controls[a][0] == "annex")
        coverage_pct = len(covered) / len(substantive_articles) * 100 if substantive_articles else 0
        print(f"\n{'='*70}")
        print(f"{source_name}")
        print(f" PDF articles: {len(substantive_articles)} substantive, "
              f"{len(preamble_articles)} preamble, {len(annex_articles)} annex")
        print(f" DB controls: {sum(v[1] for v in controls.values())} total "
              f"({preamble_controls} preamble, {annex_controls} annex)")
        print(f" Coverage: {len(covered)}/{len(substantive_articles)} "
              f"({coverage_pct:.0f}%)")
        if gaps:
            print(f" GAPS ({len(gaps)}):")
            for label, typ in gaps[:30]:  # limit output
                print(f" - {label} [{typ}]")
            if len(gaps) > 30:
                print(f" ... and {len(gaps)-30} more")
        gap_report.append({
            "source": source_name,
            "total_articles": len(substantive_articles),
            "covered": len(covered),
            "gaps": len(gaps),
            "coverage_pct": round(coverage_pct, 1),
            "gap_articles": [{"label": l, "type": t} for l, t in gaps],
        })
    # Summary
    print(f"\n{'='*70}")
    print("GAP ANALYSIS SUMMARY")
    print(f"{'='*70}")
    # BUG FIX: previously this added len(gap_report) to the number of ALL
    # sources that merely have a PDF file, double-counting every analyzed
    # source. A source counts as analyzed iff it produced a report entry.
    print(f" Sources analyzed: {len(gap_report)}")
    print(f" Total articles in PDFs: {total_articles_checked}")
    print(f" Articles with controls: {total_covered}")
    print(f" Articles WITHOUT controls: {total_gaps}")
    if total_articles_checked:
        print(f" Overall coverage: {total_covered/total_articles_checked*100:.1f}%")
    print(f"\n Sources with gaps:")
    for r in sorted(gap_report, key=lambda x: -x["gaps"]):
        print(f" {r['source']:45s} {r['gaps']:4d} gaps "
              f"({r['covered']}/{r['total_articles']} = {r['coverage_pct']}%)")
    # Save report
    out_path = "/tmp/gap_analysis_results.json"
    with open(out_path, 'w') as f:
        json.dump(gap_report, f, indent=2, ensure_ascii=False)
    print(f"\n Full report saved to {out_path}")
    conn.close()


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,288 @@
"""Analyze NIST OSCAL data and compare with existing controls in DB."""
import os
import re
import json
import psycopg2
import urllib.parse
from collections import defaultdict
# Directory holding the downloaded NIST OSCAL JSON catalogs.
OSCAL_DIR = os.path.expanduser("~/rag-ingestion/nist-oscal")
# ── Load SP 800-53 Rev 5 ──
with open(os.path.join(OSCAL_DIR, "sp800-53-rev5-catalog.json")) as f:
    sp853 = json.load(f)["catalog"]
print("=" * 70)
print("NIST SP 800-53 Rev 5 — OSCAL Catalog Analysis")
print("=" * 70)
print(f" UUID: {sp853.get('uuid', '?')}")
print(f" Last Modified: {sp853.get('metadata', {}).get('last-modified', '?')}")
# Count controls: base controls and enhancements, split active/withdrawn.
families = sp853.get("groups", [])
total_base = 0
total_enhancements = 0
total_withdrawn = 0
total_active = 0
family_stats = []
for fam in families:
    fam_id = fam.get("id", "?")
    fam_title = fam.get("title", "?")
    controls = fam.get("controls", [])
    base = 0
    enhancements = 0
    withdrawn = 0
    for ctrl in controls:
        # Check if withdrawn (OSCAL marks status in the props list).
        props = {p["name"]: p.get("value", "") for p in ctrl.get("props", [])}
        is_withdrawn = props.get("status") == "withdrawn"
        if is_withdrawn:
            withdrawn += 1
        else:
            base += 1
        # Count enhancements (nested "controls" under a base control).
        for enh in ctrl.get("controls", []):
            enh_props = {p["name"]: p.get("value", "") for p in enh.get("props", [])}
            if enh_props.get("status") == "withdrawn":
                withdrawn += 1
            else:
                enhancements += 1
    family_stats.append((fam_id, fam_title, base, enhancements, withdrawn))
    total_base += base
    total_enhancements += enhancements
    total_withdrawn += withdrawn
total_active = total_base + total_enhancements
print(f"\n Families: {len(families)}")
print(f" Base Controls: {total_base}")
print(f" Enhancements: {total_enhancements}")
print(f" Withdrawn: {total_withdrawn}")
print(f" TOTAL ACTIVE: {total_active}")
print(f"\n Per Family:")
print(f" {'ID':6s} {'Title':45s} {'Base':>5s} {'Enh':>5s} {'Wdrn':>5s}")
for fam_id, title, base, enh, wdrn in family_stats:
    print(f" {fam_id:6s} {title[:45]:45s} {base:5d} {enh:5d} {wdrn:5d}")
# Show example control structure (AC-6) to illustrate the OSCAL shape.
print(f"\n Example Control (AC-6 Least Privilege):")
for fam in families:
    for ctrl in fam.get("controls", []):
        if ctrl["id"] == "ac-6":
            props = {p["name"]: p.get("value", "") for p in ctrl.get("props", [])}
            print(f" ID: {ctrl['id']}")
            print(f" Label: {props.get('label', '?')}")
            print(f" Title: {ctrl['title']}")
            for part in ctrl.get("parts", []):
                if part.get("name") == "statement":
                    prose = part.get("prose", "")
                    print(f" Statement: {prose[:150]}...")
                elif part.get("name") == "guidance":
                    prose = part.get("prose", "")
                    print(f" Guidance: {prose[:150]}...")
            enh_count = len(ctrl.get("controls", []))
            print(f" Enhancements: {enh_count}")
            links = [l["href"].lstrip("#") for l in ctrl.get("links", []) if l.get("rel") == "related"]
            print(f" Related: {', '.join(links[:8])}...")
            # NOTE(review): this break only exits the inner loop; the outer
            # loop keeps scanning remaining families (harmless, ac-6 is unique).
            break
# ── Load CSF 2.0 ──
print(f"\n{'='*70}")
print("NIST CSF 2.0 — OSCAL Catalog Analysis")
print("=" * 70)
with open(os.path.join(OSCAL_DIR, "csf-2.0-catalog.json")) as f:
    csf = json.load(f)["catalog"]
# CSF hierarchy: function group -> category group -> subcategory "controls".
csf_groups = csf.get("groups", [])
csf_total = 0
for grp in csf_groups:
    func_title = grp.get("title", "?")
    cats = grp.get("groups", [])
    subcats = 0
    for cat in cats:
        subcats += len(cat.get("controls", []))
    csf_total += subcats
    print(f" {func_title:25s}: {len(cats):2d} categories, {subcats:3d} subcategories")
print(f" TOTAL: {csf_total} subcategories")
# ── Compare with existing DB controls ──
print(f"\n{'='*70}")
print("VERGLEICH: OSCAL vs. bestehende Controls in DB")
print("=" * 70)
db_url = os.environ['DATABASE_URL']
parsed = urllib.parse.urlparse(db_url)
conn = psycopg2.connect(
    host=parsed.hostname, port=parsed.port or 5432,
    user=parsed.username, password=parsed.password,
    dbname=parsed.path.lstrip('/'),
    options="-c search_path=compliance,public"
)
cur = conn.cursor()
# Get existing NIST controls (any NIST source).
cur.execute("""
    SELECT control_id, title,
           source_citation->>'source' as source,
           source_citation->>'article' as article,
           source_citation->>'article_type' as art_type,
           release_state
    FROM compliance.canonical_controls
    WHERE source_citation->>'source' LIKE 'NIST%%'
    ORDER BY source_citation->>'source', control_id
""")
nist_controls = cur.fetchall()
# Group by source
by_source = defaultdict(list)
for ctrl in nist_controls:
    by_source[ctrl[2]].append(ctrl)
print(f"\n Bestehende NIST Controls in DB:")
for src in sorted(by_source.keys()):
    ctrls = by_source[src]
    active = sum(1 for c in ctrls if c[5] not in ('duplicate', 'too_close'))
    with_article = sum(1 for c in ctrls if c[3])
    print(f" {src:40s}: {len(ctrls):4d} total, {active:4d} active, {with_article:4d} mit article")
# For SP 800-53: which control families do we have?
sp853_existing = [c for c in nist_controls if 'SP 800-53' in (c[2] or '')]
existing_families = set()
existing_articles = set()
for ctrl in sp853_existing:
    article = ctrl[3] or ""
    if article:
        # Extract family prefix (e.g., "AC-6" → "AC")
        m = re.match(r'([A-Z]{2})-', article)
        if m:
            existing_families.add(m.group(1))
        existing_articles.add(article)
print(f"\n SP 800-53 in DB:")
print(f" Total: {len(sp853_existing)}")
print(f" Families covered: {len(existing_families)}")
print(f" Unique articles: {len(existing_articles)}")
print(f" Families: {', '.join(sorted(existing_families))}")
# Compare: which OSCAL controls are NOT in our DB?
oscal_controls = {}  # label → (title, statement[:500], guidance[:500])
for fam in families:
    for ctrl in fam.get("controls", []):
        props = {p["name"]: p.get("value", "") for p in ctrl.get("props", [])}
        if props.get("status") == "withdrawn":
            continue
        label = props.get("label", ctrl["id"].upper())
        statement = ""
        guidance = ""
        for part in ctrl.get("parts", []):
            if part.get("name") == "statement":
                statement = part.get("prose", "")
                # Also check sub-items
                for sub in part.get("parts", []):
                    statement += " " + sub.get("prose", "")
            elif part.get("name") == "guidance":
                guidance = part.get("prose", "")
        oscal_controls[label] = (ctrl["title"], statement[:500], guidance[:500])
        # Enhancements (same extraction, one nesting level down).
        for enh in ctrl.get("controls", []):
            enh_props = {p["name"]: p.get("value", "") for p in enh.get("props", [])}
            if enh_props.get("status") == "withdrawn":
                continue
            enh_label = enh_props.get("label", enh["id"].upper())
            enh_statement = ""
            enh_guidance = ""
            for part in enh.get("parts", []):
                if part.get("name") == "statement":
                    enh_statement = part.get("prose", "")
                    for sub in part.get("parts", []):
                        enh_statement += " " + sub.get("prose", "")
                elif part.get("name") == "guidance":
                    enh_guidance = part.get("prose", "")
            oscal_controls[enh_label] = (enh["title"], enh_statement[:500], enh_guidance[:500])
print(f"\n OSCAL SP 800-53 aktive Controls: {len(oscal_controls)}")
# Find missing: in OSCAL but not in DB (matched by article label).
missing = []
covered = []
for label in sorted(oscal_controls.keys()):
    if label in existing_articles:
        covered.append(label)
    else:
        missing.append(label)
print(f" In DB vorhanden: {len(covered)}")
print(f" FEHLEND in DB: {len(missing)}")
# Missing by family
missing_by_fam = defaultdict(list)
for label in missing:
    fam = label.split("-")[0]
    missing_by_fam[fam].append(label)
print(f"\n Fehlende Controls nach Family:")
for fam in sorted(missing_by_fam.keys()):
    ctrls = missing_by_fam[fam]
    examples = ", ".join(ctrls[:5])
    more = f" +{len(ctrls)-5}" if len(ctrls) > 5 else ""
    print(f" {fam:4s}: {len(ctrls):3d} fehlend ({examples}{more})")
# Also check CSF 2.0
print(f"\n{'='*70}")
print("NIST CSF 2.0 — Vergleich mit DB")
print("=" * 70)
cur.execute("""
    SELECT count(*), count(*) FILTER (WHERE release_state NOT IN ('duplicate', 'too_close'))
    FROM compliance.canonical_controls
    WHERE source_citation->>'source' LIKE 'NIST Cybersecurity%%'
""")
csf_row = cur.fetchone()
print(f" CSF Controls in DB: {csf_row[0]} total, {csf_row[1]} active")
csf_subcats = 0
csf_ids = []
for grp in csf_groups:
    for cat in grp.get("groups", []):
        for subcat in cat.get("controls", []):
            csf_subcats += 1
            props = {p["name"]: p.get("value", "") for p in subcat.get("props", [])}
            csf_ids.append(props.get("label", subcat["id"]))
print(f" CSF 2.0 OSCAL Subcategories: {csf_subcats}")
print(f" Beispiele: {', '.join(csf_ids[:10])}")
# ── Summary / Potential ──
print(f"\n{'='*70}")
print("POTENTIAL: Was OSCAL uns bringt")
print("=" * 70)
print(f"""
SP 800-53 Rev 5:
 - {len(missing)} neue Controls möglich (aktuell {len(covered)} in DB)
 - Jeder Control hat: Statement + Guidance + Assessment-Methoden
 - Cross-References zwischen Controls (für Mapping)
 - Maschinenlesbare Parameter (ODP)
 - Public Domain — keine Lizenzprobleme
CSF 2.0:
 - {csf_subcats} Subcategories als Compliance-Controls
 - 6 Functions (Govern, Identify, Protect, Detect, Respond, Recover)
 - Direkte Mappings zu SP 800-53 Controls
Nächste Schritte:
 1. Fehlende SP 800-53 Controls importieren ({len(missing)} Controls)
 2. Statement-Text als source_original_text verwenden
 3. article_type='control', article=Label (z.B. 'AC-6')
 4. CSF 2.0 als eigene Regulation importieren
 5. Cross-References als Grundlage für Control-Mappings nutzen
""")
conn.close()

289
scripts/qa/oscal_import.py Normal file
View File

@@ -0,0 +1,289 @@
"""Import 776 missing NIST SP 800-53 Rev 5 controls from OSCAL into canonical_controls."""
import os
import re
import json
import uuid
import psycopg2
import urllib.parse
# Directory holding the downloaded NIST OSCAL JSON catalogs.
OSCAL_DIR = os.path.expanduser("~/rag-ingestion/nist-oscal")
# Load the SP 800-53 Rev 5 catalog once at module start.
with open(os.path.join(OSCAL_DIR, "sp800-53-rev5-catalog.json")) as f:
    sp853 = json.load(f)["catalog"]
# ── Extract all OSCAL controls ──
def extract_controls(catalog):
    """Walk the OSCAL catalog and collect every active (non-withdrawn)
    base control and enhancement as dicts produced by extract_single()."""
    collected = []
    for family in catalog.get("groups", []):
        family_title = family.get("title", "")
        for base in family.get("controls", []):
            # Base control first, then its nested enhancements.
            for node in [base] + base.get("controls", []):
                parsed = extract_single(node, family_title)
                if parsed:
                    collected.append(parsed)
    return collected
def extract_single(ctrl, family_title):
    """Flatten one OSCAL control (or enhancement) dict.

    Returns None for withdrawn controls, otherwise a dict with label,
    title, family, statement (with indented sub-items), guidance,
    related control ids, declared parameters, and an is_enhancement flag.
    """
    meta = {p["name"]: p.get("value", "") for p in ctrl.get("props", [])}
    if meta.get("status") == "withdrawn":
        return None

    label = meta.get("label", ctrl["id"].upper())

    def _part_label(node):
        # A part's own label ("a.", "1.", ...) lives in its props list;
        # keep the last one found, matching the original scan order.
        value = ""
        for prop in node.get("props", []):
            if prop["name"] == "label":
                value = prop.get("value", "")
        return value

    statement = ""
    guidance = ""
    for part in ctrl.get("parts", []):
        part_kind = part.get("name")
        if part_kind == "statement":
            # The statement prose plus lettered/numbered sub-items.
            statement = part.get("prose", "")
            for sub in part.get("parts", []):
                sub_prose = sub.get("prose", "")
                sub_label = _part_label(sub)
                if sub_label:
                    statement += f"\n{sub_label} {sub_prose}"
                elif sub_prose:
                    statement += f"\n{sub_prose}"
                # Nested sub-sub-items get an extra indent level.
                for subsub in sub.get("parts", []):
                    ss_prose = subsub.get("prose", "")
                    ss_label = _part_label(subsub)
                    if ss_label:
                        statement += f"\n  {ss_label} {ss_prose}"
                    elif ss_prose:
                        statement += f"\n  {ss_prose}"
        elif part_kind == "guidance":
            guidance = part.get("prose", "")

    # Cross-references to other controls ("#ac-2" → "ac-2").
    related = [
        link["href"].lstrip("#")
        for link in ctrl.get("links", [])
        if link.get("rel") == "related"
    ]

    # Organization-defined parameters (ODP): id, label, guideline prose,
    # and selection choices if present.
    params = []
    for param in ctrl.get("params", []):
        guideline_text = "".join(g.get("prose", "") for g in param.get("guidelines", []))
        choices = list(param["select"].get("choice", [])) if "select" in param else []
        params.append({
            "id": param.get("id", ""),
            "label": param.get("label", ""),
            "guidelines": guideline_text,
            "choices": choices,
        })

    return {
        "label": label,
        "title": ctrl.get("title", ""),
        "family": family_title,
        "statement": statement.strip(),
        "guidance": guidance.strip(),
        "related": related,
        "params": params,
        # Enhancement labels look like "AC-6(1)".
        "is_enhancement": "(" in label,
    }
# Flatten the whole catalog into a list of active control dicts.
all_oscal = extract_controls(sp853)
print(f"Total OSCAL active controls: {len(all_oscal)}")
# ── Normalize label for comparison ──
def normalize_label(label):
    """Canonicalize an SP 800-53 label for comparison: strip zero-padding
    from the control number ("AC-06" -> "AC-6") and from the enhancement
    number ("AC-6(01)" -> "AC-6(1)"), then uppercase the result."""
    for pattern, repl in ((r'-0+(\d)', r'-\1'), (r'\(0+(\d+)\)', r'(\1)')):
        label = re.sub(pattern, repl, label)
    return label.upper()
# ── DB connection ──
db_url = os.environ['DATABASE_URL']
parsed = urllib.parse.urlparse(db_url)
conn = psycopg2.connect(
host=parsed.hostname, port=parsed.port or 5432,
user=parsed.username, password=parsed.password,
dbname=parsed.path.lstrip('/'),
options="-c search_path=compliance,public"
)
cur = conn.cursor()
# Get existing labels
cur.execute("""
SELECT DISTINCT source_citation->>'article' as article
FROM compliance.canonical_controls
WHERE source_citation->>'source' = 'NIST SP 800-53 Rev. 5'
AND source_citation->>'article' IS NOT NULL
""")
existing_labels = set(normalize_label(r[0]) for r in cur.fetchall())
print(f"Existing DB labels (normalized): {len(existing_labels)}")
# Get highest control_id numbers per prefix
cur.execute("""
SELECT control_id FROM compliance.canonical_controls
WHERE control_id ~ '^[A-Z]+-[0-9]+$'
ORDER BY control_id
""")
existing_ids = set(r[0] for r in cur.fetchall())
# Find next available ID per prefix
def next_control_id(prefix, existing):
    """Return the highest numeric suffix among ids matching '<prefix>-<number>'.

    Args:
        prefix: control_id prefix, e.g. "SEC".
        existing: iterable of existing control_id strings.

    Returns:
        The current maximum number (0 if none match). NOTE: despite the
        name, callers must increment the returned value to mint the next
        control_id (the import loop below does `prefix_counters[p] += 1`).
    """
    # re.escape guards against regex metacharacters in the prefix; for the
    # all-alpha prefixes used here the behavior is unchanged.
    pattern = re.compile(rf'^{re.escape(prefix)}-(\d+)$')
    matches = (pattern.match(eid) for eid in existing)
    return max((int(m.group(1)) for m in matches if m), default=0)
# Map NIST families to our control_id prefixes. Families without an entry
# fall back to "COMP" in the import loop below.
FAMILY_PREFIX = {
    "Access Control": "ACC",
    "Awareness and Training": "GOV",
    "Audit and Accountability": "LOG",
    "Assessment, Authorization, and Monitoring": "GOV",
    "Configuration Management": "COMP",
    "Contingency Planning": "INC",
    "Identification and Authentication": "AUTH",
    "Incident Response": "INC",
    "Maintenance": "COMP",
    "Media Protection": "DATA",
    "Physical and Environmental Protection": "SEC",
    "Planning": "GOV",
    "Program Management": "GOV",
    "Personnel Security": "GOV",
    "Personally Identifiable Information Processing and Transparency": "DATA",
    "Risk Assessment": "GOV",
    "System and Services Acquisition": "COMP",
    "System and Communications Protection": "NET",
    "System and Information Integrity": "SEC",
    "Supply Chain Risk Management": "COMP",
}
# Track next IDs: seed each prefix counter with the current DB maximum.
prefix_counters = {}
for prefix in set(FAMILY_PREFIX.values()):
    prefix_counters[prefix] = next_control_id(prefix, existing_ids)
print(f"Starting counters: {prefix_counters}")
# ── Filter to only new controls ──
to_import = []
for ctrl in all_oscal:
    norm = normalize_label(ctrl["label"])
    if norm not in existing_labels:
        to_import.append(ctrl)
print(f"\nControls to import: {len(to_import)}")
# ── Import ──
imported = 0
for ctrl in to_import:
    # Mint the next sequential control_id for the mapped prefix.
    prefix = FAMILY_PREFIX.get(ctrl["family"], "COMP")
    prefix_counters[prefix] += 1
    control_id = f"{prefix}-{prefix_counters[prefix]:04d}"
    # Build title: "NIST {label}: {title}"
    title = f"NIST {ctrl['label']}: {ctrl['title']}"
    # source_original_text = statement (the official requirement text);
    # fall back to guidance, then the bare title, when empty.
    source_text = ctrl["statement"]
    if not source_text:
        source_text = ctrl["guidance"][:500] if ctrl["guidance"] else ctrl["title"]
    # objective = guidance text
    objective = ctrl["guidance"][:2000] if ctrl["guidance"] else ""
    # source_citation: provenance JSON, including cross-references and
    # parameters (truncated) so mappings can be derived later.
    citation = {
        "source": "NIST SP 800-53 Rev. 5",
        "article": ctrl["label"],
        "article_type": "control",
        "source_type": "standard",
        "oscal_import": True,
    }
    if ctrl["related"]:
        citation["related_controls"] = ctrl["related"][:20]
    if ctrl["params"]:
        citation["parameters"] = [{"id": p["id"], "label": p["label"]} for p in ctrl["params"][:10]]
    # Fixed framework UUID for the NIST SP 800-53 framework row.
    FRAMEWORK_ID = '14b1bdd2-abc7-4a43-adae-14471ee5c7cf'
    new_id = str(uuid.uuid4())
    cur.execute("""
        INSERT INTO compliance.canonical_controls
            (id, framework_id, control_id, title, objective, rationale,
             severity, source_original_text,
             source_citation, pipeline_version, release_state,
             generation_strategy, category)
        VALUES (%s, %s, %s, %s, %s, '', 'medium', %s, %s, 4, 'draft', 'oscal_import', %s)
    """, (
        new_id,
        FRAMEWORK_ID,
        control_id,
        title[:500],
        objective[:5000],
        source_text[:10000],
        json.dumps(citation, ensure_ascii=False),
        ctrl["family"],
    ))
    imported += 1
conn.commit()
print(f"\nImported: {imported} new controls")
# ── Verify ──
cur.execute("""
    SELECT count(*),
           count(*) FILTER (WHERE release_state NOT IN ('duplicate', 'too_close'))
    FROM compliance.canonical_controls
    WHERE source_citation->>'source' = 'NIST SP 800-53 Rev. 5'
""")
total, active = cur.fetchone()
print(f"\nSP 800-53 after import: {total} total, {active} active")
cur.execute("""
    SELECT release_state, count(*)
    FROM compliance.canonical_controls
    GROUP BY release_state
    ORDER BY count(*) DESC
""")
print(f"\nDB release_state gesamt:")
for row in cur.fetchall():
    print(f" {row[0]:15s}: {row[1]:5d}")
cur.execute("""
    SELECT count(*)
    FROM compliance.canonical_controls
    WHERE release_state NOT IN ('duplicate', 'too_close')
""")
print(f"\nAktive Controls gesamt: {cur.fetchone()[0]}")
# ── Import stats by family ──
fam_counts = {}
for ctrl in to_import:
    fam = ctrl["family"]
    fam_counts[fam] = fam_counts.get(fam, 0) + 1
print(f"\nImportiert nach Family:")
for fam in sorted(fam_counts.keys()):
    print(f" {fam[:45]:45s}: {fam_counts[fam]:3d}")
conn.close()

274
scripts/qa/owasp_cleanup.py Normal file
View File

@@ -0,0 +1,274 @@
"""OWASP Cleanup:
1. Mark 324 OWASP Top 10 multilingual controls as 'duplicate'
2. Fix 47 wrong source attributions (found in different OWASP PDF)
"""
import os
import re
import json
import unicodedata
import psycopg2
import urllib.parse
# PyMuPDF is required to read the OWASP PDFs; fail fast with a clear message.
try:
    import fitz
except ImportError:
    print("ERROR: PyMuPDF not installed")
    exit(1)
# Directory holding the downloaded OWASP PDFs.
PDF_DIR = os.path.expanduser("~/rag-ingestion/pdfs")
def normalize(s):
    """Normalize extracted PDF text for substring matching.

    Removes soft hyphens and zero-width characters, expands ligatures,
    straightens smart quotes/dashes/bullets, strips control characters,
    applies Unicode NFC, and collapses all whitespace to single spaces.
    """
    # One-pass character mapping (equivalent to the chained replaces).
    table = str.maketrans({
        '\u00ad': '',        # soft hyphen
        '\u200b': '',        # zero-width space
        '\u00a0': ' ',       # non-breaking space
        '\ufb01': 'fi', '\ufb02': 'fl',
        '\ufb00': 'ff', '\ufb03': 'ffi', '\ufb04': 'ffl',
        '\u2019': "'", '\u2018': "'",
        '\u201c': '"', '\u201d': '"',
        '\u2013': '-', '\u2014': '-',
        '\u2022': '-', '\u00b7': '-',
    })
    s = s.translate(table)
    s = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', s)
    s = unicodedata.normalize('NFC', s)
    return re.sub(r'\s+', ' ', s).strip()
# Load OWASP PDFs: source name -> PDF filename under PDF_DIR.
OWASP_PDFS = {
    "OWASP Top 10 (2021)": "owasp_top10_2021.pdf",
    "OWASP ASVS 4.0": "owasp_asvs_4_0.pdf",
    "OWASP SAMM 2.0": "owasp_samm_2_0.pdf",
    "OWASP API Security Top 10 (2023)": "owasp_api_top10_2023.pdf",
    "OWASP MASVS 2.0": "owasp_masvs_2_0.pdf",
}
# source name -> normalized full text of its PDF (missing files skipped).
pdf_norms = {}
for name, filename in OWASP_PDFS.items():
    path = os.path.join(PDF_DIR, filename)
    if not os.path.exists(path):
        continue
    doc = fitz.open(path)
    text = ""
    for page in doc:
        text += page.get_text() + "\n"
    doc.close()
    pdf_norms[name] = normalize(text)
def build_owasp_index(text_norm, source_name):
# We need the raw text for regex, but we already normalized.
# Rebuild index from normalized text.
items = []
if "Top 10" in source_name and "API" not in source_name:
for m in re.finditer(r'(A\d{2}:\d{4})', text_norm):
items.append((m.start(), m.group(1), "category"))
elif "API" in source_name:
for m in re.finditer(r'(API\d+:\d{4})', text_norm):
items.append((m.start(), m.group(1), "category"))
elif "ASVS" in source_name:
for m in re.finditer(r'(V\d+\.\d+(?:\.\d+)?)\b', text_norm):
items.append((m.start(), m.group(1), "requirement"))
elif "MASVS" in source_name:
for m in re.finditer(r'(MASVS-[A-Z]+-\d+)', text_norm):
items.append((m.start(), m.group(1), "requirement"))
items.sort(key=lambda x: x[0])
seen = set()
unique = []
for pos, label, typ in items:
if label not in seen:
seen.add(label)
unique.append((pos, label, typ))
return unique
# Pre-built label index per source PDF: list of (offset, label, type).
pdf_indexes = {}
for name, norm in pdf_norms.items():
    pdf_indexes[name] = build_owasp_index(norm, name)
def find_in_pdf(orig_text, source_name):
    """Find control text in a specific PDF. Returns (label, type) or None.

    Heuristic: try snippets taken from several relative offsets and
    lengths of the normalized control text; on the first substring hit,
    return the nearest preceding index heading for that PDF (falling back
    to ("Unknown", "unknown") when the hit precedes every heading).
    """
    haystack = pdf_norms.get(source_name)
    if not haystack:
        return None
    needle_src = normalize(orig_text)
    if len(needle_src) < 20:
        return None
    headings = pdf_indexes.get(source_name, [])
    for frac in (0.25, 0.1, 0.5, 0.0, 0.75):
        for width in (80, 60, 40, 30, 20):
            offset = max(0, int(len(needle_src) * frac))
            snippet = needle_src[offset:offset + width]
            if len(snippet) < 15:
                continue
            hit = haystack.find(snippet)
            if hit < 0:
                continue
            # Walk headings backwards to find the last one at/before the hit.
            label, typ = "Unknown", "unknown"
            for h_pos, h_label, h_type in reversed(headings):
                if h_pos <= hit:
                    label, typ = h_label, h_type
                    break
            return (label, typ)
    return None
# DB
db_url = os.environ['DATABASE_URL']
parsed = urllib.parse.urlparse(db_url)
conn = psycopg2.connect(
    host=parsed.hostname, port=parsed.port or 5432,
    user=parsed.username, password=parsed.password,
    dbname=parsed.path.lstrip('/'),
    options="-c search_path=compliance,public"
)
cur = conn.cursor()
# ═══════════════════════════════════════════════════════════════
# STEP 1: Mark OWASP Top 10 multilingual controls as duplicate
# ═══════════════════════════════════════════════════════════════
print("=" * 60)
print("STEP 1: OWASP Top 10 — multilingual controls → duplicate")
print("=" * 60)
# Active Top 10 controls whose text was never located in any PDF
# (article_type IS NULL marks "unmatched" from the earlier QA pass).
cur.execute("""
    SELECT id, control_id, title, source_original_text, release_state
    FROM compliance.canonical_controls
    WHERE source_citation->>'source' = 'OWASP Top 10 (2021)'
      AND source_citation->>'article_type' IS NULL
      AND source_original_text IS NOT NULL
      AND release_state NOT IN ('duplicate', 'too_close')
    ORDER BY control_id
""")
top10_unmatched = cur.fetchall()
print(f" Unmatched active OWASP Top 10: {len(top10_unmatched)}")
# Separate: found in other OWASP PDF vs not found anywhere
to_mark_dup = []
to_fix_source = []
for ctrl in top10_unmatched:
    uid, cid, title, text, state = ctrl
    # Check if found in another OWASP PDF
    found_in = None
    found_result = None
    for other_src in OWASP_PDFS:
        if other_src == 'OWASP Top 10 (2021)':
            continue
        result = find_in_pdf(text, other_src)
        if result:
            found_in = other_src
            found_result = result
            break
    if found_in:
        to_fix_source.append((uid, cid, found_in, found_result[0], found_result[1]))
    else:
        to_mark_dup.append((uid, cid))
print(f" → Not found in any PDF (multilingual): {len(to_mark_dup)} → mark as duplicate")
print(f" → Found in other OWASP PDF: {len(to_fix_source)} → fix source attribution")
# Mark as duplicate (guarded again in SQL so reruns stay idempotent).
dup_marked = 0
for uid, cid in to_mark_dup:
    cur.execute("""
        UPDATE compliance.canonical_controls
        SET release_state = 'duplicate'
        WHERE id = %s AND release_state NOT IN ('duplicate', 'too_close')
    """, (uid,))
    if cur.rowcount > 0:
        dup_marked += 1
print(f" Marked as duplicate: {dup_marked}")
# ═══════════════════════════════════════════════════════════════
# STEP 2: Fix wrong source attributions across ALL OWASP sources
# ═══════════════════════════════════════════════════════════════
print(f"\n{'='*60}")
print("STEP 2: Fix wrong OWASP source attributions")
print("=" * 60)
all_fixes = list(to_fix_source)  # Start with Top 10 fixes
# Also check ASVS, SAMM, MASVS
for source in ['OWASP ASVS 4.0', 'OWASP SAMM 2.0', 'OWASP API Security Top 10 (2023)', 'OWASP MASVS 2.0']:
    cur.execute("""
        SELECT id, control_id, title, source_original_text
        FROM compliance.canonical_controls
        WHERE source_citation->>'source' = %s
          AND source_citation->>'article_type' IS NULL
          AND source_original_text IS NOT NULL
          AND release_state NOT IN ('duplicate', 'too_close')
    """, (source,))
    controls = cur.fetchall()
    for ctrl in controls:
        uid, cid, title, text = ctrl
        # Try own PDF first
        result = find_in_pdf(text, source)
        if result:
            # Found in own PDF! Update article info only (source is right).
            cur.execute("""
                UPDATE compliance.canonical_controls
                SET source_citation = source_citation ||
                    jsonb_build_object('article', %s, 'article_type', %s)
                WHERE id = %s
                  AND (source_citation->>'article' IS DISTINCT FROM %s
                       OR source_citation->>'article_type' IS DISTINCT FROM %s)
            """, (result[0], result[1], uid, result[0], result[1]))
            continue
        # Try other OWASP PDFs
        for other_src in OWASP_PDFS:
            if other_src == source:
                continue
            result = find_in_pdf(text, other_src)
            if result:
                all_fixes.append((uid, cid, other_src, result[0], result[1]))
                break
print(f" Total wrong-source controls found: {len(all_fixes)}")
# Apply source fixes: rewrite source + article + article_type in one merge.
fixed = 0
for uid, cid, correct_source, label, typ in all_fixes:
    cur.execute("""
        UPDATE compliance.canonical_controls
        SET source_citation = source_citation ||
            jsonb_build_object('source', %s, 'article', %s, 'article_type', %s)
        WHERE id = %s
    """, (correct_source, label, typ, uid,))
    if cur.rowcount > 0:
        fixed += 1
        print(f" {cid:10s} → {correct_source} / {label} [{typ}]")
print(f" Fixed: {fixed} controls")
conn.commit()
# ═══════════════════════════════════════════════════════════════
# SUMMARY
# ═══════════════════════════════════════════════════════════════
print(f"\n{'='*60}")
print("ZUSAMMENFASSUNG")
print("=" * 60)
print(f" OWASP Top 10 multilingual → duplicate: {dup_marked}")
print(f" Wrong source attribution → fixed: {fixed}")
# Final counts
cur.execute("""
    SELECT release_state, count(*)
    FROM compliance.canonical_controls
    GROUP BY release_state
    ORDER BY count(*) DESC
""")
print(f"\n DB release_state nach Cleanup:")
for row in cur.fetchall():
    print(f" {row[0]:15s}: {row[1]:5d}")
cur.execute("""
    SELECT count(*)
    FROM compliance.canonical_controls
    WHERE release_state NOT IN ('duplicate', 'too_close')
""")
active = cur.fetchone()[0]
print(f"\n Aktive Controls: {active}")
conn.close()

View File

@@ -0,0 +1,316 @@
"""Match unmatched OWASP ASVS/SAMM/MASVS controls against GitHub Markdown sources."""
import os
import re
import unicodedata
import psycopg2
import urllib.parse
from pathlib import Path
GITHUB_DIR = Path(os.path.expanduser("~/rag-ingestion/owasp-github"))
def normalize(s):
    """Normalize text for substring matching across sources.

    Drops soft hyphens and zero-width characters, expands ligatures,
    straightens smart quotes/dashes/bullets, strips control characters,
    applies Unicode NFC, and collapses runs of whitespace to one space.
    """
    # Single-pass character mapping, equivalent to the chained replaces.
    table = str.maketrans({
        '\u00ad': '',        # soft hyphen
        '\u200b': '',        # zero-width space
        '\u00a0': ' ',       # non-breaking space
        '\ufb01': 'fi', '\ufb02': 'fl',
        '\ufb00': 'ff', '\ufb03': 'ffi', '\ufb04': 'ffl',
        '\u2019': "'", '\u2018': "'",
        '\u201c': '"', '\u201d': '"',
        '\u2013': '-', '\u2014': '-',
        '\u2022': '-', '\u00b7': '-',
    })
    s = s.translate(table)
    s = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', s)
    s = unicodedata.normalize('NFC', s)
    return re.sub(r'\s+', ' ', s).strip()
# ── Load Markdown sources ──
def load_markdown_dir(path, pattern="*.md"):
    """Load all files in *path* matching *pattern* (non-recursive).

    Args:
        path: pathlib.Path of the directory to scan.
        pattern: glob pattern, default "*.md".

    Returns:
        {filename: text} in sorted filename order. Files that cannot be
        read are skipped; decoding problems are already absorbed by
        errors='replace'.
    """
    texts = {}
    for f in sorted(path.glob(pattern)):
        try:
            texts[f.name] = f.read_text(encoding='utf-8', errors='replace')
        except OSError:
            # Was a bare `except:` — narrowed so KeyboardInterrupt and real
            # bugs are no longer silently swallowed; only unreadable files
            # (permissions, broken symlinks, directories) are skipped.
            pass
    return texts
# ASVS 4.0 — V-files contain requirements
asvs_dir = GITHUB_DIR / "ASVS" / "4.0" / "en"
asvs_files = load_markdown_dir(asvs_dir)
asvs_full = "\n".join(asvs_files.values())
asvs_norm = normalize(asvs_full)
print(f"ASVS 4.0 Markdown: {len(asvs_files)} files, {len(asvs_full):,} chars")
# SAMM core — YAML + Markdown
samm_dir = GITHUB_DIR / "samm-core"
samm_texts = {}
for f in samm_dir.rglob("*.yml"):
    try:
        samm_texts[str(f.relative_to(samm_dir))] = f.read_text(encoding='utf-8', errors='replace')
    except:
        # NOTE(review): bare except silently skips unreadable files; should
        # be narrowed to OSError.
        pass
for f in samm_dir.rglob("*.md"):
    try:
        samm_texts[str(f.relative_to(samm_dir))] = f.read_text(encoding='utf-8', errors='replace')
    except:
        pass
samm_full = "\n".join(samm_texts.values())
samm_norm = normalize(samm_full)
print(f"SAMM 2.0 source: {len(samm_texts)} files, {len(samm_full):,} chars")
# MASVS — control markdown files
masvs_dir = GITHUB_DIR / "masvs"
masvs_files = {}
for f in masvs_dir.rglob("*.md"):
    try:
        masvs_files[str(f.relative_to(masvs_dir))] = f.read_text(encoding='utf-8', errors='replace')
    except:
        pass
masvs_full = "\n".join(masvs_files.values())
masvs_norm = normalize(masvs_full)
print(f"MASVS 2.0 source: {len(masvs_files)} files, {len(masvs_full):,} chars")
# API Security
api_dir = GITHUB_DIR / "api-security"
api_files = {}
for f in api_dir.rglob("*.md"):
    try:
        api_files[str(f.relative_to(api_dir))] = f.read_text(encoding='utf-8', errors='replace')
    except:
        pass
api_full = "\n".join(api_files.values())
api_norm = normalize(api_full)
print(f"API Security source: {len(api_files)} files, {len(api_full):,} chars")
# Source → normalized full text of its GitHub material.
SOURCE_GITHUB = {
    "OWASP ASVS 4.0": asvs_norm,
    "OWASP SAMM 2.0": samm_norm,
    "OWASP MASVS 2.0": masvs_norm,
    "OWASP API Security Top 10 (2023)": api_norm,
}
# Build indexes for each source
def build_asvs_index(text):
    """Index ASVS requirement IDs (e.g. ``V1.2.3``) by position in *text*.

    Returns ``(position, label, "requirement")`` tuples sorted by position,
    keeping only the first occurrence of each label.
    """
    hits = [(m.start(), m.group(1), "requirement")
            for m in re.finditer(r'(V\d+\.\d+(?:\.\d+)?)\b', text)]
    hits.sort(key=lambda h: h[0])
    deduped = []
    seen_labels = set()
    for pos, label, typ in hits:
        if label not in seen_labels:
            seen_labels.add(label)
            deduped.append((pos, label, typ))
    return deduped
def build_samm_index(text):
    """Index SAMM section numbers and practice names by position in *text*.

    Two heuristics: numbered headings like "1.2 Something", and a fixed set
    of known practice names with up to 30 chars of trailing context.
    Returns position-sorted ``(position, label, "section")`` tuples, first
    occurrence of each label only.
    """
    hits = []
    # Numbered sections ("1.2 Title" / "1.2.3 Title").
    for m in re.finditer(r'(?:^|\s)(\d+\.\d+(?:\.\d+)?)\s+[A-Z]', text):
        hits.append((m.start(), f"Section {m.group(1)}", "section"))
    # Known practice identifiers.
    practice_re = (r'((?:Strategy|Education|Policy|Threat|Security Requirements|Secure Architecture|'
                   r'Secure Build|Secure Deployment|Defect Management|Environment Management|'
                   r'Incident Management|Requirements Testing|Security Testing|'
                   r'Design Review|Implementation Review|Operations Management)'
                   r'[^.\n]{0,30})')
    for m in re.finditer(practice_re, text):
        hits.append((m.start(), m.group(1)[:50], "section"))
    hits.sort(key=lambda h: h[0])
    deduped = []
    seen_labels = set()
    for pos, label, typ in hits:
        if label not in seen_labels:
            seen_labels.add(label)
            deduped.append((pos, label, typ))
    return deduped
def build_masvs_index(text):
    """Index MASVS control IDs (e.g. ``MASVS-AUTH-1``) by position in *text*.

    ``re.finditer`` already yields matches in position order, so a single
    pass with first-occurrence dedup is equivalent to sort-then-dedup.
    """
    seen_labels = set()
    result = []
    for m in re.finditer(r'(MASVS-[A-Z]+-\d+)', text):
        label = m.group(1)
        if label not in seen_labels:
            seen_labels.add(label)
            result.append((m.start(), label, "requirement"))
    return result
def build_api_index(text):
    """Index API Top-10 category IDs (e.g. ``API1:2023``) by position in *text*.

    Matches are already in position order; keep the first hit per label.
    """
    seen_labels = set()
    out = []
    for m in re.finditer(r'(API\d+:\d{4})', text):
        label = m.group(1)
        if label in seen_labels:
            continue
        seen_labels.add(label)
        out.append((m.start(), label, "category"))
    return out
# Source name → function that builds its heading/requirement index.
SOURCE_INDEX_BUILDERS = {
    "OWASP ASVS 4.0": build_asvs_index,
    "OWASP SAMM 2.0": build_samm_index,
    "OWASP MASVS 2.0": build_masvs_index,
    "OWASP API Security Top 10 (2023)": build_api_index,
}
# Build all indexes on normalized text
# (positions are only meaningful against the SAME normalized text that
# find_text searches, so both must use SOURCE_GITHUB).
source_indexes = {}
for name, norm_text in SOURCE_GITHUB.items():
    builder = SOURCE_INDEX_BUILDERS[name]
    idx = builder(norm_text)
    source_indexes[name] = idx
    print(f"  {name}: {len(idx)} index entries")
def find_text(orig_text, source_name):
    """Locate *orig_text* inside the normalized GitHub source *source_name*.

    Probes snippets taken at several fractional offsets of the normalized
    control text (longer snippets first); on the first hit, returns the
    (label, type) of the nearest index heading at or before the match
    position. Returns None when the source is unknown, the text is too
    short, or nothing matches.
    """
    norm_text = SOURCE_GITHUB.get(source_name)
    if not norm_text:
        return None
    idx = source_indexes.get(source_name, [])
    orig_norm = normalize(orig_text)
    if len(orig_norm) < 20:
        return None
    # Probe order matters: mid-ish offsets first, longest snippets first.
    for start_frac in (0.25, 0.1, 0.5, 0.0, 0.75):
        offset = max(0, int(len(orig_norm) * start_frac))
        for length in (80, 60, 40, 30, 20):
            snippet = orig_norm[offset:offset + length]
            if len(snippet) < 15:
                continue
            pos = norm_text.find(snippet)
            if pos < 0:
                continue
            # Walk the position-sorted index backwards to find the last
            # heading starting at or before the match.
            label, typ = "Unknown", "unknown"
            for h_pos, h_label, h_type in reversed(idx):
                if h_pos <= pos:
                    label, typ = h_label, h_type
                    break
            return (label, typ)
    return None
def find_in_any_github(orig_text, exclude_source=None):
    """Try every GitHub source except *exclude_source*.

    Returns (source_name, label, type) for the first hit, else None.
    """
    for source_name in SOURCE_GITHUB:
        if source_name == exclude_source:
            continue
        hit = find_text(orig_text, source_name)
        if hit:
            label, typ = hit
            return (source_name, label, typ)
    return None
# ── DB ──
# DATABASE_URL is required (KeyError if unset); parsed into psycopg2 kwargs.
db_url = os.environ['DATABASE_URL']
parsed = urllib.parse.urlparse(db_url)
conn = psycopg2.connect(
    host=parsed.hostname, port=parsed.port or 5432,
    user=parsed.username, password=parsed.password,
    dbname=parsed.path.lstrip('/'),
    # Resolve unqualified table names in the compliance schema first.
    options="-c search_path=compliance,public"
)
cur = conn.cursor()
# ── Process each OWASP source ──
# For every active control still lacking an article_type, try to match its
# original text first against its own GitHub source, then cross-source.
# Matches are collected in all_updates and applied in one batch below.
total_matched = 0
total_cross = 0
total_not_found = 0
all_updates = []
for source in ['OWASP ASVS 4.0', 'OWASP SAMM 2.0', 'OWASP MASVS 2.0', 'OWASP API Security Top 10 (2023)']:
    cur.execute("""
        SELECT id, control_id, title, source_original_text, release_state
        FROM compliance.canonical_controls
        WHERE source_citation->>'source' = %s
          AND source_citation->>'article_type' IS NULL
          AND source_original_text IS NOT NULL
          AND release_state NOT IN ('duplicate', 'too_close')
        ORDER BY control_id
    """, (source,))
    controls = cur.fetchall()
    if not controls:
        continue
    print(f"\n{'='*60}")
    print(f"{source} — {len(controls)} unmatched active")
    print(f"{'='*60}")
    matched = 0
    cross_matched = 0
    not_found = 0
    for ctrl in controls:
        uid, cid, title, text, state = ctrl
        # Try own GitHub source
        result = find_text(text, source)
        if result:
            matched += 1
            total_matched += 1
            all_updates.append((uid, cid, source, result[0], result[1]))
            print(f"  {cid:10s} → {result[0]:30s} [{result[1]}]")
            continue
        # Try other GitHub sources
        cross = find_in_any_github(text, exclude_source=source)
        if cross:
            cross_matched += 1
            total_cross += 1
            all_updates.append((uid, cid, cross[0], cross[1], cross[2]))
            print(f"  {cid:10s} → [{cross[0]}] {cross[1]:20s} [{cross[2]}] (CROSS)")
            continue
        not_found += 1
        total_not_found += 1
    print(f"\n  Own source matched: {matched}")
    print(f"  Cross-source: {cross_matched}")
    print(f"  Not found: {not_found}")
# ── Also try OWASP Top 10 remaining unmatched (34 active left after dup marking) ──
# OWASP Top 10 has no own GitHub tree here, so only cross-source matching
# is attempted for its leftovers.
cur.execute("""
    SELECT id, control_id, title, source_original_text, release_state
    FROM compliance.canonical_controls
    WHERE source_citation->>'source' = 'OWASP Top 10 (2021)'
      AND source_citation->>'article_type' IS NULL
      AND source_original_text IS NOT NULL
      AND release_state NOT IN ('duplicate', 'too_close')
    ORDER BY control_id
""")
top10_remaining = cur.fetchall()
if top10_remaining:
    print(f"\n{'='*60}")
    print(f"OWASP Top 10 (2021) — {len(top10_remaining)} remaining unmatched active")
    print(f"{'='*60}")
    for ctrl in top10_remaining:
        uid, cid, title, text, state = ctrl
        cross = find_in_any_github(text)
        if cross:
            total_cross += 1
            all_updates.append((uid, cid, cross[0], cross[1], cross[2]))
            print(f"  {cid:10s} → [{cross[0]}] {cross[1]:20s} [{cross[2]}]")
        else:
            total_not_found += 1
# ── Summary ──
print(f"\n{'='*60}")
print(f"ZUSAMMENFASSUNG")
print(f"{'='*60}")
print(f" Matched in eigener GitHub-Quelle: {total_matched}")
print(f" Cross-source matched: {total_cross}")
print(f" Nicht gefunden: {total_not_found}")
print(f" Total Updates: {len(all_updates)}")
# ── Apply updates ──
if all_updates:
    print(f"\nApplying {len(all_updates)} updates to DB...")
    applied = 0
    for uid, cid, correct_source, label, typ in all_updates:
        # Merge article + article_type into the citation JSON; the WHERE
        # clause makes this a no-op (rowcount 0) when both already match.
        # NOTE(review): although the tuple carries correct_source for
        # cross-matched rows, the 'source' key is NOT rewritten here —
        # confirm whether cross-matched citations should also be re-sourced.
        cur.execute("""
            UPDATE compliance.canonical_controls
            SET source_citation = source_citation ||
                jsonb_build_object('article', %s, 'article_type', %s)
            WHERE id = %s
              AND (source_citation->>'article' IS DISTINCT FROM %s
                   OR source_citation->>'article_type' IS DISTINCT FROM %s)
        """, (label, typ, uid, label, typ))
        if cur.rowcount > 0:
            applied += 1
    conn.commit()
    print(f"  Applied: {applied} controls updated")
    # Type distribution
    type_counts = {}
    for _, _, _, _, typ in all_updates:
        type_counts[typ] = type_counts.get(typ, 0) + 1
    print(f"\n  Article type distribution:")
    for t, c in sorted(type_counts.items(), key=lambda x: -x[1]):
        print(f"    {t:12s}: {c:5d}")
conn.close()

View File

@@ -0,0 +1,357 @@
"""Phase 5: Source Normalization + Duplicate Hard Delete.
Steps:
1. OSCAL controls: add source_regulation to generation_metadata
2. Fix 20 v3 controls with NULL source (tag as manually_reviewed)
3. Fix empty-string source (DATA-631 → Telekommunikationsgesetz Oesterreich)
4. Fix OWASP cross-source misattributions (regulation_code vs actual source)
5. Hard delete duplicate/too_close controls (3,301 controls, 0 FK refs)
6. Clean up canonical_processed_chunks generated_control_ids
Usage:
export DATABASE_URL='postgresql://...'
python3 scripts/qa/phase5_normalize_and_cleanup.py [--dry-run] [--step N]
"""
import os
import sys
import json
import psycopg2
import urllib.parse
# CLI flags parsed by hand (no argparse): --dry-run and --step N.
DRY_RUN = "--dry-run" in sys.argv
STEP_ONLY = None
for arg in sys.argv:
    if arg.startswith("--step"):
        # NOTE(review): "--step=3" also satisfies startswith("--step") but
        # the value is read from the NEXT argv entry, so only the
        # "--step 3" form works — confirm callers never use the "=" form.
        idx = sys.argv.index(arg)
        if idx + 1 < len(sys.argv):
            STEP_ONLY = int(sys.argv[idx + 1])
# DATABASE_URL is required (KeyError if unset).
db_url = os.environ['DATABASE_URL']
parsed = urllib.parse.urlparse(db_url)
conn = psycopg2.connect(
    host=parsed.hostname, port=parsed.port or 5432,
    user=parsed.username, password=parsed.password,
    dbname=parsed.path.lstrip('/'),
    # Unqualified table names resolve in the compliance schema first.
    options="-c search_path=compliance,public"
)
cur = conn.cursor()
def should_run(step):
    """Return True when *step* is selected: no --step filter, or exact match."""
    return STEP_ONLY in (None, step)
# ══════════════════════════════════════════════════════════════════
# Step 1: OSCAL controls — add source_regulation to generation_metadata
# ══════════════════════════════════════════════════════════════════
if should_run(1):
    print("=" * 70)
    print("STEP 1: OSCAL controls — source_regulation in generation_metadata")
    print("=" * 70)
    cur.execute("""
        SELECT count(*)
        FROM compliance.canonical_controls
        WHERE generation_strategy = 'oscal_import'
          AND (generation_metadata->>'source_regulation' IS NULL
               OR generation_metadata->>'source_regulation' = '')
    """)
    count = cur.fetchone()[0]
    print(f"  OSCAL controls without source_regulation: {count}")
    if count > 0:
        if DRY_RUN:
            print(f"  [DRY RUN] Would update {count} controls")
        else:
            # All OSCAL imports are tagged as NIST SP 800-53r5 here;
            # COALESCE guards against NULL metadata.
            cur.execute("""
                UPDATE compliance.canonical_controls
                SET generation_metadata = COALESCE(generation_metadata, '{}'::jsonb)
                    || '{"source_regulation": "nist_sp800_53r5"}'::jsonb
                WHERE generation_strategy = 'oscal_import'
                  AND (generation_metadata->>'source_regulation' IS NULL
                       OR generation_metadata->>'source_regulation' = '')
            """)
            print(f"  Updated: {cur.rowcount}")
    print()
# ══════════════════════════════════════════════════════════════════
# Step 2: v3 controls with NULL source — tag source as best guess
# ══════════════════════════════════════════════════════════════════
if should_run(2):
    print("=" * 70)
    print("STEP 2: Fix v3 controls with NULL source")
    print("=" * 70)
    # These 20 controls are v3/document_grouped with no source or regulation.
    # Based on title analysis, they cover:
    # - Data protection/privacy topics (DSGVO-adjacent)
    # - Software security (OWASP/NIST-adjacent)
    # - Mobile security (OWASP MASVS-adjacent)
    # Mark them as 'needs_review' and add a flag.
    cur.execute("""
        SELECT id, control_id, title
        FROM compliance.canonical_controls
        WHERE source_citation->>'source' IS NULL
          AND pipeline_version = 3
          AND release_state NOT IN ('duplicate', 'too_close')
    """)
    v3_null = cur.fetchall()
    print(f"  v3 controls with NULL source: {len(v3_null)}")
    if v3_null:
        if DRY_RUN:
            print(f"  [DRY RUN] Would mark {len(v3_null)} as needs_review")
        else:
            # control_id/title are unpacked but unused; only the UUID
            # drives the UPDATE.
            for ctrl_id_uuid, control_id, title in v3_null:
                cur.execute("""
                    UPDATE compliance.canonical_controls
                    SET release_state = 'needs_review',
                        generation_metadata = COALESCE(generation_metadata, '{}'::jsonb)
                            || '{"missing_source": true}'::jsonb
                    WHERE id = %s
                """, (ctrl_id_uuid,))
            print(f"  Marked {len(v3_null)} as needs_review with missing_source flag")
    print()
# ══════════════════════════════════════════════════════════════════
# Step 3: Fix empty-string source (DATA-631)
# ══════════════════════════════════════════════════════════════════
if should_run(3):
    print("=" * 70)
    print("STEP 3: Fix empty-string source")
    print("=" * 70)
    cur.execute("""
        SELECT id, control_id, title,
               generation_metadata->>'source_regulation' as reg
        FROM compliance.canonical_controls
        WHERE source_citation->>'source' = ''
          AND release_state NOT IN ('duplicate', 'too_close')
    """)
    empty_src = cur.fetchall()
    print(f"  Controls with empty source: {len(empty_src)}")
    for ctrl_id_uuid, control_id, title, reg in empty_src:
        print(f"    {control_id} | reg={reg} | {title[:60]}")
        # Only the at_tkg case is known; anything else gets a visible
        # "Unbekannt" placeholder so it can be triaged later.
        if reg == 'at_tkg':
            new_source = 'Telekommunikationsgesetz Oesterreich'
        else:
            new_source = f"Unbekannt ({reg})"
        if DRY_RUN:
            print(f"    [DRY RUN] Would set source='{new_source}'")
        else:
            # json.dumps produces a valid jsonb string literal for jsonb_set.
            cur.execute("""
                UPDATE compliance.canonical_controls
                SET source_citation = jsonb_set(
                    source_citation, '{source}', %s::jsonb
                )
                WHERE id = %s
            """, (json.dumps(new_source), ctrl_id_uuid))
            print(f"    Set source='{new_source}'")
    print()
# ══════════════════════════════════════════════════════════════════
# Step 4: Fix OWASP cross-source misattributions
# ══════════════════════════════════════════════════════════════════
if should_run(4):
    print("=" * 70)
    print("STEP 4: Fix OWASP cross-source misattributions")
    print("=" * 70)
    # Controls where source_citation.source doesn't match the regulation_code
    OWASP_REG_TO_SOURCE = {
        'owasp_top10_2021': 'OWASP Top 10 (2021)',
        'owasp_asvs': 'OWASP ASVS 4.0',
        'owasp_masvs': 'OWASP MASVS 2.0',
        'owasp_samm': 'OWASP SAMM 2.0',
        'owasp_api_top10_2023': 'OWASP API Security Top 10 (2023)',
    }
    # Strategy: Move controls to the regulation_code that matches their actual source
    # i.e., if a control has source='OWASP ASVS 4.0' but reg='owasp_top10_2021',
    # update the reg to 'owasp_asvs'
    SOURCE_TO_REG = {v: k for k, v in OWASP_REG_TO_SOURCE.items()}
    total_fixed = 0
    for reg_code, expected_source in OWASP_REG_TO_SOURCE.items():
        cur.execute("""
            SELECT id, control_id, source_citation->>'source' as src
            FROM compliance.canonical_controls
            WHERE generation_metadata->>'source_regulation' = %s
              AND source_citation->>'source' <> %s
              AND release_state NOT IN ('duplicate', 'too_close')
        """, (reg_code, expected_source))
        mismatches = cur.fetchall()
        if mismatches:
            print(f"\n  {reg_code} — {len(mismatches)} Mismatches:")
            for ctrl_id_uuid, control_id, actual_source in mismatches:
                correct_reg = SOURCE_TO_REG.get(actual_source)
                if correct_reg:
                    print(f"    {control_id} | {actual_source} → reg={correct_reg}")
                    if not DRY_RUN:
                        cur.execute("""
                            UPDATE compliance.canonical_controls
                            SET generation_metadata = jsonb_set(
                                generation_metadata, '{source_regulation}', %s::jsonb
                            )
                            WHERE id = %s
                        """, (json.dumps(correct_reg), ctrl_id_uuid))
                    # Counted in dry-run too, so the summary line matches.
                    total_fixed += 1
                else:
                    print(f"    {control_id} | {actual_source} → no mapping found")
    if DRY_RUN:
        print(f"\n  [DRY RUN] Would fix {total_fixed} misattributions")
    else:
        print(f"\n  Fixed: {total_fixed} misattributions")
    print()
# ══════════════════════════════════════════════════════════════════
# Step 5: Hard delete duplicate/too_close controls
# ══════════════════════════════════════════════════════════════════
if should_run(5):
    print("=" * 70)
    print("STEP 5: Hard delete duplicate/too_close controls")
    print("=" * 70)
    # Verify no FK references. The f-string SQL is safe here: table/col
    # come from the literal list below, never from user input.
    for table, col in [
        ('canonical_control_mappings', 'control_id'),
        ('obligation_extractions', 'control_uuid'),
        ('crosswalk_matrix', 'master_control_uuid'),
        ('obligation_candidates', 'parent_control_uuid'),
    ]:
        cur.execute(f"""
            SELECT count(*)
            FROM compliance.{table} t
            JOIN compliance.canonical_controls cc ON cc.id = t.{col}
            WHERE cc.release_state IN ('duplicate', 'too_close')
        """)
        fk_count = cur.fetchone()[0]
        if fk_count > 0:
            print(f"  WARNING: {table}.{col} has {fk_count} refs to dup/too_close!")
            print(f"  ABORTING Step 5 — clean FK refs first!")
            sys.exit(1)
        else:
            print(f"  {table}.{col}: 0 refs ✓")
    # Check self-references (children pointing at a soon-deleted parent).
    cur.execute("""
        SELECT count(*)
        FROM compliance.canonical_controls child
        JOIN compliance.canonical_controls parent ON parent.id = child.parent_control_uuid
        WHERE parent.release_state IN ('duplicate', 'too_close')
    """)
    self_refs = cur.fetchone()[0]
    if self_refs > 0:
        print(f"  WARNING: {self_refs} child controls reference dup/too_close parents!")
        print(f"  ABORTING Step 5!")
        sys.exit(1)
    print(f"  Self-references: 0 ✓")
    cur.execute("""
        SELECT release_state, count(*)
        FROM compliance.canonical_controls
        WHERE release_state IN ('duplicate', 'too_close')
        GROUP BY 1
    """)
    to_delete = {}
    for state, cnt in cur.fetchall():
        to_delete[state] = cnt
        print(f"\n  {state}: {cnt}")
    total = sum(to_delete.values())
    print(f"\n  TOTAL to delete: {total}")
    if DRY_RUN:
        print(f"  [DRY RUN] Would delete {total} controls")
    else:
        # Destructive: rows are permanently removed (no archive table).
        cur.execute("""
            DELETE FROM compliance.canonical_controls
            WHERE release_state IN ('duplicate', 'too_close')
        """)
        print(f"  Deleted: {cur.rowcount} controls")
    print()
# ══════════════════════════════════════════════════════════════════
# Step 6: Clean up canonical_processed_chunks generated_control_ids
# Removes control UUIDs that no longer exist (e.g. after Step 5).
# ══════════════════════════════════════════════════════════════════
if should_run(6):
    print("=" * 70)
    print("STEP 6: Clean up processed chunks (remove deleted control IDs)")
    print("=" * 70)
    if DRY_RUN and should_run(5):
        # The surviving-ID set depends on Step 5's deletes, so a dry run
        # covering both steps cannot predict the outcome.
        print("  [DRY RUN] Skipping — depends on Step 5 deletion")
    else:
        # Find chunks that reference non-existent controls
        cur.execute("""
            SELECT id, generated_control_ids
            FROM compliance.canonical_processed_chunks
            WHERE generated_control_ids IS NOT NULL
              AND generated_control_ids <> '[]'::jsonb
        """)
        chunks = cur.fetchall()
        print(f"  Chunks with generated_control_ids: {len(chunks)}")
        # Snapshot of all surviving control UUIDs (as text).
        cur.execute("SELECT id::text FROM compliance.canonical_controls")
        existing_ids = set(r[0] for r in cur.fetchall())
        print(f"  Existing controls: {len(existing_ids)}")
        cleaned = 0
        for chunk_id, control_ids in chunks:
            # jsonb may arrive as an undecoded string or a decoded list.
            if isinstance(control_ids, str):
                control_ids = json.loads(control_ids)
            if isinstance(control_ids, list):
                valid_ids = [cid for cid in control_ids if cid in existing_ids]
                if len(valid_ids) < len(control_ids):
                    # BUGFIX: previously the UPDATE executed even with
                    # --dry-run (never committed, but the printed count was
                    # misleading); also dropped the unused `removed` local.
                    if not DRY_RUN:
                        cur.execute("""
                            UPDATE compliance.canonical_processed_chunks
                            SET generated_control_ids = %s::jsonb
                            WHERE id = %s
                        """, (json.dumps(valid_ids), chunk_id))
                    cleaned += 1
        if DRY_RUN:
            print(f"  [DRY RUN] Chunks that would be cleaned: {cleaned}")
        else:
            print(f"  Chunks cleaned: {cleaned}")
    print()
# ══════════════════════════════════════════════════════════════════
# Final summary
# ══════════════════════════════════════════════════════════════════
if not DRY_RUN:
    conn.commit()
    print("=" * 70)
    print("COMMITTED. Final state:")
    print("=" * 70)
else:
    # Nothing was committed; any uncommitted work vanishes on close.
    print("=" * 70)
    print("[DRY RUN] No changes committed. Current state:")
    print("=" * 70)
cur.execute("""
    SELECT release_state, count(*)
    FROM compliance.canonical_controls
    GROUP BY 1
    ORDER BY count(*) DESC
""")
total = 0
active = 0
for state, cnt in cur.fetchall():
    total += cnt
    if state not in ('duplicate', 'too_close'):
        active += cnt
    print(f"  {state:15s}: {cnt:5d}")
print(f"\n  TOTAL: {total}")
print(f"  AKTIV: {active}")
conn.close()

View File

@@ -0,0 +1,655 @@
#!/usr/bin/env python3
"""
Phase 7.4: Generate new controls for gap articles via Anthropic Claude Sonnet.
Reads gap_analysis_results.json, extracts article text from PDFs,
calls Claude Sonnet to generate controls, inserts into DB.
Usage:
python3 phase74_generate_gap_controls.py --dry-run # show what would be generated
python3 phase74_generate_gap_controls.py # generate and insert
python3 phase74_generate_gap_controls.py --source "DSGVO" # filter by source
python3 phase74_generate_gap_controls.py --resume # skip already-generated articles
"""
import os
import sys
import json
import re
import time
import hashlib
import argparse
import psycopg2
import urllib.parse
import requests
from pathlib import Path
from collections import Counter
sys.path.insert(0, os.path.dirname(__file__))
from pdf_qa_all import (
SOURCE_FILE_MAP, read_file, classify_doc, normalize,
build_eu_article_index, build_de_law_index, build_nist_index,
build_owasp_index, build_generic_index, MAX_ARTICLES,
)
# ── Config ──────────────────────────────────────────────────────────
ANTHROPIC_URL = "https://api.anthropic.com/v1/messages"
# Model can be overridden via env; the API key must be set by the caller.
ANTHROPIC_MODEL = os.environ.get("CONTROL_GEN_ANTHROPIC_MODEL", "claude-sonnet-4-6")
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
PIPELINE_VERSION = 5
GAP_RESULTS_FILE = "/tmp/gap_analysis_results.json"
PDF_DIR = Path(os.path.expanduser("~/rag-ingestion/pdfs"))
# PyMuPDF is optional; fitz stays None when unavailable.
try:
    import fitz
except ImportError:
    fitz = None
# ── Source name → regulation_code reverse map ────────────────────────
# Built from REGULATION_LICENSE_MAP in control_generator.py
# Keys are the display names stored in source_citation->>'source'.
# NOTE(review): phase5 step 4 uses 'owasp_top10_2021'/'owasp_api_top10_2023'
# while this table uses 'owasp_top10'/'owasp_api_top10' — confirm which
# codes the DB actually stores.
SOURCE_TO_REGCODE = {
    "DSGVO (EU) 2016/679": "eu_2016_679",
    "KI-Verordnung (EU) 2024/1689": "eu_2024_1689",
    "NIS2-Richtlinie (EU) 2022/2555": "eu_2022_2555",
    "Cyber Resilience Act (CRA)": "eu_2024_2847",
    "Maschinenverordnung (EU) 2023/1230": "eu_2023_1230",
    "EU Blue Guide 2022": "eu_blue_guide_2022",
    "Markets in Crypto-Assets (MiCA)": "mica",
    "Batterieverordnung (EU) 2023/1542": "eu_2023_1542",
    "AML-Verordnung": "amlr",
    "Data Governance Act (DGA)": "dga",
    "Data Act": "data_act",
    "GPSR (EU) 2023/988": "gpsr",
    "IFRS-Übernahmeverordnung": "ifrs",
    "NIST SP 800-53 Rev. 5": "nist_sp800_53r5",
    "NIST SP 800-207 (Zero Trust)": "nist_sp800_207",
    "NIST SP 800-63-3": "nist_sp800_63_3",
    "NIST AI Risk Management Framework": "nist_ai_rmf",
    "NIST SP 800-218 (SSDF)": "nist_sp_800_218",
    "NIST Cybersecurity Framework 2.0": "nist_csf_2_0",
    "OWASP Top 10 (2021)": "owasp_top10",
    "OWASP ASVS 4.0": "owasp_asvs",
    "OWASP SAMM 2.0": "owasp_samm",
    "OWASP API Security Top 10 (2023)": "owasp_api_top10",
    "OWASP MASVS 2.0": "owasp_masvs",
    "ENISA ICS/SCADA Dependencies": "enisa_ics_scada",
    "ENISA Supply Chain Good Practices": "enisa_supply_chain",
    "CISA Secure by Design": "cisa_sbd",
    "Bundesdatenschutzgesetz (BDSG)": "bdsg",
    "Gewerbeordnung (GewO)": "gewo",
    "Handelsgesetzbuch (HGB)": "hgb",
    "Abgabenordnung (AO)": "ao",
    "OECD KI-Empfehlung": "oecd_ai_principles",
}
# License info per regulation code (from REGULATION_LICENSE_MAP)
# NOTE(review): the 'rule' values (1 vs 2) presumably encode reuse-policy
# tiers from REGULATION_LICENSE_MAP — confirm their exact semantics there.
LICENSE_MAP = {
    "eu_2016_679": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "eu_2024_1689": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "eu_2022_2555": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "eu_2024_2847": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "eu_2023_1230": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "eu_blue_guide_2022": {"license": "EU_PUBLIC", "rule": 1, "source_type": "guideline"},
    "mica": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "eu_2023_1542": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "amlr": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "dga": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "data_act": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "gpsr": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "ifrs": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "nist_sp800_53r5": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
    "nist_sp800_207": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
    "nist_sp800_63_3": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
    "nist_ai_rmf": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
    "nist_sp_800_218": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
    "nist_csf_2_0": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
    "owasp_top10": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
    "owasp_asvs": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
    "owasp_samm": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
    "owasp_api_top10": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
    "owasp_masvs": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
    "enisa_ics_scada": {"license": "CC-BY-4.0", "rule": 2, "source_type": "guideline"},
    "enisa_supply_chain": {"license": "CC-BY-4.0", "rule": 2, "source_type": "guideline"},
    "cisa_sbd": {"license": "US_GOV_PUBLIC", "rule": 1, "source_type": "guideline"},
    "bdsg": {"license": "DE_LAW", "rule": 1, "source_type": "law"},
    "gewo": {"license": "DE_LAW", "rule": 1, "source_type": "law"},
    "hgb": {"license": "DE_LAW", "rule": 1, "source_type": "law"},
    "ao": {"license": "DE_LAW", "rule": 1, "source_type": "law"},
    "oecd_ai_principles": {"license": "OECD_PUBLIC", "rule": 2, "source_type": "standard"},
}
# Domain detection keywords
# Lowercase substrings matched against text.lower() by detect_domain();
# mixed German/English stems so both languages score.
DOMAIN_KEYWORDS = {
    "AUTH": ["authentifizierung", "anmeldung", "login", "passwort", "identit", "identity", "credential"],
    "CRYP": ["verschlüsselung", "kryptogra", "encrypt", "cipher", "hash", "tls", "ssl", "signatur"],
    "NET": ["netzwerk", "network", "firewall", "router", "dns", "ip-adress"],
    "DATA": ["daten", "data", "personenbezogen", "datenschutz", "privacy", "gdpr", "dsgvo", "verarbeitung"],
    "LOG": ["protokoll", "logging", "audit", "nachvollzieh", "aufzeichn"],
    "ACC": ["zugriff", "access", "berechtigung", "autorisierung", "authorization", "rolle"],
    "SEC": ["sicherheit", "security", "schutz", "protect", "schwachstell", "vulnerab"],
    "INC": ["vorfall", "incident", "breach", "meldung", "reaktion", "response", "notfall"],
    "AI": ["künstliche intelligenz", "ki-system", "ai system", "machine learning", "algorithm", "hochrisiko-ki"],
    "COMP": ["compliance", "konformität", "audit", "zertifizierung", "regulier", "vorschrift"],
    "GOV": ["behörde", "aufsicht", "governance", "marktüberwachung", "authority"],
    "FIN": ["finanz", "zahlungs", "payment", "crypto", "krypto-", "geldwäsche", "aml"],
    "ENV": ["umwelt", "environment", "batterie", "recycling", "entsorgu", "nachhaltig"],
}
# ── Prompt (same as control_generator.py) ────────────────────────────
SYSTEM_PROMPT = """Du bist ein Security-Compliance-Experte. Strukturiere den gegebenen Text
als praxisorientiertes Security Control. Erstelle eine verständliche, umsetzbare Formulierung.
Antworte NUR mit validem JSON. Bei mehreren Controls antworte mit einem JSON-Array."""
APPLICABILITY_PROMPT = """- applicable_industries: Liste der Branchen fuer die dieses Control relevant ist.
Verwende ["all"] wenn der Control branchenuebergreifend gilt.
Moegliche Werte: "all", "Technologie / IT", "IT Dienstleistungen", "E-Commerce / Handel",
"Finanzdienstleistungen", "Versicherungen", "Gesundheitswesen", "Pharma", "Bildung",
"Beratung / Consulting", "Marketing / Agentur", "Produktion / Industrie",
"Logistik / Transport", "Immobilien", "Bau", "Energie", "Automobil",
"Luft- / Raumfahrt", "Maschinenbau", "Anlagenbau", "Automatisierung", "Robotik",
"Messtechnik", "Agrar", "Chemie", "Minen / Bergbau", "Telekommunikation",
"Medien / Verlage", "Gastronomie / Hotellerie", "Recht / Kanzlei",
"Oeffentlicher Dienst", "Verteidigung / Ruestung", "Wasser- / Abwasserwirtschaft",
"Lebensmittel", "Digitale Infrastruktur", "Weltraum", "Post / Kurierdienste",
"Abfallwirtschaft", "Forschung"
- applicable_company_size: Ab welcher Unternehmensgroesse gilt dieses Control?
Verwende ["all"] wenn keine Groessenbeschraenkung.
Moegliche Werte: "all", "micro", "small", "medium", "large", "enterprise"
- scope_conditions: null wenn keine besonderen Bedingungen, sonst:
{"requires_any": ["signal"], "description": "Erklaerung"}
Moegliche Signale: "uses_ai", "third_country_transfer", "processes_health_data",
"processes_minors_data", "automated_decisions", "employee_monitoring",
"video_surveillance", "financial_data", "is_kritis_operator", "payment_services" """
CATEGORY_LIST = [
"Datenschutz-Grundlagen", "Betroffenenrechte", "Technische Massnahmen",
"Organisatorische Massnahmen", "Auftragsverarbeitung", "Datentransfer",
"Risikomanagement", "Incident Response", "KI-Regulierung", "Cybersicherheit",
"Zugriffskontrolle", "Kryptographie", "Netzwerksicherheit", "Compliance-Management",
"Produktsicherheit", "Marktüberwachung", "Supply Chain Security",
"Finanzregulierung", "Arbeitsrecht", "Gewerberecht", "Handelsrecht",
"Umwelt / Nachhaltigkeit", "Dokumentation", "Schulung / Awareness",
]
CATEGORY_LIST_STR = ", ".join(f'"{c}"' for c in CATEGORY_LIST)
def build_prompt(source_name, article_label, article_text, license_type):
    """Build the user prompt for generating one control from a gap article.

    Embeds at most the first 3000 chars of *article_text* to bound token
    usage; *source_name*/*license_type* are cited so the model knows
    verbatim reuse of the legal text is permitted.
    """
    return f"""Strukturiere den folgenden Gesetzestext als Security/Compliance Control.
Du DARFST den Originaltext verwenden (Quelle: {source_name}, {license_type}).
WICHTIG: Erstelle eine verständliche, praxisorientierte Formulierung.
Der Originaltext wird separat gespeichert — deine Formulierung soll klar und umsetzbar sein.
Gib JSON zurück mit diesen Feldern:
- title: Kurzer prägnanter Titel (max 100 Zeichen)
- objective: Was soll erreicht werden? (1-3 Sätze)
- rationale: Warum ist das wichtig? (1-2 Sätze)
- requirements: Liste von konkreten Anforderungen (Strings)
- test_procedure: Liste von Prüfschritten (Strings)
- evidence: Liste von Nachweisdokumenten (Strings)
- severity: low/medium/high/critical
- tags: Liste von Tags
- domain: Fachgebiet als Kuerzel (AUTH=Authentifizierung, CRYP=Kryptographie, NET=Netzwerk, DATA=Datenschutz, LOG=Logging, ACC=Zugriffskontrolle, SEC=IT-Sicherheit, INC=Vorfallmanagement, AI=KI, COMP=Compliance, GOV=Behoerden/Verwaltung, LAB=Arbeitsrecht, FIN=Finanzregulierung, TRD=Gewerbe/Handelsrecht, ENV=Umwelt, HLT=Gesundheit)
- category: Inhaltliche Kategorie. Moegliche Werte: {CATEGORY_LIST_STR}
- target_audience: Liste der Zielgruppen (z.B. "unternehmen", "behoerden", "entwickler", "datenschutzbeauftragte", "geschaeftsfuehrung", "it-abteilung", "rechtsabteilung", "compliance-officer")
- source_article: Artikel-/Paragraphen-Referenz (z.B. "Artikel 10", "§ 42")
- source_paragraph: Absatz-Referenz (z.B. "Absatz 5", "Nr. 2")
{APPLICABILITY_PROMPT}
Text: {article_text[:3000]}
Quelle: {source_name}, {article_label}"""
# ── PDF article extraction ───────────────────────────────────────────
def extract_article_text(pdf_file, article_label, doc_type, full_text=None):
    """Extract the text of one article/section from a source document.

    Args:
        pdf_file: file handed to ``read_file`` when *full_text* is absent.
        article_label: e.g. "Artikel 10", "§ 42", or a NIST/OWASP heading.
        doc_type: "eu_regulation", "de_law", "nist", or anything else
            (generic heading search).
        full_text: optional pre-read document text (skips ``read_file``).

    Returns:
        Up to 3000 chars of the article's text, or "" when not found.
    """
    if full_text is None:
        full_text = read_file(pdf_file)
    if not full_text:
        return ""
    if doc_type == "eu_regulation":
        # "Artikel <n>" on its own line; slice until "Artikel <n+1>" or, if
        # that heading is absent, at most 5000 chars further.
        art_num_match = re.search(r'\d+', article_label)
        if not art_num_match:
            return ""
        num = int(art_num_match.group())
        match = re.search(rf'\nArtikel\s+{num}\s*\n', full_text)
        if not match:
            return ""
        start = match.start()
        next_match = re.search(rf'\nArtikel\s+{num + 1}\s*\n', full_text)
        end = next_match.start() if next_match else min(start + 5000, len(full_text))
        return full_text[start:end].strip()[:3000]
    elif doc_type == "de_law":
        # German statutes: "§ <n>". BUGFIX: the previous pattern
        # rf'\\s+{num}\b' searched for a literal backslash followed by "s"
        # and could never match a statute heading; it now anchors on the
        # section sign (with optional whitespace, so "§10" also matches).
        para_match = re.search(r'\d+', article_label)
        if not para_match:
            return ""
        num = int(para_match.group())
        match = re.search(rf'§\s*{num}\b', full_text)
        if not match:
            return ""
        start = match.start()
        next_match = re.search(rf'§\s*{num + 1}\b', full_text)
        end = next_match.start() if next_match else min(start + 5000, len(full_text))
        return full_text[start:end].strip()[:3000]
    elif doc_type == "nist":
        # Control label at the start of a line; take a fixed-size window.
        escaped = re.escape(article_label)
        match = re.search(rf'(?:^|\n)\s*{escaped}\b', full_text)
        if not match:
            return ""
        start = match.start()
        return full_text[start:start + 3000].strip()
    else:
        # Generic / OWASP / ENISA: label anywhere on a line.
        escaped = re.escape(article_label)
        match = re.search(rf'(?:^|\n).*{escaped}\b', full_text)
        if not match:
            return ""
        start = match.start()
        return full_text[start:start + 3000].strip()
# ── Anthropic API ────────────────────────────────────────────────────
def call_anthropic(prompt, system_prompt):
    """POST one message to the Anthropic API.

    Returns a 4-tuple ``(parsed_data, raw_text, usage, error)``; on any
    failure (transport error or non-200 status) *error* is a string and
    the other fields are empty.
    """
    request_headers = {
        "x-api-key": ANTHROPIC_API_KEY,
        "anthropic-version": "2023-06-01",
        "content-type": "application/json",
    }
    request_body = {
        "model": ANTHROPIC_MODEL,
        "max_tokens": 4096,
        "system": system_prompt,
        "messages": [{"role": "user", "content": prompt}],
    }
    try:
        resp = requests.post(ANTHROPIC_URL, headers=request_headers,
                             json=request_body, timeout=120)
        if resp.status_code != 200:
            return None, "", {}, f"HTTP {resp.status_code}: {resp.text[:200]}"
        data = resp.json()
        blocks = data.get("content")
        content = blocks[0]["text"] if blocks else ""
        usage = data.get("usage", {})
        return parse_json(content), content, usage, None
    except Exception as e:
        return None, "", {}, str(e)
def parse_json(text):
    """Parse a JSON object out of an LLM reply.

    Strips a markdown code fence if present, unwraps a top-level array to
    its first element, and falls back to grabbing the outermost ``{...}``
    span. Returns the parsed value or None.
    """
    text = text.strip()
    if text.startswith("```"):
        lines = text.split("\n")
        body = lines[1:]
        if lines[-1].strip().startswith("```"):
            body = lines[1:-1]
        text = "\n".join(body).strip()
    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        # Last resort: widest brace-delimited span in the reply.
        brace_span = re.search(r'\{[\s\S]*\}', text)
        if brace_span is None:
            return None
        try:
            return json.loads(brace_span.group())
        except json.JSONDecodeError:
            return None
    if isinstance(data, list):
        return data[0] if data else None
    return data
# ── Domain detection ─────────────────────────────────────────────────
def detect_domain(text):
    """Pick the domain code whose keyword list has the most hits in *text*.

    Ties keep the first domain in DOMAIN_KEYWORDS order; falls back to
    "SEC" when no keyword matches at all.
    """
    haystack = text.lower()
    best_domain = "SEC"
    best_score = 0
    for domain, keywords in DOMAIN_KEYWORDS.items():
        hits = sum(1 for kw in keywords if kw in haystack)
        if hits > best_score:
            best_domain, best_score = domain, hits
    return best_domain
# ── Control ID generation ────────────────────────────────────────────
def generate_control_id(domain, cur):
    """Return the next free control_id for a domain prefix.

    Takes MAX over the numeric suffix so plain string ordering quirks
    (e.g. COMP-99 sorting after COMP-1000) cannot cause collisions.
    """
    prefix = domain.upper()[:4]
    cur.execute("""
        SELECT MAX(CAST(SPLIT_PART(control_id, '-', 2) AS INTEGER))
        FROM compliance.canonical_controls
        WHERE control_id LIKE %s
        AND SPLIT_PART(control_id, '-', 2) ~ '^[0-9]+$'
    """, (f"{prefix}-%",))
    row = cur.fetchone()
    highest = row[0] if row else None
    if highest is None:
        # No numeric suffix seen yet for this prefix.
        return f"{prefix}-001"
    return f"{prefix}-{highest + 1}"
# ── Main ─────────────────────────────────────────────────────────────
def main():
    """Phase 7.4 driver: generate draft controls for gap articles.

    Loads gap-analysis results, pulls each gap article's text from the
    mapped source PDF, asks the Anthropic API for a draft control and
    inserts it into compliance.canonical_controls. Supports --dry-run,
    --source substring filtering and --resume (skip covered articles).
    """
    parser = argparse.ArgumentParser(description="Phase 7.4: Generate controls for gap articles")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be generated")
    parser.add_argument("--source", type=str, help="Filter by source name substring")
    parser.add_argument("--resume", action="store_true", help="Skip articles that already have controls")
    parser.add_argument("--results", default=GAP_RESULTS_FILE, help="Path to gap_analysis_results.json")
    args = parser.parse_args()
    if not ANTHROPIC_API_KEY:
        print("ERROR: Set ANTHROPIC_API_KEY")
        sys.exit(1)
    # Load gap results
    with open(args.results) as f:
        gaps = json.load(f)
    total_gaps = sum(len(g["gap_articles"]) for g in gaps)
    print(f"Loaded {len(gaps)} sources with {total_gaps} gap articles")
    if args.source:
        gaps = [g for g in gaps if args.source.lower() in g["source"].lower()]
        total_gaps = sum(len(g["gap_articles"]) for g in gaps)
        print(f"Filtered to {len(gaps)} sources, {total_gaps} gaps")
    # DB connection with keepalive + reconnect helper
    db_url = os.environ['DATABASE_URL']
    parsed = urllib.parse.urlparse(db_url)
    def connect_db():
        """Create DB connection with TCP keepalive. Returns (conn, cursor)."""
        c = psycopg2.connect(
            host=parsed.hostname, port=parsed.port or 5432,
            user=parsed.username, password=parsed.password,
            dbname=parsed.path.lstrip('/'),
            options="-c search_path=compliance,public",
            keepalives=1, keepalives_idle=30,
            keepalives_interval=10, keepalives_count=5,
        )
        return c, c.cursor()
    conn, cur = connect_db()
    def ensure_db():
        """Reconnect if the connection is dead; True when a reconnect happened."""
        nonlocal conn, cur
        try:
            cur.execute("SELECT 1")
        except Exception:
            print(" [RECONNECT] DB connection lost, reconnecting...")
            try:
                conn.close()
            except Exception:
                pass
            conn, cur = connect_db()
            return True
        return False
    # Get framework UUID
    cur.execute("SELECT id FROM compliance.canonical_control_frameworks WHERE framework_id = 'bp_security_v1' LIMIT 1")
    fw_row = cur.fetchone()
    if not fw_row:
        print("ERROR: Framework bp_security_v1 not found")
        sys.exit(1)
    framework_uuid = fw_row[0]
    # If resuming, load existing articles per source
    existing_articles = {}
    if args.resume:
        cur.execute("""
            SELECT source_citation->>'source', source_citation->>'article'
            FROM compliance.canonical_controls
            WHERE source_citation->>'article' IS NOT NULL
        """)
        for src, art in cur.fetchall():
            existing_articles.setdefault(src, set()).add(art)
        print(f"Resume mode: {sum(len(v) for v in existing_articles.values())} existing article-control pairs")
    # Stats
    stats = Counter()
    total_input_tokens = 0
    total_output_tokens = 0
    generated_ids = []
    errors = []
    t_start = time.time()
    # Pre-read PDFs (cache full text per source)
    pdf_cache = {}
    # Process sources with the most gaps first.
    for gap_source in sorted(gaps, key=lambda g: -len(g["gap_articles"])):
        source_name = gap_source["source"]
        gap_articles = gap_source["gap_articles"]
        filename = SOURCE_FILE_MAP.get(source_name)
        reg_code = SOURCE_TO_REGCODE.get(source_name, "unknown")
        license_info = LICENSE_MAP.get(reg_code, {"license": "UNKNOWN", "rule": 1, "source_type": "unknown"})
        doc_type = classify_doc(source_name)
        if not filename:
            stats["skipped_no_pdf"] += len(gap_articles)
            continue
        # Read PDF once per source
        if source_name not in pdf_cache:
            pdf_cache[source_name] = read_file(filename)
        full_text = pdf_cache[source_name]
        if not full_text:
            stats["skipped_no_pdf"] += len(gap_articles)
            continue
        print(f"\n{'='*70}")
        print(f"{source_name}{len(gap_articles)} gaps (rule {license_info['rule']}, {doc_type})")
        print(f"{'='*70}")
        for gap in gap_articles:
            article_label = gap["label"]
            article_type = gap["type"]
            # Skip if already has controls (resume mode)
            if args.resume and article_label in existing_articles.get(source_name, set()):
                stats["skipped_exists"] += 1
                continue
            # Skip non-substantive NIST sections (intro chapters)
            if doc_type == "nist" and article_type == "section":
                section_match = re.match(r'Section (\d+)', article_label)
                if section_match and int(section_match.group(1)) <= 3:
                    stats["skipped_intro"] += 1
                    continue
            # Extract article text
            article_text = extract_article_text(filename, article_label, doc_type, full_text)
            if not article_text or len(article_text) < 30:
                stats["skipped_short_text"] += 1
                print(f" SKIP {article_label}: text too short ({len(article_text)} chars)")
                continue
            if args.dry_run:
                print(f" [DRY] {article_label} ({len(article_text)} chars)")
                stats["would_generate"] += 1
                continue
            # Call Anthropic
            prompt = build_prompt(source_name, article_label, article_text, license_info["license"])
            data, raw, usage, error = call_anthropic(prompt, SYSTEM_PROMPT)
            total_input_tokens += usage.get("input_tokens", 0)
            total_output_tokens += usage.get("output_tokens", 0)
            if error:
                stats["api_error"] += 1
                errors.append(f"{source_name} {article_label}: {error}")
                print(f" ERROR {article_label}: {error}")
                time.sleep(5)
                continue
            if not data:
                stats["parse_error"] += 1
                print(f" PARSE ERROR {article_label}")
                continue
            # Ensure DB is alive before writing
            ensure_db()
            # Build control — coerce every LLM-provided field defensively,
            # since the model may return the wrong JSON type.
            title = str(data.get("title", ""))[:200]
            objective = str(data.get("objective", ""))
            rationale = str(data.get("rationale", ""))
            domain = str(data.get("domain", detect_domain(article_text))).upper()[:4]
            if not domain or len(domain) < 2:
                domain = detect_domain(article_text)
            control_id = generate_control_id(domain, cur)
            severity = str(data.get("severity", "medium")).lower()
            if severity not in ("low", "medium", "high", "critical"):
                severity = "medium"
            requirements = data.get("requirements", [])
            if not isinstance(requirements, list):
                requirements = [str(requirements)]
            test_procedure = data.get("test_procedure", [])
            if not isinstance(test_procedure, list):
                test_procedure = [str(test_procedure)]
            evidence = data.get("evidence", [])
            if not isinstance(evidence, list):
                evidence = [str(evidence)]
            tags = data.get("tags", [])
            if not isinstance(tags, list):
                tags = []
            target_audience = data.get("target_audience", [])
            if not isinstance(target_audience, list):
                target_audience = []
            applicable_industries = data.get("applicable_industries", ["all"])
            if not isinstance(applicable_industries, list):
                applicable_industries = ["all"]
            applicable_company_size = data.get("applicable_company_size", ["all"])
            if not isinstance(applicable_company_size, list):
                applicable_company_size = ["all"]
            scope_conditions = data.get("scope_conditions")
            source_citation = {
                "source": source_name,
                "article": data.get("source_article", article_label),
                "paragraph": data.get("source_paragraph", ""),
                "article_type": article_type,
                "license": license_info["license"],
                "source_type": license_info["source_type"],
            }
            generation_metadata = {
                "processing_path": "phase74_gap_fill",
                "license_rule": license_info["rule"],
                "source_regulation": reg_code,
                "source_article": article_label,
                "gap_fill": True,
            }
            category = str(data.get("category", "")) or None
            # Insert into DB
            try:
                cur.execute("""
                    INSERT INTO compliance.canonical_controls (
                        framework_id, control_id, title, objective, rationale,
                        scope, requirements, test_procedure, evidence,
                        severity, risk_score, implementation_effort,
                        open_anchors, release_state, tags,
                        license_rule, source_original_text, source_citation,
                        customer_visible, generation_metadata,
                        verification_method, category, generation_strategy,
                        target_audience, pipeline_version,
                        applicable_industries, applicable_company_size, scope_conditions
                    ) VALUES (
                        %s, %s, %s, %s, %s,
                        %s, %s, %s, %s,
                        %s, %s, %s,
                        %s, %s, %s,
                        %s, %s, %s,
                        %s, %s,
                        %s, %s, %s,
                        %s, %s,
                        %s, %s, %s
                    )
                    ON CONFLICT (framework_id, control_id) DO NOTHING
                    RETURNING id
                """, (
                    framework_uuid, control_id, title, objective, rationale,
                    json.dumps({}), json.dumps(requirements), json.dumps(test_procedure), json.dumps(evidence),
                    severity, 5, "m",
                    json.dumps([]), "draft", json.dumps(tags),
                    license_info["rule"], article_text, json.dumps(source_citation),
                    True, json.dumps(generation_metadata),
                    "document", category, "phase74_gap_fill",
                    json.dumps(target_audience), PIPELINE_VERSION,
                    json.dumps(applicable_industries), json.dumps(applicable_company_size),
                    json.dumps(scope_conditions) if scope_conditions else None,
                ))
                conn.commit()
                # NOTE(review): fetchone() after commit() relies on the driver
                # buffering results client-side — confirm against psycopg2 docs.
                row = cur.fetchone()
                if row:
                    generated_ids.append(str(row[0]))
                    stats["generated"] += 1
                    print(f" OK {control_id}: {title[:60]}")
                else:
                    stats["conflict"] += 1
                    print(f" CONFLICT {control_id} (already exists)")
            except Exception as e:
                conn.rollback()
                stats["db_error"] += 1
                errors.append(f"DB {control_id}: {str(e)[:100]}")
                print(f" DB ERROR {control_id}: {str(e)[:100]}")
            # Rate limit: ~0.5s between calls
            time.sleep(0.5)
    # ── Summary ──────────────────────────────────────────────────────
    elapsed = time.time() - t_start
    # Cost assumes $3/M input and $15/M output tokens — TODO confirm per model.
    cost = (total_input_tokens * 3 + total_output_tokens * 15) / 1_000_000
    print(f"\n\n{'='*70}")
    print(f"PHASE 7.4 — {'DRY-RUN' if args.dry_run else 'ERGEBNIS'}")
    print(f"{'='*70}")
    print(f" Laufzeit: {elapsed/60:.1f} min")
    print(f" API-Kosten: ${cost:.2f}")
    print(f" Input Tokens: {total_input_tokens:,}")
    print(f" Output Tokens: {total_output_tokens:,}")
    print()
    for key in sorted(stats.keys()):
        print(f" {key:<25s}: {stats[key]:5d}")
    print()
    if generated_ids:
        print(f" Neue Control-IDs: {len(generated_ids)}")
        # Save generated IDs
        with open("/tmp/phase74_generated_ids.json", 'w') as f:
            json.dump(generated_ids, f)
        print(f" IDs gespeichert: /tmp/phase74_generated_ids.json")
    if errors:
        print(f"\n Fehler ({len(errors)}):")
        for e in errors[:20]:
            print(f" {e}")
        if len(errors) > 20:
            print(f" ... und {len(errors)-20} weitere")
    conn.close()
# Script entry point.
if __name__ == "__main__":
    main()

218
scripts/qa/run_job.sh Executable file
View File

@@ -0,0 +1,218 @@
#!/usr/bin/env bash
# ─────────────────────────────────────────────────────────────
# Robust job runner for QA scripts on Mac Mini
#
# Usage:
# ./run_job.sh <script.py> [args...] # start job
# ./run_job.sh --status # show running jobs
# ./run_job.sh --kill <script.py> # kill a running job
# ./run_job.sh --log <script.py> # tail log
#
# Features:
# - Loads .env automatically (COMPLIANCE_DATABASE_URL → DATABASE_URL)
# - PID-file prevents duplicate runs
# - Unbuffered Python output
# - Structured log files in /tmp/qa_jobs/
# ─────────────────────────────────────────────────────────────
# Abort on errors, unset variables, and failed pipeline stages.
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"
# All PID and log files live under one well-known directory.
JOB_DIR="/tmp/qa_jobs"
mkdir -p "$JOB_DIR"
# Source the project .env (exporting every variable it defines) and fall
# back to COMPLIANCE_DATABASE_URL for DATABASE_URL when the latter is unset.
load_env() {
    local dotenv="$PROJECT_DIR/.env"
    if [[ -f "$dotenv" ]]; then
        set -a
        # shellcheck disable=SC1090
        source "$dotenv"
        set +a
    fi
    if [[ -z "${DATABASE_URL:-}" ]] && [[ -n "${COMPLIANCE_DATABASE_URL:-}" ]]; then
        export DATABASE_URL="$COMPLIANCE_DATABASE_URL"
    fi
}
# ── Job name from script path ─────────────────────────────────
# Derive a stable job name from the script filename (strips .py).
job_name() {
    basename "$1" .py
}
# PID file path for a given script.
pid_file() {
    echo "$JOB_DIR/$(job_name "$1").pid"
}
# Log file path for a given script.
log_file() {
    echo "$JOB_DIR/$(job_name "$1").log"
}
# ── Status ────────────────────────────────────────────────────
# Print a status table for all known jobs and reap stale PID files.
show_status() {
    echo "═══════════════════════════════════════════════════════"
    echo "QA Job Status ($(date '+%Y-%m-%d %H:%M:%S'))"
    echo "═══════════════════════════════════════════════════════"
    local found=0
    for pidfile in "$JOB_DIR"/*.pid; do
        [[ -f "$pidfile" ]] || continue
        found=1
        local name
        name=$(basename "$pidfile" .pid)
        local pid
        pid=$(cat "$pidfile")
        local logf="$JOB_DIR/$name.log"
        if kill -0 "$pid" 2>/dev/null; then
            local lines
            lines=$(wc -l < "$logf" 2>/dev/null || echo 0)
            # BUGFIX: `grep -c` prints "0" AND exits non-zero on no match, so
            # the old `$(grep -c ... || echo 0)` captured TWO lines ("0\n0").
            # Assign first, then fall back only when the command itself fails.
            local errors
            errors=$(grep -c "ERROR" "$logf" 2>/dev/null) || errors=0
            local last_line
            last_line=$(tail -1 "$logf" 2>/dev/null || echo "(empty)")
            echo "$name (PID $pid) — RUNNING"
            echo " Log: $logf ($lines lines, $errors errors)"
            echo " Last: $last_line"
        else
            echo "$name (PID $pid) — STOPPED"
            echo " Log: $logf"
            # Process is gone: remove the stale PID file.
            rm -f "$pidfile"
        fi
        echo ""
    done
    if [[ $found -eq 0 ]]; then
        echo " No jobs running."
    fi
}
# ── Kill ──────────────────────────────────────────────────────
# Stop a running job identified by its script name; the PID file is
# removed afterwards regardless of whether the process was still alive.
kill_job() {
    local script="$1" pf pid
    pf=$(pid_file "$script")
    [[ -f "$pf" ]] || { echo "No PID file for $(job_name "$script")"; return 1; }
    pid=$(cat "$pf")
    if ! kill -0 "$pid" 2>/dev/null; then
        echo "Process $pid already stopped"
    else
        kill "$pid"
        echo "Killed $(job_name "$script") (PID $pid)"
    fi
    rm -f "$pf"
}
# ── Tail log ──────────────────────────────────────────────────
# Show the last 50 lines of a job's log file.
tail_log() {
    local lf
    lf=$(log_file "$1")
    [[ -f "$lf" ]] || { echo "No log file: $lf"; return 1; }
    tail -50 "$lf"
}
# ── Start job ─────────────────────────────────────────────────
# Launch one QA script in the background with its own log and PID file.
# Refuses to start a second instance of the same job.
start_job() {
    local script="$1"
    shift
    local args=("$@")
    # Resolve script path (as given first, then relative to this directory)
    local script_path="$script"
    if [[ ! -f "$script_path" ]]; then
        script_path="$SCRIPT_DIR/$script"
    fi
    if [[ ! -f "$script_path" ]]; then
        echo "ERROR: Script not found: $script"
        return 1
    fi
    local name
    name=$(job_name "$script")
    local pf
    pf=$(pid_file "$script")
    local lf
    lf=$(log_file "$script")
    # Check for already-running instance
    if [[ -f "$pf" ]]; then
        local existing_pid
        existing_pid=$(cat "$pf")
        if kill -0 "$existing_pid" 2>/dev/null; then
            echo "ERROR: $name already running (PID $existing_pid)"
            echo "Use: $0 --kill $script"
            return 1
        fi
        rm -f "$pf"
    fi
    # Load environment
    load_env
    # Verify required env vars
    if [[ -z "${DATABASE_URL:-}" ]]; then
        echo "ERROR: DATABASE_URL not set (checked .env)"
        return 1
    fi
    # Start
    echo "Starting $name..."
    echo " Script: $script_path"
    echo " Args: ${args[*]:-none}"
    echo " Log: $lf"
    # BUGFIX: under `set -u`, expanding "${args[@]}" on an EMPTY array aborts
    # the script on bash < 4.4 (macOS ships bash 3.2, and this runner targets
    # a Mac Mini). ${args[@]+"${args[@]}"} expands to nothing when the array
    # is empty and to the properly quoted args otherwise.
    nohup python3 -u "$script_path" ${args[@]+"${args[@]}"} > "$lf" 2>&1 &
    local pid=$!
    echo "$pid" > "$pf"
    echo " PID: $pid"
    echo ""
    # Wait a moment and check it started OK
    sleep 3
    if ! kill -0 "$pid" 2>/dev/null; then
        echo "ERROR: Process died immediately. Log output:"
        cat "$lf"
        rm -f "$pf"
        return 1
    fi
    local lines
    lines=$(wc -l < "$lf" 2>/dev/null || echo 0)
    echo "Running OK ($lines log lines so far)"
    echo "Monitor with: $0 --status"
    echo "Tail log: $0 --log $script"
}
# ── Main ──────────────────────────────────────────────────────
# Dispatch: known flags are handled explicitly; an empty argument prints
# usage; anything else is treated as a script to start (with its args).
case "${1:-}" in
    --status|-s)
        show_status
        ;;
    --kill|-k)
        [[ -n "${2:-}" ]] || { echo "Usage: $0 --kill <script.py>"; exit 1; }
        kill_job "$2"
        ;;
    --log|-l)
        [[ -n "${2:-}" ]] || { echo "Usage: $0 --log <script.py>"; exit 1; }
        tail_log "$2"
        ;;
    --help|-h|"")
        echo "Usage:"
        echo " $0 <script.py> [args...] Start a QA job"
        echo " $0 --status Show running jobs"
        echo " $0 --kill <script.py> Kill a running job"
        echo " $0 --log <script.py> Tail job log"
        ;;
    *)
        start_job "$@"
        ;;
esac

307
scripts/qa/sync_db.py Normal file
View File

@@ -0,0 +1,307 @@
#!/usr/bin/env python3
"""Sync canonical control tables between production and local DB.
Modes:
--pull Production → Local (initial sync, full table copy)
--push Local → Production (incremental, only new obligation_candidates)
--loop Run --push every N minutes (default 60)
Usage:
python3 sync_db.py --pull # Full sync production → local
python3 sync_db.py --push # Push new obligations to production
python3 sync_db.py --loop 60 # Push every 60 minutes
python3 sync_db.py --pull --tables canonical_controls # Only one table
"""
import argparse
import json
import os
import sys
import time
import urllib.parse
import io
import psycopg2
import psycopg2.extras
import psycopg2.extensions
# Register JSON adapter so dicts are automatically converted to JSONB
psycopg2.extensions.register_adapter(dict, psycopg2.extras.Json)
# ── DB Config ────────────────────────────────────────────────────────
# SECURITY NOTE(review): a production password is hard-coded below as the
# fallback default. Rotate this credential and require PROD_DATABASE_URL
# from the environment instead of shipping a secret in source control.
PROD_URL = os.environ.get(
    "PROD_DATABASE_URL",
    "postgresql://postgres:GmyFD3wnU1NrKBdpU1nwLdE8MLts0A0eez8L5XXdvUCe05lWnWfVp3C6JJ8Yrmt2"
    "@46.225.100.82:54321/postgres?sslmode=require",
)
LOCAL_URL = os.environ.get(
    "LOCAL_DATABASE_URL",
    "postgresql://breakpilot:breakpilot123@localhost:5432/breakpilot_db",
)
# All synced tables live in this schema on both sides.
SCHEMA = "compliance"
# Tables to sync (production → local)
SYNC_TABLES = [
    "canonical_control_frameworks",
    "canonical_control_licenses",
    "canonical_control_sources",
    "canonical_control_categories",
    "canonical_blocked_sources",
    "canonical_controls",
    "canonical_control_mappings",
    "canonical_processed_chunks",
    "canonical_generation_jobs",
    "control_patterns",
    "crosswalk_matrix",
    "obligation_extractions",
    "obligation_candidates",
]
def connect(url, label="DB"):
    """Open a psycopg2 connection with TCP keepalives.

    The search path is pinned to the compliance schema; autocommit is
    disabled so callers control transaction boundaries.
    """
    parts = urllib.parse.urlparse(url)
    query_params = dict(urllib.parse.parse_qsl(parts.query))
    connection = psycopg2.connect(
        host=parts.hostname,
        port=parts.port or 5432,
        user=parts.username,
        password=parts.password,
        dbname=parts.path.lstrip("/"),
        sslmode=query_params.get("sslmode", "prefer"),
        options=f"-c search_path={SCHEMA},public",
        keepalives=1,
        keepalives_idle=30,
        keepalives_interval=10,
        keepalives_count=5,
    )
    connection.autocommit = False
    print(f" Connected to {label} ({parts.hostname}:{parts.port or 5432})")
    return connection
def get_columns(cur, table):
    """Return the column names of *table* in ordinal order.

    Uses bound parameters instead of f-string interpolation so a table
    name containing a quote cannot break (or inject into) the query.
    """
    cur.execute(
        """
        SELECT column_name FROM information_schema.columns
        WHERE table_schema = %s AND table_name = %s
        ORDER BY ordinal_position
        """,
        (SCHEMA, table),
    )
    return [r[0] for r in cur.fetchall()]
def pull_table(prod_conn, local_conn, table):
    """Copy one entire table from production to local.

    Drops and recreates the local table with a constraint-free schema
    derived from information_schema, then bulk-inserts all rows.
    Returns the number of rows copied (0 when the table is missing).
    """
    prod_cur = prod_conn.cursor()
    local_cur = local_conn.cursor()
    # Check table exists on production
    prod_cur.execute(f"""
        SELECT 1 FROM pg_tables
        WHERE schemaname = '{SCHEMA}' AND tablename = '{table}'
    """)
    if not prod_cur.fetchone():
        print(f" SKIP {table} — not found on production")
        return 0
    # Drop local table
    local_cur.execute(f"DROP TABLE IF EXISTS {SCHEMA}.{table} CASCADE")
    local_conn.commit()
    # Build simple CREATE TABLE (no constraints, no defaults — just for data)
    prod_cur.execute(f"""
        SELECT column_name, data_type, udt_name, character_maximum_length
        FROM information_schema.columns
        WHERE table_schema = '{SCHEMA}' AND table_name = '{table}'
        ORDER BY ordinal_position
    """)
    col_defs = prod_cur.fetchall()
    parts = []
    col_names = []
    jsonb_cols = set()
    for name, dtype, udt, max_len in col_defs:
        col_names.append(name)
        if dtype == "ARRAY":
            # information_schema reports arrays via the "_"-prefixed udt name.
            type_map = {
                "_text": "text[]", "_varchar": "varchar[]",
                "_int4": "integer[]", "_uuid": "uuid[]",
                "_jsonb": "jsonb[]", "_float8": "float8[]",
            }
            sql_type = type_map.get(udt, f"{udt.lstrip('_')}[]")
        elif dtype == "USER-DEFINED" and udt == "jsonb":
            sql_type = "jsonb"
            jsonb_cols.add(name)
        elif dtype == "USER-DEFINED":
            sql_type = udt
        elif dtype == "jsonb":
            sql_type = "jsonb"
            jsonb_cols.add(name)
        elif max_len:
            # BUGFIX: was f"{dtype}(70,526)" — a hard-coded, syntactically
            # invalid length pair that broke the DDL for any varchar(n)
            # column. Use the column's actual character_maximum_length.
            sql_type = f"{dtype}({max_len})"
        else:
            sql_type = dtype
        parts.append(f'"{name}" {sql_type}')
    ddl = f"CREATE TABLE {SCHEMA}.{table} ({', '.join(parts)})"
    local_cur.execute(ddl)
    local_conn.commit()
    # Fetch all rows from production
    col_list = ", ".join(f'"{c}"' for c in col_names)
    prod_cur.execute(f"SELECT {col_list} FROM {SCHEMA}.{table}")
    rows = prod_cur.fetchall()
    if rows:
        # Wrap dict/list values in Json for JSONB columns
        adapted_rows = []
        for row in rows:
            adapted = []
            for i, val in enumerate(row):
                if col_names[i] in jsonb_cols and isinstance(val, (dict, list)):
                    adapted.append(psycopg2.extras.Json(val))
                else:
                    adapted.append(val)
            adapted_rows.append(tuple(adapted))
        placeholders = ", ".join(["%s"] * len(col_names))
        insert_sql = f'INSERT INTO {SCHEMA}.{table} ({col_list}) VALUES ({placeholders})'
        psycopg2.extras.execute_batch(local_cur, insert_sql, adapted_rows, page_size=500)
        local_conn.commit()
    print(f" {table}: {len(rows)} rows")
    return len(rows)
def pull(tables=None):
    """Full sync: copy every sync table from production into the local DB.

    *tables* optionally restricts the run to a subset of SYNC_TABLES.
    Errors on one table are logged and do not stop the remaining tables.
    """
    print("\n=== PULL: Production → Local ===\n")
    prod_conn = connect(PROD_URL, "Production")
    local_conn = connect(LOCAL_URL, "Local")
    # Make sure the target schema exists before creating tables in it.
    schema_cur = local_conn.cursor()
    schema_cur.execute(f"CREATE SCHEMA IF NOT EXISTS {SCHEMA}")
    local_conn.commit()
    total = 0
    for table in (tables if tables else SYNC_TABLES):
        try:
            total += pull_table(prod_conn, local_conn, table)
        except Exception as e:
            print(f" ERROR {table}: {e}")
            local_conn.rollback()
            prod_conn.rollback()
    print(f"\n Total: {total} rows synced")
    prod_conn.close()
    local_conn.close()
def push():
    """Incremental push of new obligation_candidates local → production.

    Diffs candidate_id sets on both sides and inserts only the missing
    rows (ON CONFLICT DO NOTHING guards against races). Returns the
    number of rows pushed.
    """
    print(f"\n=== PUSH: Local → Production ({time.strftime('%H:%M:%S')}) ===\n")
    local_conn = connect(LOCAL_URL, "Local")
    prod_conn = connect(PROD_URL, "Production")
    local_cur = local_conn.cursor()
    prod_cur = prod_conn.cursor()
    # Find obligation_candidates in local that don't exist in production
    # Use candidate_id as the unique key
    local_cur.execute(f"""
        SELECT candidate_id FROM {SCHEMA}.obligation_candidates
    """)
    local_ids = {r[0] for r in local_cur.fetchall()}
    if not local_ids:
        print(" No obligation_candidates in local DB")
        local_conn.close()
        prod_conn.close()
        return 0
    # Check which already exist on production
    prod_cur.execute(f"""
        SELECT candidate_id FROM {SCHEMA}.obligation_candidates
    """)
    prod_ids = {r[0] for r in prod_cur.fetchall()}
    new_ids = local_ids - prod_ids
    if not new_ids:
        print(f" All {len(local_ids)} obligations already on production")
        local_conn.close()
        prod_conn.close()
        return 0
    print(f" {len(new_ids)} new obligations to push (local: {len(local_ids)}, prod: {len(prod_ids)})")
    # Get columns
    columns = get_columns(local_cur, "obligation_candidates")
    col_list = ", ".join(columns)
    placeholders = ", ".join(["%s"] * len(columns))
    # Fetch new rows from local.
    # BUGFIX: ids were previously spliced into the SQL as f-string quoted
    # literals, which breaks (and is injectable) for ids containing quotes.
    # Bind them as a single array parameter instead.
    local_cur.execute(
        f"""
        SELECT {col_list} FROM {SCHEMA}.obligation_candidates
        WHERE candidate_id = ANY(%s)
        """,
        (list(new_ids),),
    )
    rows = local_cur.fetchall()
    # Insert into production
    insert_sql = f"INSERT INTO {SCHEMA}.obligation_candidates ({col_list}) VALUES ({placeholders}) ON CONFLICT DO NOTHING"
    psycopg2.extras.execute_batch(prod_cur, insert_sql, rows, page_size=100)
    prod_conn.commit()
    print(f" Pushed {len(rows)} obligations to production")
    local_conn.close()
    prod_conn.close()
    return len(rows)
def loop(interval_min):
    """Run push() every *interval_min* minutes until interrupted.

    A failing push is logged and retried on the next cycle. Ctrl+C
    (KeyboardInterrupt) now exits cleanly — the banner promises "Press
    Ctrl+C to stop", but the old code dumped a traceback instead.
    """
    print(f"\n=== SYNC LOOP — Push every {interval_min} min ===")
    print(f" Started at {time.strftime('%Y-%m-%d %H:%M:%S')}")
    print(" Press Ctrl+C to stop\n")
    try:
        while True:
            try:
                pushed = push()
                if pushed:
                    print(f" Next sync in {interval_min} min...")
            except Exception as e:
                # Keep the loop alive across transient DB/network failures.
                print(f" SYNC ERROR: {e}")
            time.sleep(interval_min * 60)
    except KeyboardInterrupt:
        print("\n Sync loop stopped.")
def main():
    """CLI entry point: dispatch --pull / --push / --loop modes."""
    parser = argparse.ArgumentParser(description="Sync canonical control tables")
    parser.add_argument("--pull", action="store_true", help="Production → Local (full copy)")
    parser.add_argument("--push", action="store_true", help="Local → Production (new obligations)")
    parser.add_argument("--loop", type=int, metavar="MIN", help="Push every N minutes")
    parser.add_argument("--tables", nargs="+", help="Only sync specific tables (with --pull)")
    args = parser.parse_args()
    # Without any mode flag there is nothing to do — show usage.
    if not (args.pull or args.push or args.loop):
        parser.print_help()
        return
    if args.pull:
        pull(args.tables)
    if args.push:
        push()
    if args.loop:
        loop(args.loop)
# Script entry point.
if __name__ == "__main__":
    main()

470
scripts/qa/test_pass0a.py Normal file
View File

@@ -0,0 +1,470 @@
#!/usr/bin/env python3
"""Test Pass 0a (Obligation Extraction) on 5-10 controls.
Standalone script — no SQLAlchemy dependency. Uses psycopg2 + requests.
Copies prompts and quality gate from decomposition_pass.py.
Usage:
python3 test_pass0a.py # 10 controls, Anthropic
python3 test_pass0a.py --limit 5 # 5 controls
python3 test_pass0a.py --source "DSGVO" # filter by source
python3 test_pass0a.py --dry-run # show controls, no LLM call
"""
import argparse
import json
import os
import re
import sys
import time
import urllib.parse
import psycopg2
import requests
# ── Config ────────────────────────────────────────────────────────────
# API key must come from the environment; the script refuses to run without it.
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
# Model is overridable via DECOMPOSITION_LLM_MODEL.
ANTHROPIC_MODEL = os.environ.get("DECOMPOSITION_LLM_MODEL", "claude-sonnet-4-6")
ANTHROPIC_API_URL = "https://api.anthropic.com/v1"
# ── Prompts (from decomposition_pass.py) ──────────────────────────────
# German system prompt for Pass 0a. It is sent verbatim to the API, so the
# text itself is runtime data and must not be altered or translated here.
SYSTEM_PROMPT = """\
Du bist ein Rechts-Compliance-Experte. Du zerlegst Compliance-Controls \
in einzelne atomare Pflichten.
REGELN (STRIKT EINHALTEN):
1. Nur normative Aussagen extrahieren — erkennbar an: müssen, haben \
sicherzustellen, sind verpflichtet, ist zu dokumentieren, ist zu melden, \
ist zu testen, shall, must, required.
2. Jede Pflicht hat genau EIN Hauptverb / eine Handlung.
3. Testpflichten SEPARAT von operativen Pflichten (is_test_obligation=true).
4. Meldepflichten SEPARAT (is_reporting_obligation=true).
5. NICHT auf Evidence-Ebene zerlegen (z.B. "DR-Plan vorhanden" ist KEIN \
eigenes Control, sondern Evidence).
6. Begründungen, Erläuterungen und Erwägungsgründe sind KEINE Pflichten \
— NICHT extrahieren.
Antworte NUR mit einem JSON-Array. Keine Erklärungen."""
def build_prompt(title, objective, requirements, test_procedure, source_ref):
    """Render the per-control user prompt (German) for obligation extraction.

    The doubled braces ``{{``/``}}`` in the template emit a literal JSON
    example; everything else interpolates the control's fields.
    """
    return f"""\
Analysiere das folgende Control und extrahiere alle einzelnen normativen \
Pflichten als JSON-Array.
CONTROL:
Titel: {title}
Ziel: {objective}
Anforderungen: {requirements}
Prüfverfahren: {test_procedure}
Quellreferenz: {source_ref}
Antworte als JSON-Array:
[
  {{
    "obligation_text": "Kurze, präzise Formulierung der Pflicht",
    "action": "Hauptverb/Handlung",
    "object": "Gegenstand der Pflicht",
    "condition": "Auslöser/Bedingung oder null",
    "normative_strength": "must",
    "is_test_obligation": false,
    "is_reporting_obligation": false
  }}
]"""
# ── Quality Gate — 3-Tier Classification (from decomposition_pass.py) ──
# Tier 1: Pflicht (mandatory) — strong normative phrasing, German "ist zu
# <verb>en" constructions (incl. separable-prefix forms), English shall/must.
_PFLICHT_RE = re.compile(
    r"\bmüssen\b|\bmuss\b|\bhat\s+sicherzustellen\b|\bhaben\s+sicherzustellen\b"
    r"|\bsind\s+verpflichtet\b|\bist\s+verpflichtet\b"
    r"|\bist\s+zu\s+\w+en\b|\bsind\s+zu\s+\w+en\b"
    r"|\bhat\s+zu\s+\w+en\b|\bhaben\s+zu\s+\w+en\b"
    r"|\bist\s+\w+zu\w+en\b|\bsind\s+\w+zu\w+en\b"
    r"|\bist\s+\w+\s+zu\s+\w+en\b|\bsind\s+\w+\s+zu\s+\w+en\b"
    r"|\bhat\s+\w+\s+zu\s+\w+en\b|\bhaben\s+\w+\s+zu\s+\w+en\b"
    r"|\bshall\b|\bmust\b|\brequired\b"
    r"|\b\w+zuteilen\b|\b\w+zuwenden\b|\b\w+zustellen\b|\b\w+zulegen\b"
    r"|\b\w+zunehmen\b|\b\w+zuführen\b|\b\w+zuhalten\b|\b\w+zusetzen\b"
    r"|\b\w+zuweisen\b|\b\w+zuordnen\b|\b\w+zufügen\b|\b\w+zugeben\b"
    r"|\bist\b.{1,80}\bzu\s+\w+en\b|\bsind\b.{1,80}\bzu\s+\w+en\b",
    re.IGNORECASE,
)
# Tier 2: Empfehlung (recommendation) — soft modals plus verbs that usually
# carry recommended-practice semantics.
_EMPFEHLUNG_RE = re.compile(
    r"\bsoll\b|\bsollen\b|\bsollte\b|\bsollten\b"
    r"|\bgewährleisten\b|\bsicherstellen\b"
    r"|\bshould\b|\bensure\b|\brecommend\w*\b"
    r"|\bnachweisen\b|\beinhalten\b|\bunterlassen\b|\bwahren\b"
    r"|\bdokumentieren\b|\bimplementieren\b|\büberprüfen\b|\büberwachen\b"
    r"|\bprüfen,\s+ob\b|\bkontrollieren,\s+ob\b",
    re.IGNORECASE,
)
# Tier 3: Kann (optional/permissive)
_KANN_RE = re.compile(
    r"\bkann\b|\bkönnen\b|\bdarf\b|\bdürfen\b|\bmay\b|\boptional\b",
    re.IGNORECASE,
)
# Union of all three tiers (backward compat with older "any normative
# signal" checks).
_NORMATIVE_RE = re.compile(
    _PFLICHT_RE.pattern + "|" + _EMPFEHLUNG_RE.pattern + "|" + _KANN_RE.pattern,
    re.IGNORECASE,
)
# Markers of justification/recital language — such text is NOT an obligation.
_RATIONALE_RE = re.compile(
    r"\bda\s+|\bweil\b|\bgrund\b|\berwägung|\bbecause\b|\breason\b|\brationale\b",
    re.IGNORECASE,
)
# Test-obligation markers (audits, effectiveness checks, periodic reviews).
_TEST_RE = re.compile(
    r"\btesten\b|\btest\b|\bprüfung\b|\bprüfen\b|\bgetestet\b|\bwirksamkeit\b"
    r"|\baudit\b|\bregelmäßig\b.*\b(prüf|test|kontroll)|\beffectiveness\b|\bverif",
    re.IGNORECASE,
)
# Reporting-obligation markers (notification duties, authorities).
_REPORTING_RE = re.compile(
    r"\bmelden\b|\bmeldung\b|\bunterricht|\binformieren\b|\bbenachricht"
    r"|\bnotif|\breport\b|\bbehörd",
    re.IGNORECASE,
)
def classify_obligation_type(txt):
    """Classify obligation strength: pflicht > empfehlung > kann.

    Checked strictly in priority order; text matching no tier defaults
    to "empfehlung" (the conservative middle tier).
    """
    for pattern, label in (
        (_PFLICHT_RE, "pflicht"),
        (_EMPFEHLUNG_RE, "empfehlung"),
        (_KANN_RE, "kann"),
    ):
        if pattern.search(txt):
            return label
    return "empfehlung"
def quality_gate(obl_text, parent_uuid):
    """Validate and classify one extracted obligation.

    Returns ``(flags, passed, confidence, obligation_type)``: ``flags``
    maps check names to results, ``passed`` requires only the critical
    checks, and ``confidence`` is a weighted score in [0, 1].
    """
    stripped = obl_text.strip()
    obl_type = classify_obligation_type(obl_text)
    # Heuristic: a second governing verb after a conjunction suggests the
    # obligation was not split into a single atomic action.
    multi_verb_re = re.compile(
        r"\b(und|sowie|als auch)\b.*\b(müssen|sicherstellen|implementieren"
        r"|dokumentieren|melden|testen|prüfen|überwachen|gewährleisten)\b",
        re.IGNORECASE,
    )
    # Text that merely names an artifact (evidence) is not an obligation.
    evidence_only_re = re.compile(
        r"^(Nachweis|Dokumentation|Screenshot|Protokoll|Bericht|Zertifikat)",
        re.IGNORECASE,
    )
    flags = {
        # Informational only — deliberately NOT part of the pass criteria.
        "has_normative_signal": bool(_NORMATIVE_RE.search(obl_text)),
        "obligation_type": obl_type,
        "single_action": not bool(multi_verb_re.search(obl_text)),
        "not_rationale": len(_NORMATIVE_RE.findall(obl_text)) >= len(_RATIONALE_RE.findall(obl_text)),
        "not_evidence_only": not bool(evidence_only_re.match(stripped)),
        "min_length": len(stripped) >= 20,
        "has_parent_link": bool(parent_uuid),
    }
    weights = {
        "has_normative_signal": 0.25, "single_action": 0.20,
        "not_rationale": 0.20, "not_evidence_only": 0.15,
        "min_length": 0.10, "has_parent_link": 0.05,
    }
    confidence = sum(w for key, w in weights.items() if flags[key])
    # Small bonus when classified as a hard obligation, capped at 1.0.
    if obl_type == "pflicht":
        confidence = min(confidence + 0.05, 1.0)
    # Only these three checks can reject an obligation outright.
    passed = all(flags[key] for key in ("not_evidence_only", "min_length", "has_parent_link"))
    return flags, passed, confidence, obl_type
# ── JSON parsing ──────────────────────────────────────────────────────
def parse_json_array(text):
    """Parse a JSON array from raw LLM output; returns [] when impossible.

    A lone JSON object is wrapped in a single-element list. As a
    fallback, the outermost ``[...]`` span is extracted and parsed.
    """
    def _coerce(candidate):
        # None signals "not an acceptable shape".
        if isinstance(candidate, list):
            return candidate
        if isinstance(candidate, dict):
            return [candidate]
        return None
    try:
        direct = _coerce(json.loads(text))
        if direct is not None:
            return direct
    except json.JSONDecodeError:
        pass
    bracket_span = re.search(r"\[[\s\S]*\]", text)
    if bracket_span:
        try:
            extracted = json.loads(bracket_span.group())
            if isinstance(extracted, list):
                return extracted
        except json.JSONDecodeError:
            pass
    return []
# ── API call ──────────────────────────────────────────────────────────
def call_anthropic(prompt):
    """POST one message to the Anthropic API with a cached system prompt.

    Returns ``(text, usage, error)``; ``error`` is ``None`` on success.
    """
    auth_headers = {
        "x-api-key": ANTHROPIC_API_KEY,
        "anthropic-version": "2023-06-01",
        "content-type": "application/json",
    }
    # The system prompt carries an ephemeral cache_control marker so
    # repeated calls reuse the cached prefix.
    request_body = {
        "model": ANTHROPIC_MODEL,
        "max_tokens": 8192,
        "system": [{"type": "text", "text": SYSTEM_PROMPT, "cache_control": {"type": "ephemeral"}}],
        "messages": [{"role": "user", "content": prompt}],
    }
    resp = requests.post(f"{ANTHROPIC_API_URL}/messages", headers=auth_headers, json=request_body, timeout=120)
    if resp.status_code != 200:
        return None, {}, f"HTTP {resp.status_code}: {resp.text[:200]}"
    body = resp.json()
    blocks = body.get("content", [])
    first_text = blocks[0].get("text", "") if blocks else ""
    return first_text, body.get("usage", {}), None
# ── Format helpers ────────────────────────────────────────────────────
def fmt_json(val):
    """Format a JSON-ish value for embedding into a prompt.

    ``None`` → "", JSON strings are decoded first (non-JSON strings are
    returned untouched), lists render as bullet lines, and anything else
    falls back to ``str()``.
    """
    if val is None:
        return ""
    decoded = val
    if isinstance(decoded, str):
        try:
            decoded = json.loads(decoded)
        except (json.JSONDecodeError, TypeError):
            # Not JSON — return the raw string untouched.
            return decoded
    if isinstance(decoded, list):
        bullets = [f"  - {entry}" for entry in decoded]
        return "\n".join(bullets)
    return str(decoded)
# ── Main ──────────────────────────────────────────────────────────────
def main():
    """Run the Pass 0a extraction test end-to-end.

    Samples controls from the compliance DB, sends each to the LLM,
    classifies the returned obligations via quality_gate
    (pflicht / empfehlung / kann / rejected), prints per-control detail
    plus a summary with a cost projection, and dumps all results as
    JSON under /tmp for later analysis.
    """
    parser = argparse.ArgumentParser(description="Test Pass 0a on small sample")
    parser.add_argument("--limit", type=int, default=10)
    parser.add_argument("--source", type=str)
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()
    # The API key is only required when we actually call the LLM.
    if not ANTHROPIC_API_KEY and not args.dry_run:
        print("ERROR: Set ANTHROPIC_API_KEY")
        sys.exit(1)
    db_url = os.environ["DATABASE_URL"]
    p = urllib.parse.urlparse(db_url)
    conn = psycopg2.connect(
        host=p.hostname, port=p.port or 5432,
        user=p.username, password=p.password,
        dbname=p.path.lstrip("/"),
        options="-c search_path=compliance,public",
    )
    cur = conn.cursor()
    # Select a diverse sample of top-level, sufficiently long controls.
    query = """
    SELECT id, control_id, title, objective, requirements,
    test_procedure, source_citation, category
    FROM compliance.canonical_controls
    WHERE release_state NOT IN ('deprecated', 'duplicate', 'too_close')
    AND parent_control_uuid IS NULL
    AND title IS NOT NULL AND objective IS NOT NULL
    AND length(coalesce(objective,'') || coalesce(requirements::text,'')) > 100
    """
    params = []
    if args.source:
        query += " AND source_citation->>'source' ILIKE %s"
        params.append(f"%{args.source}%")
    query += " ORDER BY source_citation->>'source', random()"
    # args.limit is argparse type=int, so interpolating it into the SQL
    # here cannot inject.
    query += f" LIMIT {args.limit}"
    cur.execute(query, params)
    controls = cur.fetchall()
    if not controls:
        print("No controls found.")
        return
    print(f"{'='*70}")
    print(f"Pass 0a Test — {len(controls)} Controls")
    print(f"Model: {ANTHROPIC_MODEL}")
    print(f"{'='*70}")
    total_in = total_out = total_obls = 0
    type_counts = {"pflicht": 0, "empfehlung": 0, "kann": 0}
    total_rejected = 0  # only evidence-only / too-short / no-parent
    all_results = []
    t_start = time.time()
    for i, row in enumerate(controls, 1):
        ctrl_uuid, ctrl_id, title, objective, reqs, test_proc, src_cit, category = row
        req_str = fmt_json(reqs)
        test_str = fmt_json(test_proc)
        source_str = ""
        if src_cit:
            # source_citation may arrive as a dict (jsonb) or a JSON string.
            sc = src_cit if isinstance(src_cit, dict) else json.loads(src_cit)
            source_str = f"{sc.get('source', '')} {sc.get('article', '')}"
        print(f"\n{''*70}")
        print(f"[{i}/{len(controls)}] {ctrl_id}: {title}")
        print(f" Source: {source_str} | Category: {category or 'N/A'}")
        print(f" Objective: {(objective or '')[:200]}")
        if args.dry_run:
            print(" [DRY RUN]")
            continue
        prompt = build_prompt(title or "", objective or "", req_str, test_str, source_str)
        t0 = time.time()
        response_text, usage, error = call_anthropic(prompt)
        elapsed = time.time() - t0
        if error:
            print(f" ERROR: {error}")
            continue
        in_tok = usage.get("input_tokens", 0)
        out_tok = usage.get("output_tokens", 0)
        cached = usage.get("cache_read_input_tokens", 0)
        total_in += in_tok
        total_out += out_tok
        obligations = parse_json_array(response_text)
        total_obls += len(obligations)
        print(f" API: {elapsed:.1f}s | {in_tok} in / {out_tok} out"
              f"{f' ({cached} cached)' if cached else ''}"
              f" | {len(obligations)} obligation(s)")
        for j, obl in enumerate(obligations, 1):
            obl_text = obl.get("obligation_text", "")
            action = obl.get("action", "")
            obj = obl.get("object", "")
            condition = obl.get("condition")
            strength = obl.get("normative_strength", "must")
            is_test = bool(obl.get("is_test_obligation", False))
            is_report = bool(obl.get("is_reporting_obligation", False))
            # Auto-detect test/reporting obligations the model missed,
            # via regex over the obligation text.
            if not is_test and _TEST_RE.search(obl_text):
                is_test = True
            if not is_report and _REPORTING_RE.search(obl_text):
                is_report = True
            flags, passed, conf, obl_type = quality_gate(obl_text, str(ctrl_uuid))
            if passed:
                type_counts[obl_type] = type_counts.get(obl_type, 0) + 1
            else:
                total_rejected += 1
            tag = ""
            if is_test:
                tag = " [TEST]"
            elif is_report:
                tag = " [MELDEPFLICHT]"
            # Show the 3-tier type instead of a bare PASS/REJECT.
            type_label = {"pflicht": "PFLICHT", "empfehlung": "EMPFEHLUNG", "kann": "KANN"}
            if not passed:
                status = "REJECT"
            else:
                status = type_label.get(obl_type, "EMPFEHLUNG")
            failed = [k for k, v in flags.items()
                      if isinstance(v, bool) and not v]
            print(f"\n {j}. [{status}] conf={conf:.0%}{tag} strength={strength}")
            print(f" {obl_text}")
            print(f" Handlung: {action} | Gegenstand: {obj}")
            if condition:
                print(f" Bedingung: {condition}")
            if not passed:
                print(f" Abgelehnt: {', '.join(failed)}")
            all_results.append({
                "control_id": ctrl_id,
                "obligation_text": obl_text,
                "obligation_type": obl_type if passed else "rejected",
                "action": action,
                "object": obj,
                "condition": condition,
                "confidence": round(conf, 2),
                "is_test": is_test,
                "is_reporting": is_report,
                "passed": passed,
                "flags": {k: v for k, v in flags.items()},
            })
        # Brief pause between controls to stay under API rate limits.
        time.sleep(0.5)
    # ── Summary ──────────────────────────────────────────────────────
    elapsed_total = time.time() - t_start
    # Per-MTok pricing: $3 input / $15 output — presumably Sonnet-class
    # pricing; confirm if ANTHROPIC_MODEL changes.
    cost = (total_in * 3 + total_out * 15) / 1_000_000
    # NOTE(review): total_classified is computed but never used below.
    total_classified = sum(type_counts.values())
    print(f"\n\n{'='*70}")
    print(f"ZUSAMMENFASSUNG — 3-Tier-Klassifizierung")
    print(f"{'='*70}")
    print(f" Controls: {len(controls)}")
    print(f" Obligations: {total_obls} ({total_obls/max(len(controls),1):.1f} pro Control)")
    print(f" ── Klassifizierung ──")
    print(f" Pflicht: {type_counts['pflicht']}"
          f" ({type_counts['pflicht']*100/max(total_obls,1):.0f}%)")
    print(f" Empfehlung: {type_counts['empfehlung']}"
          f" ({type_counts['empfehlung']*100/max(total_obls,1):.0f}%)")
    print(f" Kann: {type_counts['kann']}"
          f" ({type_counts['kann']*100/max(total_obls,1):.0f}%)")
    print(f" Rejected: {total_rejected}"
          f" ({total_rejected*100/max(total_obls,1):.0f}%)"
          f" (nur evidence-only/zu kurz/kein parent)")
    print(f" ── Kosten ──")
    print(f" Laufzeit: {elapsed_total:.1f}s")
    print(f" Tokens: {total_in:,} in / {total_out:,} out")
    print(f" Kosten: ${cost:.4f}")
    # Linear extrapolation of cost/runtime to the full corpus.
    if len(controls) > 0 and not args.dry_run and total_obls > 0:
        n = 6000
        factor = n / len(controls)
        print(f"\n --- Hochrechnung auf {n:,} Controls ---")
        print(f" Tokens: {int(total_in * factor):,} in / {int(total_out * factor):,} out")
        print(f" Kosten: ${cost * factor:.2f}")
        print(f" Laufzeit: {elapsed_total * factor / 3600:.1f}h")
        print(f" Obligations: ~{int(total_obls / len(controls) * n):,}")
        pf = int(type_counts['pflicht'] * factor)
        ef = int(type_counts['empfehlung'] * factor)
        kf = int(type_counts['kann'] * factor)
        print(f" Pflicht: ~{pf:,}")
        print(f" Empfehlung: ~{ef:,}")
        print(f" Kann: ~{kf:,}")
    # Save results JSON for later analysis (consumed by the Pass 0b script).
    if all_results:
        out_path = f"/tmp/pass0a_results_{len(controls)}controls.json"
        with open(out_path, "w") as f:
            json.dump(all_results, f, ensure_ascii=False, indent=2)
        print(f"\n Ergebnisse gespeichert: {out_path}")
    conn.close()
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,308 @@
#!/usr/bin/env python3
"""Preview Pass 0b: Turn obligation candidates into atomic controls.
Picks a few obligations from Pass 0a results, calls LLM to compose
atomic controls, and writes them to canonical_controls with parent_control_uuid.
Usage:
python3 test_pass0b_preview.py --input /tmp/pass0a_results_60controls.json --limit 3
"""
import argparse
import json
import os
import re
import sys
import time
import uuid
import urllib.parse
import psycopg2
import psycopg2.extras
import requests
# Register a psycopg2 adapter so plain dicts passed as query parameters
# are serialized as JSON (module-level side effect).
psycopg2.extensions.register_adapter(dict, psycopg2.extras.Json)
# API key from the environment; empty means "not configured" (checked in
# main unless --dry-run). Model is overridable via DECOMPOSITION_LLM_MODEL.
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
ANTHROPIC_MODEL = os.environ.get("DECOMPOSITION_LLM_MODEL", "claude-sonnet-4-6")
# German system prompt: the model acts as a security-compliance expert
# turning a single normative obligation into one actionable, atomic
# control, answering with JSON only (no explanations).
SYSTEM_PROMPT = """\
Du bist ein Security-Compliance-Experte. Du erstellst aus einer einzelnen \
normativen Pflicht ein praxisorientiertes, atomares Security Control.
Das Control muss UMSETZBAR sein — keine Gesetzesparaphrase.
Antworte NUR als JSON. Keine Erklärungen."""
def build_pass0b_prompt(obl_text, action, obj, parent_title, category, source_ref):
    """Build the German user prompt for Pass 0b.

    Asks the LLM to turn one obligation (text + action + object) into a
    single atomic control, providing the originating control's title,
    category and source reference as context, and spells out the exact
    JSON response schema (title, objective, requirements,
    test_procedure, evidence, severity, category).
    """
    return f"""\
Erstelle aus der folgenden Pflicht ein atomares Control.
PFLICHT: {obl_text}
HANDLUNG: {action}
GEGENSTAND: {obj}
KONTEXT (Ursprungs-Control):
Titel: {parent_title}
Kategorie: {category}
Quellreferenz: {source_ref}
Antworte als JSON:
{{
"title": "Kurzer Titel (max 80 Zeichen, deutsch)",
"objective": "Was muss erreicht werden? (1-2 Sätze)",
"requirements": ["Konkrete Anforderung 1", "Anforderung 2"],
"test_procedure": ["Prüfschritt 1", "Prüfschritt 2"],
"evidence": ["Nachweis 1", "Nachweis 2"],
"severity": "critical|high|medium|low",
"category": "security|privacy|governance|operations|finance|reporting"
}}"""
def call_anthropic(prompt):
    """POST *prompt* to the Anthropic Messages API.

    Returns a (text, usage, error) triple: response text and usage dict
    on success (error is None), or (None, {}, error-string) on a
    non-200 HTTP response.
    """
    headers = {
        "x-api-key": ANTHROPIC_API_KEY,
        "anthropic-version": "2023-06-01",
        "content-type": "application/json",
    }
    payload = {
        "model": ANTHROPIC_MODEL,
        "max_tokens": 4096,
        # The static system prompt is marked cacheable so repeated calls
        # reuse its input tokens.
        "system": [{"type": "text", "text": SYSTEM_PROMPT, "cache_control": {"type": "ephemeral"}}],
        "messages": [{"role": "user", "content": prompt}],
    }
    resp = requests.post("https://api.anthropic.com/v1/messages", headers=headers, json=payload, timeout=120)
    if resp.status_code != 200:
        return None, {}, f"HTTP {resp.status_code}: {resp.text[:200]}"
    data = resp.json()
    content = data.get("content", [])
    # Fix: the previous data.get("content", [{}])[0] raised IndexError
    # when the API returned "content": [] — guard like the Pass 0a script.
    text = content[0].get("text", "") if content else ""
    return text, data.get("usage", {}), None
def parse_json_object(text):
    """Decode *text* as JSON; on failure, retry on the outermost {...} span.

    Returns the decoded value, or None when nothing parses.
    """
    candidates = [text]
    # LLM replies often wrap the JSON in prose — pull out the braced span
    # as a fallback candidate.
    braced = re.search(r"\{[\s\S]*\}", text)
    if braced is not None:
        candidates.append(braced.group())
    for candidate in candidates:
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            continue
    return None
def generate_control_id(domain, cur):
    """Allocate the next sequential control_id for *domain*.

    Looks up the highest numeric suffix among existing "<PREFIX>-<n>"
    control_ids (PREFIX = first four chars of *domain*, upper-cased) and
    returns the next one, zero-padded to three digits.

    cur: an open DB cursor exposing execute()/fetchone().
    """
    prefix = domain.upper()[:4]
    cur.execute("""
    SELECT MAX(CAST(SPLIT_PART(control_id, '-', 2) AS INTEGER))
    FROM compliance.canonical_controls
    WHERE control_id LIKE %s
    AND SPLIT_PART(control_id, '-', 2) ~ '^[0-9]+$'
    """, (f"{prefix}-%",))
    row = cur.fetchone()
    next_num = (row[0] + 1) if row and row[0] is not None else 1
    # Fix: zero-pad the increment to match the "-001" seed format; the
    # previous unpadded f-string produced mixed ids like "DSGV-2" next
    # to "DSGV-001".
    return f"{prefix}-{next_num:03d}"
def main():
    """Run the Pass 0b preview end-to-end.

    Loads Pass 0a obligation results from JSON, picks a small diverse
    sample, asks the LLM to compose one atomic control per obligation,
    prints the result, and inserts each control into
    compliance.canonical_controls with parent_control_uuid pointing at
    the originating control.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", default="/tmp/pass0a_results_60controls.json")
    parser.add_argument("--limit", type=int, default=3, help="Number of obligations to process")
    parser.add_argument("--control", type=str, help="Pick obligations from this control_id")
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()
    # The API key is only required when we actually call the LLM.
    if not ANTHROPIC_API_KEY and not args.dry_run:
        print("ERROR: Set ANTHROPIC_API_KEY")
        sys.exit(1)
    # Load Pass 0a results
    with open(args.input) as f:
        obligations = json.load(f)
    # Filter: keep only obligations that passed the Pass 0a quality gate.
    obligations = [o for o in obligations if o.get("passed", False)]
    if args.control:
        obligations = [o for o in obligations if o["control_id"] == args.control]
    # Pick a diverse sample: one obligation per distinct type first...
    picked = []
    seen_types = set()
    for o in obligations:
        otype = o["obligation_type"]
        if otype not in seen_types and len(picked) < args.limit:
            picked.append(o)
            seen_types.add(otype)
    # ...then fill the remaining slots in input order.
    for o in obligations:
        if o not in picked and len(picked) < args.limit:
            picked.append(o)
    if not picked:
        print("No obligations found.")
        return
    # Connect to DB
    db_url = os.environ["DATABASE_URL"]
    p = urllib.parse.urlparse(db_url)
    conn = psycopg2.connect(
        host=p.hostname, port=p.port or 5432,
        user=p.username, password=p.password,
        dbname=p.path.lstrip("/"),
        options="-c search_path=compliance,public",
    )
    cur = conn.cursor()
    # Fetch parent-control metadata for every picked obligation.
    ctrl_ids = list(set(o["control_id"] for o in picked))
    cur.execute("""
    SELECT control_id, id, title, category, source_citation
    FROM compliance.canonical_controls
    WHERE control_id = ANY(%s)
    """, (ctrl_ids,))
    ctrl_map = {}
    for row in cur.fetchall():
        # source_citation may arrive as a dict (jsonb), a JSON string, or NULL.
        sc = row[4] if isinstance(row[4], dict) else (json.loads(row[4]) if row[4] else {})
        # Derive domain prefix from control_id (e.g. "DSGV" from "DSGV-001")
        prefix = row[0].split("-")[0] if "-" in row[0] else "COMP"
        ctrl_map[row[0]] = {
            "uuid": str(row[1]), "title": row[2], "category": row[3] or "",
            "source_ref": f"{sc.get('source', '')} {sc.get('article', '')}",
            "domain": prefix,
        }
    print("=" * 70)
    print(f"Pass 0b Preview — {len(picked)} Obligations → Atomic Controls")
    print("=" * 70)
    created = []
    for i, obl in enumerate(picked, 1):
        ctrl = ctrl_map.get(obl["control_id"], {})
        print(f"\n{''*70}")
        print(f"[{i}/{len(picked)}] {obl['control_id']}: [{obl['obligation_type'].upper()}]")
        print(f" Obligation: {obl['obligation_text'][:120]}")
        print(f" Parent: {ctrl.get('title', 'N/A')}")
        if args.dry_run:
            print(" [DRY RUN]")
            continue
        prompt = build_pass0b_prompt(
            obl["obligation_text"], obl["action"], obl["object"],
            ctrl.get("title", ""), ctrl.get("category", ""),
            ctrl.get("source_ref", ""),
        )
        t0 = time.time()
        resp_text, usage, error = call_anthropic(prompt)
        elapsed = time.time() - t0
        if error:
            print(f" ERROR: {error}")
            continue
        result = parse_json_object(resp_text)
        if not result:
            print(f" PARSE ERROR: {resp_text[:200]}")
            continue
        in_tok = usage.get("input_tokens", 0)
        out_tok = usage.get("output_tokens", 0)
        print(f" LLM: {elapsed:.1f}s | {in_tok} in / {out_tok} out")
        # Allocate a sequential control_id in the parent's domain.
        domain = ctrl.get("domain", "COMP")
        new_control_id = generate_control_id(domain, cur)
        # Show result
        print(f"\n === ATOMIC CONTROL: {new_control_id} ===")
        print(f" Titel: {result.get('title', 'N/A')}")
        print(f" Ziel: {result.get('objective', 'N/A')}")
        print(f" Typ: {obl['obligation_type']}")
        reqs = result.get("requirements", [])
        if reqs:
            print(f" Anforderungen:")
            for r in reqs:
                print(f" - {r}")
        tests = result.get("test_procedure", [])
        if tests:
            print(f" Pruefverfahren:")
            for t in tests:
                print(f" - {t}")
        evidence = result.get("evidence", [])
        if evidence:
            print(f" Nachweise:")
            for e in evidence:
                print(f" - {e}")
        print(f" Severity: {result.get('severity', 'medium')}")
        print(f" Category: {result.get('category', 'governance')}")
        # Write to DB
        new_uuid = str(uuid.uuid4())
        parent_uuid = ctrl.get("uuid")
        source_cit = {}
        if ctrl.get("source_ref"):
            # source_ref is "<source> <article>"; split back into parts.
            parts = ctrl["source_ref"].strip().split(" ", 1)
            source_cit = {"source": parts[0], "article": parts[1] if len(parts) > 1 else ""}
        # NOTE(review): framework_id subselect grabs an arbitrary framework
        # row — assumes a single canonical framework exists; confirm.
        cur.execute("""
    INSERT INTO compliance.canonical_controls (
    id, control_id, title, objective, requirements, test_procedure,
    evidence, severity, category, release_state,
    source_citation, generation_metadata, generation_strategy,
    pipeline_version, parent_control_uuid, framework_id
    ) VALUES (
    %s, %s, %s, %s, %s, %s,
    %s, %s, %s, %s,
    %s, %s, %s,
    %s, %s,
    (SELECT id FROM compliance.canonical_control_frameworks LIMIT 1)
    )
    """, (
            new_uuid, new_control_id,
            result.get("title", ""),
            result.get("objective", ""),
            json.dumps(result.get("requirements", []), ensure_ascii=False),
            json.dumps(result.get("test_procedure", []), ensure_ascii=False),
            json.dumps(result.get("evidence", []), ensure_ascii=False),
            result.get("severity", "medium"),
            result.get("category", "governance"),
            "draft",
            psycopg2.extras.Json(source_cit),
            psycopg2.extras.Json({
                "obligation_type": obl["obligation_type"],
                "obligation_text": obl["obligation_text"],
                "pass0b_model": ANTHROPIC_MODEL,
                "decomposition_method": "pass0b_preview",
            }),
            "pass0b_atomic",
            6,  # pipeline_version
            parent_uuid,
        ))
        # Commit per control so partial runs keep their results.
        conn.commit()
        created.append({
            "control_id": new_control_id,
            "title": result.get("title", ""),
            "obligation_type": obl["obligation_type"],
            "parent_control_id": obl["control_id"],
        })
        print(f" ✓ Geschrieben: {new_control_id} (parent: {obl['control_id']})")
        # Brief pause between calls to stay under API rate limits.
        time.sleep(0.5)
    if created:
        print(f"\n{'='*70}")
        print(f"ERGEBNIS: {len(created)} atomare Controls erstellt")
        print(f"{'='*70}")
        for c in created:
            print(f" {c['control_id']}: {c['title']} [{c['obligation_type']}] (von {c['parent_control_id']})")
    conn.close()
if __name__ == "__main__":
    main()