Files
breakpilot-compliance/scripts/qa/benchmark_llm_controls.py
Benjamin Admin 643b26618f
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 31s
CI/CD / test-python-backend-compliance (push) Successful in 1m35s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 17s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
feat: Control Library UI, dedup migration, QA tooling, docs
- Control Library: parent control display, ObligationTypeBadge,
  GenerationStrategyBadge variants, evidence string fallback
- API: expose parent_control_uuid/id/title in canonical controls
- Fix: DSFA SQLAlchemy 2.0 Row._mapping compatibility
- Migration 074: control_parent_links + control_dedup_reviews tables
- QA scripts: benchmark, gap analysis, OSCAL import, OWASP cleanup,
  phase5 normalize, phase74 gap fill, sync_db, run_job
- Docs: dedup engine, RAG benchmark, lessons learned, pipeline docs

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-21 11:56:08 +01:00

525 lines
21 KiB
Python

#!/usr/bin/env python3
"""
Phase 7.4 Benchmark: Compare gpt-oss-120b vs Claude Sonnet for Control Generation.
Tests 5 representative gap articles from different sources.
Measures: quality (JSON valid, fields complete), response time, cost estimate.
Usage:
python3 benchmark_llm_controls.py
"""
import json
import time
import sys
import os
import requests
from pathlib import Path
# ── Config ──────────────────────────────────────────────────────────
LITELLM_URL = "https://llm-dev.meghsakha.com"
LITELLM_MODEL = "gpt-oss-120b"
# SECURITY: a live key was committed here. Prefer the environment variable;
# the embedded fallback is kept only for backward compatibility and must be
# rotated and removed.  TODO(review): rotate this key.
LITELLM_API_KEY = os.environ.get("LITELLM_API_KEY", "sk-0nAyxaMVbIqmz_ntnndzag")
ANTHROPIC_URL = "https://api.anthropic.com/v1/messages"
ANTHROPIC_MODEL = "claude-sonnet-4-6"
# Anthropic key is never hard-coded; main() exits early when it is unset.
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
PDF_DIR = Path(os.path.expanduser("~/rag-ingestion/pdfs"))
# PyMuPDF is optional: without it extract_article_text() returns "" and the
# affected test cases are skipped with a warning.
try:
    import fitz  # PyMuPDF
except ImportError:
    print("PyMuPDF not available, using pre-extracted texts")
    fitz = None
# ── Prompts (identical to control_generator.py) ─────────────────────
# System prompt sent to both models. German on purpose (target corpus is
# German law); runtime data — do not translate or reformat.
SYSTEM_PROMPT = """Du bist ein Security-Compliance-Experte. Strukturiere den gegebenen Text
als praxisorientiertes Security Control. Erstelle eine verständliche, umsetzbare Formulierung.
Antworte NUR mit validem JSON. Bei mehreren Controls antworte mit einem JSON-Array."""
# Applicability section appended to every user prompt by build_prompt();
# asks the model for industries, company size, and scope conditions.
APPLICABILITY_PROMPT = """- applicable_industries: Liste der Branchen fuer die dieses Control relevant ist.
Verwende ["all"] wenn der Control branchenuebergreifend gilt.
Moegliche Werte: "all", "Technologie / IT", "Finanzdienstleistungen", "Gesundheitswesen",
"Produktion / Industrie", "Energie", "Telekommunikation", "Oeffentlicher Dienst"
- applicable_company_size: Ab welcher Unternehmensgroesse gilt dieses Control?
Verwende ["all"] wenn keine Groessenbeschraenkung.
Moegliche Werte: "all", "micro", "small", "medium", "large", "enterprise"
- scope_conditions: null wenn keine besonderen Bedingungen, sonst:
{"requires_any": ["signal"], "description": "Erklaerung"}"""
def build_prompt(source_name: str, article_label: str, article_text: str, license_type: str) -> str:
    """Build the user prompt for control generation (same for both models).

    The article text is truncated to 3000 chars to bound token usage; the
    shared APPLICABILITY_PROMPT section is spliced in. The template text is
    runtime data (German) and must not be reworded.
    """
    return f"""Strukturiere den folgenden Gesetzestext als Security/Compliance Control.
Du DARFST den Originaltext verwenden (Quelle: {source_name}, {license_type}).
WICHTIG: Erstelle eine verständliche, praxisorientierte Formulierung.
Der Originaltext wird separat gespeichert — deine Formulierung soll klar und umsetzbar sein.
Gib JSON zurück mit diesen Feldern:
- title: Kurzer prägnanter Titel (max 100 Zeichen)
- objective: Was soll erreicht werden? (1-3 Sätze)
- rationale: Warum ist das wichtig? (1-2 Sätze)
- requirements: Liste von konkreten Anforderungen (Strings)
- test_procedure: Liste von Prüfschritten (Strings)
- evidence: Liste von Nachweisdokumenten (Strings)
- severity: low/medium/high/critical
- tags: Liste von Tags
- domain: Fachgebiet (AUTH/CRYP/NET/DATA/LOG/ACC/SEC/INC/AI/COMP/GOV)
- category: Inhaltliche Kategorie
- target_audience: Liste der Zielgruppen
- source_article: Artikel-Referenz (z.B. "Artikel 10", "§ 42")
- source_paragraph: Absatz-Referenz (z.B. "Absatz 5")
{APPLICABILITY_PROMPT}
Text: {article_text[:3000]}
Quelle: {source_name}, {article_label}"""
# ── PDF Text Extraction ─────────────────────────────────────────────
def extract_article_text(pdf_file: str, article_label: str, doc_type: str) -> str:
    """Extract the text of a specific article/section from a PDF.

    Args:
        pdf_file: file name under PDF_DIR.
        article_label: e.g. "Artikel 32", "§ 26", or a NIST control label.
        doc_type: "eu_regulation", "de_law", "nist", or anything else
            (generic line search).

    Returns:
        Up to 3000 chars of article text; "" when the PDF is missing or
        PyMuPDF is unavailable; a "[... nicht im PDF gefunden]" marker when
        the article cannot be located (main() treats a leading "[" as a
        warning and skips the test case).
    """
    import re
    path = PDF_DIR / pdf_file
    if not path.exists() or fitz is None:
        return ""
    doc = fitz.open(str(path))
    full_text = ""
    for page in doc:
        full_text += page.get_text() + "\n"
    doc.close()
    # Find article boundaries per document type.
    if doc_type == "eu_regulation":
        # EU regulations head each article with an "Artikel N" line.
        art_num = re.search(r'\d+', article_label)
        if not art_num:
            return ""
        num = int(art_num.group())
        # Find start of this article
        pattern = rf'\nArtikel\s+{num}\s*\n'
        match = re.search(pattern, full_text)
        if not match:
            return f"[Artikel {num} nicht im PDF gefunden]"
        start = match.start()
        # End = start of the next article; fall back to a 5000-char window.
        next_pattern = rf'\nArtikel\s+{num + 1}\s*\n'
        next_match = re.search(next_pattern, full_text)
        end = next_match.start() if next_match else start + 5000
        text = full_text[start:end].strip()
        return text[:3000]
    elif doc_type == "de_law":
        para_num = re.search(r'\d+', article_label)
        if not para_num:
            return ""
        num = int(para_num.group())
        # BUG FIX: the section sign (§) had been lost, leaving rf'\\s+{num}'
        # which matches a literal backslash and can never hit. German laws
        # head each paragraph "§ N" (cf. the BDSG test case "§ 26").
        pattern = rf'§\s+{num}\b'
        match = re.search(pattern, full_text)
        if not match:
            # BUG FIX: restore the leading "[§ " so the bracket is balanced
            # and main()'s startswith("[") check recognizes the marker.
            return f"[§ {num} nicht im PDF gefunden]"
        start = match.start()
        next_pattern = rf'§\s+{num + 1}\b'
        next_match = re.search(next_pattern, full_text)
        end = next_match.start() if next_match else start + 5000
        text = full_text[start:end].strip()
        return text[:3000]
    elif doc_type == "nist":
        # NIST: match the control-family label at the start of a line.
        match = re.search(rf'(?:^|\n)\s*{re.escape(article_label)}\b', full_text)
        if not match:
            return f"[{article_label} nicht im PDF gefunden]"
        start = match.start()
        text = full_text[start:start + 3000].strip()
        return text
    else:
        # Generic fallback: first line containing the label anywhere.
        match = re.search(rf'(?:^|\n).*{re.escape(article_label)}\b', full_text)
        if not match:
            return f"[{article_label} nicht im PDF gefunden]"
        start = match.start()
        text = full_text[start:start + 3000].strip()
        return text
# ── API Calls ────────────────────────────────────────────────────────
def call_litellm(prompt: str, system_prompt: str) -> tuple:
    """Call the LiteLLM OpenAI-compatible chat-completions endpoint.

    Returns:
        (response_text, duration_seconds, error, usage): `error` is None on
        success; `usage` is the provider's token-usage dict ({} on failure).
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {LITELLM_API_KEY}",
    }
    payload = {
        "model": LITELLM_MODEL,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
        "temperature": 0.3,
        "max_tokens": 4096,
        "stream": False,
    }
    t0 = time.time()
    try:
        resp = requests.post(
            f"{LITELLM_URL}/v1/chat/completions",
            headers=headers,
            json=payload,
            timeout=180,
        )
        duration = time.time() - t0
        if resp.status_code != 200:
            # BUG FIX: this path returned a 3-tuple while main() unpacks four
            # values, so any non-200 response raised ValueError in the caller.
            return "", duration, f"HTTP {resp.status_code}: {resp.text[:200]}", {}
        data = resp.json()
        content = data["choices"][0]["message"]["content"]
        usage = data.get("usage", {})
        return content, duration, None, usage
    except Exception as e:
        # Network errors / timeouts / JSON decode problems all end up here.
        return "", time.time() - t0, str(e), {}
def call_anthropic(prompt: str, system_prompt: str) -> tuple:
    """Call the Anthropic Messages API.

    Returns:
        (response_text, duration_seconds, error, usage): `error` is None on
        success; `usage` is the token-usage dict ({} on failure).
    """
    headers = {
        "x-api-key": ANTHROPIC_API_KEY,
        "anthropic-version": "2023-06-01",
        "content-type": "application/json",
    }
    payload = {
        "model": ANTHROPIC_MODEL,
        "max_tokens": 4096,
        # Anthropic takes the system prompt as a top-level field, not a message.
        "system": system_prompt,
        "messages": [{"role": "user", "content": prompt}],
    }
    t0 = time.time()
    try:
        resp = requests.post(ANTHROPIC_URL, headers=headers, json=payload, timeout=180)
        duration = time.time() - t0
        if resp.status_code != 200:
            return "", duration, f"HTTP {resp.status_code}: {resp.text[:200]}", {}
        data = resp.json()
        # The Messages API returns a list of content blocks; take the first text.
        content = data["content"][0]["text"] if data.get("content") else ""
        usage = data.get("usage", {})
        return content, duration, None, usage
    except Exception as e:
        return "", time.time() - t0, str(e), {}
# ── Quality Assessment ───────────────────────────────────────────────
# Fields a generated control MUST contain (40% of the quality score).
REQUIRED_FIELDS = [
    "title", "objective", "rationale", "requirements",
    "test_procedure", "evidence", "severity", "domain",
]
# Nice-to-have fields (15% of the quality score).
BONUS_FIELDS = [
    "tags", "category", "target_audience", "source_article",
    "applicable_industries", "applicable_company_size",
]


def assess_quality(raw_text: str) -> dict:
    """Assess the quality of a control-generation response.

    Parses `raw_text` as JSON (tolerating markdown ``` fences and
    surrounding chatter) and scores it 0-100: 20 for valid JSON, up to 40
    for required-field coverage, up to 15 for bonus fields, up to 15 for
    5+ requirements, up to 9 for 3+ test procedures, and 1 for an objective
    longer than 50 chars. On success the parsed dict is attached under
    "parsed_data"; on parse failure the zeroed metrics dict is returned.
    """
    result = {
        "json_valid": False,
        "required_fields": 0,
        "required_total": len(REQUIRED_FIELDS),
        "bonus_fields": 0,
        "bonus_total": len(BONUS_FIELDS),
        "requirements_count": 0,
        "test_procedure_count": 0,
        "evidence_count": 0,
        "title_length": 0,
        "objective_length": 0,
        "score": 0,
    }
    # Strip a markdown code fence (```json ... ```) if present.
    text = raw_text.strip()
    if text.startswith("```"):
        lines = text.split("\n")
        text = "\n".join(lines[1:-1] if lines[-1].startswith("```") else lines[1:])
    try:
        data = json.loads(text)
        if isinstance(data, list):
            # Multi-control answers arrive as an array; score the first one.
            data = data[0] if data else {}
    except json.JSONDecodeError:
        # Fallback: grab the outermost {...} span from surrounding chatter.
        import re
        match = re.search(r'\{[\s\S]*\}', text)
        if match:
            try:
                data = json.loads(match.group())
            except json.JSONDecodeError:
                return result
        else:
            return result
    if not isinstance(data, dict):
        # ROBUSTNESS FIX: a bare scalar/string is valid JSON but previously
        # crashed the field checks below with AttributeError; treat it as
        # an invalid control instead.
        return result
    result["json_valid"] = True
    # Required fields must be non-trivial: string > 2 chars or non-empty list.
    for f in REQUIRED_FIELDS:
        val = data.get(f)
        if val and (isinstance(val, str) and len(val) > 2 or isinstance(val, list) and len(val) > 0):
            result["required_fields"] += 1
    # Bonus fields only need to be present and non-empty.
    for f in BONUS_FIELDS:
        val = data.get(f)
        if val and (isinstance(val, str) and len(val) > 0 or isinstance(val, list) and len(val) > 0):
            result["bonus_fields"] += 1
    # Depth metrics
    reqs = data.get("requirements", [])
    result["requirements_count"] = len(reqs) if isinstance(reqs, list) else 0
    tp = data.get("test_procedure", [])
    result["test_procedure_count"] = len(tp) if isinstance(tp, list) else 0
    ev = data.get("evidence", [])
    result["evidence_count"] = len(ev) if isinstance(ev, list) else 0
    result["title_length"] = len(data.get("title", ""))
    result["objective_length"] = len(data.get("objective", ""))
    # Score: 0-100
    score = 0
    score += 20 if result["json_valid"] else 0
    score += (result["required_fields"] / result["required_total"]) * 40
    score += (result["bonus_fields"] / result["bonus_total"]) * 15
    score += min(result["requirements_count"], 5) * 3  # max 15 for 5+ requirements
    score += min(result["test_procedure_count"], 3) * 3  # max 9 for 3+ tests
    score += 1 if result["objective_length"] > 50 else 0
    result["score"] = round(score, 1)
    result["parsed_data"] = data
    return result
# ── Test Cases ───────────────────────────────────────────────────────
# Five representative gap articles from different sources. Per entry:
#   source      — human-readable source name (goes into the prompt)
#   article     — article/paragraph label passed to extract_article_text()
#   pdf         — file name under PDF_DIR
#   doc_type    — selects the extraction strategy ("eu_regulation"/"de_law")
#   license     — license tag forwarded to build_prompt()
#   description — shown in the per-test banner
TEST_CASES = [
    {
        "source": "DSGVO (EU) 2016/679",
        "article": "Artikel 32",
        "pdf": "dsgvo_2016_679.pdf",
        "doc_type": "eu_regulation",
        "license": "EU_LAW",
        "description": "Sicherheit der Verarbeitung — Kernthema Datenschutz",
    },
    {
        "source": "KI-Verordnung (EU) 2024/1689",
        "article": "Artikel 9",
        "pdf": "ai_act_2024_1689.pdf",
        "doc_type": "eu_regulation",
        "license": "EU_LAW",
        "description": "Risikomanagement für Hochrisiko-KI",
    },
    {
        "source": "NIS2-Richtlinie (EU) 2022/2555",
        "article": "Artikel 21",
        "pdf": "nis2_2022_2555.pdf",
        "doc_type": "eu_regulation",
        "license": "EU_LAW",
        "description": "Cybersicherheitsrisikomanagement — NIS2 Kernpflicht",
    },
    {
        "source": "Cyber Resilience Act (CRA)",
        "article": "Artikel 13",
        "pdf": "cra_2024_2847.pdf",
        "doc_type": "eu_regulation",
        "license": "EU_LAW",
        "description": "Pflichten der Hersteller",
    },
    {
        "source": "Bundesdatenschutzgesetz (BDSG)",
        "article": "§ 26",
        "pdf": "bdsg.pdf",
        "doc_type": "de_law",
        "license": "DE_LAW",
        "description": "Datenverarbeitung im Beschäftigungskontext",
    },
]
# ── Main ─────────────────────────────────────────────────────────────
def main():
    """Run all TEST_CASES against both models, print a per-test comparison
    plus a summary table, and dump raw results to /tmp as JSON.

    Exits(1) when ANTHROPIC_API_KEY is unset or LiteLLM is unreachable.
    """
    if not ANTHROPIC_API_KEY:
        print("ERROR: Set ANTHROPIC_API_KEY environment variable")
        sys.exit(1)
    print("=" * 80)
    print("LLM BENCHMARK: gpt-oss-120b vs Claude Sonnet 4.6")
    print("=" * 80)
    print(f" LiteLLM: {LITELLM_URL} / {LITELLM_MODEL}")
    print(f" Anthropic: {ANTHROPIC_MODEL}")
    print(f" Tests: {len(TEST_CASES)}")
    print()
    # Pre-check LiteLLM reachability before spending time on PDF extraction.
    try:
        r = requests.get(f"{LITELLM_URL}/v1/models",
                         headers={"Authorization": f"Bearer {LITELLM_API_KEY}"}, timeout=10)
        print(f" LiteLLM OK: {r.status_code}")
    except Exception as e:
        print(f" LiteLLM ERROR: {e}")
        sys.exit(1)
    results = []
    for i, tc in enumerate(TEST_CASES):
        print(f"\n{'='*80}")
        print(f"TEST {i+1}/{len(TEST_CASES)}: {tc['source']}{tc['article']}")
        print(f" {tc['description']}")
        print(f"{'='*80}")
        # Extract article text from PDF; a leading "[" marks a not-found note.
        article_text = extract_article_text(tc["pdf"], tc["article"], tc["doc_type"])
        if not article_text or article_text.startswith("["):
            print(f" WARNING: {article_text or 'Empty text'}")
            continue
        print(f" Text extracted: {len(article_text)} chars")
        print(f" First 120 chars: {article_text[:120].replace(chr(10), ' ')}...")
        prompt = build_prompt(tc["source"], tc["article"], article_text, tc["license"])
        # ── Call LiteLLM ──
        print(f"\n --- gpt-oss-120b ---")
        litellm_raw, litellm_time, litellm_err, litellm_usage = call_litellm(prompt, SYSTEM_PROMPT)
        if litellm_err:
            # Minimal quality dict so the comparison below still works.
            print(f" ERROR: {litellm_err}")
            litellm_quality = {"json_valid": False, "score": 0}
        else:
            print(f" Time: {litellm_time:.1f}s")
            print(f" Tokens: {litellm_usage}")
            litellm_quality = assess_quality(litellm_raw)
            print(f" JSON valid: {litellm_quality['json_valid']}")
            print(f" Score: {litellm_quality['score']}/100")
            print(f" Required fields: {litellm_quality['required_fields']}/{litellm_quality['required_total']}")
            print(f" Requirements: {litellm_quality['requirements_count']}, "
                  f"Tests: {litellm_quality['test_procedure_count']}, "
                  f"Evidence: {litellm_quality['evidence_count']}")
            if litellm_quality.get("parsed_data"):
                d = litellm_quality["parsed_data"]
                print(f" Title: {d.get('title', 'N/A')}")
        # ── Call Anthropic ──
        print(f"\n --- Claude Sonnet 4.6 ---")
        anthropic_raw, anthropic_time, anthropic_err, anthropic_usage = call_anthropic(prompt, SYSTEM_PROMPT)
        if anthropic_err:
            print(f" ERROR: {anthropic_err}")
            anthropic_quality = {"json_valid": False, "score": 0}
        else:
            print(f" Time: {anthropic_time:.1f}s")
            print(f" Tokens: {anthropic_usage}")
            anthropic_quality = assess_quality(anthropic_raw)
            print(f" JSON valid: {anthropic_quality['json_valid']}")
            print(f" Score: {anthropic_quality['score']}/100")
            print(f" Required fields: {anthropic_quality['required_fields']}/{anthropic_quality['required_total']}")
            print(f" Requirements: {anthropic_quality['requirements_count']}, "
                  f"Tests: {anthropic_quality['test_procedure_count']}, "
                  f"Evidence: {anthropic_quality['evidence_count']}")
            if anthropic_quality.get("parsed_data"):
                d = anthropic_quality["parsed_data"]
                print(f" Title: {d.get('title', 'N/A')}")
        # Compare
        print(f"\n --- VERGLEICH ---")
        # Guard against division by zero when the Anthropic call failed fast.
        speed_ratio = litellm_time / anthropic_time if anthropic_time > 0 else 0
        print(f" Speed: 120b {litellm_time:.1f}s vs Sonnet {anthropic_time:.1f}s "
              f"({'120b ' + str(round(speed_ratio, 1)) + 'x langsamer' if speed_ratio > 1 else '120b schneller'})")
        print(f" Score: 120b {litellm_quality.get('score', 0)}/100 vs "
              f"Sonnet {anthropic_quality.get('score', 0)}/100")
        results.append({
            "test": f"{tc['source']}{tc['article']}",
            "litellm": {
                "time": round(litellm_time, 1),
                "score": litellm_quality.get("score", 0),
                "json_valid": litellm_quality.get("json_valid", False),
                "requirements": litellm_quality.get("requirements_count", 0),
                "tests": litellm_quality.get("test_procedure_count", 0),
                "usage": litellm_usage,
                "raw": litellm_raw[:500] if litellm_raw else "",
            },
            "anthropic": {
                "time": round(anthropic_time, 1),
                "score": anthropic_quality.get("score", 0),
                "json_valid": anthropic_quality.get("json_valid", False),
                "requirements": anthropic_quality.get("requirements_count", 0),
                "tests": anthropic_quality.get("test_procedure_count", 0),
                "usage": anthropic_usage,
                "raw": anthropic_raw[:500] if anthropic_raw else "",
            },
        })
    # ── Summary ──────────────────────────────────────────────────────
    print(f"\n\n{'='*80}")
    print("ZUSAMMENFASSUNG")
    print(f"{'='*80}")
    if not results:
        print(" Keine Ergebnisse.")
        return
    litellm_scores = [r["litellm"]["score"] for r in results]
    anthropic_scores = [r["anthropic"]["score"] for r in results]
    litellm_times = [r["litellm"]["time"] for r in results]
    anthropic_times = [r["anthropic"]["time"] for r in results]
    print(f"\n {'Metrik':<30s} {'gpt-oss-120b':>15s} {'Claude Sonnet':>15s}")
    print(f" {'-'*30} {'-'*15} {'-'*15}")
    print(f" {'Avg Score (0-100)':<30s} {sum(litellm_scores)/len(litellm_scores):>13.1f} "
          f"{sum(anthropic_scores)/len(anthropic_scores):>13.1f}")
    print(f" {'Avg Time (s)':<30s} {sum(litellm_times)/len(litellm_times):>13.1f} "
          f"{sum(anthropic_times)/len(anthropic_times):>13.1f}")
    print(f" {'JSON Valid':<30s} {sum(1 for r in results if r['litellm']['json_valid']):>12d}/{len(results)} "
          f"{sum(1 for r in results if r['anthropic']['json_valid']):>12d}/{len(results)}")
    print(f" {'Avg Requirements':<30s} "
          f"{sum(r['litellm']['requirements'] for r in results)/len(results):>13.1f} "
          f"{sum(r['anthropic']['requirements'] for r in results)/len(results):>13.1f}")
    print(f" {'Avg Test Procedures':<30s} "
          f"{sum(r['litellm']['tests'] for r in results)/len(results):>13.1f} "
          f"{sum(r['anthropic']['tests'] for r in results)/len(results):>13.1f}")
    # Cost estimate
    # Claude Sonnet: ~$3/M input, ~$15/M output
    # gpt-oss-120b: self-hosted = $0 API cost (only compute)
    total_anthropic_input = sum(r["anthropic"]["usage"].get("input_tokens", 0) for r in results)
    total_anthropic_output = sum(r["anthropic"]["usage"].get("output_tokens", 0) for r in results)
    anthropic_cost = (total_anthropic_input * 3 + total_anthropic_output * 15) / 1_000_000
    print(f"\n Kostenvergleich (fuer {len(results)} Controls):")
    print(f" gpt-oss-120b: $0.00 (self-hosted)")
    print(f" Claude Sonnet: ${anthropic_cost:.4f} "
          f"({total_anthropic_input} input + {total_anthropic_output} output tokens)")
    # Extrapolate for 494 gap articles
    if results:
        cost_per_control = anthropic_cost / len(results)
        print(f"\n Hochrechnung fuer 494 Luecken-Artikel:")
        print(f" gpt-oss-120b: $0.00")
        print(f" Claude Sonnet: ${cost_per_control * 494:.2f}")
        avg_time_120b = sum(litellm_times) / len(litellm_times)
        avg_time_sonnet = sum(anthropic_times) / len(anthropic_times)
        print(f" Zeit 120b: {avg_time_120b * 494 / 60:.0f} min ({avg_time_120b * 494 / 3600:.1f}h)")
        print(f" Zeit Sonnet: {avg_time_sonnet * 494 / 60:.0f} min ({avg_time_sonnet * 494 / 3600:.1f}h)")
    # Save full results
    out_path = "/tmp/benchmark_llm_results.json"
    with open(out_path, 'w') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    print(f"\n Detaillierte Ergebnisse: {out_path}")


if __name__ == "__main__":
    main()