Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 31s
CI/CD / test-python-backend-compliance (push) Successful in 1m35s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 17s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
- Control Library: parent control display, ObligationTypeBadge, GenerationStrategyBadge variants, evidence string fallback - API: expose parent_control_uuid/id/title in canonical controls - Fix: DSFA SQLAlchemy 2.0 Row._mapping compatibility - Migration 074: control_parent_links + control_dedup_reviews tables - QA scripts: benchmark, gap analysis, OSCAL import, OWASP cleanup, phase5 normalize, phase74 gap fill, sync_db, run_job - Docs: dedup engine, RAG benchmark, lessons learned, pipeline docs Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
525 lines
21 KiB
Python
525 lines
21 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Phase 7.4 Benchmark: Compare gpt-oss-120b vs Claude Sonnet for Control Generation.
|
|
|
|
Tests 5 representative gap articles from different sources.
|
|
Measures: quality (JSON valid, fields complete), response time, cost estimate.
|
|
|
|
Usage:
|
|
python3 benchmark_llm_controls.py
|
|
"""
|
|
import json
|
|
import time
|
|
import sys
|
|
import os
|
|
import requests
|
|
from pathlib import Path
|
|
|
|
# ── Config ──────────────────────────────────────────────────────────
LITELLM_URL = "https://llm-dev.meghsakha.com"
LITELLM_MODEL = "gpt-oss-120b"
# SECURITY fix: this key used to be hard-coded in source. Read it from the
# environment (same pattern as ANTHROPIC_API_KEY below) so the secret is
# never committed; rotate the previously committed key.
LITELLM_API_KEY = os.environ.get("LITELLM_API_KEY", "")

ANTHROPIC_URL = "https://api.anthropic.com/v1/messages"
ANTHROPIC_MODEL = "claude-sonnet-4-6"
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")

# Directory containing the source PDFs from which article text is extracted.
PDF_DIR = Path(os.path.expanduser("~/rag-ingestion/pdfs"))
|
|
|
|
# PyMuPDF is optional: when it is missing, `fitz` is set to None and
# extract_article_text() returns "" for every PDF (the benchmark then
# skips those test cases with a warning).
try:
    import fitz  # PyMuPDF
except ImportError:
    print("PyMuPDF not available, using pre-extracted texts")
    fitz = None
|
|
|
|
# ── Prompts (identical to control_generator.py) ─────────────────────

# System prompt (German): instructs the model to act as a compliance expert
# and to answer with valid JSON only (an array when multiple controls).
SYSTEM_PROMPT = """Du bist ein Security-Compliance-Experte. Strukturiere den gegebenen Text
als praxisorientiertes Security Control. Erstelle eine verständliche, umsetzbare Formulierung.
Antworte NUR mit validem JSON. Bei mehreren Controls antworte mit einem JSON-Array."""

# Appended to the user prompt (see build_prompt): requests the applicability
# metadata fields (industries, company size, scope conditions) with their
# allowed values.
APPLICABILITY_PROMPT = """- applicable_industries: Liste der Branchen fuer die dieses Control relevant ist.
Verwende ["all"] wenn der Control branchenuebergreifend gilt.
Moegliche Werte: "all", "Technologie / IT", "Finanzdienstleistungen", "Gesundheitswesen",
"Produktion / Industrie", "Energie", "Telekommunikation", "Oeffentlicher Dienst"
- applicable_company_size: Ab welcher Unternehmensgroesse gilt dieses Control?
Verwende ["all"] wenn keine Groessenbeschraenkung.
Moegliche Werte: "all", "micro", "small", "medium", "large", "enterprise"
- scope_conditions: null wenn keine besonderen Bedingungen, sonst:
{"requires_any": ["signal"], "description": "Erklaerung"}"""
|
|
|
|
|
|
def build_prompt(source_name: str, article_label: str, article_text: str, license_type: str) -> str:
    """Build the German user prompt that asks the LLM to structure a legal
    article as a security/compliance control in JSON.

    article_text is truncated to 3000 characters to bound prompt size;
    license_type (e.g. "EU_LAW") signals that the original text may be used.
    """
    return f"""Strukturiere den folgenden Gesetzestext als Security/Compliance Control.
Du DARFST den Originaltext verwenden (Quelle: {source_name}, {license_type}).

WICHTIG: Erstelle eine verständliche, praxisorientierte Formulierung.
Der Originaltext wird separat gespeichert — deine Formulierung soll klar und umsetzbar sein.

Gib JSON zurück mit diesen Feldern:
- title: Kurzer prägnanter Titel (max 100 Zeichen)
- objective: Was soll erreicht werden? (1-3 Sätze)
- rationale: Warum ist das wichtig? (1-2 Sätze)
- requirements: Liste von konkreten Anforderungen (Strings)
- test_procedure: Liste von Prüfschritten (Strings)
- evidence: Liste von Nachweisdokumenten (Strings)
- severity: low/medium/high/critical
- tags: Liste von Tags
- domain: Fachgebiet (AUTH/CRYP/NET/DATA/LOG/ACC/SEC/INC/AI/COMP/GOV)
- category: Inhaltliche Kategorie
- target_audience: Liste der Zielgruppen
- source_article: Artikel-Referenz (z.B. "Artikel 10", "§ 42")
- source_paragraph: Absatz-Referenz (z.B. "Absatz 5")
{APPLICABILITY_PROMPT}

Text: {article_text[:3000]}
Quelle: {source_name}, {article_label}"""
|
|
|
|
|
|
# ── PDF Text Extraction ─────────────────────────────────────────────
|
|
|
|
def extract_article_text(pdf_file: str, article_label: str, doc_type: str) -> str:
    """Extract the text of a specific article from a PDF.

    Returns "" when the PDF is missing or PyMuPDF is unavailable, and a
    bracketed German marker string when the heading cannot be located.
    Extracts run from the article heading to the next article heading
    (or at most 5000 chars) and are capped at 3000 chars.
    """
    import re

    path = PDF_DIR / pdf_file
    if not path.exists() or fitz is None:
        return ""

    doc = fitz.open(str(path))
    full_text = ""
    for page in doc:
        full_text += page.get_text() + "\n"
    doc.close()

    def _section(start_pattern: str, next_pattern: str, not_found: str) -> str:
        """Slice full_text from start_pattern up to next_pattern (or +5000 chars)."""
        match = re.search(start_pattern, full_text)
        if not match:
            return not_found
        start = match.start()
        # Fix: look for the NEXT heading only AFTER the current match.
        # Searching the whole text could hit an earlier cross-reference to
        # the next article, producing end < start and an empty extract.
        next_match = re.search(next_pattern, full_text[match.end():])
        end = match.end() + next_match.start() if next_match else start + 5000
        return full_text[start:end].strip()[:3000]

    if doc_type == "eu_regulation":
        # "Artikel N" heading; article_label is e.g. "Artikel 32".
        art_num = re.search(r'\d+', article_label)
        if not art_num:
            return ""
        num = int(art_num.group())
        return _section(rf'\nArtikel\s+{num}\s*\n', rf'\nArtikel\s+{num+1}\s*\n',
                        f"[Artikel {num} nicht im PDF gefunden]")

    if doc_type == "de_law":
        # German statute "§ N" heading.
        para_num = re.search(r'\d+', article_label)
        if not para_num:
            return ""
        num = int(para_num.group())
        return _section(rf'\n§\s+{num}\b', rf'\n§\s+{num+1}\b',
                        f"[§ {num} nicht im PDF gefunden]")

    # NIST control family vs. generic section: take a fixed 3000-char window
    # starting at the first line that carries the label.
    if doc_type == "nist":
        pattern = rf'(?:^|\n)\s*{re.escape(article_label)}\b'
    else:
        pattern = rf'(?:^|\n).*{re.escape(article_label)}\b'
    match = re.search(pattern, full_text)
    if not match:
        return f"[{article_label} nicht im PDF gefunden]"
    return full_text[match.start():match.start() + 3000].strip()
|
|
|
|
|
|
# ── API Calls ────────────────────────────────────────────────────────
|
|
|
|
def call_litellm(prompt: str, system_prompt: str) -> tuple:
    """Call the LiteLLM chat-completions API.

    Returns (response_text, duration_seconds, error, usage).  On failure
    response_text is "", error is a message string, and usage is {};
    on success error is None and usage is the provider's usage dict.
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {LITELLM_API_KEY}",
    }
    payload = {
        "model": LITELLM_MODEL,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
        "temperature": 0.3,
        "max_tokens": 4096,
        "stream": False,
    }

    t0 = time.time()
    try:
        resp = requests.post(
            f"{LITELLM_URL}/v1/chat/completions",
            headers=headers,
            json=payload,
            timeout=180,
        )
        duration = time.time() - t0
        if resp.status_code != 200:
            # Fix: return the 4-tuple shape callers unpack.  The HTTP-error
            # path previously returned only 3 values, making main() crash
            # with ValueError on any non-200 response.
            return "", duration, f"HTTP {resp.status_code}: {resp.text[:200]}", {}
        data = resp.json()
        content = data["choices"][0]["message"]["content"]
        usage = data.get("usage", {})
        return content, duration, None, usage
    except Exception as e:
        return "", time.time() - t0, str(e), {}
|
|
|
|
|
|
def call_anthropic(prompt: str, system_prompt: str) -> tuple:
    """POST a single-message request to the Anthropic Messages API.

    Returns (response_text, duration_seconds, error, usage): error is None
    on success, otherwise a short message string with usage set to {}.
    """
    request_headers = {
        "x-api-key": ANTHROPIC_API_KEY,
        "anthropic-version": "2023-06-01",
        "content-type": "application/json",
    }
    request_body = {
        "model": ANTHROPIC_MODEL,
        "max_tokens": 4096,
        "system": system_prompt,
        "messages": [{"role": "user", "content": prompt}],
    }

    started = time.time()
    try:
        resp = requests.post(ANTHROPIC_URL, headers=request_headers, json=request_body, timeout=180)
        elapsed = time.time() - started
        if resp.status_code != 200:
            return "", elapsed, f"HTTP {resp.status_code}: {resp.text[:200]}", {}
        body = resp.json()
        # Response text lives in the first content block, when present.
        text = body["content"][0]["text"] if body.get("content") else ""
        return text, elapsed, None, body.get("usage", {})
    except Exception as exc:
        return "", time.time() - started, str(exc), {}
|
|
|
|
|
|
# ── Quality Assessment ───────────────────────────────────────────────
|
|
|
|
# Fields a generated control must contain to count as complete.
REQUIRED_FIELDS = [
    "title", "objective", "rationale", "requirements",
    "test_procedure", "evidence", "severity", "domain",
]

# Optional fields that add to the quality score but are not mandatory.
BONUS_FIELDS = [
    "tags", "category", "target_audience", "source_article",
    "applicable_industries", "applicable_company_size",
]


def _parse_json_response(raw_text: str):
    """Parse a (possibly markdown-fenced) LLM response into a JSON value.

    Strips a surrounding ``` code fence, tries a direct parse, then falls
    back to the first {...} span in the text.  Returns the parsed value
    (first element when it is a non-empty list, {} for an empty list) or
    None when nothing parses.
    """
    import re

    text = raw_text.strip()
    if text.startswith("```"):
        # Drop the opening fence line and, if present, the closing fence.
        lines = text.split("\n")
        text = "\n".join(lines[1:-1] if lines[-1].startswith("```") else lines[1:])

    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        # Fallback: extract the first JSON-object-looking span.
        match = re.search(r'\{[\s\S]*\}', text)
        if not match:
            return None
        try:
            data = json.loads(match.group())
        except json.JSONDecodeError:
            return None

    if isinstance(data, list):
        data = data[0] if data else {}
    return data


def assess_quality(raw_text: str) -> dict:
    """Assess the quality of a control generation response.

    Parses raw_text as JSON and scores field completeness and depth on a
    0-100 scale.  Returns a metrics dict; "parsed_data" is present only
    when parsing succeeded.
    """
    result = {
        "json_valid": False,
        "required_fields": 0,
        "required_total": len(REQUIRED_FIELDS),
        "bonus_fields": 0,
        "bonus_total": len(BONUS_FIELDS),
        "requirements_count": 0,
        "test_procedure_count": 0,
        "evidence_count": 0,
        "title_length": 0,
        "objective_length": 0,
        "score": 0,
    }

    data = _parse_json_response(raw_text)
    # Fix: a response that parses to a non-dict (e.g. a bare number or
    # string) previously crashed on data.get(); count it as invalid.
    if not isinstance(data, dict):
        return result

    result["json_valid"] = True

    # Check required fields: a string must be longer than 2 chars, a list
    # must be non-empty.
    for f in REQUIRED_FIELDS:
        val = data.get(f)
        if val and (isinstance(val, str) and len(val) > 2 or isinstance(val, list) and len(val) > 0):
            result["required_fields"] += 1

    # Check bonus fields (any non-empty string or list counts).
    for f in BONUS_FIELDS:
        val = data.get(f)
        if val and (isinstance(val, str) and len(val) > 0 or isinstance(val, list) and len(val) > 0):
            result["bonus_fields"] += 1

    # Depth metrics (only list-typed values count).
    reqs = data.get("requirements", [])
    result["requirements_count"] = len(reqs) if isinstance(reqs, list) else 0
    tp = data.get("test_procedure", [])
    result["test_procedure_count"] = len(tp) if isinstance(tp, list) else 0
    ev = data.get("evidence", [])
    result["evidence_count"] = len(ev) if isinstance(ev, list) else 0
    result["title_length"] = len(data.get("title", ""))
    result["objective_length"] = len(data.get("objective", ""))

    # Score: 0-100 (20 valid JSON + 40 required + 15 bonus + 15 requirements
    # depth + 9 test depth + 1 substantial objective).
    score = 0
    score += 20 if result["json_valid"] else 0
    score += (result["required_fields"] / result["required_total"]) * 40
    score += (result["bonus_fields"] / result["bonus_total"]) * 15
    score += min(result["requirements_count"], 5) * 3  # max 15 for 5+ requirements
    score += min(result["test_procedure_count"], 3) * 3  # max 9 for 3+ tests
    score += 1 if result["objective_length"] > 50 else 0
    result["score"] = round(score, 1)

    result["parsed_data"] = data
    return result
|
|
|
|
|
|
# ── Test Cases ───────────────────────────────────────────────────────
|
|
|
|
# Five representative gap articles from different regulatory sources.
# Each case: source (document name), article (heading label to find),
# pdf (file under PDF_DIR), doc_type (parser branch in
# extract_article_text), license (passed into the prompt), description.
TEST_CASES = [
    {
        "source": "DSGVO (EU) 2016/679",
        "article": "Artikel 32",
        "pdf": "dsgvo_2016_679.pdf",
        "doc_type": "eu_regulation",
        "license": "EU_LAW",
        "description": "Sicherheit der Verarbeitung — Kernthema Datenschutz",
    },
    {
        "source": "KI-Verordnung (EU) 2024/1689",
        "article": "Artikel 9",
        "pdf": "ai_act_2024_1689.pdf",
        "doc_type": "eu_regulation",
        "license": "EU_LAW",
        "description": "Risikomanagement für Hochrisiko-KI",
    },
    {
        "source": "NIS2-Richtlinie (EU) 2022/2555",
        "article": "Artikel 21",
        "pdf": "nis2_2022_2555.pdf",
        "doc_type": "eu_regulation",
        "license": "EU_LAW",
        "description": "Cybersicherheitsrisikomanagement — NIS2 Kernpflicht",
    },
    {
        "source": "Cyber Resilience Act (CRA)",
        "article": "Artikel 13",
        "pdf": "cra_2024_2847.pdf",
        "doc_type": "eu_regulation",
        "license": "EU_LAW",
        "description": "Pflichten der Hersteller",
    },
    {
        "source": "Bundesdatenschutzgesetz (BDSG)",
        "article": "§ 26",
        "pdf": "bdsg.pdf",
        "doc_type": "de_law",
        "license": "DE_LAW",
        "description": "Datenverarbeitung im Beschäftigungskontext",
    },
]
|
|
|
|
|
|
# ── Main ─────────────────────────────────────────────────────────────
|
|
|
|
def main():
    """Run the benchmark: extract each test article from its PDF, query
    both models with the identical prompt, score the responses, and print
    a per-test comparison plus an aggregate summary and cost estimate.
    Full results are written to /tmp/benchmark_llm_results.json."""
    # Anthropic key is mandatory; LiteLLM reachability is probed below.
    if not ANTHROPIC_API_KEY:
        print("ERROR: Set ANTHROPIC_API_KEY environment variable")
        sys.exit(1)

    print("=" * 80)
    print("LLM BENCHMARK: gpt-oss-120b vs Claude Sonnet 4.6")
    print("=" * 80)
    print(f" LiteLLM: {LITELLM_URL} / {LITELLM_MODEL}")
    print(f" Anthropic: {ANTHROPIC_MODEL}")
    print(f" Tests: {len(TEST_CASES)}")
    print()

    # Pre-check LiteLLM
    try:
        r = requests.get(f"{LITELLM_URL}/v1/models",
                         headers={"Authorization": f"Bearer {LITELLM_API_KEY}"}, timeout=10)
        print(f" LiteLLM OK: {r.status_code}")
    except Exception as e:
        print(f" LiteLLM ERROR: {e}")
        sys.exit(1)

    results = []

    for i, tc in enumerate(TEST_CASES):
        print(f"\n{'='*80}")
        print(f"TEST {i+1}/{len(TEST_CASES)}: {tc['source']} — {tc['article']}")
        print(f" {tc['description']}")
        print(f"{'='*80}")

        # Extract article text from PDF; a leading "[" marks a
        # not-found message from extract_article_text.
        article_text = extract_article_text(tc["pdf"], tc["article"], tc["doc_type"])
        if not article_text or article_text.startswith("["):
            print(f" WARNING: {article_text or 'Empty text'}")
            continue

        print(f" Text extracted: {len(article_text)} chars")
        print(f" First 120 chars: {article_text[:120].replace(chr(10), ' ')}...")

        prompt = build_prompt(tc["source"], tc["article"], article_text, tc["license"])

        # ── Call LiteLLM ──
        print(f"\n --- gpt-oss-120b ---")
        litellm_raw, litellm_time, litellm_err, litellm_usage = call_litellm(prompt, SYSTEM_PROMPT)
        if litellm_err:
            print(f" ERROR: {litellm_err}")
            # Minimal stub so the comparison/aggregation below still works.
            litellm_quality = {"json_valid": False, "score": 0}
        else:
            print(f" Time: {litellm_time:.1f}s")
            print(f" Tokens: {litellm_usage}")
            litellm_quality = assess_quality(litellm_raw)
            print(f" JSON valid: {litellm_quality['json_valid']}")
            print(f" Score: {litellm_quality['score']}/100")
            print(f" Required fields: {litellm_quality['required_fields']}/{litellm_quality['required_total']}")
            print(f" Requirements: {litellm_quality['requirements_count']}, "
                  f"Tests: {litellm_quality['test_procedure_count']}, "
                  f"Evidence: {litellm_quality['evidence_count']}")
            if litellm_quality.get("parsed_data"):
                d = litellm_quality["parsed_data"]
                print(f" Title: {d.get('title', 'N/A')}")

        # ── Call Anthropic ──
        print(f"\n --- Claude Sonnet 4.6 ---")
        anthropic_raw, anthropic_time, anthropic_err, anthropic_usage = call_anthropic(prompt, SYSTEM_PROMPT)
        if anthropic_err:
            print(f" ERROR: {anthropic_err}")
            anthropic_quality = {"json_valid": False, "score": 0}
        else:
            print(f" Time: {anthropic_time:.1f}s")
            print(f" Tokens: {anthropic_usage}")
            anthropic_quality = assess_quality(anthropic_raw)
            print(f" JSON valid: {anthropic_quality['json_valid']}")
            print(f" Score: {anthropic_quality['score']}/100")
            print(f" Required fields: {anthropic_quality['required_fields']}/{anthropic_quality['required_total']}")
            print(f" Requirements: {anthropic_quality['requirements_count']}, "
                  f"Tests: {anthropic_quality['test_procedure_count']}, "
                  f"Evidence: {anthropic_quality['evidence_count']}")
            if anthropic_quality.get("parsed_data"):
                d = anthropic_quality["parsed_data"]
                print(f" Title: {d.get('title', 'N/A')}")

        # Compare
        print(f"\n --- VERGLEICH ---")
        # Guard against division by zero when the Anthropic call failed.
        speed_ratio = litellm_time / anthropic_time if anthropic_time > 0 else 0
        print(f" Speed: 120b {litellm_time:.1f}s vs Sonnet {anthropic_time:.1f}s "
              f"({'120b ' + str(round(speed_ratio, 1)) + 'x langsamer' if speed_ratio > 1 else '120b schneller'})")
        print(f" Score: 120b {litellm_quality.get('score', 0)}/100 vs "
              f"Sonnet {anthropic_quality.get('score', 0)}/100")

        # .get() with defaults keeps this safe for the error-stub dicts above.
        results.append({
            "test": f"{tc['source']} — {tc['article']}",
            "litellm": {
                "time": round(litellm_time, 1),
                "score": litellm_quality.get("score", 0),
                "json_valid": litellm_quality.get("json_valid", False),
                "requirements": litellm_quality.get("requirements_count", 0),
                "tests": litellm_quality.get("test_procedure_count", 0),
                "usage": litellm_usage,
                "raw": litellm_raw[:500] if litellm_raw else "",
            },
            "anthropic": {
                "time": round(anthropic_time, 1),
                "score": anthropic_quality.get("score", 0),
                "json_valid": anthropic_quality.get("json_valid", False),
                "requirements": anthropic_quality.get("requirements_count", 0),
                "tests": anthropic_quality.get("test_procedure_count", 0),
                "usage": anthropic_usage,
                "raw": anthropic_raw[:500] if anthropic_raw else "",
            },
        })

    # ── Summary ──────────────────────────────────────────────────
    print(f"\n\n{'='*80}")
    print("ZUSAMMENFASSUNG")
    print(f"{'='*80}")

    if not results:
        print(" Keine Ergebnisse.")
        return

    litellm_scores = [r["litellm"]["score"] for r in results]
    anthropic_scores = [r["anthropic"]["score"] for r in results]
    litellm_times = [r["litellm"]["time"] for r in results]
    anthropic_times = [r["anthropic"]["time"] for r in results]

    print(f"\n {'Metrik':<30s} {'gpt-oss-120b':>15s} {'Claude Sonnet':>15s}")
    print(f" {'-'*30} {'-'*15} {'-'*15}")
    print(f" {'Avg Score (0-100)':<30s} {sum(litellm_scores)/len(litellm_scores):>13.1f} "
          f"{sum(anthropic_scores)/len(anthropic_scores):>13.1f}")
    print(f" {'Avg Time (s)':<30s} {sum(litellm_times)/len(litellm_times):>13.1f} "
          f"{sum(anthropic_times)/len(anthropic_times):>13.1f}")
    print(f" {'JSON Valid':<30s} {sum(1 for r in results if r['litellm']['json_valid']):>12d}/{len(results)} "
          f"{sum(1 for r in results if r['anthropic']['json_valid']):>12d}/{len(results)}")
    print(f" {'Avg Requirements':<30s} "
          f"{sum(r['litellm']['requirements'] for r in results)/len(results):>13.1f} "
          f"{sum(r['anthropic']['requirements'] for r in results)/len(results):>13.1f}")
    print(f" {'Avg Test Procedures':<30s} "
          f"{sum(r['litellm']['tests'] for r in results)/len(results):>13.1f} "
          f"{sum(r['anthropic']['tests'] for r in results)/len(results):>13.1f}")

    # Cost estimate
    # Claude Sonnet: ~$3/M input, ~$15/M output
    # gpt-oss-120b: self-hosted = $0 API cost (only compute)
    total_anthropic_input = sum(r["anthropic"]["usage"].get("input_tokens", 0) for r in results)
    total_anthropic_output = sum(r["anthropic"]["usage"].get("output_tokens", 0) for r in results)
    anthropic_cost = (total_anthropic_input * 3 + total_anthropic_output * 15) / 1_000_000

    print(f"\n Kostenvergleich (fuer {len(results)} Controls):")
    print(f" gpt-oss-120b: $0.00 (self-hosted)")
    print(f" Claude Sonnet: ${anthropic_cost:.4f} "
          f"({total_anthropic_input} input + {total_anthropic_output} output tokens)")

    # Extrapolate for 494 gap articles
    if results:
        cost_per_control = anthropic_cost / len(results)
        print(f"\n Hochrechnung fuer 494 Luecken-Artikel:")
        print(f" gpt-oss-120b: $0.00")
        print(f" Claude Sonnet: ${cost_per_control * 494:.2f}")
        avg_time_120b = sum(litellm_times) / len(litellm_times)
        avg_time_sonnet = sum(anthropic_times) / len(anthropic_times)
        print(f" Zeit 120b: {avg_time_120b * 494 / 60:.0f} min ({avg_time_120b * 494 / 3600:.1f}h)")
        print(f" Zeit Sonnet: {avg_time_sonnet * 494 / 60:.0f} min ({avg_time_sonnet * 494 / 3600:.1f}h)")

    # Save full results
    out_path = "/tmp/benchmark_llm_results.json"
    with open(out_path, 'w') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    print(f"\n Detaillierte Ergebnisse: {out_path}")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|