feat: Control Library UI, dedup migration, QA tooling, docs
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 31s
CI/CD / test-python-backend-compliance (push) Successful in 1m35s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 17s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 31s
CI/CD / test-python-backend-compliance (push) Successful in 1m35s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 17s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
- Control Library: parent control display, ObligationTypeBadge, GenerationStrategyBadge variants, evidence string fallback - API: expose parent_control_uuid/id/title in canonical controls - Fix: DSFA SQLAlchemy 2.0 Row._mapping compatibility - Migration 074: control_parent_links + control_dedup_reviews tables - QA scripts: benchmark, gap analysis, OSCAL import, OWASP cleanup, phase5 normalize, phase74 gap fill, sync_db, run_job - Docs: dedup engine, RAG benchmark, lessons learned, pipeline docs Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
524
scripts/qa/benchmark_llm_controls.py
Normal file
524
scripts/qa/benchmark_llm_controls.py
Normal file
@@ -0,0 +1,524 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Phase 7.4 Benchmark: Compare gpt-oss-120b vs Claude Sonnet for Control Generation.
|
||||
|
||||
Tests 5 representative gap articles from different sources.
|
||||
Measures: quality (JSON valid, fields complete), response time, cost estimate.
|
||||
|
||||
Usage:
|
||||
python3 benchmark_llm_controls.py
|
||||
"""
|
||||
import json
|
||||
import time
|
||||
import sys
|
||||
import os
|
||||
import requests
|
||||
from pathlib import Path
|
||||
|
||||
# ── Config ──────────────────────────────────────────────────────────
LITELLM_URL = "https://llm-dev.meghsakha.com"
LITELLM_MODEL = "gpt-oss-120b"
# SECURITY FIX: the key was previously hard-coded in source control.
# Read it from the environment, same as ANTHROPIC_API_KEY below.
LITELLM_API_KEY = os.environ.get("LITELLM_API_KEY", "")

ANTHROPIC_URL = "https://api.anthropic.com/v1/messages"
ANTHROPIC_MODEL = "claude-sonnet-4-6"
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")

# Directory containing the locally mirrored law/standard PDFs.
PDF_DIR = Path(os.path.expanduser("~/rag-ingestion/pdfs"))

try:
    import fitz  # PyMuPDF — optional; only needed for live PDF extraction
except ImportError:
    print("PyMuPDF not available, using pre-extracted texts")
    fitz = None
|
||||
|
||||
# ── Prompts (identical to control_generator.py) ─────────────────────

# German system prompt — roughly: "You are a security-compliance expert.
# Structure the given text as a practice-oriented security control. Answer
# ONLY with valid JSON; use a JSON array for multiple controls."
SYSTEM_PROMPT = """Du bist ein Security-Compliance-Experte. Strukturiere den gegebenen Text
als praxisorientiertes Security Control. Erstelle eine verständliche, umsetzbare Formulierung.
Antworte NUR mit validem JSON. Bei mehreren Controls antworte mit einem JSON-Array."""

# Appended to every user prompt: requests applicability metadata
# (industries, company size, scope conditions) with a fixed value vocabulary.
APPLICABILITY_PROMPT = """- applicable_industries: Liste der Branchen fuer die dieses Control relevant ist.
Verwende ["all"] wenn der Control branchenuebergreifend gilt.
Moegliche Werte: "all", "Technologie / IT", "Finanzdienstleistungen", "Gesundheitswesen",
"Produktion / Industrie", "Energie", "Telekommunikation", "Oeffentlicher Dienst"
- applicable_company_size: Ab welcher Unternehmensgroesse gilt dieses Control?
Verwende ["all"] wenn keine Groessenbeschraenkung.
Moegliche Werte: "all", "micro", "small", "medium", "large", "enterprise"
- scope_conditions: null wenn keine besonderen Bedingungen, sonst:
{"requires_any": ["signal"], "description": "Erklaerung"}"""
|
||||
|
||||
|
||||
def build_prompt(source_name: str, article_label: str, article_text: str, license_type: str) -> str:
    """Build the German user prompt asking the LLM to structure a legal article
    as a security/compliance control.

    Args:
        source_name: Human-readable source title (e.g. "DSGVO (EU) 2016/679").
        article_label: Article/section reference (e.g. "Artikel 32", "§ 26").
        article_text: Extracted article text; truncated to 3000 chars here.
        license_type: License tag shown to the model (e.g. "EU_LAW").

    Returns:
        The complete user prompt (German), including APPLICABILITY_PROMPT.
    """
    return f"""Strukturiere den folgenden Gesetzestext als Security/Compliance Control.
Du DARFST den Originaltext verwenden (Quelle: {source_name}, {license_type}).

WICHTIG: Erstelle eine verständliche, praxisorientierte Formulierung.
Der Originaltext wird separat gespeichert — deine Formulierung soll klar und umsetzbar sein.

Gib JSON zurück mit diesen Feldern:
- title: Kurzer prägnanter Titel (max 100 Zeichen)
- objective: Was soll erreicht werden? (1-3 Sätze)
- rationale: Warum ist das wichtig? (1-2 Sätze)
- requirements: Liste von konkreten Anforderungen (Strings)
- test_procedure: Liste von Prüfschritten (Strings)
- evidence: Liste von Nachweisdokumenten (Strings)
- severity: low/medium/high/critical
- tags: Liste von Tags
- domain: Fachgebiet (AUTH/CRYP/NET/DATA/LOG/ACC/SEC/INC/AI/COMP/GOV)
- category: Inhaltliche Kategorie
- target_audience: Liste der Zielgruppen
- source_article: Artikel-Referenz (z.B. "Artikel 10", "§ 42")
- source_paragraph: Absatz-Referenz (z.B. "Absatz 5")
{APPLICABILITY_PROMPT}

Text: {article_text[:3000]}
Quelle: {source_name}, {article_label}"""
|
||||
|
||||
|
||||
# ── PDF Text Extraction ─────────────────────────────────────────────
|
||||
|
||||
def extract_article_text(pdf_file: str, article_label: str, doc_type: str) -> str:
    """Extract the text of a specific article/section from a PDF.

    Supported doc_type values:
      * "eu_regulation" — numbered "Artikel N" headings
      * "de_law"        — numbered "§ N" headings
      * "nist"          — literal control-family labels
      * anything else   — generic line search for article_label

    Returns the section text truncated to 3000 chars, "" when the PDF or
    PyMuPDF is unavailable (or article_label has no number where one is
    required), or a "[... nicht im PDF gefunden]" marker when the heading
    cannot be located.
    """
    import re

    path = PDF_DIR / pdf_file
    if not path.exists() or fitz is None:
        return ""

    # Read the whole document once; ensure the handle is closed even if a
    # page raises during text extraction (previously it leaked on error).
    doc = fitz.open(str(path))
    try:
        full_text = "".join(page.get_text() + "\n" for page in doc)
    finally:
        doc.close()

    def _numbered_section(heading_tpl: str, not_found_tpl: str) -> str:
        # Shared logic for numbered headings (eu_regulation / de_law, which
        # were previously duplicated): slice from heading N to heading N+1,
        # falling back to a 5000-char window when N+1 is absent.
        num_match = re.search(r'\d+', article_label)
        if not num_match:
            return ""
        num = int(num_match.group())
        match = re.search(heading_tpl.format(num), full_text)
        if not match:
            return not_found_tpl.format(num)
        start = match.start()
        next_match = re.search(heading_tpl.format(num + 1), full_text)
        end = next_match.start() if next_match else start + 5000
        return full_text[start:end].strip()[:3000]

    if doc_type == "eu_regulation":
        return _numbered_section(r'\nArtikel\s+{}\s*\n', "[Artikel {} nicht im PDF gefunden]")

    elif doc_type == "de_law":
        return _numbered_section(r'\n§\s+{}\b', "[§ {} nicht im PDF gefunden]")

    elif doc_type == "nist":
        # NIST control families are matched literally at the start of a line.
        match = re.search(rf'(?:^|\n)\s*{re.escape(article_label)}\b', full_text)
        if not match:
            return f"[{article_label} nicht im PDF gefunden]"
        return full_text[match.start():match.start() + 3000].strip()

    else:
        # Generic fallback: first line containing the label anywhere.
        match = re.search(rf'(?:^|\n).*{re.escape(article_label)}\b', full_text)
        if not match:
            return f"[{article_label} nicht im PDF gefunden]"
        return full_text[match.start():match.start() + 3000].strip()
|
||||
|
||||
|
||||
# ── API Calls ────────────────────────────────────────────────────────
|
||||
|
||||
def call_litellm(prompt: str, system_prompt: str) -> tuple:
    """Call the LiteLLM chat-completions API.

    Returns:
        (response_text, duration_seconds, error, usage) — error is None on
        success, otherwise a short description; usage is the raw "usage"
        dict from the response ({} on any failure).
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {LITELLM_API_KEY}",
    }
    payload = {
        "model": LITELLM_MODEL,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
        "temperature": 0.3,
        "max_tokens": 4096,
        "stream": False,
    }

    t0 = time.time()
    try:
        resp = requests.post(
            f"{LITELLM_URL}/v1/chat/completions",
            headers=headers,
            json=payload,
            timeout=180,
        )
        duration = time.time() - t0
        if resp.status_code != 200:
            # BUG FIX: this branch previously returned a 3-tuple, which broke
            # the 4-value unpacking at the call site on any non-200 response.
            return "", duration, f"HTTP {resp.status_code}: {resp.text[:200]}", {}
        data = resp.json()
        content = data["choices"][0]["message"]["content"]
        usage = data.get("usage", {})
        return content, duration, None, usage
    except Exception as e:
        # Network errors, timeouts, malformed JSON — reported as error string.
        return "", time.time() - t0, str(e), {}
|
||||
|
||||
|
||||
def call_anthropic(prompt: str, system_prompt: str) -> tuple:
    """Call the Anthropic Messages API.

    Returns:
        (response_text, duration_seconds, error, usage) — error is None on
        success, otherwise a short description; usage is the raw "usage"
        dict from the response ({} on any failure).
    """
    headers = {
        "x-api-key": ANTHROPIC_API_KEY,
        "anthropic-version": "2023-06-01",
        "content-type": "application/json",
    }
    payload = {
        "model": ANTHROPIC_MODEL,
        "max_tokens": 4096,
        "system": system_prompt,
        "messages": [{"role": "user", "content": prompt}],
    }

    t0 = time.time()
    try:
        resp = requests.post(ANTHROPIC_URL, headers=headers, json=payload, timeout=180)
        duration = time.time() - t0
        if resp.status_code != 200:
            return "", duration, f"HTTP {resp.status_code}: {resp.text[:200]}", {}
        data = resp.json()
        # The Messages API returns a list of content blocks; take the first text block.
        content = data["content"][0]["text"] if data.get("content") else ""
        usage = data.get("usage", {})
        return content, duration, None, usage
    except Exception as e:
        # Network errors, timeouts, malformed JSON — reported as error string.
        return "", time.time() - t0, str(e), {}
|
||||
|
||||
|
||||
# ── Quality Assessment ───────────────────────────────────────────────

# Fields a response must contain to count as a complete control.
REQUIRED_FIELDS = [
    "title", "objective", "rationale", "requirements",
    "test_procedure", "evidence", "severity", "domain",
]

# Optional enrichment fields that add to the quality score.
BONUS_FIELDS = [
    "tags", "category", "target_audience", "source_article",
    "applicable_industries", "applicable_company_size",
]


def assess_quality(raw_text: str) -> dict:
    """Assess the quality of a control-generation response.

    Parses raw_text as JSON (tolerating markdown code fences and surrounding
    prose; a JSON array is reduced to its first element) and scores the
    result 0-100 on field completeness and depth.

    Returns:
        A metrics dict. When parsing succeeds, "json_valid" is True and
        "parsed_data" holds the parsed object; otherwise all metrics are 0.
    """
    result = {
        "json_valid": False,
        "required_fields": 0,
        "required_total": len(REQUIRED_FIELDS),
        "bonus_fields": 0,
        "bonus_total": len(BONUS_FIELDS),
        "requirements_count": 0,
        "test_procedure_count": 0,
        "evidence_count": 0,
        "title_length": 0,
        "objective_length": 0,
        "score": 0,
    }

    # Strip a surrounding markdown code fence (```json ... ```), if present.
    text = raw_text.strip()
    if text.startswith("```"):
        lines = text.split("\n")
        text = "\n".join(lines[1:-1] if lines[-1].startswith("```") else lines[1:])

    try:
        data = json.loads(text)
        if isinstance(data, list):
            data = data[0] if data else {}
    except json.JSONDecodeError:
        # Fall back to the first {...} span inside surrounding prose.
        import re
        match = re.search(r'\{[\s\S]*\}', text)
        if match:
            try:
                data = json.loads(match.group())
            except json.JSONDecodeError:
                return result
        else:
            return result

    # ROBUSTNESS FIX: valid JSON that is not an object (e.g. a bare number,
    # string, or an array of scalars) previously crashed on .get().
    if not isinstance(data, dict):
        return result

    result["json_valid"] = True

    # Required fields must be non-trivial: strings longer than 2 chars or
    # non-empty lists.
    for f in REQUIRED_FIELDS:
        val = data.get(f)
        if val and (isinstance(val, str) and len(val) > 2 or isinstance(val, list) and len(val) > 0):
            result["required_fields"] += 1

    # Bonus fields only need to be a non-empty string or list.
    for f in BONUS_FIELDS:
        val = data.get(f)
        if val and (isinstance(val, str) and len(val) > 0 or isinstance(val, list) and len(val) > 0):
            result["bonus_fields"] += 1

    # Depth metrics — list lengths, tolerating wrong types.
    reqs = data.get("requirements", [])
    result["requirements_count"] = len(reqs) if isinstance(reqs, list) else 0
    tp = data.get("test_procedure", [])
    result["test_procedure_count"] = len(tp) if isinstance(tp, list) else 0
    ev = data.get("evidence", [])
    result["evidence_count"] = len(ev) if isinstance(ev, list) else 0
    # ROBUSTNESS FIX: a JSON null (or non-string) title/objective previously
    # crashed len(); treat anything non-string as length 0.
    title = data.get("title")
    result["title_length"] = len(title) if isinstance(title, str) else 0
    objective = data.get("objective")
    result["objective_length"] = len(objective) if isinstance(objective, str) else 0

    # Score: 0-100.
    #   20 — valid JSON; 40 — required-field coverage; 15 — bonus coverage;
    #   15 — up to 5 requirements; 9 — up to 3 test steps; 1 — long objective.
    score = 0
    score += 20 if result["json_valid"] else 0
    score += (result["required_fields"] / result["required_total"]) * 40
    score += (result["bonus_fields"] / result["bonus_total"]) * 15
    score += min(result["requirements_count"], 5) * 3  # max 15 for 5+ requirements
    score += min(result["test_procedure_count"], 3) * 3  # max 9 for 3+ tests
    score += 1 if result["objective_length"] > 50 else 0
    result["score"] = round(score, 1)

    result["parsed_data"] = data
    return result
|
||||
|
||||
|
||||
# ── Test Cases ───────────────────────────────────────────────────────

# Five representative gap articles, one per source, used as benchmark inputs.
# Each entry names the source document, the article/section to extract, the
# local PDF file under PDF_DIR, the extraction strategy (doc_type, see
# extract_article_text), and the license tag passed into the prompt.
TEST_CASES = [
    {
        "source": "DSGVO (EU) 2016/679",
        "article": "Artikel 32",
        "pdf": "dsgvo_2016_679.pdf",
        "doc_type": "eu_regulation",
        "license": "EU_LAW",
        "description": "Sicherheit der Verarbeitung — Kernthema Datenschutz",
    },
    {
        "source": "KI-Verordnung (EU) 2024/1689",
        "article": "Artikel 9",
        "pdf": "ai_act_2024_1689.pdf",
        "doc_type": "eu_regulation",
        "license": "EU_LAW",
        "description": "Risikomanagement für Hochrisiko-KI",
    },
    {
        "source": "NIS2-Richtlinie (EU) 2022/2555",
        "article": "Artikel 21",
        "pdf": "nis2_2022_2555.pdf",
        "doc_type": "eu_regulation",
        "license": "EU_LAW",
        "description": "Cybersicherheitsrisikomanagement — NIS2 Kernpflicht",
    },
    {
        "source": "Cyber Resilience Act (CRA)",
        "article": "Artikel 13",
        "pdf": "cra_2024_2847.pdf",
        "doc_type": "eu_regulation",
        "license": "EU_LAW",
        "description": "Pflichten der Hersteller",
    },
    {
        "source": "Bundesdatenschutzgesetz (BDSG)",
        "article": "§ 26",
        "pdf": "bdsg.pdf",
        "doc_type": "de_law",
        "license": "DE_LAW",
        "description": "Datenverarbeitung im Beschäftigungskontext",
    },
]
|
||||
|
||||
|
||||
# ── Main ─────────────────────────────────────────────────────────────

def main():
    """Run the benchmark over TEST_CASES and print a comparison summary.

    For each test case: extracts the article text from its PDF, sends the
    same prompt to both models, scores each response with assess_quality,
    prints a per-case comparison, and finally prints aggregate metrics plus
    a cost/time extrapolation. Full results are written to /tmp as JSON.
    """
    if not ANTHROPIC_API_KEY:
        print("ERROR: Set ANTHROPIC_API_KEY environment variable")
        sys.exit(1)

    print("=" * 80)
    print("LLM BENCHMARK: gpt-oss-120b vs Claude Sonnet 4.6")
    print("=" * 80)
    print(f" LiteLLM: {LITELLM_URL} / {LITELLM_MODEL}")
    print(f" Anthropic: {ANTHROPIC_MODEL}")
    print(f" Tests: {len(TEST_CASES)}")
    print()

    # Pre-check LiteLLM reachability before spending time on extraction.
    try:
        r = requests.get(f"{LITELLM_URL}/v1/models",
                         headers={"Authorization": f"Bearer {LITELLM_API_KEY}"}, timeout=10)
        print(f" LiteLLM OK: {r.status_code}")
    except Exception as e:
        print(f" LiteLLM ERROR: {e}")
        sys.exit(1)

    results = []

    for i, tc in enumerate(TEST_CASES):
        print(f"\n{'='*80}")
        print(f"TEST {i+1}/{len(TEST_CASES)}: {tc['source']} — {tc['article']}")
        print(f" {tc['description']}")
        print(f"{'='*80}")

        # Extract article text from PDF; a leading "[" marks the
        # "not found in PDF" sentinel from extract_article_text.
        article_text = extract_article_text(tc["pdf"], tc["article"], tc["doc_type"])
        if not article_text or article_text.startswith("["):
            print(f" WARNING: {article_text or 'Empty text'}")
            continue

        print(f" Text extracted: {len(article_text)} chars")
        print(f" First 120 chars: {article_text[:120].replace(chr(10), ' ')}...")

        prompt = build_prompt(tc["source"], tc["article"], article_text, tc["license"])

        # ── Call LiteLLM ──
        print(f"\n --- gpt-oss-120b ---")
        litellm_raw, litellm_time, litellm_err, litellm_usage = call_litellm(prompt, SYSTEM_PROMPT)
        if litellm_err:
            print(f" ERROR: {litellm_err}")
            # Minimal placeholder so the comparison/summary code below works.
            litellm_quality = {"json_valid": False, "score": 0}
        else:
            print(f" Time: {litellm_time:.1f}s")
            print(f" Tokens: {litellm_usage}")
            litellm_quality = assess_quality(litellm_raw)
            print(f" JSON valid: {litellm_quality['json_valid']}")
            print(f" Score: {litellm_quality['score']}/100")
            print(f" Required fields: {litellm_quality['required_fields']}/{litellm_quality['required_total']}")
            print(f" Requirements: {litellm_quality['requirements_count']}, "
                  f"Tests: {litellm_quality['test_procedure_count']}, "
                  f"Evidence: {litellm_quality['evidence_count']}")
            if litellm_quality.get("parsed_data"):
                d = litellm_quality["parsed_data"]
                print(f" Title: {d.get('title', 'N/A')}")

        # ── Call Anthropic ──
        print(f"\n --- Claude Sonnet 4.6 ---")
        anthropic_raw, anthropic_time, anthropic_err, anthropic_usage = call_anthropic(prompt, SYSTEM_PROMPT)
        if anthropic_err:
            print(f" ERROR: {anthropic_err}")
            anthropic_quality = {"json_valid": False, "score": 0}
        else:
            print(f" Time: {anthropic_time:.1f}s")
            print(f" Tokens: {anthropic_usage}")
            anthropic_quality = assess_quality(anthropic_raw)
            print(f" JSON valid: {anthropic_quality['json_valid']}")
            print(f" Score: {anthropic_quality['score']}/100")
            print(f" Required fields: {anthropic_quality['required_fields']}/{anthropic_quality['required_total']}")
            print(f" Requirements: {anthropic_quality['requirements_count']}, "
                  f"Tests: {anthropic_quality['test_procedure_count']}, "
                  f"Evidence: {anthropic_quality['evidence_count']}")
            if anthropic_quality.get("parsed_data"):
                d = anthropic_quality["parsed_data"]
                print(f" Title: {d.get('title', 'N/A')}")

        # Compare the two models on this test case (speed ratio and score).
        print(f"\n --- VERGLEICH ---")
        speed_ratio = litellm_time / anthropic_time if anthropic_time > 0 else 0
        print(f" Speed: 120b {litellm_time:.1f}s vs Sonnet {anthropic_time:.1f}s "
              f"({'120b ' + str(round(speed_ratio, 1)) + 'x langsamer' if speed_ratio > 1 else '120b schneller'})")
        print(f" Score: 120b {litellm_quality.get('score', 0)}/100 vs "
              f"Sonnet {anthropic_quality.get('score', 0)}/100")

        results.append({
            "test": f"{tc['source']} — {tc['article']}",
            "litellm": {
                "time": round(litellm_time, 1),
                "score": litellm_quality.get("score", 0),
                "json_valid": litellm_quality.get("json_valid", False),
                "requirements": litellm_quality.get("requirements_count", 0),
                "tests": litellm_quality.get("test_procedure_count", 0),
                "usage": litellm_usage,
                "raw": litellm_raw[:500] if litellm_raw else "",
            },
            "anthropic": {
                "time": round(anthropic_time, 1),
                "score": anthropic_quality.get("score", 0),
                "json_valid": anthropic_quality.get("json_valid", False),
                "requirements": anthropic_quality.get("requirements_count", 0),
                "tests": anthropic_quality.get("test_procedure_count", 0),
                "usage": anthropic_usage,
                "raw": anthropic_raw[:500] if anthropic_raw else "",
            },
        })

    # ── Summary ──────────────────────────────────────────────────────
    print(f"\n\n{'='*80}")
    print("ZUSAMMENFASSUNG")
    print(f"{'='*80}")

    if not results:
        print(" Keine Ergebnisse.")
        return

    litellm_scores = [r["litellm"]["score"] for r in results]
    anthropic_scores = [r["anthropic"]["score"] for r in results]
    litellm_times = [r["litellm"]["time"] for r in results]
    anthropic_times = [r["anthropic"]["time"] for r in results]

    print(f"\n {'Metrik':<30s} {'gpt-oss-120b':>15s} {'Claude Sonnet':>15s}")
    print(f" {'-'*30} {'-'*15} {'-'*15}")
    print(f" {'Avg Score (0-100)':<30s} {sum(litellm_scores)/len(litellm_scores):>13.1f} "
          f"{sum(anthropic_scores)/len(anthropic_scores):>13.1f}")
    print(f" {'Avg Time (s)':<30s} {sum(litellm_times)/len(litellm_times):>13.1f} "
          f"{sum(anthropic_times)/len(anthropic_times):>13.1f}")
    print(f" {'JSON Valid':<30s} {sum(1 for r in results if r['litellm']['json_valid']):>12d}/{len(results)} "
          f"{sum(1 for r in results if r['anthropic']['json_valid']):>12d}/{len(results)}")
    print(f" {'Avg Requirements':<30s} "
          f"{sum(r['litellm']['requirements'] for r in results)/len(results):>13.1f} "
          f"{sum(r['anthropic']['requirements'] for r in results)/len(results):>13.1f}")
    print(f" {'Avg Test Procedures':<30s} "
          f"{sum(r['litellm']['tests'] for r in results)/len(results):>13.1f} "
          f"{sum(r['anthropic']['tests'] for r in results)/len(results):>13.1f}")

    # Cost estimate
    # Claude Sonnet: ~$3/M input, ~$15/M output
    # gpt-oss-120b: self-hosted = $0 API cost (only compute)
    total_anthropic_input = sum(r["anthropic"]["usage"].get("input_tokens", 0) for r in results)
    total_anthropic_output = sum(r["anthropic"]["usage"].get("output_tokens", 0) for r in results)
    anthropic_cost = (total_anthropic_input * 3 + total_anthropic_output * 15) / 1_000_000

    print(f"\n Kostenvergleich (fuer {len(results)} Controls):")
    print(f" gpt-oss-120b: $0.00 (self-hosted)")
    print(f" Claude Sonnet: ${anthropic_cost:.4f} "
          f"({total_anthropic_input} input + {total_anthropic_output} output tokens)")

    # Extrapolate cost/time to the full backlog of 494 gap articles.
    if results:
        cost_per_control = anthropic_cost / len(results)
        print(f"\n Hochrechnung fuer 494 Luecken-Artikel:")
        print(f" gpt-oss-120b: $0.00")
        print(f" Claude Sonnet: ${cost_per_control * 494:.2f}")
        avg_time_120b = sum(litellm_times) / len(litellm_times)
        avg_time_sonnet = sum(anthropic_times) / len(anthropic_times)
        print(f" Zeit 120b: {avg_time_120b * 494 / 60:.0f} min ({avg_time_120b * 494 / 3600:.1f}h)")
        print(f" Zeit Sonnet: {avg_time_sonnet * 494 / 60:.0f} min ({avg_time_sonnet * 494 / 3600:.1f}h)")

    # Save full per-case results (including raw response snippets) as JSON.
    out_path = "/tmp/benchmark_llm_results.json"
    with open(out_path, 'w') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    print(f"\n Detaillierte Ergebnisse: {out_path}")
|
||||
|
||||
|
||||
# Script entry point.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user