#!/usr/bin/env python3
"""
Phase 7.4 Benchmark: Compare gpt-oss-120b vs Claude Sonnet for Control Generation.

Tests 5 representative gap articles from different sources.
Measures: quality (JSON valid, fields complete), response time, cost estimate.

Usage: python3 benchmark_llm_controls.py
"""
import json
import os
import re
import sys
import time
from pathlib import Path

import requests

# ── Config ──────────────────────────────────────────────────────────
LITELLM_URL = "https://llm-dev.meghsakha.com"
LITELLM_MODEL = "gpt-oss-120b"
# NOTE(security): do not commit live API keys to source control. The key is now
# read from the environment first; the hard-coded value is kept only as a
# backward-compatible fallback and should be rotated/removed.
LITELLM_API_KEY = os.environ.get("LITELLM_API_KEY", "sk-0nAyxaMVbIqmz_ntnndzag")

ANTHROPIC_URL = "https://api.anthropic.com/v1/messages"
ANTHROPIC_MODEL = "claude-sonnet-4-6"
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")

PDF_DIR = Path(os.path.expanduser("~/rag-ingestion/pdfs"))

try:
    import fitz  # PyMuPDF
except ImportError:
    print("PyMuPDF not available, using pre-extracted texts")
    fitz = None

# ── Prompts (identical to control_generator.py) ─────────────────────
SYSTEM_PROMPT = """Du bist ein Security-Compliance-Experte. Strukturiere den gegebenen Text als praxisorientiertes Security Control.
Erstelle eine verständliche, umsetzbare Formulierung.
Antworte NUR mit validem JSON. Bei mehreren Controls antworte mit einem JSON-Array."""

APPLICABILITY_PROMPT = """- applicable_industries: Liste der Branchen fuer die dieses Control relevant ist. Verwende ["all"] wenn der Control branchenuebergreifend gilt. Moegliche Werte: "all", "Technologie / IT", "Finanzdienstleistungen", "Gesundheitswesen", "Produktion / Industrie", "Energie", "Telekommunikation", "Oeffentlicher Dienst"
- applicable_company_size: Ab welcher Unternehmensgroesse gilt dieses Control? Verwende ["all"] wenn keine Groessenbeschraenkung. Moegliche Werte: "all", "micro", "small", "medium", "large", "enterprise"
- scope_conditions: null wenn keine besonderen Bedingungen, sonst: {"requires_any": ["signal"], "description": "Erklaerung"}"""


def build_prompt(source_name: str, article_label: str, article_text: str, license_type: str) -> str:
    """Build the user prompt asking the model to structure a law article as a control.

    The article text is truncated to 3000 characters to bound token usage.
    """
    return f"""Strukturiere den folgenden Gesetzestext als Security/Compliance Control.

Du DARFST den Originaltext verwenden (Quelle: {source_name}, {license_type}).

WICHTIG: Erstelle eine verständliche, praxisorientierte Formulierung. Der Originaltext wird separat gespeichert — deine Formulierung soll klar und umsetzbar sein.

Gib JSON zurück mit diesen Feldern:
- title: Kurzer prägnanter Titel (max 100 Zeichen)
- objective: Was soll erreicht werden? (1-3 Sätze)
- rationale: Warum ist das wichtig? (1-2 Sätze)
- requirements: Liste von konkreten Anforderungen (Strings)
- test_procedure: Liste von Prüfschritten (Strings)
- evidence: Liste von Nachweisdokumenten (Strings)
- severity: low/medium/high/critical
- tags: Liste von Tags
- domain: Fachgebiet (AUTH/CRYP/NET/DATA/LOG/ACC/SEC/INC/AI/COMP/GOV)
- category: Inhaltliche Kategorie
- target_audience: Liste der Zielgruppen
- source_article: Artikel-Referenz (z.B. "Artikel 10", "§ 42")
- source_paragraph: Absatz-Referenz (z.B. "Absatz 5")
{APPLICABILITY_PROMPT}

Text:
{article_text[:3000]}

Quelle: {source_name}, {article_label}"""


# ── PDF Text Extraction ─────────────────────────────────────────────
def _bounded_section(full_text: str, start_pattern: str, next_pattern: str, not_found_msg: str) -> str:
    """Return the text between start_pattern and next_pattern, capped at 3000 chars.

    Falls back to a 5000-char window when the next section heading is absent;
    returns not_found_msg when the start heading is not present at all.
    """
    match = re.search(start_pattern, full_text)
    if not match:
        return not_found_msg
    start = match.start()
    next_match = re.search(next_pattern, full_text)
    end = next_match.start() if next_match else start + 5000
    return full_text[start:end].strip()[:3000]


def extract_article_text(pdf_file: str, article_label: str, doc_type: str) -> str:
    """Extract the text of a specific article from a PDF.

    Returns "" when the PDF is missing or PyMuPDF is unavailable, and a
    German "[... nicht im PDF gefunden]" marker when the heading is absent.
    """
    path = PDF_DIR / pdf_file
    if not path.exists() or fitz is None:
        return ""
    doc = fitz.open(str(path))
    full_text = "".join(page.get_text() + "\n" for page in doc)
    doc.close()

    if doc_type == "eu_regulation":
        # EU regulations are sectioned by "Artikel N" headings.
        art_num = re.search(r'\d+', article_label)
        if not art_num:
            return ""
        num = int(art_num.group())
        return _bounded_section(
            full_text,
            rf'\nArtikel\s+{num}\s*\n',
            rf'\nArtikel\s+{num + 1}\s*\n',
            f"[Artikel {num} nicht im PDF gefunden]",
        )
    elif doc_type == "de_law":
        # German federal law is sectioned by "§ N" headings.
        para_num = re.search(r'\d+', article_label)
        if not para_num:
            return ""
        num = int(para_num.group())
        return _bounded_section(
            full_text,
            rf'\n§\s+{num}\b',
            rf'\n§\s+{num + 1}\b',
            f"[§ {num} nicht im PDF gefunden]",
        )
    elif doc_type == "nist":
        # Find NIST control family label; no reliable "next" heading, so take
        # a fixed 3000-char window from the match.
        match = re.search(rf'(?:^|\n)\s*{re.escape(article_label)}\b', full_text)
        if not match:
            return f"[{article_label} nicht im PDF gefunden]"
        start = match.start()
        return full_text[start:start + 3000].strip()
    else:
        # Generic section search: first line containing the label.
        match = re.search(rf'(?:^|\n).*{re.escape(article_label)}\b', full_text)
        if not match:
            return f"[{article_label} nicht im PDF gefunden]"
        start = match.start()
        return full_text[start:start + 3000].strip()


# ── API Calls ────────────────────────────────────────────────────────
def call_litellm(prompt: str, system_prompt: str) -> tuple:
    """Call the LiteLLM (OpenAI-compatible) chat completions API.

    Returns (response_text, duration_seconds, error, usage_dict); on failure
    response_text is "" and error holds the reason.
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {LITELLM_API_KEY}",
    }
    payload = {
        "model": LITELLM_MODEL,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
        "temperature": 0.3,
        "max_tokens": 4096,
        "stream": False,
    }
    t0 = time.time()
    try:
        resp = requests.post(
            f"{LITELLM_URL}/v1/chat/completions",
            headers=headers,
            json=payload,
            timeout=180,
        )
        duration = time.time() - t0
        if resp.status_code != 200:
            # BUG FIX: this branch previously returned a 3-tuple while every
            # caller unpacks 4 values, raising ValueError on any non-200
            # response. Return an empty usage dict like the other paths.
            return "", duration, f"HTTP {resp.status_code}: {resp.text[:200]}", {}
        data = resp.json()
        content = data["choices"][0]["message"]["content"]
        usage = data.get("usage", {})
        return content, duration, None, usage
    except Exception as e:
        return "", time.time() - t0, str(e), {}


def call_anthropic(prompt: str, system_prompt: str) -> tuple:
    """Call the Anthropic Messages API.

    Returns (response_text, duration_seconds, error, usage_dict); on failure
    response_text is "" and error holds the reason.
    """
    headers = {
        "x-api-key": ANTHROPIC_API_KEY,
        "anthropic-version": "2023-06-01",
        "content-type": "application/json",
    }
    payload = {
        "model": ANTHROPIC_MODEL,
        "max_tokens": 4096,
        "system": system_prompt,
        "messages": [{"role": "user", "content": prompt}],
    }
    t0 = time.time()
    try:
        resp = requests.post(ANTHROPIC_URL, headers=headers, json=payload, timeout=180)
        duration = time.time() - t0
        if resp.status_code != 200:
            return "", duration, f"HTTP {resp.status_code}: {resp.text[:200]}", {}
        data = resp.json()
        content = data["content"][0]["text"] if data.get("content") else ""
        usage = data.get("usage", {})
        return content, duration, None, usage
    except Exception as e:
        return "", time.time() - t0, str(e), {}


# ── Quality Assessment ───────────────────────────────────────────────
REQUIRED_FIELDS = [
    "title", "objective", "rationale", "requirements",
    "test_procedure", "evidence", "severity", "domain",
]
BONUS_FIELDS = [
    "tags", "category", "target_audience", "source_article",
    "applicable_industries", "applicable_company_size",
]


def assess_quality(raw_text: str) -> dict:
    """Assess the quality of a control generation response.

    Parses the (possibly fenced) JSON, counts filled required/bonus fields
    and list depths, and computes a 0-100 score. On unparseable input the
    zeroed result dict is returned (no "parsed_data" key).
    """
    result = {
        "json_valid": False,
        "required_fields": 0,
        "required_total": len(REQUIRED_FIELDS),
        "bonus_fields": 0,
        "bonus_total": len(BONUS_FIELDS),
        "requirements_count": 0,
        "test_procedure_count": 0,
        "evidence_count": 0,
        "title_length": 0,
        "objective_length": 0,
        "score": 0,
    }
    # Strip a markdown code fence if the model wrapped its JSON in one.
    text = raw_text.strip()
    if text.startswith("```"):
        lines = text.split("\n")
        text = "\n".join(lines[1:-1] if lines[-1].startswith("```") else lines[1:])
    try:
        data = json.loads(text)
        if isinstance(data, list):
            data = data[0] if data else {}
    except json.JSONDecodeError:
        # Fall back to the first {...} span embedded in surrounding prose.
        match = re.search(r'\{[\s\S]*\}', text)
        if match:
            try:
                data = json.loads(match.group())
            except json.JSONDecodeError:
                return result
        else:
            return result
    result["json_valid"] = True

    # Required fields count only when non-trivially filled (str > 2 chars or
    # non-empty list).
    for f in REQUIRED_FIELDS:
        val = data.get(f)
        if val and (isinstance(val, str) and len(val) > 2 or isinstance(val, list) and len(val) > 0):
            result["required_fields"] += 1
    for f in BONUS_FIELDS:
        val = data.get(f)
        if val and (isinstance(val, str) and len(val) > 0 or isinstance(val, list) and len(val) > 0):
            result["bonus_fields"] += 1

    # Depth metrics.
    reqs = data.get("requirements", [])
    result["requirements_count"] = len(reqs) if isinstance(reqs, list) else 0
    tp = data.get("test_procedure", [])
    result["test_procedure_count"] = len(tp) if isinstance(tp, list) else 0
    ev = data.get("evidence", [])
    result["evidence_count"] = len(ev) if isinstance(ev, list) else 0
    # Robustness fix: `or ""` guards against explicit JSON null values,
    # which would make len() raise TypeError.
    result["title_length"] = len(data.get("title") or "")
    result["objective_length"] = len(data.get("objective") or "")

    # Score: 0-100.
    score = 20  # valid JSON (always true past this point)
    score += (result["required_fields"] / result["required_total"]) * 40
    score += (result["bonus_fields"] / result["bonus_total"]) * 15
    score += min(result["requirements_count"], 5) * 3   # max 15 for 5+ requirements
    score += min(result["test_procedure_count"], 3) * 3  # max 9 for 3+ tests
    score += 1 if result["objective_length"] > 50 else 0
    result["score"] = round(score, 1)
    result["parsed_data"] = data
    return result


# ── Test Cases ───────────────────────────────────────────────────────
TEST_CASES = [
    {
        "source": "DSGVO (EU) 2016/679",
        "article": "Artikel 32",
        "pdf": "dsgvo_2016_679.pdf",
        "doc_type": "eu_regulation",
        "license": "EU_LAW",
        "description": "Sicherheit der Verarbeitung — Kernthema Datenschutz",
    },
    {
        "source": "KI-Verordnung (EU) 2024/1689",
        "article": "Artikel 9",
        "pdf": "ai_act_2024_1689.pdf",
        "doc_type": "eu_regulation",
        "license": "EU_LAW",
        "description": "Risikomanagement für Hochrisiko-KI",
    },
    {
        "source": "NIS2-Richtlinie (EU) 2022/2555",
        "article": "Artikel 21",
        "pdf": "nis2_2022_2555.pdf",
        "doc_type": "eu_regulation",
        "license": "EU_LAW",
        "description": "Cybersicherheitsrisikomanagement — NIS2 Kernpflicht",
    },
    {
        "source": "Cyber Resilience Act (CRA)",
        "article": "Artikel 13",
        "pdf": "cra_2024_2847.pdf",
        "doc_type": "eu_regulation",
        "license": "EU_LAW",
        "description": "Pflichten der Hersteller",
    },
    {
        "source": "Bundesdatenschutzgesetz (BDSG)",
        "article": "§ 26",
        "pdf": "bdsg.pdf",
        "doc_type": "de_law",
        "license": "DE_LAW",
        "description": "Datenverarbeitung im Beschäftigungskontext",
    },
]


# ── Main ─────────────────────────────────────────────────────────────
def main():
    """Run all test cases against both models, print per-test and summary stats."""
    if not ANTHROPIC_API_KEY:
        print("ERROR: Set ANTHROPIC_API_KEY environment variable")
        sys.exit(1)

    print("=" * 80)
    print("LLM BENCHMARK: gpt-oss-120b vs Claude Sonnet 4.6")
    print("=" * 80)
    print(f"  LiteLLM:   {LITELLM_URL} / {LITELLM_MODEL}")
    print(f"  Anthropic: {ANTHROPIC_MODEL}")
    print(f"  Tests:     {len(TEST_CASES)}")
    print()

    # Pre-check LiteLLM reachability before burning time on PDF extraction.
    try:
        r = requests.get(
            f"{LITELLM_URL}/v1/models",
            headers={"Authorization": f"Bearer {LITELLM_API_KEY}"},
            timeout=10,
        )
        print(f"  LiteLLM OK: {r.status_code}")
    except Exception as e:
        print(f"  LiteLLM ERROR: {e}")
        sys.exit(1)

    results = []
    for i, tc in enumerate(TEST_CASES):
        print(f"\n{'='*80}")
        print(f"TEST {i+1}/{len(TEST_CASES)}: {tc['source']} — {tc['article']}")
        print(f"  {tc['description']}")
        print(f"{'='*80}")

        # Extract article text from PDF; skip the case if extraction failed
        # (empty text or a "[... nicht im PDF gefunden]" marker).
        article_text = extract_article_text(tc["pdf"], tc["article"], tc["doc_type"])
        if not article_text or article_text.startswith("["):
            print(f"  WARNING: {article_text or 'Empty text'}")
            continue
        print(f"  Text extracted: {len(article_text)} chars")
        print(f"  First 120 chars: {article_text[:120].replace(chr(10), ' ')}...")

        prompt = build_prompt(tc["source"], tc["article"], article_text, tc["license"])

        # ── Call LiteLLM ──
        print(f"\n  --- gpt-oss-120b ---")
        litellm_raw, litellm_time, litellm_err, litellm_usage = call_litellm(prompt, SYSTEM_PROMPT)
        if litellm_err:
            print(f"  ERROR: {litellm_err}")
            litellm_quality = {"json_valid": False, "score": 0}
        else:
            print(f"  Time: {litellm_time:.1f}s")
            print(f"  Tokens: {litellm_usage}")
            litellm_quality = assess_quality(litellm_raw)
            print(f"  JSON valid: {litellm_quality['json_valid']}")
            print(f"  Score: {litellm_quality['score']}/100")
            print(f"  Required fields: {litellm_quality['required_fields']}/{litellm_quality['required_total']}")
            print(f"  Requirements: {litellm_quality['requirements_count']}, "
                  f"Tests: {litellm_quality['test_procedure_count']}, "
                  f"Evidence: {litellm_quality['evidence_count']}")
            if litellm_quality.get("parsed_data"):
                d = litellm_quality["parsed_data"]
                print(f"  Title: {d.get('title', 'N/A')}")

        # ── Call Anthropic ──
        print(f"\n  --- Claude Sonnet 4.6 ---")
        anthropic_raw, anthropic_time, anthropic_err, anthropic_usage = call_anthropic(prompt, SYSTEM_PROMPT)
        if anthropic_err:
            print(f"  ERROR: {anthropic_err}")
            anthropic_quality = {"json_valid": False, "score": 0}
        else:
            print(f"  Time: {anthropic_time:.1f}s")
            print(f"  Tokens: {anthropic_usage}")
            anthropic_quality = assess_quality(anthropic_raw)
            print(f"  JSON valid: {anthropic_quality['json_valid']}")
            print(f"  Score: {anthropic_quality['score']}/100")
            print(f"  Required fields: {anthropic_quality['required_fields']}/{anthropic_quality['required_total']}")
            print(f"  Requirements: {anthropic_quality['requirements_count']}, "
                  f"Tests: {anthropic_quality['test_procedure_count']}, "
                  f"Evidence: {anthropic_quality['evidence_count']}")
            if anthropic_quality.get("parsed_data"):
                d = anthropic_quality["parsed_data"]
                print(f"  Title: {d.get('title', 'N/A')}")

        # Compare
        print(f"\n  --- VERGLEICH ---")
        speed_ratio = litellm_time / anthropic_time if anthropic_time > 0 else 0
        print(f"  Speed: 120b {litellm_time:.1f}s vs Sonnet {anthropic_time:.1f}s "
              f"({'120b ' + str(round(speed_ratio, 1)) + 'x langsamer' if speed_ratio > 1 else '120b schneller'})")
        print(f"  Score: 120b {litellm_quality.get('score', 0)}/100 vs "
              f"Sonnet {anthropic_quality.get('score', 0)}/100")

        results.append({
            "test": f"{tc['source']} — {tc['article']}",
            "litellm": {
                "time": round(litellm_time, 1),
                "score": litellm_quality.get("score", 0),
                "json_valid": litellm_quality.get("json_valid", False),
                "requirements": litellm_quality.get("requirements_count", 0),
                "tests": litellm_quality.get("test_procedure_count", 0),
                "usage": litellm_usage,
                "raw": litellm_raw[:500] if litellm_raw else "",
            },
            "anthropic": {
                "time": round(anthropic_time, 1),
                "score": anthropic_quality.get("score", 0),
                "json_valid": anthropic_quality.get("json_valid", False),
                "requirements": anthropic_quality.get("requirements_count", 0),
                "tests": anthropic_quality.get("test_procedure_count", 0),
                "usage": anthropic_usage,
                "raw": anthropic_raw[:500] if anthropic_raw else "",
            },
        })

    # ── Summary ──────────────────────────────────────────────────────
    print(f"\n\n{'='*80}")
    print("ZUSAMMENFASSUNG")
    print(f"{'='*80}")
    if not results:
        print("  Keine Ergebnisse.")
        return

    litellm_scores = [r["litellm"]["score"] for r in results]
    anthropic_scores = [r["anthropic"]["score"] for r in results]
    litellm_times = [r["litellm"]["time"] for r in results]
    anthropic_times = [r["anthropic"]["time"] for r in results]

    print(f"\n  {'Metrik':<30s} {'gpt-oss-120b':>15s} {'Claude Sonnet':>15s}")
    print(f"  {'-'*30} {'-'*15} {'-'*15}")
    print(f"  {'Avg Score (0-100)':<30s} {sum(litellm_scores)/len(litellm_scores):>13.1f} "
          f"{sum(anthropic_scores)/len(anthropic_scores):>13.1f}")
    print(f"  {'Avg Time (s)':<30s} {sum(litellm_times)/len(litellm_times):>13.1f} "
          f"{sum(anthropic_times)/len(anthropic_times):>13.1f}")
    print(f"  {'JSON Valid':<30s} {sum(1 for r in results if r['litellm']['json_valid']):>12d}/{len(results)} "
          f"{sum(1 for r in results if r['anthropic']['json_valid']):>12d}/{len(results)}")
    print(f"  {'Avg Requirements':<30s} "
          f"{sum(r['litellm']['requirements'] for r in results)/len(results):>13.1f} "
          f"{sum(r['anthropic']['requirements'] for r in results)/len(results):>13.1f}")
    print(f"  {'Avg Test Procedures':<30s} "
          f"{sum(r['litellm']['tests'] for r in results)/len(results):>13.1f} "
          f"{sum(r['anthropic']['tests'] for r in results)/len(results):>13.1f}")

    # Cost estimate
    # Claude Sonnet: ~$3/M input, ~$15/M output
    # gpt-oss-120b: self-hosted = $0 API cost (only compute)
    total_anthropic_input = sum(r["anthropic"]["usage"].get("input_tokens", 0) for r in results)
    total_anthropic_output = sum(r["anthropic"]["usage"].get("output_tokens", 0) for r in results)
    anthropic_cost = (total_anthropic_input * 3 + total_anthropic_output * 15) / 1_000_000
    print(f"\n  Kostenvergleich (fuer {len(results)} Controls):")
    print(f"    gpt-oss-120b:  $0.00 (self-hosted)")
    print(f"    Claude Sonnet: ${anthropic_cost:.4f} "
          f"({total_anthropic_input} input + {total_anthropic_output} output tokens)")

    # Extrapolate for 494 gap articles
    if results:
        cost_per_control = anthropic_cost / len(results)
        print(f"\n  Hochrechnung fuer 494 Luecken-Artikel:")
        print(f"    gpt-oss-120b:  $0.00")
        print(f"    Claude Sonnet: ${cost_per_control * 494:.2f}")
        avg_time_120b = sum(litellm_times) / len(litellm_times)
        avg_time_sonnet = sum(anthropic_times) / len(anthropic_times)
        print(f"    Zeit 120b:   {avg_time_120b * 494 / 60:.0f} min ({avg_time_120b * 494 / 3600:.1f}h)")
        print(f"    Zeit Sonnet: {avg_time_sonnet * 494 / 60:.0f} min ({avg_time_sonnet * 494 / 3600:.1f}h)")

    # Save full results
    out_path = "/tmp/benchmark_llm_results.json"
    with open(out_path, 'w') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    print(f"\n  Detaillierte Ergebnisse: {out_path}")


if __name__ == "__main__":
    main()