Files
breakpilot-compliance/scripts/qa/benchmark_llm_controls.py
Benjamin Admin 643b26618f
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 31s
CI/CD / test-python-backend-compliance (push) Successful in 1m35s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 17s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
feat: Control Library UI, dedup migration, QA tooling, docs
- Control Library: parent control display, ObligationTypeBadge,
  GenerationStrategyBadge variants, evidence string fallback
- API: expose parent_control_uuid/id/title in canonical controls
- Fix: DSFA SQLAlchemy 2.0 Row._mapping compatibility
- Migration 074: control_parent_links + control_dedup_reviews tables
- QA scripts: benchmark, gap analysis, OSCAL import, OWASP cleanup,
  phase5 normalize, phase74 gap fill, sync_db, run_job
- Docs: dedup engine, RAG benchmark, lessons learned, pipeline docs

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-21 11:56:08 +01:00

525 lines
21 KiB
Python

#!/usr/bin/env python3
"""
Phase 7.4 Benchmark: Compare gpt-oss-120b vs Claude Sonnet for Control Generation.
Tests 5 representative gap articles from different sources.
Measures: quality (JSON valid, fields complete), response time, cost estimate.
Usage:
python3 benchmark_llm_controls.py
"""
import json
import time
import sys
import os
import requests
from pathlib import Path
# ── Config ──────────────────────────────────────────────────────────
LITELLM_URL = "https://llm-dev.meghsakha.com"
LITELLM_MODEL = "gpt-oss-120b"
# SECURITY: a live key was committed here. Prefer the environment variable;
# the embedded fallback is kept only for backward compatibility and must be
# rotated and removed.  TODO(review): rotate this key.
LITELLM_API_KEY = os.environ.get("LITELLM_API_KEY", "sk-0nAyxaMVbIqmz_ntnndzag")
ANTHROPIC_URL = "https://api.anthropic.com/v1/messages"
ANTHROPIC_MODEL = "claude-sonnet-4-6"
# Anthropic key is never hard-coded; main() exits early when it is unset.
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
PDF_DIR = Path(os.path.expanduser("~/rag-ingestion/pdfs"))
# PyMuPDF is optional: without it extract_article_text() returns "" and the
# affected test cases are skipped with a warning.
try:
    import fitz  # PyMuPDF
except ImportError:
    print("PyMuPDF not available, using pre-extracted texts")
    fitz = None
# ── Prompts (identical to control_generator.py) ─────────────────────
# System prompt sent to both models. German on purpose (target corpus is
# German law); runtime data — do not translate or reformat.
SYSTEM_PROMPT = """Du bist ein Security-Compliance-Experte. Strukturiere den gegebenen Text
als praxisorientiertes Security Control. Erstelle eine verständliche, umsetzbare Formulierung.
Antworte NUR mit validem JSON. Bei mehreren Controls antworte mit einem JSON-Array."""
# Applicability section appended to every user prompt by build_prompt();
# asks the model for industries, company size, and scope conditions.
APPLICABILITY_PROMPT = """- applicable_industries: Liste der Branchen fuer die dieses Control relevant ist.
Verwende ["all"] wenn der Control branchenuebergreifend gilt.
Moegliche Werte: "all", "Technologie / IT", "Finanzdienstleistungen", "Gesundheitswesen",
"Produktion / Industrie", "Energie", "Telekommunikation", "Oeffentlicher Dienst"
- applicable_company_size: Ab welcher Unternehmensgroesse gilt dieses Control?
Verwende ["all"] wenn keine Groessenbeschraenkung.
Moegliche Werte: "all", "micro", "small", "medium", "large", "enterprise"
- scope_conditions: null wenn keine besonderen Bedingungen, sonst:
{"requires_any": ["signal"], "description": "Erklaerung"}"""
def build_prompt(source_name: str, article_label: str, article_text: str, license_type: str) -> str:
    """Build the user prompt for control generation (same for both models).

    The article text is truncated to 3000 chars to bound token usage; the
    shared APPLICABILITY_PROMPT section is spliced in. The template text is
    runtime data (German) and must not be reworded.
    """
    return f"""Strukturiere den folgenden Gesetzestext als Security/Compliance Control.
Du DARFST den Originaltext verwenden (Quelle: {source_name}, {license_type}).
WICHTIG: Erstelle eine verständliche, praxisorientierte Formulierung.
Der Originaltext wird separat gespeichert — deine Formulierung soll klar und umsetzbar sein.
Gib JSON zurück mit diesen Feldern:
- title: Kurzer prägnanter Titel (max 100 Zeichen)
- objective: Was soll erreicht werden? (1-3 Sätze)
- rationale: Warum ist das wichtig? (1-2 Sätze)
- requirements: Liste von konkreten Anforderungen (Strings)
- test_procedure: Liste von Prüfschritten (Strings)
- evidence: Liste von Nachweisdokumenten (Strings)
- severity: low/medium/high/critical
- tags: Liste von Tags
- domain: Fachgebiet (AUTH/CRYP/NET/DATA/LOG/ACC/SEC/INC/AI/COMP/GOV)
- category: Inhaltliche Kategorie
- target_audience: Liste der Zielgruppen
- source_article: Artikel-Referenz (z.B. "Artikel 10", "§ 42")
- source_paragraph: Absatz-Referenz (z.B. "Absatz 5")
{APPLICABILITY_PROMPT}
Text: {article_text[:3000]}
Quelle: {source_name}, {article_label}"""
# ── PDF Text Extraction ─────────────────────────────────────────────
def extract_article_text(pdf_file: str, article_label: str, doc_type: str) -> str:
    """Extract the text of a specific article/section from a PDF.

    Args:
        pdf_file: file name under PDF_DIR.
        article_label: e.g. "Artikel 32", "§ 26", or a NIST control label.
        doc_type: "eu_regulation", "de_law", "nist", or anything else
            (generic line search).

    Returns:
        Up to 3000 chars of article text; "" when the PDF is missing or
        PyMuPDF is unavailable; a "[... nicht im PDF gefunden]" marker when
        the article cannot be located (main() treats a leading "[" as a
        warning and skips the test case).
    """
    import re
    path = PDF_DIR / pdf_file
    if not path.exists() or fitz is None:
        return ""
    doc = fitz.open(str(path))
    full_text = ""
    for page in doc:
        full_text += page.get_text() + "\n"
    doc.close()
    # Find article boundaries per document type.
    if doc_type == "eu_regulation":
        # EU regulations head each article with an "Artikel N" line.
        art_num = re.search(r'\d+', article_label)
        if not art_num:
            return ""
        num = int(art_num.group())
        # Find start of this article
        pattern = rf'\nArtikel\s+{num}\s*\n'
        match = re.search(pattern, full_text)
        if not match:
            return f"[Artikel {num} nicht im PDF gefunden]"
        start = match.start()
        # End = start of the next article; fall back to a 5000-char window.
        next_pattern = rf'\nArtikel\s+{num + 1}\s*\n'
        next_match = re.search(next_pattern, full_text)
        end = next_match.start() if next_match else start + 5000
        text = full_text[start:end].strip()
        return text[:3000]
    elif doc_type == "de_law":
        para_num = re.search(r'\d+', article_label)
        if not para_num:
            return ""
        num = int(para_num.group())
        # BUG FIX: the section sign (§) had been lost, leaving rf'\\s+{num}'
        # which matches a literal backslash and can never hit. German laws
        # head each paragraph "§ N" (cf. the BDSG test case "§ 26").
        pattern = rf'§\s+{num}\b'
        match = re.search(pattern, full_text)
        if not match:
            # BUG FIX: restore the leading "[§ " so the bracket is balanced
            # and main()'s startswith("[") check recognizes the marker.
            return f"[§ {num} nicht im PDF gefunden]"
        start = match.start()
        next_pattern = rf'§\s+{num + 1}\b'
        next_match = re.search(next_pattern, full_text)
        end = next_match.start() if next_match else start + 5000
        text = full_text[start:end].strip()
        return text[:3000]
    elif doc_type == "nist":
        # NIST: match the control-family label at the start of a line.
        match = re.search(rf'(?:^|\n)\s*{re.escape(article_label)}\b', full_text)
        if not match:
            return f"[{article_label} nicht im PDF gefunden]"
        start = match.start()
        text = full_text[start:start + 3000].strip()
        return text
    else:
        # Generic fallback: first line containing the label anywhere.
        match = re.search(rf'(?:^|\n).*{re.escape(article_label)}\b', full_text)
        if not match:
            return f"[{article_label} nicht im PDF gefunden]"
        start = match.start()
        text = full_text[start:start + 3000].strip()
        return text
# ── API Calls ────────────────────────────────────────────────────────
def call_litellm(prompt: str, system_prompt: str) -> tuple:
    """Call the LiteLLM OpenAI-compatible chat-completions endpoint.

    Returns:
        (response_text, duration_seconds, error, usage): `error` is None on
        success; `usage` is the provider's token-usage dict ({} on failure).
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {LITELLM_API_KEY}",
    }
    payload = {
        "model": LITELLM_MODEL,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
        "temperature": 0.3,
        "max_tokens": 4096,
        "stream": False,
    }
    t0 = time.time()
    try:
        resp = requests.post(
            f"{LITELLM_URL}/v1/chat/completions",
            headers=headers,
            json=payload,
            timeout=180,
        )
        duration = time.time() - t0
        if resp.status_code != 200:
            # BUG FIX: this path returned a 3-tuple while main() unpacks four
            # values, so any non-200 response raised ValueError in the caller.
            return "", duration, f"HTTP {resp.status_code}: {resp.text[:200]}", {}
        data = resp.json()
        content = data["choices"][0]["message"]["content"]
        usage = data.get("usage", {})
        return content, duration, None, usage
    except Exception as e:
        # Network errors / timeouts / JSON decode problems all end up here.
        return "", time.time() - t0, str(e), {}
def call_anthropic(prompt: str, system_prompt: str) -> tuple:
    """Call the Anthropic Messages API.

    Returns:
        (response_text, duration_seconds, error, usage): `error` is None on
        success; `usage` is the token-usage dict ({} on failure).
    """
    headers = {
        "x-api-key": ANTHROPIC_API_KEY,
        "anthropic-version": "2023-06-01",
        "content-type": "application/json",
    }
    payload = {
        "model": ANTHROPIC_MODEL,
        "max_tokens": 4096,
        # Anthropic takes the system prompt as a top-level field, not a message.
        "system": system_prompt,
        "messages": [{"role": "user", "content": prompt}],
    }
    t0 = time.time()
    try:
        resp = requests.post(ANTHROPIC_URL, headers=headers, json=payload, timeout=180)
        duration = time.time() - t0
        if resp.status_code != 200:
            return "", duration, f"HTTP {resp.status_code}: {resp.text[:200]}", {}
        data = resp.json()
        # The Messages API returns a list of content blocks; take the first text.
        content = data["content"][0]["text"] if data.get("content") else ""
        usage = data.get("usage", {})
        return content, duration, None, usage
    except Exception as e:
        return "", time.time() - t0, str(e), {}
# ── Quality Assessment ───────────────────────────────────────────────
# Fields a generated control MUST contain (40% of the quality score).
REQUIRED_FIELDS = [
    "title", "objective", "rationale", "requirements",
    "test_procedure", "evidence", "severity", "domain",
]
# Nice-to-have fields (15% of the quality score).
BONUS_FIELDS = [
    "tags", "category", "target_audience", "source_article",
    "applicable_industries", "applicable_company_size",
]


def assess_quality(raw_text: str) -> dict:
    """Assess the quality of a control-generation response.

    Parses `raw_text` as JSON (tolerating markdown ``` fences and
    surrounding chatter) and scores it 0-100: 20 for valid JSON, up to 40
    for required-field coverage, up to 15 for bonus fields, up to 15 for
    5+ requirements, up to 9 for 3+ test procedures, and 1 for an objective
    longer than 50 chars. On success the parsed dict is attached under
    "parsed_data"; on parse failure the zeroed metrics dict is returned.
    """
    result = {
        "json_valid": False,
        "required_fields": 0,
        "required_total": len(REQUIRED_FIELDS),
        "bonus_fields": 0,
        "bonus_total": len(BONUS_FIELDS),
        "requirements_count": 0,
        "test_procedure_count": 0,
        "evidence_count": 0,
        "title_length": 0,
        "objective_length": 0,
        "score": 0,
    }
    # Strip a markdown code fence (```json ... ```) if present.
    text = raw_text.strip()
    if text.startswith("```"):
        lines = text.split("\n")
        text = "\n".join(lines[1:-1] if lines[-1].startswith("```") else lines[1:])
    try:
        data = json.loads(text)
        if isinstance(data, list):
            # Multi-control answers arrive as an array; score the first one.
            data = data[0] if data else {}
    except json.JSONDecodeError:
        # Fallback: grab the outermost {...} span from surrounding chatter.
        import re
        match = re.search(r'\{[\s\S]*\}', text)
        if match:
            try:
                data = json.loads(match.group())
            except json.JSONDecodeError:
                return result
        else:
            return result
    if not isinstance(data, dict):
        # ROBUSTNESS FIX: a bare scalar/string is valid JSON but previously
        # crashed the field checks below with AttributeError; treat it as
        # an invalid control instead.
        return result
    result["json_valid"] = True
    # Required fields must be non-trivial: string > 2 chars or non-empty list.
    for f in REQUIRED_FIELDS:
        val = data.get(f)
        if val and (isinstance(val, str) and len(val) > 2 or isinstance(val, list) and len(val) > 0):
            result["required_fields"] += 1
    # Bonus fields only need to be present and non-empty.
    for f in BONUS_FIELDS:
        val = data.get(f)
        if val and (isinstance(val, str) and len(val) > 0 or isinstance(val, list) and len(val) > 0):
            result["bonus_fields"] += 1
    # Depth metrics
    reqs = data.get("requirements", [])
    result["requirements_count"] = len(reqs) if isinstance(reqs, list) else 0
    tp = data.get("test_procedure", [])
    result["test_procedure_count"] = len(tp) if isinstance(tp, list) else 0
    ev = data.get("evidence", [])
    result["evidence_count"] = len(ev) if isinstance(ev, list) else 0
    result["title_length"] = len(data.get("title", ""))
    result["objective_length"] = len(data.get("objective", ""))
    # Score: 0-100
    score = 0
    score += 20 if result["json_valid"] else 0
    score += (result["required_fields"] / result["required_total"]) * 40
    score += (result["bonus_fields"] / result["bonus_total"]) * 15
    score += min(result["requirements_count"], 5) * 3  # max 15 for 5+ requirements
    score += min(result["test_procedure_count"], 3) * 3  # max 9 for 3+ tests
    score += 1 if result["objective_length"] > 50 else 0
    result["score"] = round(score, 1)
    result["parsed_data"] = data
    return result
# ── Test Cases ───────────────────────────────────────────────────────
# Five representative gap articles from different sources. Per entry:
#   source      — human-readable source name (goes into the prompt)
#   article     — article/paragraph label passed to extract_article_text()
#   pdf         — file name under PDF_DIR
#   doc_type    — selects the extraction strategy ("eu_regulation"/"de_law")
#   license     — license tag forwarded to build_prompt()
#   description — shown in the per-test banner
TEST_CASES = [
    {
        "source": "DSGVO (EU) 2016/679",
        "article": "Artikel 32",
        "pdf": "dsgvo_2016_679.pdf",
        "doc_type": "eu_regulation",
        "license": "EU_LAW",
        "description": "Sicherheit der Verarbeitung — Kernthema Datenschutz",
    },
    {
        "source": "KI-Verordnung (EU) 2024/1689",
        "article": "Artikel 9",
        "pdf": "ai_act_2024_1689.pdf",
        "doc_type": "eu_regulation",
        "license": "EU_LAW",
        "description": "Risikomanagement für Hochrisiko-KI",
    },
    {
        "source": "NIS2-Richtlinie (EU) 2022/2555",
        "article": "Artikel 21",
        "pdf": "nis2_2022_2555.pdf",
        "doc_type": "eu_regulation",
        "license": "EU_LAW",
        "description": "Cybersicherheitsrisikomanagement — NIS2 Kernpflicht",
    },
    {
        "source": "Cyber Resilience Act (CRA)",
        "article": "Artikel 13",
        "pdf": "cra_2024_2847.pdf",
        "doc_type": "eu_regulation",
        "license": "EU_LAW",
        "description": "Pflichten der Hersteller",
    },
    {
        "source": "Bundesdatenschutzgesetz (BDSG)",
        "article": "§ 26",
        "pdf": "bdsg.pdf",
        "doc_type": "de_law",
        "license": "DE_LAW",
        "description": "Datenverarbeitung im Beschäftigungskontext",
    },
]
# ── Main ─────────────────────────────────────────────────────────────
def main():
    """Run all TEST_CASES against both models, print a per-test comparison
    plus a summary table, and dump raw results to /tmp as JSON.

    Exits(1) when ANTHROPIC_API_KEY is unset or LiteLLM is unreachable.
    """
    if not ANTHROPIC_API_KEY:
        print("ERROR: Set ANTHROPIC_API_KEY environment variable")
        sys.exit(1)
    print("=" * 80)
    print("LLM BENCHMARK: gpt-oss-120b vs Claude Sonnet 4.6")
    print("=" * 80)
    print(f" LiteLLM: {LITELLM_URL} / {LITELLM_MODEL}")
    print(f" Anthropic: {ANTHROPIC_MODEL}")
    print(f" Tests: {len(TEST_CASES)}")
    print()
    # Pre-check LiteLLM reachability before spending time on PDF extraction.
    try:
        r = requests.get(f"{LITELLM_URL}/v1/models",
                         headers={"Authorization": f"Bearer {LITELLM_API_KEY}"}, timeout=10)
        print(f" LiteLLM OK: {r.status_code}")
    except Exception as e:
        print(f" LiteLLM ERROR: {e}")
        sys.exit(1)
    results = []
    for i, tc in enumerate(TEST_CASES):
        print(f"\n{'='*80}")
        print(f"TEST {i+1}/{len(TEST_CASES)}: {tc['source']}{tc['article']}")
        print(f" {tc['description']}")
        print(f"{'='*80}")
        # Extract article text from PDF; a leading "[" marks a not-found note.
        article_text = extract_article_text(tc["pdf"], tc["article"], tc["doc_type"])
        if not article_text or article_text.startswith("["):
            print(f" WARNING: {article_text or 'Empty text'}")
            continue
        print(f" Text extracted: {len(article_text)} chars")
        print(f" First 120 chars: {article_text[:120].replace(chr(10), ' ')}...")
        prompt = build_prompt(tc["source"], tc["article"], article_text, tc["license"])
        # ── Call LiteLLM ──
        print(f"\n --- gpt-oss-120b ---")
        litellm_raw, litellm_time, litellm_err, litellm_usage = call_litellm(prompt, SYSTEM_PROMPT)
        if litellm_err:
            # Minimal quality dict so the comparison below still works.
            print(f" ERROR: {litellm_err}")
            litellm_quality = {"json_valid": False, "score": 0}
        else:
            print(f" Time: {litellm_time:.1f}s")
            print(f" Tokens: {litellm_usage}")
            litellm_quality = assess_quality(litellm_raw)
            print(f" JSON valid: {litellm_quality['json_valid']}")
            print(f" Score: {litellm_quality['score']}/100")
            print(f" Required fields: {litellm_quality['required_fields']}/{litellm_quality['required_total']}")
            print(f" Requirements: {litellm_quality['requirements_count']}, "
                  f"Tests: {litellm_quality['test_procedure_count']}, "
                  f"Evidence: {litellm_quality['evidence_count']}")
            if litellm_quality.get("parsed_data"):
                d = litellm_quality["parsed_data"]
                print(f" Title: {d.get('title', 'N/A')}")
        # ── Call Anthropic ──
        print(f"\n --- Claude Sonnet 4.6 ---")
        anthropic_raw, anthropic_time, anthropic_err, anthropic_usage = call_anthropic(prompt, SYSTEM_PROMPT)
        if anthropic_err:
            print(f" ERROR: {anthropic_err}")
            anthropic_quality = {"json_valid": False, "score": 0}
        else:
            print(f" Time: {anthropic_time:.1f}s")
            print(f" Tokens: {anthropic_usage}")
            anthropic_quality = assess_quality(anthropic_raw)
            print(f" JSON valid: {anthropic_quality['json_valid']}")
            print(f" Score: {anthropic_quality['score']}/100")
            print(f" Required fields: {anthropic_quality['required_fields']}/{anthropic_quality['required_total']}")
            print(f" Requirements: {anthropic_quality['requirements_count']}, "
                  f"Tests: {anthropic_quality['test_procedure_count']}, "
                  f"Evidence: {anthropic_quality['evidence_count']}")
            if anthropic_quality.get("parsed_data"):
                d = anthropic_quality["parsed_data"]
                print(f" Title: {d.get('title', 'N/A')}")
        # Compare
        print(f"\n --- VERGLEICH ---")
        # Guard against division by zero when the Anthropic call failed fast.
        speed_ratio = litellm_time / anthropic_time if anthropic_time > 0 else 0
        print(f" Speed: 120b {litellm_time:.1f}s vs Sonnet {anthropic_time:.1f}s "
              f"({'120b ' + str(round(speed_ratio, 1)) + 'x langsamer' if speed_ratio > 1 else '120b schneller'})")
        print(f" Score: 120b {litellm_quality.get('score', 0)}/100 vs "
              f"Sonnet {anthropic_quality.get('score', 0)}/100")
        results.append({
            "test": f"{tc['source']}{tc['article']}",
            "litellm": {
                "time": round(litellm_time, 1),
                "score": litellm_quality.get("score", 0),
                "json_valid": litellm_quality.get("json_valid", False),
                "requirements": litellm_quality.get("requirements_count", 0),
                "tests": litellm_quality.get("test_procedure_count", 0),
                "usage": litellm_usage,
                "raw": litellm_raw[:500] if litellm_raw else "",
            },
            "anthropic": {
                "time": round(anthropic_time, 1),
                "score": anthropic_quality.get("score", 0),
                "json_valid": anthropic_quality.get("json_valid", False),
                "requirements": anthropic_quality.get("requirements_count", 0),
                "tests": anthropic_quality.get("test_procedure_count", 0),
                "usage": anthropic_usage,
                "raw": anthropic_raw[:500] if anthropic_raw else "",
            },
        })
    # ── Summary ──────────────────────────────────────────────────────
    print(f"\n\n{'='*80}")
    print("ZUSAMMENFASSUNG")
    print(f"{'='*80}")
    if not results:
        print(" Keine Ergebnisse.")
        return
    litellm_scores = [r["litellm"]["score"] for r in results]
    anthropic_scores = [r["anthropic"]["score"] for r in results]
    litellm_times = [r["litellm"]["time"] for r in results]
    anthropic_times = [r["anthropic"]["time"] for r in results]
    print(f"\n {'Metrik':<30s} {'gpt-oss-120b':>15s} {'Claude Sonnet':>15s}")
    print(f" {'-'*30} {'-'*15} {'-'*15}")
    print(f" {'Avg Score (0-100)':<30s} {sum(litellm_scores)/len(litellm_scores):>13.1f} "
          f"{sum(anthropic_scores)/len(anthropic_scores):>13.1f}")
    print(f" {'Avg Time (s)':<30s} {sum(litellm_times)/len(litellm_times):>13.1f} "
          f"{sum(anthropic_times)/len(anthropic_times):>13.1f}")
    print(f" {'JSON Valid':<30s} {sum(1 for r in results if r['litellm']['json_valid']):>12d}/{len(results)} "
          f"{sum(1 for r in results if r['anthropic']['json_valid']):>12d}/{len(results)}")
    print(f" {'Avg Requirements':<30s} "
          f"{sum(r['litellm']['requirements'] for r in results)/len(results):>13.1f} "
          f"{sum(r['anthropic']['requirements'] for r in results)/len(results):>13.1f}")
    print(f" {'Avg Test Procedures':<30s} "
          f"{sum(r['litellm']['tests'] for r in results)/len(results):>13.1f} "
          f"{sum(r['anthropic']['tests'] for r in results)/len(results):>13.1f}")
    # Cost estimate
    # Claude Sonnet: ~$3/M input, ~$15/M output
    # gpt-oss-120b: self-hosted = $0 API cost (only compute)
    total_anthropic_input = sum(r["anthropic"]["usage"].get("input_tokens", 0) for r in results)
    total_anthropic_output = sum(r["anthropic"]["usage"].get("output_tokens", 0) for r in results)
    anthropic_cost = (total_anthropic_input * 3 + total_anthropic_output * 15) / 1_000_000
    print(f"\n Kostenvergleich (fuer {len(results)} Controls):")
    print(f" gpt-oss-120b: $0.00 (self-hosted)")
    print(f" Claude Sonnet: ${anthropic_cost:.4f} "
          f"({total_anthropic_input} input + {total_anthropic_output} output tokens)")
    # Extrapolate for 494 gap articles
    if results:
        cost_per_control = anthropic_cost / len(results)
        print(f"\n Hochrechnung fuer 494 Luecken-Artikel:")
        print(f" gpt-oss-120b: $0.00")
        print(f" Claude Sonnet: ${cost_per_control * 494:.2f}")
        avg_time_120b = sum(litellm_times) / len(litellm_times)
        avg_time_sonnet = sum(anthropic_times) / len(anthropic_times)
        print(f" Zeit 120b: {avg_time_120b * 494 / 60:.0f} min ({avg_time_120b * 494 / 3600:.1f}h)")
        print(f" Zeit Sonnet: {avg_time_sonnet * 494 / 60:.0f} min ({avg_time_sonnet * 494 / 3600:.1f}h)")
    # Save full results
    out_path = "/tmp/benchmark_llm_results.json"
    with open(out_path, 'w') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    print(f"\n Detaillierte Ergebnisse: {out_path}")


if __name__ == "__main__":
    main()