// breakpilot-lehrer/website/components/admin/system-info-configs/gpu-config.ts

import type { SystemInfoConfig } from './types'

/**
 * Long-form HTML documentation for the GPU infrastructure module.
 *
 * Kept as a separate constant so the structured part of the config below
 * stays readable. The markup is a plain string: it contains no template
 * substitutions, and the leading and trailing newline are part of the value.
 */
const gpuFullDocumentation = `
<h2>GPU Infrastructure Management</h2>

<h3>1. Uebersicht</h3>
<p>Das GPU-Infrastruktur-Modul ermoeglicht die Verwaltung von Cloud-GPU-Ressourcen ueber vast.ai fuer KI-Workloads wie Model Training, Fine-Tuning und Inference.</p>

<h3>2. Architektur</h3>
<pre>
┌─────────────────────────────────────────────────────────────┐
│                      Admin Dashboard                         │
│  ┌─────────────┐  ┌─────────────┐  ┌─────────────────────┐ │
│  │ Start/Stop  │  │   Status    │  │   Cost Tracking     │ │
│  └──────┬──────┘  └──────┬──────┘  └──────────┬──────────┘ │
└─────────┼───────────────┼─────────────────────┼─────────────┘
          │               │                     │
          v               v                     v
┌─────────────────────────────────────────────────────────────┐
│                      Backend API                             │
│  ┌─────────────────────────────────────────────────────┐   │
│  │  /infra/vast/*  -  vast.ai Proxy mit Auth           │   │
│  └─────────────────────────────────────────────────────┘   │
└──────────────────────────┬──────────────────────────────────┘
                           │
                           v
┌─────────────────────────────────────────────────────────────┐
│                      vast.ai Cloud                           │
│  ┌──────────┐  ┌──────────┐  ┌──────────┐  ┌──────────┐    │
│  │ RTX 4090 │  │   A100   │  │   H100   │  │ RTX 3090 │    │
│  └──────────┘  └──────────┘  └──────────┘  └──────────┘    │
└─────────────────────────────────────────────────────────────┘
</pre>

<h3>3. API Endpoints</h3>
<table>
  <tr><th>Endpoint</th><th>Methode</th><th>Beschreibung</th></tr>
  <tr><td>/infra/vast/status</td><td>GET</td><td>Aktueller Status der GPU-Instanz</td></tr>
  <tr><td>/infra/vast/power/on</td><td>POST</td><td>GPU-Instanz starten</td></tr>
  <tr><td>/infra/vast/power/off</td><td>POST</td><td>GPU-Instanz stoppen</td></tr>
  <tr><td>/infra/vast/audit</td><td>GET</td><td>Audit-Log der letzten Aktionen</td></tr>
</table>

<h3>4. Sicherheitskonzept</h3>
<ul>
  <li><strong>Authentifizierung:</strong> API-Key basiert, keine Benutzer-Credentials auf GPU</li>
  <li><strong>SSH-Zugriff:</strong> Key-basiert, automatische Rotation alle 30 Tage</li>
  <li><strong>Netzwerk:</strong> Isolierte VPC, nur definierte Ports offen</li>
  <li><strong>Datenpersistenz:</strong> Ephemeral Storage, keine persistenten Volumes</li>
</ul>

<h3>5. Kostenmanagement</h3>
<table>
  <tr><th>GPU</th><th>Preis/Stunde</th><th>VRAM</th><th>Use Case</th></tr>
  <tr><td>RTX 4090</td><td>~$0.40</td><td>24 GB</td><td>Training, Inference</td></tr>
  <tr><td>A100</td><td>~$1.50</td><td>80 GB</td><td>Large Model Training</td></tr>
  <tr><td>H100</td><td>~$3.00</td><td>80 GB</td><td>Enterprise Training</td></tr>
</table>

<h3>6. Auto-Shutdown Policy</h3>
<p>GPU-Instanzen werden automatisch nach 30 Minuten Inaktivitaet gestoppt, um Kosten zu sparen. Bei jedem LLM-Request wird die Aktivitaet aufgezeichnet.</p>

<h3>7. Audit-relevante Informationen</h3>
<ul>
  <li>Alle Start/Stop-Aktionen werden mit Timestamp und User-ID geloggt</li>
  <li>Kostenabrechnung erfolgt minutengenau</li>
  <li>SSH-Key Rotation wird dokumentiert</li>
  <li>Fehlgeschlagene Operationen werden mit Error-Details gespeichert</li>
</ul>

<h3>8. Notfallprozeduren</h3>
<ol>
  <li><strong>Bei Kostenüberschreitung:</strong> Automatischer Stop aller Instanzen</li>
  <li><strong>Bei API-Ausfall:</strong> Fallback auf manuelles vast.ai Dashboard</li>
  <li><strong>Bei Sicherheitsvorfall:</strong> Sofortiges Destroy aller Instanzen</li>
</ol>
`

/**
 * Static system-info content for the GPU infrastructure (vast.ai) module.
 *
 * Bundles, in this order: title/description/version, privacy notes, the
 * layered architecture overview, the feature list, the roadmap, technical
 * details, audit information and the long-form HTML documentation.
 *
 * All user-facing strings are German and are data, not code.
 * NOTE(review): the consumer of this object is not visible from this file;
 * presumably an admin system-info view renders it. Confirm before changing
 * key names or key order.
 */
export const gpuConfig: SystemInfoConfig = {
  title: 'GPU Infrastruktur System-Info',
  description: 'vast.ai GPU-Management fuer KI-Workloads und Training.',
  version: '1.5',

  // Privacy statements shown alongside the module description.
  privacyNotes: [
    'GPU-Workloads werden isoliert in Docker-Containern ausgefuehrt',
    'Keine persistente Speicherung von Trainingsdaten auf GPU-Instanzen',
    'SSH-Keys werden regelmaessig rotiert',
    'Audit-Log fuer alle GPU-Operationen',
  ],

  // Architecture layers, listed from the admin UI down to the workloads.
  architecture: {
    layers: [
      {
        title: 'Admin UI',
        components: ['GPU Dashboard', 'Instance Manager', 'Cost Calculator'],
        color: '#3b82f6',
      },
      {
        title: 'vast.ai API',
        components: ['Instance API', 'Billing API', 'SSH Management'],
        color: '#8b5cf6',
      },
      {
        title: 'GPU Instances',
        components: ['RTX 4090', 'A100', 'H100'],
        color: '#10b981',
      },
      {
        title: 'Workloads',
        components: ['Model Training', 'Inference', 'Fine-Tuning'],
        color: '#f59e0b',
      },
    ],
  },

  // Feature list; entries are either 'active' or 'planned'.
  features: [
    {
      name: 'Instance Management',
      status: 'active',
      description: 'Start/Stop/Destroy von GPU-Instanzen',
    },
    {
      name: 'SSH Key Management',
      status: 'active',
      description: 'Automatische SSH-Key Rotation',
    },
    {
      name: 'Cost Tracking',
      status: 'active',
      description: 'Echtzeit-Kostenüberwachung',
    },
    {
      name: 'Auto-Scaling',
      status: 'planned',
      description: 'Automatische Skalierung bei Last',
    },
    {
      name: 'Spot Instance Management',
      status: 'planned',
      description: 'Kostenoptimierung durch Spot-Instances',
    },
  ],

  // Roadmap phases with their priority and planned work items.
  roadmap: [
    {
      phase: 'Phase 1: Automation (Q1)',
      priority: 'high',
      items: [
        'Auto-Start bei Bedarf',
        'Auto-Stop bei Inaktivitaet',
        'Scheduled Instances',
        'Budget Alerts',
      ],
    },
    {
      phase: 'Phase 2: Multi-Cloud (Q2)',
      priority: 'medium',
      items: [
        'Lambda Labs Integration',
        'RunPod Integration',
        'Cloud-uebergreifende Orchestrierung',
        'Preisvergleich',
      ],
    },
    {
      phase: 'Phase 3: ML Ops (Q3)',
      priority: 'medium',
      items: [
        'Model Registry',
        'Experiment Tracking',
        'A/B Testing',
        'Model Versioning',
      ],
    },
  ],

  // Technology stack, one entry per component.
  technicalDetails: [
    {
      component: 'API',
      technology: 'vast.ai REST API',
      version: 'v1',
      description: 'GPU-Marktplatz',
    },
    {
      component: 'SSH',
      technology: 'OpenSSH',
      version: '9.x',
      description: 'Sichere Verbindung',
    },
    {
      component: 'Container',
      technology: 'Docker',
      version: '24.x',
      description: 'Workload-Isolation',
    },
    {
      component: 'ML Framework',
      technology: 'PyTorch',
      version: '2.x',
      description: 'Model Training',
    },
  ],

  // Audit checklist grouped by category; 'warning' marks items that are only planned.
  auditInfo: [
    {
      category: 'Sicherheit & Zugriffskontrolle',
      items: [
        {
          label: 'Authentifizierung',
          value: 'API-Key + SSH-Key',
          status: 'ok',
        },
        {
          label: 'Verschluesselung',
          value: 'TLS 1.3',
          status: 'ok',
        },
        {
          label: 'Key Rotation',
          value: 'Alle 30 Tage',
          status: 'ok',
        },
        {
          label: 'Audit-Log',
          value: 'Aktiviert',
          status: 'ok',
        },
      ],
    },
    {
      category: 'Kosten & Ressourcen',
      items: [
        {
          label: 'Budget-Limit',
          value: 'Konfigurierbar',
          status: 'ok',
        },
        {
          label: 'Auto-Shutdown',
          value: '30 Min Inaktivitaet',
          status: 'ok',
        },
        {
          label: 'Spot-Instances',
          value: 'Geplant',
          status: 'warning',
        },
        {
          label: 'Multi-Cloud',
          value: 'Geplant',
          status: 'warning',
        },
      ],
    },
    {
      category: 'Compliance',
      items: [
        {
          label: 'Daten-Residenz',
          value: 'EU/US waehlbar',
          status: 'ok',
        },
        {
          label: 'DSGVO-Konformitaet',
          value: 'Ja',
          status: 'ok',
        },
        {
          label: 'SOC 2 Type II',
          value: 'vast.ai zertifiziert',
          status: 'ok',
        },
      ],
    },
  ],

  fullDocumentation: gpuFullDocumentation,
}