""" GPU Infrastructure Test API - Test Runner fuer CUDA/ROCm GPU Management Endpoint: /api/admin/gpu-tests """ from fastapi import APIRouter from pydantic import BaseModel from typing import List, Optional, Literal import httpx import asyncio import time import os import subprocess router = APIRouter(prefix="/api/admin/gpu-tests", tags=["GPU Tests"]) # ============================================== # Models # ============================================== class TestResult(BaseModel): name: str description: str expected: str actual: str status: Literal["passed", "failed", "pending", "skipped"] duration_ms: float error_message: Optional[str] = None class TestCategoryResult(BaseModel): category: str display_name: str description: str tests: List[TestResult] passed: int failed: int total: int class FullTestResults(BaseModel): categories: List[TestCategoryResult] total_passed: int total_failed: int total_tests: int duration_ms: float # ============================================== # Configuration # ============================================== BACKEND_URL = os.getenv("BACKEND_URL", "http://localhost:8000") VAST_API_KEY = os.getenv("VAST_API_KEY", "") # ============================================== # Test Implementations # ============================================== async def test_nvidia_smi() -> TestResult: """Test NVIDIA GPU Detection via nvidia-smi""" start = time.time() try: result = subprocess.run( ["nvidia-smi", "--query-gpu=name,memory.total,driver_version", "--format=csv,noheader"], capture_output=True, text=True, timeout=10 ) duration = (time.time() - start) * 1000 if result.returncode == 0 and result.stdout.strip(): gpu_info = result.stdout.strip().split('\n')[0] return TestResult( name="NVIDIA GPU Erkennung", description="Prueft ob NVIDIA GPUs via nvidia-smi erkannt werden", expected="GPU-Informationen verfuegbar", actual=f"GPU: {gpu_info}", status="passed", duration_ms=duration ) else: return TestResult( name="NVIDIA GPU Erkennung", description="Prueft ob NVIDIA GPUs via nvidia-smi erkannt werden", expected="GPU-Informationen verfuegbar", actual="Keine NVIDIA GPU gefunden", status="skipped", duration_ms=duration, error_message="nvidia-smi nicht verfuegbar oder keine GPU" ) except FileNotFoundError: return TestResult( name="NVIDIA GPU Erkennung", description="Prueft ob NVIDIA GPUs via nvidia-smi erkannt werden", expected="GPU-Informationen verfuegbar", actual="nvidia-smi nicht installiert", status="skipped", duration_ms=(time.time() - start) * 1000, error_message="nvidia-smi Binary nicht gefunden" ) except Exception as e: return TestResult( name="NVIDIA GPU Erkennung", description="Prueft ob NVIDIA GPUs via nvidia-smi erkannt werden", expected="GPU-Informationen verfuegbar", actual=f"Fehler: {str(e)}", status="failed", duration_ms=(time.time() - start) * 1000, error_message=str(e) ) async def test_rocm_smi() -> TestResult: """Test AMD GPU Detection via rocm-smi""" start = time.time() try: result = subprocess.run( ["rocm-smi", "--showproductname"], capture_output=True, text=True, timeout=10 ) duration = (time.time() - start) * 1000 if result.returncode == 0 and result.stdout.strip(): return TestResult( name="AMD ROCm GPU Erkennung", description="Prueft ob AMD GPUs via rocm-smi erkannt werden", expected="GPU-Informationen verfuegbar", actual=f"ROCm GPU erkannt", status="passed", duration_ms=duration ) else: return TestResult( name="AMD ROCm GPU Erkennung", description="Prueft ob AMD GPUs via rocm-smi erkannt werden", expected="GPU-Informationen verfuegbar", actual="Keine AMD GPU gefunden", status="skipped", duration_ms=duration, error_message="rocm-smi nicht verfuegbar oder keine GPU" ) except FileNotFoundError: return TestResult( name="AMD ROCm GPU Erkennung", description="Prueft ob AMD GPUs via rocm-smi erkannt werden", expected="GPU-Informationen verfuegbar", actual="rocm-smi nicht installiert", status="skipped", duration_ms=(time.time() - start) * 1000, error_message="rocm-smi Binary nicht gefunden" ) except Exception as e: return TestResult( name="AMD ROCm GPU Erkennung", description="Prueft ob AMD GPUs via rocm-smi erkannt werden", expected="GPU-Informationen verfuegbar", actual=f"Fehler: {str(e)}", status="failed", duration_ms=(time.time() - start) * 1000, error_message=str(e) ) async def test_cuda_available() -> TestResult: """Test CUDA Availability via Python""" start = time.time() try: # Try to import torch and check CUDA result = subprocess.run( ["python", "-c", "import torch; print(f'CUDA: {torch.cuda.is_available()}, Devices: {torch.cuda.device_count()}')"], capture_output=True, text=True, timeout=30 ) duration = (time.time() - start) * 1000 if result.returncode == 0: output = result.stdout.strip() if "True" in output: return TestResult( name="PyTorch CUDA Support", description="Prueft ob PyTorch CUDA-Unterstuetzung hat", expected="CUDA verfuegbar", actual=output, status="passed", duration_ms=duration ) else: return TestResult( name="PyTorch CUDA Support", description="Prueft ob PyTorch CUDA-Unterstuetzung hat", expected="CUDA verfuegbar", actual=output, status="skipped", duration_ms=duration, error_message="CUDA nicht verfuegbar in PyTorch" ) else: return TestResult( name="PyTorch CUDA Support", description="Prueft ob PyTorch CUDA-Unterstuetzung hat", expected="CUDA verfuegbar", actual="PyTorch nicht installiert", status="skipped", duration_ms=duration, error_message=result.stderr[:200] if result.stderr else "PyTorch fehlt" ) except Exception as e: return TestResult( name="PyTorch CUDA Support", description="Prueft ob PyTorch CUDA-Unterstuetzung hat", expected="CUDA verfuegbar", actual=f"Fehler: {str(e)}", status="skipped", duration_ms=(time.time() - start) * 1000, error_message=str(e) ) async def test_vast_ai_api() -> TestResult: """Test vast.ai API Connection""" start = time.time() if not VAST_API_KEY: return TestResult( name="vast.ai API Verbindung", description="Prueft ob die vast.ai Cloud-GPU API konfiguriert ist", expected="API Key konfiguriert", actual="VAST_API_KEY nicht gesetzt", status="skipped", duration_ms=(time.time() - start) * 1000, error_message="Umgebungsvariable VAST_API_KEY fehlt" ) try: async with httpx.AsyncClient(timeout=10.0) as client: response = await client.get( "https://console.vast.ai/api/v0/users/current", headers={"Authorization": f"Bearer {VAST_API_KEY}"} ) duration = (time.time() - start) * 1000 if response.status_code == 200: data = response.json() balance = data.get("credit", 0) return TestResult( name="vast.ai API Verbindung", description="Prueft ob die vast.ai Cloud-GPU API konfiguriert ist", expected="API erreichbar mit Guthaben", actual=f"Verbunden, Guthaben: ${balance:.2f}", status="passed", duration_ms=duration ) else: return TestResult( name="vast.ai API Verbindung", description="Prueft ob die vast.ai Cloud-GPU API konfiguriert ist", expected="API erreichbar mit Guthaben", actual=f"HTTP {response.status_code}", status="failed", duration_ms=duration, error_message="API-Authentifizierung fehlgeschlagen" ) except Exception as e: return TestResult( name="vast.ai API Verbindung", description="Prueft ob die vast.ai Cloud-GPU API konfiguriert ist", expected="API erreichbar mit Guthaben", actual=f"Fehler: {str(e)}", status="failed", duration_ms=(time.time() - start) * 1000, error_message=str(e) ) async def test_gpu_api_endpoint() -> TestResult: """Test GPU Admin API Endpoint""" start = time.time() try: async with httpx.AsyncClient(timeout=10.0) as client: response = await client.get(f"{BACKEND_URL}/api/gpu/status") duration = (time.time() - start) * 1000 if response.status_code == 200: return TestResult( name="GPU Admin API", description="Prueft ob die GPU-Verwaltungs-API verfuegbar ist", expected="HTTP 200 mit GPU-Status", actual="API verfuegbar", status="passed", duration_ms=duration ) elif response.status_code == 404: return TestResult( name="GPU Admin API", description="Prueft ob die GPU-Verwaltungs-API verfuegbar ist", expected="HTTP 200 mit GPU-Status", actual="Endpoint nicht implementiert", status="skipped", duration_ms=duration, error_message="GPU API nicht aktiviert" ) else: return TestResult( name="GPU Admin API", description="Prueft ob die GPU-Verwaltungs-API verfuegbar ist", expected="HTTP 200 mit GPU-Status", actual=f"HTTP {response.status_code}", status="failed", duration_ms=duration, error_message=f"Unerwarteter Status: {response.status_code}" ) except Exception as e: return TestResult( name="GPU Admin API", description="Prueft ob die GPU-Verwaltungs-API verfuegbar ist", expected="HTTP 200 mit GPU-Status", actual=f"Fehler: {str(e)}", status="failed", duration_ms=(time.time() - start) * 1000, error_message=str(e) ) # ============================================== # Category Runners # ============================================== async def run_detection_tests() -> TestCategoryResult: """Run GPU detection tests""" tests = await asyncio.gather( test_nvidia_smi(), test_rocm_smi(), test_cuda_available(), ) passed = sum(1 for t in tests if t.status == "passed") failed = sum(1 for t in tests if t.status == "failed") return TestCategoryResult( category="detection", display_name="GPU Erkennung", description="Tests zur Hardware-Erkennung", tests=list(tests), passed=passed, failed=failed, total=len(tests) ) async def run_cloud_tests() -> TestCategoryResult: """Run cloud GPU tests""" tests = await asyncio.gather( test_vast_ai_api(), ) passed = sum(1 for t in tests if t.status == "passed") failed = sum(1 for t in tests if t.status == "failed") return TestCategoryResult( category="cloud", display_name="Cloud GPU (vast.ai)", description="Tests fuer Cloud-GPU-Dienste", tests=list(tests), passed=passed, failed=failed, total=len(tests) ) async def run_api_tests() -> TestCategoryResult: """Run GPU API tests""" tests = await asyncio.gather( test_gpu_api_endpoint(), ) passed = sum(1 for t in tests if t.status == "passed") failed = sum(1 for t in tests if t.status == "failed") return TestCategoryResult( category="api-health", display_name="GPU Admin API", description="Tests fuer die GPU-Verwaltungs-Endpunkte", tests=list(tests), passed=passed, failed=failed, total=len(tests) ) # ============================================== # API Endpoints # ============================================== @router.post("/{category}", response_model=TestCategoryResult) async def run_category_tests(category: str): """Run tests for a specific category""" runners = { "api-health": run_api_tests, "detection": run_detection_tests, "cloud": run_cloud_tests, } if category not in runners: return TestCategoryResult( category=category, display_name=f"Unbekannt: {category}", description="Kategorie nicht gefunden", tests=[], passed=0, failed=0, total=0 ) return await runners[category]() @router.post("/run-all", response_model=FullTestResults) async def run_all_tests(): """Run all GPU tests""" start = time.time() categories = await asyncio.gather( run_api_tests(), run_detection_tests(), run_cloud_tests(), ) total_passed = sum(c.passed for c in categories) total_failed = sum(c.failed for c in categories) total_tests = sum(c.total for c in categories) return FullTestResults( categories=list(categories), total_passed=total_passed, total_failed=total_failed, total_tests=total_tests, duration_ms=(time.time() - start) * 1000 ) @router.get("/categories") async def get_categories(): """Get available test categories""" return { "categories": [ {"id": "api-health", "name": "GPU Admin API", "description": "Backend API Tests"}, {"id": "detection", "name": "GPU Erkennung", "description": "Hardware Detection"}, {"id": "cloud", "name": "Cloud GPU", "description": "vast.ai Integration"}, ] }