A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.
This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).
Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
456 lines
15 KiB
Python
"""
|
|
GPU Infrastructure Test API - Test Runner fuer CUDA/ROCm GPU Management
|
|
Endpoint: /api/admin/gpu-tests
|
|
"""
|
|
|
|
import asyncio
import os
import subprocess
import sys
import time
from typing import List, Literal, Optional

import httpx
from fastapi import APIRouter
from pydantic import BaseModel
|
|
|
|
# Router for all GPU infrastructure test endpoints, mounted at
# /api/admin/gpu-tests and grouped under the "GPU Tests" tag in the docs.
router = APIRouter(prefix="/api/admin/gpu-tests", tags=["GPU Tests"])


# ==============================================
# Models
# ==============================================
|
|
|
|
class TestResult(BaseModel):
    """Outcome of a single GPU infrastructure test."""

    name: str  # human-readable test name (German UI label)
    description: str  # what the test verifies, for display
    expected: str  # expected observation, for display
    actual: str  # observed result, for display
    status: Literal["passed", "failed", "pending", "skipped"]
    duration_ms: float  # wall-clock runtime of the test in milliseconds
    error_message: Optional[str] = None  # detail when skipped or failed
|
|
|
|
|
|
class TestCategoryResult(BaseModel):
    """Aggregated results for one test category."""

    category: str  # machine-readable id, e.g. "detection"
    display_name: str  # label shown in the admin UI
    description: str  # short summary of the category
    tests: List[TestResult]  # individual test outcomes
    passed: int  # count of tests with status "passed"
    failed: int  # count of tests with status "failed"
    total: int  # len(tests); skipped tests count only here
|
|
|
|
|
|
class FullTestResults(BaseModel):
    """Combined results of a full run across all categories."""

    categories: List[TestCategoryResult]  # per-category breakdown
    total_passed: int  # sum of passed across categories
    total_failed: int  # sum of failed across categories
    total_tests: int  # sum of total across categories
    duration_ms: float  # wall-clock runtime of the whole run
|
|
|
|
|
|
# ==============================================
# Configuration
# ==============================================

# Base URL of the backend whose GPU API is probed (override via env var).
BACKEND_URL = os.getenv("BACKEND_URL", "http://localhost:8000")
# vast.ai API key; the cloud test is reported as "skipped" when empty.
VAST_API_KEY = os.getenv("VAST_API_KEY", "")


# ==============================================
# Test Implementations
# ==============================================
|
|
|
|
async def test_nvidia_smi() -> TestResult:
    """Check NVIDIA GPU detection via the nvidia-smi CLI.

    Runs ``nvidia-smi`` in a thread-pool executor — ``subprocess.run`` is
    blocking and would otherwise stall the event loop for up to the full
    10 s timeout — and reports the first GPU found.

    Returns:
        TestResult with status "passed" (GPU found), "skipped" (no GPU or
        tool missing), or "failed" (unexpected error, e.g. timeout).
    """
    start = time.time()

    def _result(actual: str, status: str, error: Optional[str] = None) -> TestResult:
        # Shared constructor: only actual/status/error vary between outcomes.
        return TestResult(
            name="NVIDIA GPU Erkennung",
            description="Prueft ob NVIDIA GPUs via nvidia-smi erkannt werden",
            expected="GPU-Informationen verfuegbar",
            actual=actual,
            status=status,
            duration_ms=(time.time() - start) * 1000,
            error_message=error,
        )

    try:
        # Run the blocking subprocess call off the event loop.
        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(
            None,
            lambda: subprocess.run(
                ["nvidia-smi", "--query-gpu=name,memory.total,driver_version", "--format=csv,noheader"],
                capture_output=True,
                text=True,
                timeout=10,
            ),
        )
        if result.returncode == 0 and result.stdout.strip():
            # Report only the first GPU line; that is enough for a health check.
            gpu_info = result.stdout.strip().split('\n')[0]
            return _result(f"GPU: {gpu_info}", "passed")
        return _result(
            "Keine NVIDIA GPU gefunden",
            "skipped",
            "nvidia-smi nicht verfuegbar oder keine GPU",
        )
    except FileNotFoundError:
        # nvidia-smi binary not installed: not an error on non-NVIDIA hosts.
        return _result(
            "nvidia-smi nicht installiert",
            "skipped",
            "nvidia-smi Binary nicht gefunden",
        )
    except Exception as e:
        return _result(f"Fehler: {str(e)}", "failed", str(e))
|
|
|
|
|
|
async def test_rocm_smi() -> TestResult:
    """Check AMD GPU detection via the rocm-smi CLI.

    Runs ``rocm-smi`` in a thread-pool executor — ``subprocess.run`` is
    blocking and would otherwise stall the event loop for up to the full
    10 s timeout.

    Returns:
        TestResult with status "passed" (GPU found), "skipped" (no GPU or
        tool missing), or "failed" (unexpected error).
    """
    start = time.time()

    def _result(actual: str, status: str, error: Optional[str] = None) -> TestResult:
        # Shared constructor: only actual/status/error vary between outcomes.
        return TestResult(
            name="AMD ROCm GPU Erkennung",
            description="Prueft ob AMD GPUs via rocm-smi erkannt werden",
            expected="GPU-Informationen verfuegbar",
            actual=actual,
            status=status,
            duration_ms=(time.time() - start) * 1000,
            error_message=error,
        )

    try:
        # Run the blocking subprocess call off the event loop.
        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(
            None,
            lambda: subprocess.run(
                ["rocm-smi", "--showproductname"],
                capture_output=True,
                text=True,
                timeout=10,
            ),
        )
        if result.returncode == 0 and result.stdout.strip():
            return _result("ROCm GPU erkannt", "passed")
        return _result(
            "Keine AMD GPU gefunden",
            "skipped",
            "rocm-smi nicht verfuegbar oder keine GPU",
        )
    except FileNotFoundError:
        # rocm-smi binary not installed: not an error on non-AMD hosts.
        return _result(
            "rocm-smi nicht installiert",
            "skipped",
            "rocm-smi Binary nicht gefunden",
        )
    except Exception as e:
        return _result(f"Fehler: {str(e)}", "failed", str(e))
|
|
|
|
|
|
async def test_cuda_available() -> TestResult:
    """Check whether PyTorch reports CUDA support in a fresh interpreter.

    Spawns ``sys.executable`` (not a bare "python", which may not exist on
    PATH, e.g. on systems that only install "python3") and inspects
    ``torch.cuda``. The blocking subprocess call runs in a thread-pool
    executor so it cannot stall the event loop for the 30 s timeout.

    Returns:
        TestResult with status "passed" (CUDA available), or "skipped"
        (CUDA unavailable, PyTorch missing, or probe error — all
        non-fatal, since the service can run without local CUDA).
    """
    start = time.time()

    def _result(actual: str, status: str, error: Optional[str] = None) -> TestResult:
        # Shared constructor: only actual/status/error vary between outcomes.
        return TestResult(
            name="PyTorch CUDA Support",
            description="Prueft ob PyTorch CUDA-Unterstuetzung hat",
            expected="CUDA verfuegbar",
            actual=actual,
            status=status,
            duration_ms=(time.time() - start) * 1000,
            error_message=error,
        )

    try:
        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(
            None,
            lambda: subprocess.run(
                [
                    sys.executable,
                    "-c",
                    "import torch; print(f'CUDA: {torch.cuda.is_available()}, Devices: {torch.cuda.device_count()}')",
                ],
                capture_output=True,
                text=True,
                timeout=30,
            ),
        )
        if result.returncode != 0:
            # Probe script failed, typically because torch is not installed.
            return _result(
                "PyTorch nicht installiert",
                "skipped",
                result.stderr[:200] if result.stderr else "PyTorch fehlt",
            )
        output = result.stdout.strip()
        if "True" in output:
            return _result(output, "passed")
        return _result(output, "skipped", "CUDA nicht verfuegbar in PyTorch")
    except Exception as e:
        return _result(f"Fehler: {str(e)}", "skipped", str(e))
|
|
|
|
|
|
async def test_vast_ai_api() -> TestResult:
    """Check connectivity and authentication against the vast.ai API.

    Skipped when VAST_API_KEY is not configured; otherwise queries the
    current-user endpoint and reports the account balance.

    Returns:
        TestResult with status "skipped" (no key), "passed" (HTTP 200),
        or "failed" (auth failure or network error).
    """
    start = time.time()

    if not VAST_API_KEY:
        return TestResult(
            name="vast.ai API Verbindung",
            description="Prueft ob die vast.ai Cloud-GPU API konfiguriert ist",
            expected="API Key konfiguriert",
            actual="VAST_API_KEY nicht gesetzt",
            status="skipped",
            duration_ms=(time.time() - start) * 1000,
            error_message="Umgebungsvariable VAST_API_KEY fehlt",
        )

    def _result(actual: str, status: str, error: Optional[str] = None) -> TestResult:
        # Shared constructor for the configured-key outcomes.
        return TestResult(
            name="vast.ai API Verbindung",
            description="Prueft ob die vast.ai Cloud-GPU API konfiguriert ist",
            expected="API erreichbar mit Guthaben",
            actual=actual,
            status=status,
            duration_ms=(time.time() - start) * 1000,
            error_message=error,
        )

    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            response = await client.get(
                "https://console.vast.ai/api/v0/users/current",
                headers={"Authorization": f"Bearer {VAST_API_KEY}"},
            )
        if response.status_code == 200:
            # Coerce defensively: the API may return "credit" as null or a
            # numeric string, which would break the ${...:.2f} format below.
            balance = float(response.json().get("credit") or 0)
            return _result(f"Verbunden, Guthaben: ${balance:.2f}", "passed")
        return _result(
            f"HTTP {response.status_code}",
            "failed",
            "API-Authentifizierung fehlgeschlagen",
        )
    except Exception as e:
        return _result(f"Fehler: {str(e)}", "failed", str(e))
|
|
|
|
|
|
async def test_gpu_api_endpoint() -> TestResult:
    """Probe the backend's GPU management endpoint and classify the response.

    Returns:
        TestResult with status "passed" (HTTP 200), "skipped" (HTTP 404,
        endpoint not enabled), or "failed" (other status or network error).
    """
    start = time.time()

    def _outcome(actual: str, status: str, error: Optional[str] = None) -> TestResult:
        # All outcomes share name/description/expected; only these vary.
        return TestResult(
            name="GPU Admin API",
            description="Prueft ob die GPU-Verwaltungs-API verfuegbar ist",
            expected="HTTP 200 mit GPU-Status",
            actual=actual,
            status=status,
            duration_ms=(time.time() - start) * 1000,
            error_message=error,
        )

    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            response = await client.get(f"{BACKEND_URL}/api/gpu/status")
        code = response.status_code
        if code == 200:
            return _outcome("API verfuegbar", "passed")
        if code == 404:
            # 404 means the backend simply has the GPU API disabled.
            return _outcome(
                "Endpoint nicht implementiert",
                "skipped",
                "GPU API nicht aktiviert",
            )
        return _outcome(f"HTTP {code}", "failed", f"Unerwarteter Status: {code}")
    except Exception as e:
        return _outcome(f"Fehler: {str(e)}", "failed", str(e))
|
|
|
|
|
|
# ==============================================
|
|
# Category Runners
|
|
# ==============================================
|
|
|
|
async def run_detection_tests() -> TestCategoryResult:
    """Run all hardware-detection tests concurrently and aggregate counts."""
    results = list(
        await asyncio.gather(
            test_nvidia_smi(),
            test_rocm_smi(),
            test_cuda_available(),
        )
    )
    statuses = [r.status for r in results]
    return TestCategoryResult(
        category="detection",
        display_name="GPU Erkennung",
        description="Tests zur Hardware-Erkennung",
        tests=results,
        passed=statuses.count("passed"),
        failed=statuses.count("failed"),
        total=len(results),
    )
|
|
|
|
|
|
async def run_cloud_tests() -> TestCategoryResult:
    """Run the cloud-GPU (vast.ai) tests and aggregate counts."""
    results = list(
        await asyncio.gather(
            test_vast_ai_api(),
        )
    )
    statuses = [r.status for r in results]
    return TestCategoryResult(
        category="cloud",
        display_name="Cloud GPU (vast.ai)",
        description="Tests fuer Cloud-GPU-Dienste",
        tests=results,
        passed=statuses.count("passed"),
        failed=statuses.count("failed"),
        total=len(results),
    )
|
|
|
|
|
|
async def run_api_tests() -> TestCategoryResult:
    """Run the GPU admin API health tests and aggregate counts."""
    results = list(
        await asyncio.gather(
            test_gpu_api_endpoint(),
        )
    )
    statuses = [r.status for r in results]
    return TestCategoryResult(
        category="api-health",
        display_name="GPU Admin API",
        description="Tests fuer die GPU-Verwaltungs-Endpunkte",
        tests=results,
        passed=statuses.count("passed"),
        failed=statuses.count("failed"),
        total=len(results),
    )
|
|
|
|
|
|
# ==============================================
|
|
# API Endpoints
|
|
# ==============================================
|
|
|
|
@router.post("/{category}", response_model=TestCategoryResult)
|
|
async def run_category_tests(category: str):
|
|
"""Run tests for a specific category"""
|
|
runners = {
|
|
"api-health": run_api_tests,
|
|
"detection": run_detection_tests,
|
|
"cloud": run_cloud_tests,
|
|
}
|
|
|
|
if category not in runners:
|
|
return TestCategoryResult(
|
|
category=category,
|
|
display_name=f"Unbekannt: {category}",
|
|
description="Kategorie nicht gefunden",
|
|
tests=[],
|
|
passed=0,
|
|
failed=0,
|
|
total=0
|
|
)
|
|
|
|
return await runners[category]()
|
|
|
|
|
|
@router.post("/run-all", response_model=FullTestResults)
|
|
async def run_all_tests():
|
|
"""Run all GPU tests"""
|
|
start = time.time()
|
|
|
|
categories = await asyncio.gather(
|
|
run_api_tests(),
|
|
run_detection_tests(),
|
|
run_cloud_tests(),
|
|
)
|
|
|
|
total_passed = sum(c.passed for c in categories)
|
|
total_failed = sum(c.failed for c in categories)
|
|
total_tests = sum(c.total for c in categories)
|
|
|
|
return FullTestResults(
|
|
categories=list(categories),
|
|
total_passed=total_passed,
|
|
total_failed=total_failed,
|
|
total_tests=total_tests,
|
|
duration_ms=(time.time() - start) * 1000
|
|
)
|
|
|
|
|
|
@router.get("/categories")
|
|
async def get_categories():
|
|
"""Get available test categories"""
|
|
return {
|
|
"categories": [
|
|
{"id": "api-health", "name": "GPU Admin API", "description": "Backend API Tests"},
|
|
{"id": "detection", "name": "GPU Erkennung", "description": "Hardware Detection"},
|
|
{"id": "cloud", "name": "Cloud GPU", "description": "vast.ai Integration"},
|
|
]
|
|
}
|