breakpilot-lehrer/backend-lehrer/infra/vast_client.py

"""
Vast.ai REST API Client.

Verwendet die offizielle vast.ai API statt CLI fuer mehr Stabilitaet.
API Dokumentation: https://docs.vast.ai/api
"""

import asyncio
import logging
from dataclasses import dataclass, field
from datetime import datetime, timezone
from enum import Enum
from typing import Optional, Dict, Any, List

import httpx

logger = logging.getLogger(__name__)


class InstanceStatus(Enum):
    """Status values used for vast.ai instances.

    Each member's value is the lowercase status string; that string is what
    ``InstanceInfo.to_dict`` emits for the ``status`` field.
    """
    RUNNING = "running"
    STOPPED = "stopped"
    EXITED = "exited"
    LOADING = "loading"
    SCHEDULING = "scheduling"
    CREATING = "creating"
    # Fallback used when the reported status is missing or not recognized.
    UNKNOWN = "unknown"


@dataclass
class AccountInfo:
    """Account information for a vast.ai user."""
    credit: float  # Current credit in USD
    balance: float  # Balance (usually 0)
    total_spend: float  # Total amount spent, stored as a non-negative number
    username: str
    email: str
    has_billing: bool

    @classmethod
    def from_api_response(cls, data: Dict[str, Any]) -> "AccountInfo":
        """Build an AccountInfo from an API response dictionary.

        Both missing keys and explicit ``null`` values fall back to a neutral
        default (``0.0``, ``""`` or ``False``), so the resulting object always
        matches the declared field types.

        Args:
            data: Decoded JSON object describing the account.

        Returns:
            The populated AccountInfo.
        """
        # ``dict.get(key, default)`` only covers *missing* keys; a JSON null
        # would come through as None and make ``abs()`` raise TypeError.
        # ``or <default>`` normalizes both cases.
        return cls(
            credit=data.get("credit") or 0.0,
            balance=data.get("balance") or 0.0,
            # abs() because the API reportedly returns this as a negative number.
            total_spend=abs(data.get("total_spend") or 0.0),
            username=data.get("username") or "",
            email=data.get("email") or "",
            has_billing=bool(data.get("has_billing")),
        )

    def to_dict(self) -> Dict[str, Any]:
        """Serialize all fields into a plain dictionary."""
        return {
            "credit": self.credit,
            "balance": self.balance,
            "total_spend": self.total_spend,
            "username": self.username,
            "email": self.email,
            "has_billing": self.has_billing,
        }


@dataclass
class InstanceInfo:
    """Information about a single vast.ai instance."""
    id: int
    status: InstanceStatus
    machine_id: Optional[int] = None
    gpu_name: Optional[str] = None
    num_gpus: int = 1
    gpu_ram: Optional[float] = None  # GB
    cpu_ram: Optional[float] = None  # GB
    disk_space: Optional[float] = None  # GB
    dph_total: Optional[float] = None  # $/hour
    public_ipaddr: Optional[str] = None
    ports: Dict[str, Any] = field(default_factory=dict)
    label: Optional[str] = None
    image_uuid: Optional[str] = None
    started_at: Optional[datetime] = None

    @classmethod
    def from_api_response(cls, data: Dict[str, Any]) -> "InstanceInfo":
        """Build an InstanceInfo from an API response dictionary.

        Args:
            data: Decoded JSON object describing one instance.

        Returns:
            The populated InstanceInfo. An ``actual_status`` that is missing,
            null or not a known ``InstanceStatus`` value yields
            ``InstanceStatus.UNKNOWN``; an unparsable ``start_date`` yields
            ``started_at=None``.
        """
        # Look the status up by enum value so that every InstanceStatus member
        # (including STOPPED) is reachable and the mapping cannot drift from
        # the enum definition.
        try:
            status = InstanceStatus(data.get("actual_status", "unknown"))
        except ValueError:
            status = InstanceStatus.UNKNOWN

        # Missing, null or empty port mappings all collapse to an empty dict.
        ports = data.get("ports") or {}

        # start_date is interpreted as a Unix timestamp in UTC.
        started_at = None
        start_date = data.get("start_date")
        if start_date:
            try:
                started_at = datetime.fromtimestamp(start_date, tz=timezone.utc)
            except (ValueError, TypeError, OverflowError, OSError):
                # Best effort: an out-of-range or malformed timestamp must not
                # prevent the rest of the instance data from being used.
                started_at = None

        return cls(
            id=data.get("id", 0),
            status=status,
            machine_id=data.get("machine_id"),
            gpu_name=data.get("gpu_name"),
            num_gpus=data.get("num_gpus", 1),
            gpu_ram=data.get("gpu_ram"),
            cpu_ram=data.get("cpu_ram"),
            disk_space=data.get("disk_space"),
            dph_total=data.get("dph_total"),
            public_ipaddr=data.get("public_ipaddr"),
            ports=ports,
            label=data.get("label"),
            image_uuid=data.get("image_uuid"),
            started_at=started_at,
        )

    def get_endpoint_url(self, internal_port: int = 8001) -> Optional[str]:
        """Compute the external URL for an internal container port.

        Args:
            internal_port: Port the service listens on inside the instance.

        Returns:
            ``http://<public_ip>:<port>`` using the mapped host port when one
            is present in ``ports``, otherwise the internal port itself.
            ``None`` when the instance has no public IP address.
        """
        if not self.public_ipaddr:
            return None

        # Internal ports are mapped to external host ports.
        # Format: {"8001/tcp": [{"HostIp": "0.0.0.0", "HostPort": "12345"}]}
        port_key = f"{internal_port}/tcp"
        mapping = self.ports.get(port_key) if isinstance(self.ports, dict) else None
        if isinstance(mapping, list) and mapping:
            first = mapping[0]
            host_port = first.get("HostPort") if isinstance(first, dict) else None
            if host_port:
                return f"http://{self.public_ipaddr}:{host_port}"

        # Fallback: assume the internal port is exposed directly.
        return f"http://{self.public_ipaddr}:{internal_port}"

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the instance into a plain dictionary.

        ``status`` is emitted as its string value and ``started_at`` as an
        ISO 8601 string (or ``None``).
        """
        # NOTE(review): image_uuid is not part of the serialized output —
        # confirm this omission is intentional.
        return {
            "id": self.id,
            "status": self.status.value,
            "machine_id": self.machine_id,
            "gpu_name": self.gpu_name,
            "num_gpus": self.num_gpus,
            "gpu_ram": self.gpu_ram,
            "cpu_ram": self.cpu_ram,
            "disk_space": self.disk_space,
            "dph_total": self.dph_total,
            "public_ipaddr": self.public_ipaddr,
            "ports": self.ports,
            "label": self.label,
            "started_at": self.started_at.isoformat() if self.started_at else None,
        }


class VastAIClient:
    """
    Async client for the vast.ai REST API.

    Talks to the API at https://console.vast.ai/api/v0/. The underlying HTTP
    client is created lazily on first use and should be released with
    ``close()`` when the client is no longer needed.
    """

    BASE_URL = "https://console.vast.ai/api/v0"

    def __init__(self, api_key: str, timeout: float = 30.0):
        """Store credentials and settings; no network activity happens here.

        Args:
            api_key: vast.ai API key, appended to every request URL.
            timeout: Timeout in seconds passed to the HTTP client.
        """
        self.api_key = api_key
        self.timeout = timeout
        self._client: Optional[httpx.AsyncClient] = None

    async def _get_client(self) -> httpx.AsyncClient:
        """Return the shared HTTP client, (re)creating it if missing or closed."""
        if self._client is None or self._client.is_closed:
            self._client = httpx.AsyncClient(
                timeout=self.timeout,
                headers={
                    "Accept": "application/json",
                },
            )
        return self._client

    async def close(self) -> None:
        """Close the shared HTTP client if it is open."""
        if self._client and not self._client.is_closed:
            await self._client.aclose()
            self._client = None

    def _build_url(self, endpoint: str) -> str:
        """Build the full request URL with the API key as a query parameter.

        Args:
            endpoint: Path below ``BASE_URL``; may already contain a query string.
        """
        sep = "&" if "?" in endpoint else "?"
        return f"{self.BASE_URL}{endpoint}{sep}api_key={self.api_key}"

    def _redact(self, value: object) -> str:
        """Return ``str(value)`` with the API key masked.

        The key is part of every request URL and httpx error messages include
        that URL, so anything derived from an exception must be masked before
        it is logged.
        """
        text = str(value)
        if self.api_key:
            text = text.replace(self.api_key, "***")
        return text

    async def _put_instance(self, instance_id: int, payload: Dict[str, Any]) -> None:
        """Send a PUT with ``payload`` to the instance endpoint.

        Raises:
            httpx.HTTPStatusError: If the API answers with an error status.
        """
        client = await self._get_client()
        response = await client.put(
            self._build_url(f"/instances/{instance_id}/"),
            json=payload,
        )
        response.raise_for_status()

    async def list_instances(self) -> List[InstanceInfo]:
        """List all instances of the account.

        Returns:
            One InstanceInfo per entry under the ``instances`` key of the
            response; an empty list if that key is missing or null.

        Raises:
            httpx.HTTPStatusError: If the API answers with an error status.
                NOTE: the re-raised exception still carries the request URL
                (including the API key); only the log output is masked.
        """
        client = await self._get_client()
        url = self._build_url("/instances/")

        try:
            response = await client.get(url)
            response.raise_for_status()
        except httpx.HTTPStatusError as e:
            logger.error(f"vast.ai API error listing instances: {self._redact(e)}")
            raise

        data = response.json()
        entries = data.get("instances") if isinstance(data, dict) else None
        return [InstanceInfo.from_api_response(entry) for entry in entries or []]

    async def get_instance(self, instance_id: int) -> Optional[InstanceInfo]:
        """Fetch the details of one instance.

        Returns:
            The InstanceInfo, or ``None`` if the API answers 404 or the
            response contains no usable instance data.

        Raises:
            httpx.HTTPStatusError: For error statuses other than 404.
        """
        client = await self._get_client()
        url = self._build_url(f"/instances/{instance_id}/")

        try:
            response = await client.get(url)
            response.raise_for_status()
        except httpx.HTTPStatusError as e:
            if e.response.status_code == 404:
                return None
            logger.error(
                f"vast.ai API error getting instance {instance_id}: {self._redact(e)}"
            )
            raise

        data = response.json()
        if not isinstance(data, dict):
            return None

        if "instances" in data:
            payload = data["instances"]
            # The payload may be a single dict or a list of instances.
            if isinstance(payload, list) and payload:
                return InstanceInfo.from_api_response(payload[0])
            if isinstance(payload, dict):
                # Fill in the ID when the payload does not carry one.
                payload.setdefault("id", instance_id)
                return InstanceInfo.from_api_response(payload)
            return None

        if "id" in data:
            # The instance fields are at the top level of the response.
            return InstanceInfo.from_api_response(data)

        return None

    async def start_instance(self, instance_id: int) -> bool:
        """Request that a stopped instance be started.

        Returns:
            True if the API accepted the request, False on an HTTP error status.
        """
        try:
            await self._put_instance(instance_id, {"state": "running"})
        except httpx.HTTPStatusError as e:
            logger.error(
                f"vast.ai API error starting instance {instance_id}: {self._redact(e)}"
            )
            return False

        logger.info(f"vast.ai instance {instance_id} start requested")
        return True

    async def stop_instance(self, instance_id: int) -> bool:
        """Request that a running instance be stopped (the disk is kept).

        Returns:
            True if the API accepted the request, False on an HTTP error status.
        """
        try:
            await self._put_instance(instance_id, {"state": "stopped"})
        except httpx.HTTPStatusError as e:
            logger.error(
                f"vast.ai API error stopping instance {instance_id}: {self._redact(e)}"
            )
            return False

        logger.info(f"vast.ai instance {instance_id} stop requested")
        return True

    async def destroy_instance(self, instance_id: int) -> bool:
        """Delete an instance completely (its disk is lost!).

        Returns:
            True if the API accepted the request, False on an HTTP error status.
        """
        client = await self._get_client()
        url = self._build_url(f"/instances/{instance_id}/")

        try:
            response = await client.delete(url)
            response.raise_for_status()
        except httpx.HTTPStatusError as e:
            logger.error(
                f"vast.ai API error destroying instance {instance_id}: {self._redact(e)}"
            )
            return False

        logger.info(f"vast.ai instance {instance_id} destroyed")
        return True

    async def set_label(self, instance_id: int, label: str) -> bool:
        """Set the label of an instance.

        Returns:
            True if the API accepted the request, False on an HTTP error status.
        """
        try:
            await self._put_instance(instance_id, {"label": label})
        except httpx.HTTPStatusError as e:
            logger.error(
                f"vast.ai API error setting label on instance {instance_id}: {self._redact(e)}"
            )
            return False

        return True

    async def wait_for_status(
        self,
        instance_id: int,
        target_status: InstanceStatus,
        timeout_seconds: int = 300,
        poll_interval: float = 5.0,
    ) -> Optional[InstanceInfo]:
        """
        Poll an instance until it reaches the given status.

        Args:
            instance_id: ID of the instance to watch.
            target_status: Status to wait for.
            timeout_seconds: Maximum time to keep polling.
            poll_interval: Seconds to sleep between polls.

        Returns:
            The InstanceInfo once the status is reached, None on timeout.

        Raises:
            httpx.HTTPStatusError: Propagated from ``get_instance``.
        """
        # Inside a coroutine the running loop is the one whose clock we want.
        loop = asyncio.get_running_loop()
        deadline = loop.time() + timeout_seconds

        while loop.time() < deadline:
            instance = await self.get_instance(instance_id)

            if instance and instance.status == target_status:
                return instance

            if instance:
                logger.debug(
                    f"vast.ai instance {instance_id} status: {instance.status.value}, "
                    f"waiting for {target_status.value}"
                )

            await asyncio.sleep(poll_interval)

        logger.warning(
            f"Timeout waiting for instance {instance_id} to reach {target_status.value}"
        )
        return None

    async def wait_for_health(
        self,
        instance: InstanceInfo,
        health_path: str = "/health",
        internal_port: int = 8001,
        timeout_seconds: int = 600,
        poll_interval: float = 5.0,
    ) -> bool:
        """
        Poll the instance's health endpoint until it answers with a 2xx status.

        Args:
            instance: Instance whose endpoint should be checked.
            health_path: Path appended to the endpoint URL.
            internal_port: Internal port used to resolve the endpoint URL.
            timeout_seconds: Maximum time to keep polling.
            poll_interval: Seconds to sleep between attempts.

        Returns:
            True once the health check succeeds, False on timeout or when no
            endpoint URL can be determined.
        """
        endpoint = instance.get_endpoint_url(internal_port)
        if not endpoint:
            logger.error("No endpoint URL available for health check")
            return False

        health_url = f"{endpoint.rstrip('/')}{health_path}"
        logger.info(f"Waiting for health at {health_url}")

        loop = asyncio.get_running_loop()
        deadline = loop.time() + timeout_seconds

        # A dedicated short-timeout client, separate from the API client;
        # the context manager guarantees it is closed on every exit path.
        async with httpx.AsyncClient(timeout=5.0) as health_client:
            while loop.time() < deadline:
                try:
                    response = await health_client.get(health_url)
                    if 200 <= response.status_code < 300:
                        logger.info(f"Health check passed: {health_url}")
                        return True
                except Exception as e:
                    # Deliberately broad: any failure just means "not healthy yet".
                    logger.debug(f"Health check failed: {e}")

                await asyncio.sleep(poll_interval)

            logger.warning(f"Health check timeout: {health_url}")
            return False

    async def get_account_info(self) -> Optional[AccountInfo]:
        """
        Fetch account information including credit/budget.

        Returns:
            The AccountInfo, or None on any error (errors are logged, never
            raised).
        """
        client = await self._get_client()
        url = self._build_url("/users/current/")

        try:
            response = await client.get(url)
            response.raise_for_status()
            data = response.json()

            return AccountInfo.from_api_response(data)

        except httpx.HTTPStatusError as e:
            logger.error(f"vast.ai API error getting account info: {self._redact(e)}")
            return None
        except Exception as e:
            logger.error(f"Error getting vast.ai account info: {self._redact(e)}")
            return None