Services: Admin-Lehrer, Backend-Lehrer, Studio v2, Website, Klausur-Service, School-Service, Voice-Service, Geo-Service, BreakPilot Drive, Agent-Core Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
420 lines
14 KiB
Python
420 lines
14 KiB
Python
"""
|
|
Vast.ai REST API Client.
|
|
|
|
Verwendet die offizielle vast.ai API statt CLI fuer mehr Stabilitaet.
|
|
API Dokumentation: https://docs.vast.ai/api
|
|
"""
|
|
|
|
import asyncio
|
|
import logging
|
|
from dataclasses import dataclass, field
|
|
from datetime import datetime, timezone
|
|
from enum import Enum
|
|
from typing import Optional, Dict, Any, List
|
|
|
|
import httpx
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class InstanceStatus(Enum):
    """Lifecycle states used for a vast.ai instance.

    Each member's value is the lowercase status string used for that state.
    """

    # Up and running.
    RUNNING = "running"
    # Halted states.
    STOPPED = "stopped"
    EXITED = "exited"
    # Transitional states on the way to RUNNING.
    LOADING = "loading"
    SCHEDULING = "scheduling"
    CREATING = "creating"
    # Catch-all for anything not listed above.
    UNKNOWN = "unknown"
|
|
|
|
|
|
@dataclass
class AccountInfo:
    """Information about the vast.ai account."""

    credit: float  # Current credit in USD
    balance: float  # Balance (usually 0)
    total_spend: float  # Total spend, stored as a non-negative amount
    username: str
    email: str
    has_billing: bool

    @classmethod
    def from_api_response(cls, data: Dict[str, Any]) -> "AccountInfo":
        """Build an AccountInfo from an API response dictionary.

        Missing keys fall back to neutral defaults. Numeric fields that are
        present but ``None`` (JSON ``null``) are treated as ``0.0`` as well,
        so that ``abs()`` on ``total_spend`` cannot raise ``TypeError``.

        Args:
            data: Decoded JSON object describing the account.

        Returns:
            The populated AccountInfo.
        """

        def number(key: str) -> float:
            # dict.get's default only covers a *missing* key; an explicit
            # null would otherwise leak through as None.
            value = data.get(key)
            return 0.0 if value is None else value

        return cls(
            credit=number("credit"),
            balance=number("balance"),
            # The API reports spend as a negative number.
            total_spend=abs(number("total_spend")),
            username=data.get("username", ""),
            email=data.get("email", ""),
            has_billing=data.get("has_billing", False),
        )

    def to_dict(self) -> Dict[str, Any]:
        """Serialize all fields to a plain dictionary."""
        return {
            "credit": self.credit,
            "balance": self.balance,
            "total_spend": self.total_spend,
            "username": self.username,
            "email": self.email,
            "has_billing": self.has_billing,
        }
|
|
|
|
|
|
@dataclass
class InstanceInfo:
    """Information about a single vast.ai instance."""

    id: int
    status: InstanceStatus
    machine_id: Optional[int] = None
    gpu_name: Optional[str] = None
    num_gpus: int = 1
    gpu_ram: Optional[float] = None  # GB
    cpu_ram: Optional[float] = None  # GB
    disk_space: Optional[float] = None  # GB
    dph_total: Optional[float] = None  # $/hour
    public_ipaddr: Optional[str] = None
    ports: Dict[str, Any] = field(default_factory=dict)
    label: Optional[str] = None
    image_uuid: Optional[str] = None
    started_at: Optional[datetime] = None

    @classmethod
    def from_api_response(cls, data: Dict[str, Any]) -> "InstanceInfo":
        """Build an InstanceInfo from an API response dictionary.

        Args:
            data: Decoded JSON object describing one instance.

        Returns:
            The populated InstanceInfo. An ``actual_status`` that is missing
            or does not match any InstanceStatus value becomes
            ``InstanceStatus.UNKNOWN``; an unparsable ``start_date`` leaves
            ``started_at`` as ``None``.
        """
        # Resolve the status through the enum itself so that every member
        # (including STOPPED) is reachable, instead of a hand-kept mapping
        # that can drift out of sync with the enum.
        try:
            status = InstanceStatus(data.get("actual_status", "unknown"))
        except ValueError:
            status = InstanceStatus.UNKNOWN

        # Port mapping; a missing or empty value becomes an empty dict.
        ports = data.get("ports") or {}

        # start_date is interpreted as a POSIX timestamp in UTC.
        started_at = None
        raw_start = data.get("start_date")
        if raw_start:
            try:
                started_at = datetime.fromtimestamp(raw_start, tz=timezone.utc)
            except (ValueError, TypeError, OverflowError, OSError):
                # Non-numeric or out-of-range timestamp: best effort, keep None.
                started_at = None

        return cls(
            id=data.get("id", 0),
            status=status,
            machine_id=data.get("machine_id"),
            gpu_name=data.get("gpu_name"),
            num_gpus=data.get("num_gpus", 1),
            gpu_ram=data.get("gpu_ram"),
            cpu_ram=data.get("cpu_ram"),
            disk_space=data.get("disk_space"),
            dph_total=data.get("dph_total"),
            public_ipaddr=data.get("public_ipaddr"),
            ports=ports,
            label=data.get("label"),
            image_uuid=data.get("image_uuid"),
            started_at=started_at,
        )

    def get_endpoint_url(self, internal_port: int = 8001) -> Optional[str]:
        """Compute the external URL for an internal port.

        Args:
            internal_port: Port number inside the instance.

        Returns:
            ``http://<public_ipaddr>:<port>`` using the mapped host port when
            one is listed in ``ports``, otherwise the internal port itself;
            ``None`` when the instance has no public IP address.
        """
        if not self.public_ipaddr:
            return None

        # vast.ai maps internal ports to external ports.
        # Format: {"8001/tcp": [{"HostIp": "0.0.0.0", "HostPort": "12345"}]}
        mappings = None
        if isinstance(self.ports, dict):
            mappings = self.ports.get(f"{internal_port}/tcp")

        if isinstance(mappings, list) and mappings:
            first = mappings[0]
            # Guard against a malformed entry instead of raising AttributeError.
            host_port = first.get("HostPort") if isinstance(first, dict) else None
            if host_port:
                return f"http://{self.public_ipaddr}:{host_port}"

        # Fallback: use the internal port directly.
        return f"http://{self.public_ipaddr}:{internal_port}"

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dictionary.

        ``status`` is emitted as its string value and ``started_at`` as an
        ISO 8601 string (or ``None``).
        """
        return {
            "id": self.id,
            "status": self.status.value,
            "machine_id": self.machine_id,
            "gpu_name": self.gpu_name,
            "num_gpus": self.num_gpus,
            "gpu_ram": self.gpu_ram,
            "cpu_ram": self.cpu_ram,
            "disk_space": self.disk_space,
            "dph_total": self.dph_total,
            "public_ipaddr": self.public_ipaddr,
            "ports": self.ports,
            "label": self.label,
            # Previously parsed but silently dropped on serialization.
            "image_uuid": self.image_uuid,
            "started_at": self.started_at.isoformat() if self.started_at else None,
        }
|
|
|
|
|
|
class VastAIClient:
    """
    Async client for the vast.ai REST API.

    Uses the official API at https://console.vast.ai/api/v0/. A single
    ``httpx.AsyncClient`` is created lazily on first use; call :meth:`close`
    to release it.

    The API key is sent as an ``api_key`` query parameter, so request URLs
    contain the secret. Error logging therefore never prints the URL or the
    raw ``httpx`` exception text for HTTP status errors.
    """

    BASE_URL = "https://console.vast.ai/api/v0"

    def __init__(self, api_key: str, timeout: float = 30.0):
        """Store credentials and settings; no network connection is opened.

        Args:
            api_key: vast.ai API key appended to every request URL.
            timeout: Timeout in seconds passed to the shared HTTP client.
        """
        self.api_key = api_key
        self.timeout = timeout
        self._client: Optional[httpx.AsyncClient] = None

    async def _get_client(self) -> httpx.AsyncClient:
        """Return the shared HTTP client, creating it if absent or closed."""
        if self._client is None or self._client.is_closed:
            self._client = httpx.AsyncClient(
                timeout=self.timeout,
                headers={
                    "Accept": "application/json",
                },
            )
        return self._client

    async def close(self) -> None:
        """Close the shared HTTP client if it is open."""
        if self._client and not self._client.is_closed:
            await self._client.aclose()
            self._client = None

    def _build_url(self, endpoint: str) -> str:
        """Build the full request URL including the API key.

        The result contains the secret key and must not be logged.
        """
        sep = "&" if "?" in endpoint else "?"
        return f"{self.BASE_URL}{endpoint}{sep}api_key={self.api_key}"

    def _describe_error(self, exc: Exception) -> str:
        """Return a log-safe description of *exc* that omits the API key.

        The text of ``httpx.HTTPStatusError`` embeds the full request URL
        (and with it the ``api_key`` query parameter), so for that type only
        the status code and reason phrase are reported. Any other exception
        text has literal occurrences of the key masked.
        """
        if isinstance(exc, httpx.HTTPStatusError):
            status = exc.response.status_code
            return f"HTTP {status} {exc.response.reason_phrase}".strip()
        text = str(exc) or type(exc).__name__
        if self.api_key:
            text = text.replace(self.api_key, "***")
        return text

    async def _put_instance(
        self, instance_id: int, payload: Dict[str, Any], action: str
    ) -> bool:
        """PUT *payload* to an instance; log and return False on an HTTP error status.

        ``action`` is the verb phrase used in the error log message
        (e.g. ``"starting"``).
        """
        client = await self._get_client()
        url = self._build_url(f"/instances/{instance_id}/")
        try:
            response = await client.put(url, json=payload)
            response.raise_for_status()
        except httpx.HTTPStatusError as e:
            logger.error(
                "vast.ai API error %s instance %s: %s",
                action,
                instance_id,
                self._describe_error(e),
            )
            return False
        return True

    async def list_instances(self) -> List[InstanceInfo]:
        """List all instances.

        Returns:
            One InstanceInfo per entry under the response's ``instances``
            key; an empty list when that key is missing or empty.

        Raises:
            httpx.HTTPStatusError: If the API answers with an error status
                (logged before being re-raised).
        """
        client = await self._get_client()
        url = self._build_url("/instances/")

        try:
            response = await client.get(url)
            response.raise_for_status()
        except httpx.HTTPStatusError as e:
            logger.error(
                "vast.ai API error listing instances: %s", self._describe_error(e)
            )
            raise

        data = response.json()
        raw_instances = data.get("instances") if isinstance(data, dict) else None
        return [InstanceInfo.from_api_response(item) for item in raw_instances or []]

    async def get_instance(self, instance_id: int) -> Optional[InstanceInfo]:
        """Fetch details of one instance.

        Returns:
            The parsed InstanceInfo, or ``None`` when the API answers 404 or
            the response carries no usable instance object.

        Raises:
            httpx.HTTPStatusError: For error statuses other than 404
                (logged before being re-raised).
        """
        client = await self._get_client()
        url = self._build_url(f"/instances/{instance_id}/")

        try:
            response = await client.get(url)
            response.raise_for_status()
        except httpx.HTTPStatusError as e:
            if e.response.status_code == 404:
                return None
            logger.error(
                "vast.ai API error getting instance %s: %s",
                instance_id,
                self._describe_error(e),
            )
            raise

        data = response.json()
        if not isinstance(data, dict):
            return None

        if "instances" in data:
            payload = data["instances"]
            # The API returns a dict for a single instance and a list for
            # a collection.
            if isinstance(payload, list) and payload:
                return InstanceInfo.from_api_response(payload[0])
            if isinstance(payload, dict):
                # Add the id if the payload does not carry one.
                payload.setdefault("id", instance_id)
                return InstanceInfo.from_api_response(payload)
            return None

        if "id" in data:
            return InstanceInfo.from_api_response(data)
        return None

    async def start_instance(self, instance_id: int) -> bool:
        """Request that a stopped instance be started.

        Returns:
            True if the API accepted the request, False on an HTTP error status.
        """
        accepted = await self._put_instance(
            instance_id, {"state": "running"}, "starting"
        )
        if accepted:
            logger.info("vast.ai instance %s start requested", instance_id)
        return accepted

    async def stop_instance(self, instance_id: int) -> bool:
        """Request that a running instance be stopped (the disk is kept).

        Returns:
            True if the API accepted the request, False on an HTTP error status.
        """
        accepted = await self._put_instance(
            instance_id, {"state": "stopped"}, "stopping"
        )
        if accepted:
            logger.info("vast.ai instance %s stop requested", instance_id)
        return accepted

    async def destroy_instance(self, instance_id: int) -> bool:
        """Delete an instance completely (the disk is lost!).

        Returns:
            True if the API accepted the request, False on an HTTP error status.
        """
        client = await self._get_client()
        url = self._build_url(f"/instances/{instance_id}/")

        try:
            response = await client.delete(url)
            response.raise_for_status()
        except httpx.HTTPStatusError as e:
            logger.error(
                "vast.ai API error destroying instance %s: %s",
                instance_id,
                self._describe_error(e),
            )
            return False

        logger.info("vast.ai instance %s destroyed", instance_id)
        return True

    async def set_label(self, instance_id: int, label: str) -> bool:
        """Set the label of an instance.

        Returns:
            True if the API accepted the request, False on an HTTP error status.
        """
        return await self._put_instance(
            instance_id, {"label": label}, "setting label on"
        )

    async def wait_for_status(
        self,
        instance_id: int,
        target_status: InstanceStatus,
        timeout_seconds: int = 300,
        poll_interval: float = 5.0,
    ) -> Optional[InstanceInfo]:
        """
        Poll until an instance reaches the given status.

        Args:
            instance_id: Instance to poll.
            target_status: Status to wait for.
            timeout_seconds: Maximum time to keep polling.
            poll_interval: Seconds to sleep between polls.

        Returns:
            InstanceInfo once the status is reached, None on timeout.

        Raises:
            httpx.HTTPStatusError: Propagated from :meth:`get_instance`.
        """
        # We are inside a coroutine, so the running loop is the right clock.
        loop = asyncio.get_running_loop()
        deadline = loop.time() + timeout_seconds

        while loop.time() < deadline:
            instance = await self.get_instance(instance_id)

            if instance and instance.status == target_status:
                return instance

            if instance:
                logger.debug(
                    "vast.ai instance %s status: %s, waiting for %s",
                    instance_id,
                    instance.status.value,
                    target_status.value,
                )

            await asyncio.sleep(poll_interval)

        logger.warning(
            "Timeout waiting for instance %s to reach %s",
            instance_id,
            target_status.value,
        )
        return None

    async def wait_for_health(
        self,
        instance: InstanceInfo,
        health_path: str = "/health",
        internal_port: int = 8001,
        timeout_seconds: int = 600,
        poll_interval: float = 5.0,
    ) -> bool:
        """
        Poll the instance's health endpoint until it answers with a 2xx status.

        Args:
            instance: Instance whose endpoint URL is derived via
                ``get_endpoint_url(internal_port)``.
            health_path: Path appended to the endpoint URL.
            internal_port: Internal port of the service to check.
            timeout_seconds: Maximum time to keep polling.
            poll_interval: Seconds to sleep between attempts.

        Returns:
            True once the health check passes, False on timeout or when no
            endpoint URL is available.
        """
        endpoint = instance.get_endpoint_url(internal_port)
        if not endpoint:
            logger.error("No endpoint URL available for health check")
            return False

        health_url = f"{endpoint.rstrip('/')}{health_path}"
        logger.info("Waiting for health at %s", health_url)

        loop = asyncio.get_running_loop()
        deadline = loop.time() + timeout_seconds

        # Dedicated short-timeout client; the context manager closes it on
        # every exit path.
        async with httpx.AsyncClient(timeout=5.0) as health_client:
            while loop.time() < deadline:
                try:
                    response = await health_client.get(health_url)
                    if 200 <= response.status_code < 300:
                        logger.info("Health check passed: %s", health_url)
                        return True
                except Exception as e:
                    # Deliberately broad: the service is expected to be
                    # unreachable while it boots, so any failure means
                    # "not healthy yet" and we retry.
                    logger.debug("Health check failed: %s", e)

                await asyncio.sleep(poll_interval)

        logger.warning("Health check timeout: %s", health_url)
        return False

    async def get_account_info(self) -> Optional[AccountInfo]:
        """
        Fetch account information including credit/budget.

        Returns:
            AccountInfo, or None on any error (which is logged).
        """
        client = await self._get_client()
        url = self._build_url("/users/current/")

        try:
            response = await client.get(url)
            response.raise_for_status()
            data = response.json()
            return AccountInfo.from_api_response(data)
        except httpx.HTTPStatusError as e:
            logger.error(
                "vast.ai API error getting account info: %s", self._describe_error(e)
            )
            return None
        except Exception as e:
            logger.error(
                "Error getting vast.ai account info: %s", self._describe_error(e)
            )
            return None
|