Files
breakpilot-lehrer/backend-lehrer/infra/vast_client.py
Benjamin Boenisch 5a31f52310 Initial commit: breakpilot-lehrer - Lehrer KI Platform
Services: Admin-Lehrer, Backend-Lehrer, Studio v2, Website,
Klausur-Service, School-Service, Voice-Service, Geo-Service,
BreakPilot Drive, Agent-Core

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-11 23:47:26 +01:00

420 lines
14 KiB
Python

"""
Vast.ai REST API Client.
Verwendet die offizielle vast.ai API statt CLI fuer mehr Stabilitaet.
API Dokumentation: https://docs.vast.ai/api
"""
import asyncio
import logging
from dataclasses import dataclass, field
from datetime import datetime, timezone
from enum import Enum
from typing import Optional, Dict, Any, List
import httpx
logger = logging.getLogger(__name__)
class InstanceStatus(Enum):
    """Status values this module distinguishes for a vast.ai instance.

    Each member's value is the lowercase status string. ``UNKNOWN`` is the
    fallback used when a reported status is not recognised.
    """

    RUNNING = "running"
    STOPPED = "stopped"
    EXITED = "exited"
    LOADING = "loading"
    SCHEDULING = "scheduling"
    CREATING = "creating"
    UNKNOWN = "unknown"
@dataclass
class AccountInfo:
    """Information about the vast.ai account (credit, spend, identity)."""

    credit: float  # current credit in USD
    balance: float  # balance (usually 0)
    total_spend: float  # total spend, stored as a non-negative amount
    username: str
    email: str
    has_billing: bool

    @classmethod
    def from_api_response(cls, data: Dict[str, Any]) -> "AccountInfo":
        """Build an AccountInfo from the decoded API response.

        Args:
            data: JSON object returned by the account endpoint. Missing keys
                fall back to neutral defaults (0.0, "" or False).

        Returns:
            The populated AccountInfo.
        """
        # The API reports total_spend as a negative number, hence abs().
        # An explicit JSON null would make abs() raise TypeError, so treat
        # null the same as a missing key.
        raw_spend = data.get("total_spend")
        total_spend = abs(raw_spend) if raw_spend is not None else 0.0

        return cls(
            credit=data.get("credit", 0.0),
            balance=data.get("balance", 0.0),
            total_spend=total_spend,
            username=data.get("username", ""),
            email=data.get("email", ""),
            has_billing=data.get("has_billing", False),
        )

    def to_dict(self) -> Dict[str, Any]:
        """Serialize all fields to a plain dictionary."""
        return {
            "credit": self.credit,
            "balance": self.balance,
            "total_spend": self.total_spend,
            "username": self.username,
            "email": self.email,
            "has_billing": self.has_billing,
        }
@dataclass
class InstanceInfo:
    """Information about a single vast.ai instance, parsed from the API."""

    id: int
    status: InstanceStatus
    machine_id: Optional[int] = None
    gpu_name: Optional[str] = None
    num_gpus: int = 1
    gpu_ram: Optional[float] = None  # GB
    cpu_ram: Optional[float] = None  # GB
    disk_space: Optional[float] = None  # GB
    dph_total: Optional[float] = None  # $/hour
    public_ipaddr: Optional[str] = None
    ports: Dict[str, Any] = field(default_factory=dict)
    label: Optional[str] = None
    image_uuid: Optional[str] = None
    started_at: Optional[datetime] = None

    @classmethod
    def from_api_response(cls, data: Dict[str, Any]) -> "InstanceInfo":
        """Build an InstanceInfo from one instance object of the API response.

        Args:
            data: Decoded JSON object describing the instance.

        Returns:
            The populated InstanceInfo. An unrecognised or missing
            ``actual_status`` yields ``InstanceStatus.UNKNOWN``; an
            unparseable ``start_date`` yields ``started_at=None``.
        """
        # Resolve by enum value so that every InstanceStatus member,
        # including STOPPED, can be produced. Anything the enum does not
        # know (or a null) falls back to UNKNOWN.
        try:
            status = InstanceStatus(data.get("actual_status", "unknown"))
        except ValueError:
            status = InstanceStatus.UNKNOWN

        # Port mapping is passed through as delivered; null/empty -> {}.
        ports = data.get("ports") or {}

        # start_date is interpreted as a POSIX timestamp. Parsing is
        # best-effort: fromtimestamp may raise OverflowError/OSError for
        # out-of-range values in addition to ValueError/TypeError.
        started_at = None
        raw_start = data.get("start_date")
        if raw_start:
            try:
                started_at = datetime.fromtimestamp(raw_start, tz=timezone.utc)
            except (ValueError, TypeError, OverflowError, OSError):
                started_at = None

        return cls(
            id=data.get("id", 0),
            status=status,
            machine_id=data.get("machine_id"),
            gpu_name=data.get("gpu_name"),
            num_gpus=data.get("num_gpus", 1),
            gpu_ram=data.get("gpu_ram"),
            cpu_ram=data.get("cpu_ram"),
            disk_space=data.get("disk_space"),
            dph_total=data.get("dph_total"),
            public_ipaddr=data.get("public_ipaddr"),
            ports=ports,
            label=data.get("label"),
            image_uuid=data.get("image_uuid"),
            started_at=started_at,
        )

    def get_endpoint_url(self, internal_port: int = 8001) -> Optional[str]:
        """Compute the external URL for an internal container port.

        Args:
            internal_port: Port the service listens on inside the instance.

        Returns:
            ``http://<public_ip>:<port>`` using the mapped host port when one
            is found in ``ports``, otherwise the internal port itself.
            ``None`` when the instance has no public IP address.
        """
        if not self.public_ipaddr:
            return None

        # vast.ai maps internal ports to external ones.
        # Format: {"8001/tcp": [{"HostIp": "0.0.0.0", "HostPort": "12345"}]}
        port_key = f"{internal_port}/tcp"
        bindings = self.ports.get(port_key) if isinstance(self.ports, dict) else None
        # Only trust the mapping when it has the expected list-of-dicts shape.
        if isinstance(bindings, list) and bindings and isinstance(bindings[0], dict):
            host_port = bindings[0].get("HostPort")
            if host_port:
                return f"http://{self.public_ipaddr}:{host_port}"

        # Fallback: assume the internal port is exposed directly.
        return f"http://{self.public_ipaddr}:{internal_port}"

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dictionary.

        ``status`` is emitted as its string value and ``started_at`` as an
        ISO-8601 string (or ``None``). Note that ``image_uuid`` is not part
        of the serialized output.
        """
        return {
            "id": self.id,
            "status": self.status.value,
            "machine_id": self.machine_id,
            "gpu_name": self.gpu_name,
            "num_gpus": self.num_gpus,
            "gpu_ram": self.gpu_ram,
            "cpu_ram": self.cpu_ram,
            "disk_space": self.disk_space,
            "dph_total": self.dph_total,
            "public_ipaddr": self.public_ipaddr,
            "ports": self.ports,
            "label": self.label,
            "started_at": self.started_at.isoformat() if self.started_at else None,
        }
class VastAIClient:
    """
    Async client for the vast.ai REST API.

    Uses the official API under https://console.vast.ai/api/v0/.

    The API key is sent in an ``Authorization: Bearer`` header rather than as
    a query parameter: httpx includes the request URL in the message of
    ``HTTPStatusError``, which this class logs and re-raises, so a key in the
    URL would end up in log output.

    The instance can be used as an async context manager; the underlying
    HTTP client is closed on exit.
    """

    BASE_URL = "https://console.vast.ai/api/v0"

    def __init__(self, api_key: str, timeout: float = 30.0):
        """Store credentials and settings; the HTTP client is created lazily.

        Args:
            api_key: vast.ai API key used to authenticate every request.
            timeout: Timeout passed to the shared httpx client.
        """
        self.api_key = api_key
        self.timeout = timeout
        self._client: Optional[httpx.AsyncClient] = None

    async def __aenter__(self) -> "VastAIClient":
        """Enter the async context; returns this client."""
        return self

    async def __aexit__(self, exc_type, exc, tb) -> None:
        """Leave the async context and close the HTTP client."""
        await self.close()

    async def _get_client(self) -> httpx.AsyncClient:
        """Return the shared HTTP client, (re)creating it when missing or closed."""
        if self._client is None or self._client.is_closed:
            self._client = httpx.AsyncClient(
                timeout=self.timeout,
                headers={
                    "Accept": "application/json",
                },
            )
        return self._client

    async def close(self) -> None:
        """Close the shared HTTP client if it is open."""
        if self._client and not self._client.is_closed:
            await self._client.aclose()
        self._client = None

    def _auth_headers(self) -> Dict[str, str]:
        """Return the per-request authentication header.

        Built on every call so a changed ``api_key`` takes effect immediately.
        """
        return {"Authorization": f"Bearer {self.api_key}"}

    def _build_url(self, endpoint: str) -> str:
        """Build the full URL for an endpoint path (e.g. ``/instances/``).

        The API key is deliberately not embedded in the URL; see the class
        docstring.
        """
        return f"{self.BASE_URL}{endpoint}"

    async def _put_instance(
        self, instance_id: int, payload: Dict[str, Any], action: str
    ) -> bool:
        """PUT ``payload`` to an instance; shared by start/stop/set_label.

        Args:
            instance_id: Target instance.
            payload: JSON body to send.
            action: Verb phrase used in the error log message.

        Returns:
            True on a successful response, False when the API answered with
            an error status.
        """
        client = await self._get_client()
        try:
            response = await client.put(
                self._build_url(f"/instances/{instance_id}/"),
                json=payload,
                headers=self._auth_headers(),
            )
            response.raise_for_status()
        except httpx.HTTPStatusError as e:
            logger.error(
                "vast.ai API error %s instance %s: %s", action, instance_id, e
            )
            return False
        return True

    async def list_instances(self) -> List[InstanceInfo]:
        """List all instances of the account.

        Returns:
            One InstanceInfo per entry under the ``instances`` key; an empty
            list when the key is missing or null.

        Raises:
            httpx.HTTPStatusError: The API answered with an error status.
        """
        client = await self._get_client()
        try:
            response = await client.get(
                self._build_url("/instances/"), headers=self._auth_headers()
            )
            response.raise_for_status()
        except httpx.HTTPStatusError as e:
            logger.error("vast.ai API error listing instances: %s", e)
            raise

        data = response.json()
        entries = data.get("instances") if isinstance(data, dict) else None
        return [InstanceInfo.from_api_response(entry) for entry in (entries or [])]

    async def get_instance(self, instance_id: int) -> Optional[InstanceInfo]:
        """Fetch details of a specific instance.

        Returns:
            The InstanceInfo, or None when the API answers 404 or the
            response contains no usable instance object.

        Raises:
            httpx.HTTPStatusError: Any error status other than 404.
        """
        client = await self._get_client()
        try:
            response = await client.get(
                self._build_url(f"/instances/{instance_id}/"),
                headers=self._auth_headers(),
            )
            response.raise_for_status()
        except httpx.HTTPStatusError as e:
            if e.response.status_code == 404:
                return None
            logger.error(
                "vast.ai API error getting instance %s: %s", instance_id, e
            )
            raise

        data = response.json()
        if not isinstance(data, dict):
            return None

        if "instances" in data:
            payload = data["instances"]
            # The API returns a dict for a single instance and a list for
            # a collection.
            if isinstance(payload, list) and payload:
                return InstanceInfo.from_api_response(payload[0])
            if isinstance(payload, dict):
                # Add the id if the response did not carry one.
                payload.setdefault("id", instance_id)
                return InstanceInfo.from_api_response(payload)
            return None

        if "id" in data:
            # Instance object delivered at the top level.
            return InstanceInfo.from_api_response(data)
        return None

    async def start_instance(self, instance_id: int) -> bool:
        """Request start of a stopped instance.

        Returns:
            True when the request was accepted, False on an API error status.
        """
        accepted = await self._put_instance(
            instance_id, {"state": "running"}, "starting"
        )
        if accepted:
            logger.info("vast.ai instance %s start requested", instance_id)
        return accepted

    async def stop_instance(self, instance_id: int) -> bool:
        """Request stop of a running instance (the disk is kept).

        Returns:
            True when the request was accepted, False on an API error status.
        """
        accepted = await self._put_instance(
            instance_id, {"state": "stopped"}, "stopping"
        )
        if accepted:
            logger.info("vast.ai instance %s stop requested", instance_id)
        return accepted

    async def destroy_instance(self, instance_id: int) -> bool:
        """Delete an instance completely (the disk is lost!).

        Returns:
            True when the request was accepted, False on an API error status.
        """
        client = await self._get_client()
        try:
            response = await client.delete(
                self._build_url(f"/instances/{instance_id}/"),
                headers=self._auth_headers(),
            )
            response.raise_for_status()
        except httpx.HTTPStatusError as e:
            logger.error(
                "vast.ai API error destroying instance %s: %s", instance_id, e
            )
            return False
        logger.info("vast.ai instance %s destroyed", instance_id)
        return True

    async def set_label(self, instance_id: int, label: str) -> bool:
        """Set a label on an instance.

        Returns:
            True when the request was accepted, False on an API error status.
        """
        return await self._put_instance(
            instance_id, {"label": label}, "setting label on"
        )

    async def wait_for_status(
        self,
        instance_id: int,
        target_status: InstanceStatus,
        timeout_seconds: int = 300,
        poll_interval: float = 5.0,
    ) -> Optional[InstanceInfo]:
        """
        Poll until an instance reaches a given status.

        Args:
            instance_id: Instance to poll.
            target_status: Status to wait for.
            timeout_seconds: Maximum time to keep polling.
            poll_interval: Pause between polls, in seconds.

        Returns:
            InstanceInfo once the status is reached, None on timeout.
        """
        # Inside a coroutine the running loop is the one to take time from.
        loop = asyncio.get_running_loop()
        deadline = loop.time() + timeout_seconds

        while loop.time() < deadline:
            instance = await self.get_instance(instance_id)
            if instance and instance.status == target_status:
                return instance
            if instance:
                logger.debug(
                    "vast.ai instance %s status: %s, waiting for %s",
                    instance_id,
                    instance.status.value,
                    target_status.value,
                )
            await asyncio.sleep(poll_interval)

        logger.warning(
            "Timeout waiting for instance %s to reach %s",
            instance_id,
            target_status.value,
        )
        return None

    async def wait_for_health(
        self,
        instance: InstanceInfo,
        health_path: str = "/health",
        internal_port: int = 8001,
        timeout_seconds: int = 600,
        poll_interval: float = 5.0,
    ) -> bool:
        """
        Poll until the instance's health endpoint answers with a 2xx status.

        Args:
            instance: Instance whose endpoint URL is probed.
            health_path: Path of the health endpoint; a missing leading
                slash is added.
            internal_port: Internal port of the service to probe.
            timeout_seconds: Maximum time to keep polling.
            poll_interval: Pause between probes, in seconds.

        Returns:
            True once the health check passes, False on timeout or when no
            endpoint URL is available.
        """
        endpoint = instance.get_endpoint_url(internal_port)
        if not endpoint:
            logger.error("No endpoint URL available for health check")
            return False

        # Avoid gluing the path onto the port ("...:8001health").
        if not health_path.startswith("/"):
            health_path = f"/{health_path}"
        health_url = f"{endpoint.rstrip('/')}{health_path}"
        logger.info("Waiting for health at %s", health_url)

        loop = asyncio.get_running_loop()
        deadline = loop.time() + timeout_seconds

        # Separate short-timeout client: probes go to the instance itself,
        # not to the vast.ai API, and must not carry the API credentials.
        async with httpx.AsyncClient(timeout=5.0) as health_client:
            while loop.time() < deadline:
                try:
                    response = await health_client.get(health_url)
                except Exception as e:
                    # Best-effort probe: any failure just means "not yet".
                    logger.debug("Health check failed: %s", e)
                else:
                    if 200 <= response.status_code < 300:
                        logger.info("Health check passed: %s", health_url)
                        return True
                await asyncio.sleep(poll_interval)

        logger.warning("Health check timeout: %s", health_url)
        return False

    async def get_account_info(self) -> Optional[AccountInfo]:
        """
        Fetch account information including credit/budget.

        Returns:
            AccountInfo, or None on any error (the error is logged).
        """
        client = await self._get_client()
        try:
            response = await client.get(
                self._build_url("/users/current/"), headers=self._auth_headers()
            )
            response.raise_for_status()
            return AccountInfo.from_api_response(response.json())
        except httpx.HTTPStatusError as e:
            logger.error("vast.ai API error getting account info: %s", e)
            return None
        except Exception as e:
            logger.error("Error getting vast.ai account info: %s", e)
            return None