fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.
This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).
Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
10
backend/infra/__init__.py
Normal file
10
backend/infra/__init__.py
Normal file
@@ -0,0 +1,10 @@
|
||||
"""
|
||||
Infrastructure management module.
|
||||
|
||||
Provides control plane for external GPU resources (vast.ai).
|
||||
"""
|
||||
|
||||
from .vast_client import VastAIClient
|
||||
from .vast_power import router as vast_router
|
||||
|
||||
__all__ = ["VastAIClient", "vast_router"]
|
||||
419
backend/infra/vast_client.py
Normal file
419
backend/infra/vast_client.py
Normal file
@@ -0,0 +1,419 @@
|
||||
"""
|
||||
Vast.ai REST API Client.
|
||||
|
||||
Verwendet die offizielle vast.ai API statt CLI fuer mehr Stabilitaet.
|
||||
API Dokumentation: https://docs.vast.ai/api
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timezone
|
||||
from enum import Enum
|
||||
from typing import Optional, Dict, Any, List
|
||||
|
||||
import httpx
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class InstanceStatus(Enum):
    """Status values used by this client to describe a vast.ai instance."""

    # Instance is up.
    RUNNING = "running"
    # Instance has been stopped.
    STOPPED = "stopped"
    # Instance container has exited.
    EXITED = "exited"
    # Transitional states while an instance is being brought up.
    LOADING = "loading"
    SCHEDULING = "scheduling"
    CREATING = "creating"
    # Fallback for any status string this client does not recognise.
    UNKNOWN = "unknown"
|
||||
|
||||
|
||||
@dataclass
class AccountInfo:
    """Billing-related information about the vast.ai account."""

    credit: float  # current credit in USD
    balance: float  # balance (usually 0)
    total_spend: float  # total spend, stored as a non-negative amount
    username: str
    email: str
    has_billing: bool

    @classmethod
    def from_api_response(cls, data: Dict[str, Any]) -> "AccountInfo":
        """
        Build an AccountInfo from an API response dictionary.

        Missing keys and explicit ``null`` values both fall back to neutral
        defaults (0.0, "" or False), so a sparse response never raises.

        Args:
            data: Decoded JSON object returned by the account endpoint.

        Returns:
            The populated AccountInfo.
        """
        return cls(
            credit=data.get("credit") or 0.0,
            balance=data.get("balance") or 0.0,
            # The API reports spend as a negative number; abs() normalises it.
            # ``or 0.0`` guards against an explicit null, which abs() rejects.
            total_spend=abs(data.get("total_spend") or 0.0),
            username=data.get("username") or "",
            email=data.get("email") or "",
            has_billing=bool(data.get("has_billing")),
        )

    def to_dict(self) -> Dict[str, Any]:
        """Serialise all fields to a plain dictionary."""
        return {
            "credit": self.credit,
            "balance": self.balance,
            "total_spend": self.total_spend,
            "username": self.username,
            "email": self.email,
            "has_billing": self.has_billing,
        }
|
||||
|
||||
|
||||
@dataclass
class InstanceInfo:
    """Information about a single vast.ai instance."""

    id: int
    status: InstanceStatus
    machine_id: Optional[int] = None
    gpu_name: Optional[str] = None
    num_gpus: int = 1
    gpu_ram: Optional[float] = None  # GB
    cpu_ram: Optional[float] = None  # GB
    disk_space: Optional[float] = None  # GB
    dph_total: Optional[float] = None  # $/hour
    public_ipaddr: Optional[str] = None
    ports: Dict[str, Any] = field(default_factory=dict)
    label: Optional[str] = None
    image_uuid: Optional[str] = None
    started_at: Optional[datetime] = None

    @classmethod
    def from_api_response(cls, data: Dict[str, Any]) -> "InstanceInfo":
        """
        Build an InstanceInfo from an API response dictionary.

        Args:
            data: Decoded JSON object describing one instance.

        Returns:
            The populated InstanceInfo. An ``actual_status`` that is missing
            or not a known InstanceStatus value becomes UNKNOWN.
        """
        # Resolve the status through the enum itself so every member
        # (including STOPPED) is reachable and the mapping cannot drift.
        try:
            status = InstanceStatus(data.get("actual_status"))
        except ValueError:
            status = InstanceStatus.UNKNOWN

        # Only accept a dict port mapping; anything else is treated as "no ports".
        raw_ports = data.get("ports")
        ports = raw_ports if isinstance(raw_ports, dict) else {}

        # start_date is read as a POSIX timestamp; unusable values are ignored.
        started_at = None
        start_date = data.get("start_date")
        if start_date:
            try:
                started_at = datetime.fromtimestamp(start_date, tz=timezone.utc)
            except (ValueError, TypeError, OverflowError, OSError):
                started_at = None

        return cls(
            id=data.get("id", 0),
            status=status,
            machine_id=data.get("machine_id"),
            gpu_name=data.get("gpu_name"),
            num_gpus=data.get("num_gpus", 1),
            gpu_ram=data.get("gpu_ram"),
            cpu_ram=data.get("cpu_ram"),
            disk_space=data.get("disk_space"),
            dph_total=data.get("dph_total"),
            public_ipaddr=data.get("public_ipaddr"),
            ports=ports,
            label=data.get("label"),
            image_uuid=data.get("image_uuid"),
            started_at=started_at,
        )

    def get_endpoint_url(self, internal_port: int = 8001) -> Optional[str]:
        """
        Compute the external URL for an internal port.

        Args:
            internal_port: Port the service listens on inside the instance.

        Returns:
            ``http://<public_ipaddr>:<port>`` using the mapped host port when
            one is listed in ``ports``, otherwise the internal port itself.
            None when the instance has no public IP address.
        """
        if not self.public_ipaddr:
            return None

        # Internal ports are mapped onto external ports. Expected format:
        # {"8001/tcp": [{"HostIp": "0.0.0.0", "HostPort": "12345"}]}
        mapping = None
        if isinstance(self.ports, dict):
            mapping = self.ports.get(f"{internal_port}/tcp")
        if isinstance(mapping, list) and mapping and isinstance(mapping[0], dict):
            host_port = mapping[0].get("HostPort")
            if host_port:
                return f"http://{self.public_ipaddr}:{host_port}"

        # Fallback: assume the internal port is reachable directly.
        return f"http://{self.public_ipaddr}:{internal_port}"

    def to_dict(self) -> Dict[str, Any]:
        """Serialise to a plain dictionary (``started_at`` as ISO 8601 or None)."""
        # NOTE(review): image_uuid is not part of the serialised output; confirm
        # whether that omission is intentional before adding it.
        return {
            "id": self.id,
            "status": self.status.value,
            "machine_id": self.machine_id,
            "gpu_name": self.gpu_name,
            "num_gpus": self.num_gpus,
            "gpu_ram": self.gpu_ram,
            "cpu_ram": self.cpu_ram,
            "disk_space": self.disk_space,
            "dph_total": self.dph_total,
            "public_ipaddr": self.public_ipaddr,
            "ports": self.ports,
            "label": self.label,
            "started_at": self.started_at.isoformat() if self.started_at else None,
        }
|
||||
|
||||
|
||||
class VastAIClient:
    """
    Async client for the vast.ai REST API.

    Uses the official API at https://console.vast.ai/api/v0/.

    The API key is sent as an ``Authorization: Bearer`` header instead of an
    ``api_key`` query parameter. Request URLs therefore never contain the
    secret, so it cannot leak through exception messages or log lines that
    include the URL.
    """

    BASE_URL = "https://console.vast.ai/api/v0"

    def __init__(self, api_key: str, timeout: float = 30.0):
        """
        Args:
            api_key: vast.ai API key used to authenticate every request.
            timeout: Timeout in seconds passed to the underlying HTTP client.
        """
        self.api_key = api_key
        self.timeout = timeout
        self._client: Optional[httpx.AsyncClient] = None

    async def _get_client(self) -> httpx.AsyncClient:
        """Return the shared HTTP client, creating it lazily (or again after close)."""
        if self._client is None or self._client.is_closed:
            self._client = httpx.AsyncClient(
                timeout=self.timeout,
                headers={
                    "Accept": "application/json",
                    "Authorization": f"Bearer {self.api_key}",
                },
            )
        return self._client

    async def close(self) -> None:
        """Close the HTTP client if it is open."""
        if self._client and not self._client.is_closed:
            await self._client.aclose()
        self._client = None

    def _build_url(self, endpoint: str) -> str:
        """Build the full URL for an API endpoint path (no credentials included)."""
        return f"{self.BASE_URL}{endpoint}"

    async def _put_instance(self, instance_id: int, payload: Dict[str, Any], action: str) -> bool:
        """
        PUT ``payload`` to the instance resource.

        Args:
            instance_id: Numeric vast.ai instance ID.
            payload: JSON body to send.
            action: Verb phrase used in the error log (e.g. "starting").

        Returns:
            True on success, False when the API answers with an error status.
        """
        client = await self._get_client()
        url = self._build_url(f"/instances/{instance_id}/")
        try:
            response = await client.put(url, json=payload)
            response.raise_for_status()
        except httpx.HTTPStatusError as e:
            logger.error(f"vast.ai API error {action} instance {instance_id}: {e}")
            return False
        return True

    async def list_instances(self) -> List[InstanceInfo]:
        """
        List all instances of the account.

        Raises:
            httpx.HTTPStatusError: When the API answers with an error status.
        """
        client = await self._get_client()
        url = self._build_url("/instances/")

        try:
            response = await client.get(url)
            response.raise_for_status()
        except httpx.HTTPStatusError as e:
            logger.error(f"vast.ai API error listing instances: {e}")
            raise

        data = response.json()
        entries = data.get("instances") if isinstance(data, dict) else None
        return [InstanceInfo.from_api_response(entry) for entry in entries or []]

    async def get_instance(self, instance_id: int) -> Optional[InstanceInfo]:
        """
        Fetch details of one instance.

        Returns:
            The InstanceInfo, or None when the instance does not exist (HTTP
            404) or the response has no recognisable instance payload.

        Raises:
            httpx.HTTPStatusError: For error statuses other than 404.
        """
        client = await self._get_client()
        url = self._build_url(f"/instances/{instance_id}/")

        try:
            response = await client.get(url)
            response.raise_for_status()
        except httpx.HTTPStatusError as e:
            if e.response.status_code == 404:
                return None
            logger.error(f"vast.ai API error getting instance {instance_id}: {e}")
            raise

        data = response.json()
        if not isinstance(data, dict):
            return None

        if "instances" in data:
            instances = data["instances"]
            # The API returns a dict for a single instance and a list for a listing.
            if isinstance(instances, list) and instances:
                return InstanceInfo.from_api_response(instances[0])
            if isinstance(instances, dict):
                # Fill in the ID in case the payload does not carry it.
                instances.setdefault("id", instance_id)
                return InstanceInfo.from_api_response(instances)
        elif "id" in data:
            return InstanceInfo.from_api_response(data)

        return None

    async def start_instance(self, instance_id: int) -> bool:
        """Request the start of a stopped instance; return True if the API accepted it."""
        started = await self._put_instance(instance_id, {"state": "running"}, "starting")
        if started:
            logger.info(f"vast.ai instance {instance_id} start requested")
        return started

    async def stop_instance(self, instance_id: int) -> bool:
        """Request the stop of a running instance (the disk is kept); return True if accepted."""
        stopped = await self._put_instance(instance_id, {"state": "stopped"}, "stopping")
        if stopped:
            logger.info(f"vast.ai instance {instance_id} stop requested")
        return stopped

    async def destroy_instance(self, instance_id: int) -> bool:
        """Delete an instance completely (the disk is lost!); return True on success."""
        client = await self._get_client()
        url = self._build_url(f"/instances/{instance_id}/")

        try:
            response = await client.delete(url)
            response.raise_for_status()
        except httpx.HTTPStatusError as e:
            logger.error(f"vast.ai API error destroying instance {instance_id}: {e}")
            return False

        logger.info(f"vast.ai instance {instance_id} destroyed")
        return True

    async def set_label(self, instance_id: int, label: str) -> bool:
        """Set a label on an instance; return True on success."""
        return await self._put_instance(instance_id, {"label": label}, "setting label on")

    async def wait_for_status(
        self,
        instance_id: int,
        target_status: InstanceStatus,
        timeout_seconds: int = 300,
        poll_interval: float = 5.0,
    ) -> Optional[InstanceInfo]:
        """
        Poll until an instance reaches a given status.

        Args:
            instance_id: Numeric vast.ai instance ID.
            target_status: Status to wait for.
            timeout_seconds: Maximum time to keep polling.
            poll_interval: Seconds to sleep between polls.

        Returns:
            InstanceInfo once the status is reached, None on timeout.
        """
        # get_running_loop() is the supported way to reach the loop from a coroutine.
        loop = asyncio.get_running_loop()
        deadline = loop.time() + timeout_seconds

        while loop.time() < deadline:
            instance = await self.get_instance(instance_id)

            if instance and instance.status == target_status:
                return instance

            if instance:
                logger.debug(
                    f"vast.ai instance {instance_id} status: {instance.status.value}, "
                    f"waiting for {target_status.value}"
                )

            await asyncio.sleep(poll_interval)

        logger.warning(
            f"Timeout waiting for instance {instance_id} to reach {target_status.value}"
        )
        return None

    async def wait_for_health(
        self,
        instance: InstanceInfo,
        health_path: str = "/health",
        internal_port: int = 8001,
        timeout_seconds: int = 600,
        poll_interval: float = 5.0,
    ) -> bool:
        """
        Poll until the instance's health endpoint answers with a 2xx status.

        Args:
            instance: Instance whose endpoint is probed.
            health_path: Path appended to the endpoint base URL.
            internal_port: Internal port used to resolve the endpoint URL.
            timeout_seconds: Maximum time to keep polling.
            poll_interval: Seconds to sleep between probes.

        Returns:
            True when the health check passed, False on timeout or when no
            endpoint URL can be derived for the instance.
        """
        endpoint = instance.get_endpoint_url(internal_port)
        if not endpoint:
            logger.error("No endpoint URL available for health check")
            return False

        health_url = f"{endpoint.rstrip('/')}{health_path}"
        logger.info(f"Waiting for health at {health_url}")

        loop = asyncio.get_running_loop()
        deadline = loop.time() + timeout_seconds

        # A separate short-timeout client without the API headers is used here:
        # the probe goes to the instance itself, not to the vast.ai API.
        async with httpx.AsyncClient(timeout=5.0) as health_client:
            while loop.time() < deadline:
                try:
                    response = await health_client.get(health_url)
                    if 200 <= response.status_code < 300:
                        logger.info(f"Health check passed: {health_url}")
                        return True
                except Exception as e:
                    # Best effort by design: any probe failure just means "not ready yet".
                    logger.debug(f"Health check failed: {e}")

                await asyncio.sleep(poll_interval)

        logger.warning(f"Health check timeout: {health_url}")
        return False

    async def get_account_info(self) -> Optional[AccountInfo]:
        """
        Fetch account information including credit/budget.

        Returns:
            AccountInfo, or None on any error (errors are logged, not raised).
        """
        client = await self._get_client()
        url = self._build_url("/users/current/")

        try:
            response = await client.get(url)
            response.raise_for_status()
            return AccountInfo.from_api_response(response.json())
        except httpx.HTTPStatusError as e:
            logger.error(f"vast.ai API error getting account info: {e}")
            return None
        except Exception as e:
            logger.error(f"Error getting vast.ai account info: {e}")
            return None
|
||||
618
backend/infra/vast_power.py
Normal file
618
backend/infra/vast_power.py
Normal file
@@ -0,0 +1,618 @@
|
||||
"""
|
||||
Vast.ai Power Control API.
|
||||
|
||||
Stellt Endpoints bereit fuer:
|
||||
- Start/Stop von vast.ai Instanzen
|
||||
- Status-Abfrage
|
||||
- Auto-Shutdown bei Inaktivitaet
|
||||
- Kosten-Tracking
|
||||
|
||||
Sicherheit: Alle Endpoints erfordern CONTROL_API_KEY.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Optional, Dict, Any, List
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, Header, BackgroundTasks
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from .vast_client import VastAIClient, InstanceInfo, InstanceStatus, AccountInfo
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
router = APIRouter(prefix="/infra/vast", tags=["Infrastructure"])
|
||||
|
||||
|
||||
# -------------------------
|
||||
# Configuration (ENV)
|
||||
# -------------------------
|
||||
VAST_API_KEY = os.environ.get("VAST_API_KEY")
VAST_INSTANCE_ID = os.environ.get("VAST_INSTANCE_ID")  # numeric instance ID (kept as a string here)
CONTROL_API_KEY = os.environ.get("CONTROL_API_KEY")  # admin key protecting these endpoints

# Health check configuration
VAST_HEALTH_PORT = int(os.environ.get("VAST_HEALTH_PORT", "8001"))
VAST_HEALTH_PATH = os.environ.get("VAST_HEALTH_PATH", "/health")
VAST_WAIT_TIMEOUT_S = int(os.environ.get("VAST_WAIT_TIMEOUT_S", "600"))  # default: 10 minutes

# Auto-shutdown configuration
AUTO_SHUTDOWN_ENABLED = os.environ.get("VAST_AUTO_SHUTDOWN", "true").lower() == "true"
AUTO_SHUTDOWN_MINUTES = int(os.environ.get("VAST_AUTO_SHUTDOWN_MINUTES", "30"))

# State persistence (defaults live in /tmp for container compatibility)
STATE_PATH = Path(os.environ.get("VAST_STATE_PATH", "/tmp/vast_state.json"))
AUDIT_PATH = Path(os.environ.get("VAST_AUDIT_PATH", "/tmp/vast_audit.log"))
|
||||
|
||||
|
||||
# -------------------------
|
||||
# State Management
|
||||
# -------------------------
|
||||
class VastState:
    """
    Persistent state for vast.ai control, stored as a JSON file.

    Stores:
    - Current endpoint (because the IP can change)
    - Last activity (for auto-shutdown)
    - Cost tracking
    """

    def __init__(self, path: Path = STATE_PATH):
        """
        Args:
            path: Location of the JSON state file; loaded immediately.
        """
        self.path = path
        self._state: Dict[str, Any] = self._load()

    @staticmethod
    def _default_state() -> Dict[str, Any]:
        """Return a fresh state dictionary with every known key set to its default."""
        return {
            "desired_state": None,
            "endpoint_base_url": None,
            "last_activity": None,
            "last_start": None,
            "last_stop": None,
            "total_runtime_seconds": 0,
            "total_cost_usd": 0.0,
        }

    @staticmethod
    def _parse_timestamp(value: Any) -> Optional[datetime]:
        """Parse a stored ISO 8601 timestamp; return None if absent or malformed."""
        if not value:
            return None
        try:
            parsed = datetime.fromisoformat(value)
        except (TypeError, ValueError):
            return None
        # Every timestamp this class writes is UTC-aware. A naive value is
        # interpreted as UTC so it can be subtracted from aware datetimes.
        if parsed.tzinfo is None:
            parsed = parsed.replace(tzinfo=timezone.utc)
        return parsed

    def _load(self) -> Dict[str, Any]:
        """
        Load the state from disk.

        A missing, unreadable or malformed file yields the defaults. A valid
        file is layered on top of the defaults so every known key exists.
        """
        state = self._default_state()
        if not self.path.exists():
            return state
        try:
            loaded = json.loads(self.path.read_text(encoding="utf-8"))
        except (OSError, ValueError) as exc:
            logger.warning(f"Could not read vast state file {self.path}: {exc}")
            return state
        if isinstance(loaded, dict):
            state.update(loaded)
        else:
            logger.warning(f"Ignoring vast state file {self.path}: top-level value is not an object")
        return state

    def _save(self) -> None:
        """Write the state to disk via a temporary file and rename."""
        self.path.parent.mkdir(parents=True, exist_ok=True)
        # Write-then-replace so an interrupted write cannot leave a truncated
        # state file behind (which would reset the cost tracking on next load).
        tmp_path = self.path.with_name(self.path.name + ".tmp")
        tmp_path.write_text(
            json.dumps(self._state, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
        tmp_path.replace(self.path)

    def get(self, key: str, default: Any = None) -> Any:
        """Return the stored value for ``key``, or ``default`` if the key is absent."""
        return self._state.get(key, default)

    def set(self, key: str, value: Any) -> None:
        """Store ``value`` under ``key`` and persist the state."""
        self._state[key] = value
        self._save()

    def update(self, data: Dict[str, Any]) -> None:
        """Merge ``data`` into the state and persist it."""
        self._state.update(data)
        self._save()

    def record_activity(self) -> None:
        """Record the current time as the last activity (used for auto-shutdown)."""
        self._state["last_activity"] = datetime.now(timezone.utc).isoformat()
        self._save()

    def get_last_activity(self) -> Optional[datetime]:
        """Return the last activity as a datetime, or None if unset or malformed."""
        return self._parse_timestamp(self._state.get("last_activity"))

    def record_start(self) -> None:
        """Record the current time as the start time and mark the desired state RUNNING."""
        self._state["last_start"] = datetime.now(timezone.utc).isoformat()
        self._state["desired_state"] = "RUNNING"
        self._save()

    def record_stop(self, dph_total: Optional[float] = None) -> None:
        """
        Record the stop time and add the finished session to the totals.

        Runtime (and cost, when ``dph_total`` is given) is added only if the
        session that began at ``last_start`` has not been closed by an earlier
        stop, so repeated stop calls do not count the same session twice.

        Args:
            dph_total: Price in USD per hour used to compute the session cost.
        """
        now = datetime.now(timezone.utc)
        start_dt = self._parse_timestamp(self._state.get("last_start"))
        previous_stop = self._parse_timestamp(self._state.get("last_stop"))

        self._state["last_stop"] = now.isoformat()
        self._state["desired_state"] = "STOPPED"

        # A previous stop at or after last_start means this session is already counted.
        already_accounted = (
            start_dt is not None and previous_stop is not None and previous_stop >= start_dt
        )

        if start_dt is not None and not already_accounted:
            runtime_seconds = (now - start_dt).total_seconds()
            self._state["total_runtime_seconds"] = (
                self._state.get("total_runtime_seconds", 0) + runtime_seconds
            )

            if dph_total:
                hours = runtime_seconds / 3600
                cost = hours * dph_total
                self._state["total_cost_usd"] = (
                    self._state.get("total_cost_usd", 0.0) + cost
                )
                logger.info(
                    f"Session cost: ${cost:.3f} ({runtime_seconds/60:.1f} min @ ${dph_total}/h)"
                )

        self._save()
|
||||
|
||||
|
||||
# Global state instance
|
||||
_state = VastState()
|
||||
|
||||
|
||||
# -------------------------
|
||||
# Audit Logging
|
||||
# -------------------------
|
||||
def audit_log(event: str, actor: str = "system", meta: Optional[Dict[str, Any]] = None) -> None:
    """
    Append one audit entry to the audit log file.

    The entry is a single JSON line with the UTC timestamp, the event name,
    the actor and the metadata. The event is also reported to the module logger.

    Args:
        event: Name of the audited event.
        actor: Who triggered the event.
        meta: Optional extra data; an empty dict is written when omitted.
    """
    entry = {
        "ts": datetime.now(timezone.utc).isoformat(),
        "event": event,
        "actor": actor,
        "meta": meta if meta else {},
    }
    # Serialise before touching the filesystem, matching the original order of effects.
    serialized = json.dumps(entry, ensure_ascii=False)

    AUDIT_PATH.parent.mkdir(parents=True, exist_ok=True)
    with AUDIT_PATH.open("a", encoding="utf-8") as handle:
        handle.write(serialized + "\n")

    logger.info(f"AUDIT: {event} by {actor}")
|
||||
|
||||
|
||||
# -------------------------
|
||||
# Request/Response Models
|
||||
# -------------------------
|
||||
class PowerOnRequest(BaseModel):
    # Request body for the power-on endpoint.
    # Whether to block until the health endpoint answers.
    wait_for_health: bool = Field(True, description="Warten bis LLM bereit")
    # Health endpoint path and internal port; defaults come from the module configuration.
    health_path: str = Field(VAST_HEALTH_PATH)
    health_port: int = Field(VAST_HEALTH_PORT)
|
||||
|
||||
|
||||
class PowerOnResponse(BaseModel):
    # Response body of the power-on endpoint.
    status: str  # e.g. "running", "starting", "running_unhealthy"
    instance_id: Optional[int] = None
    endpoint_base_url: Optional[str] = None
    health_url: Optional[str] = None
    message: Optional[str] = None
|
||||
|
||||
|
||||
class PowerOffRequest(BaseModel):
    # Request body for the power-off endpoint; no parameters are needed.
    pass
|
||||
|
||||
|
||||
class PowerOffResponse(BaseModel):
    # Response body of the power-off endpoint.
    status: str
    # Runtime and cost of the session that was just ended.
    session_runtime_minutes: Optional[float] = None
    session_cost_usd: Optional[float] = None
    message: Optional[str] = None
|
||||
|
||||
|
||||
class VastStatusResponse(BaseModel):
    # Response body of the status endpoint.
    instance_id: Optional[int] = None
    status: str
    gpu_name: Optional[str] = None
    dph_total: Optional[float] = None
    endpoint_base_url: Optional[str] = None
    last_activity: Optional[str] = None
    auto_shutdown_in_minutes: Optional[int] = None
    total_runtime_hours: Optional[float] = None
    total_cost_usd: Optional[float] = None

    # Budget / credit information
    account_credit: Optional[float] = None  # remaining credit in USD
    account_total_spend: Optional[float] = None  # total spend on vast.ai

    # Session cost (since the last start)
    session_runtime_minutes: Optional[float] = None
    session_cost_usd: Optional[float] = None

    message: Optional[str] = None
|
||||
|
||||
|
||||
class CostStatsResponse(BaseModel):
    # Aggregated cost statistics.
    total_runtime_hours: float
    total_cost_usd: float
    sessions_count: int
    avg_session_minutes: float
|
||||
|
||||
|
||||
# -------------------------
|
||||
# Security Dependency
|
||||
# -------------------------
|
||||
def require_control_key(x_api_key: Optional[str] = Header(default=None)) -> None:
    """
    Admin protection for the control endpoints.

    Header: X-API-Key: <CONTROL_API_KEY>

    Raises:
        HTTPException: 500 when CONTROL_API_KEY is not configured on the
            server, 401 when the supplied key is missing or does not match.
    """
    import hmac  # local import: keeps this block self-contained

    if not CONTROL_API_KEY:
        raise HTTPException(
            status_code=500,
            detail="CONTROL_API_KEY not configured on server",
        )

    # Compare in constant time so response timing does not reveal how much of
    # the key matched. Encoding to bytes lets compare_digest accept non-ASCII
    # input; a missing header is compared as an empty key and rejected.
    supplied = (x_api_key or "").encode("utf-8")
    expected = CONTROL_API_KEY.encode("utf-8")
    if not hmac.compare_digest(supplied, expected):
        raise HTTPException(status_code=401, detail="Unauthorized")
|
||||
|
||||
|
||||
# -------------------------
|
||||
# Auto-Shutdown Background Task
|
||||
# -------------------------
|
||||
_shutdown_task: Optional[asyncio.Task] = None
|
||||
|
||||
|
||||
async def auto_shutdown_monitor() -> None:
    """
    Background task that stops the instance after a period of inactivity.

    Checks every 60 seconds whether activity was recorded and stops the
    instance once there has been none for AUTO_SHUTDOWN_MINUTES.

    After a successful shutdown the monitor stays alive but idle until a newer
    activity timestamp is recorded, so one idle period triggers exactly one
    stop and one cost booking. A failed stop or a transient error in a single
    check is logged and retried on the next tick.
    """
    if not VAST_API_KEY or not VAST_INSTANCE_ID:
        return

    client = VastAIClient(VAST_API_KEY)
    # Activity timestamp that already led to a successful shutdown.
    handled_activity: Optional[datetime] = None

    try:
        instance_id = int(VAST_INSTANCE_ID)

        while True:
            await asyncio.sleep(60)  # check every minute

            if not AUTO_SHUTDOWN_ENABLED:
                continue

            try:
                last_activity = _state.get_last_activity()
                if not last_activity or last_activity == handled_activity:
                    continue

                # Compute the inactivity period.
                now = datetime.now(timezone.utc)
                inactive_minutes = (now - last_activity).total_seconds() / 60
                if inactive_minutes < AUTO_SHUTDOWN_MINUTES:
                    continue

                logger.info(
                    f"Auto-shutdown triggered: {inactive_minutes:.1f} min inactive"
                )
                audit_log(
                    "auto_shutdown",
                    actor="system",
                    meta={"inactive_minutes": inactive_minutes},
                )

                # Fetch current instance info for the cost calculation.
                instance = await client.get_instance(instance_id)
                dph = instance.dph_total if instance else None

                # Only book the stop if the API actually accepted it.
                if not await client.stop_instance(instance_id):
                    logger.warning(
                        f"Auto-shutdown: stopping instance {instance_id} failed, will retry"
                    )
                    continue

                _state.record_stop(dph_total=dph)
                handled_activity = last_activity

                audit_log("auto_shutdown_complete", actor="system")

            except asyncio.CancelledError:
                raise
            except Exception as e:
                # One failed check must not end the monitor: a dead monitor
                # leaves a running instance without auto-shutdown.
                logger.error(f"Auto-shutdown monitor error: {e}")

    except asyncio.CancelledError:
        pass
    except Exception as e:
        logger.error(f"Auto-shutdown monitor error: {e}")
    finally:
        await client.close()
|
||||
|
||||
|
||||
def start_auto_shutdown_monitor() -> None:
    """Launch the auto-shutdown monitor task unless one is already active."""
    global _shutdown_task

    already_active = _shutdown_task is not None and not _shutdown_task.done()
    if already_active:
        return

    _shutdown_task = asyncio.create_task(auto_shutdown_monitor())
    logger.info("Auto-shutdown monitor started")
|
||||
|
||||
|
||||
def stop_auto_shutdown_monitor() -> None:
    """Cancel the auto-shutdown monitor task if it is currently active."""
    global _shutdown_task

    if not _shutdown_task or _shutdown_task.done():
        return

    _shutdown_task.cancel()
    logger.info("Auto-shutdown monitor stopped")
|
||||
|
||||
|
||||
# -------------------------
|
||||
# API Endpoints
|
||||
# -------------------------
|
||||
|
||||
@router.get("/status", response_model=VastStatusResponse, dependencies=[Depends(require_control_key)])
async def get_status() -> VastStatusResponse:
    """
    Return the status of the vast.ai instance.

    Includes:
    - Current status (running/stopped/etc)
    - GPU info and cost per hour
    - Endpoint URL
    - Auto-shutdown timer
    - Total cost
    - Account credit (remaining budget)
    - Session cost (since the last start)
    """
    if not (VAST_API_KEY and VAST_INSTANCE_ID):
        return VastStatusResponse(
            status="unconfigured",
            message="VAST_API_KEY or VAST_INSTANCE_ID not set",
        )

    client = VastAIClient(VAST_API_KEY)
    try:
        instance = await client.get_instance(int(VAST_INSTANCE_ID))
        if not instance:
            return VastStatusResponse(
                instance_id=int(VAST_INSTANCE_ID),
                status="not_found",
                message=f"Instance {VAST_INSTANCE_ID} not found",
            )

        # Account info for budget/credit; None when it could not be fetched.
        account_info = await client.get_account_info()
        if account_info:
            account_credit = account_info.credit
            account_total_spend = account_info.total_spend
        else:
            account_credit = None
            account_total_spend = None

        is_running = instance.status == InstanceStatus.RUNNING

        # Refresh the stored endpoint while the instance is running.
        endpoint = None
        if is_running:
            endpoint = instance.get_endpoint_url(VAST_HEALTH_PORT)
            if endpoint:
                _state.set("endpoint_base_url", endpoint)

        # Minutes left until auto-shutdown, never below zero.
        auto_shutdown_minutes = None
        if AUTO_SHUTDOWN_ENABLED and is_running:
            last_activity = _state.get_last_activity()
            if last_activity:
                idle_minutes = (datetime.now(timezone.utc) - last_activity).total_seconds() / 60
                auto_shutdown_minutes = max(0, int(AUTO_SHUTDOWN_MINUTES - idle_minutes))

        # Current session cost (only while the instance is running).
        session_runtime_minutes = None
        session_cost_usd = None
        last_start = _state.get("last_start")

        # Instance is running but no last_start is stored (e.g. after a
        # container restart): use the API's start date if present, else now.
        if is_running and not last_start:
            if instance.started_at:
                last_start = instance.started_at.isoformat()
                _state.set("last_start", last_start)
            else:
                _state.record_start()
                last_start = _state.get("last_start")

        if last_start and is_running:
            session_begin = datetime.fromisoformat(last_start)
            session_runtime_minutes = (datetime.now(timezone.utc) - session_begin).total_seconds() / 60
            if instance.dph_total:
                session_cost_usd = (session_runtime_minutes / 60) * instance.dph_total

        return VastStatusResponse(
            instance_id=instance.id,
            status=instance.status.value,
            gpu_name=instance.gpu_name,
            dph_total=instance.dph_total,
            endpoint_base_url=endpoint or _state.get("endpoint_base_url"),
            last_activity=_state.get("last_activity"),
            auto_shutdown_in_minutes=auto_shutdown_minutes,
            total_runtime_hours=_state.get("total_runtime_seconds", 0) / 3600,
            total_cost_usd=_state.get("total_cost_usd", 0.0),
            account_credit=account_credit,
            account_total_spend=account_total_spend,
            session_runtime_minutes=session_runtime_minutes,
            session_cost_usd=session_cost_usd,
        )
    finally:
        await client.close()
|
||||
|
||||
|
||||
@router.post("/power/on", response_model=PowerOnResponse, dependencies=[Depends(require_control_key)])
async def power_on(
    payload: PowerOnRequest,
    background_tasks: BackgroundTasks,
) -> PowerOnResponse:
    """
    Start the vast.ai instance.

    1. Request the start via the API
    2. Start the auto-shutdown monitor
    3. Wait for status RUNNING
    4. Optionally wait for the health endpoint

    The monitor is started as soon as the start request is accepted, so the
    "starting" and "running_unhealthy" outcomes are also covered by
    auto-shutdown.

    Raises:
        HTTPException: 500 when the vast.ai configuration is missing,
            502 when the start request is rejected by the API.
    """
    if not VAST_API_KEY or not VAST_INSTANCE_ID:
        raise HTTPException(
            status_code=500,
            detail="VAST_API_KEY or VAST_INSTANCE_ID not configured",
        )

    instance_id = int(VAST_INSTANCE_ID)
    audit_log("power_on_requested", meta={"instance_id": instance_id})

    client = VastAIClient(VAST_API_KEY)
    try:
        # Start instance
        success = await client.start_instance(instance_id)
        if not success:
            raise HTTPException(status_code=502, detail="Failed to start instance")

        _state.record_start()
        _state.record_activity()

        # From here on the instance may be billed, so arm the monitor before
        # any of the early returns below.
        start_auto_shutdown_monitor()

        # Wait for running status
        instance = await client.wait_for_status(
            instance_id,
            InstanceStatus.RUNNING,
            timeout_seconds=300,
        )

        if not instance:
            return PowerOnResponse(
                status="starting",
                instance_id=instance_id,
                message="Instance start requested but not yet running. Check status.",
            )

        # Get endpoint
        endpoint = instance.get_endpoint_url(payload.health_port)
        if endpoint:
            _state.set("endpoint_base_url", endpoint)

        # Wait for health if requested
        if payload.wait_for_health:
            health_ok = await client.wait_for_health(
                instance,
                health_path=payload.health_path,
                internal_port=payload.health_port,
                timeout_seconds=VAST_WAIT_TIMEOUT_S,
            )

            if not health_ok:
                audit_log("power_on_health_timeout", meta={"instance_id": instance_id})
                return PowerOnResponse(
                    status="running_unhealthy",
                    instance_id=instance_id,
                    endpoint_base_url=endpoint,
                    message=f"Instance running but health check failed at {endpoint}{payload.health_path}",
                )

        audit_log("power_on_complete", meta={
            "instance_id": instance_id,
            "endpoint": endpoint,
        })

        # Only claim "healthy" when the health check actually ran.
        if payload.wait_for_health:
            message = "Instance running and healthy"
        else:
            message = "Instance running (health check skipped)"

        return PowerOnResponse(
            status="running",
            instance_id=instance_id,
            endpoint_base_url=endpoint,
            health_url=f"{endpoint}{payload.health_path}" if endpoint else None,
            message=message,
        )

    finally:
        await client.close()
|
||||
|
||||
|
||||
def _session_minutes_since(last_start: Any) -> float:
    """Return the minutes elapsed since the ISO-8601 timestamp *last_start*.

    Returns 0.0 when no start was recorded or the stored value cannot be
    parsed, so that bad persisted state never prevents a power-off.
    """
    if not last_start:
        return 0.0
    try:
        start_dt = datetime.fromisoformat(last_start)
    except (TypeError, ValueError):
        return 0.0
    if start_dt.tzinfo is None:
        # A naive timestamp cannot be subtracted from an aware "now";
        # treat it as UTC instead of raising TypeError.
        start_dt = start_dt.replace(tzinfo=timezone.utc)
    elapsed_seconds = (datetime.now(timezone.utc) - start_dt).total_seconds()
    # Clamp so clock skew never yields a negative runtime or cost.
    return max(elapsed_seconds, 0.0) / 60


@router.post("/power/off", response_model=PowerOffResponse, dependencies=[Depends(require_control_key)])
async def power_off(payload: PowerOffRequest) -> PowerOffResponse:
    """
    Stop the vast.ai instance (the disk is kept).

    Computes the runtime and cost of the session that is being ended and
    records the stop in the persisted state and the audit log.

    Raises:
        HTTPException: 500 if VAST_API_KEY or VAST_INSTANCE_ID is not
            configured, 502 if the stop request was not successful.
    """
    if not VAST_API_KEY or not VAST_INSTANCE_ID:
        raise HTTPException(
            status_code=500,
            detail="VAST_API_KEY or VAST_INSTANCE_ID not configured",
        )

    instance_id = int(VAST_INSTANCE_ID)
    audit_log("power_off_requested", meta={"instance_id": instance_id})

    client = VastAIClient(VAST_API_KEY)
    try:
        # Fetch the current hourly price for the cost calculation.
        instance = await client.get_instance(instance_id)
        dph = instance.dph_total if instance else None

        # Session stats must be computed before the state is updated.
        session_runtime = _session_minutes_since(_state.get("last_start"))
        session_cost = (session_runtime / 60) * dph if dph else 0.0

        success = await client.stop_instance(instance_id)
        if not success:
            raise HTTPException(status_code=502, detail="Failed to stop instance")

        # Stop the auto-shutdown monitor only once the instance is really
        # stopped; if the stop request fails the monitor stays active and
        # can still shut the (still billing) instance down later.
        stop_auto_shutdown_monitor()

        _state.record_stop(dph_total=dph)

        audit_log("power_off_complete", meta={
            "instance_id": instance_id,
            "session_minutes": session_runtime,
            "session_cost": session_cost,
        })

        return PowerOffResponse(
            status="stopped",
            session_runtime_minutes=session_runtime,
            session_cost_usd=session_cost,
            message=f"Instance stopped. Session: {session_runtime:.1f} min, ${session_cost:.3f}",
        )
    finally:
        await client.close()
|
||||
|
||||
|
||||
@router.post("/activity", dependencies=[Depends(require_control_key)])
async def record_activity() -> Dict[str, str]:
    """
    Record activity (delays the auto-shutdown).

    Should be called by the LLM gateway on every request.
    """
    _state.record_activity()
    # Read the timestamp back from the state after recording it.
    latest_activity = _state.get("last_activity")
    return dict(status="recorded", last_activity=latest_activity)
|
||||
|
||||
|
||||
@router.get("/costs", response_model=CostStatsResponse, dependencies=[Depends(require_control_key)])
async def get_costs() -> CostStatsResponse:
    """
    Return cost statistics built from the accumulated runtime and cost
    values held in the state.
    """
    runtime_seconds = _state.get("total_runtime_seconds", 0)
    accumulated_cost = _state.get("total_cost_usd", 0.0)

    # TODO: Sessions count from audit log
    # Until then, any recorded runtime is counted as exactly one session.
    if runtime_seconds > 0:
        session_count = 1
        mean_session_minutes = runtime_seconds / 60 / session_count
    else:
        session_count = 0
        mean_session_minutes = 0

    return CostStatsResponse(
        total_runtime_hours=runtime_seconds / 3600,
        total_cost_usd=accumulated_cost,
        sessions_count=session_count,
        avg_session_minutes=mean_session_minutes,
    )
|
||||
|
||||
|
||||
@router.get("/audit", dependencies=[Depends(require_control_key)])
async def get_audit_log(limit: int = 50) -> List[Dict[str, Any]]:
    """
    Return the most recent audit log entries, newest first.

    Args:
        limit: Maximum number of log lines to consider, counted from the
            end of the file. Values <= 0 yield an empty list.

    Returns:
        The parsed JSON entries of the last ``limit`` lines; lines that are
        not valid JSON are skipped. Empty if the audit file does not exist.
    """
    # Guard explicitly: lines[-0:] would be the WHOLE list, and a negative
    # limit would slice from the front instead of limiting the result.
    if limit <= 0:
        return []

    try:
        raw = AUDIT_PATH.read_text(encoding="utf-8")
    except FileNotFoundError:
        # No audit file written yet (also covers the file vanishing between
        # an existence check and the read).
        return []

    entries = []
    for line in raw.strip().split("\n")[-limit:]:
        try:
            entries.append(json.loads(line))
        except json.JSONDecodeError:
            # Skip blank or corrupt lines instead of failing the request.
            continue

    entries.reverse()  # newest first
    return entries
|
||||
Reference in New Issue
Block a user