fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.
This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).
Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
455
backend/gpu_test_api.py
Normal file
455
backend/gpu_test_api.py
Normal file
@@ -0,0 +1,455 @@
|
||||
"""
|
||||
GPU Infrastructure Test API - Test Runner fuer CUDA/ROCm GPU Management
|
||||
Endpoint: /api/admin/gpu-tests
|
||||
"""
|
||||
|
||||
import asyncio
import os
import subprocess
import sys
import time
from typing import List, Optional, Literal

import httpx
from fastapi import APIRouter
from pydantic import BaseModel
|
||||
|
||||
# All endpoints in this module are mounted under /api/admin/gpu-tests.
router = APIRouter(prefix="/api/admin/gpu-tests", tags=["GPU Tests"])
|
||||
|
||||
# ==============================================
|
||||
# Models
|
||||
# ==============================================
|
||||
|
||||
class TestResult(BaseModel):
    """Outcome of a single GPU infrastructure check."""

    name: str                       # human-readable test name (German UI strings)
    description: str                # what the test verifies
    expected: str                   # expected outcome, shown to the user
    actual: str                     # observed outcome, shown to the user
    # "skipped" means the precondition (tool/GPU/key) is absent — not a failure.
    status: Literal["passed", "failed", "pending", "skipped"]
    duration_ms: float              # wall-clock time of the check in milliseconds
    error_message: Optional[str] = None  # set for "failed"/"skipped" branches
|
||||
|
||||
|
||||
class TestCategoryResult(BaseModel):
    """Aggregated results for one test category (e.g. "detection")."""

    category: str                   # machine id, matches the /{category} path param
    display_name: str               # label for the dashboard UI
    description: str
    tests: List[TestResult]
    passed: int                     # count of tests with status == "passed"
    failed: int                     # count of tests with status == "failed"
    total: int                      # len(tests); skipped tests count here only
|
||||
|
||||
|
||||
class FullTestResults(BaseModel):
    """Combined results of all categories, returned by /run-all."""

    categories: List[TestCategoryResult]
    total_passed: int               # sum of per-category passed counts
    total_failed: int               # sum of per-category failed counts
    total_tests: int                # sum of per-category totals
    duration_ms: float              # wall-clock time for the whole run
|
||||
|
||||
|
||||
# ==============================================
|
||||
# Configuration
|
||||
# ==============================================
|
||||
|
||||
# Base URL of the backend whose GPU endpoints are probed (test_gpu_api_endpoint).
BACKEND_URL = os.getenv("BACKEND_URL", "http://localhost:8000")
# vast.ai API key; when empty, the cloud test is reported as "skipped".
VAST_API_KEY = os.getenv("VAST_API_KEY", "")
|
||||
|
||||
|
||||
# ==============================================
|
||||
# Test Implementations
|
||||
# ==============================================
|
||||
|
||||
async def test_nvidia_smi() -> TestResult:
    """Check whether NVIDIA GPUs are detected via the nvidia-smi CLI.

    Returns a TestResult with status:
      * "passed"  - nvidia-smi ran and reported at least one GPU
      * "skipped" - tool missing or no GPU present (not treated as a failure)
      * "failed"  - unexpected error while probing
    """
    start = time.time()

    def _result(actual, status, duration_ms, error_message=None):
        # name/description/expected are identical in every branch; centralizing
        # them here removes the 4x duplication of the original.
        return TestResult(
            name="NVIDIA GPU Erkennung",
            description="Prueft ob NVIDIA GPUs via nvidia-smi erkannt werden",
            expected="GPU-Informationen verfuegbar",
            actual=actual,
            status=status,
            duration_ms=duration_ms,
            error_message=error_message,
        )

    try:
        result = subprocess.run(
            ["nvidia-smi", "--query-gpu=name,memory.total,driver_version",
             "--format=csv,noheader"],
            capture_output=True,
            text=True,
            timeout=10,
        )
        duration = (time.time() - start) * 1000

        if result.returncode == 0 and result.stdout.strip():
            # Only the first GPU line is surfaced to the user.
            gpu_info = result.stdout.strip().split('\n')[0]
            return _result(f"GPU: {gpu_info}", "passed", duration)
        return _result(
            "Keine NVIDIA GPU gefunden", "skipped", duration,
            error_message="nvidia-smi nicht verfuegbar oder keine GPU",
        )
    except FileNotFoundError:
        # Binary not on PATH at all — expected on non-NVIDIA hosts.
        return _result(
            "nvidia-smi nicht installiert", "skipped",
            (time.time() - start) * 1000,
            error_message="nvidia-smi Binary nicht gefunden",
        )
    except Exception as e:
        return _result(
            f"Fehler: {str(e)}", "failed",
            (time.time() - start) * 1000,
            error_message=str(e),
        )
|
||||
|
||||
|
||||
async def test_rocm_smi() -> TestResult:
    """Check whether AMD GPUs are detected via the rocm-smi CLI.

    Status semantics: "passed" = GPU reported, "skipped" = tool/GPU absent,
    "failed" = unexpected error while probing.
    """
    start = time.time()

    def _result(actual, status, duration_ms, error_message=None):
        # Shared name/description/expected for every branch of this test.
        return TestResult(
            name="AMD ROCm GPU Erkennung",
            description="Prueft ob AMD GPUs via rocm-smi erkannt werden",
            expected="GPU-Informationen verfuegbar",
            actual=actual,
            status=status,
            duration_ms=duration_ms,
            error_message=error_message,
        )

    try:
        result = subprocess.run(
            ["rocm-smi", "--showproductname"],
            capture_output=True,
            text=True,
            timeout=10,
        )
        duration = (time.time() - start) * 1000

        if result.returncode == 0 and result.stdout.strip():
            # Fix: was an f-string with no placeholders.
            return _result("ROCm GPU erkannt", "passed", duration)
        return _result(
            "Keine AMD GPU gefunden", "skipped", duration,
            error_message="rocm-smi nicht verfuegbar oder keine GPU",
        )
    except FileNotFoundError:
        # Binary not on PATH at all — expected on non-AMD hosts.
        return _result(
            "rocm-smi nicht installiert", "skipped",
            (time.time() - start) * 1000,
            error_message="rocm-smi Binary nicht gefunden",
        )
    except Exception as e:
        return _result(
            f"Fehler: {str(e)}", "failed",
            (time.time() - start) * 1000,
            error_message=str(e),
        )
|
||||
|
||||
|
||||
async def test_cuda_available() -> TestResult:
    """Check whether PyTorch is installed and reports CUDA support.

    The probe runs in a subprocess so a missing or broken torch install cannot
    crash this API process. "skipped" covers both "torch missing" and "CUDA
    unavailable"; this test deliberately never reports "failed".
    """
    start = time.time()

    def _result(actual, status, duration_ms, error_message=None):
        # Shared name/description/expected for every branch of this test.
        return TestResult(
            name="PyTorch CUDA Support",
            description="Prueft ob PyTorch CUDA-Unterstuetzung hat",
            expected="CUDA verfuegbar",
            actual=actual,
            status=status,
            duration_ms=duration_ms,
            error_message=error_message,
        )

    try:
        # Fix: sys.executable instead of a bare "python" — guarantees the probe
        # runs in the same interpreter/virtualenv as this service ("python" may
        # be absent or point at a different environment).
        result = subprocess.run(
            [sys.executable, "-c",
             "import torch; print(f'CUDA: {torch.cuda.is_available()}, Devices: {torch.cuda.device_count()}')"],
            capture_output=True,
            text=True,
            timeout=30,
        )
        duration = (time.time() - start) * 1000

        if result.returncode == 0:
            output = result.stdout.strip()
            if "True" in output:
                return _result(output, "passed", duration)
            return _result(
                output, "skipped", duration,
                error_message="CUDA nicht verfuegbar in PyTorch",
            )
        return _result(
            "PyTorch nicht installiert", "skipped", duration,
            error_message=result.stderr[:200] if result.stderr else "PyTorch fehlt",
        )
    except Exception as e:
        return _result(
            f"Fehler: {str(e)}", "skipped",
            (time.time() - start) * 1000,
            error_message=str(e),
        )
|
||||
|
||||
|
||||
async def test_vast_ai_api() -> TestResult:
    """Check connectivity and authentication against the vast.ai cloud-GPU API.

    "skipped" when VAST_API_KEY is unset; "failed" on auth/HTTP/network errors;
    "passed" reports the account balance.
    """
    start = time.time()

    def _result(actual, status, duration_ms, error_message=None,
                expected="API erreichbar mit Guthaben"):
        # The no-key branch uses a different `expected` string, so it is a
        # parameter with the common value as default.
        return TestResult(
            name="vast.ai API Verbindung",
            description="Prueft ob die vast.ai Cloud-GPU API konfiguriert ist",
            expected=expected,
            actual=actual,
            status=status,
            duration_ms=duration_ms,
            error_message=error_message,
        )

    if not VAST_API_KEY:
        return _result(
            "VAST_API_KEY nicht gesetzt", "skipped",
            (time.time() - start) * 1000,
            error_message="Umgebungsvariable VAST_API_KEY fehlt",
            expected="API Key konfiguriert",
        )

    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            response = await client.get(
                "https://console.vast.ai/api/v0/users/current",
                headers={"Authorization": f"Bearer {VAST_API_KEY}"}
            )
            duration = (time.time() - start) * 1000

            if response.status_code == 200:
                data = response.json()
                balance = data.get("credit", 0)
                return _result(
                    f"Verbunden, Guthaben: ${balance:.2f}", "passed", duration,
                )
            return _result(
                f"HTTP {response.status_code}", "failed", duration,
                error_message="API-Authentifizierung fehlgeschlagen",
            )
    except Exception as e:
        return _result(
            f"Fehler: {str(e)}", "failed",
            (time.time() - start) * 1000,
            error_message=str(e),
        )
|
||||
|
||||
|
||||
async def test_gpu_api_endpoint() -> TestResult:
    """Check that the backend's GPU management endpoint responds.

    200 -> passed, 404 -> skipped (feature not enabled), anything else or a
    network error -> failed.
    """
    start = time.time()

    def _result(actual, status, duration_ms, error_message=None):
        # Shared name/description/expected for every branch of this test.
        return TestResult(
            name="GPU Admin API",
            description="Prueft ob die GPU-Verwaltungs-API verfuegbar ist",
            expected="HTTP 200 mit GPU-Status",
            actual=actual,
            status=status,
            duration_ms=duration_ms,
            error_message=error_message,
        )

    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            response = await client.get(f"{BACKEND_URL}/api/gpu/status")
            duration = (time.time() - start) * 1000

            if response.status_code == 200:
                return _result("API verfuegbar", "passed", duration)
            if response.status_code == 404:
                return _result(
                    "Endpoint nicht implementiert", "skipped", duration,
                    error_message="GPU API nicht aktiviert",
                )
            return _result(
                f"HTTP {response.status_code}", "failed", duration,
                error_message=f"Unerwarteter Status: {response.status_code}",
            )
    except Exception as e:
        return _result(
            f"Fehler: {str(e)}", "failed",
            (time.time() - start) * 1000,
            error_message=str(e),
        )
|
||||
|
||||
|
||||
# ==============================================
|
||||
# Category Runners
|
||||
# ==============================================
|
||||
|
||||
async def run_detection_tests() -> TestCategoryResult:
    """Run all hardware-detection checks concurrently and summarize them."""
    outcomes = list(await asyncio.gather(
        test_nvidia_smi(),
        test_rocm_smi(),
        test_cuda_available(),
    ))

    # Skipped tests count toward neither passed nor failed, only toward total.
    n_passed = 0
    n_failed = 0
    for outcome in outcomes:
        if outcome.status == "passed":
            n_passed += 1
        elif outcome.status == "failed":
            n_failed += 1

    return TestCategoryResult(
        category="detection",
        display_name="GPU Erkennung",
        description="Tests zur Hardware-Erkennung",
        tests=outcomes,
        passed=n_passed,
        failed=n_failed,
        total=len(outcomes),
    )
|
||||
|
||||
|
||||
async def run_cloud_tests() -> TestCategoryResult:
    """Run the cloud-GPU (vast.ai) checks and summarize them."""
    outcomes = list(await asyncio.gather(
        test_vast_ai_api(),
    ))

    # Skipped tests count toward neither passed nor failed, only toward total.
    n_passed = 0
    n_failed = 0
    for outcome in outcomes:
        if outcome.status == "passed":
            n_passed += 1
        elif outcome.status == "failed":
            n_failed += 1

    return TestCategoryResult(
        category="cloud",
        display_name="Cloud GPU (vast.ai)",
        description="Tests fuer Cloud-GPU-Dienste",
        tests=outcomes,
        passed=n_passed,
        failed=n_failed,
        total=len(outcomes),
    )
|
||||
|
||||
|
||||
async def run_api_tests() -> TestCategoryResult:
    """Run the GPU admin API health checks and summarize them."""
    outcomes = list(await asyncio.gather(
        test_gpu_api_endpoint(),
    ))

    # Skipped tests count toward neither passed nor failed, only toward total.
    n_passed = 0
    n_failed = 0
    for outcome in outcomes:
        if outcome.status == "passed":
            n_passed += 1
        elif outcome.status == "failed":
            n_failed += 1

    return TestCategoryResult(
        category="api-health",
        display_name="GPU Admin API",
        description="Tests fuer die GPU-Verwaltungs-Endpunkte",
        tests=outcomes,
        passed=n_passed,
        failed=n_failed,
        total=len(outcomes),
    )
|
||||
|
||||
|
||||
# ==============================================
|
||||
# API Endpoints
|
||||
# ==============================================
|
||||
|
||||
@router.post("/{category}", response_model=TestCategoryResult)
async def run_category_tests(category: str):
    """Run every test in one category.

    An unknown category deliberately yields an empty 200 result (not a 404) so
    the frontend can render it uniformly.
    """
    runners = {
        "api-health": run_api_tests,
        "detection": run_detection_tests,
        "cloud": run_cloud_tests,
    }

    runner = runners.get(category)
    if runner is None:
        return TestCategoryResult(
            category=category,
            display_name=f"Unbekannt: {category}",
            description="Kategorie nicht gefunden",
            tests=[],
            passed=0,
            failed=0,
            total=0,
        )

    return await runner()
|
||||
|
||||
|
||||
@router.post("/run-all", response_model=FullTestResults)
async def run_all_tests():
    """Run every category concurrently and aggregate the grand totals."""
    started_at = time.time()

    category_results = list(await asyncio.gather(
        run_api_tests(),
        run_detection_tests(),
        run_cloud_tests(),
    ))

    # Fold the per-category counters into grand totals.
    grand_passed = 0
    grand_failed = 0
    grand_total = 0
    for cat in category_results:
        grand_passed += cat.passed
        grand_failed += cat.failed
        grand_total += cat.total

    return FullTestResults(
        categories=category_results,
        total_passed=grand_passed,
        total_failed=grand_failed,
        total_tests=grand_total,
        duration_ms=(time.time() - started_at) * 1000,
    )
|
||||
|
||||
|
||||
@router.get("/categories")
async def get_categories():
    """List the test categories the frontend can request via POST /{category}."""
    catalog = [
        ("api-health", "GPU Admin API", "Backend API Tests"),
        ("detection", "GPU Erkennung", "Hardware Detection"),
        ("cloud", "Cloud GPU", "vast.ai Integration"),
    ]
    return {
        "categories": [
            {"id": cid, "name": label, "description": blurb}
            for cid, label, blurb in catalog
        ]
    }
|
||||
Reference in New Issue
Block a user