fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.
This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).
Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
455
backend/gpu_test_api.py
Normal file
455
backend/gpu_test_api.py
Normal file
@@ -0,0 +1,455 @@
|
||||
"""
|
||||
GPU Infrastructure Test API - Test Runner fuer CUDA/ROCm GPU Management
|
||||
Endpoint: /api/admin/gpu-tests
|
||||
"""
|
||||
|
||||
import asyncio
import os
import subprocess
import sys
import time
from typing import List, Optional, Literal

import httpx
from fastapi import APIRouter
from pydantic import BaseModel
|
||||
|
||||
# All endpoints in this module are mounted under /api/admin/gpu-tests.
router = APIRouter(prefix="/api/admin/gpu-tests", tags=["GPU Tests"])
|
||||
|
||||
# ==============================================
|
||||
# Models
|
||||
# ==============================================
|
||||
|
||||
class TestResult(BaseModel):
    """Outcome of a single GPU infrastructure check."""

    name: str                       # human-readable test name (German UI strings)
    description: str                # what the test verifies
    expected: str                   # expected outcome, shown to the user
    actual: str                     # observed outcome, shown to the user
    # "skipped" means the precondition (tool/GPU/key) is absent — not a failure.
    status: Literal["passed", "failed", "pending", "skipped"]
    duration_ms: float              # wall-clock time of the check in milliseconds
    error_message: Optional[str] = None  # set for "failed"/"skipped" branches
|
||||
|
||||
|
||||
class TestCategoryResult(BaseModel):
    """Aggregated results for one test category (e.g. "detection")."""

    category: str                   # machine id, matches the /{category} path param
    display_name: str               # label for the dashboard UI
    description: str
    tests: List[TestResult]
    passed: int                     # count of tests with status == "passed"
    failed: int                     # count of tests with status == "failed"
    total: int                      # len(tests); skipped tests count here only
|
||||
|
||||
|
||||
class FullTestResults(BaseModel):
    """Combined results of all categories, returned by /run-all."""

    categories: List[TestCategoryResult]
    total_passed: int               # sum of per-category passed counts
    total_failed: int               # sum of per-category failed counts
    total_tests: int                # sum of per-category totals
    duration_ms: float              # wall-clock time for the whole run
|
||||
|
||||
|
||||
# ==============================================
|
||||
# Configuration
|
||||
# ==============================================
|
||||
|
||||
# Base URL of the backend whose GPU endpoints are probed (test_gpu_api_endpoint).
BACKEND_URL = os.getenv("BACKEND_URL", "http://localhost:8000")
# vast.ai API key; when empty, the cloud test is reported as "skipped".
VAST_API_KEY = os.getenv("VAST_API_KEY", "")
|
||||
|
||||
|
||||
# ==============================================
|
||||
# Test Implementations
|
||||
# ==============================================
|
||||
|
||||
async def test_nvidia_smi() -> TestResult:
    """Check whether NVIDIA GPUs are detected via the nvidia-smi CLI.

    Returns a TestResult with status:
      * "passed"  - nvidia-smi ran and reported at least one GPU
      * "skipped" - tool missing or no GPU present (not treated as a failure)
      * "failed"  - unexpected error while probing
    """
    start = time.time()

    def _result(actual, status, duration_ms, error_message=None):
        # name/description/expected are identical in every branch; centralizing
        # them here removes the 4x duplication of the original.
        return TestResult(
            name="NVIDIA GPU Erkennung",
            description="Prueft ob NVIDIA GPUs via nvidia-smi erkannt werden",
            expected="GPU-Informationen verfuegbar",
            actual=actual,
            status=status,
            duration_ms=duration_ms,
            error_message=error_message,
        )

    try:
        result = subprocess.run(
            ["nvidia-smi", "--query-gpu=name,memory.total,driver_version",
             "--format=csv,noheader"],
            capture_output=True,
            text=True,
            timeout=10,
        )
        duration = (time.time() - start) * 1000

        if result.returncode == 0 and result.stdout.strip():
            # Only the first GPU line is surfaced to the user.
            gpu_info = result.stdout.strip().split('\n')[0]
            return _result(f"GPU: {gpu_info}", "passed", duration)
        return _result(
            "Keine NVIDIA GPU gefunden", "skipped", duration,
            error_message="nvidia-smi nicht verfuegbar oder keine GPU",
        )
    except FileNotFoundError:
        # Binary not on PATH at all — expected on non-NVIDIA hosts.
        return _result(
            "nvidia-smi nicht installiert", "skipped",
            (time.time() - start) * 1000,
            error_message="nvidia-smi Binary nicht gefunden",
        )
    except Exception as e:
        return _result(
            f"Fehler: {str(e)}", "failed",
            (time.time() - start) * 1000,
            error_message=str(e),
        )
|
||||
|
||||
|
||||
async def test_rocm_smi() -> TestResult:
    """Check whether AMD GPUs are detected via the rocm-smi CLI.

    Status semantics: "passed" = GPU reported, "skipped" = tool/GPU absent,
    "failed" = unexpected error while probing.
    """
    start = time.time()

    def _result(actual, status, duration_ms, error_message=None):
        # Shared name/description/expected for every branch of this test.
        return TestResult(
            name="AMD ROCm GPU Erkennung",
            description="Prueft ob AMD GPUs via rocm-smi erkannt werden",
            expected="GPU-Informationen verfuegbar",
            actual=actual,
            status=status,
            duration_ms=duration_ms,
            error_message=error_message,
        )

    try:
        result = subprocess.run(
            ["rocm-smi", "--showproductname"],
            capture_output=True,
            text=True,
            timeout=10,
        )
        duration = (time.time() - start) * 1000

        if result.returncode == 0 and result.stdout.strip():
            # Fix: was an f-string with no placeholders.
            return _result("ROCm GPU erkannt", "passed", duration)
        return _result(
            "Keine AMD GPU gefunden", "skipped", duration,
            error_message="rocm-smi nicht verfuegbar oder keine GPU",
        )
    except FileNotFoundError:
        # Binary not on PATH at all — expected on non-AMD hosts.
        return _result(
            "rocm-smi nicht installiert", "skipped",
            (time.time() - start) * 1000,
            error_message="rocm-smi Binary nicht gefunden",
        )
    except Exception as e:
        return _result(
            f"Fehler: {str(e)}", "failed",
            (time.time() - start) * 1000,
            error_message=str(e),
        )
|
||||
|
||||
|
||||
async def test_cuda_available() -> TestResult:
    """Check whether PyTorch is installed and reports CUDA support.

    The probe runs in a subprocess so a missing or broken torch install cannot
    crash this API process. "skipped" covers both "torch missing" and "CUDA
    unavailable"; this test deliberately never reports "failed".
    """
    start = time.time()

    def _result(actual, status, duration_ms, error_message=None):
        # Shared name/description/expected for every branch of this test.
        return TestResult(
            name="PyTorch CUDA Support",
            description="Prueft ob PyTorch CUDA-Unterstuetzung hat",
            expected="CUDA verfuegbar",
            actual=actual,
            status=status,
            duration_ms=duration_ms,
            error_message=error_message,
        )

    try:
        # Fix: sys.executable instead of a bare "python" — guarantees the probe
        # runs in the same interpreter/virtualenv as this service ("python" may
        # be absent or point at a different environment).
        result = subprocess.run(
            [sys.executable, "-c",
             "import torch; print(f'CUDA: {torch.cuda.is_available()}, Devices: {torch.cuda.device_count()}')"],
            capture_output=True,
            text=True,
            timeout=30,
        )
        duration = (time.time() - start) * 1000

        if result.returncode == 0:
            output = result.stdout.strip()
            if "True" in output:
                return _result(output, "passed", duration)
            return _result(
                output, "skipped", duration,
                error_message="CUDA nicht verfuegbar in PyTorch",
            )
        return _result(
            "PyTorch nicht installiert", "skipped", duration,
            error_message=result.stderr[:200] if result.stderr else "PyTorch fehlt",
        )
    except Exception as e:
        return _result(
            f"Fehler: {str(e)}", "skipped",
            (time.time() - start) * 1000,
            error_message=str(e),
        )
|
||||
|
||||
|
||||
async def test_vast_ai_api() -> TestResult:
    """Check connectivity and authentication against the vast.ai cloud-GPU API.

    "skipped" when VAST_API_KEY is unset; "failed" on auth/HTTP/network errors;
    "passed" reports the account balance.
    """
    start = time.time()

    def _result(actual, status, duration_ms, error_message=None,
                expected="API erreichbar mit Guthaben"):
        # The no-key branch uses a different `expected` string, so it is a
        # parameter with the common value as default.
        return TestResult(
            name="vast.ai API Verbindung",
            description="Prueft ob die vast.ai Cloud-GPU API konfiguriert ist",
            expected=expected,
            actual=actual,
            status=status,
            duration_ms=duration_ms,
            error_message=error_message,
        )

    if not VAST_API_KEY:
        return _result(
            "VAST_API_KEY nicht gesetzt", "skipped",
            (time.time() - start) * 1000,
            error_message="Umgebungsvariable VAST_API_KEY fehlt",
            expected="API Key konfiguriert",
        )

    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            response = await client.get(
                "https://console.vast.ai/api/v0/users/current",
                headers={"Authorization": f"Bearer {VAST_API_KEY}"}
            )
            duration = (time.time() - start) * 1000

            if response.status_code == 200:
                data = response.json()
                balance = data.get("credit", 0)
                return _result(
                    f"Verbunden, Guthaben: ${balance:.2f}", "passed", duration,
                )
            return _result(
                f"HTTP {response.status_code}", "failed", duration,
                error_message="API-Authentifizierung fehlgeschlagen",
            )
    except Exception as e:
        return _result(
            f"Fehler: {str(e)}", "failed",
            (time.time() - start) * 1000,
            error_message=str(e),
        )
|
||||
|
||||
|
||||
async def test_gpu_api_endpoint() -> TestResult:
    """Check that the backend's GPU management endpoint responds.

    200 -> passed, 404 -> skipped (feature not enabled), anything else or a
    network error -> failed.
    """
    start = time.time()

    def _result(actual, status, duration_ms, error_message=None):
        # Shared name/description/expected for every branch of this test.
        return TestResult(
            name="GPU Admin API",
            description="Prueft ob die GPU-Verwaltungs-API verfuegbar ist",
            expected="HTTP 200 mit GPU-Status",
            actual=actual,
            status=status,
            duration_ms=duration_ms,
            error_message=error_message,
        )

    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            response = await client.get(f"{BACKEND_URL}/api/gpu/status")
            duration = (time.time() - start) * 1000

            if response.status_code == 200:
                return _result("API verfuegbar", "passed", duration)
            if response.status_code == 404:
                return _result(
                    "Endpoint nicht implementiert", "skipped", duration,
                    error_message="GPU API nicht aktiviert",
                )
            return _result(
                f"HTTP {response.status_code}", "failed", duration,
                error_message=f"Unerwarteter Status: {response.status_code}",
            )
    except Exception as e:
        return _result(
            f"Fehler: {str(e)}", "failed",
            (time.time() - start) * 1000,
            error_message=str(e),
        )
|
||||
|
||||
|
||||
# ==============================================
|
||||
# Category Runners
|
||||
# ==============================================
|
||||
|
||||
async def run_detection_tests() -> TestCategoryResult:
    """Run all hardware-detection checks concurrently and summarize them."""
    outcomes = list(await asyncio.gather(
        test_nvidia_smi(),
        test_rocm_smi(),
        test_cuda_available(),
    ))

    # Skipped tests count toward neither passed nor failed, only toward total.
    n_passed = 0
    n_failed = 0
    for outcome in outcomes:
        if outcome.status == "passed":
            n_passed += 1
        elif outcome.status == "failed":
            n_failed += 1

    return TestCategoryResult(
        category="detection",
        display_name="GPU Erkennung",
        description="Tests zur Hardware-Erkennung",
        tests=outcomes,
        passed=n_passed,
        failed=n_failed,
        total=len(outcomes),
    )
|
||||
|
||||
|
||||
async def run_cloud_tests() -> TestCategoryResult:
    """Run the cloud-GPU (vast.ai) checks and summarize them."""
    outcomes = list(await asyncio.gather(
        test_vast_ai_api(),
    ))

    # Skipped tests count toward neither passed nor failed, only toward total.
    n_passed = 0
    n_failed = 0
    for outcome in outcomes:
        if outcome.status == "passed":
            n_passed += 1
        elif outcome.status == "failed":
            n_failed += 1

    return TestCategoryResult(
        category="cloud",
        display_name="Cloud GPU (vast.ai)",
        description="Tests fuer Cloud-GPU-Dienste",
        tests=outcomes,
        passed=n_passed,
        failed=n_failed,
        total=len(outcomes),
    )
|
||||
|
||||
|
||||
async def run_api_tests() -> TestCategoryResult:
    """Run the GPU admin API health checks and summarize them."""
    outcomes = list(await asyncio.gather(
        test_gpu_api_endpoint(),
    ))

    # Skipped tests count toward neither passed nor failed, only toward total.
    n_passed = 0
    n_failed = 0
    for outcome in outcomes:
        if outcome.status == "passed":
            n_passed += 1
        elif outcome.status == "failed":
            n_failed += 1

    return TestCategoryResult(
        category="api-health",
        display_name="GPU Admin API",
        description="Tests fuer die GPU-Verwaltungs-Endpunkte",
        tests=outcomes,
        passed=n_passed,
        failed=n_failed,
        total=len(outcomes),
    )
|
||||
|
||||
|
||||
# ==============================================
|
||||
# API Endpoints
|
||||
# ==============================================
|
||||
|
||||
@router.post("/{category}", response_model=TestCategoryResult)
async def run_category_tests(category: str):
    """Run every test in one category.

    An unknown category deliberately yields an empty 200 result (not a 404) so
    the frontend can render it uniformly.
    """
    runners = {
        "api-health": run_api_tests,
        "detection": run_detection_tests,
        "cloud": run_cloud_tests,
    }

    runner = runners.get(category)
    if runner is None:
        return TestCategoryResult(
            category=category,
            display_name=f"Unbekannt: {category}",
            description="Kategorie nicht gefunden",
            tests=[],
            passed=0,
            failed=0,
            total=0,
        )

    return await runner()
|
||||
|
||||
|
||||
@router.post("/run-all", response_model=FullTestResults)
async def run_all_tests():
    """Run every category concurrently and aggregate the grand totals."""
    started_at = time.time()

    category_results = list(await asyncio.gather(
        run_api_tests(),
        run_detection_tests(),
        run_cloud_tests(),
    ))

    # Fold the per-category counters into grand totals.
    grand_passed = 0
    grand_failed = 0
    grand_total = 0
    for cat in category_results:
        grand_passed += cat.passed
        grand_failed += cat.failed
        grand_total += cat.total

    return FullTestResults(
        categories=category_results,
        total_passed=grand_passed,
        total_failed=grand_failed,
        total_tests=grand_total,
        duration_ms=(time.time() - started_at) * 1000,
    )
|
||||
|
||||
|
||||
@router.get("/categories")
async def get_categories():
    """List the test categories the frontend can request via POST /{category}."""
    catalog = [
        ("api-health", "GPU Admin API", "Backend API Tests"),
        ("detection", "GPU Erkennung", "Hardware Detection"),
        ("cloud", "Cloud GPU", "vast.ai Integration"),
    ]
    return {
        "categories": [
            {"id": cid, "name": label, "description": blurb}
            for cid, label, blurb in catalog
        ]
    }
|
||||
Reference in New Issue
Block a user