# breakpilot-core/backend-core/security_monitoring.py

"""
Security Monitoring Endpoints

System monitoring endpoints for the Security Dashboard:
- Log viewing (demo data)
- System metrics (demo data)
- Container status (real Docker data with demo fallback)
- Service health checks (live check for the backend only; other services report demo data)
"""

import subprocess
from datetime import datetime
from typing import List, Optional
from fastapi import APIRouter
from pydantic import BaseModel


router = APIRouter(tags=["Security"])


# ===========================
# Pydantic Models
# ===========================

class LogEntry(BaseModel):
    """One log line as returned by the ``/monitoring/logs`` endpoint."""

    # ISO-8601 timestamp string (the endpoint produces it via datetime.isoformat()).
    timestamp: str
    # Severity label, e.g. "INFO", "WARNING", "ERROR" or "DEBUG".
    level: str
    # Name of the service the entry is attributed to, e.g. "backend".
    service: str
    # Human-readable log message.
    message: str


class MetricValue(BaseModel):
    """A single named metric reading as returned by ``/monitoring/metrics``."""

    # Display name of the metric, e.g. "CPU Usage".
    name: str
    # Numeric reading; integer readings are stored as floats as well.
    value: float
    # Unit label shown next to the value, e.g. "%", "MB/s", "ms"; may be empty.
    unit: str
    # Optional direction indicator; the values used in this file are
    # "up", "down" and "stable".
    trend: Optional[str] = None


class ContainerStatus(BaseModel):
    """Status summary of one container as returned by ``/monitoring/containers``."""

    # Container name (the ``{{.Names}}`` column of ``docker ps`` when real data is used).
    name: str
    # Container state, e.g. "running".
    status: str
    # Coarse health label: "healthy" or "unhealthy".
    health: str
    # CPU usage in percent. NOTE: the endpoint in this file fills this with
    # randomly generated values, not measured ones.
    cpu_percent: float
    # Memory usage in megabytes. NOTE: randomly generated by the endpoint as well.
    memory_mb: float
    # Uptime text such as "Up 4 hours", or "N/A" when it cannot be derived.
    uptime: str


class ServiceStatus(BaseModel):
    """Result of one service health check as returned by ``/monitoring/services``."""

    # Display name of the service, e.g. "Backend API".
    name: str
    # Health-check URL associated with the service.
    url: str
    # Health label: "healthy" or "unhealthy".
    status: str
    # Response time in whole milliseconds.
    response_time_ms: int
    # ISO-8601 timestamp string of when the entry was produced.
    last_check: str


# ===========================
# Monitoring Endpoints
# ===========================

@router.get("/monitoring/logs", response_model=List[LogEntry])
async def get_logs(service: Optional[str] = None, level: Optional[str] = None, limit: int = 50):
    """Return synthetic log entries for the dashboard (demo data).

    Args:
        service: If given, every generated entry is attributed to this
            service. Names without their own message catalogue reuse the
            "backend" messages.
        level: If given, every generated entry gets this level. The value is
            matched case-insensitively and returned upper-cased, so
            ``level=error`` yields entries with level "ERROR" and the
            error-specific messages.
        limit: Maximum number of entries to return. Clamped to ``0..1000`` so
            a single request cannot force an arbitrarily large allocation.

    Returns:
        A list of ``LogEntry`` objects ordered newest first.
    """
    import random
    from datetime import timedelta

    # Upper bound on generated entries; the query parameter is caller-controlled.
    max_entries = 1000
    limit = max(0, min(limit, max_entries))

    # Normalise once so the level-specific branches below also fire for
    # lower-case input such as "error" or "warning".
    level_filter = level.upper() if level else None

    services = ["backend", "consent-service", "postgres", "mailpit"]
    # "INFO" is listed three times on purpose: it weights the random choice.
    levels = ["INFO", "INFO", "INFO", "WARNING", "ERROR", "DEBUG"]
    messages = {
        "backend": [
            "Request completed: GET /api/consent/health 200",
            "Request completed: POST /api/auth/login 200",
            "Database connection established",
            "JWT token validated successfully",
            "Starting background task: email_notification",
            "Cache miss for key: user_session_abc123",
            "Request completed: GET /api/v1/security/demo/sbom 200",
        ],
        "consent-service": [
            "Health check passed",
            "Document version created: v1.2.0",
            "Consent recorded for user: user-12345",
            "GDPR export job started",
            "Database query executed in 12ms",
        ],
        "postgres": [
            "checkpoint starting: time",
            "automatic analyze of table completed",
            "connection authorized: user=breakpilot",
            "statement: SELECT * FROM documents WHERE...",
        ],
        "mailpit": [
            "SMTP connection from 172.18.0.3",
            "Email received: Consent Confirmation",
            "Message stored: id=msg-001",
        ],
    }
    # Level-specific messages override the per-service catalogue.
    error_messages = [
        "Connection timeout after 30s",
        "Failed to parse JSON response",
        "Database query failed: connection reset",
        "Rate limit exceeded for IP 192.168.1.1",
    ]
    warning_messages = [
        "Slow query detected: 523ms",
        "Memory usage above 80%",
        "Retry attempt 2/3 for external API",
        "Deprecated API endpoint called",
    ]

    logs = []
    base_time = datetime.now()
    # Accumulated age of the current entry; growing it monotonically keeps the
    # result in strict newest-first order.
    age_seconds = 0

    for _ in range(limit):
        svc = service or random.choice(services)
        lvl = level_filter or random.choice(levels)

        if lvl == "ERROR":
            msg = random.choice(error_messages)
        elif lvl == "WARNING":
            msg = random.choice(warning_messages)
        else:
            msg = random.choice(messages.get(svc, messages["backend"]))

        logs.append(LogEntry(
            timestamp=(base_time - timedelta(seconds=age_seconds)).isoformat(),
            level=lvl,
            service=svc,
            message=msg,
        ))
        age_seconds += random.randint(1, 30)

    # No post-filtering needed: every entry was generated with the requested
    # service and level already.
    return logs


@router.get("/monitoring/metrics", response_model=List[MetricValue])
async def get_metrics():
    """Return a snapshot of system metrics (demo data).

    Every value is drawn at random on each call; the trend labels are fixed.
    """
    import random

    # (name, value, unit, trend) — evaluated top to bottom, one random draw per row.
    readings = (
        ("CPU Usage", round(random.uniform(15, 45), 1), "%", "stable"),
        ("Memory Usage", round(random.uniform(40, 65), 1), "%", "up"),
        ("Disk Usage", round(random.uniform(25, 40), 1), "%", "stable"),
        ("Network In", round(random.uniform(1.2, 5.8), 2), "MB/s", "up"),
        ("Network Out", round(random.uniform(0.5, 2.1), 2), "MB/s", "stable"),
        ("Active Connections", random.randint(12, 48), "", "up"),
        ("Requests/min", random.randint(120, 350), "req/min", "up"),
        ("Avg Response Time", round(random.uniform(45, 120), 0), "ms", "down"),
        ("Error Rate", round(random.uniform(0.1, 0.8), 2), "%", "stable"),
        ("Cache Hit Rate", round(random.uniform(85, 98), 1), "%", "up"),
    )

    return [
        MetricValue(name=label, value=reading, unit=unit_label, trend=direction)
        for label, reading, unit_label, direction in readings
    ]


def _run_docker_ps() -> str:
    """Run ``docker ps`` synchronously and return its stripped stdout.

    Returns an empty string when the command exits with a non-zero status.
    Propagates whatever ``subprocess.run`` raises, e.g. ``FileNotFoundError``
    when the docker CLI is missing or ``subprocess.TimeoutExpired`` after the
    5 second timeout.
    """
    result = subprocess.run(
        ["docker", "ps", "--format", "{{.Names}}\t{{.Status}}\t{{.State}}"],
        capture_output=True,
        text=True,
        timeout=5,
    )
    if result.returncode != 0:
        return ""
    return result.stdout.strip()


@router.get("/monitoring/containers", response_model=List[ContainerStatus])
async def get_container_status():
    """Return container status (real Docker data, demo data as fallback).

    Container names, state and uptime come from ``docker ps`` when it is
    available. CPU and memory figures are always randomly generated. If
    Docker cannot be queried or reports no containers, a fixed set of demo
    containers is returned instead.

    Returns:
        A list of ``ContainerStatus`` objects.
    """
    import asyncio
    import logging
    import random

    # Try real Docker data first.
    try:
        # subprocess.run blocks for up to 5 seconds; run it in the default
        # executor so the event loop keeps serving other requests meanwhile.
        loop = asyncio.get_running_loop()
        output = await loop.run_in_executor(None, _run_docker_ps)

        containers = []
        for line in output.split('\n'):
            parts = line.split('\t')
            if len(parts) < 3:
                continue
            name, status, state = parts[0], parts[1], parts[2]
            # The status column looks like "Up 2 hours" for running containers.
            uptime = status if "Up" in status else "N/A"
            # Docker appends "(unhealthy)" to the status column when the
            # container's own health check fails; honour that instead of
            # reporting every running container as healthy.
            is_healthy = state == "running" and "(unhealthy)" not in status

            containers.append(ContainerStatus(
                name=name,
                status=state,
                health="healthy" if is_healthy else "unhealthy",
                cpu_percent=round(random.uniform(0.5, 15), 1),
                memory_mb=round(random.uniform(50, 500), 0),
                uptime=uptime,
            ))
        if containers:
            return containers
    except Exception:
        # Deliberate best effort: any failure falls through to the demo data,
        # but leave a trace so the reason can be diagnosed.
        logging.getLogger(__name__).debug(
            "docker ps unavailable; serving demo container data", exc_info=True
        )

    # Fallback: demo data.
    return [
        ContainerStatus(name="breakpilot-pwa-backend", status="running", health="healthy",
                       cpu_percent=round(random.uniform(2, 12), 1), memory_mb=round(random.uniform(180, 280), 0), uptime="Up 4 hours"),
        ContainerStatus(name="breakpilot-pwa-consent-service", status="running", health="healthy",
                       cpu_percent=round(random.uniform(1, 8), 1), memory_mb=round(random.uniform(80, 150), 0), uptime="Up 4 hours"),
        ContainerStatus(name="breakpilot-pwa-postgres", status="running", health="healthy",
                       cpu_percent=round(random.uniform(0.5, 5), 1), memory_mb=round(random.uniform(120, 200), 0), uptime="Up 4 hours"),
        ContainerStatus(name="breakpilot-pwa-mailpit", status="running", health="healthy",
                       cpu_percent=round(random.uniform(0.1, 2), 1), memory_mb=round(random.uniform(30, 60), 0), uptime="Up 4 hours"),
    ]


@router.get("/monitoring/services", response_model=List[ServiceStatus])
async def get_service_status():
    """Report the health status of the known services.

    Only the backend URL (``localhost:8000``) is actually probed over HTTP.
    All other services are reported as "healthy" with a randomly generated
    response time (demo data).

    Returns:
        A list of ``ServiceStatus`` objects, one per configured service.
    """
    import random
    import time

    services_to_check = [
        ("Backend API", "http://localhost:8000/api/consent/health"),
        ("Consent Service", "http://consent-service:8081/health"),
        ("School Service", "http://school-service:8084/health"),
        ("Klausur Service", "http://klausur-service:8086/health"),
    ]

    results = []
    for name, url in services_to_check:
        # Demo defaults; overwritten below when a real check succeeds.
        status = "healthy"
        response_time = random.randint(15, 150)

        # Attempt a real health check for the backend only.
        if "localhost:8000" in url:
            try:
                # Imported lazily inside the try so a missing httpx package
                # degrades to the demo defaults instead of failing the request.
                import httpx
                async with httpx.AsyncClient() as client:
                    # perf_counter is monotonic; wall-clock datetime.now() can
                    # jump (NTP, DST) and produce wrong or negative durations.
                    started = time.perf_counter()
                    response = await client.get(url, timeout=5)
                    response_time = int((time.perf_counter() - started) * 1000)
                    status = "healthy" if response.status_code == 200 else "unhealthy"
            except Exception:
                # NOTE: deliberate — this handler is executing inside the
                # backend, so the backend is assumed healthy when the
                # self-check cannot be performed.
                status = "healthy"

        results.append(ServiceStatus(
            name=name,
            url=url,
            status=status,
            response_time_ms=response_time,
            last_check=datetime.now().isoformat(),
        ))

    return results