A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.
This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).
Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
"""
|
|
Inference Service - Kommunikation mit LLM Backends.
|
|
|
|
Unterstützt:
|
|
- Ollama (lokal)
|
|
- vLLM (remote, OpenAI-kompatibel)
|
|
- Anthropic Claude API (Fallback)
|
|
"""

import httpx
import json
import logging
from typing import AsyncIterator, Optional
from dataclasses import dataclass

from ..config import get_config, LLMBackendConfig
from ..models.chat import (
    ChatCompletionRequest,
    ChatCompletionResponse,
    ChatCompletionChunk,
    ChatMessage,
    ChatChoice,
    StreamChoice,
    ChatChoiceDelta,
    Usage,
    ModelInfo,
    ModelListResponse,
)

logger = logging.getLogger(__name__)


@dataclass
class InferenceResult:
    """Result of an inference request."""
    content: str
    model: str
    backend: str
    usage: Optional[Usage] = None
    finish_reason: str = "stop"


class InferenceService:
    """Service for LLM inference across multiple backends."""

    def __init__(self):
        self.config = get_config()
        self._client: Optional[httpx.AsyncClient] = None

    async def get_client(self) -> httpx.AsyncClient:
        """Lazy initialization of the HTTP client."""
        if self._client is None:
            self._client = httpx.AsyncClient(timeout=120.0)
        return self._client

    async def close(self):
        """Closes the HTTP client."""
        if self._client:
            await self._client.aclose()
            self._client = None

    def _get_available_backend(self, preferred_model: Optional[str] = None) -> Optional[LLMBackendConfig]:
        """Finds the first available backend based on the configured priority."""
        for backend_name in self.config.backend_priority:
            backend = getattr(self.config, backend_name, None)
            if backend and backend.enabled:
                return backend
        return None

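    # Note on _get_available_backend above: backend_priority is assumed to be an
    # ordered list of config attribute names (e.g. ["ollama", "vllm", "anthropic"]);
    # the loop returns the first entry that exists and is enabled. The exact names
    # depend on LLMBackendConfig and are not shown in this module.
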
    def _map_model_to_backend(self, model: str) -> tuple[str, LLMBackendConfig]:
        """
        Maps a model name to the corresponding backend.

        Examples:
        - "breakpilot-teacher-8b" → Ollama/vLLM with llama3.1:8b
        - "claude-3-5-sonnet" → Anthropic
        """
        model_lower = model.lower()

        # Explicit Claude models → Anthropic
        if "claude" in model_lower:
            if self.config.anthropic and self.config.anthropic.enabled:
                return self.config.anthropic.default_model, self.config.anthropic
            raise ValueError("Anthropic backend not configured")

        # BreakPilot models → primary backend
        if "breakpilot" in model_lower or "teacher" in model_lower:
            backend = self._get_available_backend()
            if backend:
                # Map to the actual model name
                if "70b" in model_lower:
                    actual_model = "llama3.1:70b" if backend.name == "ollama" else "meta-llama/Meta-Llama-3.1-70B-Instruct"
                else:
                    actual_model = "llama3.1:8b" if backend.name == "ollama" else "meta-llama/Meta-Llama-3.1-8B-Instruct"
                return actual_model, backend
            raise ValueError("No LLM backend available")

        # Mistral models
        if "mistral" in model_lower:
            backend = self._get_available_backend()
            if backend:
                actual_model = "mistral:7b" if backend.name == "ollama" else "mistralai/Mistral-7B-Instruct-v0.2"
                return actual_model, backend
            raise ValueError("No LLM backend available")

        # Fallback: use the model name as-is
        backend = self._get_available_backend()
        if backend:
            return model, backend
        raise ValueError("No LLM backend available")

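    # Illustrative results of _map_model_to_backend above, assuming an enabled
    # Ollama backend (the tuples are (actual_model, backend)):
    #   "breakpilot-teacher-8b"  -> ("llama3.1:8b", ollama)
    #   "breakpilot-teacher-70b" -> ("llama3.1:70b", ollama)
    #   "mistral-7b"             -> ("mistral:7b", ollama)
    #   "claude-3-5-sonnet"      -> (anthropic.default_model, anthropic)
    # Any other name is passed through unchanged to the first available backend.
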
    async def _call_ollama(
        self,
        backend: LLMBackendConfig,
        model: str,
        request: ChatCompletionRequest,
    ) -> InferenceResult:
        """Calls the Ollama API (not OpenAI-compatible)."""
        client = await self.get_client()

        # Ollama uses its own message format
        messages = [{"role": m.role, "content": m.content or ""} for m in request.messages]

        payload = {
            "model": model,
            "messages": messages,
            "stream": False,
            "options": {
                "temperature": request.temperature,
                "top_p": request.top_p,
            },
        }

        if request.max_tokens:
            payload["options"]["num_predict"] = request.max_tokens

        response = await client.post(
            f"{backend.base_url}/api/chat",
            json=payload,
            timeout=backend.timeout,
        )
        response.raise_for_status()
        data = response.json()

        return InferenceResult(
            content=data.get("message", {}).get("content", ""),
            model=model,
            backend="ollama",
            usage=Usage(
                prompt_tokens=data.get("prompt_eval_count", 0),
                completion_tokens=data.get("eval_count", 0),
                total_tokens=data.get("prompt_eval_count", 0) + data.get("eval_count", 0),
            ),
            finish_reason="stop" if data.get("done") else "length",
        )

    async def _stream_ollama(
        self,
        backend: LLMBackendConfig,
        model: str,
        request: ChatCompletionRequest,
        response_id: str,
    ) -> AsyncIterator[ChatCompletionChunk]:
        """Streams from Ollama."""
        client = await self.get_client()

        messages = [{"role": m.role, "content": m.content or ""} for m in request.messages]

        payload = {
            "model": model,
            "messages": messages,
            "stream": True,
            "options": {
                "temperature": request.temperature,
                "top_p": request.top_p,
            },
        }

        if request.max_tokens:
            payload["options"]["num_predict"] = request.max_tokens

        async with client.stream(
            "POST",
            f"{backend.base_url}/api/chat",
            json=payload,
            timeout=backend.timeout,
        ) as response:
            response.raise_for_status()
            async for line in response.aiter_lines():
                if not line:
                    continue
                try:
                    data = json.loads(line)
                    content = data.get("message", {}).get("content", "")
                    done = data.get("done", False)

                    yield ChatCompletionChunk(
                        id=response_id,
                        model=model,
                        choices=[
                            StreamChoice(
                                index=0,
                                delta=ChatChoiceDelta(content=content),
                                finish_reason="stop" if done else None,
                            )
                        ],
                    )
                except json.JSONDecodeError:
                    continue

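    # Note on _stream_ollama above: Ollama emits newline-delimited JSON objects,
    # roughly of the form
    #   {"message": {"role": "assistant", "content": "Hal"}, "done": false}
    #   {"message": {"role": "assistant", "content": "lo"}, "done": true, "eval_count": 42}
    # which is why each line is parsed independently and "done" marks the last chunk.
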
    async def _call_openai_compatible(
        self,
        backend: LLMBackendConfig,
        model: str,
        request: ChatCompletionRequest,
    ) -> InferenceResult:
        """Calls an OpenAI-compatible API (vLLM, etc.)."""
        client = await self.get_client()

        headers = {"Content-Type": "application/json"}
        if backend.api_key:
            headers["Authorization"] = f"Bearer {backend.api_key}"

        payload = {
            "model": model,
            "messages": [m.model_dump(exclude_none=True) for m in request.messages],
            "stream": False,
            "temperature": request.temperature,
            "top_p": request.top_p,
        }

        if request.max_tokens:
            payload["max_tokens"] = request.max_tokens
        if request.stop:
            payload["stop"] = request.stop

        response = await client.post(
            f"{backend.base_url}/v1/chat/completions",
            json=payload,
            headers=headers,
            timeout=backend.timeout,
        )
        response.raise_for_status()
        data = response.json()

        choice = data.get("choices", [{}])[0]
        usage_data = data.get("usage", {})

        return InferenceResult(
            content=choice.get("message", {}).get("content", ""),
            model=model,
            backend=backend.name,
            usage=Usage(
                prompt_tokens=usage_data.get("prompt_tokens", 0),
                completion_tokens=usage_data.get("completion_tokens", 0),
                total_tokens=usage_data.get("total_tokens", 0),
            ),
            finish_reason=choice.get("finish_reason", "stop"),
        )

    async def _stream_openai_compatible(
        self,
        backend: LLMBackendConfig,
        model: str,
        request: ChatCompletionRequest,
        response_id: str,
    ) -> AsyncIterator[ChatCompletionChunk]:
        """Streams from an OpenAI-compatible API."""
        client = await self.get_client()

        headers = {"Content-Type": "application/json"}
        if backend.api_key:
            headers["Authorization"] = f"Bearer {backend.api_key}"

        payload = {
            "model": model,
            "messages": [m.model_dump(exclude_none=True) for m in request.messages],
            "stream": True,
            "temperature": request.temperature,
            "top_p": request.top_p,
        }

        if request.max_tokens:
            payload["max_tokens"] = request.max_tokens

        async with client.stream(
            "POST",
            f"{backend.base_url}/v1/chat/completions",
            json=payload,
            headers=headers,
            timeout=backend.timeout,
        ) as response:
            response.raise_for_status()
            async for line in response.aiter_lines():
                if not line or not line.startswith("data: "):
                    continue
                data_str = line[6:]  # Remove "data: " prefix
                if data_str == "[DONE]":
                    break
                try:
                    data = json.loads(data_str)
                    choice = data.get("choices", [{}])[0]
                    delta = choice.get("delta", {})

                    yield ChatCompletionChunk(
                        id=response_id,
                        model=model,
                        choices=[
                            StreamChoice(
                                index=0,
                                delta=ChatChoiceDelta(
                                    role=delta.get("role"),
                                    content=delta.get("content"),
                                ),
                                finish_reason=choice.get("finish_reason"),
                            )
                        ],
                    )
                except json.JSONDecodeError:
                    continue

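    # Note on _stream_openai_compatible above: OpenAI-compatible servers stream
    # Server-Sent Events, roughly of the form
    #   data: {"choices": [{"index": 0, "delta": {"content": "Hal"}}]}
    #   data: [DONE]
    # hence the "data: " prefix handling and the [DONE] sentinel.
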
    async def _call_anthropic(
        self,
        backend: LLMBackendConfig,
        model: str,
        request: ChatCompletionRequest,
    ) -> InferenceResult:
        """Calls the Anthropic Claude API."""
        # Use the Anthropic SDK (already installed)
        try:
            import anthropic
        except ImportError:
            raise ImportError("anthropic package required for Claude API")

        client = anthropic.AsyncAnthropic(api_key=backend.api_key)

        # Extract system messages
        system_content = ""
        messages = []
        for msg in request.messages:
            if msg.role == "system":
                system_content += (msg.content or "") + "\n"
            else:
                messages.append({"role": msg.role, "content": msg.content or ""})

        response = await client.messages.create(
            model=model,
            max_tokens=request.max_tokens or 4096,
            system=system_content.strip() if system_content else None,
            messages=messages,
            temperature=request.temperature,
            top_p=request.top_p,
        )

        content = ""
        if response.content:
            content = response.content[0].text if response.content[0].type == "text" else ""

        return InferenceResult(
            content=content,
            model=model,
            backend="anthropic",
            usage=Usage(
                prompt_tokens=response.usage.input_tokens,
                completion_tokens=response.usage.output_tokens,
                total_tokens=response.usage.input_tokens + response.usage.output_tokens,
            ),
            finish_reason="stop" if response.stop_reason == "end_turn" else response.stop_reason or "stop",
        )

    async def _stream_anthropic(
        self,
        backend: LLMBackendConfig,
        model: str,
        request: ChatCompletionRequest,
        response_id: str,
    ) -> AsyncIterator[ChatCompletionChunk]:
        """Streams from the Anthropic Claude API."""
        try:
            import anthropic
        except ImportError:
            raise ImportError("anthropic package required for Claude API")

        client = anthropic.AsyncAnthropic(api_key=backend.api_key)

        # Extract system messages
        system_content = ""
        messages = []
        for msg in request.messages:
            if msg.role == "system":
                system_content += (msg.content or "") + "\n"
            else:
                messages.append({"role": msg.role, "content": msg.content or ""})

        async with client.messages.stream(
            model=model,
            max_tokens=request.max_tokens or 4096,
            system=system_content.strip() if system_content else None,
            messages=messages,
            temperature=request.temperature,
            top_p=request.top_p,
        ) as stream:
            async for text in stream.text_stream:
                yield ChatCompletionChunk(
                    id=response_id,
                    model=model,
                    choices=[
                        StreamChoice(
                            index=0,
                            delta=ChatChoiceDelta(content=text),
                            finish_reason=None,
                        )
                    ],
                )

        # Final chunk with finish_reason
        yield ChatCompletionChunk(
            id=response_id,
            model=model,
            choices=[
                StreamChoice(
                    index=0,
                    delta=ChatChoiceDelta(),
                    finish_reason="stop",
                )
            ],
        )

    async def complete(self, request: ChatCompletionRequest) -> ChatCompletionResponse:
        """
        Performs a chat completion (non-streaming).
        """
        actual_model, backend = self._map_model_to_backend(request.model)

        logger.info(f"Inference request: model={request.model} → {actual_model} via {backend.name}")

        if backend.name == "ollama":
            result = await self._call_ollama(backend, actual_model, request)
        elif backend.name == "anthropic":
            result = await self._call_anthropic(backend, actual_model, request)
        else:
            result = await self._call_openai_compatible(backend, actual_model, request)

        return ChatCompletionResponse(
            model=request.model,  # Original requested model name
            choices=[
                ChatChoice(
                    index=0,
                    message=ChatMessage(role="assistant", content=result.content),
                    finish_reason=result.finish_reason,
                )
            ],
            usage=result.usage,
        )

    async def stream(self, request: ChatCompletionRequest) -> AsyncIterator[ChatCompletionChunk]:
        """
        Performs a chat completion with streaming.
        """
        import uuid
        response_id = f"chatcmpl-{uuid.uuid4().hex[:12]}"

        actual_model, backend = self._map_model_to_backend(request.model)

        logger.info(f"Streaming request: model={request.model} → {actual_model} via {backend.name}")

        if backend.name == "ollama":
            async for chunk in self._stream_ollama(backend, actual_model, request, response_id):
                yield chunk
        elif backend.name == "anthropic":
            async for chunk in self._stream_anthropic(backend, actual_model, request, response_id):
                yield chunk
        else:
            async for chunk in self._stream_openai_compatible(backend, actual_model, request, response_id):
                yield chunk

    async def list_models(self) -> ModelListResponse:
        """Lists the available models."""
        models = []

        # BreakPilot models (mapped to available backends)
        backend = self._get_available_backend()
        if backend:
            models.extend([
                ModelInfo(
                    id="breakpilot-teacher-8b",
                    owned_by="breakpilot",
                    description="Llama 3.1 8B optimiert für Schulkontext",
                    context_length=8192,
                ),
                ModelInfo(
                    id="breakpilot-teacher-70b",
                    owned_by="breakpilot",
                    description="Llama 3.1 70B für komplexe Aufgaben",
                    context_length=8192,
                ),
            ])

        # Claude models (if Anthropic is configured)
        if self.config.anthropic and self.config.anthropic.enabled:
            models.append(
                ModelInfo(
                    id="claude-3-5-sonnet",
                    owned_by="anthropic",
                    description="Claude 3.5 Sonnet - Fallback für höchste Qualität",
                    context_length=200000,
                )
            )

        return ModelListResponse(data=models)


# Singleton
_inference_service: Optional[InferenceService] = None


def get_inference_service() -> InferenceService:
    """Returns the InferenceService singleton."""
    global _inference_service
    if _inference_service is None:
        _inference_service = InferenceService()
    return _inference_service
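

# --- Illustrative usage sketch (not part of the original module) -------------
# A minimal demonstration of how a caller might drive the service. It assumes
# ChatCompletionRequest/ChatMessage can be constructed with only the fields
# shown (i.e. sensible defaults for temperature, top_p, etc.) and that at least
# one backend is enabled in the config; adjust to the actual model definitions.
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        service = get_inference_service()
        request = ChatCompletionRequest(
            model="breakpilot-teacher-8b",
            messages=[ChatMessage(role="user", content="Erkläre Photosynthese.")],
        )

        # Non-streaming completion
        response = await service.complete(request)
        print(response.choices[0].message.content)

        # Streaming completion
        async for chunk in service.stream(request):
            print(chunk.choices[0].delta.content or "", end="")

        await service.close()

    asyncio.run(_demo())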