klausur-service (11 files): - cv_gutter_repair, ocr_pipeline_regression, upload_api - ocr_pipeline_sessions, smart_spell, nru_worksheet_generator - ocr_pipeline_overlays, mail/aggregator, zeugnis_api - cv_syllable_detect, self_rag backend-lehrer (17 files): - classroom_engine/suggestions, generators/quiz_generator - worksheets_api, llm_gateway/comparison, state_engine_api - classroom/models (→ 4 submodules), services/file_processor - alerts_agent/api/wizard+digests+routes, content_generators/pdf - classroom/routes/sessions, llm_gateway/inference - classroom_engine/analytics, auth/keycloak_auth - alerts_agent/processing/rule_engine, ai_processor/print_versions agent-core (5 files): - brain/memory_store, brain/knowledge_graph, brain/context_manager - orchestrator/supervisor, sessions/session_manager admin-lehrer (5 components): - GridOverlay, StepGridReview, DevOpsPipelineSidebar - DataFlowDiagram, sbom/wizard/page website (2 files): - DependencyMap, lehrer/abitur-archiv Other: nibis_ingestion, grid_detection_service, export-doclayout-onnx Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
163 lines
6.1 KiB
Python
163 lines
6.1 KiB
Python
"""
|
|
Inference Service - Kommunikation mit LLM Backends.
|
|
|
|
Unterstützt:
|
|
- Ollama (lokal)
|
|
- vLLM (remote, OpenAI-kompatibel)
|
|
- Anthropic Claude API (Fallback)
|
|
"""
|
|
|
|
import httpx
|
|
import logging
|
|
from typing import AsyncIterator, Optional
|
|
|
|
from ..config import get_config, LLMBackendConfig
|
|
from ..models.chat import (
|
|
ChatCompletionRequest,
|
|
ChatCompletionResponse,
|
|
ChatCompletionChunk,
|
|
ChatMessage,
|
|
ChatChoice,
|
|
Usage,
|
|
ModelInfo,
|
|
ModelListResponse,
|
|
)
|
|
from .inference_backends import (
|
|
InferenceResult,
|
|
call_ollama,
|
|
stream_ollama,
|
|
call_openai_compatible,
|
|
stream_openai_compatible,
|
|
call_anthropic,
|
|
stream_anthropic,
|
|
)
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class InferenceService:
    """Service for LLM inference across multiple backends.

    Resolves a public model name to a concrete backend (Ollama,
    OpenAI-compatible/vLLM, or Anthropic) according to the configured
    backend priority, then dispatches completion/streaming calls to the
    matching backend helper from ``inference_backends``.
    """

    def __init__(self):
        self.config = get_config()
        # HTTP client is created lazily on first use (see get_client).
        self._client: Optional[httpx.AsyncClient] = None

    async def get_client(self) -> httpx.AsyncClient:
        """Lazily initialize and return the shared async HTTP client."""
        if self._client is None:
            # Generous 120s timeout: large-model completions can be slow.
            self._client = httpx.AsyncClient(timeout=120.0)
        return self._client

    async def close(self):
        """Close the HTTP client and reset it so it can be re-created."""
        if self._client:
            await self._client.aclose()
            self._client = None

    def _get_available_backend(self, preferred_model: Optional[str] = None) -> Optional[LLMBackendConfig]:
        """Return the first enabled backend in configured priority order.

        Args:
            preferred_model: Currently unused; kept for backward
                compatibility with existing callers.

        Returns:
            The first enabled ``LLMBackendConfig``, or ``None`` if no
            backend is enabled.
        """
        for backend_name in self.config.backend_priority:
            backend = getattr(self.config, backend_name, None)
            if backend and backend.enabled:
                return backend
        return None

    def _require_backend(self) -> LLMBackendConfig:
        """Return an available backend or raise ``ValueError``."""
        backend = self._get_available_backend()
        if backend is None:
            raise ValueError("No LLM backend available")
        return backend

    def _map_model_to_backend(self, model: str) -> tuple[str, LLMBackendConfig]:
        """Map a public model name to ``(actual_model_id, backend_config)``.

        Args:
            model: Public model name as requested by the client.

        Raises:
            ValueError: If no suitable backend is configured/enabled.
        """
        model_lower = model.lower()

        if "claude" in model_lower:
            if self.config.anthropic and self.config.anthropic.enabled:
                return self.config.anthropic.default_model, self.config.anthropic
            raise ValueError("Anthropic backend not configured")

        if "breakpilot" in model_lower or "teacher" in model_lower:
            backend = self._require_backend()
            # Ollama uses its own model tags; OpenAI-compatible backends
            # (e.g. vLLM) use HuggingFace model ids.
            if "70b" in model_lower:
                actual_model = "llama3.1:70b" if backend.name == "ollama" else "meta-llama/Meta-Llama-3.1-70B-Instruct"
            else:
                actual_model = "llama3.1:8b" if backend.name == "ollama" else "meta-llama/Meta-Llama-3.1-8B-Instruct"
            return actual_model, backend

        if "mistral" in model_lower:
            backend = self._require_backend()
            actual_model = "mistral:7b" if backend.name == "ollama" else "mistralai/Mistral-7B-Instruct-v0.2"
            return actual_model, backend

        # Unknown model names are passed through verbatim to the first
        # available backend.
        return model, self._require_backend()

    async def complete(self, request: ChatCompletionRequest) -> ChatCompletionResponse:
        """Run a non-streaming chat completion.

        Args:
            request: OpenAI-style chat completion request.

        Returns:
            A ``ChatCompletionResponse`` echoing the requested (public)
            model name, with the backend's answer as a single choice.

        Raises:
            ValueError: If no backend can serve the requested model.
        """
        actual_model, backend = self._map_model_to_backend(request.model)
        logger.info("Inference request: model=%s -> %s via %s", request.model, actual_model, backend.name)

        client = await self.get_client()

        if backend.name == "ollama":
            result = await call_ollama(client, backend, actual_model, request)
        elif backend.name == "anthropic":
            # The Anthropic helper manages its own HTTP connection.
            result = await call_anthropic(backend, actual_model, request)
        else:
            result = await call_openai_compatible(client, backend, actual_model, request)

        return ChatCompletionResponse(
            model=request.model,
            choices=[
                ChatChoice(
                    index=0,
                    message=ChatMessage(role="assistant", content=result.content),
                    finish_reason=result.finish_reason,
                )
            ],
            usage=result.usage,
        )

    async def stream(self, request: ChatCompletionRequest) -> AsyncIterator[ChatCompletionChunk]:
        """Run a streaming chat completion, yielding OpenAI-style chunks.

        Args:
            request: OpenAI-style chat completion request.

        Yields:
            ``ChatCompletionChunk`` objects produced by the backend.

        Raises:
            ValueError: If no backend can serve the requested model
                (surfaces on first iteration, as this is a generator).
        """
        import uuid

        # Mirrors OpenAI's "chatcmpl-..." response id format.
        response_id = f"chatcmpl-{uuid.uuid4().hex[:12]}"

        actual_model, backend = self._map_model_to_backend(request.model)
        logger.info("Streaming request: model=%s -> %s via %s", request.model, actual_model, backend.name)

        client = await self.get_client()

        if backend.name == "ollama":
            async for chunk in stream_ollama(client, backend, actual_model, request, response_id):
                yield chunk
        elif backend.name == "anthropic":
            async for chunk in stream_anthropic(backend, actual_model, request, response_id):
                yield chunk
        else:
            async for chunk in stream_openai_compatible(client, backend, actual_model, request, response_id):
                yield chunk

    async def list_models(self) -> ModelListResponse:
        """List the models this gateway currently advertises.

        The "breakpilot-teacher-*" aliases are only advertised when at
        least one general backend is enabled; the Claude entry only when
        the Anthropic fallback is configured.
        """
        models: list[ModelInfo] = []

        if self._get_available_backend() is not None:
            models.extend([
                ModelInfo(id="breakpilot-teacher-8b", owned_by="breakpilot", description="Llama 3.1 8B optimiert für Schulkontext", context_length=8192),
                ModelInfo(id="breakpilot-teacher-70b", owned_by="breakpilot", description="Llama 3.1 70B für komplexe Aufgaben", context_length=8192),
            ])

        if self.config.anthropic and self.config.anthropic.enabled:
            models.append(ModelInfo(id="claude-3-5-sonnet", owned_by="anthropic", description="Claude 3.5 Sonnet - Fallback für höchste Qualität", context_length=200000))

        return ModelListResponse(data=models)
|
|
|
|
|
|
# Module-level singleton holder
_inference_service: Optional[InferenceService] = None


def get_inference_service() -> InferenceService:
    """Return the process-wide InferenceService singleton, creating it lazily."""
    global _inference_service
    if _inference_service is not None:
        return _inference_service
    _inference_service = InferenceService()
    return _inference_service
|