"""
Inference service - communication with LLM backends.

Supported backends:
- Ollama (local)
- vLLM (remote, OpenAI-compatible)
- Anthropic Claude API (fallback)
"""

import logging
import uuid
from typing import AsyncIterator, Optional

import httpx

from ..config import get_config, LLMBackendConfig
from ..models.chat import (
    ChatCompletionRequest,
    ChatCompletionResponse,
    ChatCompletionChunk,
    ChatMessage,
    ChatChoice,
    Usage,
    ModelInfo,
    ModelListResponse,
)
from .inference_backends import (
    InferenceResult,
    call_ollama,
    stream_ollama,
    call_openai_compatible,
    stream_openai_compatible,
    call_anthropic,
    stream_anthropic,
)

logger = logging.getLogger(__name__)


class InferenceService:
    """Service for LLM inference across the configured backends."""

    def __init__(self):
        self.config = get_config()
        # Shared HTTP client; created on first use by get_client().
        self._client: Optional[httpx.AsyncClient] = None

    async def get_client(self) -> httpx.AsyncClient:
        """Return the shared HTTP client, creating it on first use.

        The client is created with a 120 second timeout.
        """
        if self._client is None:
            self._client = httpx.AsyncClient(timeout=120.0)
        return self._client

    async def close(self):
        """Close the shared HTTP client, if one was created.

        The reference is reset afterwards so that a later get_client()
        call builds a fresh client.
        """
        if self._client is not None:
            await self._client.aclose()
            self._client = None

    def _get_available_backend(
        self, preferred_model: Optional[str] = None
    ) -> Optional[LLMBackendConfig]:
        """Return the first enabled backend in priority order.

        Args:
            preferred_model: Currently unused; kept so existing callers
                that pass it keep working.

        Returns:
            The first backend named in ``config.backend_priority`` that
            exists on the config object and is enabled, or ``None`` when
            there is no such backend.
        """
        for backend_name in self.config.backend_priority:
            backend = getattr(self.config, backend_name, None)
            if backend and backend.enabled:
                return backend
        return None

    @staticmethod
    def _resolve_alias(
        backend: LLMBackendConfig, ollama_model: str, openai_model: str
    ) -> str:
        """Pick the backend-specific model id for a known model alias."""
        if backend.name == "ollama":
            return ollama_model
        if backend.name == "anthropic":
            # Llama/Mistral ids mean nothing to the Anthropic API; when
            # Anthropic is the selected backend use its default model.
            return backend.default_model
        return openai_model

    def _map_model_to_backend(self, model: str) -> tuple[str, LLMBackendConfig]:
        """Map a requested model name to a concrete model id and backend.

        Names containing "claude" always go to the Anthropic backend.
        Names containing "breakpilot"/"teacher" or "mistral" are aliases
        that are translated to the id used by the selected backend. Any
        other name is passed through unchanged to the selected backend.

        Args:
            model: Model name as sent by the client (case-insensitive).

        Returns:
            A ``(actual_model, backend)`` tuple.

        Raises:
            ValueError: If a Claude model is requested but the Anthropic
                backend is missing or disabled, or if no backend is
                available for any other model.
        """
        model_lower = model.lower()

        if "claude" in model_lower:
            anthropic = self.config.anthropic
            if anthropic and anthropic.enabled:
                return anthropic.default_model, anthropic
            raise ValueError("Anthropic backend not configured")

        # Every non-Claude request uses the highest-priority enabled backend.
        backend = self._get_available_backend()
        if backend is None:
            raise ValueError("No LLM backend available")

        if "breakpilot" in model_lower or "teacher" in model_lower:
            if "70b" in model_lower:
                actual_model = self._resolve_alias(
                    backend,
                    "llama3.1:70b",
                    "meta-llama/Meta-Llama-3.1-70B-Instruct",
                )
            else:
                actual_model = self._resolve_alias(
                    backend,
                    "llama3.1:8b",
                    "meta-llama/Meta-Llama-3.1-8B-Instruct",
                )
            return actual_model, backend

        if "mistral" in model_lower:
            actual_model = self._resolve_alias(
                backend,
                "mistral:7b",
                "mistralai/Mistral-7B-Instruct-v0.2",
            )
            return actual_model, backend

        # Unknown names are forwarded verbatim.
        return model, backend

    async def complete(self, request: ChatCompletionRequest) -> ChatCompletionResponse:
        """Run a non-streaming chat completion.

        Args:
            request: The chat completion request.

        Returns:
            A response with a single assistant choice. Its ``model`` field
            echoes ``request.model``, not the resolved backend model id.

        Raises:
            ValueError: If no backend can serve the requested model.
        """
        actual_model, backend = self._map_model_to_backend(request.model)
        logger.info(
            "Inference request: model=%s -> %s via %s",
            request.model,
            actual_model,
            backend.name,
        )

        if backend.name == "anthropic":
            # The Anthropic call does not take the shared HTTP client.
            result = await call_anthropic(backend, actual_model, request)
        else:
            client = await self.get_client()
            if backend.name == "ollama":
                result = await call_ollama(client, backend, actual_model, request)
            else:
                result = await call_openai_compatible(
                    client, backend, actual_model, request
                )

        return ChatCompletionResponse(
            model=request.model,
            choices=[
                ChatChoice(
                    index=0,
                    message=ChatMessage(role="assistant", content=result.content),
                    finish_reason=result.finish_reason,
                )
            ],
            usage=result.usage,
        )

    async def stream(self, request: ChatCompletionRequest) -> AsyncIterator[ChatCompletionChunk]:
        """Run a streaming chat completion.

        Args:
            request: The chat completion request.

        Yields:
            The chunks produced by the selected backend's stream helper.

        Raises:
            ValueError: If no backend can serve the requested model.
        """
        # One id per request, handed to the backend stream helper.
        response_id = f"chatcmpl-{uuid.uuid4().hex[:12]}"
        actual_model, backend = self._map_model_to_backend(request.model)
        logger.info(
            "Streaming request: model=%s -> %s via %s",
            request.model,
            actual_model,
            backend.name,
        )

        if backend.name == "anthropic":
            # The Anthropic stream does not take the shared HTTP client.
            chunks = stream_anthropic(backend, actual_model, request, response_id)
        else:
            client = await self.get_client()
            if backend.name == "ollama":
                chunks = stream_ollama(
                    client, backend, actual_model, request, response_id
                )
            else:
                chunks = stream_openai_compatible(
                    client, backend, actual_model, request, response_id
                )

        async for chunk in chunks:
            yield chunk

    async def list_models(self) -> ModelListResponse:
        """List the models this service advertises.

        The two BreakPilot aliases are listed when any backend is
        available; the Claude entry is listed when the Anthropic backend
        is configured and enabled.
        """
        models = []

        if self._get_available_backend():
            models.extend([
                ModelInfo(
                    id="breakpilot-teacher-8b",
                    owned_by="breakpilot",
                    description="Llama 3.1 8B optimiert für Schulkontext",
                    context_length=8192,
                ),
                ModelInfo(
                    id="breakpilot-teacher-70b",
                    owned_by="breakpilot",
                    description="Llama 3.1 70B für komplexe Aufgaben",
                    context_length=8192,
                ),
            ])

        if self.config.anthropic and self.config.anthropic.enabled:
            models.append(
                ModelInfo(
                    id="claude-3-5-sonnet",
                    owned_by="anthropic",
                    description="Claude 3.5 Sonnet - Fallback für höchste Qualität",
                    context_length=200000,
                )
            )

        return ModelListResponse(data=models)


# Module-level singleton, created on first access.
_inference_service: Optional[InferenceService] = None


def get_inference_service() -> InferenceService:
    """Return the process-wide InferenceService, creating it on first call."""
    global _inference_service
    if _inference_service is None:
        _inference_service = InferenceService()
    return _inference_service