"""
Inference service - communication with LLM backends.

Supported backends:
- Ollama (local)
- vLLM (remote, OpenAI-compatible)
- Anthropic Claude API (fallback)
"""

import json
import logging
import uuid
from dataclasses import dataclass
from typing import Any, AsyncIterator, Optional

import httpx

from ..config import get_config, LLMBackendConfig
from ..models.chat import (
    ChatCompletionRequest,
    ChatCompletionResponse,
    ChatCompletionChunk,
    ChatMessage,
    ChatChoice,
    StreamChoice,
    ChatChoiceDelta,
    Usage,
    ModelInfo,
    ModelListResponse,
)

logger = logging.getLogger(__name__)

# Translation of Anthropic stop reasons into OpenAI-style finish reasons.
# Reasons that are not listed here are passed through unchanged.
_ANTHROPIC_FINISH_REASONS = {
    "end_turn": "stop",
    "stop_sequence": "stop",
    "max_tokens": "length",
}


@dataclass
class InferenceResult:
    """Result of a single (non-streaming) inference request."""

    content: str
    model: str
    backend: str
    usage: Optional[Usage] = None
    finish_reason: str = "stop"


class InferenceService:
    """Service for LLM inference across the configured backends."""

    def __init__(self):
        self.config = get_config()
        self._client: Optional[httpx.AsyncClient] = None

    async def get_client(self) -> httpx.AsyncClient:
        """Return the shared HTTP client, creating it on first use."""
        if self._client is None:
            self._client = httpx.AsyncClient(timeout=120.0)
        return self._client

    async def close(self):
        """Close the shared HTTP client, if one was created."""
        if self._client is not None:
            await self._client.aclose()
            self._client = None

    def _get_available_backend(
        self, preferred_model: Optional[str] = None
    ) -> Optional[LLMBackendConfig]:
        """
        Return the first enabled backend in ``config.backend_priority`` order.

        Args:
            preferred_model: Accepted for interface compatibility; it is not
                used for the selection.

        Returns:
            The first backend config that exists and is enabled, or ``None``
            when no backend qualifies.
        """
        for backend_name in self.config.backend_priority:
            backend = getattr(self.config, backend_name, None)
            if backend and backend.enabled:
                return backend
        return None

    def _map_model_to_backend(self, model: str) -> tuple[str, LLMBackendConfig]:
        """
        Map a requested model name to a concrete model name and backend.

        Examples:
        - "breakpilot-teacher-8b" -> Ollama/vLLM with llama3.1:8b
        - "claude-3-5-sonnet" -> Anthropic

        Args:
            model: Model name as sent by the caller.

        Returns:
            Tuple of (model name understood by the backend, backend config).

        Raises:
            ValueError: If a Claude model is requested but the Anthropic
                backend is not enabled, or if no backend is available.
        """
        model_lower = model.lower()

        # Explicit Claude models always go to Anthropic.
        if "claude" in model_lower:
            anthropic_cfg = self.config.anthropic
            if anthropic_cfg and anthropic_cfg.enabled:
                return anthropic_cfg.default_model, anthropic_cfg
            raise ValueError("Anthropic backend not configured")

        backend = self._get_available_backend()
        if not backend:
            raise ValueError("No LLM backend available")

        # If the only/first available backend is Anthropic (the fallback),
        # Llama/Mistral model names would be rejected by its API, so use the
        # backend's own default model instead.
        if backend.name == "anthropic":
            return backend.default_model, backend

        is_ollama = backend.name == "ollama"

        # BreakPilot aliases -> Llama 3.1 on the primary backend.
        if "breakpilot" in model_lower or "teacher" in model_lower:
            if "70b" in model_lower:
                actual_model = (
                    "llama3.1:70b" if is_ollama
                    else "meta-llama/Meta-Llama-3.1-70B-Instruct"
                )
            else:
                actual_model = (
                    "llama3.1:8b" if is_ollama
                    else "meta-llama/Meta-Llama-3.1-8B-Instruct"
                )
            return actual_model, backend

        # Mistral models.
        if "mistral" in model_lower:
            actual_model = (
                "mistral:7b" if is_ollama
                else "mistralai/Mistral-7B-Instruct-v0.2"
            )
            return actual_model, backend

        # Fallback: pass the requested model name through unchanged.
        return model, backend

    # ------------------------------------------------------------------
    # Ollama
    # ------------------------------------------------------------------

    @staticmethod
    def _ollama_payload(
        model: str, request: ChatCompletionRequest, stream: bool
    ) -> dict[str, Any]:
        """Build the request body for Ollama's native ``/api/chat`` endpoint."""
        payload: dict[str, Any] = {
            "model": model,
            "messages": [
                {"role": m.role, "content": m.content or ""}
                for m in request.messages
            ],
            "stream": stream,
            "options": {
                "temperature": request.temperature,
                "top_p": request.top_p,
            },
        }
        if request.max_tokens:
            payload["options"]["num_predict"] = request.max_tokens
        return payload

    @staticmethod
    def _ollama_finish_reason(data: dict) -> Optional[str]:
        """
        Derive a finish reason from an Ollama response object.

        Returns ``None`` while the response is not marked ``done``. Once it is
        done, ``done_reason == "length"`` is reported as "length"; anything
        else (including a missing ``done_reason``) is reported as "stop".
        """
        if not data.get("done"):
            return None
        return "length" if data.get("done_reason") == "length" else "stop"

    async def _call_ollama(
        self,
        backend: LLMBackendConfig,
        model: str,
        request: ChatCompletionRequest,
    ) -> InferenceResult:
        """
        Call the Ollama chat API (native format, not OpenAI-compatible).

        Raises:
            httpx.HTTPStatusError: If Ollama answers with an error status.
        """
        client = await self.get_client()
        response = await client.post(
            f"{backend.base_url}/api/chat",
            json=self._ollama_payload(model, request, stream=False),
            timeout=backend.timeout,
        )
        response.raise_for_status()
        data = response.json()

        prompt_tokens = data.get("prompt_eval_count", 0)
        completion_tokens = data.get("eval_count", 0)
        return InferenceResult(
            content=(data.get("message") or {}).get("content", ""),
            model=model,
            backend="ollama",
            usage=Usage(
                prompt_tokens=prompt_tokens,
                completion_tokens=completion_tokens,
                total_tokens=prompt_tokens + completion_tokens,
            ),
            # A response that is not marked done is treated as truncated.
            finish_reason=self._ollama_finish_reason(data) or "length",
        )

    async def _stream_ollama(
        self,
        backend: LLMBackendConfig,
        model: str,
        request: ChatCompletionRequest,
        response_id: str,
    ) -> AsyncIterator[ChatCompletionChunk]:
        """
        Stream a chat completion from Ollama.

        Ollama sends one JSON object per line; lines that are empty or not
        valid JSON are skipped.

        Raises:
            httpx.HTTPStatusError: If Ollama answers with an error status.
        """
        client = await self.get_client()
        async with client.stream(
            "POST",
            f"{backend.base_url}/api/chat",
            json=self._ollama_payload(model, request, stream=True),
            timeout=backend.timeout,
        ) as response:
            response.raise_for_status()
            async for line in response.aiter_lines():
                if not line:
                    continue
                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    continue
                yield ChatCompletionChunk(
                    id=response_id,
                    model=model,
                    choices=[
                        StreamChoice(
                            index=0,
                            delta=ChatChoiceDelta(
                                content=(data.get("message") or {}).get("content", ""),
                            ),
                            finish_reason=self._ollama_finish_reason(data),
                        )
                    ],
                )

    # ------------------------------------------------------------------
    # OpenAI-compatible (vLLM, etc.)
    # ------------------------------------------------------------------

    @staticmethod
    def _openai_headers(backend: LLMBackendConfig) -> dict[str, str]:
        """Build HTTP headers, adding a bearer token when an API key is set."""
        headers = {"Content-Type": "application/json"}
        if backend.api_key:
            headers["Authorization"] = f"Bearer {backend.api_key}"
        return headers

    @staticmethod
    def _openai_payload(
        model: str, request: ChatCompletionRequest, stream: bool
    ) -> dict[str, Any]:
        """Build the request body for ``/v1/chat/completions``."""
        payload: dict[str, Any] = {
            "model": model,
            "messages": [m.model_dump(exclude_none=True) for m in request.messages],
            "stream": stream,
            "temperature": request.temperature,
            "top_p": request.top_p,
        }
        if request.max_tokens:
            payload["max_tokens"] = request.max_tokens
        # Forwarded for both streaming and non-streaming requests so that the
        # two paths honor stop sequences consistently.
        if request.stop:
            payload["stop"] = request.stop
        return payload

    async def _call_openai_compatible(
        self,
        backend: LLMBackendConfig,
        model: str,
        request: ChatCompletionRequest,
    ) -> InferenceResult:
        """
        Call an OpenAI-compatible chat completions API (vLLM, etc.).

        A missing or empty ``choices`` list yields an empty result instead of
        raising, and a ``null`` message content is normalized to "".

        Raises:
            httpx.HTTPStatusError: If the backend answers with an error status.
        """
        client = await self.get_client()
        response = await client.post(
            f"{backend.base_url}/v1/chat/completions",
            json=self._openai_payload(model, request, stream=False),
            headers=self._openai_headers(backend),
            timeout=backend.timeout,
        )
        response.raise_for_status()
        data = response.json()

        # `or` also covers keys that are present but null/empty.
        choice = (data.get("choices") or [{}])[0]
        message = choice.get("message") or {}
        usage_data = data.get("usage") or {}

        return InferenceResult(
            content=message.get("content") or "",
            model=model,
            backend=backend.name,
            usage=Usage(
                prompt_tokens=usage_data.get("prompt_tokens", 0),
                completion_tokens=usage_data.get("completion_tokens", 0),
                total_tokens=usage_data.get("total_tokens", 0),
            ),
            finish_reason=choice.get("finish_reason") or "stop",
        )

    async def _stream_openai_compatible(
        self,
        backend: LLMBackendConfig,
        model: str,
        request: ChatCompletionRequest,
        response_id: str,
    ) -> AsyncIterator[ChatCompletionChunk]:
        """
        Stream a chat completion from an OpenAI-compatible API.

        Parses server-sent events: only ``data:`` lines are considered, the
        ``[DONE]`` sentinel ends the stream, and events that are not valid
        JSON or carry no choices (e.g. usage-only events) are skipped.

        Raises:
            httpx.HTTPStatusError: If the backend answers with an error status.
        """
        client = await self.get_client()
        async with client.stream(
            "POST",
            f"{backend.base_url}/v1/chat/completions",
            json=self._openai_payload(model, request, stream=True),
            headers=self._openai_headers(backend),
            timeout=backend.timeout,
        ) as response:
            response.raise_for_status()
            async for line in response.aiter_lines():
                if not line.startswith("data:"):
                    continue
                # The space after "data:" is optional in SSE.
                data_str = line[len("data:"):].strip()
                if data_str == "[DONE]":
                    break
                try:
                    data = json.loads(data_str)
                except json.JSONDecodeError:
                    continue

                choices = data.get("choices")
                if not choices:
                    continue
                choice = choices[0]
                delta = choice.get("delta") or {}

                yield ChatCompletionChunk(
                    id=response_id,
                    model=model,
                    choices=[
                        StreamChoice(
                            index=0,
                            delta=ChatChoiceDelta(
                                role=delta.get("role"),
                                content=delta.get("content"),
                            ),
                            finish_reason=choice.get("finish_reason"),
                        )
                    ],
                )

    # ------------------------------------------------------------------
    # Anthropic
    # ------------------------------------------------------------------

    @staticmethod
    def _import_anthropic():
        """Import and return the optional ``anthropic`` package."""
        try:
            import anthropic
        except ImportError as err:
            raise ImportError("anthropic package required for Claude API") from err
        return anthropic

    @staticmethod
    def _anthropic_params(model: str, request: ChatCompletionRequest) -> dict[str, Any]:
        """
        Build keyword arguments for the Anthropic Messages API.

        System messages are pulled out of the conversation and joined with
        newlines, because Anthropic takes the system prompt as a separate
        parameter. Optional parameters are only included when they carry a
        value, so that no explicit ``None`` is sent.
        """
        system_parts = []
        messages = []
        for msg in request.messages:
            if msg.role == "system":
                system_parts.append(msg.content or "")
            else:
                messages.append({"role": msg.role, "content": msg.content or ""})

        params: dict[str, Any] = {
            "model": model,
            "max_tokens": request.max_tokens or 4096,
            "messages": messages,
        }
        system_content = "\n".join(system_parts).strip()
        if system_content:
            params["system"] = system_content
        # NOTE(review): some Claude models may reject requests that set both
        # temperature and top_p - confirm against the deployed model.
        if request.temperature is not None:
            params["temperature"] = request.temperature
        if request.top_p is not None:
            params["top_p"] = request.top_p
        return params

    @staticmethod
    def _anthropic_finish_reason(stop_reason: Optional[str]) -> str:
        """Translate an Anthropic stop reason to an OpenAI-style finish reason."""
        return _ANTHROPIC_FINISH_REASONS.get(stop_reason, stop_reason) or "stop"

    async def _call_anthropic(
        self,
        backend: LLMBackendConfig,
        model: str,
        request: ChatCompletionRequest,
    ) -> InferenceResult:
        """
        Call the Anthropic Claude Messages API.

        The text of all text blocks in the response is concatenated; blocks
        of other types are ignored.

        Raises:
            ImportError: If the ``anthropic`` package is not installed.
        """
        anthropic = self._import_anthropic()

        # The context manager closes the SDK client's HTTP resources.
        async with anthropic.AsyncAnthropic(api_key=backend.api_key) as client:
            response = await client.messages.create(
                **self._anthropic_params(model, request)
            )

        content = "".join(
            block.text
            for block in (response.content or [])
            if getattr(block, "type", None) == "text"
        )

        return InferenceResult(
            content=content,
            model=model,
            backend="anthropic",
            usage=Usage(
                prompt_tokens=response.usage.input_tokens,
                completion_tokens=response.usage.output_tokens,
                total_tokens=response.usage.input_tokens + response.usage.output_tokens,
            ),
            finish_reason=self._anthropic_finish_reason(response.stop_reason),
        )

    async def _stream_anthropic(
        self,
        backend: LLMBackendConfig,
        model: str,
        request: ChatCompletionRequest,
        response_id: str,
    ) -> AsyncIterator[ChatCompletionChunk]:
        """
        Stream a chat completion from the Anthropic Claude API.

        Yields one chunk per text delta, followed by a final chunk with an
        empty delta that carries the finish reason.

        Raises:
            ImportError: If the ``anthropic`` package is not installed.
        """
        anthropic = self._import_anthropic()

        async with anthropic.AsyncAnthropic(api_key=backend.api_key) as client:
            async with client.messages.stream(
                **self._anthropic_params(model, request)
            ) as stream:
                async for text in stream.text_stream:
                    yield ChatCompletionChunk(
                        id=response_id,
                        model=model,
                        choices=[
                            StreamChoice(
                                index=0,
                                delta=ChatChoiceDelta(content=text),
                                finish_reason=None,
                            )
                        ],
                    )
                final_message = await stream.get_final_message()

        # Final chunk with finish_reason.
        yield ChatCompletionChunk(
            id=response_id,
            model=model,
            choices=[
                StreamChoice(
                    index=0,
                    delta=ChatChoiceDelta(),
                    finish_reason=self._anthropic_finish_reason(
                        final_message.stop_reason
                    ),
                )
            ],
        )

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    async def complete(self, request: ChatCompletionRequest) -> ChatCompletionResponse:
        """
        Run a non-streaming chat completion.

        The response reports the originally requested model name, not the
        backend-specific one.

        Raises:
            ValueError: If no suitable backend is available.
        """
        actual_model, backend = self._map_model_to_backend(request.model)

        logger.info(
            "Inference request: model=%s → %s via %s",
            request.model, actual_model, backend.name,
        )

        if backend.name == "ollama":
            result = await self._call_ollama(backend, actual_model, request)
        elif backend.name == "anthropic":
            result = await self._call_anthropic(backend, actual_model, request)
        else:
            result = await self._call_openai_compatible(backend, actual_model, request)

        return ChatCompletionResponse(
            model=request.model,  # Original requested model name
            choices=[
                ChatChoice(
                    index=0,
                    message=ChatMessage(role="assistant", content=result.content),
                    finish_reason=result.finish_reason,
                )
            ],
            usage=result.usage,
        )

    async def stream(self, request: ChatCompletionRequest) -> AsyncIterator[ChatCompletionChunk]:
        """
        Run a chat completion with streaming.

        All chunks of one call share a generated ``chatcmpl-...`` id.

        Raises:
            ValueError: If no suitable backend is available (raised when
                iteration starts).
        """
        response_id = f"chatcmpl-{uuid.uuid4().hex[:12]}"
        actual_model, backend = self._map_model_to_backend(request.model)

        logger.info(
            "Streaming request: model=%s → %s via %s",
            request.model, actual_model, backend.name,
        )

        if backend.name == "ollama":
            chunks = self._stream_ollama(backend, actual_model, request, response_id)
        elif backend.name == "anthropic":
            chunks = self._stream_anthropic(backend, actual_model, request, response_id)
        else:
            chunks = self._stream_openai_compatible(backend, actual_model, request, response_id)

        async for chunk in chunks:
            yield chunk

    async def list_models(self) -> ModelListResponse:
        """List the models that can currently be served."""
        models = []

        # BreakPilot models (mapped onto whichever backend is available).
        if self._get_available_backend():
            models.extend([
                ModelInfo(
                    id="breakpilot-teacher-8b",
                    owned_by="breakpilot",
                    description="Llama 3.1 8B optimiert für Schulkontext",
                    context_length=8192,
                ),
                ModelInfo(
                    id="breakpilot-teacher-70b",
                    owned_by="breakpilot",
                    description="Llama 3.1 70B für komplexe Aufgaben",
                    context_length=8192,
                ),
            ])

        # Claude models (only when Anthropic is configured and enabled).
        if self.config.anthropic and self.config.anthropic.enabled:
            models.append(
                ModelInfo(
                    id="claude-3-5-sonnet",
                    owned_by="anthropic",
                    description="Claude 3.5 Sonnet - Fallback für höchste Qualität",
                    context_length=200000,
                )
            )

        return ModelListResponse(data=models)


# Singleton
_inference_service: Optional[InferenceService] = None


def get_inference_service() -> InferenceService:
    """Return the process-wide InferenceService, creating it on first use."""
    global _inference_service
    if _inference_service is None:
        _inference_service = InferenceService()
    return _inference_service