""" Inference Backends - Kommunikation mit einzelnen LLM-Providern. Unterstützt Ollama, OpenAI-kompatible APIs und Anthropic Claude. """ import json import logging from typing import AsyncIterator, Optional from dataclasses import dataclass from ..config import LLMBackendConfig from ..models.chat import ( ChatCompletionRequest, ChatCompletionChunk, ChatMessage, StreamChoice, ChatChoiceDelta, Usage, ) logger = logging.getLogger(__name__) @dataclass class InferenceResult: """Ergebnis einer Inference-Anfrage.""" content: str model: str backend: str usage: Optional[Usage] = None finish_reason: str = "stop" async def call_ollama(client, backend: LLMBackendConfig, model: str, request: ChatCompletionRequest) -> InferenceResult: """Ruft Ollama API auf (nicht OpenAI-kompatibel).""" messages = [{"role": m.role, "content": m.content or ""} for m in request.messages] payload = { "model": model, "messages": messages, "stream": False, "options": {"temperature": request.temperature, "top_p": request.top_p}, } if request.max_tokens: payload["options"]["num_predict"] = request.max_tokens response = await client.post(f"{backend.base_url}/api/chat", json=payload, timeout=backend.timeout) response.raise_for_status() data = response.json() return InferenceResult( content=data.get("message", {}).get("content", ""), model=model, backend="ollama", usage=Usage( prompt_tokens=data.get("prompt_eval_count", 0), completion_tokens=data.get("eval_count", 0), total_tokens=data.get("prompt_eval_count", 0) + data.get("eval_count", 0), ), finish_reason="stop" if data.get("done") else "length", ) async def stream_ollama(client, backend, model, request, response_id) -> AsyncIterator[ChatCompletionChunk]: """Streamt von Ollama.""" messages = [{"role": m.role, "content": m.content or ""} for m in request.messages] payload = { "model": model, "messages": messages, "stream": True, "options": {"temperature": request.temperature, "top_p": request.top_p}, } if request.max_tokens: payload["options"]["num_predict"] = request.max_tokens async with client.stream("POST", f"{backend.base_url}/api/chat", json=payload, timeout=backend.timeout) as response: response.raise_for_status() async for line in response.aiter_lines(): if not line: continue try: data = json.loads(line) content = data.get("message", {}).get("content", "") done = data.get("done", False) yield ChatCompletionChunk( id=response_id, model=model, choices=[StreamChoice(index=0, delta=ChatChoiceDelta(content=content), finish_reason="stop" if done else None)], ) except json.JSONDecodeError: continue async def call_openai_compatible(client, backend, model, request) -> InferenceResult: """Ruft OpenAI-kompatible API auf (vLLM, etc.).""" headers = {"Content-Type": "application/json"} if backend.api_key: headers["Authorization"] = f"Bearer {backend.api_key}" payload = { "model": model, "messages": [m.model_dump(exclude_none=True) for m in request.messages], "stream": False, "temperature": request.temperature, "top_p": request.top_p, } if request.max_tokens: payload["max_tokens"] = request.max_tokens if request.stop: payload["stop"] = request.stop response = await client.post(f"{backend.base_url}/v1/chat/completions", json=payload, headers=headers, timeout=backend.timeout) response.raise_for_status() data = response.json() choice = data.get("choices", [{}])[0] usage_data = data.get("usage", {}) return InferenceResult( content=choice.get("message", {}).get("content", ""), model=model, backend=backend.name, usage=Usage( prompt_tokens=usage_data.get("prompt_tokens", 0), completion_tokens=usage_data.get("completion_tokens", 0), total_tokens=usage_data.get("total_tokens", 0), ), finish_reason=choice.get("finish_reason", "stop"), ) async def stream_openai_compatible(client, backend, model, request, response_id) -> AsyncIterator[ChatCompletionChunk]: """Streamt von OpenAI-kompatibler API.""" headers = {"Content-Type": "application/json"} if backend.api_key: headers["Authorization"] = f"Bearer {backend.api_key}" payload = { "model": model, "messages": [m.model_dump(exclude_none=True) for m in request.messages], "stream": True, "temperature": request.temperature, "top_p": request.top_p, } if request.max_tokens: payload["max_tokens"] = request.max_tokens async with client.stream("POST", f"{backend.base_url}/v1/chat/completions", json=payload, headers=headers, timeout=backend.timeout) as response: response.raise_for_status() async for line in response.aiter_lines(): if not line or not line.startswith("data: "): continue data_str = line[6:] if data_str == "[DONE]": break try: data = json.loads(data_str) choice = data.get("choices", [{}])[0] delta = choice.get("delta", {}) yield ChatCompletionChunk( id=response_id, model=model, choices=[StreamChoice(index=0, delta=ChatChoiceDelta(role=delta.get("role"), content=delta.get("content")), finish_reason=choice.get("finish_reason"))], ) except json.JSONDecodeError: continue async def call_anthropic(backend, model, request) -> InferenceResult: """Ruft Anthropic Claude API auf.""" try: import anthropic except ImportError: raise ImportError("anthropic package required for Claude API") client = anthropic.AsyncAnthropic(api_key=backend.api_key) system_content = "" messages = [] for msg in request.messages: if msg.role == "system": system_content += (msg.content or "") + "\n" else: messages.append({"role": msg.role, "content": msg.content or ""}) response = await client.messages.create( model=model, max_tokens=request.max_tokens or 4096, system=system_content.strip() if system_content else None, messages=messages, temperature=request.temperature, top_p=request.top_p, ) content = "" if response.content: content = response.content[0].text if response.content[0].type == "text" else "" return InferenceResult( content=content, model=model, backend="anthropic", usage=Usage( prompt_tokens=response.usage.input_tokens, completion_tokens=response.usage.output_tokens, total_tokens=response.usage.input_tokens + response.usage.output_tokens, ), finish_reason="stop" if response.stop_reason == "end_turn" else response.stop_reason or "stop", ) async def stream_anthropic(backend, model, request, response_id) -> AsyncIterator[ChatCompletionChunk]: """Streamt von Anthropic Claude API.""" try: import anthropic except ImportError: raise ImportError("anthropic package required for Claude API") client = anthropic.AsyncAnthropic(api_key=backend.api_key) system_content = "" messages = [] for msg in request.messages: if msg.role == "system": system_content += (msg.content or "") + "\n" else: messages.append({"role": msg.role, "content": msg.content or ""}) async with client.messages.stream( model=model, max_tokens=request.max_tokens or 4096, system=system_content.strip() if system_content else None, messages=messages, temperature=request.temperature, top_p=request.top_p, ) as stream: async for text in stream.text_stream: yield ChatCompletionChunk( id=response_id, model=model, choices=[StreamChoice(index=0, delta=ChatChoiceDelta(content=text), finish_reason=None)], ) yield ChatCompletionChunk( id=response_id, model=model, choices=[StreamChoice(index=0, delta=ChatChoiceDelta(), finish_reason="stop")], )