# breakpilot-lehrer/backend-lehrer/llm_gateway/services/inference_backends.py

"""
Inference Backends - Kommunikation mit einzelnen LLM-Providern.

Unterstützt Ollama, OpenAI-kompatible APIs und Anthropic Claude.
"""

import json
import logging
from typing import AsyncIterator, Optional
from dataclasses import dataclass

from ..config import LLMBackendConfig
from ..models.chat import (
    ChatCompletionRequest,
    ChatCompletionChunk,
    ChatMessage,
    StreamChoice,
    ChatChoiceDelta,
    Usage,
)

logger = logging.getLogger(__name__)


@dataclass
class InferenceResult:
    """Result of an inference request."""
    # Text content extracted from the backend's reply.
    content: str
    # Model identifier the request was issued for.
    model: str
    # Label of the backend that produced the result.
    backend: str
    # Token accounting; None when no usage information was attached.
    usage: Optional[Usage] = None
    # Reason the generation ended; defaults to "stop".
    finish_reason: str = "stop"


async def call_ollama(client, backend: LLMBackendConfig, model: str, request: ChatCompletionRequest) -> InferenceResult:
    """Send a non-streaming chat request to Ollama's native ``/api/chat`` endpoint.

    Args:
        client: HTTP client exposing an awaitable ``post(url, json=..., timeout=...)``
            (presumably an ``httpx.AsyncClient`` -- confirm against the caller).
        backend: Backend configuration; ``base_url`` and ``timeout`` are read.
        model: Name of the model to query.
        request: Chat request providing ``messages``, ``temperature``, ``top_p``
            and the optional ``max_tokens``.

    Returns:
        InferenceResult with the reply text, token usage and finish reason.

    Raises:
        Whatever ``response.raise_for_status()`` raises for an error response.
    """
    messages = [{"role": m.role, "content": m.content or ""} for m in request.messages]

    payload = {
        "model": model,
        "messages": messages,
        "stream": False,
        "options": {"temperature": request.temperature, "top_p": request.top_p},
    }
    if request.max_tokens:
        # Ollama's name for the completion token limit.
        payload["options"]["num_predict"] = request.max_tokens

    response = await client.post(f"{backend.base_url}/api/chat", json=payload, timeout=backend.timeout)
    response.raise_for_status()
    data = response.json()

    # Tolerate a missing or null message/content instead of leaking None.
    message = data.get("message") or {}
    content = message.get("content") or ""

    prompt_tokens = data.get("prompt_eval_count", 0)
    completion_tokens = data.get("eval_count", 0)

    # A non-streaming reply always carries done=true, so "done" alone cannot
    # distinguish a natural stop from truncation by num_predict. Prefer the
    # explicit done_reason and fall back to the "done" flag otherwise.
    done_reason = data.get("done_reason")
    if done_reason in ("stop", "length"):
        finish_reason = done_reason
    else:
        finish_reason = "stop" if data.get("done") else "length"

    return InferenceResult(
        content=content,
        model=model,
        backend="ollama",
        usage=Usage(
            prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
            total_tokens=prompt_tokens + completion_tokens,
        ),
        finish_reason=finish_reason,
    )


async def stream_ollama(client, backend, model, request, response_id) -> AsyncIterator[ChatCompletionChunk]:
    """Stream a chat completion from Ollama's native ``/api/chat`` endpoint.

    Args:
        client: HTTP client exposing ``stream(method, url, json=..., timeout=...)``
            as an async context manager (presumably ``httpx.AsyncClient`` --
            confirm against the caller).
        backend: Backend configuration; ``base_url`` and ``timeout`` are read.
        model: Name of the model to query.
        request: Chat request providing ``messages``, ``temperature``, ``top_p``
            and the optional ``max_tokens``.
        response_id: Identifier stamped on every emitted chunk.

    Yields:
        One ChatCompletionChunk per JSON line received. The chunk for the line
        with ``done`` set carries the finish reason; all others carry ``None``.

    Raises:
        Whatever ``response.raise_for_status()`` raises for an error response.
    """
    messages = [{"role": m.role, "content": m.content or ""} for m in request.messages]

    payload = {
        "model": model, "messages": messages, "stream": True,
        "options": {"temperature": request.temperature, "top_p": request.top_p},
    }
    if request.max_tokens:
        # Ollama's name for the completion token limit.
        payload["options"]["num_predict"] = request.max_tokens

    async with client.stream("POST", f"{backend.base_url}/api/chat", json=payload, timeout=backend.timeout) as response:
        response.raise_for_status()
        async for line in response.aiter_lines():
            if not line:
                continue
            # Keep the try body minimal: only JSON parsing may be skipped,
            # errors raised while building or consuming a chunk must surface.
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                logger.debug("Skipping non-JSON line in Ollama stream: %r", line[:200])
                continue

            content = data.get("message", {}).get("content", "")
            if data.get("done", False):
                # Report truncation ("length") when Ollama says so; anything
                # else on the final line is treated as a regular stop.
                done_reason = data.get("done_reason")
                finish_reason = done_reason if done_reason in ("stop", "length") else "stop"
            else:
                finish_reason = None

            yield ChatCompletionChunk(
                id=response_id, model=model,
                choices=[StreamChoice(index=0, delta=ChatChoiceDelta(content=content), finish_reason=finish_reason)],
            )


async def call_openai_compatible(client, backend, model, request) -> InferenceResult:
    """Send a non-streaming request to an OpenAI-compatible ``/v1/chat/completions`` API.

    Args:
        client: HTTP client exposing an awaitable
            ``post(url, json=..., headers=..., timeout=...)`` (presumably an
            ``httpx.AsyncClient`` -- confirm against the caller).
        backend: Backend configuration; ``base_url``, ``api_key``, ``timeout``
            and ``name`` are read.
        model: Name of the model to query.
        request: Chat request providing ``messages``, ``temperature``, ``top_p``
            and the optional ``max_tokens`` / ``stop``.

    Returns:
        InferenceResult built from the first choice of the response. Missing,
        empty or null fields degrade to empty content, zero usage and the
        finish reason ``"stop"`` instead of raising.

    Raises:
        Whatever ``response.raise_for_status()`` raises for an error response.
    """
    headers = {"Content-Type": "application/json"}
    if backend.api_key:
        headers["Authorization"] = f"Bearer {backend.api_key}"

    payload = {
        "model": model,
        "messages": [m.model_dump(exclude_none=True) for m in request.messages],
        "stream": False, "temperature": request.temperature, "top_p": request.top_p,
    }
    if request.max_tokens:
        payload["max_tokens"] = request.max_tokens
    if request.stop:
        payload["stop"] = request.stop

    response = await client.post(f"{backend.base_url}/v1/chat/completions", json=payload, headers=headers, timeout=backend.timeout)
    response.raise_for_status()
    data = response.json()

    # "or" also covers an empty list / explicit null, which the plain
    # .get(key, default) form would let through and then fail on.
    choice = (data.get("choices") or [{}])[0]
    message = choice.get("message") or {}
    usage_data = data.get("usage") or {}

    prompt_tokens = usage_data.get("prompt_tokens") or 0
    completion_tokens = usage_data.get("completion_tokens") or 0
    # Derive the total when the backend does not report it.
    total_tokens = usage_data.get("total_tokens") or (prompt_tokens + completion_tokens)

    return InferenceResult(
        # content may be null in the response; keep the str contract.
        content=message.get("content") or "",
        model=model, backend=backend.name,
        usage=Usage(
            prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
            total_tokens=total_tokens,
        ),
        finish_reason=choice.get("finish_reason") or "stop",
    )


async def stream_openai_compatible(client, backend, model, request, response_id) -> AsyncIterator[ChatCompletionChunk]:
    """Stream a chat completion from an OpenAI-compatible ``/v1/chat/completions`` API.

    Args:
        client: HTTP client exposing
            ``stream(method, url, json=..., headers=..., timeout=...)`` as an
            async context manager (presumably ``httpx.AsyncClient`` -- confirm
            against the caller).
        backend: Backend configuration; ``base_url``, ``api_key`` and
            ``timeout`` are read.
        model: Name of the model to query.
        request: Chat request providing ``messages``, ``temperature``, ``top_p``
            and the optional ``max_tokens`` / ``stop``.
        response_id: Identifier stamped on every emitted chunk.

    Yields:
        One ChatCompletionChunk per ``data:`` event that carries a choice,
        until the ``[DONE]`` sentinel is received.

    Raises:
        Whatever ``response.raise_for_status()`` raises for an error response.
    """
    headers = {"Content-Type": "application/json"}
    if backend.api_key:
        headers["Authorization"] = f"Bearer {backend.api_key}"

    payload = {
        "model": model,
        "messages": [m.model_dump(exclude_none=True) for m in request.messages],
        "stream": True, "temperature": request.temperature, "top_p": request.top_p,
    }
    if request.max_tokens:
        payload["max_tokens"] = request.max_tokens
    if request.stop:
        # Forward stop sequences exactly like the non-streaming call does.
        payload["stop"] = request.stop

    async with client.stream("POST", f"{backend.base_url}/v1/chat/completions", json=payload, headers=headers, timeout=backend.timeout) as response:
        response.raise_for_status()
        async for line in response.aiter_lines():
            # Server-sent events: only "data:" fields are relevant; the space
            # after the colon is optional.
            if not line or not line.startswith("data:"):
                continue
            data_str = line[5:].strip()
            if data_str == "[DONE]":
                break
            # Keep the try body minimal: only JSON parsing may be skipped,
            # errors raised while building or consuming a chunk must surface.
            try:
                data = json.loads(data_str)
            except json.JSONDecodeError:
                logger.debug("Skipping non-JSON event in OpenAI-compatible stream: %r", data_str[:200])
                continue

            choices = data.get("choices", [{}])
            if not choices:
                # Some chunks carry an empty choices list and no delta.
                continue
            choice = choices[0]
            delta = choice.get("delta") or {}
            yield ChatCompletionChunk(
                id=response_id, model=model,
                choices=[StreamChoice(index=0, delta=ChatChoiceDelta(role=delta.get("role"), content=delta.get("content")), finish_reason=choice.get("finish_reason"))],
            )


async def call_anthropic(backend, model, request) -> InferenceResult:
    """Send a non-streaming request to the Anthropic Claude Messages API.

    System messages are pulled out of the conversation and passed through the
    dedicated ``system`` parameter; all other messages are forwarded with
    their role and content.

    Args:
        backend: Backend configuration; only ``api_key`` is read.
        model: Name of the Claude model to query.
        request: Chat request providing ``messages``, ``temperature``, ``top_p``
            and the optional ``max_tokens`` (4096 is used when unset).

    Returns:
        InferenceResult with the concatenated text blocks, token usage and a
        finish reason normalised to the ``"stop"`` / ``"length"`` vocabulary
        used by the other backends (unknown reasons are passed through).

    Raises:
        ImportError: If the ``anthropic`` package is not installed.
    """
    try:
        import anthropic
    except ImportError as err:
        raise ImportError("anthropic package required for Claude API") from err

    client = anthropic.AsyncAnthropic(api_key=backend.api_key)

    system_parts = []
    messages = []
    for msg in request.messages:
        if msg.role == "system":
            system_parts.append(msg.content or "")
        else:
            messages.append({"role": msg.role, "content": msg.content or ""})
    system_content = "\n".join(system_parts).strip()

    params = {
        "model": model,
        "max_tokens": request.max_tokens or 4096,
        "messages": messages,
    }
    # Optional parameters are left out entirely rather than sent as None,
    # so the SDK never serialises an explicit null for them.
    if system_content:
        params["system"] = system_content
    if request.temperature is not None:
        params["temperature"] = request.temperature
    if request.top_p is not None:
        params["top_p"] = request.top_p

    response = await client.messages.create(**params)

    # Collect every text block; the reply may consist of several blocks and
    # the first one is not guaranteed to be text.
    content = "".join(
        block.text for block in (response.content or []) if block.type == "text"
    )

    # Translate Anthropic stop reasons into the names used elsewhere in this
    # module; anything unmapped is passed through unchanged.
    stop_reason_map = {"end_turn": "stop", "stop_sequence": "stop", "max_tokens": "length"}
    stop_reason = response.stop_reason
    finish_reason = stop_reason_map.get(stop_reason, stop_reason) or "stop"

    input_tokens = response.usage.input_tokens
    output_tokens = response.usage.output_tokens

    return InferenceResult(
        content=content, model=model, backend="anthropic",
        usage=Usage(
            prompt_tokens=input_tokens,
            completion_tokens=output_tokens,
            total_tokens=input_tokens + output_tokens,
        ),
        finish_reason=finish_reason,
    )


async def stream_anthropic(backend, model, request, response_id) -> AsyncIterator[ChatCompletionChunk]:
    """Stream a chat completion from the Anthropic Claude Messages API.

    System messages are pulled out of the conversation and passed through the
    dedicated ``system`` parameter; all other messages are forwarded with
    their role and content.

    Args:
        backend: Backend configuration; only ``api_key`` is read.
        model: Name of the Claude model to query.
        request: Chat request providing ``messages``, ``temperature``, ``top_p``
            and the optional ``max_tokens`` (4096 is used when unset).
        response_id: Identifier stamped on every emitted chunk.

    Yields:
        One ChatCompletionChunk per text fragment from the stream, followed by
        a final chunk with an empty delta and finish reason ``"stop"``.

    Raises:
        ImportError: If the ``anthropic`` package is not installed.
    """
    try:
        import anthropic
    except ImportError as err:
        raise ImportError("anthropic package required for Claude API") from err

    client = anthropic.AsyncAnthropic(api_key=backend.api_key)

    system_parts = []
    messages = []
    for msg in request.messages:
        if msg.role == "system":
            system_parts.append(msg.content or "")
        else:
            messages.append({"role": msg.role, "content": msg.content or ""})
    system_content = "\n".join(system_parts).strip()

    params = {
        "model": model,
        "max_tokens": request.max_tokens or 4096,
        "messages": messages,
    }
    # Optional parameters are left out entirely rather than sent as None,
    # so the SDK never serialises an explicit null for them.
    if system_content:
        params["system"] = system_content
    if request.temperature is not None:
        params["temperature"] = request.temperature
    if request.top_p is not None:
        params["top_p"] = request.top_p

    async with client.messages.stream(**params) as stream:
        async for text in stream.text_stream:
            yield ChatCompletionChunk(
                id=response_id, model=model,
                choices=[StreamChoice(index=0, delta=ChatChoiceDelta(content=text), finish_reason=None)],
            )

        # Terminal chunk: empty delta that only signals the end of the stream.
        yield ChatCompletionChunk(
            id=response_id, model=model,
            choices=[StreamChoice(index=0, delta=ChatChoiceDelta(), finish_reason="stop")],
        )