A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.
This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).
Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
"""
|
|
Inference Service - Kommunikation mit LLM Backends.
|
|
|
|
Unterstützt:
|
|
- Ollama (lokal)
|
|
- vLLM (remote, OpenAI-kompatibel)
|
|
- Anthropic Claude API (Fallback)
|
|
"""

import httpx
import json
import logging
from typing import AsyncIterator, Optional
from dataclasses import dataclass

from ..config import get_config, LLMBackendConfig
from ..models.chat import (
    ChatCompletionRequest,
    ChatCompletionResponse,
    ChatCompletionChunk,
    ChatMessage,
    ChatChoice,
    StreamChoice,
    ChatChoiceDelta,
    Usage,
    ModelInfo,
    ModelListResponse,
)

logger = logging.getLogger(__name__)


@dataclass
class InferenceResult:
    """Result of an inference request."""
    content: str
    model: str
    backend: str
    usage: Optional[Usage] = None
    finish_reason: str = "stop"


class InferenceService:
    """Service for LLM inference across multiple backends."""

    def __init__(self):
        self.config = get_config()
        self._client: Optional[httpx.AsyncClient] = None

    async def get_client(self) -> httpx.AsyncClient:
        """Lazy initialization of the HTTP client."""
        if self._client is None:
            self._client = httpx.AsyncClient(timeout=120.0)
        return self._client

    async def close(self):
        """Closes the HTTP client."""
        if self._client:
            await self._client.aclose()
            self._client = None

    def _get_available_backend(self, preferred_model: Optional[str] = None) -> Optional[LLMBackendConfig]:
        """Finds the first available backend based on the configured priority."""
        for backend_name in self.config.backend_priority:
            backend = getattr(self.config, backend_name, None)
            if backend and backend.enabled:
                return backend
        return None

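    # Note on _get_available_backend above: backend_priority is assumed to be an
    # ordered list of config attribute names (e.g. ["ollama", "vllm", "anthropic"]);
    # the loop returns the first entry that exists and is enabled. The exact names
    # depend on LLMBackendConfig and are not shown in this module.
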
    def _map_model_to_backend(self, model: str) -> tuple[str, LLMBackendConfig]:
        """
        Maps a model name to the corresponding backend.

        Examples:
        - "breakpilot-teacher-8b" → Ollama/vLLM with llama3.1:8b
        - "claude-3-5-sonnet" → Anthropic
        """
        model_lower = model.lower()

        # Explicit Claude models → Anthropic
        if "claude" in model_lower:
            if self.config.anthropic and self.config.anthropic.enabled:
                return self.config.anthropic.default_model, self.config.anthropic
            raise ValueError("Anthropic backend not configured")

        # BreakPilot models → primary backend
        if "breakpilot" in model_lower or "teacher" in model_lower:
            backend = self._get_available_backend()
            if backend:
                # Map to the actual model name
                if "70b" in model_lower:
                    actual_model = "llama3.1:70b" if backend.name == "ollama" else "meta-llama/Meta-Llama-3.1-70B-Instruct"
                else:
                    actual_model = "llama3.1:8b" if backend.name == "ollama" else "meta-llama/Meta-Llama-3.1-8B-Instruct"
                return actual_model, backend
            raise ValueError("No LLM backend available")

        # Mistral models
        if "mistral" in model_lower:
            backend = self._get_available_backend()
            if backend:
                actual_model = "mistral:7b" if backend.name == "ollama" else "mistralai/Mistral-7B-Instruct-v0.2"
                return actual_model, backend
            raise ValueError("No LLM backend available")

        # Fallback: use the model name as-is
        backend = self._get_available_backend()
        if backend:
            return model, backend
        raise ValueError("No LLM backend available")

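    # Illustrative results of _map_model_to_backend above, assuming an enabled
    # Ollama backend (the tuples are (actual_model, backend)):
    #   "breakpilot-teacher-8b"  -> ("llama3.1:8b", ollama)
    #   "breakpilot-teacher-70b" -> ("llama3.1:70b", ollama)
    #   "mistral-7b"             -> ("mistral:7b", ollama)
    #   "claude-3-5-sonnet"      -> (anthropic.default_model, anthropic)
    # Any other name is passed through unchanged to the first available backend.
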
    async def _call_ollama(
        self,
        backend: LLMBackendConfig,
        model: str,
        request: ChatCompletionRequest,
    ) -> InferenceResult:
        """Calls the Ollama API (not OpenAI-compatible)."""
        client = await self.get_client()

        # Ollama uses its own message format
        messages = [{"role": m.role, "content": m.content or ""} for m in request.messages]

        payload = {
            "model": model,
            "messages": messages,
            "stream": False,
            "options": {
                "temperature": request.temperature,
                "top_p": request.top_p,
            },
        }

        if request.max_tokens:
            payload["options"]["num_predict"] = request.max_tokens

        response = await client.post(
            f"{backend.base_url}/api/chat",
            json=payload,
            timeout=backend.timeout,
        )
        response.raise_for_status()
        data = response.json()

        return InferenceResult(
            content=data.get("message", {}).get("content", ""),
            model=model,
            backend="ollama",
            usage=Usage(
                prompt_tokens=data.get("prompt_eval_count", 0),
                completion_tokens=data.get("eval_count", 0),
                total_tokens=data.get("prompt_eval_count", 0) + data.get("eval_count", 0),
            ),
            finish_reason="stop" if data.get("done") else "length",
        )

    async def _stream_ollama(
        self,
        backend: LLMBackendConfig,
        model: str,
        request: ChatCompletionRequest,
        response_id: str,
    ) -> AsyncIterator[ChatCompletionChunk]:
        """Streams from Ollama."""
        client = await self.get_client()

        messages = [{"role": m.role, "content": m.content or ""} for m in request.messages]

        payload = {
            "model": model,
            "messages": messages,
            "stream": True,
            "options": {
                "temperature": request.temperature,
                "top_p": request.top_p,
            },
        }

        if request.max_tokens:
            payload["options"]["num_predict"] = request.max_tokens

        async with client.stream(
            "POST",
            f"{backend.base_url}/api/chat",
            json=payload,
            timeout=backend.timeout,
        ) as response:
            response.raise_for_status()
            async for line in response.aiter_lines():
                if not line:
                    continue
                try:
                    data = json.loads(line)
                    content = data.get("message", {}).get("content", "")
                    done = data.get("done", False)

                    yield ChatCompletionChunk(
                        id=response_id,
                        model=model,
                        choices=[
                            StreamChoice(
                                index=0,
                                delta=ChatChoiceDelta(content=content),
                                finish_reason="stop" if done else None,
                            )
                        ],
                    )
                except json.JSONDecodeError:
                    continue

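    # Note on _stream_ollama above: Ollama emits newline-delimited JSON objects,
    # roughly of the form
    #   {"message": {"role": "assistant", "content": "Hal"}, "done": false}
    #   {"message": {"role": "assistant", "content": "lo"}, "done": true, "eval_count": 42}
    # which is why each line is parsed independently and "done" marks the last chunk.
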
    async def _call_openai_compatible(
        self,
        backend: LLMBackendConfig,
        model: str,
        request: ChatCompletionRequest,
    ) -> InferenceResult:
        """Calls an OpenAI-compatible API (vLLM, etc.)."""
        client = await self.get_client()

        headers = {"Content-Type": "application/json"}
        if backend.api_key:
            headers["Authorization"] = f"Bearer {backend.api_key}"

        payload = {
            "model": model,
            "messages": [m.model_dump(exclude_none=True) for m in request.messages],
            "stream": False,
            "temperature": request.temperature,
            "top_p": request.top_p,
        }

        if request.max_tokens:
            payload["max_tokens"] = request.max_tokens
        if request.stop:
            payload["stop"] = request.stop

        response = await client.post(
            f"{backend.base_url}/v1/chat/completions",
            json=payload,
            headers=headers,
            timeout=backend.timeout,
        )
        response.raise_for_status()
        data = response.json()

        choice = data.get("choices", [{}])[0]
        usage_data = data.get("usage", {})

        return InferenceResult(
            content=choice.get("message", {}).get("content", ""),
            model=model,
            backend=backend.name,
            usage=Usage(
                prompt_tokens=usage_data.get("prompt_tokens", 0),
                completion_tokens=usage_data.get("completion_tokens", 0),
                total_tokens=usage_data.get("total_tokens", 0),
            ),
            finish_reason=choice.get("finish_reason", "stop"),
        )

    async def _stream_openai_compatible(
        self,
        backend: LLMBackendConfig,
        model: str,
        request: ChatCompletionRequest,
        response_id: str,
    ) -> AsyncIterator[ChatCompletionChunk]:
        """Streams from an OpenAI-compatible API."""
        client = await self.get_client()

        headers = {"Content-Type": "application/json"}
        if backend.api_key:
            headers["Authorization"] = f"Bearer {backend.api_key}"

        payload = {
            "model": model,
            "messages": [m.model_dump(exclude_none=True) for m in request.messages],
            "stream": True,
            "temperature": request.temperature,
            "top_p": request.top_p,
        }

        if request.max_tokens:
            payload["max_tokens"] = request.max_tokens

        async with client.stream(
            "POST",
            f"{backend.base_url}/v1/chat/completions",
            json=payload,
            headers=headers,
            timeout=backend.timeout,
        ) as response:
            response.raise_for_status()
            async for line in response.aiter_lines():
                if not line or not line.startswith("data: "):
                    continue
                data_str = line[6:]  # Remove "data: " prefix
                if data_str == "[DONE]":
                    break
                try:
                    data = json.loads(data_str)
                    choice = data.get("choices", [{}])[0]
                    delta = choice.get("delta", {})

                    yield ChatCompletionChunk(
                        id=response_id,
                        model=model,
                        choices=[
                            StreamChoice(
                                index=0,
                                delta=ChatChoiceDelta(
                                    role=delta.get("role"),
                                    content=delta.get("content"),
                                ),
                                finish_reason=choice.get("finish_reason"),
                            )
                        ],
                    )
                except json.JSONDecodeError:
                    continue

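    # Note on _stream_openai_compatible above: OpenAI-compatible servers stream
    # Server-Sent Events, roughly of the form
    #   data: {"choices": [{"index": 0, "delta": {"content": "Hal"}}]}
    #   data: [DONE]
    # hence the "data: " prefix handling and the [DONE] sentinel.
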
    async def _call_anthropic(
        self,
        backend: LLMBackendConfig,
        model: str,
        request: ChatCompletionRequest,
    ) -> InferenceResult:
        """Calls the Anthropic Claude API."""
        # Use the Anthropic SDK (already installed)
        try:
            import anthropic
        except ImportError:
            raise ImportError("anthropic package required for Claude API")

        client = anthropic.AsyncAnthropic(api_key=backend.api_key)

        # Extract system messages
        system_content = ""
        messages = []
        for msg in request.messages:
            if msg.role == "system":
                system_content += (msg.content or "") + "\n"
            else:
                messages.append({"role": msg.role, "content": msg.content or ""})

        response = await client.messages.create(
            model=model,
            max_tokens=request.max_tokens or 4096,
            system=system_content.strip() if system_content else None,
            messages=messages,
            temperature=request.temperature,
            top_p=request.top_p,
        )

        content = ""
        if response.content:
            content = response.content[0].text if response.content[0].type == "text" else ""

        return InferenceResult(
            content=content,
            model=model,
            backend="anthropic",
            usage=Usage(
                prompt_tokens=response.usage.input_tokens,
                completion_tokens=response.usage.output_tokens,
                total_tokens=response.usage.input_tokens + response.usage.output_tokens,
            ),
            finish_reason="stop" if response.stop_reason == "end_turn" else response.stop_reason or "stop",
        )

    async def _stream_anthropic(
        self,
        backend: LLMBackendConfig,
        model: str,
        request: ChatCompletionRequest,
        response_id: str,
    ) -> AsyncIterator[ChatCompletionChunk]:
        """Streams from the Anthropic Claude API."""
        try:
            import anthropic
        except ImportError:
            raise ImportError("anthropic package required for Claude API")

        client = anthropic.AsyncAnthropic(api_key=backend.api_key)

        # Extract system messages
        system_content = ""
        messages = []
        for msg in request.messages:
            if msg.role == "system":
                system_content += (msg.content or "") + "\n"
            else:
                messages.append({"role": msg.role, "content": msg.content or ""})

        async with client.messages.stream(
            model=model,
            max_tokens=request.max_tokens or 4096,
            system=system_content.strip() if system_content else None,
            messages=messages,
            temperature=request.temperature,
            top_p=request.top_p,
        ) as stream:
            async for text in stream.text_stream:
                yield ChatCompletionChunk(
                    id=response_id,
                    model=model,
                    choices=[
                        StreamChoice(
                            index=0,
                            delta=ChatChoiceDelta(content=text),
                            finish_reason=None,
                        )
                    ],
                )

        # Final chunk with finish_reason
        yield ChatCompletionChunk(
            id=response_id,
            model=model,
            choices=[
                StreamChoice(
                    index=0,
                    delta=ChatChoiceDelta(),
                    finish_reason="stop",
                )
            ],
        )

    async def complete(self, request: ChatCompletionRequest) -> ChatCompletionResponse:
        """
        Performs a chat completion (non-streaming).
        """
        actual_model, backend = self._map_model_to_backend(request.model)

        logger.info(f"Inference request: model={request.model} → {actual_model} via {backend.name}")

        if backend.name == "ollama":
            result = await self._call_ollama(backend, actual_model, request)
        elif backend.name == "anthropic":
            result = await self._call_anthropic(backend, actual_model, request)
        else:
            result = await self._call_openai_compatible(backend, actual_model, request)

        return ChatCompletionResponse(
            model=request.model,  # Original requested model name
            choices=[
                ChatChoice(
                    index=0,
                    message=ChatMessage(role="assistant", content=result.content),
                    finish_reason=result.finish_reason,
                )
            ],
            usage=result.usage,
        )

    async def stream(self, request: ChatCompletionRequest) -> AsyncIterator[ChatCompletionChunk]:
        """
        Performs a chat completion with streaming.
        """
        import uuid
        response_id = f"chatcmpl-{uuid.uuid4().hex[:12]}"

        actual_model, backend = self._map_model_to_backend(request.model)

        logger.info(f"Streaming request: model={request.model} → {actual_model} via {backend.name}")

        if backend.name == "ollama":
            async for chunk in self._stream_ollama(backend, actual_model, request, response_id):
                yield chunk
        elif backend.name == "anthropic":
            async for chunk in self._stream_anthropic(backend, actual_model, request, response_id):
                yield chunk
        else:
            async for chunk in self._stream_openai_compatible(backend, actual_model, request, response_id):
                yield chunk

    async def list_models(self) -> ModelListResponse:
        """Lists the available models."""
        models = []

        # BreakPilot models (mapped to available backends)
        backend = self._get_available_backend()
        if backend:
            models.extend([
                ModelInfo(
                    id="breakpilot-teacher-8b",
                    owned_by="breakpilot",
                    description="Llama 3.1 8B optimiert für Schulkontext",
                    context_length=8192,
                ),
                ModelInfo(
                    id="breakpilot-teacher-70b",
                    owned_by="breakpilot",
                    description="Llama 3.1 70B für komplexe Aufgaben",
                    context_length=8192,
                ),
            ])

        # Claude models (if Anthropic is configured)
        if self.config.anthropic and self.config.anthropic.enabled:
            models.append(
                ModelInfo(
                    id="claude-3-5-sonnet",
                    owned_by="anthropic",
                    description="Claude 3.5 Sonnet - Fallback für höchste Qualität",
                    context_length=200000,
                )
            )

        return ModelListResponse(data=models)


# Singleton
_inference_service: Optional[InferenceService] = None


def get_inference_service() -> InferenceService:
    """Returns the InferenceService singleton."""
    global _inference_service
    if _inference_service is None:
        _inference_service = InferenceService()
    return _inference_service
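

# --- Illustrative usage sketch (not part of the original module) -------------
# A minimal demonstration of how a caller might drive the service. It assumes
# ChatCompletionRequest/ChatMessage can be constructed with only the fields
# shown (i.e. sensible defaults for temperature, top_p, etc.) and that at least
# one backend is enabled in the config; adjust to the actual model definitions.
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        service = get_inference_service()
        request = ChatCompletionRequest(
            model="breakpilot-teacher-8b",
            messages=[ChatMessage(role="user", content="Erkläre Photosynthese.")],
        )

        # Non-streaming completion
        response = await service.complete(request)
        print(response.choices[0].message.content)

        # Streaming completion
        async for chunk in service.stream(request):
            print(chunk.choices[0].delta.content or "", end="")

        await service.close()

    asyncio.run(_demo())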