klausur-service (11 files): - cv_gutter_repair, ocr_pipeline_regression, upload_api - ocr_pipeline_sessions, smart_spell, nru_worksheet_generator - ocr_pipeline_overlays, mail/aggregator, zeugnis_api - cv_syllable_detect, self_rag backend-lehrer (17 files): - classroom_engine/suggestions, generators/quiz_generator - worksheets_api, llm_gateway/comparison, state_engine_api - classroom/models (→ 4 submodules), services/file_processor - alerts_agent/api/wizard+digests+routes, content_generators/pdf - classroom/routes/sessions, llm_gateway/inference - classroom_engine/analytics, auth/keycloak_auth - alerts_agent/processing/rule_engine, ai_processor/print_versions agent-core (5 files): - brain/memory_store, brain/knowledge_graph, brain/context_manager - orchestrator/supervisor, sessions/session_manager admin-lehrer (5 components): - GridOverlay, StepGridReview, DevOpsPipelineSidebar - DataFlowDiagram, sbom/wizard/page website (2 files): - DependencyMap, lehrer/abitur-archiv Other: nibis_ingestion, grid_detection_service, export-doclayout-onnx Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
163 lines
6.1 KiB
Python
163 lines
6.1 KiB
Python
"""
|
|
Inference Service - Kommunikation mit LLM Backends.
|
|
|
|
Unterstützt:
|
|
- Ollama (lokal)
|
|
- vLLM (remote, OpenAI-kompatibel)
|
|
- Anthropic Claude API (Fallback)
|
|
"""
|
|
|
|
import httpx
|
|
import logging
|
|
from typing import AsyncIterator, Optional
|
|
|
|
from ..config import get_config, LLMBackendConfig
|
|
from ..models.chat import (
|
|
ChatCompletionRequest,
|
|
ChatCompletionResponse,
|
|
ChatCompletionChunk,
|
|
ChatMessage,
|
|
ChatChoice,
|
|
Usage,
|
|
ModelInfo,
|
|
ModelListResponse,
|
|
)
|
|
from .inference_backends import (
|
|
InferenceResult,
|
|
call_ollama,
|
|
stream_ollama,
|
|
call_openai_compatible,
|
|
stream_openai_compatible,
|
|
call_anthropic,
|
|
stream_anthropic,
|
|
)
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class InferenceService:
    """Service for LLM inference across multiple backends.

    Resolves a public model name to a concrete backend (Ollama,
    OpenAI-compatible/vLLM, or Anthropic) according to the configured
    backend priority, then dispatches completion/streaming calls to the
    matching backend helper from ``inference_backends``.
    """

    def __init__(self):
        self.config = get_config()
        # HTTP client is created lazily on first use (see get_client).
        self._client: Optional[httpx.AsyncClient] = None

    async def get_client(self) -> httpx.AsyncClient:
        """Lazily initialize and return the shared async HTTP client."""
        if self._client is None:
            # Generous 120s timeout: large-model completions can be slow.
            self._client = httpx.AsyncClient(timeout=120.0)
        return self._client

    async def close(self):
        """Close the HTTP client and reset it so it can be re-created."""
        if self._client:
            await self._client.aclose()
            self._client = None

    def _get_available_backend(self, preferred_model: Optional[str] = None) -> Optional[LLMBackendConfig]:
        """Return the first enabled backend in configured priority order.

        Args:
            preferred_model: Currently unused; kept for backward
                compatibility with existing callers.

        Returns:
            The first enabled ``LLMBackendConfig``, or ``None`` if no
            backend is enabled.
        """
        for backend_name in self.config.backend_priority:
            backend = getattr(self.config, backend_name, None)
            if backend and backend.enabled:
                return backend
        return None

    def _require_backend(self) -> LLMBackendConfig:
        """Return an available backend or raise ``ValueError``."""
        backend = self._get_available_backend()
        if backend is None:
            raise ValueError("No LLM backend available")
        return backend

    def _map_model_to_backend(self, model: str) -> tuple[str, LLMBackendConfig]:
        """Map a public model name to ``(actual_model_id, backend_config)``.

        Args:
            model: Public model name as requested by the client.

        Raises:
            ValueError: If no suitable backend is configured/enabled.
        """
        model_lower = model.lower()

        if "claude" in model_lower:
            if self.config.anthropic and self.config.anthropic.enabled:
                return self.config.anthropic.default_model, self.config.anthropic
            raise ValueError("Anthropic backend not configured")

        if "breakpilot" in model_lower or "teacher" in model_lower:
            backend = self._require_backend()
            # Ollama uses its own model tags; OpenAI-compatible backends
            # (e.g. vLLM) use HuggingFace model ids.
            if "70b" in model_lower:
                actual_model = "llama3.1:70b" if backend.name == "ollama" else "meta-llama/Meta-Llama-3.1-70B-Instruct"
            else:
                actual_model = "llama3.1:8b" if backend.name == "ollama" else "meta-llama/Meta-Llama-3.1-8B-Instruct"
            return actual_model, backend

        if "mistral" in model_lower:
            backend = self._require_backend()
            actual_model = "mistral:7b" if backend.name == "ollama" else "mistralai/Mistral-7B-Instruct-v0.2"
            return actual_model, backend

        # Unknown model names are passed through verbatim to the first
        # available backend.
        return model, self._require_backend()

    async def complete(self, request: ChatCompletionRequest) -> ChatCompletionResponse:
        """Run a non-streaming chat completion.

        Args:
            request: OpenAI-style chat completion request.

        Returns:
            A ``ChatCompletionResponse`` echoing the requested (public)
            model name, with the backend's answer as a single choice.

        Raises:
            ValueError: If no backend can serve the requested model.
        """
        actual_model, backend = self._map_model_to_backend(request.model)
        logger.info("Inference request: model=%s -> %s via %s", request.model, actual_model, backend.name)

        client = await self.get_client()

        if backend.name == "ollama":
            result = await call_ollama(client, backend, actual_model, request)
        elif backend.name == "anthropic":
            # The Anthropic helper manages its own HTTP connection.
            result = await call_anthropic(backend, actual_model, request)
        else:
            result = await call_openai_compatible(client, backend, actual_model, request)

        return ChatCompletionResponse(
            model=request.model,
            choices=[
                ChatChoice(
                    index=0,
                    message=ChatMessage(role="assistant", content=result.content),
                    finish_reason=result.finish_reason,
                )
            ],
            usage=result.usage,
        )

    async def stream(self, request: ChatCompletionRequest) -> AsyncIterator[ChatCompletionChunk]:
        """Run a streaming chat completion, yielding OpenAI-style chunks.

        Args:
            request: OpenAI-style chat completion request.

        Yields:
            ``ChatCompletionChunk`` objects produced by the backend.

        Raises:
            ValueError: If no backend can serve the requested model
                (surfaces on first iteration, as this is a generator).
        """
        import uuid

        # Mirrors OpenAI's "chatcmpl-..." response id format.
        response_id = f"chatcmpl-{uuid.uuid4().hex[:12]}"

        actual_model, backend = self._map_model_to_backend(request.model)
        logger.info("Streaming request: model=%s -> %s via %s", request.model, actual_model, backend.name)

        client = await self.get_client()

        if backend.name == "ollama":
            async for chunk in stream_ollama(client, backend, actual_model, request, response_id):
                yield chunk
        elif backend.name == "anthropic":
            async for chunk in stream_anthropic(backend, actual_model, request, response_id):
                yield chunk
        else:
            async for chunk in stream_openai_compatible(client, backend, actual_model, request, response_id):
                yield chunk

    async def list_models(self) -> ModelListResponse:
        """List the models this gateway currently advertises.

        The "breakpilot-teacher-*" aliases are only advertised when at
        least one general backend is enabled; the Claude entry only when
        the Anthropic fallback is configured.
        """
        models: list[ModelInfo] = []

        if self._get_available_backend() is not None:
            models.extend([
                ModelInfo(id="breakpilot-teacher-8b", owned_by="breakpilot", description="Llama 3.1 8B optimiert für Schulkontext", context_length=8192),
                ModelInfo(id="breakpilot-teacher-70b", owned_by="breakpilot", description="Llama 3.1 70B für komplexe Aufgaben", context_length=8192),
            ])

        if self.config.anthropic and self.config.anthropic.enabled:
            models.append(ModelInfo(id="claude-3-5-sonnet", owned_by="anthropic", description="Claude 3.5 Sonnet - Fallback für höchste Qualität", context_length=200000))

        return ModelListResponse(data=models)
|
|
|
|
|
|
# Module-level singleton holder
_inference_service: Optional[InferenceService] = None


def get_inference_service() -> InferenceService:
    """Return the process-wide InferenceService singleton, creating it lazily."""
    global _inference_service
    if _inference_service is not None:
        return _inference_service
    _inference_service = InferenceService()
    return _inference_service
|