[split-required] Split final 43 files (500-668 LOC) to complete refactoring
klausur-service (11 files): - cv_gutter_repair, ocr_pipeline_regression, upload_api - ocr_pipeline_sessions, smart_spell, nru_worksheet_generator - ocr_pipeline_overlays, mail/aggregator, zeugnis_api - cv_syllable_detect, self_rag backend-lehrer (17 files): - classroom_engine/suggestions, generators/quiz_generator - worksheets_api, llm_gateway/comparison, state_engine_api - classroom/models (→ 4 submodules), services/file_processor - alerts_agent/api/wizard+digests+routes, content_generators/pdf - classroom/routes/sessions, llm_gateway/inference - classroom_engine/analytics, auth/keycloak_auth - alerts_agent/processing/rule_engine, ai_processor/print_versions agent-core (5 files): - brain/memory_store, brain/knowledge_graph, brain/context_manager - orchestrator/supervisor, sessions/session_manager admin-lehrer (5 components): - GridOverlay, StepGridReview, DevOpsPipelineSidebar - DataFlowDiagram, sbom/wizard/page website (2 files): - DependencyMap, lehrer/abitur-archiv Other: nibis_ingestion, grid_detection_service, export-doclayout-onnx Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
230
backend-lehrer/llm_gateway/services/inference_backends.py
Normal file
230
backend-lehrer/llm_gateway/services/inference_backends.py
Normal file
@@ -0,0 +1,230 @@
|
||||
"""
|
||||
Inference Backends - Kommunikation mit einzelnen LLM-Providern.
|
||||
|
||||
Unterstützt Ollama, OpenAI-kompatible APIs und Anthropic Claude.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
from typing import AsyncIterator, Optional
|
||||
from dataclasses import dataclass
|
||||
|
||||
from ..config import LLMBackendConfig
|
||||
from ..models.chat import (
|
||||
ChatCompletionRequest,
|
||||
ChatCompletionChunk,
|
||||
ChatMessage,
|
||||
StreamChoice,
|
||||
ChatChoiceDelta,
|
||||
Usage,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class InferenceResult:
    """Outcome of a single non-streaming inference call.

    Normalizes the responses of all supported backends (Ollama,
    OpenAI-compatible, Anthropic) into one common shape.
    """

    content: str              # generated assistant text
    model: str                # model identifier that produced the answer
    backend: str              # name of the backend that served the request
    usage: Optional[Usage] = None   # token accounting, if the backend reported it
    finish_reason: str = "stop"     # OpenAI-style finish reason ("stop", "length", ...)
async def call_ollama(client, backend: LLMBackendConfig, model: str, request: ChatCompletionRequest) -> InferenceResult:
    """Perform one non-streaming chat call against Ollama's native API.

    Ollama's ``/api/chat`` endpoint is not OpenAI-compatible, so the request
    is translated into Ollama's payload shape and the response is mapped back
    into an :class:`InferenceResult`.
    """
    options = {"temperature": request.temperature, "top_p": request.top_p}
    if request.max_tokens:
        # Ollama calls the completion-token cap "num_predict".
        options["num_predict"] = request.max_tokens

    body = {
        "model": model,
        "messages": [{"role": m.role, "content": m.content or ""} for m in request.messages],
        "stream": False,
        "options": options,
    }

    resp = await client.post(f"{backend.base_url}/api/chat", json=body, timeout=backend.timeout)
    resp.raise_for_status()
    payload = resp.json()

    prompt_tokens = payload.get("prompt_eval_count", 0)
    completion_tokens = payload.get("eval_count", 0)

    return InferenceResult(
        content=payload.get("message", {}).get("content", ""),
        model=model,
        backend="ollama",
        usage=Usage(
            prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
            total_tokens=prompt_tokens + completion_tokens,
        ),
        # done=False means Ollama stopped early (e.g. hit the token limit).
        finish_reason="stop" if payload.get("done") else "length",
    )
async def stream_ollama(client, backend, model, request, response_id) -> AsyncIterator[ChatCompletionChunk]:
    """Stream chat chunks from Ollama's native ``/api/chat`` endpoint.

    Each NDJSON line emitted by Ollama is converted into an OpenAI-style
    :class:`ChatCompletionChunk`; malformed lines are skipped silently.
    """
    options = {"temperature": request.temperature, "top_p": request.top_p}
    if request.max_tokens:
        options["num_predict"] = request.max_tokens

    body = {
        "model": model,
        "messages": [{"role": m.role, "content": m.content or ""} for m in request.messages],
        "stream": True,
        "options": options,
    }

    async with client.stream("POST", f"{backend.base_url}/api/chat", json=body, timeout=backend.timeout) as resp:
        resp.raise_for_status()
        async for raw_line in resp.aiter_lines():
            if not raw_line:
                continue
            try:
                event = json.loads(raw_line)
            except json.JSONDecodeError:
                # Ignore partial or garbled NDJSON lines.
                continue
            finished = event.get("done", False)
            yield ChatCompletionChunk(
                id=response_id,
                model=model,
                choices=[
                    StreamChoice(
                        index=0,
                        delta=ChatChoiceDelta(content=event.get("message", {}).get("content", "")),
                        finish_reason="stop" if finished else None,
                    )
                ],
            )
async def call_openai_compatible(client, backend, model, request) -> InferenceResult:
    """Perform one non-streaming call against an OpenAI-compatible API (vLLM, etc.).

    Sends the request to ``{base_url}/v1/chat/completions`` with an optional
    Bearer token and maps the first choice into an :class:`InferenceResult`.
    """
    headers = {"Content-Type": "application/json"}
    if backend.api_key:
        headers["Authorization"] = f"Bearer {backend.api_key}"

    body = {
        "model": model,
        "messages": [m.model_dump(exclude_none=True) for m in request.messages],
        "stream": False,
        "temperature": request.temperature,
        "top_p": request.top_p,
    }
    if request.max_tokens:
        body["max_tokens"] = request.max_tokens
    if request.stop:
        body["stop"] = request.stop

    resp = await client.post(
        f"{backend.base_url}/v1/chat/completions",
        json=body,
        headers=headers,
        timeout=backend.timeout,
    )
    resp.raise_for_status()
    data = resp.json()

    first_choice = data.get("choices", [{}])[0]
    token_usage = data.get("usage", {})

    return InferenceResult(
        content=first_choice.get("message", {}).get("content", ""),
        model=model,
        backend=backend.name,
        usage=Usage(
            prompt_tokens=token_usage.get("prompt_tokens", 0),
            completion_tokens=token_usage.get("completion_tokens", 0),
            total_tokens=token_usage.get("total_tokens", 0),
        ),
        finish_reason=first_choice.get("finish_reason", "stop"),
    )
async def stream_openai_compatible(client, backend, model, request, response_id) -> AsyncIterator[ChatCompletionChunk]:
    """Stream SSE chunks from an OpenAI-compatible ``/v1/chat/completions`` API.

    Parses the server-sent-event frames (``data: {...}``) emitted by vLLM and
    similar servers and re-emits them as :class:`ChatCompletionChunk` objects.
    Malformed JSON frames are skipped; ``[DONE]`` ends the stream.
    """
    headers = {"Content-Type": "application/json"}
    if backend.api_key:
        headers["Authorization"] = f"Bearer {backend.api_key}"

    payload = {
        "model": model,
        "messages": [m.model_dump(exclude_none=True) for m in request.messages],
        "stream": True,
        "temperature": request.temperature,
        "top_p": request.top_p,
    }
    if request.max_tokens:
        payload["max_tokens"] = request.max_tokens
    # Fix: forward stop sequences, matching call_openai_compatible.
    # Previously they were silently dropped in streaming mode.
    if request.stop:
        payload["stop"] = request.stop

    async with client.stream("POST", f"{backend.base_url}/v1/chat/completions", json=payload, headers=headers, timeout=backend.timeout) as response:
        response.raise_for_status()
        async for line in response.aiter_lines():
            # SSE frames look like "data: {...}"; anything else is noise.
            if not line or not line.startswith("data: "):
                continue
            data_str = line[6:]
            if data_str == "[DONE]":
                break
            try:
                data = json.loads(data_str)
            except json.JSONDecodeError:
                continue
            choice = data.get("choices", [{}])[0]
            delta = choice.get("delta", {})
            yield ChatCompletionChunk(
                id=response_id,
                model=model,
                choices=[
                    StreamChoice(
                        index=0,
                        delta=ChatChoiceDelta(role=delta.get("role"), content=delta.get("content")),
                        finish_reason=choice.get("finish_reason"),
                    )
                ],
            )
async def call_anthropic(backend, model, request) -> InferenceResult:
    """Perform one non-streaming call against the Anthropic Claude Messages API.

    System messages are concatenated and sent via the dedicated ``system``
    parameter, since Anthropic does not accept a ``system`` role inside
    ``messages``.

    Raises:
        ImportError: if the ``anthropic`` package is not installed.
    """
    try:
        import anthropic
    except ImportError as exc:
        raise ImportError("anthropic package required for Claude API") from exc

    client = anthropic.AsyncAnthropic(api_key=backend.api_key)

    system_content = ""
    messages = []
    for msg in request.messages:
        if msg.role == "system":
            system_content += (msg.content or "") + "\n"
        else:
            messages.append({"role": msg.role, "content": msg.content or ""})

    kwargs = {
        "model": model,
        "max_tokens": request.max_tokens or 4096,
        "messages": messages,
        "temperature": request.temperature,
        "top_p": request.top_p,
    }
    # Fix: only pass `system` when present — the Messages API rejects an
    # explicit null for this field; omitting it entirely is the valid form.
    if system_content:
        kwargs["system"] = system_content.strip()

    response = await client.messages.create(**kwargs)

    content = ""
    if response.content:
        content = response.content[0].text if response.content[0].type == "text" else ""

    return InferenceResult(
        content=content,
        model=model,
        backend="anthropic",
        usage=Usage(
            prompt_tokens=response.usage.input_tokens,
            completion_tokens=response.usage.output_tokens,
            total_tokens=response.usage.input_tokens + response.usage.output_tokens,
        ),
        # Map Anthropic's "end_turn" onto the OpenAI-style "stop".
        finish_reason="stop" if response.stop_reason == "end_turn" else response.stop_reason or "stop",
    )
async def stream_anthropic(backend, model, request, response_id) -> AsyncIterator[ChatCompletionChunk]:
    """Stream from the Anthropic Claude Messages API as OpenAI-style chunks.

    Yields one chunk per text delta and a final empty-delta chunk with
    ``finish_reason="stop"`` so consumers can detect end-of-stream.

    Raises:
        ImportError: if the ``anthropic`` package is not installed.
    """
    try:
        import anthropic
    except ImportError as exc:
        raise ImportError("anthropic package required for Claude API") from exc

    client = anthropic.AsyncAnthropic(api_key=backend.api_key)

    system_content = ""
    messages = []
    for msg in request.messages:
        if msg.role == "system":
            system_content += (msg.content or "") + "\n"
        else:
            messages.append({"role": msg.role, "content": msg.content or ""})

    kwargs = {
        "model": model,
        "max_tokens": request.max_tokens or 4096,
        "messages": messages,
        "temperature": request.temperature,
        "top_p": request.top_p,
    }
    # Fix: only pass `system` when present — the Messages API rejects an
    # explicit null for this field; omitting it entirely is the valid form.
    if system_content:
        kwargs["system"] = system_content.strip()

    async with client.messages.stream(**kwargs) as stream:
        async for text in stream.text_stream:
            yield ChatCompletionChunk(
                id=response_id,
                model=model,
                choices=[StreamChoice(index=0, delta=ChatChoiceDelta(content=text), finish_reason=None)],
            )

    # Final chunk signals completion to the consumer.
    yield ChatCompletionChunk(
        id=response_id,
        model=model,
        choices=[StreamChoice(index=0, delta=ChatChoiceDelta(), finish_reason="stop")],
    )
Reference in New Issue
Block a user