[split-required] Split final 43 files (500-668 LOC) to complete refactoring

klausur-service (11 files):
- cv_gutter_repair, ocr_pipeline_regression, upload_api
- ocr_pipeline_sessions, smart_spell, nru_worksheet_generator
- ocr_pipeline_overlays, mail/aggregator, zeugnis_api
- cv_syllable_detect, self_rag

backend-lehrer (17 files):
- classroom_engine/suggestions, generators/quiz_generator
- worksheets_api, llm_gateway/comparison, state_engine_api
- classroom/models (→ 4 submodules), services/file_processor
- alerts_agent/api/wizard+digests+routes, content_generators/pdf
- classroom/routes/sessions, llm_gateway/inference
- classroom_engine/analytics, auth/keycloak_auth
- alerts_agent/processing/rule_engine, ai_processor/print_versions

agent-core (5 files):
- brain/memory_store, brain/knowledge_graph, brain/context_manager
- orchestrator/supervisor, sessions/session_manager

admin-lehrer (5 components):
- GridOverlay, StepGridReview, DevOpsPipelineSidebar
- DataFlowDiagram, sbom/wizard/page

website (2 files):
- DependencyMap, lehrer/abitur-archiv

Other: nibis_ingestion, grid_detection_service, export-doclayout-onnx

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Benjamin Admin committed on 2026-04-25 09:41:42 +02:00
parent 451365a312
commit bd4b956e3c
113 changed files with 13790 additions and 14148 deletions


@@ -8,10 +8,8 @@ Unterstützt:
"""
import httpx
import json
import logging
from typing import AsyncIterator, Optional
from dataclasses import dataclass
from ..config import get_config, LLMBackendConfig
from ..models.chat import (
@@ -20,26 +18,23 @@ from ..models.chat import (
ChatCompletionChunk,
ChatMessage,
ChatChoice,
StreamChoice,
ChatChoiceDelta,
Usage,
ModelInfo,
ModelListResponse,
)
from .inference_backends import (
InferenceResult,
call_ollama,
stream_ollama,
call_openai_compatible,
stream_openai_compatible,
call_anthropic,
stream_anthropic,
)
logger = logging.getLogger(__name__)
@dataclass
class InferenceResult:
"""Ergebnis einer Inference-Anfrage."""
content: str
model: str
backend: str
usage: Optional[Usage] = None
finish_reason: str = "stop"
class InferenceService:
"""Service für LLM Inference über verschiedene Backends."""
@@ -68,26 +63,17 @@ class InferenceService:
return None
def _map_model_to_backend(self, model: str) -> tuple[str, LLMBackendConfig]:
"""
Maps a model name to the corresponding backend.
Examples:
- "breakpilot-teacher-8b" → Ollama/vLLM with llama3.1:8b
- "claude-3-5-sonnet" → Anthropic
"""
"""Mapped ein Modell-Name zum entsprechenden Backend."""
model_lower = model.lower()
# Explicit Claude models → Anthropic
if "claude" in model_lower:
if self.config.anthropic and self.config.anthropic.enabled:
return self.config.anthropic.default_model, self.config.anthropic
raise ValueError("Anthropic backend not configured")
# BreakPilot models → primary backend
if "breakpilot" in model_lower or "teacher" in model_lower:
backend = self._get_available_backend()
if backend:
# Map to the actual model name
if "70b" in model_lower:
actual_model = "llama3.1:70b" if backend.name == "ollama" else "meta-llama/Meta-Llama-3.1-70B-Instruct"
else:
@@ -95,7 +81,6 @@ class InferenceService:
return actual_model, backend
raise ValueError("No LLM backend available")
# Mistral models
if "mistral" in model_lower:
backend = self._get_available_backend()
if backend:
@@ -103,409 +88,64 @@ class InferenceService:
return actual_model, backend
raise ValueError("No LLM backend available")
# Fallback: use the model name directly
backend = self._get_available_backend()
if backend:
return model, backend
raise ValueError("No LLM backend available")
async def _call_ollama(
self,
backend: LLMBackendConfig,
model: str,
request: ChatCompletionRequest,
) -> InferenceResult:
"""Ruft Ollama API auf (nicht OpenAI-kompatibel)."""
client = await self.get_client()
# Ollama uses its own request format
messages = [{"role": m.role, "content": m.content or ""} for m in request.messages]
payload = {
"model": model,
"messages": messages,
"stream": False,
"options": {
"temperature": request.temperature,
"top_p": request.top_p,
},
}
if request.max_tokens:
payload["options"]["num_predict"] = request.max_tokens
response = await client.post(
f"{backend.base_url}/api/chat",
json=payload,
timeout=backend.timeout,
)
response.raise_for_status()
data = response.json()
return InferenceResult(
content=data.get("message", {}).get("content", ""),
model=model,
backend="ollama",
usage=Usage(
prompt_tokens=data.get("prompt_eval_count", 0),
completion_tokens=data.get("eval_count", 0),
total_tokens=data.get("prompt_eval_count", 0) + data.get("eval_count", 0),
),
finish_reason="stop" if data.get("done") else "length",
)
async def _stream_ollama(
self,
backend: LLMBackendConfig,
model: str,
request: ChatCompletionRequest,
response_id: str,
) -> AsyncIterator[ChatCompletionChunk]:
"""Streamt von Ollama."""
client = await self.get_client()
messages = [{"role": m.role, "content": m.content or ""} for m in request.messages]
payload = {
"model": model,
"messages": messages,
"stream": True,
"options": {
"temperature": request.temperature,
"top_p": request.top_p,
},
}
if request.max_tokens:
payload["options"]["num_predict"] = request.max_tokens
async with client.stream(
"POST",
f"{backend.base_url}/api/chat",
json=payload,
timeout=backend.timeout,
) as response:
response.raise_for_status()
async for line in response.aiter_lines():
if not line:
continue
try:
data = json.loads(line)
content = data.get("message", {}).get("content", "")
done = data.get("done", False)
yield ChatCompletionChunk(
id=response_id,
model=model,
choices=[
StreamChoice(
index=0,
delta=ChatChoiceDelta(content=content),
finish_reason="stop" if done else None,
)
],
)
except json.JSONDecodeError:
continue
async def _call_openai_compatible(
self,
backend: LLMBackendConfig,
model: str,
request: ChatCompletionRequest,
) -> InferenceResult:
"""Ruft OpenAI-kompatible API auf (vLLM, etc.)."""
client = await self.get_client()
headers = {"Content-Type": "application/json"}
if backend.api_key:
headers["Authorization"] = f"Bearer {backend.api_key}"
payload = {
"model": model,
"messages": [m.model_dump(exclude_none=True) for m in request.messages],
"stream": False,
"temperature": request.temperature,
"top_p": request.top_p,
}
if request.max_tokens:
payload["max_tokens"] = request.max_tokens
if request.stop:
payload["stop"] = request.stop
response = await client.post(
f"{backend.base_url}/v1/chat/completions",
json=payload,
headers=headers,
timeout=backend.timeout,
)
response.raise_for_status()
data = response.json()
choice = data.get("choices", [{}])[0]
usage_data = data.get("usage", {})
return InferenceResult(
content=choice.get("message", {}).get("content", ""),
model=model,
backend=backend.name,
usage=Usage(
prompt_tokens=usage_data.get("prompt_tokens", 0),
completion_tokens=usage_data.get("completion_tokens", 0),
total_tokens=usage_data.get("total_tokens", 0),
),
finish_reason=choice.get("finish_reason", "stop"),
)
async def _stream_openai_compatible(
self,
backend: LLMBackendConfig,
model: str,
request: ChatCompletionRequest,
response_id: str,
) -> AsyncIterator[ChatCompletionChunk]:
"""Streamt von OpenAI-kompatibler API."""
client = await self.get_client()
headers = {"Content-Type": "application/json"}
if backend.api_key:
headers["Authorization"] = f"Bearer {backend.api_key}"
payload = {
"model": model,
"messages": [m.model_dump(exclude_none=True) for m in request.messages],
"stream": True,
"temperature": request.temperature,
"top_p": request.top_p,
}
if request.max_tokens:
payload["max_tokens"] = request.max_tokens
async with client.stream(
"POST",
f"{backend.base_url}/v1/chat/completions",
json=payload,
headers=headers,
timeout=backend.timeout,
) as response:
response.raise_for_status()
async for line in response.aiter_lines():
if not line or not line.startswith("data: "):
continue
data_str = line[6:] # Remove "data: " prefix
if data_str == "[DONE]":
break
try:
data = json.loads(data_str)
choice = data.get("choices", [{}])[0]
delta = choice.get("delta", {})
yield ChatCompletionChunk(
id=response_id,
model=model,
choices=[
StreamChoice(
index=0,
delta=ChatChoiceDelta(
role=delta.get("role"),
content=delta.get("content"),
),
finish_reason=choice.get("finish_reason"),
)
],
)
except json.JSONDecodeError:
continue
async def _call_anthropic(
self,
backend: LLMBackendConfig,
model: str,
request: ChatCompletionRequest,
) -> InferenceResult:
"""Ruft Anthropic Claude API auf."""
# Use the Anthropic SDK (already installed)
try:
import anthropic
except ImportError:
raise ImportError("anthropic package required for Claude API")
client = anthropic.AsyncAnthropic(api_key=backend.api_key)
# Extract the system message
system_content = ""
messages = []
for msg in request.messages:
if msg.role == "system":
system_content += (msg.content or "") + "\n"
else:
messages.append({"role": msg.role, "content": msg.content or ""})
response = await client.messages.create(
model=model,
max_tokens=request.max_tokens or 4096,
system=system_content.strip() if system_content else None,
messages=messages,
temperature=request.temperature,
top_p=request.top_p,
)
content = ""
if response.content:
content = response.content[0].text if response.content[0].type == "text" else ""
return InferenceResult(
content=content,
model=model,
backend="anthropic",
usage=Usage(
prompt_tokens=response.usage.input_tokens,
completion_tokens=response.usage.output_tokens,
total_tokens=response.usage.input_tokens + response.usage.output_tokens,
),
finish_reason="stop" if response.stop_reason == "end_turn" else response.stop_reason or "stop",
)
async def _stream_anthropic(
self,
backend: LLMBackendConfig,
model: str,
request: ChatCompletionRequest,
response_id: str,
) -> AsyncIterator[ChatCompletionChunk]:
"""Streamt von Anthropic Claude API."""
try:
import anthropic
except ImportError:
raise ImportError("anthropic package required for Claude API")
client = anthropic.AsyncAnthropic(api_key=backend.api_key)
# Extract the system message
system_content = ""
messages = []
for msg in request.messages:
if msg.role == "system":
system_content += (msg.content or "") + "\n"
else:
messages.append({"role": msg.role, "content": msg.content or ""})
async with client.messages.stream(
model=model,
max_tokens=request.max_tokens or 4096,
system=system_content.strip() if system_content else None,
messages=messages,
temperature=request.temperature,
top_p=request.top_p,
) as stream:
async for text in stream.text_stream:
yield ChatCompletionChunk(
id=response_id,
model=model,
choices=[
StreamChoice(
index=0,
delta=ChatChoiceDelta(content=text),
finish_reason=None,
)
],
)
# Final chunk with finish_reason
yield ChatCompletionChunk(
id=response_id,
model=model,
choices=[
StreamChoice(
index=0,
delta=ChatChoiceDelta(),
finish_reason="stop",
)
],
)
async def complete(self, request: ChatCompletionRequest) -> ChatCompletionResponse:
"""
Performs a chat completion (non-streaming).
"""
"""Performs a chat completion (non-streaming)."""
actual_model, backend = self._map_model_to_backend(request.model)
logger.info(f"Inference request: model={request.model} -> {actual_model} via {backend.name}")
logger.info(f"Inference request: model={request.model}{actual_model} via {backend.name}")
client = await self.get_client()
if backend.name == "ollama":
result = await self._call_ollama(backend, actual_model, request)
result = await call_ollama(client, backend, actual_model, request)
elif backend.name == "anthropic":
result = await self._call_anthropic(backend, actual_model, request)
result = await call_anthropic(backend, actual_model, request)
else:
result = await self._call_openai_compatible(backend, actual_model, request)
result = await call_openai_compatible(client, backend, actual_model, request)
return ChatCompletionResponse(
model=request.model, # Original requested model name
choices=[
ChatChoice(
index=0,
message=ChatMessage(role="assistant", content=result.content),
finish_reason=result.finish_reason,
)
],
model=request.model,
choices=[ChatChoice(index=0, message=ChatMessage(role="assistant", content=result.content), finish_reason=result.finish_reason)],
usage=result.usage,
)
async def stream(self, request: ChatCompletionRequest) -> AsyncIterator[ChatCompletionChunk]:
"""
Performs a chat completion with streaming.
"""
"""Performs a chat completion with streaming."""
import uuid
response_id = f"chatcmpl-{uuid.uuid4().hex[:12]}"
actual_model, backend = self._map_model_to_backend(request.model)
logger.info(f"Streaming request: model={request.model} -> {actual_model} via {backend.name}")
logger.info(f"Streaming request: model={request.model}{actual_model} via {backend.name}")
client = await self.get_client()
if backend.name == "ollama":
async for chunk in self._stream_ollama(backend, actual_model, request, response_id):
async for chunk in stream_ollama(client, backend, actual_model, request, response_id):
yield chunk
elif backend.name == "anthropic":
async for chunk in self._stream_anthropic(backend, actual_model, request, response_id):
async for chunk in stream_anthropic(backend, actual_model, request, response_id):
yield chunk
else:
async for chunk in self._stream_openai_compatible(backend, actual_model, request, response_id):
async for chunk in stream_openai_compatible(client, backend, actual_model, request, response_id):
yield chunk
async def list_models(self) -> ModelListResponse:
"""Listet verfügbare Modelle."""
models = []
# BreakPilot models (mapped to available backends)
backend = self._get_available_backend()
if backend:
models.extend([
ModelInfo(
id="breakpilot-teacher-8b",
owned_by="breakpilot",
description="Llama 3.1 8B optimiert für Schulkontext",
context_length=8192,
),
ModelInfo(
id="breakpilot-teacher-70b",
owned_by="breakpilot",
description="Llama 3.1 70B für komplexe Aufgaben",
context_length=8192,
),
ModelInfo(id="breakpilot-teacher-8b", owned_by="breakpilot", description="Llama 3.1 8B optimiert für Schulkontext", context_length=8192),
ModelInfo(id="breakpilot-teacher-70b", owned_by="breakpilot", description="Llama 3.1 70B für komplexe Aufgaben", context_length=8192),
])
# Claude models (if Anthropic is configured)
if self.config.anthropic and self.config.anthropic.enabled:
models.append(
ModelInfo(
id="claude-3-5-sonnet",
owned_by="anthropic",
description="Claude 3.5 Sonnet - Fallback für höchste Qualität",
context_length=200000,
)
)
models.append(ModelInfo(id="claude-3-5-sonnet", owned_by="anthropic", description="Claude 3.5 Sonnet - Fallback für höchste Qualität", context_length=200000))
return ModelListResponse(data=models)
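After the split, the service above keeps only model mapping, backend delegation, and response assembly; the HTTP specifics move to the new backends module shown next. A minimal caller sketch for orientation (illustrative only: the service wiring and the package-relative import mirror the layout implied by this diff and are not part of the commit):

from ..models.chat import ChatCompletionRequest, ChatMessage  # package layout as in the diff

async def demo(service: "InferenceService") -> None:
    # Illustrative request; optional fields keep their model defaults.
    request = ChatCompletionRequest(
        model="breakpilot-teacher-8b",  # mapped to llama3.1:8b on the primary backend
        messages=[ChatMessage(role="user", content="Summarise photosynthesis in two sentences.")],
    )
    # complete() resolves the backend and delegates to call_ollama /
    # call_openai_compatible / call_anthropic from inference_backends.
    response = await service.complete(request)
    print(response.choices[0].message.content)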


@@ -0,0 +1,230 @@
"""
Inference backends - communication with the individual LLM providers.
Supports Ollama, OpenAI-compatible APIs, and Anthropic Claude.
"""
import json
import logging
from typing import AsyncIterator, Optional
from dataclasses import dataclass
from ..config import LLMBackendConfig
from ..models.chat import (
ChatCompletionRequest,
ChatCompletionChunk,
ChatMessage,
StreamChoice,
ChatChoiceDelta,
Usage,
)
logger = logging.getLogger(__name__)
@dataclass
class InferenceResult:
"""Ergebnis einer Inference-Anfrage."""
content: str
model: str
backend: str
usage: Optional[Usage] = None
finish_reason: str = "stop"
async def call_ollama(client, backend: LLMBackendConfig, model: str, request: ChatCompletionRequest) -> InferenceResult:
"""Ruft Ollama API auf (nicht OpenAI-kompatibel)."""
messages = [{"role": m.role, "content": m.content or ""} for m in request.messages]
payload = {
"model": model,
"messages": messages,
"stream": False,
"options": {"temperature": request.temperature, "top_p": request.top_p},
}
if request.max_tokens:
payload["options"]["num_predict"] = request.max_tokens
response = await client.post(f"{backend.base_url}/api/chat", json=payload, timeout=backend.timeout)
response.raise_for_status()
data = response.json()
return InferenceResult(
content=data.get("message", {}).get("content", ""),
model=model, backend="ollama",
usage=Usage(
prompt_tokens=data.get("prompt_eval_count", 0),
completion_tokens=data.get("eval_count", 0),
total_tokens=data.get("prompt_eval_count", 0) + data.get("eval_count", 0),
),
finish_reason="stop" if data.get("done") else "length",
)
async def stream_ollama(client, backend, model, request, response_id) -> AsyncIterator[ChatCompletionChunk]:
"""Streamt von Ollama."""
messages = [{"role": m.role, "content": m.content or ""} for m in request.messages]
payload = {
"model": model, "messages": messages, "stream": True,
"options": {"temperature": request.temperature, "top_p": request.top_p},
}
if request.max_tokens:
payload["options"]["num_predict"] = request.max_tokens
async with client.stream("POST", f"{backend.base_url}/api/chat", json=payload, timeout=backend.timeout) as response:
response.raise_for_status()
async for line in response.aiter_lines():
if not line:
continue
try:
data = json.loads(line)
content = data.get("message", {}).get("content", "")
done = data.get("done", False)
yield ChatCompletionChunk(
id=response_id, model=model,
choices=[StreamChoice(index=0, delta=ChatChoiceDelta(content=content), finish_reason="stop" if done else None)],
)
except json.JSONDecodeError:
continue
async def call_openai_compatible(client, backend, model, request) -> InferenceResult:
"""Ruft OpenAI-kompatible API auf (vLLM, etc.)."""
headers = {"Content-Type": "application/json"}
if backend.api_key:
headers["Authorization"] = f"Bearer {backend.api_key}"
payload = {
"model": model,
"messages": [m.model_dump(exclude_none=True) for m in request.messages],
"stream": False, "temperature": request.temperature, "top_p": request.top_p,
}
if request.max_tokens:
payload["max_tokens"] = request.max_tokens
if request.stop:
payload["stop"] = request.stop
response = await client.post(f"{backend.base_url}/v1/chat/completions", json=payload, headers=headers, timeout=backend.timeout)
response.raise_for_status()
data = response.json()
choice = data.get("choices", [{}])[0]
usage_data = data.get("usage", {})
return InferenceResult(
content=choice.get("message", {}).get("content", ""),
model=model, backend=backend.name,
usage=Usage(
prompt_tokens=usage_data.get("prompt_tokens", 0),
completion_tokens=usage_data.get("completion_tokens", 0),
total_tokens=usage_data.get("total_tokens", 0),
),
finish_reason=choice.get("finish_reason", "stop"),
)
async def stream_openai_compatible(client, backend, model, request, response_id) -> AsyncIterator[ChatCompletionChunk]:
"""Streamt von OpenAI-kompatibler API."""
headers = {"Content-Type": "application/json"}
if backend.api_key:
headers["Authorization"] = f"Bearer {backend.api_key}"
payload = {
"model": model,
"messages": [m.model_dump(exclude_none=True) for m in request.messages],
"stream": True, "temperature": request.temperature, "top_p": request.top_p,
}
if request.max_tokens:
payload["max_tokens"] = request.max_tokens
async with client.stream("POST", f"{backend.base_url}/v1/chat/completions", json=payload, headers=headers, timeout=backend.timeout) as response:
response.raise_for_status()
async for line in response.aiter_lines():
if not line or not line.startswith("data: "):
continue
data_str = line[6:]  # Remove "data: " prefix
if data_str == "[DONE]":
break
try:
data = json.loads(data_str)
choice = data.get("choices", [{}])[0]
delta = choice.get("delta", {})
yield ChatCompletionChunk(
id=response_id, model=model,
choices=[StreamChoice(index=0, delta=ChatChoiceDelta(role=delta.get("role"), content=delta.get("content")), finish_reason=choice.get("finish_reason"))],
)
except json.JSONDecodeError:
continue
async def call_anthropic(backend, model, request) -> InferenceResult:
"""Ruft Anthropic Claude API auf."""
try:
import anthropic
except ImportError:
raise ImportError("anthropic package required for Claude API")
client = anthropic.AsyncAnthropic(api_key=backend.api_key)
system_content = ""
messages = []
for msg in request.messages:
if msg.role == "system":
system_content += (msg.content or "") + "\n"
else:
messages.append({"role": msg.role, "content": msg.content or ""})
response = await client.messages.create(
model=model, max_tokens=request.max_tokens or 4096,
system=system_content.strip() if system_content else None,
messages=messages, temperature=request.temperature, top_p=request.top_p,
)
content = ""
if response.content:
content = response.content[0].text if response.content[0].type == "text" else ""
return InferenceResult(
content=content, model=model, backend="anthropic",
usage=Usage(
prompt_tokens=response.usage.input_tokens,
completion_tokens=response.usage.output_tokens,
total_tokens=response.usage.input_tokens + response.usage.output_tokens,
),
finish_reason="stop" if response.stop_reason == "end_turn" else response.stop_reason or "stop",
)
async def stream_anthropic(backend, model, request, response_id) -> AsyncIterator[ChatCompletionChunk]:
"""Streamt von Anthropic Claude API."""
try:
import anthropic
except ImportError:
raise ImportError("anthropic package required for Claude API")
client = anthropic.AsyncAnthropic(api_key=backend.api_key)
system_content = ""
messages = []
for msg in request.messages:
if msg.role == "system":
system_content += (msg.content or "") + "\n"
else:
messages.append({"role": msg.role, "content": msg.content or ""})
async with client.messages.stream(
model=model, max_tokens=request.max_tokens or 4096,
system=system_content.strip() if system_content else None,
messages=messages, temperature=request.temperature, top_p=request.top_p,
) as stream:
async for text in stream.text_stream:
yield ChatCompletionChunk(
id=response_id, model=model,
choices=[StreamChoice(index=0, delta=ChatChoiceDelta(content=text), finish_reason=None)],
)
yield ChatCompletionChunk(
id=response_id, model=model,
choices=[StreamChoice(index=0, delta=ChatChoiceDelta(), finish_reason="stop")],
)