[split-required] Split final 43 files (500-668 LOC) to complete refactoring

klausur-service (11 files):
- cv_gutter_repair, ocr_pipeline_regression, upload_api
- ocr_pipeline_sessions, smart_spell, nru_worksheet_generator
- ocr_pipeline_overlays, mail/aggregator, zeugnis_api
- cv_syllable_detect, self_rag

backend-lehrer (17 files):
- classroom_engine/suggestions, generators/quiz_generator
- worksheets_api, llm_gateway/comparison, state_engine_api
- classroom/models (→ 4 submodules), services/file_processor
- alerts_agent/api/wizard+digests+routes, content_generators/pdf
- classroom/routes/sessions, llm_gateway/inference
- classroom_engine/analytics, auth/keycloak_auth
- alerts_agent/processing/rule_engine, ai_processor/print_versions

agent-core (5 files):
- brain/memory_store, brain/knowledge_graph, brain/context_manager
- orchestrator/supervisor, sessions/session_manager

admin-lehrer (5 components):
- GridOverlay, StepGridReview, DevOpsPipelineSidebar
- DataFlowDiagram, sbom/wizard/page

website (2 files):
- DependencyMap, lehrer/abitur-archiv

Other: nibis_ingestion, grid_detection_service, export-doclayout-onnx

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Benjamin Admin committed 2026-04-25 09:41:42 +02:00
parent 451365a312
commit bd4b956e3c
113 changed files with 13790 additions and 14148 deletions
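For context on the threshold driving this series: every file listed above sits in the 500-668 LOC band. Below is a minimal, repo-agnostic sketch of the kind of guard that could enforce such a limit; the 500-line cutoff and the non-blank-line counting are assumptions inferred from the commit message, not part of this commit.

```python
from pathlib import Path

LOC_LIMIT = 500  # cutoff implied by the "500-668 LOC" range above (assumption)

def oversized(root: str = ".") -> list[tuple[int, str]]:
    """List Python files whose non-blank line count exceeds LOC_LIMIT."""
    hits = []
    for path in Path(root).rglob("*.py"):
        text = path.read_text(encoding="utf-8", errors="ignore")
        loc = sum(1 for line in text.splitlines() if line.strip())
        if loc > LOC_LIMIT:
            hits.append((loc, str(path)))
    return sorted(hits, reverse=True)

if __name__ == "__main__":
    for loc, path in oversized():
        print(f"{loc:5d}  {path}")
```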

View File

@@ -9,378 +9,33 @@ Dieses Modul ermoeglicht:
import asyncio
import logging
import time
import uuid
from datetime import datetime, timezone
from typing import Optional
from pydantic import BaseModel, Field
from fastapi import APIRouter, HTTPException, Depends
from ..models.chat import ChatMessage
from ..middleware.auth import verify_api_key
+from .comparison_models import (
+ComparisonRequest,
+LLMResponse,
+ComparisonResponse,
+SavedComparison,
+_comparisons_store,
+_system_prompts_store,
+)
+from .comparison_providers import (
+call_openai,
+call_claude,
+search_tavily,
+search_edusearch,
+call_selfhosted_with_search,
+)
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/comparison", tags=["LLM Comparison"])
class ComparisonRequest(BaseModel):
"""Request fuer LLM-Vergleich."""
prompt: str = Field(..., description="User prompt (z.B. Lehrer-Frage)")
system_prompt: Optional[str] = Field(None, description="Optionaler System Prompt")
enable_openai: bool = Field(True, description="OpenAI/ChatGPT aktivieren")
enable_claude: bool = Field(True, description="Claude aktivieren")
enable_selfhosted_tavily: bool = Field(True, description="Self-hosted + Tavily aktivieren")
enable_selfhosted_edusearch: bool = Field(True, description="Self-hosted + EduSearch aktivieren")
# Parameter fuer Self-hosted Modelle
selfhosted_model: str = Field("llama3.2:3b", description="Self-hosted Modell")
temperature: float = Field(0.7, ge=0.0, le=2.0, description="Temperature")
top_p: float = Field(0.9, ge=0.0, le=1.0, description="Top-p Sampling")
max_tokens: int = Field(2048, ge=1, le=8192, description="Max Tokens")
# Search Parameter
search_results_count: int = Field(5, ge=1, le=20, description="Anzahl Suchergebnisse")
edu_search_filters: Optional[dict] = Field(None, description="Filter fuer EduSearch")
class LLMResponse(BaseModel):
"""Antwort eines einzelnen LLM."""
provider: str
model: str
response: str
latency_ms: int
tokens_used: Optional[int] = None
search_results: Optional[list] = None
error: Optional[str] = None
timestamp: datetime = Field(default_factory=datetime.utcnow)
class ComparisonResponse(BaseModel):
"""Gesamt-Antwort des Vergleichs."""
comparison_id: str
prompt: str
system_prompt: Optional[str]
responses: list[LLMResponse]
created_at: datetime = Field(default_factory=datetime.utcnow)
class SavedComparison(BaseModel):
"""Gespeicherter Vergleich fuer QA."""
comparison_id: str
prompt: str
system_prompt: Optional[str]
responses: list[LLMResponse]
notes: Optional[str] = None
rating: Optional[dict] = None # {"openai": 4, "claude": 5, ...}
created_at: datetime
created_by: Optional[str] = None
# In-Memory Storage (in Production: Database)
_comparisons_store: dict[str, SavedComparison] = {}
_system_prompts_store: dict[str, dict] = {
"default": {
"id": "default",
"name": "Standard Lehrer-Assistent",
"prompt": """Du bist ein hilfreicher Assistent fuer Lehrkraefte in Deutschland.
Deine Aufgaben:
- Hilfe bei der Unterrichtsplanung
- Erklaerung von Fachinhalten
- Erstellung von Arbeitsblaettern und Pruefungen
- Beratung zu paedagogischen Methoden
Antworte immer auf Deutsch und beachte den deutschen Lehrplankontext.""",
"created_at": datetime.now(timezone.utc).isoformat(),
},
"curriculum": {
"id": "curriculum",
"name": "Lehrplan-Experte",
"prompt": """Du bist ein Experte fuer deutsche Lehrplaene und Bildungsstandards.
Du kennst:
- Lehrplaene aller 16 Bundeslaender
- KMK Bildungsstandards
- Kompetenzorientierung im deutschen Bildungssystem
Beziehe dich immer auf konkrete Lehrplanvorgaben wenn moeglich.""",
"created_at": datetime.now(timezone.utc).isoformat(),
},
"worksheet": {
"id": "worksheet",
"name": "Arbeitsblatt-Generator",
"prompt": """Du bist ein spezialisierter Assistent fuer die Erstellung von Arbeitsblaettern.
Erstelle didaktisch sinnvolle Aufgaben mit:
- Klaren Arbeitsanweisungen
- Differenzierungsmoeglichkeiten
- Loesungshinweisen
Format: Markdown mit klarer Struktur.""",
"created_at": datetime.now(timezone.utc).isoformat(),
},
}
async def _call_openai(prompt: str, system_prompt: Optional[str]) -> LLMResponse:
"""Ruft OpenAI ChatGPT auf."""
import os
import httpx
start_time = time.time()
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
return LLMResponse(
provider="openai",
model="gpt-4o-mini",
response="",
latency_ms=0,
error="OPENAI_API_KEY nicht konfiguriert"
)
messages = []
if system_prompt:
messages.append({"role": "system", "content": system_prompt})
messages.append({"role": "user", "content": prompt})
try:
async with httpx.AsyncClient(timeout=60.0) as client:
response = await client.post(
"https://api.openai.com/v1/chat/completions",
headers={
"Authorization": f"Bearer {api_key}",
"Content-Type": "application/json",
},
json={
"model": "gpt-4o-mini",
"messages": messages,
"temperature": 0.7,
"max_tokens": 2048,
},
)
response.raise_for_status()
data = response.json()
latency_ms = int((time.time() - start_time) * 1000)
content = data["choices"][0]["message"]["content"]
tokens = data.get("usage", {}).get("total_tokens")
return LLMResponse(
provider="openai",
model="gpt-4o-mini",
response=content,
latency_ms=latency_ms,
tokens_used=tokens,
)
except Exception as e:
return LLMResponse(
provider="openai",
model="gpt-4o-mini",
response="",
latency_ms=int((time.time() - start_time) * 1000),
error=str(e),
)
async def _call_claude(prompt: str, system_prompt: Optional[str]) -> LLMResponse:
"""Ruft Anthropic Claude auf."""
import os
start_time = time.time()
api_key = os.getenv("ANTHROPIC_API_KEY")
if not api_key:
return LLMResponse(
provider="claude",
model="claude-3-5-sonnet-20241022",
response="",
latency_ms=0,
error="ANTHROPIC_API_KEY nicht konfiguriert"
)
try:
import anthropic
client = anthropic.AsyncAnthropic(api_key=api_key)
response = await client.messages.create(
model="claude-3-5-sonnet-20241022",
max_tokens=2048,
system=system_prompt or "",
messages=[{"role": "user", "content": prompt}],
)
latency_ms = int((time.time() - start_time) * 1000)
content = response.content[0].text if response.content else ""
tokens = response.usage.input_tokens + response.usage.output_tokens
return LLMResponse(
provider="claude",
model="claude-3-5-sonnet-20241022",
response=content,
latency_ms=latency_ms,
tokens_used=tokens,
)
except Exception as e:
return LLMResponse(
provider="claude",
model="claude-3-5-sonnet-20241022",
response="",
latency_ms=int((time.time() - start_time) * 1000),
error=str(e),
)
async def _search_tavily(query: str, count: int = 5) -> list[dict]:
"""Sucht mit Tavily API."""
import os
import httpx
api_key = os.getenv("TAVILY_API_KEY")
if not api_key:
return []
try:
async with httpx.AsyncClient(timeout=30.0) as client:
response = await client.post(
"https://api.tavily.com/search",
json={
"api_key": api_key,
"query": query,
"max_results": count,
"include_domains": [
"kmk.org", "bildungsserver.de", "bpb.de",
"bayern.de", "nrw.de", "berlin.de",
],
},
)
response.raise_for_status()
data = response.json()
return data.get("results", [])
except Exception as e:
logger.error(f"Tavily search error: {e}")
return []
async def _search_edusearch(query: str, count: int = 5, filters: Optional[dict] = None) -> list[dict]:
"""Sucht mit EduSearch API."""
import os
import httpx
edu_search_url = os.getenv("EDU_SEARCH_URL", "http://edu-search-service:8084")
try:
async with httpx.AsyncClient(timeout=30.0) as client:
payload = {
"q": query,
"limit": count,
"mode": "keyword",
}
if filters:
payload["filters"] = filters
response = await client.post(
f"{edu_search_url}/v1/search",
json=payload,
)
response.raise_for_status()
data = response.json()
# Formatiere Ergebnisse
results = []
for r in data.get("results", []):
results.append({
"title": r.get("title", ""),
"url": r.get("url", ""),
"content": r.get("snippet", ""),
"score": r.get("scores", {}).get("final", 0),
})
return results
except Exception as e:
logger.error(f"EduSearch error: {e}")
return []
async def _call_selfhosted_with_search(
prompt: str,
system_prompt: Optional[str],
search_provider: str,
search_results: list[dict],
model: str,
temperature: float,
top_p: float,
max_tokens: int,
) -> LLMResponse:
"""Ruft Self-hosted LLM mit Suchergebnissen auf."""
import os
import httpx
start_time = time.time()
ollama_url = os.getenv("OLLAMA_URL", "http://localhost:11434")
# Baue Kontext aus Suchergebnissen
context_parts = []
for i, result in enumerate(search_results, 1):
context_parts.append(f"[{i}] {result.get('title', 'Untitled')}")
context_parts.append(f" URL: {result.get('url', '')}")
context_parts.append(f" {result.get('content', '')[:500]}")
context_parts.append("")
search_context = "\n".join(context_parts)
# Erweitere System Prompt mit Suchergebnissen
augmented_system = f"""{system_prompt or ''}
Du hast Zugriff auf folgende Suchergebnisse aus {"Tavily" if search_provider == "tavily" else "EduSearch (deutsche Bildungsquellen)"}:
{search_context}
Nutze diese Quellen um deine Antwort zu unterstuetzen. Zitiere relevante Quellen mit [Nummer]."""
messages = [
{"role": "system", "content": augmented_system},
{"role": "user", "content": prompt},
]
try:
async with httpx.AsyncClient(timeout=120.0) as client:
response = await client.post(
f"{ollama_url}/api/chat",
json={
"model": model,
"messages": messages,
"stream": False,
"options": {
"temperature": temperature,
"top_p": top_p,
"num_predict": max_tokens,
},
},
)
response.raise_for_status()
data = response.json()
latency_ms = int((time.time() - start_time) * 1000)
content = data.get("message", {}).get("content", "")
tokens = data.get("prompt_eval_count", 0) + data.get("eval_count", 0)
return LLMResponse(
provider=f"selfhosted_{search_provider}",
model=model,
response=content,
latency_ms=latency_ms,
tokens_used=tokens,
search_results=search_results,
)
except Exception as e:
return LLMResponse(
provider=f"selfhosted_{search_provider}",
model=model,
response="",
latency_ms=int((time.time() - start_time) * 1000),
error=str(e),
search_results=search_results,
)
@router.post("/run", response_model=ComparisonResponse)
async def run_comparison(
request: ComparisonRequest,
@@ -395,23 +50,19 @@ async def run_comparison(
comparison_id = f"cmp-{uuid.uuid4().hex[:12]}"
tasks = []
# System Prompt vorbereiten
system_prompt = request.system_prompt
# OpenAI
if request.enable_openai:
tasks.append(("openai", _call_openai(request.prompt, system_prompt)))
tasks.append(("openai", call_openai(request.prompt, system_prompt)))
# Claude
if request.enable_claude:
tasks.append(("claude", _call_claude(request.prompt, system_prompt)))
tasks.append(("claude", call_claude(request.prompt, system_prompt)))
# Self-hosted + Tavily
if request.enable_selfhosted_tavily:
-tavily_results = await _search_tavily(request.prompt, request.search_results_count)
+tavily_results = await search_tavily(request.prompt, request.search_results_count)
tasks.append((
"selfhosted_tavily",
-_call_selfhosted_with_search(
+call_selfhosted_with_search(
request.prompt,
system_prompt,
"tavily",
@@ -423,16 +74,15 @@ async def run_comparison(
)
))
# Self-hosted + EduSearch
if request.enable_selfhosted_edusearch:
-edu_results = await _search_edusearch(
+edu_results = await search_edusearch(
request.prompt,
request.search_results_count,
request.edu_search_filters,
)
tasks.append((
"selfhosted_edusearch",
-_call_selfhosted_with_search(
+call_selfhosted_with_search(
request.prompt,
system_prompt,
"edusearch",
@@ -444,7 +94,6 @@ async def run_comparison(
)
))
# Parallele Ausfuehrung
responses = []
if tasks:
results = await asyncio.gather(*[t[1] for t in tasks], return_exceptions=True)

View File

@@ -0,0 +1,103 @@
"""
LLM Comparison - Pydantic Models und In-Memory Storage.
"""
from datetime import datetime, timezone
from typing import Optional
from pydantic import BaseModel, Field
class ComparisonRequest(BaseModel):
"""Request fuer LLM-Vergleich."""
prompt: str = Field(..., description="User prompt (z.B. Lehrer-Frage)")
system_prompt: Optional[str] = Field(None, description="Optionaler System Prompt")
enable_openai: bool = Field(True, description="OpenAI/ChatGPT aktivieren")
enable_claude: bool = Field(True, description="Claude aktivieren")
enable_selfhosted_tavily: bool = Field(True, description="Self-hosted + Tavily aktivieren")
enable_selfhosted_edusearch: bool = Field(True, description="Self-hosted + EduSearch aktivieren")
# Parameter fuer Self-hosted Modelle
selfhosted_model: str = Field("llama3.2:3b", description="Self-hosted Modell")
temperature: float = Field(0.7, ge=0.0, le=2.0, description="Temperature")
top_p: float = Field(0.9, ge=0.0, le=1.0, description="Top-p Sampling")
max_tokens: int = Field(2048, ge=1, le=8192, description="Max Tokens")
# Search Parameter
search_results_count: int = Field(5, ge=1, le=20, description="Anzahl Suchergebnisse")
edu_search_filters: Optional[dict] = Field(None, description="Filter fuer EduSearch")
class LLMResponse(BaseModel):
"""Antwort eines einzelnen LLM."""
provider: str
model: str
response: str
latency_ms: int
tokens_used: Optional[int] = None
search_results: Optional[list] = None
error: Optional[str] = None
timestamp: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))  # utcnow is deprecated and naive
class ComparisonResponse(BaseModel):
"""Gesamt-Antwort des Vergleichs."""
comparison_id: str
prompt: str
system_prompt: Optional[str]
responses: list[LLMResponse]
created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))  # utcnow is deprecated and naive
class SavedComparison(BaseModel):
"""Gespeicherter Vergleich fuer QA."""
comparison_id: str
prompt: str
system_prompt: Optional[str]
responses: list[LLMResponse]
notes: Optional[str] = None
rating: Optional[dict] = None # {"openai": 4, "claude": 5, ...}
created_at: datetime
created_by: Optional[str] = None
# In-Memory Storage (in Production: Database)
_comparisons_store: dict[str, SavedComparison] = {}
_system_prompts_store: dict[str, dict] = {
"default": {
"id": "default",
"name": "Standard Lehrer-Assistent",
"prompt": """Du bist ein hilfreicher Assistent fuer Lehrkraefte in Deutschland.
Deine Aufgaben:
- Hilfe bei der Unterrichtsplanung
- Erklaerung von Fachinhalten
- Erstellung von Arbeitsblaettern und Pruefungen
- Beratung zu paedagogischen Methoden
Antworte immer auf Deutsch und beachte den deutschen Lehrplankontext.""",
"created_at": datetime.now(timezone.utc).isoformat(),
},
"curriculum": {
"id": "curriculum",
"name": "Lehrplan-Experte",
"prompt": """Du bist ein Experte fuer deutsche Lehrplaene und Bildungsstandards.
Du kennst:
- Lehrplaene aller 16 Bundeslaender
- KMK Bildungsstandards
- Kompetenzorientierung im deutschen Bildungssystem
Beziehe dich immer auf konkrete Lehrplanvorgaben wenn moeglich.""",
"created_at": datetime.now(timezone.utc).isoformat(),
},
"worksheet": {
"id": "worksheet",
"name": "Arbeitsblatt-Generator",
"prompt": """Du bist ein spezialisierter Assistent fuer die Erstellung von Arbeitsblaettern.
Erstelle didaktisch sinnvolle Aufgaben mit:
- Klaren Arbeitsanweisungen
- Differenzierungsmoeglichkeiten
- Loesungshinweisen
Format: Markdown mit klarer Struktur.""",
"created_at": datetime.now(timezone.utc).isoformat(),
},
}
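A usage sketch for the extracted models file. Field names and defaults are taken from the diff above; the flat import path is an assumption (inside the package it would be a relative import):

```python
from comparison_models import ComparisonRequest  # import path assumed

req = ComparisonRequest(prompt="Erstelle ein Arbeitsblatt zur Bruchrechnung")
assert req.enable_openai and req.enable_claude   # all four providers default to on
assert req.selfhosted_model == "llama3.2:3b"     # default self-hosted model
print(req.model_dump_json(indent=2))

# The Field constraints reject out-of-range values, e.g. temperature has le=2.0:
# ComparisonRequest(prompt="x", temperature=3.0)  -> pydantic.ValidationError
```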

View File

@@ -0,0 +1,270 @@
"""
LLM Comparison - Provider-Aufrufe (OpenAI, Claude, Self-hosted, Search).
"""
import logging
import time
from typing import Optional
from .comparison_models import LLMResponse
logger = logging.getLogger(__name__)
async def call_openai(prompt: str, system_prompt: Optional[str]) -> LLMResponse:
"""Ruft OpenAI ChatGPT auf."""
import os
import httpx
start_time = time.time()
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
return LLMResponse(
provider="openai",
model="gpt-4o-mini",
response="",
latency_ms=0,
error="OPENAI_API_KEY nicht konfiguriert"
)
messages = []
if system_prompt:
messages.append({"role": "system", "content": system_prompt})
messages.append({"role": "user", "content": prompt})
try:
async with httpx.AsyncClient(timeout=60.0) as client:
response = await client.post(
"https://api.openai.com/v1/chat/completions",
headers={
"Authorization": f"Bearer {api_key}",
"Content-Type": "application/json",
},
json={
"model": "gpt-4o-mini",
"messages": messages,
"temperature": 0.7,
"max_tokens": 2048,
},
)
response.raise_for_status()
data = response.json()
latency_ms = int((time.time() - start_time) * 1000)
content = data["choices"][0]["message"]["content"]
tokens = data.get("usage", {}).get("total_tokens")
return LLMResponse(
provider="openai",
model="gpt-4o-mini",
response=content,
latency_ms=latency_ms,
tokens_used=tokens,
)
except Exception as e:
return LLMResponse(
provider="openai",
model="gpt-4o-mini",
response="",
latency_ms=int((time.time() - start_time) * 1000),
error=str(e),
)
async def call_claude(prompt: str, system_prompt: Optional[str]) -> LLMResponse:
"""Ruft Anthropic Claude auf."""
import os
start_time = time.time()
api_key = os.getenv("ANTHROPIC_API_KEY")
if not api_key:
return LLMResponse(
provider="claude",
model="claude-3-5-sonnet-20241022",
response="",
latency_ms=0,
error="ANTHROPIC_API_KEY nicht konfiguriert"
)
try:
import anthropic
client = anthropic.AsyncAnthropic(api_key=api_key)
response = await client.messages.create(
model="claude-3-5-sonnet-20241022",
max_tokens=2048,
system=system_prompt or "",
messages=[{"role": "user", "content": prompt}],
)
latency_ms = int((time.time() - start_time) * 1000)
content = response.content[0].text if response.content else ""
tokens = response.usage.input_tokens + response.usage.output_tokens
return LLMResponse(
provider="claude",
model="claude-3-5-sonnet-20241022",
response=content,
latency_ms=latency_ms,
tokens_used=tokens,
)
except Exception as e:
return LLMResponse(
provider="claude",
model="claude-3-5-sonnet-20241022",
response="",
latency_ms=int((time.time() - start_time) * 1000),
error=str(e),
)
async def search_tavily(query: str, count: int = 5) -> list[dict]:
"""Sucht mit Tavily API."""
import os
import httpx
api_key = os.getenv("TAVILY_API_KEY")
if not api_key:
return []
try:
async with httpx.AsyncClient(timeout=30.0) as client:
response = await client.post(
"https://api.tavily.com/search",
json={
"api_key": api_key,
"query": query,
"max_results": count,
"include_domains": [
"kmk.org", "bildungsserver.de", "bpb.de",
"bayern.de", "nrw.de", "berlin.de",
],
},
)
response.raise_for_status()
data = response.json()
return data.get("results", [])
except Exception as e:
logger.error(f"Tavily search error: {e}")
return []
async def search_edusearch(query: str, count: int = 5, filters: Optional[dict] = None) -> list[dict]:
"""Sucht mit EduSearch API."""
import os
import httpx
edu_search_url = os.getenv("EDU_SEARCH_URL", "http://edu-search-service:8084")
try:
async with httpx.AsyncClient(timeout=30.0) as client:
payload = {
"q": query,
"limit": count,
"mode": "keyword",
}
if filters:
payload["filters"] = filters
response = await client.post(
f"{edu_search_url}/v1/search",
json=payload,
)
response.raise_for_status()
data = response.json()
results = []
for r in data.get("results", []):
results.append({
"title": r.get("title", ""),
"url": r.get("url", ""),
"content": r.get("snippet", ""),
"score": r.get("scores", {}).get("final", 0),
})
return results
except Exception as e:
logger.error(f"EduSearch error: {e}")
return []
async def call_selfhosted_with_search(
prompt: str,
system_prompt: Optional[str],
search_provider: str,
search_results: list[dict],
model: str,
temperature: float,
top_p: float,
max_tokens: int,
) -> LLMResponse:
"""Ruft Self-hosted LLM mit Suchergebnissen auf."""
import os
import httpx
start_time = time.time()
ollama_url = os.getenv("OLLAMA_URL", "http://localhost:11434")
# Baue Kontext aus Suchergebnissen
context_parts = []
for i, result in enumerate(search_results, 1):
context_parts.append(f"[{i}] {result.get('title', 'Untitled')}")
context_parts.append(f" URL: {result.get('url', '')}")
context_parts.append(f" {result.get('content', '')[:500]}")
context_parts.append("")
search_context = "\n".join(context_parts)
augmented_system = f"""{system_prompt or ''}
Du hast Zugriff auf folgende Suchergebnisse aus {"Tavily" if search_provider == "tavily" else "EduSearch (deutsche Bildungsquellen)"}:
{search_context}
Nutze diese Quellen um deine Antwort zu unterstuetzen. Zitiere relevante Quellen mit [Nummer]."""
messages = [
{"role": "system", "content": augmented_system},
{"role": "user", "content": prompt},
]
try:
async with httpx.AsyncClient(timeout=120.0) as client:
response = await client.post(
f"{ollama_url}/api/chat",
json={
"model": model,
"messages": messages,
"stream": False,
"options": {
"temperature": temperature,
"top_p": top_p,
"num_predict": max_tokens,
},
},
)
response.raise_for_status()
data = response.json()
latency_ms = int((time.time() - start_time) * 1000)
content = data.get("message", {}).get("content", "")
tokens = data.get("prompt_eval_count", 0) + data.get("eval_count", 0)
return LLMResponse(
provider=f"selfhosted_{search_provider}",
model=model,
response=content,
latency_ms=latency_ms,
tokens_used=tokens,
search_results=search_results,
)
except Exception as e:
return LLMResponse(
provider=f"selfhosted_{search_provider}",
model=model,
response="",
latency_ms=int((time.time() - start_time) * 1000),
error=str(e),
search_results=search_results,
)
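How the slimmed-down router consumes these functions: it fans them out with asyncio.gather and return_exceptions=True, as the /run handler above shows. A hedged, standalone sketch of that pattern, with the import path assumed:

```python
import asyncio

from comparison_providers import call_claude, call_openai  # import path assumed

async def compare(prompt: str) -> None:
    results = await asyncio.gather(
        call_openai(prompt, None),
        call_claude(prompt, None),
        return_exceptions=True,  # one failing provider must not sink the rest
    )
    for r in results:
        if isinstance(r, BaseException):
            print("provider raised:", r)
        else:
            # missing API keys surface as LLMResponse.error, not exceptions
            print(f"{r.provider}: {r.latency_ms} ms, error={r.error!r}")

asyncio.run(compare("Was sind die KMK Bildungsstandards?"))
```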

View File

@@ -8,10 +8,8 @@ Unterstützt:
"""
import httpx
-import json
import logging
from typing import AsyncIterator, Optional
-from dataclasses import dataclass
from ..config import get_config, LLMBackendConfig
from ..models.chat import (
@@ -20,26 +18,23 @@ from ..models.chat import (
ChatCompletionChunk,
ChatMessage,
ChatChoice,
StreamChoice,
ChatChoiceDelta,
Usage,
ModelInfo,
ModelListResponse,
)
+from .inference_backends import (
+InferenceResult,
+call_ollama,
+stream_ollama,
+call_openai_compatible,
+stream_openai_compatible,
+call_anthropic,
+stream_anthropic,
+)
logger = logging.getLogger(__name__)
-@dataclass
-class InferenceResult:
-"""Ergebnis einer Inference-Anfrage."""
-content: str
-model: str
-backend: str
-usage: Optional[Usage] = None
-finish_reason: str = "stop"
class InferenceService:
"""Service für LLM Inference über verschiedene Backends."""
@@ -68,26 +63,17 @@ class InferenceService:
return None
def _map_model_to_backend(self, model: str) -> tuple[str, LLMBackendConfig]:
"""
Mapped ein Modell-Name zum entsprechenden Backend.
Beispiele:
- "breakpilot-teacher-8b" → Ollama/vLLM mit llama3.1:8b
- "claude-3-5-sonnet" → Anthropic
"""
"""Mapped ein Modell-Name zum entsprechenden Backend."""
model_lower = model.lower()
# Explizite Claude-Modelle → Anthropic
if "claude" in model_lower:
if self.config.anthropic and self.config.anthropic.enabled:
return self.config.anthropic.default_model, self.config.anthropic
raise ValueError("Anthropic backend not configured")
# BreakPilot Modelle → primäres Backend
if "breakpilot" in model_lower or "teacher" in model_lower:
backend = self._get_available_backend()
if backend:
# Map zu tatsächlichem Modell-Namen
if "70b" in model_lower:
actual_model = "llama3.1:70b" if backend.name == "ollama" else "meta-llama/Meta-Llama-3.1-70B-Instruct"
else:
@@ -95,7 +81,6 @@ class InferenceService:
return actual_model, backend
raise ValueError("No LLM backend available")
# Mistral Modelle
if "mistral" in model_lower:
backend = self._get_available_backend()
if backend:
@@ -103,409 +88,64 @@ class InferenceService:
return actual_model, backend
raise ValueError("No LLM backend available")
# Fallback: verwende Modell-Name direkt
backend = self._get_available_backend()
if backend:
return model, backend
raise ValueError("No LLM backend available")
async def _call_ollama(
self,
backend: LLMBackendConfig,
model: str,
request: ChatCompletionRequest,
) -> InferenceResult:
"""Ruft Ollama API auf (nicht OpenAI-kompatibel)."""
client = await self.get_client()
# Ollama verwendet eigenes Format
messages = [{"role": m.role, "content": m.content or ""} for m in request.messages]
payload = {
"model": model,
"messages": messages,
"stream": False,
"options": {
"temperature": request.temperature,
"top_p": request.top_p,
},
}
if request.max_tokens:
payload["options"]["num_predict"] = request.max_tokens
response = await client.post(
f"{backend.base_url}/api/chat",
json=payload,
timeout=backend.timeout,
)
response.raise_for_status()
data = response.json()
return InferenceResult(
content=data.get("message", {}).get("content", ""),
model=model,
backend="ollama",
usage=Usage(
prompt_tokens=data.get("prompt_eval_count", 0),
completion_tokens=data.get("eval_count", 0),
total_tokens=data.get("prompt_eval_count", 0) + data.get("eval_count", 0),
),
finish_reason="stop" if data.get("done") else "length",
)
async def _stream_ollama(
self,
backend: LLMBackendConfig,
model: str,
request: ChatCompletionRequest,
response_id: str,
) -> AsyncIterator[ChatCompletionChunk]:
"""Streamt von Ollama."""
client = await self.get_client()
messages = [{"role": m.role, "content": m.content or ""} for m in request.messages]
payload = {
"model": model,
"messages": messages,
"stream": True,
"options": {
"temperature": request.temperature,
"top_p": request.top_p,
},
}
if request.max_tokens:
payload["options"]["num_predict"] = request.max_tokens
async with client.stream(
"POST",
f"{backend.base_url}/api/chat",
json=payload,
timeout=backend.timeout,
) as response:
response.raise_for_status()
async for line in response.aiter_lines():
if not line:
continue
try:
data = json.loads(line)
content = data.get("message", {}).get("content", "")
done = data.get("done", False)
yield ChatCompletionChunk(
id=response_id,
model=model,
choices=[
StreamChoice(
index=0,
delta=ChatChoiceDelta(content=content),
finish_reason="stop" if done else None,
)
],
)
except json.JSONDecodeError:
continue
async def _call_openai_compatible(
self,
backend: LLMBackendConfig,
model: str,
request: ChatCompletionRequest,
) -> InferenceResult:
"""Ruft OpenAI-kompatible API auf (vLLM, etc.)."""
client = await self.get_client()
headers = {"Content-Type": "application/json"}
if backend.api_key:
headers["Authorization"] = f"Bearer {backend.api_key}"
payload = {
"model": model,
"messages": [m.model_dump(exclude_none=True) for m in request.messages],
"stream": False,
"temperature": request.temperature,
"top_p": request.top_p,
}
if request.max_tokens:
payload["max_tokens"] = request.max_tokens
if request.stop:
payload["stop"] = request.stop
response = await client.post(
f"{backend.base_url}/v1/chat/completions",
json=payload,
headers=headers,
timeout=backend.timeout,
)
response.raise_for_status()
data = response.json()
choice = data.get("choices", [{}])[0]
usage_data = data.get("usage", {})
return InferenceResult(
content=choice.get("message", {}).get("content", ""),
model=model,
backend=backend.name,
usage=Usage(
prompt_tokens=usage_data.get("prompt_tokens", 0),
completion_tokens=usage_data.get("completion_tokens", 0),
total_tokens=usage_data.get("total_tokens", 0),
),
finish_reason=choice.get("finish_reason", "stop"),
)
async def _stream_openai_compatible(
self,
backend: LLMBackendConfig,
model: str,
request: ChatCompletionRequest,
response_id: str,
) -> AsyncIterator[ChatCompletionChunk]:
"""Streamt von OpenAI-kompatibler API."""
client = await self.get_client()
headers = {"Content-Type": "application/json"}
if backend.api_key:
headers["Authorization"] = f"Bearer {backend.api_key}"
payload = {
"model": model,
"messages": [m.model_dump(exclude_none=True) for m in request.messages],
"stream": True,
"temperature": request.temperature,
"top_p": request.top_p,
}
if request.max_tokens:
payload["max_tokens"] = request.max_tokens
async with client.stream(
"POST",
f"{backend.base_url}/v1/chat/completions",
json=payload,
headers=headers,
timeout=backend.timeout,
) as response:
response.raise_for_status()
async for line in response.aiter_lines():
if not line or not line.startswith("data: "):
continue
data_str = line[6:] # Remove "data: " prefix
if data_str == "[DONE]":
break
try:
data = json.loads(data_str)
choice = data.get("choices", [{}])[0]
delta = choice.get("delta", {})
yield ChatCompletionChunk(
id=response_id,
model=model,
choices=[
StreamChoice(
index=0,
delta=ChatChoiceDelta(
role=delta.get("role"),
content=delta.get("content"),
),
finish_reason=choice.get("finish_reason"),
)
],
)
except json.JSONDecodeError:
continue
async def _call_anthropic(
self,
backend: LLMBackendConfig,
model: str,
request: ChatCompletionRequest,
) -> InferenceResult:
"""Ruft Anthropic Claude API auf."""
# Anthropic SDK verwenden (bereits installiert)
try:
import anthropic
except ImportError:
raise ImportError("anthropic package required for Claude API")
client = anthropic.AsyncAnthropic(api_key=backend.api_key)
# System message extrahieren
system_content = ""
messages = []
for msg in request.messages:
if msg.role == "system":
system_content += (msg.content or "") + "\n"
else:
messages.append({"role": msg.role, "content": msg.content or ""})
response = await client.messages.create(
model=model,
max_tokens=request.max_tokens or 4096,
system=system_content.strip() if system_content else None,
messages=messages,
temperature=request.temperature,
top_p=request.top_p,
)
content = ""
if response.content:
content = response.content[0].text if response.content[0].type == "text" else ""
return InferenceResult(
content=content,
model=model,
backend="anthropic",
usage=Usage(
prompt_tokens=response.usage.input_tokens,
completion_tokens=response.usage.output_tokens,
total_tokens=response.usage.input_tokens + response.usage.output_tokens,
),
finish_reason="stop" if response.stop_reason == "end_turn" else response.stop_reason or "stop",
)
async def _stream_anthropic(
self,
backend: LLMBackendConfig,
model: str,
request: ChatCompletionRequest,
response_id: str,
) -> AsyncIterator[ChatCompletionChunk]:
"""Streamt von Anthropic Claude API."""
try:
import anthropic
except ImportError:
raise ImportError("anthropic package required for Claude API")
client = anthropic.AsyncAnthropic(api_key=backend.api_key)
# System message extrahieren
system_content = ""
messages = []
for msg in request.messages:
if msg.role == "system":
system_content += (msg.content or "") + "\n"
else:
messages.append({"role": msg.role, "content": msg.content or ""})
async with client.messages.stream(
model=model,
max_tokens=request.max_tokens or 4096,
system=system_content.strip() if system_content else None,
messages=messages,
temperature=request.temperature,
top_p=request.top_p,
) as stream:
async for text in stream.text_stream:
yield ChatCompletionChunk(
id=response_id,
model=model,
choices=[
StreamChoice(
index=0,
delta=ChatChoiceDelta(content=text),
finish_reason=None,
)
],
)
# Final chunk with finish_reason
yield ChatCompletionChunk(
id=response_id,
model=model,
choices=[
StreamChoice(
index=0,
delta=ChatChoiceDelta(),
finish_reason="stop",
)
],
)
async def complete(self, request: ChatCompletionRequest) -> ChatCompletionResponse:
"""
Führt Chat Completion durch (non-streaming).
"""
"""Führt Chat Completion durch (non-streaming)."""
actual_model, backend = self._map_model_to_backend(request.model)
logger.info(f"Inference request: model={request.model} -> {actual_model} via {backend.name}")
logger.info(f"Inference request: model={request.model}{actual_model} via {backend.name}")
client = await self.get_client()
if backend.name == "ollama":
-result = await self._call_ollama(backend, actual_model, request)
+result = await call_ollama(client, backend, actual_model, request)
elif backend.name == "anthropic":
-result = await self._call_anthropic(backend, actual_model, request)
+result = await call_anthropic(backend, actual_model, request)
else:
-result = await self._call_openai_compatible(backend, actual_model, request)
+result = await call_openai_compatible(client, backend, actual_model, request)
return ChatCompletionResponse(
-model=request.model, # Original requested model name
-choices=[
-ChatChoice(
-index=0,
-message=ChatMessage(role="assistant", content=result.content),
-finish_reason=result.finish_reason,
-)
-],
+model=request.model,
+choices=[ChatChoice(index=0, message=ChatMessage(role="assistant", content=result.content), finish_reason=result.finish_reason)],
usage=result.usage,
)
async def stream(self, request: ChatCompletionRequest) -> AsyncIterator[ChatCompletionChunk]:
"""
Führt Chat Completion mit Streaming durch.
"""
"""Führt Chat Completion mit Streaming durch."""
import uuid
response_id = f"chatcmpl-{uuid.uuid4().hex[:12]}"
actual_model, backend = self._map_model_to_backend(request.model)
logger.info(f"Streaming request: model={request.model} -> {actual_model} via {backend.name}")
logger.info(f"Streaming request: model={request.model}{actual_model} via {backend.name}")
client = await self.get_client()
if backend.name == "ollama":
-async for chunk in self._stream_ollama(backend, actual_model, request, response_id):
+async for chunk in stream_ollama(client, backend, actual_model, request, response_id):
yield chunk
elif backend.name == "anthropic":
-async for chunk in self._stream_anthropic(backend, actual_model, request, response_id):
+async for chunk in stream_anthropic(backend, actual_model, request, response_id):
yield chunk
else:
-async for chunk in self._stream_openai_compatible(backend, actual_model, request, response_id):
+async for chunk in stream_openai_compatible(client, backend, actual_model, request, response_id):
yield chunk
async def list_models(self) -> ModelListResponse:
"""Listet verfügbare Modelle."""
models = []
# BreakPilot Modelle (mapped zu verfügbaren Backends)
backend = self._get_available_backend()
if backend:
models.extend([
-ModelInfo(
-id="breakpilot-teacher-8b",
-owned_by="breakpilot",
-description="Llama 3.1 8B optimiert für Schulkontext",
-context_length=8192,
-),
-ModelInfo(
-id="breakpilot-teacher-70b",
-owned_by="breakpilot",
-description="Llama 3.1 70B für komplexe Aufgaben",
-context_length=8192,
-),
+ModelInfo(id="breakpilot-teacher-8b", owned_by="breakpilot", description="Llama 3.1 8B optimiert für Schulkontext", context_length=8192),
+ModelInfo(id="breakpilot-teacher-70b", owned_by="breakpilot", description="Llama 3.1 70B für komplexe Aufgaben", context_length=8192),
])
# Claude Modelle (wenn Anthropic konfiguriert)
if self.config.anthropic and self.config.anthropic.enabled:
-models.append(
-ModelInfo(
-id="claude-3-5-sonnet",
-owned_by="anthropic",
-description="Claude 3.5 Sonnet - Fallback für höchste Qualität",
-context_length=200000,
-)
-)
+models.append(ModelInfo(id="claude-3-5-sonnet", owned_by="anthropic", description="Claude 3.5 Sonnet - Fallback für höchste Qualität", context_length=200000))
return ModelListResponse(data=models)
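The mapping in _map_model_to_backend above reduces to a small pure function. A self-contained sketch, with backend objects collapsed to plain strings; the mistral branch and the config/enabled checks are omitted, and the concrete Claude model id is an assumption taken from the comparison module:

```python
def route(model: str) -> tuple[str, str]:
    """Return (actual_model, backend_name) following the rules shown above."""
    m = model.lower()
    if "claude" in m:
        # the real code also checks config.anthropic.enabled and raises otherwise
        return "claude-3-5-sonnet-20241022", "anthropic"  # concrete id assumed
    if "breakpilot" in m or "teacher" in m:
        return ("llama3.1:70b" if "70b" in m else "llama3.1:8b"), "ollama"
    return model, "ollama"  # fallback: pass the requested name straight through

assert route("claude-3-5-sonnet")[1] == "anthropic"
assert route("breakpilot-teacher-70b") == ("llama3.1:70b", "ollama")
assert route("llama3.2:3b") == ("llama3.2:3b", "ollama")
```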

View File

@@ -0,0 +1,230 @@
"""
Inference Backends - Kommunikation mit einzelnen LLM-Providern.
Unterstützt Ollama, OpenAI-kompatible APIs und Anthropic Claude.
"""
import json
import logging
from typing import AsyncIterator, Optional
from dataclasses import dataclass
from ..config import LLMBackendConfig
from ..models.chat import (
ChatCompletionRequest,
ChatCompletionChunk,
ChatMessage,
StreamChoice,
ChatChoiceDelta,
Usage,
)
logger = logging.getLogger(__name__)
@dataclass
class InferenceResult:
"""Ergebnis einer Inference-Anfrage."""
content: str
model: str
backend: str
usage: Optional[Usage] = None
finish_reason: str = "stop"
async def call_ollama(client, backend: LLMBackendConfig, model: str, request: ChatCompletionRequest) -> InferenceResult:
"""Ruft Ollama API auf (nicht OpenAI-kompatibel)."""
messages = [{"role": m.role, "content": m.content or ""} for m in request.messages]
payload = {
"model": model,
"messages": messages,
"stream": False,
"options": {"temperature": request.temperature, "top_p": request.top_p},
}
if request.max_tokens:
payload["options"]["num_predict"] = request.max_tokens
response = await client.post(f"{backend.base_url}/api/chat", json=payload, timeout=backend.timeout)
response.raise_for_status()
data = response.json()
return InferenceResult(
content=data.get("message", {}).get("content", ""),
model=model, backend="ollama",
usage=Usage(
prompt_tokens=data.get("prompt_eval_count", 0),
completion_tokens=data.get("eval_count", 0),
total_tokens=data.get("prompt_eval_count", 0) + data.get("eval_count", 0),
),
finish_reason="stop" if data.get("done") else "length",
)
async def stream_ollama(client, backend, model, request, response_id) -> AsyncIterator[ChatCompletionChunk]:
"""Streamt von Ollama."""
messages = [{"role": m.role, "content": m.content or ""} for m in request.messages]
payload = {
"model": model, "messages": messages, "stream": True,
"options": {"temperature": request.temperature, "top_p": request.top_p},
}
if request.max_tokens:
payload["options"]["num_predict"] = request.max_tokens
async with client.stream("POST", f"{backend.base_url}/api/chat", json=payload, timeout=backend.timeout) as response:
response.raise_for_status()
async for line in response.aiter_lines():
if not line:
continue
try:
data = json.loads(line)
content = data.get("message", {}).get("content", "")
done = data.get("done", False)
yield ChatCompletionChunk(
id=response_id, model=model,
choices=[StreamChoice(index=0, delta=ChatChoiceDelta(content=content), finish_reason="stop" if done else None)],
)
except json.JSONDecodeError:
continue
async def call_openai_compatible(client, backend, model, request) -> InferenceResult:
"""Ruft OpenAI-kompatible API auf (vLLM, etc.)."""
headers = {"Content-Type": "application/json"}
if backend.api_key:
headers["Authorization"] = f"Bearer {backend.api_key}"
payload = {
"model": model,
"messages": [m.model_dump(exclude_none=True) for m in request.messages],
"stream": False, "temperature": request.temperature, "top_p": request.top_p,
}
if request.max_tokens:
payload["max_tokens"] = request.max_tokens
if request.stop:
payload["stop"] = request.stop
response = await client.post(f"{backend.base_url}/v1/chat/completions", json=payload, headers=headers, timeout=backend.timeout)
response.raise_for_status()
data = response.json()
choice = data.get("choices", [{}])[0]
usage_data = data.get("usage", {})
return InferenceResult(
content=choice.get("message", {}).get("content", ""),
model=model, backend=backend.name,
usage=Usage(
prompt_tokens=usage_data.get("prompt_tokens", 0),
completion_tokens=usage_data.get("completion_tokens", 0),
total_tokens=usage_data.get("total_tokens", 0),
),
finish_reason=choice.get("finish_reason", "stop"),
)
async def stream_openai_compatible(client, backend, model, request, response_id) -> AsyncIterator[ChatCompletionChunk]:
"""Streamt von OpenAI-kompatibler API."""
headers = {"Content-Type": "application/json"}
if backend.api_key:
headers["Authorization"] = f"Bearer {backend.api_key}"
payload = {
"model": model,
"messages": [m.model_dump(exclude_none=True) for m in request.messages],
"stream": True, "temperature": request.temperature, "top_p": request.top_p,
}
if request.max_tokens:
payload["max_tokens"] = request.max_tokens
async with client.stream("POST", f"{backend.base_url}/v1/chat/completions", json=payload, headers=headers, timeout=backend.timeout) as response:
response.raise_for_status()
async for line in response.aiter_lines():
if not line or not line.startswith("data: "):
continue
data_str = line[6:]
if data_str == "[DONE]":
break
try:
data = json.loads(data_str)
choice = data.get("choices", [{}])[0]
delta = choice.get("delta", {})
yield ChatCompletionChunk(
id=response_id, model=model,
choices=[StreamChoice(index=0, delta=ChatChoiceDelta(role=delta.get("role"), content=delta.get("content")), finish_reason=choice.get("finish_reason"))],
)
except json.JSONDecodeError:
continue
async def call_anthropic(backend, model, request) -> InferenceResult:
"""Ruft Anthropic Claude API auf."""
try:
import anthropic
except ImportError:
raise ImportError("anthropic package required for Claude API")
client = anthropic.AsyncAnthropic(api_key=backend.api_key)
system_content = ""
messages = []
for msg in request.messages:
if msg.role == "system":
system_content += (msg.content or "") + "\n"
else:
messages.append({"role": msg.role, "content": msg.content or ""})
response = await client.messages.create(
model=model, max_tokens=request.max_tokens or 4096,
system=system_content.strip() if system_content else None,
messages=messages, temperature=request.temperature, top_p=request.top_p,
)
content = ""
if response.content:
content = response.content[0].text if response.content[0].type == "text" else ""
return InferenceResult(
content=content, model=model, backend="anthropic",
usage=Usage(
prompt_tokens=response.usage.input_tokens,
completion_tokens=response.usage.output_tokens,
total_tokens=response.usage.input_tokens + response.usage.output_tokens,
),
finish_reason="stop" if response.stop_reason == "end_turn" else response.stop_reason or "stop",
)
async def stream_anthropic(backend, model, request, response_id) -> AsyncIterator[ChatCompletionChunk]:
"""Streamt von Anthropic Claude API."""
try:
import anthropic
except ImportError:
raise ImportError("anthropic package required for Claude API")
client = anthropic.AsyncAnthropic(api_key=backend.api_key)
system_content = ""
messages = []
for msg in request.messages:
if msg.role == "system":
system_content += (msg.content or "") + "\n"
else:
messages.append({"role": msg.role, "content": msg.content or ""})
async with client.messages.stream(
model=model, max_tokens=request.max_tokens or 4096,
system=system_content.strip() if system_content else None,
messages=messages, temperature=request.temperature, top_p=request.top_p,
) as stream:
async for text in stream.text_stream:
yield ChatCompletionChunk(
id=response_id, model=model,
choices=[StreamChoice(index=0, delta=ChatChoiceDelta(content=text), finish_reason=None)],
)
yield ChatCompletionChunk(
id=response_id, model=model,
choices=[StreamChoice(index=0, delta=ChatChoiceDelta(), finish_reason="stop")],
)
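The extracted backend functions take `client` as an untyped first parameter; judging by the `.post`/`.stream` calls and the old methods' `self.get_client()`, it is an httpx.AsyncClient. A hedged sketch of the equivalent raw Ollama request that call_ollama() builds, where the base URL and model name are assumptions for a local setup:

```python
import asyncio

import httpx

async def main() -> None:
    payload = {
        "model": "llama3.1:8b",
        "messages": [{"role": "user", "content": "Hallo"}],
        "stream": False,
        "options": {"temperature": 0.7, "top_p": 0.9},
    }
    async with httpx.AsyncClient() as client:
        # same request body call_ollama() sends; base URL assumed to be local Ollama
        r = await client.post("http://localhost:11434/api/chat", json=payload, timeout=60.0)
        r.raise_for_status()
        print(r.json().get("message", {}).get("content", ""))

asyncio.run(main())
```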