[split-required] Split final 43 files (500-668 LOC) to complete refactoring
klausur-service (11 files): - cv_gutter_repair, ocr_pipeline_regression, upload_api - ocr_pipeline_sessions, smart_spell, nru_worksheet_generator - ocr_pipeline_overlays, mail/aggregator, zeugnis_api - cv_syllable_detect, self_rag backend-lehrer (17 files): - classroom_engine/suggestions, generators/quiz_generator - worksheets_api, llm_gateway/comparison, state_engine_api - classroom/models (→ 4 submodules), services/file_processor - alerts_agent/api/wizard+digests+routes, content_generators/pdf - classroom/routes/sessions, llm_gateway/inference - classroom_engine/analytics, auth/keycloak_auth - alerts_agent/processing/rule_engine, ai_processor/print_versions agent-core (5 files): - brain/memory_store, brain/knowledge_graph, brain/context_manager - orchestrator/supervisor, sessions/session_manager admin-lehrer (5 components): - GridOverlay, StepGridReview, DevOpsPipelineSidebar - DataFlowDiagram, sbom/wizard/page website (2 files): - DependencyMap, lehrer/abitur-archiv Other: nibis_ingestion, grid_detection_service, export-doclayout-onnx Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -9,378 +9,33 @@ Dieses Modul ermoeglicht:
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import time
|
||||
import uuid
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional
|
||||
from pydantic import BaseModel, Field
|
||||
from fastapi import APIRouter, HTTPException, Depends
|
||||
|
||||
from ..models.chat import ChatMessage
|
||||
from ..middleware.auth import verify_api_key
|
||||
from .comparison_models import (
|
||||
ComparisonRequest,
|
||||
LLMResponse,
|
||||
ComparisonResponse,
|
||||
SavedComparison,
|
||||
_comparisons_store,
|
||||
_system_prompts_store,
|
||||
)
|
||||
from .comparison_providers import (
|
||||
call_openai,
|
||||
call_claude,
|
||||
search_tavily,
|
||||
search_edusearch,
|
||||
call_selfhosted_with_search,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
router = APIRouter(prefix="/comparison", tags=["LLM Comparison"])
|
||||
|
||||
|
||||
class ComparisonRequest(BaseModel):
    """Request body for an LLM comparison run."""

    prompt: str = Field(default=..., description="User prompt (z.B. Lehrer-Frage)")
    system_prompt: Optional[str] = Field(default=None, description="Optionaler System Prompt")

    # Per-provider switches; each one defaults to enabled.
    enable_openai: bool = Field(default=True, description="OpenAI/ChatGPT aktivieren")
    enable_claude: bool = Field(default=True, description="Claude aktivieren")
    enable_selfhosted_tavily: bool = Field(default=True, description="Self-hosted + Tavily aktivieren")
    enable_selfhosted_edusearch: bool = Field(default=True, description="Self-hosted + EduSearch aktivieren")

    # Parameters for the self-hosted models.
    selfhosted_model: str = Field(default="llama3.2:3b", description="Self-hosted Modell")
    temperature: float = Field(default=0.7, ge=0.0, le=2.0, description="Temperature")
    top_p: float = Field(default=0.9, ge=0.0, le=1.0, description="Top-p Sampling")
    max_tokens: int = Field(default=2048, ge=1, le=8192, description="Max Tokens")

    # Search parameters.
    search_results_count: int = Field(default=5, ge=1, le=20, description="Anzahl Suchergebnisse")
    edu_search_filters: Optional[dict] = Field(default=None, description="Filter fuer EduSearch")
|
||||
|
||||
|
||||
class LLMResponse(BaseModel):
    """Answer of a single LLM provider.

    ``error`` is set instead of raising when a provider call fails, so one
    failing provider does not invalidate the whole comparison.
    """

    provider: str
    model: str
    response: str
    latency_ms: int
    tokens_used: Optional[int] = None
    search_results: Optional[list] = None
    error: Optional[str] = None
    # Timezone-aware UTC timestamp. datetime.utcnow() is deprecated and
    # returns a naive value, which is inconsistent with the aware
    # datetime.now(timezone.utc) stamps used for the stored system prompts.
    timestamp: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
|
||||
|
||||
|
||||
class ComparisonResponse(BaseModel):
    """Overall response of one comparison run (one entry per provider)."""

    comparison_id: str
    prompt: str
    system_prompt: Optional[str]
    responses: list[LLMResponse]
    # Timezone-aware UTC creation time; replaces the deprecated, naive
    # datetime.utcnow() so it matches the other aware timestamps in this file.
    created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
|
||||
|
||||
|
||||
class SavedComparison(BaseModel):
    """Comparison persisted for later QA review."""

    comparison_id: str
    prompt: str
    system_prompt: Optional[str]
    responses: list[LLMResponse]
    notes: Optional[str] = Field(default=None)
    # Per-provider rating, e.g. {"openai": 4, "claude": 5, ...}
    rating: Optional[dict] = Field(default=None)
    created_at: datetime
    created_by: Optional[str] = Field(default=None)
|
||||
|
||||
|
||||
# In-memory storage (in production: a database).
_comparisons_store: dict[str, SavedComparison] = {}

# System prompts keyed by id. The prompt texts are runtime data sent to the
# models and therefore stay in German.
_system_prompts_store: dict[str, dict] = {
    "default": dict(
        id="default",
        name="Standard Lehrer-Assistent",
        prompt=(
            "Du bist ein hilfreicher Assistent fuer Lehrkraefte in Deutschland.\n"
            "Deine Aufgaben:\n"
            "- Hilfe bei der Unterrichtsplanung\n"
            "- Erklaerung von Fachinhalten\n"
            "- Erstellung von Arbeitsblaettern und Pruefungen\n"
            "- Beratung zu paedagogischen Methoden\n"
            "\n"
            "Antworte immer auf Deutsch und beachte den deutschen Lehrplankontext."
        ),
        created_at=datetime.now(timezone.utc).isoformat(),
    ),
    "curriculum": dict(
        id="curriculum",
        name="Lehrplan-Experte",
        prompt=(
            "Du bist ein Experte fuer deutsche Lehrplaene und Bildungsstandards.\n"
            "Du kennst:\n"
            "- Lehrplaene aller 16 Bundeslaender\n"
            "- KMK Bildungsstandards\n"
            "- Kompetenzorientierung im deutschen Bildungssystem\n"
            "\n"
            "Beziehe dich immer auf konkrete Lehrplanvorgaben wenn moeglich."
        ),
        created_at=datetime.now(timezone.utc).isoformat(),
    ),
    "worksheet": dict(
        id="worksheet",
        name="Arbeitsblatt-Generator",
        prompt=(
            "Du bist ein spezialisierter Assistent fuer die Erstellung von Arbeitsblaettern.\n"
            "Erstelle didaktisch sinnvolle Aufgaben mit:\n"
            "- Klaren Arbeitsanweisungen\n"
            "- Differenzierungsmoeglichkeiten\n"
            "- Loesungshinweisen\n"
            "\n"
            "Format: Markdown mit klarer Struktur."
        ),
        created_at=datetime.now(timezone.utc).isoformat(),
    ),
}
|
||||
|
||||
|
||||
async def _call_openai(prompt: str, system_prompt: Optional[str]) -> LLMResponse:
    """Call the OpenAI chat-completions API and wrap the answer.

    Args:
        prompt: User prompt sent as the ``user`` message.
        system_prompt: Optional system prompt; omitted from the request
            when empty or ``None``.

    Returns:
        An ``LLMResponse`` for provider ``"openai"``. Failures (missing API
        key, HTTP errors, malformed payloads) are never raised; they are
        reported through the ``error`` field with an empty ``response``.
    """
    import os

    import httpx

    model = "gpt-4o-mini"
    # Monotonic clock: the measured latency must not be distorted by
    # wall-clock adjustments (NTP, DST) while the request is in flight.
    started = time.perf_counter()
    api_key = os.getenv("OPENAI_API_KEY")

    if not api_key:
        return LLMResponse(
            provider="openai",
            model=model,
            response="",
            latency_ms=0,
            error="OPENAI_API_KEY nicht konfiguriert",
        )

    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": prompt})

    try:
        async with httpx.AsyncClient(timeout=60.0) as client:
            response = await client.post(
                "https://api.openai.com/v1/chat/completions",
                headers={
                    "Authorization": f"Bearer {api_key}",
                    "Content-Type": "application/json",
                },
                json={
                    "model": model,
                    "messages": messages,
                    "temperature": 0.7,
                    "max_tokens": 2048,
                },
            )
            response.raise_for_status()
            data = response.json()

        latency_ms = int((time.perf_counter() - started) * 1000)
        # The API may return a null content; map it to an empty answer
        # instead of failing model validation on a non-string response.
        content = data["choices"][0]["message"]["content"] or ""
        tokens = data.get("usage", {}).get("total_tokens")

        return LLMResponse(
            provider="openai",
            model=model,
            response=content,
            latency_ms=latency_ms,
            tokens_used=tokens,
        )
    except Exception as e:
        # Best effort by design: report the failure in the result, but log
        # it so provider outages are visible, like the search helpers do.
        logger.error("OpenAI call error: %s", e)
        return LLMResponse(
            provider="openai",
            model=model,
            response="",
            latency_ms=int((time.perf_counter() - started) * 1000),
            error=str(e),
        )
|
||||
|
||||
|
||||
async def _call_claude(prompt: str, system_prompt: Optional[str]) -> LLMResponse:
    """Call Anthropic Claude and wrap the answer.

    Args:
        prompt: User prompt sent as the single ``user`` message.
        system_prompt: Optional system prompt; only sent when non-empty.

    Returns:
        An ``LLMResponse`` for provider ``"claude"``. Failures (missing API
        key, missing ``anthropic`` package, API errors) are never raised;
        they are reported through the ``error`` field.
    """
    import os

    model = "claude-3-5-sonnet-20241022"
    # Monotonic clock so latency is immune to wall-clock adjustments.
    started = time.perf_counter()
    api_key = os.getenv("ANTHROPIC_API_KEY")

    if not api_key:
        return LLMResponse(
            provider="claude",
            model=model,
            response="",
            latency_ms=0,
            error="ANTHROPIC_API_KEY nicht konfiguriert",
        )

    try:
        # Imported lazily inside the try so a missing package is reported
        # as a provider error instead of breaking the whole comparison.
        import anthropic

        request_kwargs = {
            "model": model,
            "max_tokens": 2048,
            "messages": [{"role": "user", "content": prompt}],
        }
        if system_prompt:
            # Leave the parameter out rather than sending an empty string.
            request_kwargs["system"] = system_prompt

        # Context manager closes the underlying HTTP connection pool.
        async with anthropic.AsyncAnthropic(api_key=api_key) as client:
            response = await client.messages.create(**request_kwargs)

        latency_ms = int((time.perf_counter() - started) * 1000)
        content = response.content[0].text if response.content else ""
        tokens = response.usage.input_tokens + response.usage.output_tokens

        return LLMResponse(
            provider="claude",
            model=model,
            response=content,
            latency_ms=latency_ms,
            tokens_used=tokens,
        )
    except Exception as e:
        # Best effort by design; log so failures are not silent.
        logger.error("Claude call error: %s", e)
        return LLMResponse(
            provider="claude",
            model=model,
            response="",
            latency_ms=int((time.perf_counter() - started) * 1000),
            error=str(e),
        )
|
||||
|
||||
|
||||
async def _search_tavily(query: str, count: int = 5) -> list[dict]:
    """Run a Tavily search restricted to a fixed list of domains.

    Returns the ``results`` list of the Tavily response, or an empty list
    when ``TAVILY_API_KEY`` is not set or the request fails (the failure is
    logged, not raised).
    """
    import os
    import httpx

    tavily_key = os.getenv("TAVILY_API_KEY")
    if not tavily_key:
        return []

    allowed_domains = [
        "kmk.org", "bildungsserver.de", "bpb.de",
        "bayern.de", "nrw.de", "berlin.de",
    ]
    body = {
        "api_key": tavily_key,
        "query": query,
        "max_results": count,
        "include_domains": allowed_domains,
    }

    try:
        async with httpx.AsyncClient(timeout=30.0) as http:
            reply = await http.post("https://api.tavily.com/search", json=body)
            reply.raise_for_status()
            return reply.json().get("results", [])
    except Exception as e:
        logger.error(f"Tavily search error: {e}")
        return []
|
||||
|
||||
|
||||
async def _search_edusearch(query: str, count: int = 5, filters: Optional[dict] = None) -> list[dict]:
    """Query the EduSearch service and normalise its hits.

    The service base URL is read from ``EDU_SEARCH_URL``. Each hit is
    reduced to ``title``, ``url``, ``content`` (taken from ``snippet``) and
    ``score`` (taken from ``scores.final``). Any failure is logged and
    yields an empty list.
    """
    import os
    import httpx

    base_url = os.getenv("EDU_SEARCH_URL", "http://edu-search-service:8084")

    try:
        async with httpx.AsyncClient(timeout=30.0) as http:
            body = {"q": query, "limit": count, "mode": "keyword"}
            if filters:
                body["filters"] = filters

            reply = await http.post(f"{base_url}/v1/search", json=body)
            reply.raise_for_status()
            hits = reply.json().get("results", [])

            # Format the results into the common search-hit shape.
            return [
                {
                    "title": hit.get("title", ""),
                    "url": hit.get("url", ""),
                    "content": hit.get("snippet", ""),
                    "score": hit.get("scores", {}).get("final", 0),
                }
                for hit in hits
            ]
    except Exception as e:
        logger.error(f"EduSearch error: {e}")
        return []
|
||||
|
||||
|
||||
async def _call_selfhosted_with_search(
    prompt: str,
    system_prompt: Optional[str],
    search_provider: str,
    search_results: list[dict],
    model: str,
    temperature: float,
    top_p: float,
    max_tokens: int,
) -> LLMResponse:
    """Call the self-hosted LLM (Ollama chat API) with search results as context.

    Args:
        prompt: User prompt.
        system_prompt: Optional base system prompt; the search context is
            appended to it.
        search_provider: ``"tavily"`` or any other value (labelled as
            EduSearch in the prompt); also used in the provider name.
        search_results: Hits with ``title``, ``url`` and ``content`` keys.
        model: Model name passed to Ollama.
        temperature: Sampling temperature.
        top_p: Top-p sampling value.
        max_tokens: Sent to Ollama as ``num_predict``.

    Returns:
        An ``LLMResponse`` for provider ``selfhosted_<search_provider>``
        that always carries ``search_results``. Request failures are
        reported through ``error`` instead of being raised.
    """
    import os

    import httpx

    # Monotonic clock so latency is immune to wall-clock adjustments.
    started = time.perf_counter()
    ollama_url = os.getenv("OLLAMA_URL", "http://localhost:11434")
    provider = f"selfhosted_{search_provider}"

    # Build the context from the search results.
    context_parts = []
    for i, result in enumerate(search_results, 1):
        # `or ''` also covers hits whose content is present but null;
        # slicing None here would raise outside the try block below.
        snippet = (result.get('content') or '')[:500]
        context_parts.append(f"[{i}] {result.get('title', 'Untitled')}")
        context_parts.append(f" URL: {result.get('url', '')}")
        context_parts.append(f" {snippet}")
        context_parts.append("")
    search_context = "\n".join(context_parts)

    # Extend the system prompt with the search results.
    source_label = (
        "Tavily" if search_provider == "tavily"
        else "EduSearch (deutsche Bildungsquellen)"
    )
    augmented_system = (
        f"{system_prompt or ''}\n"
        "\n"
        f"Du hast Zugriff auf folgende Suchergebnisse aus {source_label}:\n"
        "\n"
        f"{search_context}\n"
        "\n"
        "Nutze diese Quellen um deine Antwort zu unterstuetzen. "
        "Zitiere relevante Quellen mit [Nummer]."
    )

    messages = [
        {"role": "system", "content": augmented_system},
        {"role": "user", "content": prompt},
    ]

    try:
        async with httpx.AsyncClient(timeout=120.0) as client:
            response = await client.post(
                f"{ollama_url}/api/chat",
                json={
                    "model": model,
                    "messages": messages,
                    "stream": False,
                    "options": {
                        "temperature": temperature,
                        "top_p": top_p,
                        "num_predict": max_tokens,
                    },
                },
            )
            response.raise_for_status()
            data = response.json()

        latency_ms = int((time.perf_counter() - started) * 1000)
        content = data.get("message", {}).get("content", "")
        tokens = data.get("prompt_eval_count", 0) + data.get("eval_count", 0)

        return LLMResponse(
            provider=provider,
            model=model,
            response=content,
            latency_ms=latency_ms,
            tokens_used=tokens,
            search_results=search_results,
        )
    except Exception as e:
        # Best effort by design; log so failures are not silent.
        logger.error("Self-hosted LLM error (%s): %s", search_provider, e)
        return LLMResponse(
            provider=provider,
            model=model,
            response="",
            latency_ms=int((time.perf_counter() - started) * 1000),
            error=str(e),
            search_results=search_results,
        )
|
||||
|
||||
|
||||
@router.post("/run", response_model=ComparisonResponse)
|
||||
async def run_comparison(
|
||||
request: ComparisonRequest,
|
||||
@@ -395,23 +50,19 @@ async def run_comparison(
|
||||
comparison_id = f"cmp-{uuid.uuid4().hex[:12]}"
|
||||
tasks = []
|
||||
|
||||
# System Prompt vorbereiten
|
||||
system_prompt = request.system_prompt
|
||||
|
||||
# OpenAI
|
||||
if request.enable_openai:
|
||||
tasks.append(("openai", _call_openai(request.prompt, system_prompt)))
|
||||
tasks.append(("openai", call_openai(request.prompt, system_prompt)))
|
||||
|
||||
# Claude
|
||||
if request.enable_claude:
|
||||
tasks.append(("claude", _call_claude(request.prompt, system_prompt)))
|
||||
tasks.append(("claude", call_claude(request.prompt, system_prompt)))
|
||||
|
||||
# Self-hosted + Tavily
|
||||
if request.enable_selfhosted_tavily:
|
||||
tavily_results = await _search_tavily(request.prompt, request.search_results_count)
|
||||
tavily_results = await search_tavily(request.prompt, request.search_results_count)
|
||||
tasks.append((
|
||||
"selfhosted_tavily",
|
||||
_call_selfhosted_with_search(
|
||||
call_selfhosted_with_search(
|
||||
request.prompt,
|
||||
system_prompt,
|
||||
"tavily",
|
||||
@@ -423,16 +74,15 @@ async def run_comparison(
|
||||
)
|
||||
))
|
||||
|
||||
# Self-hosted + EduSearch
|
||||
if request.enable_selfhosted_edusearch:
|
||||
edu_results = await _search_edusearch(
|
||||
edu_results = await search_edusearch(
|
||||
request.prompt,
|
||||
request.search_results_count,
|
||||
request.edu_search_filters,
|
||||
)
|
||||
tasks.append((
|
||||
"selfhosted_edusearch",
|
||||
_call_selfhosted_with_search(
|
||||
call_selfhosted_with_search(
|
||||
request.prompt,
|
||||
system_prompt,
|
||||
"edusearch",
|
||||
@@ -444,7 +94,6 @@ async def run_comparison(
|
||||
)
|
||||
))
|
||||
|
||||
# Parallele Ausfuehrung
|
||||
responses = []
|
||||
if tasks:
|
||||
results = await asyncio.gather(*[t[1] for t in tasks], return_exceptions=True)
|
||||
|
||||
103
backend-lehrer/llm_gateway/routes/comparison_models.py
Normal file
103
backend-lehrer/llm_gateway/routes/comparison_models.py
Normal file
@@ -0,0 +1,103 @@
|
||||
"""
|
||||
LLM Comparison - Pydantic Models und In-Memory Storage.
|
||||
"""
|
||||
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class ComparisonRequest(BaseModel):
    """Request body for an LLM comparison run."""

    prompt: str = Field(default=..., description="User prompt (z.B. Lehrer-Frage)")
    system_prompt: Optional[str] = Field(default=None, description="Optionaler System Prompt")

    # Per-provider switches; each one defaults to enabled.
    enable_openai: bool = Field(default=True, description="OpenAI/ChatGPT aktivieren")
    enable_claude: bool = Field(default=True, description="Claude aktivieren")
    enable_selfhosted_tavily: bool = Field(default=True, description="Self-hosted + Tavily aktivieren")
    enable_selfhosted_edusearch: bool = Field(default=True, description="Self-hosted + EduSearch aktivieren")

    # Parameters for the self-hosted models.
    selfhosted_model: str = Field(default="llama3.2:3b", description="Self-hosted Modell")
    temperature: float = Field(default=0.7, ge=0.0, le=2.0, description="Temperature")
    top_p: float = Field(default=0.9, ge=0.0, le=1.0, description="Top-p Sampling")
    max_tokens: int = Field(default=2048, ge=1, le=8192, description="Max Tokens")

    # Search parameters.
    search_results_count: int = Field(default=5, ge=1, le=20, description="Anzahl Suchergebnisse")
    edu_search_filters: Optional[dict] = Field(default=None, description="Filter fuer EduSearch")
|
||||
|
||||
|
||||
class LLMResponse(BaseModel):
    """Answer of a single LLM provider.

    ``error`` is set instead of raising when a provider call fails, so one
    failing provider does not invalidate the whole comparison.
    """

    provider: str
    model: str
    response: str
    latency_ms: int
    tokens_used: Optional[int] = None
    search_results: Optional[list] = None
    error: Optional[str] = None
    # Timezone-aware UTC timestamp. datetime.utcnow() is deprecated and
    # returns a naive value, which is inconsistent with the aware
    # datetime.now(timezone.utc) stamps used for the stored system prompts.
    timestamp: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
|
||||
|
||||
|
||||
class ComparisonResponse(BaseModel):
    """Overall response of one comparison run (one entry per provider)."""

    comparison_id: str
    prompt: str
    system_prompt: Optional[str]
    responses: list[LLMResponse]
    # Timezone-aware UTC creation time; replaces the deprecated, naive
    # datetime.utcnow() so it matches the other aware timestamps in this file.
    created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
|
||||
|
||||
|
||||
class SavedComparison(BaseModel):
    """Comparison persisted for later QA review."""

    comparison_id: str
    prompt: str
    system_prompt: Optional[str]
    responses: list[LLMResponse]
    notes: Optional[str] = Field(default=None)
    # Per-provider rating, e.g. {"openai": 4, "claude": 5, ...}
    rating: Optional[dict] = Field(default=None)
    created_at: datetime
    created_by: Optional[str] = Field(default=None)
|
||||
|
||||
|
||||
# In-memory storage (in production: a database).
_comparisons_store: dict[str, SavedComparison] = {}

# System prompts keyed by id. The prompt texts are runtime data sent to the
# models and therefore stay in German.
_system_prompts_store: dict[str, dict] = {
    "default": dict(
        id="default",
        name="Standard Lehrer-Assistent",
        prompt=(
            "Du bist ein hilfreicher Assistent fuer Lehrkraefte in Deutschland.\n"
            "Deine Aufgaben:\n"
            "- Hilfe bei der Unterrichtsplanung\n"
            "- Erklaerung von Fachinhalten\n"
            "- Erstellung von Arbeitsblaettern und Pruefungen\n"
            "- Beratung zu paedagogischen Methoden\n"
            "\n"
            "Antworte immer auf Deutsch und beachte den deutschen Lehrplankontext."
        ),
        created_at=datetime.now(timezone.utc).isoformat(),
    ),
    "curriculum": dict(
        id="curriculum",
        name="Lehrplan-Experte",
        prompt=(
            "Du bist ein Experte fuer deutsche Lehrplaene und Bildungsstandards.\n"
            "Du kennst:\n"
            "- Lehrplaene aller 16 Bundeslaender\n"
            "- KMK Bildungsstandards\n"
            "- Kompetenzorientierung im deutschen Bildungssystem\n"
            "\n"
            "Beziehe dich immer auf konkrete Lehrplanvorgaben wenn moeglich."
        ),
        created_at=datetime.now(timezone.utc).isoformat(),
    ),
    "worksheet": dict(
        id="worksheet",
        name="Arbeitsblatt-Generator",
        prompt=(
            "Du bist ein spezialisierter Assistent fuer die Erstellung von Arbeitsblaettern.\n"
            "Erstelle didaktisch sinnvolle Aufgaben mit:\n"
            "- Klaren Arbeitsanweisungen\n"
            "- Differenzierungsmoeglichkeiten\n"
            "- Loesungshinweisen\n"
            "\n"
            "Format: Markdown mit klarer Struktur."
        ),
        created_at=datetime.now(timezone.utc).isoformat(),
    ),
}
|
||||
270
backend-lehrer/llm_gateway/routes/comparison_providers.py
Normal file
270
backend-lehrer/llm_gateway/routes/comparison_providers.py
Normal file
@@ -0,0 +1,270 @@
|
||||
"""
|
||||
LLM Comparison - Provider-Aufrufe (OpenAI, Claude, Self-hosted, Search).
|
||||
"""
|
||||
|
||||
import logging
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
from .comparison_models import LLMResponse
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
async def call_openai(prompt: str, system_prompt: Optional[str]) -> LLMResponse:
    """Call the OpenAI chat-completions API and wrap the answer.

    Args:
        prompt: User prompt sent as the ``user`` message.
        system_prompt: Optional system prompt; omitted from the request
            when empty or ``None``.

    Returns:
        An ``LLMResponse`` for provider ``"openai"``. Failures (missing API
        key, HTTP errors, malformed payloads) are never raised; they are
        reported through the ``error`` field with an empty ``response``.
    """
    import os

    import httpx

    model = "gpt-4o-mini"
    # Monotonic clock: the measured latency must not be distorted by
    # wall-clock adjustments (NTP, DST) while the request is in flight.
    started = time.perf_counter()
    api_key = os.getenv("OPENAI_API_KEY")

    if not api_key:
        return LLMResponse(
            provider="openai",
            model=model,
            response="",
            latency_ms=0,
            error="OPENAI_API_KEY nicht konfiguriert",
        )

    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": prompt})

    try:
        async with httpx.AsyncClient(timeout=60.0) as client:
            response = await client.post(
                "https://api.openai.com/v1/chat/completions",
                headers={
                    "Authorization": f"Bearer {api_key}",
                    "Content-Type": "application/json",
                },
                json={
                    "model": model,
                    "messages": messages,
                    "temperature": 0.7,
                    "max_tokens": 2048,
                },
            )
            response.raise_for_status()
            data = response.json()

        latency_ms = int((time.perf_counter() - started) * 1000)
        # The API may return a null content; map it to an empty answer
        # instead of failing model validation on a non-string response.
        content = data["choices"][0]["message"]["content"] or ""
        tokens = data.get("usage", {}).get("total_tokens")

        return LLMResponse(
            provider="openai",
            model=model,
            response=content,
            latency_ms=latency_ms,
            tokens_used=tokens,
        )
    except Exception as e:
        # Best effort by design: report the failure in the result, but log
        # it so provider outages are visible, like the search helpers do.
        logger.error("OpenAI call error: %s", e)
        return LLMResponse(
            provider="openai",
            model=model,
            response="",
            latency_ms=int((time.perf_counter() - started) * 1000),
            error=str(e),
        )
|
||||
|
||||
|
||||
async def call_claude(prompt: str, system_prompt: Optional[str]) -> LLMResponse:
    """Call Anthropic Claude and wrap the answer.

    Args:
        prompt: User prompt sent as the single ``user`` message.
        system_prompt: Optional system prompt; only sent when non-empty.

    Returns:
        An ``LLMResponse`` for provider ``"claude"``. Failures (missing API
        key, missing ``anthropic`` package, API errors) are never raised;
        they are reported through the ``error`` field.
    """
    import os

    model = "claude-3-5-sonnet-20241022"
    # Monotonic clock so latency is immune to wall-clock adjustments.
    started = time.perf_counter()
    api_key = os.getenv("ANTHROPIC_API_KEY")

    if not api_key:
        return LLMResponse(
            provider="claude",
            model=model,
            response="",
            latency_ms=0,
            error="ANTHROPIC_API_KEY nicht konfiguriert",
        )

    try:
        # Imported lazily inside the try so a missing package is reported
        # as a provider error instead of breaking the whole comparison.
        import anthropic

        request_kwargs = {
            "model": model,
            "max_tokens": 2048,
            "messages": [{"role": "user", "content": prompt}],
        }
        if system_prompt:
            # Leave the parameter out rather than sending an empty string.
            request_kwargs["system"] = system_prompt

        # Context manager closes the underlying HTTP connection pool.
        async with anthropic.AsyncAnthropic(api_key=api_key) as client:
            response = await client.messages.create(**request_kwargs)

        latency_ms = int((time.perf_counter() - started) * 1000)
        content = response.content[0].text if response.content else ""
        tokens = response.usage.input_tokens + response.usage.output_tokens

        return LLMResponse(
            provider="claude",
            model=model,
            response=content,
            latency_ms=latency_ms,
            tokens_used=tokens,
        )
    except Exception as e:
        # Best effort by design; log so failures are not silent.
        logger.error("Claude call error: %s", e)
        return LLMResponse(
            provider="claude",
            model=model,
            response="",
            latency_ms=int((time.perf_counter() - started) * 1000),
            error=str(e),
        )
|
||||
|
||||
|
||||
async def search_tavily(query: str, count: int = 5) -> list[dict]:
    """Run a Tavily search restricted to a fixed list of domains.

    Returns the ``results`` list of the Tavily response, or an empty list
    when ``TAVILY_API_KEY`` is not set or the request fails (the failure is
    logged, not raised).
    """
    import os
    import httpx

    tavily_key = os.getenv("TAVILY_API_KEY")
    if not tavily_key:
        return []

    allowed_domains = [
        "kmk.org", "bildungsserver.de", "bpb.de",
        "bayern.de", "nrw.de", "berlin.de",
    ]
    body = {
        "api_key": tavily_key,
        "query": query,
        "max_results": count,
        "include_domains": allowed_domains,
    }

    try:
        async with httpx.AsyncClient(timeout=30.0) as http:
            reply = await http.post("https://api.tavily.com/search", json=body)
            reply.raise_for_status()
            return reply.json().get("results", [])
    except Exception as e:
        logger.error(f"Tavily search error: {e}")
        return []
|
||||
|
||||
|
||||
async def search_edusearch(query: str, count: int = 5, filters: Optional[dict] = None) -> list[dict]:
    """Query the EduSearch service and normalise its hits.

    The service base URL is read from ``EDU_SEARCH_URL``. Each hit is
    reduced to ``title``, ``url``, ``content`` (taken from ``snippet``) and
    ``score`` (taken from ``scores.final``). Any failure is logged and
    yields an empty list.
    """
    import os
    import httpx

    base_url = os.getenv("EDU_SEARCH_URL", "http://edu-search-service:8084")

    try:
        async with httpx.AsyncClient(timeout=30.0) as http:
            body = {"q": query, "limit": count, "mode": "keyword"}
            if filters:
                body["filters"] = filters

            reply = await http.post(f"{base_url}/v1/search", json=body)
            reply.raise_for_status()
            hits = reply.json().get("results", [])

            return [
                {
                    "title": hit.get("title", ""),
                    "url": hit.get("url", ""),
                    "content": hit.get("snippet", ""),
                    "score": hit.get("scores", {}).get("final", 0),
                }
                for hit in hits
            ]
    except Exception as e:
        logger.error(f"EduSearch error: {e}")
        return []
|
||||
|
||||
|
||||
async def call_selfhosted_with_search(
    prompt: str,
    system_prompt: Optional[str],
    search_provider: str,
    search_results: list[dict],
    model: str,
    temperature: float,
    top_p: float,
    max_tokens: int,
) -> LLMResponse:
    """Call the self-hosted LLM (Ollama chat API) with search results as context.

    Args:
        prompt: User prompt.
        system_prompt: Optional base system prompt; the search context is
            appended to it.
        search_provider: ``"tavily"`` or any other value (labelled as
            EduSearch in the prompt); also used in the provider name.
        search_results: Hits with ``title``, ``url`` and ``content`` keys.
        model: Model name passed to Ollama.
        temperature: Sampling temperature.
        top_p: Top-p sampling value.
        max_tokens: Sent to Ollama as ``num_predict``.

    Returns:
        An ``LLMResponse`` for provider ``selfhosted_<search_provider>``
        that always carries ``search_results``. Request failures are
        reported through ``error`` instead of being raised.
    """
    import os

    import httpx

    # Monotonic clock so latency is immune to wall-clock adjustments.
    started = time.perf_counter()
    ollama_url = os.getenv("OLLAMA_URL", "http://localhost:11434")
    provider = f"selfhosted_{search_provider}"

    # Build the context from the search results.
    context_parts = []
    for i, result in enumerate(search_results, 1):
        # `or ''` also covers hits whose content is present but null;
        # slicing None here would raise outside the try block below.
        snippet = (result.get('content') or '')[:500]
        context_parts.append(f"[{i}] {result.get('title', 'Untitled')}")
        context_parts.append(f" URL: {result.get('url', '')}")
        context_parts.append(f" {snippet}")
        context_parts.append("")
    search_context = "\n".join(context_parts)

    # Extend the system prompt with the search results.
    source_label = (
        "Tavily" if search_provider == "tavily"
        else "EduSearch (deutsche Bildungsquellen)"
    )
    augmented_system = (
        f"{system_prompt or ''}\n"
        "\n"
        f"Du hast Zugriff auf folgende Suchergebnisse aus {source_label}:\n"
        "\n"
        f"{search_context}\n"
        "\n"
        "Nutze diese Quellen um deine Antwort zu unterstuetzen. "
        "Zitiere relevante Quellen mit [Nummer]."
    )

    messages = [
        {"role": "system", "content": augmented_system},
        {"role": "user", "content": prompt},
    ]

    try:
        async with httpx.AsyncClient(timeout=120.0) as client:
            response = await client.post(
                f"{ollama_url}/api/chat",
                json={
                    "model": model,
                    "messages": messages,
                    "stream": False,
                    "options": {
                        "temperature": temperature,
                        "top_p": top_p,
                        "num_predict": max_tokens,
                    },
                },
            )
            response.raise_for_status()
            data = response.json()

        latency_ms = int((time.perf_counter() - started) * 1000)
        content = data.get("message", {}).get("content", "")
        tokens = data.get("prompt_eval_count", 0) + data.get("eval_count", 0)

        return LLMResponse(
            provider=provider,
            model=model,
            response=content,
            latency_ms=latency_ms,
            tokens_used=tokens,
            search_results=search_results,
        )
    except Exception as e:
        # Best effort by design; log so failures are not silent.
        logger.error("Self-hosted LLM error (%s): %s", search_provider, e)
        return LLMResponse(
            provider=provider,
            model=model,
            response="",
            latency_ms=int((time.perf_counter() - started) * 1000),
            error=str(e),
            search_results=search_results,
        )
|
||||
@@ -8,10 +8,8 @@ Unterstützt:
|
||||
"""
|
||||
|
||||
import httpx
|
||||
import json
|
||||
import logging
|
||||
from typing import AsyncIterator, Optional
|
||||
from dataclasses import dataclass
|
||||
|
||||
from ..config import get_config, LLMBackendConfig
|
||||
from ..models.chat import (
|
||||
@@ -20,26 +18,23 @@ from ..models.chat import (
|
||||
ChatCompletionChunk,
|
||||
ChatMessage,
|
||||
ChatChoice,
|
||||
StreamChoice,
|
||||
ChatChoiceDelta,
|
||||
Usage,
|
||||
ModelInfo,
|
||||
ModelListResponse,
|
||||
)
|
||||
from .inference_backends import (
|
||||
InferenceResult,
|
||||
call_ollama,
|
||||
stream_ollama,
|
||||
call_openai_compatible,
|
||||
stream_openai_compatible,
|
||||
call_anthropic,
|
||||
stream_anthropic,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class InferenceResult:
    """Result of a single inference request."""

    # Generated text.
    content: str
    # Model name the request was executed with.
    model: str
    # Name of the backend that produced the result.
    backend: str
    # Token usage, when available.
    usage: Optional[Usage] = None
    # Why generation ended; defaults to "stop".
    finish_reason: str = "stop"
|
||||
|
||||
|
||||
class InferenceService:
|
||||
"""Service für LLM Inference über verschiedene Backends."""
|
||||
|
||||
@@ -68,26 +63,17 @@ class InferenceService:
|
||||
return None
|
||||
|
||||
def _map_model_to_backend(self, model: str) -> tuple[str, LLMBackendConfig]:
|
||||
"""
|
||||
Mapped ein Modell-Name zum entsprechenden Backend.
|
||||
|
||||
Beispiele:
|
||||
- "breakpilot-teacher-8b" → Ollama/vLLM mit llama3.1:8b
|
||||
- "claude-3-5-sonnet" → Anthropic
|
||||
"""
|
||||
"""Mapped ein Modell-Name zum entsprechenden Backend."""
|
||||
model_lower = model.lower()
|
||||
|
||||
# Explizite Claude-Modelle → Anthropic
|
||||
if "claude" in model_lower:
|
||||
if self.config.anthropic and self.config.anthropic.enabled:
|
||||
return self.config.anthropic.default_model, self.config.anthropic
|
||||
raise ValueError("Anthropic backend not configured")
|
||||
|
||||
# BreakPilot Modelle → primäres Backend
|
||||
if "breakpilot" in model_lower or "teacher" in model_lower:
|
||||
backend = self._get_available_backend()
|
||||
if backend:
|
||||
# Map zu tatsächlichem Modell-Namen
|
||||
if "70b" in model_lower:
|
||||
actual_model = "llama3.1:70b" if backend.name == "ollama" else "meta-llama/Meta-Llama-3.1-70B-Instruct"
|
||||
else:
|
||||
@@ -95,7 +81,6 @@ class InferenceService:
|
||||
return actual_model, backend
|
||||
raise ValueError("No LLM backend available")
|
||||
|
||||
# Mistral Modelle
|
||||
if "mistral" in model_lower:
|
||||
backend = self._get_available_backend()
|
||||
if backend:
|
||||
@@ -103,409 +88,64 @@ class InferenceService:
|
||||
return actual_model, backend
|
||||
raise ValueError("No LLM backend available")
|
||||
|
||||
# Fallback: verwende Modell-Name direkt
|
||||
backend = self._get_available_backend()
|
||||
if backend:
|
||||
return model, backend
|
||||
raise ValueError("No LLM backend available")
|
||||
|
||||
async def _call_ollama(
    self,
    backend: LLMBackendConfig,
    model: str,
    request: ChatCompletionRequest,
) -> InferenceResult:
    """Call the Ollama chat API (not OpenAI-compatible) without streaming.

    Posts to ``{backend.base_url}/api/chat`` using the shared client from
    ``self.get_client()`` and maps Ollama's ``prompt_eval_count`` /
    ``eval_count`` counters onto ``Usage``. HTTP errors propagate via
    ``raise_for_status()``.
    """
    http = await self.get_client()

    # Ollama uses its own message format: role plus a non-null content string.
    chat_messages = [
        {"role": msg.role, "content": msg.content or ""}
        for msg in request.messages
    ]

    sampling = {
        "temperature": request.temperature,
        "top_p": request.top_p,
    }
    if request.max_tokens:
        sampling["num_predict"] = request.max_tokens

    body = {
        "model": model,
        "messages": chat_messages,
        "stream": False,
        "options": sampling,
    }

    reply = await http.post(
        f"{backend.base_url}/api/chat",
        json=body,
        timeout=backend.timeout,
    )
    reply.raise_for_status()
    data = reply.json()

    prompt_tokens = data.get("prompt_eval_count", 0)
    completion_tokens = data.get("eval_count", 0)
    usage = Usage(
        prompt_tokens=prompt_tokens,
        completion_tokens=completion_tokens,
        total_tokens=prompt_tokens + completion_tokens,
    )

    if data.get("done"):
        finish_reason = "stop"
    else:
        finish_reason = "length"

    return InferenceResult(
        content=data.get("message", {}).get("content", ""),
        model=model,
        backend="ollama",
        usage=usage,
        finish_reason=finish_reason,
    )
|
||||
|
||||
async def _stream_ollama(
    self,
    backend: LLMBackendConfig,
    model: str,
    request: ChatCompletionRequest,
    response_id: str,
) -> AsyncIterator[ChatCompletionChunk]:
    """Stream a chat completion from Ollama's native chat endpoint.

    Args:
        backend: Backend configuration; ``base_url`` and ``timeout`` are used.
        model: Model name sent to Ollama.
        request: Chat request supplying the messages and sampling parameters.
        response_id: Identifier copied into every emitted chunk.

    Yields:
        One ``ChatCompletionChunk`` per parseable JSON line of the response.
    """
    client = await self.get_client()

    payload = {
        "model": model,
        "messages": [{"role": m.role, "content": m.content or ""} for m in request.messages],
        "stream": True,
        "options": {
            "temperature": request.temperature,
            "top_p": request.top_p,
        },
    }
    if request.max_tokens:
        payload["options"]["num_predict"] = request.max_tokens

    async with client.stream(
        "POST",
        f"{backend.base_url}/api/chat",
        json=payload,
        timeout=backend.timeout,
    ) as response:
        response.raise_for_status()
        async for line in response.aiter_lines():
            if not line:
                continue
            # Only the parse is guarded: unparseable lines are skipped on
            # purpose (best effort), everything else should surface.
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                continue

            content = (data.get("message") or {}).get("content", "")
            finish_reason = None
            if data.get("done", False):
                # Report truncation when Ollama says the limit was hit.
                finish_reason = "length" if data.get("done_reason") == "length" else "stop"

            yield ChatCompletionChunk(
                id=response_id,
                model=model,
                choices=[
                    StreamChoice(
                        index=0,
                        delta=ChatChoiceDelta(content=content),
                        finish_reason=finish_reason,
                    )
                ],
            )
|
||||
|
||||
async def _call_openai_compatible(
    self,
    backend: LLMBackendConfig,
    model: str,
    request: ChatCompletionRequest,
) -> InferenceResult:
    """Call an OpenAI-compatible chat completions endpoint (vLLM, etc.).

    Args:
        backend: Backend configuration; ``base_url``, ``api_key``, ``timeout``
            and ``name`` are used.
        model: Model name sent to the backend.
        request: Chat request supplying messages, sampling parameters and the
            optional ``max_tokens`` / ``stop`` settings.

    Returns:
        The first choice's text and the reported usage as an
        ``InferenceResult``.

    Raises:
        Whatever ``response.raise_for_status()`` raises on a non-success
        HTTP status.
    """
    client = await self.get_client()

    headers = {"Content-Type": "application/json"}
    if backend.api_key:
        headers["Authorization"] = f"Bearer {backend.api_key}"

    payload = {
        "model": model,
        "messages": [m.model_dump(exclude_none=True) for m in request.messages],
        "stream": False,
        "temperature": request.temperature,
        "top_p": request.top_p,
    }
    if request.max_tokens:
        payload["max_tokens"] = request.max_tokens
    if request.stop:
        payload["stop"] = request.stop

    response = await client.post(
        f"{backend.base_url}/v1/chat/completions",
        json=payload,
        headers=headers,
        timeout=backend.timeout,
    )
    response.raise_for_status()
    data = response.json()

    # "or" fallbacks cover keys that are present but empty/null, which the
    # plain .get(key, default) form would let through.
    choice = (data.get("choices") or [{}])[0]
    message = choice.get("message") or {}
    usage_data = data.get("usage") or {}

    return InferenceResult(
        content=message.get("content") or "",
        model=model,
        backend=backend.name,
        usage=Usage(
            prompt_tokens=usage_data.get("prompt_tokens") or 0,
            completion_tokens=usage_data.get("completion_tokens") or 0,
            total_tokens=usage_data.get("total_tokens") or 0,
        ),
        finish_reason=choice.get("finish_reason") or "stop",
    )
|
||||
|
||||
async def _stream_openai_compatible(
    self,
    backend: LLMBackendConfig,
    model: str,
    request: ChatCompletionRequest,
    response_id: str,
) -> AsyncIterator[ChatCompletionChunk]:
    """Stream a chat completion from an OpenAI-compatible endpoint.

    Args:
        backend: Backend configuration; ``base_url``, ``api_key`` and
            ``timeout`` are used.
        model: Model name sent to the backend.
        request: Chat request supplying messages, sampling parameters and the
            optional ``max_tokens`` / ``stop`` settings.
        response_id: Identifier copied into every emitted chunk.

    Yields:
        One ``ChatCompletionChunk`` per parseable ``data:`` event, until the
        ``[DONE]`` marker is seen.
    """
    client = await self.get_client()

    headers = {"Content-Type": "application/json"}
    if backend.api_key:
        headers["Authorization"] = f"Bearer {backend.api_key}"

    payload = {
        "model": model,
        "messages": [m.model_dump(exclude_none=True) for m in request.messages],
        "stream": True,
        "temperature": request.temperature,
        "top_p": request.top_p,
    }
    if request.max_tokens:
        payload["max_tokens"] = request.max_tokens
    # Forward stop sequences here as well, matching the non-streaming call.
    if request.stop:
        payload["stop"] = request.stop

    async with client.stream(
        "POST",
        f"{backend.base_url}/v1/chat/completions",
        json=payload,
        headers=headers,
        timeout=backend.timeout,
    ) as response:
        response.raise_for_status()
        async for line in response.aiter_lines():
            # The space after "data:" is optional in server-sent events.
            if not line.startswith("data:"):
                continue
            data_str = line[len("data:"):].strip()
            if data_str == "[DONE]":
                break
            try:
                data = json.loads(data_str)
            except json.JSONDecodeError:
                continue  # best effort: skip malformed events

            # An empty "choices" list must not raise IndexError; it is
            # treated like a chunk with an empty delta.
            choice = (data.get("choices") or [{}])[0]
            delta = choice.get("delta") or {}

            yield ChatCompletionChunk(
                id=response_id,
                model=model,
                choices=[
                    StreamChoice(
                        index=0,
                        delta=ChatChoiceDelta(
                            role=delta.get("role"),
                            content=delta.get("content"),
                        ),
                        finish_reason=choice.get("finish_reason"),
                    )
                ],
            )
|
||||
|
||||
async def _call_anthropic(
    self,
    backend: LLMBackendConfig,
    model: str,
    request: ChatCompletionRequest,
) -> InferenceResult:
    """Call the Anthropic Claude Messages API through the ``anthropic`` SDK.

    System messages are pulled out of the conversation and sent as the
    separate system prompt; all other messages are forwarded in order.

    Args:
        backend: Backend configuration; only ``api_key`` is used.
        model: Claude model name.
        request: Chat request supplying messages and sampling parameters.

    Returns:
        The response text and token usage as an ``InferenceResult``.

    Raises:
        ImportError: If the ``anthropic`` package is not installed.
    """
    try:
        import anthropic
    except ImportError as err:
        raise ImportError("anthropic package required for Claude API") from err

    client = anthropic.AsyncAnthropic(api_key=backend.api_key)

    system_parts = []
    messages = []
    for msg in request.messages:
        text = msg.content or ""
        if msg.role == "system":
            system_parts.append(text)
        else:
            messages.append({"role": msg.role, "content": text})
    system_prompt = "\n".join(system_parts).strip()

    params = {
        "model": model,
        "max_tokens": request.max_tokens or 4096,
        "messages": messages,
    }
    # Optional fields are left out entirely instead of being sent as None.
    if system_prompt:
        params["system"] = system_prompt
    if request.temperature is not None:
        params["temperature"] = request.temperature
    if request.top_p is not None:
        params["top_p"] = request.top_p

    response = await client.messages.create(**params)

    # Collect every text block, not just the first content block.
    content = "".join(
        block.text
        for block in (response.content or [])
        if getattr(block, "type", None) == "text"
    )

    # NOTE(review): stop reasons other than "end_turn" are passed through
    # unchanged (e.g. "max_tokens"); confirm callers accept those values.
    finish_reason = "stop" if response.stop_reason == "end_turn" else response.stop_reason or "stop"

    return InferenceResult(
        content=content,
        model=model,
        backend="anthropic",
        usage=Usage(
            prompt_tokens=response.usage.input_tokens,
            completion_tokens=response.usage.output_tokens,
            total_tokens=response.usage.input_tokens + response.usage.output_tokens,
        ),
        finish_reason=finish_reason,
    )
|
||||
|
||||
async def _stream_anthropic(
    self,
    backend: LLMBackendConfig,
    model: str,
    request: ChatCompletionRequest,
    response_id: str,
) -> AsyncIterator[ChatCompletionChunk]:
    """Stream a completion from the Anthropic Claude API.

    Args:
        backend: Backend configuration; only ``api_key`` is used.
        model: Claude model name.
        request: Chat request supplying messages and sampling parameters.
        response_id: Identifier copied into every emitted chunk.

    Yields:
        One chunk per text fragment, followed by a final empty-delta chunk
        whose ``finish_reason`` is ``"stop"``.

    Raises:
        ImportError: If the ``anthropic`` package is not installed.
    """
    try:
        import anthropic
    except ImportError as err:
        raise ImportError("anthropic package required for Claude API") from err

    client = anthropic.AsyncAnthropic(api_key=backend.api_key)

    # System messages travel separately from the conversation turns.
    system_parts = []
    messages = []
    for msg in request.messages:
        text = msg.content or ""
        if msg.role == "system":
            system_parts.append(text)
        else:
            messages.append({"role": msg.role, "content": text})
    system_prompt = "\n".join(system_parts).strip()

    params = {
        "model": model,
        "max_tokens": request.max_tokens or 4096,
        "messages": messages,
    }
    # Optional fields are left out entirely instead of being sent as None.
    if system_prompt:
        params["system"] = system_prompt
    if request.temperature is not None:
        params["temperature"] = request.temperature
    if request.top_p is not None:
        params["top_p"] = request.top_p

    async with client.messages.stream(**params) as stream:
        async for text in stream.text_stream:
            yield ChatCompletionChunk(
                id=response_id,
                model=model,
                choices=[
                    StreamChoice(
                        index=0,
                        delta=ChatChoiceDelta(content=text),
                        finish_reason=None,
                    )
                ],
            )

    # Closing chunk carrying only the finish reason.
    yield ChatCompletionChunk(
        id=response_id,
        model=model,
        choices=[
            StreamChoice(
                index=0,
                delta=ChatChoiceDelta(),
                finish_reason="stop",
            )
        ],
    )
|
||||
|
||||
async def complete(self, request: ChatCompletionRequest) -> ChatCompletionResponse:
    """Run a non-streaming chat completion.

    The requested model is mapped to a concrete model and backend, the
    matching backend function is called, and its result is wrapped in a
    response that reports the originally requested model name.

    Args:
        request: The incoming chat completion request.

    Returns:
        A ``ChatCompletionResponse`` with a single assistant choice.
    """
    actual_model, backend = self._map_model_to_backend(request.model)
    logger.info(f"Inference request: model={request.model} → {actual_model} via {backend.name}")
    client = await self.get_client()

    # Exactly one backend call per request, selected by backend name.
    if backend.name == "ollama":
        result = await call_ollama(client, backend, actual_model, request)
    elif backend.name == "anthropic":
        result = await call_anthropic(backend, actual_model, request)
    else:
        result = await call_openai_compatible(client, backend, actual_model, request)

    return ChatCompletionResponse(
        model=request.model,
        choices=[
            ChatChoice(
                index=0,
                message=ChatMessage(role="assistant", content=result.content),
                finish_reason=result.finish_reason,
            )
        ],
        usage=result.usage,
    )
|
||||
|
||||
async def stream(self, request: ChatCompletionRequest) -> AsyncIterator[ChatCompletionChunk]:
    """Run a chat completion with streaming.

    A fresh ``chatcmpl-`` response id is generated and shared by all chunks
    of this stream.

    Args:
        request: The incoming chat completion request.

    Yields:
        The chunks produced by the backend selected for the requested model.
    """
    import uuid

    response_id = f"chatcmpl-{uuid.uuid4().hex[:12]}"

    actual_model, backend = self._map_model_to_backend(request.model)
    logger.info(f"Streaming request: model={request.model} → {actual_model} via {backend.name}")
    client = await self.get_client()

    # Pick exactly one chunk source; each backend is streamed only once.
    if backend.name == "ollama":
        chunks = stream_ollama(client, backend, actual_model, request, response_id)
    elif backend.name == "anthropic":
        chunks = stream_anthropic(backend, actual_model, request, response_id)
    else:
        chunks = stream_openai_compatible(client, backend, actual_model, request, response_id)

    async for chunk in chunks:
        yield chunk
|
||||
|
||||
async def list_models(self) -> ModelListResponse:
    """List the models this service currently offers.

    The BreakPilot models are listed when any backend is available; the
    Claude model is listed when the Anthropic configuration is present and
    enabled.

    Returns:
        A ``ModelListResponse`` containing each offered model once.
    """
    models = []

    # BreakPilot models are mapped onto whichever backend is available.
    if self._get_available_backend():
        models.extend([
            ModelInfo(
                id="breakpilot-teacher-8b",
                owned_by="breakpilot",
                description="Llama 3.1 8B optimiert für Schulkontext",
                context_length=8192,
            ),
            ModelInfo(
                id="breakpilot-teacher-70b",
                owned_by="breakpilot",
                description="Llama 3.1 70B für komplexe Aufgaben",
                context_length=8192,
            ),
        ])

    # Claude is offered only when Anthropic is configured and enabled.
    if self.config.anthropic and self.config.anthropic.enabled:
        models.append(
            ModelInfo(
                id="claude-3-5-sonnet",
                owned_by="anthropic",
                description="Claude 3.5 Sonnet - Fallback für höchste Qualität",
                context_length=200000,
            )
        )

    return ModelListResponse(data=models)
|
||||
|
||||
|
||||
230
backend-lehrer/llm_gateway/services/inference_backends.py
Normal file
230
backend-lehrer/llm_gateway/services/inference_backends.py
Normal file
@@ -0,0 +1,230 @@
|
||||
"""
|
||||
Inference Backends - Kommunikation mit einzelnen LLM-Providern.
|
||||
|
||||
Unterstützt Ollama, OpenAI-kompatible APIs und Anthropic Claude.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
from typing import AsyncIterator, Optional
|
||||
from dataclasses import dataclass
|
||||
|
||||
from ..config import LLMBackendConfig
|
||||
from ..models.chat import (
|
||||
ChatCompletionRequest,
|
||||
ChatCompletionChunk,
|
||||
ChatMessage,
|
||||
StreamChoice,
|
||||
ChatChoiceDelta,
|
||||
Usage,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class InferenceResult:
    """Outcome of one inference request against a single backend."""

    # Generated assistant text.
    content: str
    # Model name that was sent to the backend.
    model: str
    # Name of the backend that produced this result.
    backend: str
    # Token accounting; None when no usage is attached.
    usage: Optional[Usage] = None
    # Why generation ended; "stop" unless the backend call says otherwise.
    finish_reason: str = "stop"
|
||||
|
||||
|
||||
async def call_ollama(client, backend: LLMBackendConfig, model: str, request: ChatCompletionRequest) -> InferenceResult:
    """Call Ollama's native (non-OpenAI-compatible) chat endpoint.

    Args:
        client: Async HTTP client used to send the POST request.
        backend: Backend configuration; ``base_url`` and ``timeout`` are used.
        model: Model name sent to Ollama.
        request: Chat request supplying the messages and sampling parameters.

    Returns:
        The assistant text and token counts wrapped in an ``InferenceResult``.

    Raises:
        Whatever ``response.raise_for_status()`` raises on a non-success
        HTTP status.
    """
    payload = {
        "model": model,
        # Ollama takes plain role/content pairs; missing content becomes "".
        "messages": [{"role": m.role, "content": m.content or ""} for m in request.messages],
        "stream": False,
        "options": {"temperature": request.temperature, "top_p": request.top_p},
    }
    if request.max_tokens:
        payload["options"]["num_predict"] = request.max_tokens

    response = await client.post(f"{backend.base_url}/api/chat", json=payload, timeout=backend.timeout)
    response.raise_for_status()
    data = response.json()

    # Tolerate a null "message" / null counters instead of crashing on them.
    message = data.get("message") or {}
    prompt_tokens = data.get("prompt_eval_count") or 0
    completion_tokens = data.get("eval_count") or 0

    # A finished response can still be truncated by the token limit; honour
    # an explicit done_reason of "length" before falling back to "done".
    if data.get("done_reason") == "length":
        finish_reason = "length"
    else:
        finish_reason = "stop" if data.get("done") else "length"

    return InferenceResult(
        content=message.get("content") or "",
        model=model,
        backend="ollama",
        usage=Usage(
            prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
            total_tokens=prompt_tokens + completion_tokens,
        ),
        finish_reason=finish_reason,
    )
|
||||
|
||||
|
||||
async def stream_ollama(client, backend, model, request, response_id) -> AsyncIterator[ChatCompletionChunk]:
    """Stream a chat completion from Ollama's native chat endpoint.

    Args:
        client: Async HTTP client used to open the streaming POST request.
        backend: Backend configuration; ``base_url`` and ``timeout`` are used.
        model: Model name sent to Ollama.
        request: Chat request supplying the messages and sampling parameters.
        response_id: Identifier copied into every emitted chunk.

    Yields:
        One ``ChatCompletionChunk`` per parseable JSON line of the response.
    """
    payload = {
        "model": model,
        "messages": [{"role": m.role, "content": m.content or ""} for m in request.messages],
        "stream": True,
        "options": {"temperature": request.temperature, "top_p": request.top_p},
    }
    if request.max_tokens:
        payload["options"]["num_predict"] = request.max_tokens

    url = f"{backend.base_url}/api/chat"
    async with client.stream("POST", url, json=payload, timeout=backend.timeout) as response:
        response.raise_for_status()
        async for line in response.aiter_lines():
            if not line:
                continue
            # Only the parse is guarded: unparseable lines are skipped on
            # purpose (best effort), everything else should surface.
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                continue

            content = (data.get("message") or {}).get("content", "")
            finish_reason = None
            if data.get("done", False):
                # Report truncation when Ollama says the limit was hit.
                finish_reason = "length" if data.get("done_reason") == "length" else "stop"

            yield ChatCompletionChunk(
                id=response_id,
                model=model,
                choices=[StreamChoice(index=0, delta=ChatChoiceDelta(content=content), finish_reason=finish_reason)],
            )
|
||||
|
||||
|
||||
async def call_openai_compatible(client, backend, model, request) -> InferenceResult:
    """Call an OpenAI-compatible chat completions endpoint (vLLM, etc.).

    Args:
        client: Async HTTP client used to send the POST request.
        backend: Backend configuration; ``base_url``, ``api_key``, ``timeout``
            and ``name`` are used.
        model: Model name sent to the backend.
        request: Chat request supplying messages, sampling parameters and the
            optional ``max_tokens`` / ``stop`` settings.

    Returns:
        The first choice's text and the reported usage as an
        ``InferenceResult``.

    Raises:
        Whatever ``response.raise_for_status()`` raises on a non-success
        HTTP status.
    """
    headers = {"Content-Type": "application/json"}
    if backend.api_key:
        headers["Authorization"] = f"Bearer {backend.api_key}"

    payload = {
        "model": model,
        "messages": [m.model_dump(exclude_none=True) for m in request.messages],
        "stream": False,
        "temperature": request.temperature,
        "top_p": request.top_p,
    }
    if request.max_tokens:
        payload["max_tokens"] = request.max_tokens
    if request.stop:
        payload["stop"] = request.stop

    response = await client.post(
        f"{backend.base_url}/v1/chat/completions",
        json=payload,
        headers=headers,
        timeout=backend.timeout,
    )
    response.raise_for_status()
    data = response.json()

    # "or" fallbacks cover keys that are present but empty/null, which the
    # plain .get(key, default) form would let through.
    choice = (data.get("choices") or [{}])[0]
    message = choice.get("message") or {}
    usage_data = data.get("usage") or {}

    return InferenceResult(
        content=message.get("content") or "",
        model=model,
        backend=backend.name,
        usage=Usage(
            prompt_tokens=usage_data.get("prompt_tokens") or 0,
            completion_tokens=usage_data.get("completion_tokens") or 0,
            total_tokens=usage_data.get("total_tokens") or 0,
        ),
        finish_reason=choice.get("finish_reason") or "stop",
    )
|
||||
|
||||
|
||||
async def stream_openai_compatible(client, backend, model, request, response_id) -> AsyncIterator[ChatCompletionChunk]:
    """Stream a chat completion from an OpenAI-compatible endpoint.

    Args:
        client: Async HTTP client used to open the streaming POST request.
        backend: Backend configuration; ``base_url``, ``api_key`` and
            ``timeout`` are used.
        model: Model name sent to the backend.
        request: Chat request supplying messages, sampling parameters and the
            optional ``max_tokens`` / ``stop`` settings.
        response_id: Identifier copied into every emitted chunk.

    Yields:
        One ``ChatCompletionChunk`` per parseable ``data:`` event, until the
        ``[DONE]`` marker is seen.
    """
    headers = {"Content-Type": "application/json"}
    if backend.api_key:
        headers["Authorization"] = f"Bearer {backend.api_key}"

    payload = {
        "model": model,
        "messages": [m.model_dump(exclude_none=True) for m in request.messages],
        "stream": True,
        "temperature": request.temperature,
        "top_p": request.top_p,
    }
    if request.max_tokens:
        payload["max_tokens"] = request.max_tokens
    # Forward stop sequences here as well, matching the non-streaming call.
    if request.stop:
        payload["stop"] = request.stop

    url = f"{backend.base_url}/v1/chat/completions"
    async with client.stream("POST", url, json=payload, headers=headers, timeout=backend.timeout) as response:
        response.raise_for_status()
        async for line in response.aiter_lines():
            # The space after "data:" is optional in server-sent events.
            if not line.startswith("data:"):
                continue
            data_str = line[len("data:"):].strip()
            if data_str == "[DONE]":
                break
            try:
                data = json.loads(data_str)
            except json.JSONDecodeError:
                continue  # best effort: skip malformed events

            # An empty "choices" list must not raise IndexError; it is
            # treated like a chunk with an empty delta.
            choice = (data.get("choices") or [{}])[0]
            delta = choice.get("delta") or {}

            yield ChatCompletionChunk(
                id=response_id,
                model=model,
                choices=[
                    StreamChoice(
                        index=0,
                        delta=ChatChoiceDelta(role=delta.get("role"), content=delta.get("content")),
                        finish_reason=choice.get("finish_reason"),
                    )
                ],
            )
|
||||
|
||||
|
||||
async def call_anthropic(backend, model, request) -> InferenceResult:
    """Call the Anthropic Claude Messages API through the ``anthropic`` SDK.

    System messages are pulled out of the conversation and sent as the
    separate system prompt; all other messages are forwarded in order.

    Args:
        backend: Backend configuration; only ``api_key`` is used.
        model: Claude model name.
        request: Chat request supplying messages and sampling parameters.

    Returns:
        The response text and token usage as an ``InferenceResult``.

    Raises:
        ImportError: If the ``anthropic`` package is not installed.
    """
    try:
        import anthropic
    except ImportError as err:
        raise ImportError("anthropic package required for Claude API") from err

    client = anthropic.AsyncAnthropic(api_key=backend.api_key)

    system_parts = []
    messages = []
    for msg in request.messages:
        text = msg.content or ""
        if msg.role == "system":
            system_parts.append(text)
        else:
            messages.append({"role": msg.role, "content": text})
    system_prompt = "\n".join(system_parts).strip()

    params = {
        "model": model,
        "max_tokens": request.max_tokens or 4096,
        "messages": messages,
    }
    # Optional fields are left out entirely instead of being sent as None.
    if system_prompt:
        params["system"] = system_prompt
    if request.temperature is not None:
        params["temperature"] = request.temperature
    if request.top_p is not None:
        params["top_p"] = request.top_p

    response = await client.messages.create(**params)

    # Collect every text block, not just the first content block.
    content = "".join(
        block.text
        for block in (response.content or [])
        if getattr(block, "type", None) == "text"
    )

    # NOTE(review): stop reasons other than "end_turn" are passed through
    # unchanged (e.g. "max_tokens"); confirm callers accept those values.
    finish_reason = "stop" if response.stop_reason == "end_turn" else response.stop_reason or "stop"

    return InferenceResult(
        content=content,
        model=model,
        backend="anthropic",
        usage=Usage(
            prompt_tokens=response.usage.input_tokens,
            completion_tokens=response.usage.output_tokens,
            total_tokens=response.usage.input_tokens + response.usage.output_tokens,
        ),
        finish_reason=finish_reason,
    )
|
||||
|
||||
|
||||
async def stream_anthropic(backend, model, request, response_id) -> AsyncIterator[ChatCompletionChunk]:
    """Stream a completion from the Anthropic Claude API.

    Args:
        backend: Backend configuration; only ``api_key`` is used.
        model: Claude model name.
        request: Chat request supplying messages and sampling parameters.
        response_id: Identifier copied into every emitted chunk.

    Yields:
        One chunk per text fragment, followed by a final empty-delta chunk
        whose ``finish_reason`` is ``"stop"``.

    Raises:
        ImportError: If the ``anthropic`` package is not installed.
    """
    try:
        import anthropic
    except ImportError as err:
        raise ImportError("anthropic package required for Claude API") from err

    client = anthropic.AsyncAnthropic(api_key=backend.api_key)

    # System messages travel separately from the conversation turns.
    system_parts = []
    messages = []
    for msg in request.messages:
        text = msg.content or ""
        if msg.role == "system":
            system_parts.append(text)
        else:
            messages.append({"role": msg.role, "content": text})
    system_prompt = "\n".join(system_parts).strip()

    params = {
        "model": model,
        "max_tokens": request.max_tokens or 4096,
        "messages": messages,
    }
    # Optional fields are left out entirely instead of being sent as None.
    if system_prompt:
        params["system"] = system_prompt
    if request.temperature is not None:
        params["temperature"] = request.temperature
    if request.top_p is not None:
        params["top_p"] = request.top_p

    async with client.messages.stream(**params) as stream:
        async for text in stream.text_stream:
            yield ChatCompletionChunk(
                id=response_id,
                model=model,
                choices=[StreamChoice(index=0, delta=ChatChoiceDelta(content=text), finish_reason=None)],
            )

    # Closing chunk carrying only the finish reason.
    yield ChatCompletionChunk(
        id=response_id,
        model=model,
        choices=[StreamChoice(index=0, delta=ChatChoiceDelta(), finish_reason="stop")],
    )
|
||||
Reference in New Issue
Block a user