/**
 * Shared LLM cascade for the drafting engine.
 *
 * Order: OVH/LiteLLM (gpt-oss-120b) first — the production chat LLM; Ollama as
 * the dev fallback. On prod, OLLAMA_URL is embedding-only (no chat model), so
 * OVH is the only working path there — same as the compliance advisor
 * (see lib/sdk/agents/advisor-llm). The backend already uses OVH + JSON output
 * successfully on prod (extract-datasheet); the same technique is mirrored here.
 */

const OLLAMA_URL = process.env.OLLAMA_URL || 'http://host.docker.internal:11434'
const OLLAMA_MODEL = process.env.COMPLIANCE_LLM_MODEL || 'qwen2.5vl:32b'
// Trailing slashes are stripped so that `${OVH_URL}/v1/...` never contains `//`.
const OVH_URL = (process.env.OVH_LLM_URL || '').replace(/\/+$/, '')
const OVH_MODEL = process.env.OVH_LLM_MODEL || ''
const OVH_KEY = process.env.OVH_LLM_KEY || ''

// Defaults shared by both providers.
const DEFAULT_TIMEOUT_MS = 120000
const DEFAULT_MAX_TOKENS = 4096
const DEFAULT_COMPLETE_TEMPERATURE = 0.15
const DEFAULT_STREAM_TEMPERATURE = 0.3
const OLLAMA_NUM_CTX = 8192

/** A single chat turn as sent to either provider. */
export interface ChatMessage {
  role: string
  content: string
}

/** Per-call options; every field falls back to a module default when omitted. */
export interface CascadeOpts {
  /** Request JSON output. Only applied by the non-streaming calls. */
  json?: boolean
  /** Upper bound on generated tokens (default 4096). */
  maxTokens?: number
  /** Sampling temperature (default 0.15 non-streaming, 0.3 streaming). */
  temperature?: number
  /** Timeout of the HTTP request in milliseconds (default 120000). */
  timeoutMs?: number
}

/** Result of a non-streaming call. */
export interface CascadeResult {
  content: string
  tokensUsed: number
  provider: 'ovh' | 'ollama'
}

// --- Small helpers ---

/** Returns `v` when it is a non-empty string, otherwise null. */
function nonEmptyString(v: unknown): string | null {
  return typeof v === 'string' && v.length > 0 ? v : null
}

/** Returns `v` when it is a finite number, otherwise 0. */
function toCount(v: unknown): number {
  return typeof v === 'number' && Number.isFinite(v) ? v : 0
}

/** Request headers for OVH; the bearer token is only sent when a key is configured. */
function ovhHeaders(): Record<string, string> {
  const headers: Record<string, string> = { 'Content-Type': 'application/json' }
  if (OVH_KEY) headers['Authorization'] = `Bearer ${OVH_KEY}`
  return headers
}

/** Cancels an unread response body so the connection is released right away. */
async function discardBody(r: Response): Promise<void> {
  try {
    await r.body?.cancel()
  } catch {
    // Body already consumed or locked — nothing left to release.
  }
}

/**
 * POSTs `body` as JSON and returns the response only when the status is 2xx.
 * Non-2xx responses are drained and reported as null. Network errors and
 * timeouts reject; callers catch them.
 */
async function postJson(
  url: string,
  headers: Record<string, string>,
  body: unknown,
  timeoutMs: number | undefined,
): Promise<Response | null> {
  const r = await fetch(url, {
    method: 'POST',
    headers,
    body: JSON.stringify(body),
    signal: AbortSignal.timeout(timeoutMs ?? DEFAULT_TIMEOUT_MS),
  })
  if (r.ok) return r
  await discardBody(r)
  return null
}

// --- Stream parsers (pure, testable) ---

/**
 * Extracts the text delta from one Ollama NDJSON line (`message.content`).
 *
 * @returns the delta, or null for blank lines, invalid JSON, and lines whose
 *          content is missing, empty, or not a string.
 */
export function parseOllamaLine(line: string): string | null {
  const t = line.trim()
  if (!t) return null
  try {
    return nonEmptyString(JSON.parse(t)?.message?.content)
  } catch {
    return null
  }
}

/**
 * Extracts the text delta from one OpenAI/OVH SSE line
 * (`choices[0].delta.content`).
 *
 * @returns the delta, or null for non-`data:` lines, the `[DONE]` sentinel,
 *          invalid JSON, and chunks whose content is missing, empty, or not a string.
 */
export function parseSSELine(line: string): string | null {
  const t = line.trim()
  if (!t.startsWith('data:')) return null
  const payload = t.slice(5).trim()
  if (!payload || payload === '[DONE]') return null
  try {
    return nonEmptyString(JSON.parse(payload)?.choices?.[0]?.delta?.content)
  } catch {
    return null
  }
}

// --- Non-streaming (cascadeComplete) ---

/**
 * Single non-streaming completion against OVH (OpenAI-compatible endpoint).
 * Returns null when OVH is not configured, the request fails, or the reply
 * carries no text.
 */
async function ovhComplete(messages: ChatMessage[], o: CascadeOpts): Promise<CascadeResult | null> {
  if (!OVH_URL || !OVH_MODEL) return null
  try {
    const payload: Record<string, unknown> = {
      model: OVH_MODEL,
      messages,
      stream: false,
      temperature: o.temperature ?? DEFAULT_COMPLETE_TEMPERATURE,
      max_tokens: o.maxTokens ?? DEFAULT_MAX_TOKENS,
    }
    if (o.json) payload.response_format = { type: 'json_object' }
    const r = await postJson(`${OVH_URL}/v1/chat/completions`, ovhHeaders(), payload, o.timeoutMs)
    if (!r) return null
    const d = await r.json()
    const content = nonEmptyString(d?.choices?.[0]?.message?.content)
    if (!content) return null
    // Prefer the completion token count; fall back to the total when it is absent.
    const tokens = d?.usage?.completion_tokens ?? d?.usage?.total_tokens
    return { content, tokensUsed: toCount(tokens), provider: 'ovh' }
  } catch {
    // Best effort: any failure means "this provider is unavailable" so the cascade moves on.
    return null
  }
}

/**
 * Single non-streaming completion against Ollama (`/api/chat`).
 * Returns null when the request fails or the reply carries no text.
 */
async function ollamaComplete(messages: ChatMessage[], o: CascadeOpts): Promise<CascadeResult | null> {
  try {
    const body: Record<string, unknown> = {
      model: OLLAMA_MODEL,
      messages,
      stream: false,
      think: false,
      options: {
        temperature: o.temperature ?? DEFAULT_COMPLETE_TEMPERATURE,
        num_predict: o.maxTokens ?? DEFAULT_MAX_TOKENS,
        num_ctx: OLLAMA_NUM_CTX,
      },
    }
    if (o.json) body.format = 'json'
    const r = await postJson(`${OLLAMA_URL}/api/chat`, { 'Content-Type': 'application/json' }, body, o.timeoutMs)
    if (!r) return null
    const d = await r.json()
    const content = nonEmptyString(d?.message?.content)
    if (!content) return null
    return { content, tokensUsed: toCount(d?.eval_count), provider: 'ollama' }
  } catch {
    // Best effort: any failure means "this provider is unavailable".
    return null
  }
}

/**
 * Non-streaming LLM call with cascade: OVH first, Ollama second.
 *
 * @param messages chat history to send.
 * @param opts     optional overrides (JSON mode, token limit, temperature, timeout).
 * @returns content, token count and provider, or null when neither OVH nor
 *          Ollama produced an answer.
 */
export async function cascadeComplete(
  messages: ChatMessage[],
  opts: CascadeOpts = {},
): Promise<CascadeResult | null> {
  return (await ovhComplete(messages, opts)) ?? (await ollamaComplete(messages, opts))
}

// --- Streaming (cascadeStream) ---

const encoder = new TextEncoder()

/**
 * Turns a line-oriented upstream response (SSE or NDJSON) into a plain-text
 * byte stream containing only the text deltas extracted by `parseLine`.
 *
 * - Reads upstream on demand (`pull`), so a slow consumer applies backpressure.
 * - Cancelling the returned stream cancels the upstream body.
 * - An upstream failure (e.g. the request timeout firing mid-stream) ends the
 *   stream cleanly after the text delivered so far; it is not surfaced as an error.
 */
function textStream(
  upstream: Response,
  parseLine: (line: string) => string | null,
): ReadableStream<Uint8Array> {
  const body = upstream.body
  if (!body) {
    // Nothing to read: hand back an already-finished stream.
    return new ReadableStream<Uint8Array>({
      start(controller) {
        controller.close()
      },
    })
  }

  const reader = body.getReader()
  const decoder = new TextDecoder()
  let buf = ''

  // close() throws when the stream was already closed or cancelled by the consumer.
  const closeQuietly = (controller: ReadableStreamDefaultController<Uint8Array>): void => {
    try {
      controller.close()
    } catch {
      // Already closed — nothing to do.
    }
  }

  return new ReadableStream<Uint8Array>({
    async pull(controller) {
      try {
        // Keep reading until at least one delta was emitted: a pull that resolves
        // without enqueuing anything would leave a pending read stalled.
        for (;;) {
          const { done, value } = await reader.read()
          if (done) {
            buf += decoder.decode() // flush bytes still held by the decoder
            const tail = parseLine(buf) // last line may lack a trailing newline
            buf = ''
            if (tail) controller.enqueue(encoder.encode(tail))
            closeQuietly(controller)
            return
          }
          buf += decoder.decode(value, { stream: true })
          const lines = buf.split('\n')
          buf = lines.pop() ?? '' // keep the incomplete last line for the next chunk
          let emitted = false
          for (const line of lines) {
            const delta = parseLine(line)
            if (delta) {
              controller.enqueue(encoder.encode(delta))
              emitted = true
            }
          }
          if (emitted) return
        }
      } catch {
        // Upstream failed or the consumer cancelled while a read was pending.
        closeQuietly(controller)
      }
    },
    cancel(reason) {
      // Consumer went away: stop the upstream transfer as well.
      return reader.cancel(reason)
    },
  })
}

/**
 * Opens a streaming completion against OVH.
 * Returns the raw response, or null when OVH is not configured or unreachable.
 * The timeout signal stays attached to the response body while it is read.
 */
async function ovhStream(messages: ChatMessage[], o: CascadeOpts): Promise<Response | null> {
  if (!OVH_URL || !OVH_MODEL) return null
  try {
    const r = await postJson(
      `${OVH_URL}/v1/chat/completions`,
      ovhHeaders(),
      {
        model: OVH_MODEL,
        messages,
        stream: true,
        temperature: o.temperature ?? DEFAULT_STREAM_TEMPERATURE,
        max_tokens: o.maxTokens ?? DEFAULT_MAX_TOKENS,
      },
      o.timeoutMs,
    )
    return r && r.body ? r : null
  } catch {
    return null
  }
}

/**
 * Opens a streaming completion against Ollama.
 * Returns the raw response, or null when Ollama is unreachable.
 */
async function ollamaStream(messages: ChatMessage[], o: CascadeOpts): Promise<Response | null> {
  try {
    const r = await postJson(
      `${OLLAMA_URL}/api/chat`,
      { 'Content-Type': 'application/json' },
      {
        model: OLLAMA_MODEL,
        messages,
        stream: true,
        think: false,
        options: {
          temperature: o.temperature ?? DEFAULT_STREAM_TEMPERATURE,
          num_predict: o.maxTokens ?? DEFAULT_MAX_TOKENS,
          num_ctx: OLLAMA_NUM_CTX,
        },
      },
      o.timeoutMs,
    )
    return r && r.body ? r : null
  } catch {
    return null
  }
}

/**
 * Streaming LLM call with cascade -> plain-text byte stream (UTF-8).
 *
 * @param messages chat history to send.
 * @param opts     optional overrides (token limit, temperature, timeout).
 * @returns a stream of text deltas, or null when no LLM is reachable.
 */
export async function cascadeStream(
  messages: ChatMessage[],
  opts: CascadeOpts = {},
): Promise<ReadableStream<Uint8Array> | null> {
  const ovh = await ovhStream(messages, opts)
  if (ovh) return textStream(ovh, parseSSELine)
  const ollama = await ollamaStream(messages, opts)
  if (ollama) return textStream(ollama, parseOllamaLine)
  return null
}