// breakpilot-compliance/admin-compliance/app/api/sdk/drafting-engine/chat/route.ts

/**
 * Drafting Engine Chat API
 *
 * Verbindet das DraftingEngineWidget mit dem LLM Backend.
 * Unterstuetzt alle 4 Modi: explain, ask, draft, validate.
 * Nutzt State-Projection fuer token-effiziente Kontextgabe.
 */

import { NextRequest, NextResponse } from 'next/server'
import { queryRAG } from '@/lib/sdk/drafting-engine/rag-query'
import { readSoulFile } from '@/lib/sdk/agents/soul-reader'

// Base URL of the Ollama server. A non-empty OLLAMA_URL environment variable
// takes precedence over the Docker-host default.
const OLLAMA_URL = process.env.OLLAMA_URL
  ? process.env.OLLAMA_URL
  : 'http://host.docker.internal:11434'

// Model name sent to Ollama. A non-empty COMPLIANCE_LLM_MODEL environment
// variable takes precedence over the built-in default.
const LLM_MODEL = process.env.COMPLIANCE_LLM_MODEL
  ? process.env.COMPLIANCE_LLM_MODEL
  : 'qwen2.5vl:32b'

// System prompt used as a substitute whenever reading the drafting agent's
// .soul.md file yields nothing. The text is sent to the model verbatim.
const FALLBACK_DRAFTING_PROMPT = `# Drafting Agent - Compliance-Dokumententwurf

## Identitaet
Du bist der BreakPilot Drafting Agent. Du hilfst Nutzern des AI Compliance SDK,
DSGVO-konforme Compliance-Dokumente zu entwerfen und Konsistenz sicherzustellen.

## Strikte Constraints
- Gib praxisnahe Hinweise, KEINE konkrete Rechtsberatung
- Kommuniziere auf Deutsch, sachlich und verstaendlich
- Fuelle fehlende Informationen mit [PLATZHALTER: ...] Markierung`

/** Chat modes the drafting engine understands. */
const DRAFTING_MODES = ['explain', 'ask', 'draft', 'validate'] as const
type DraftingMode = (typeof DRAFTING_MODES)[number]

/** Shape of a single entry in the `messages` array sent to the LLM. */
type ChatMessage = { role: 'system' | 'user' | 'assistant'; content: string }

/** True for non-null, non-array objects (i.e. something we can read named fields from). */
function isRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === 'object' && value !== null && !Array.isArray(value)
}

/**
 * Maps arbitrary client input onto a supported mode.
 *
 * Anything that is not exactly one of the known mode names falls back to
 * `'explain'`. This keeps inherited object keys such as `"constructor"` and
 * free-form text out of the system prompt.
 */
function resolveDraftingMode(value: unknown): DraftingMode {
  return DRAFTING_MODES.find((candidate) => candidate === value) ?? 'explain'
}

/**
 * Converts the client-supplied chat history into LLM messages.
 *
 * Only the last `limit` entries are considered. Entries that are not objects
 * with a string `content` are dropped; every role other than `'user'` is
 * mapped to `'assistant'`. A non-array history yields an empty list.
 */
function normalizeHistory(history: unknown, limit = 10): ChatMessage[] {
  if (!Array.isArray(history)) return []

  const normalized: ChatMessage[] = []
  for (const entry of history.slice(-limit)) {
    if (!isRecord(entry) || typeof entry.content !== 'string') continue
    normalized.push({
      role: entry.role === 'user' ? 'user' : 'assistant',
      content: entry.content,
    })
  }
  return normalized
}

/**
 * Creates an incremental parser for a newline-delimited JSON chat stream.
 *
 * Network chunks do not respect line boundaries, so the trailing, possibly
 * incomplete line of each chunk is kept and prepended to the next one.
 * `push` returns the non-empty `message.content` strings of all lines that
 * were completed by the given text; `flush` parses whatever is left once the
 * stream has ended.
 */
function createNdjsonContentParser(): {
  push(text: string): string[]
  flush(): string[]
} {
  let pending = ''

  const contentOf = (line: string): string | undefined => {
    const trimmed = line.trim()
    if (!trimmed) return undefined

    let parsed: unknown
    try {
      parsed = JSON.parse(trimmed)
    } catch {
      // Lines are complete at this point, so this is a malformed record and
      // not merely a chunk boundary.
      console.warn('Skipping malformed LLM stream line')
      return undefined
    }

    if (!isRecord(parsed) || !isRecord(parsed.message)) return undefined
    const content = parsed.message.content
    return typeof content === 'string' && content.length > 0 ? content : undefined
  }

  const collect = (lines: string[]): string[] => {
    const contents: string[] = []
    for (const line of lines) {
      const content = contentOf(line)
      if (content !== undefined) contents.push(content)
    }
    return contents
  }

  return {
    push(text: string): string[] {
      pending += text
      const lines = pending.split('\n')
      // The last element is the (possibly empty) unfinished line.
      pending = lines.pop() ?? ''
      return collect(lines)
    },
    flush(): string[] {
      const rest = pending
      pending = ''
      return collect([rest])
    },
  }
}

/**
 * Chat endpoint of the drafting engine.
 *
 * Expects a JSON object with `message` (required, non-empty string) and the
 * optional fields `history`, `sdkStateProjection`, `mode` and `documentType`.
 * Builds a system prompt from the soul file (or the fallback prompt), the
 * mode-specific instructions, the state projection and the RAG result, sends
 * the conversation to the Ollama `/api/chat` endpoint and streams the
 * generated text back as `text/plain`.
 *
 * Responses:
 * - 400 when the body is not valid JSON or `message` is missing/not a string
 * - 502 when the LLM answers with a non-OK status or without a body
 * - 503 when anything else throws before streaming starts
 */
export async function POST(request: NextRequest) {
  try {
    let body: unknown
    try {
      body = await request.json()
    } catch {
      // A malformed body is a client error, not an LLM connectivity problem.
      return NextResponse.json({ error: 'Invalid JSON body' }, { status: 400 })
    }

    const payload = isRecord(body) ? body : {}
    const { message, history, sdkStateProjection, documentType } = payload
    const mode = resolveDraftingMode(payload.mode)

    if (typeof message !== 'string' || !message) {
      return NextResponse.json({ error: 'Message is required' }, { status: 400 })
    }

    // 1. Query RAG for legal context
    const ragContext = await queryRAG(message)

    // 2. Build system prompt with mode-specific instructions + state projection
    const soulPrompt = await readSoulFile('drafting-agent')
    let systemContent = soulPrompt || FALLBACK_DRAFTING_PROMPT

    const documentTypeLabel =
      typeof documentType === 'string' && documentType ? documentType : 'nicht spezifiziert'

    // Mode-specific instructions
    const modeInstructions: Record<DraftingMode, string> = {
      explain: '\n\n## Aktueller Modus: EXPLAIN\nBeantworte Fragen verstaendlich mit Quellenangaben.',
      ask: '\n\n## Aktueller Modus: ASK\nAnalysiere Luecken und stelle gezielte Fragen. Eine Frage pro Antwort.',
      draft: `\n\n## Aktueller Modus: DRAFT\nEntwirf strukturierte Dokument-Sections. Dokumenttyp: ${documentTypeLabel}.\nAntworte mit JSON wenn ein Draft angefragt wird.`,
      validate: '\n\n## Aktueller Modus: VALIDATE\nPruefe Cross-Dokument-Konsistenz. Gib Errors, Warnings und Suggestions zurueck.',
    }
    systemContent += modeInstructions[mode]

    // Add state projection context, capped at 3000 characters of JSON
    if (sdkStateProjection) {
      systemContent += `\n\n## SDK-State Projektion (${mode}-Kontext)\n${JSON.stringify(sdkStateProjection, null, 0).slice(0, 3000)}`
    }

    // Add RAG context
    if (ragContext) {
      systemContent += `\n\n## Relevanter Rechtskontext\n${ragContext}`
    }

    // 3. Build messages array: system prompt, recent history, current message
    const messages: ChatMessage[] = [
      { role: 'system', content: systemContent },
      ...normalizeHistory(history),
      { role: 'user', content: message },
    ]

    // 4. Call LLM with streaming
    const ollamaResponse = await fetch(`${OLLAMA_URL}/api/chat`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({
        model: LLM_MODEL,
        messages,
        stream: true,
        options: {
          temperature: mode === 'draft' ? 0.2 : 0.3,
          num_predict: mode === 'draft' ? 16384 : 8192,
        },
      }),
      // NOTE(review): presumably this 120 s limit also aborts an in-flight
      // body read - confirm that long drafts cannot be cut off by it.
      signal: AbortSignal.timeout(120000),
    })

    if (!ollamaResponse.ok) {
      const errorText = await ollamaResponse.text()
      console.error('LLM error:', ollamaResponse.status, errorText)
      return NextResponse.json(
        { error: `LLM nicht erreichbar (Status ${ollamaResponse.status})` },
        { status: 502 }
      )
    }

    const upstreamBody = ollamaResponse.body
    if (!upstreamBody) {
      console.error('LLM error: response has no body')
      return NextResponse.json({ error: 'LLM lieferte keine Antwort.' }, { status: 502 })
    }

    // 5. Stream response back
    const encoder = new TextEncoder()
    const reader = upstreamBody.getReader()
    let cancelled = false

    const stream = new ReadableStream<Uint8Array>({
      async start(controller) {
        const decoder = new TextDecoder()
        const parser = createNdjsonContentParser()
        const emit = (contents: string[]): void => {
          for (const content of contents) {
            controller.enqueue(encoder.encode(content))
          }
        }

        try {
          while (true) {
            const { done, value } = await reader.read()
            if (done || cancelled) break
            emit(parser.push(decoder.decode(value, { stream: true })))
          }

          if (!cancelled) {
            // Flush bytes still held by the decoder and the last unterminated line.
            emit(parser.push(decoder.decode()))
            emit(parser.flush())
          }
        } catch (error) {
          // Best effort: the client keeps whatever was streamed so far.
          console.error('Stream error:', error)
        } finally {
          try {
            controller.close()
          } catch {
            // The stream was already closed or cancelled by the consumer.
          }
        }
      },
      // Stop reading from the LLM when the client goes away.
      cancel(reason) {
        cancelled = true
        return reader.cancel(reason)
      },
    })

    return new NextResponse(stream, {
      headers: {
        'Content-Type': 'text/plain; charset=utf-8',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
      },
    })
  } catch (error) {
    console.error('Drafting engine chat error:', error)
    return NextResponse.json(
      { error: 'Verbindung zum LLM fehlgeschlagen.' },
      { status: 503 }
    )
  }
}