feat(presenter): replace Web Speech API with Piper TTS for high-quality voice
- New API route /api/presenter/tts proxies to compliance-tts-service - usePresenterMode now uses Audio element with Piper-generated MP3 - Client-side audio caching (text hash → blob URL) avoids re-synthesis - Graceful fallback to word-count timer if TTS service unavailable - Add TTS_SERVICE_URL env var to pitch-deck Docker config Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -836,6 +836,7 @@ services:
|
||||
LITELLM_URL: ${LITELLM_URL:-https://llm-dev.meghsakha.com}
|
||||
LITELLM_MODEL: ${LITELLM_MODEL:-gpt-oss-120b}
|
||||
LITELLM_API_KEY: ${LITELLM_API_KEY:-sk-0nAyxaMVbIqmz_ntnndzag} # SECURITY(review): hardcoded API-key fallback committed to the repo — rotate this key and drop the default
|
||||
TTS_SERVICE_URL: http://bp-compliance-tts:8095
|
||||
extra_hosts:
|
||||
- "host.docker.internal:host-gateway"
|
||||
depends_on:
|
||||
|
||||
46
pitch-deck/app/api/presenter/tts/route.ts
Normal file
46
pitch-deck/app/api/presenter/tts/route.ts
Normal file
@@ -0,0 +1,46 @@
|
||||
import { NextRequest, NextResponse } from 'next/server'
|
||||
|
||||
const TTS_SERVICE_URL = process.env.TTS_SERVICE_URL || 'http://compliance-tts-service:8095'
|
||||
|
||||
export async function POST(request: NextRequest) {
|
||||
try {
|
||||
const body = await request.json()
|
||||
const { text, language = 'de' } = body
|
||||
|
||||
if (!text || typeof text !== 'string') {
|
||||
return NextResponse.json({ error: 'Text is required' }, { status: 400 })
|
||||
}
|
||||
|
||||
const res = await fetch(`${TTS_SERVICE_URL}/synthesize-direct`, {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({ text, language }),
|
||||
signal: AbortSignal.timeout(30000),
|
||||
})
|
||||
|
||||
if (!res.ok) {
|
||||
const errorText = await res.text()
|
||||
console.error('TTS service error:', res.status, errorText)
|
||||
return NextResponse.json(
|
||||
{ error: `TTS service error (${res.status})` },
|
||||
{ status: 502 }
|
||||
)
|
||||
}
|
||||
|
||||
const audioBuffer = await res.arrayBuffer()
|
||||
|
||||
return new NextResponse(audioBuffer, {
|
||||
headers: {
|
||||
'Content-Type': 'audio/mpeg',
|
||||
'Cache-Control': 'public, max-age=86400', // Cache 24h — texts are static
|
||||
'X-TTS-Cache': res.headers.get('X-TTS-Cache') || 'unknown',
|
||||
},
|
||||
})
|
||||
} catch (error) {
|
||||
console.error('TTS proxy error:', error)
|
||||
return NextResponse.json(
|
||||
{ error: 'TTS service not reachable' },
|
||||
{ status: 503 }
|
||||
)
|
||||
}
|
||||
}
|
||||
@@ -11,7 +11,6 @@ interface UsePresenterModeConfig {
|
||||
currentSlide: number
|
||||
totalSlides: number
|
||||
language: Language
|
||||
speechRate?: number // 0.5–2.0, default 1.0
|
||||
ttsEnabled?: boolean // default true
|
||||
}
|
||||
|
||||
@@ -32,12 +31,21 @@ interface UsePresenterModeReturn {
|
||||
setTtsEnabled: (enabled: boolean) => void
|
||||
}
|
||||
|
||||
// Client-side audio cache: text hash → blob URL
|
||||
const audioCache = new Map<string, string>()
|
||||
|
||||
async function hashText(text: string): Promise<string> {
|
||||
const encoder = new TextEncoder()
|
||||
const data = encoder.encode(text)
|
||||
const hash = await crypto.subtle.digest('SHA-256', data)
|
||||
return Array.from(new Uint8Array(hash)).slice(0, 8).map(b => b.toString(16).padStart(2, '0')).join('')
|
||||
}
|
||||
|
||||
export function usePresenterMode({
|
||||
goToSlide,
|
||||
currentSlide,
|
||||
totalSlides,
|
||||
language,
|
||||
speechRate = 1.0,
|
||||
ttsEnabled: initialTtsEnabled = true,
|
||||
}: UsePresenterModeConfig): UsePresenterModeReturn {
|
||||
const [state, setState] = useState<PresenterState>('idle')
|
||||
@@ -50,45 +58,42 @@ export function usePresenterMode({
|
||||
const slideIndexRef = useRef(currentSlide)
|
||||
const paragraphIndexRef = useRef(0)
|
||||
const stateRef = useRef<PresenterState>('idle')
|
||||
const utteranceRef = useRef<SpeechSynthesisUtterance | null>(null)
|
||||
const voicesRef = useRef<SpeechSynthesisVoice[]>([])
|
||||
const audioRef = useRef<HTMLAudioElement | null>(null)
|
||||
const abortRef = useRef<AbortController | null>(null)
|
||||
|
||||
// Refs for recursive functions to avoid circular useCallback dependencies
|
||||
const advanceRef = useRef<() => void>(() => {})
|
||||
const speakAndAdvanceRef = useRef<(text: string, pauseAfter: number, onDone: () => void) => void>(() => {})
|
||||
|
||||
// Initialize Web Speech API voices
|
||||
// Check TTS service availability on mount
|
||||
useEffect(() => {
|
||||
if (typeof window === 'undefined' || !window.speechSynthesis) return
|
||||
setTtsAvailable(true)
|
||||
|
||||
const loadVoices = () => {
|
||||
voicesRef.current = window.speechSynthesis.getVoices()
|
||||
}
|
||||
loadVoices()
|
||||
window.speechSynthesis.addEventListener('voiceschanged', loadVoices)
|
||||
return () => {
|
||||
window.speechSynthesis.removeEventListener('voiceschanged', loadVoices)
|
||||
}
|
||||
}, [])
|
||||
|
||||
const getVoice = useCallback((lang: Language): SpeechSynthesisVoice | null => {
|
||||
const voices = voicesRef.current
|
||||
if (!voices.length) return null
|
||||
const langCode = lang === 'de' ? 'de' : 'en'
|
||||
// Prefer high-quality voices
|
||||
const premium = voices.find(v =>
|
||||
v.lang.startsWith(langCode) && /premium|enhanced|neural|google|microsoft/i.test(v.name)
|
||||
)
|
||||
if (premium) return premium
|
||||
return voices.find(v => v.lang.startsWith(langCode)) || null
|
||||
fetch('/api/presenter/tts', {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({ text: 'Test', language: 'de' }),
|
||||
signal: AbortSignal.timeout(5000),
|
||||
})
|
||||
.then(res => {
|
||||
setTtsAvailable(res.ok)
|
||||
if (res.ok) console.log('Piper TTS available')
|
||||
else console.warn('Piper TTS not available:', res.status)
|
||||
})
|
||||
.catch(() => {
|
||||
setTtsAvailable(false)
|
||||
console.warn('Piper TTS service not reachable')
|
||||
})
|
||||
}, [])
|
||||
|
||||
const cancelSpeech = useCallback(() => {
|
||||
if (typeof window !== 'undefined' && window.speechSynthesis) {
|
||||
window.speechSynthesis.cancel()
|
||||
if (audioRef.current) {
|
||||
audioRef.current.pause()
|
||||
audioRef.current.currentTime = 0
|
||||
audioRef.current = null
|
||||
}
|
||||
if (abortRef.current) {
|
||||
abortRef.current.abort()
|
||||
abortRef.current = null
|
||||
}
|
||||
utteranceRef.current = null
|
||||
setIsSpeaking(false)
|
||||
}, [])
|
||||
|
||||
@@ -123,48 +128,79 @@ export function usePresenterMode({
|
||||
// Update speakAndAdvance ref whenever dependencies change
|
||||
useEffect(() => {
|
||||
speakAndAdvanceRef.current = (text: string, pauseAfter: number, onDone: () => void) => {
|
||||
const canSpeak = ttsAvailable && ttsEnabled && typeof window !== 'undefined'
|
||||
|
||||
if (canSpeak) {
|
||||
// Chrome bug: speechSynthesis can get stuck
|
||||
window.speechSynthesis.cancel()
|
||||
|
||||
const utterance = new SpeechSynthesisUtterance(text)
|
||||
const voice = getVoice(language)
|
||||
if (voice) utterance.voice = voice
|
||||
utterance.lang = language === 'de' ? 'de-DE' : 'en-US'
|
||||
utterance.rate = speechRate
|
||||
utterance.pitch = 1.0
|
||||
|
||||
const handleEnd = () => {
|
||||
setIsSpeaking(false)
|
||||
utteranceRef.current = null
|
||||
if (pauseAfter > 0) {
|
||||
timerRef.current = setTimeout(onDone, pauseAfter)
|
||||
} else {
|
||||
onDone()
|
||||
}
|
||||
}
|
||||
|
||||
utterance.onstart = () => setIsSpeaking(true)
|
||||
utterance.onend = handleEnd
|
||||
utterance.onerror = (e) => {
|
||||
if (e.error !== 'canceled') console.warn('TTS error:', e.error)
|
||||
setIsSpeaking(false)
|
||||
utteranceRef.current = null
|
||||
handleEnd()
|
||||
}
|
||||
|
||||
utteranceRef.current = utterance
|
||||
window.speechSynthesis.speak(utterance)
|
||||
} else {
|
||||
if (!ttsAvailable || !ttsEnabled) {
|
||||
// No TTS — use word-count-based timer
|
||||
const wordCount = text.split(/\s+/).length
|
||||
const readingTime = Math.max(wordCount * 150, 2000)
|
||||
timerRef.current = setTimeout(onDone, readingTime + pauseAfter)
|
||||
return
|
||||
}
|
||||
|
||||
// Piper TTS via API
|
||||
setIsSpeaking(true)
|
||||
const controller = new AbortController()
|
||||
abortRef.current = controller
|
||||
|
||||
const playAudio = async () => {
|
||||
try {
|
||||
const key = await hashText(text + language)
|
||||
let blobUrl = audioCache.get(key)
|
||||
|
||||
if (!blobUrl) {
|
||||
const res = await fetch('/api/presenter/tts', {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({ text, language }),
|
||||
signal: controller.signal,
|
||||
})
|
||||
|
||||
if (!res.ok) throw new Error(`TTS error: ${res.status}`)
|
||||
|
||||
const blob = await res.blob()
|
||||
blobUrl = URL.createObjectURL(blob)
|
||||
audioCache.set(key, blobUrl)
|
||||
}
|
||||
|
||||
if (controller.signal.aborted) return
|
||||
|
||||
const audio = new Audio(blobUrl)
|
||||
audioRef.current = audio
|
||||
|
||||
audio.onended = () => {
|
||||
setIsSpeaking(false)
|
||||
audioRef.current = null
|
||||
if (pauseAfter > 0) {
|
||||
timerRef.current = setTimeout(onDone, pauseAfter)
|
||||
} else {
|
||||
onDone()
|
||||
}
|
||||
}
|
||||
|
||||
audio.onerror = () => {
|
||||
console.warn('Audio playback error')
|
||||
setIsSpeaking(false)
|
||||
audioRef.current = null
|
||||
// Fallback to timer
|
||||
const wordCount = text.split(/\s+/).length
|
||||
const readingTime = Math.max(wordCount * 150, 2000)
|
||||
timerRef.current = setTimeout(onDone, readingTime + pauseAfter)
|
||||
}
|
||||
|
||||
await audio.play()
|
||||
} catch (err: any) {
|
||||
if (err.name === 'AbortError') return
|
||||
console.warn('TTS fetch error:', err)
|
||||
setIsSpeaking(false)
|
||||
// Fallback to timer
|
||||
const wordCount = text.split(/\s+/).length
|
||||
const readingTime = Math.max(wordCount * 150, 2000)
|
||||
timerRef.current = setTimeout(onDone, readingTime + pauseAfter)
|
||||
}
|
||||
}
|
||||
|
||||
playAudio()
|
||||
}
|
||||
}, [ttsAvailable, ttsEnabled, language, speechRate, getVoice])
|
||||
}, [ttsAvailable, ttsEnabled, language])
|
||||
|
||||
// Update advancePresentation ref whenever dependencies change
|
||||
useEffect(() => {
|
||||
@@ -342,18 +378,6 @@ export function usePresenterMode({
|
||||
}
|
||||
}, [clearTimer, cancelSpeech])
|
||||
|
||||
// Chrome workaround: speechSynthesis pauses after ~15s without interaction
|
||||
useEffect(() => {
|
||||
if (state !== 'presenting' || !ttsEnabled || !ttsAvailable) return
|
||||
const keepAlive = setInterval(() => {
|
||||
if (typeof window !== 'undefined' && window.speechSynthesis?.speaking) {
|
||||
window.speechSynthesis.pause()
|
||||
window.speechSynthesis.resume()
|
||||
}
|
||||
}, 10000)
|
||||
return () => clearInterval(keepAlive)
|
||||
}, [state, ttsEnabled, ttsAvailable])
|
||||
|
||||
return {
|
||||
state,
|
||||
currentParagraph,
|
||||
|
||||
Reference in New Issue
Block a user