feat(presenter): replace Web Speech API with Piper TTS for high-quality voice
- New API route /api/presenter/tts proxies to compliance-tts-service
- usePresenterMode now uses Audio element with Piper-generated MP3
- Client-side audio caching (text hash → blob URL) avoids re-synthesis
- Graceful fallback to word-count timer if TTS service unavailable
- Add TTS_SERVICE_URL env var to pitch-deck Docker config

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -836,6 +836,7 @@ services:
|
|||||||
LITELLM_URL: ${LITELLM_URL:-https://llm-dev.meghsakha.com}
|
LITELLM_URL: ${LITELLM_URL:-https://llm-dev.meghsakha.com}
|
||||||
LITELLM_MODEL: ${LITELLM_MODEL:-gpt-oss-120b}
|
LITELLM_MODEL: ${LITELLM_MODEL:-gpt-oss-120b}
|
||||||
LITELLM_API_KEY: ${LITELLM_API_KEY:-sk-0nAyxaMVbIqmz_ntnndzag}
|
LITELLM_API_KEY: ${LITELLM_API_KEY:-sk-0nAyxaMVbIqmz_ntnndzag}
|
||||||
|
TTS_SERVICE_URL: http://bp-compliance-tts:8095
|
||||||
extra_hosts:
|
extra_hosts:
|
||||||
- "host.docker.internal:host-gateway"
|
- "host.docker.internal:host-gateway"
|
||||||
depends_on:
|
depends_on:
|
||||||
|
|||||||
46
pitch-deck/app/api/presenter/tts/route.ts
Normal file
46
pitch-deck/app/api/presenter/tts/route.ts
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
import { NextRequest, NextResponse } from 'next/server'
|
||||||
|
|
||||||
|
const TTS_SERVICE_URL = process.env.TTS_SERVICE_URL || 'http://compliance-tts-service:8095'
|
||||||
|
|
||||||
|
/**
 * POST /api/presenter/tts — proxies a speech-synthesis request to the TTS service.
 *
 * Expects a JSON body `{ text: string, language?: string }`; `language`
 * defaults to 'de'. The payload is forwarded to
 * `${TTS_SERVICE_URL}/synthesize-direct` with a 30 s timeout.
 *
 * Responses:
 * - 200: the upstream response bytes, always labelled `audio/mpeg`
 * - 400: body is not valid JSON / not an object, `text` is missing or blank,
 *        or `language` is not a string
 * - 502: the TTS service answered with a non-2xx status
 * - 503: the TTS service could not be reached (network error or timeout)
 */
export async function POST(request: NextRequest) {
  // Parse the body outside the upstream try/catch so that a malformed request
  // is reported as a client error rather than "TTS service not reachable".
  let body: unknown
  try {
    body = await request.json()
  } catch {
    return NextResponse.json({ error: 'Invalid JSON body' }, { status: 400 })
  }

  // JSON `null` and primitives are valid JSON but cannot carry a `text` field;
  // destructuring them would throw.
  if (typeof body !== 'object' || body === null) {
    return NextResponse.json({ error: 'Text is required' }, { status: 400 })
  }

  const { text, language = 'de' } = body as { text?: unknown; language?: unknown }

  // Reject missing, non-string and whitespace-only text: there is nothing to synthesize.
  if (typeof text !== 'string' || text.trim() === '') {
    return NextResponse.json({ error: 'Text is required' }, { status: 400 })
  }

  if (typeof language !== 'string') {
    return NextResponse.json({ error: 'Language must be a string' }, { status: 400 })
  }

  try {
    const res = await fetch(`${TTS_SERVICE_URL}/synthesize-direct`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ text, language }),
      signal: AbortSignal.timeout(30000),
    })

    if (!res.ok) {
      // Log the upstream detail server-side only; the client gets just the status code.
      const errorText = await res.text()
      console.error('TTS service error:', res.status, errorText)
      return NextResponse.json(
        { error: `TTS service error (${res.status})` },
        { status: 502 }
      )
    }

    const audioBuffer = await res.arrayBuffer()

    return new NextResponse(audioBuffer, {
      headers: {
        'Content-Type': 'audio/mpeg',
        'Cache-Control': 'public, max-age=86400', // Cache 24h — texts are static
        // Pass through the upstream cache indicator when the service provides one.
        'X-TTS-Cache': res.headers.get('X-TTS-Cache') || 'unknown',
      },
    })
  } catch (error) {
    // Reached on network failures and when the 30 s timeout aborts the request.
    console.error('TTS proxy error:', error)
    return NextResponse.json(
      { error: 'TTS service not reachable' },
      { status: 503 }
    )
  }
}
|
||||||
@@ -11,7 +11,6 @@ interface UsePresenterModeConfig {
|
|||||||
currentSlide: number
|
currentSlide: number
|
||||||
totalSlides: number
|
totalSlides: number
|
||||||
language: Language
|
language: Language
|
||||||
speechRate?: number // 0.5–2.0, default 1.0
|
|
||||||
ttsEnabled?: boolean // default true
|
ttsEnabled?: boolean // default true
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -32,12 +31,21 @@ interface UsePresenterModeReturn {
|
|||||||
setTtsEnabled: (enabled: boolean) => void
|
setTtsEnabled: (enabled: boolean) => void
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Client-side audio cache: text hash → blob URL
|
||||||
|
const audioCache = new Map<string, string>()
|
||||||
|
|
||||||
|
/**
 * Derives a short cache key from a string.
 *
 * The text is UTF-8 encoded and digested with SHA-256; only the first
 * 8 bytes of the digest are kept and rendered as 16 lowercase hex characters.
 *
 * @param text - The string to hash.
 * @returns A 16-character lowercase hexadecimal string.
 */
async function hashText(text: string): Promise<string> {
  const digest = await crypto.subtle.digest('SHA-256', new TextEncoder().encode(text))

  // View over the leading 8 bytes of the digest; the remainder is discarded.
  const leadingBytes = new Uint8Array(digest, 0, 8)

  let hex = ''
  for (const byte of leadingBytes) {
    hex += byte.toString(16).padStart(2, '0')
  }
  return hex
}
|
||||||
|
|
||||||
export function usePresenterMode({
|
export function usePresenterMode({
|
||||||
goToSlide,
|
goToSlide,
|
||||||
currentSlide,
|
currentSlide,
|
||||||
totalSlides,
|
totalSlides,
|
||||||
language,
|
language,
|
||||||
speechRate = 1.0,
|
|
||||||
ttsEnabled: initialTtsEnabled = true,
|
ttsEnabled: initialTtsEnabled = true,
|
||||||
}: UsePresenterModeConfig): UsePresenterModeReturn {
|
}: UsePresenterModeConfig): UsePresenterModeReturn {
|
||||||
const [state, setState] = useState<PresenterState>('idle')
|
const [state, setState] = useState<PresenterState>('idle')
|
||||||
@@ -50,45 +58,42 @@ export function usePresenterMode({
|
|||||||
const slideIndexRef = useRef(currentSlide)
|
const slideIndexRef = useRef(currentSlide)
|
||||||
const paragraphIndexRef = useRef(0)
|
const paragraphIndexRef = useRef(0)
|
||||||
const stateRef = useRef<PresenterState>('idle')
|
const stateRef = useRef<PresenterState>('idle')
|
||||||
const utteranceRef = useRef<SpeechSynthesisUtterance | null>(null)
|
const audioRef = useRef<HTMLAudioElement | null>(null)
|
||||||
const voicesRef = useRef<SpeechSynthesisVoice[]>([])
|
const abortRef = useRef<AbortController | null>(null)
|
||||||
|
|
||||||
// Refs for recursive functions to avoid circular useCallback dependencies
|
// Refs for recursive functions to avoid circular useCallback dependencies
|
||||||
const advanceRef = useRef<() => void>(() => {})
|
const advanceRef = useRef<() => void>(() => {})
|
||||||
const speakAndAdvanceRef = useRef<(text: string, pauseAfter: number, onDone: () => void) => void>(() => {})
|
const speakAndAdvanceRef = useRef<(text: string, pauseAfter: number, onDone: () => void) => void>(() => {})
|
||||||
|
|
||||||
// Initialize Web Speech API voices
|
// Check TTS service availability on mount
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
if (typeof window === 'undefined' || !window.speechSynthesis) return
|
fetch('/api/presenter/tts', {
|
||||||
setTtsAvailable(true)
|
method: 'POST',
|
||||||
|
headers: { 'Content-Type': 'application/json' },
|
||||||
const loadVoices = () => {
|
body: JSON.stringify({ text: 'Test', language: 'de' }),
|
||||||
voicesRef.current = window.speechSynthesis.getVoices()
|
signal: AbortSignal.timeout(5000),
|
||||||
}
|
})
|
||||||
loadVoices()
|
.then(res => {
|
||||||
window.speechSynthesis.addEventListener('voiceschanged', loadVoices)
|
setTtsAvailable(res.ok)
|
||||||
return () => {
|
if (res.ok) console.log('Piper TTS available')
|
||||||
window.speechSynthesis.removeEventListener('voiceschanged', loadVoices)
|
else console.warn('Piper TTS not available:', res.status)
|
||||||
}
|
})
|
||||||
}, [])
|
.catch(() => {
|
||||||
|
setTtsAvailable(false)
|
||||||
const getVoice = useCallback((lang: Language): SpeechSynthesisVoice | null => {
|
console.warn('Piper TTS service not reachable')
|
||||||
const voices = voicesRef.current
|
})
|
||||||
if (!voices.length) return null
|
|
||||||
const langCode = lang === 'de' ? 'de' : 'en'
|
|
||||||
// Prefer high-quality voices
|
|
||||||
const premium = voices.find(v =>
|
|
||||||
v.lang.startsWith(langCode) && /premium|enhanced|neural|google|microsoft/i.test(v.name)
|
|
||||||
)
|
|
||||||
if (premium) return premium
|
|
||||||
return voices.find(v => v.lang.startsWith(langCode)) || null
|
|
||||||
}, [])
|
}, [])
|
||||||
|
|
||||||
const cancelSpeech = useCallback(() => {
|
const cancelSpeech = useCallback(() => {
|
||||||
if (typeof window !== 'undefined' && window.speechSynthesis) {
|
if (audioRef.current) {
|
||||||
window.speechSynthesis.cancel()
|
audioRef.current.pause()
|
||||||
|
audioRef.current.currentTime = 0
|
||||||
|
audioRef.current = null
|
||||||
|
}
|
||||||
|
if (abortRef.current) {
|
||||||
|
abortRef.current.abort()
|
||||||
|
abortRef.current = null
|
||||||
}
|
}
|
||||||
utteranceRef.current = null
|
|
||||||
setIsSpeaking(false)
|
setIsSpeaking(false)
|
||||||
}, [])
|
}, [])
|
||||||
|
|
||||||
@@ -123,48 +128,79 @@ export function usePresenterMode({
|
|||||||
// Update speakAndAdvance ref whenever dependencies change
|
// Update speakAndAdvance ref whenever dependencies change
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
speakAndAdvanceRef.current = (text: string, pauseAfter: number, onDone: () => void) => {
|
speakAndAdvanceRef.current = (text: string, pauseAfter: number, onDone: () => void) => {
|
||||||
const canSpeak = ttsAvailable && ttsEnabled && typeof window !== 'undefined'
|
if (!ttsAvailable || !ttsEnabled) {
|
||||||
|
|
||||||
if (canSpeak) {
|
|
||||||
// Chrome bug: speechSynthesis can get stuck
|
|
||||||
window.speechSynthesis.cancel()
|
|
||||||
|
|
||||||
const utterance = new SpeechSynthesisUtterance(text)
|
|
||||||
const voice = getVoice(language)
|
|
||||||
if (voice) utterance.voice = voice
|
|
||||||
utterance.lang = language === 'de' ? 'de-DE' : 'en-US'
|
|
||||||
utterance.rate = speechRate
|
|
||||||
utterance.pitch = 1.0
|
|
||||||
|
|
||||||
const handleEnd = () => {
|
|
||||||
setIsSpeaking(false)
|
|
||||||
utteranceRef.current = null
|
|
||||||
if (pauseAfter > 0) {
|
|
||||||
timerRef.current = setTimeout(onDone, pauseAfter)
|
|
||||||
} else {
|
|
||||||
onDone()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
utterance.onstart = () => setIsSpeaking(true)
|
|
||||||
utterance.onend = handleEnd
|
|
||||||
utterance.onerror = (e) => {
|
|
||||||
if (e.error !== 'canceled') console.warn('TTS error:', e.error)
|
|
||||||
setIsSpeaking(false)
|
|
||||||
utteranceRef.current = null
|
|
||||||
handleEnd()
|
|
||||||
}
|
|
||||||
|
|
||||||
utteranceRef.current = utterance
|
|
||||||
window.speechSynthesis.speak(utterance)
|
|
||||||
} else {
|
|
||||||
// No TTS — use word-count-based timer
|
// No TTS — use word-count-based timer
|
||||||
const wordCount = text.split(/\s+/).length
|
const wordCount = text.split(/\s+/).length
|
||||||
const readingTime = Math.max(wordCount * 150, 2000)
|
const readingTime = Math.max(wordCount * 150, 2000)
|
||||||
timerRef.current = setTimeout(onDone, readingTime + pauseAfter)
|
timerRef.current = setTimeout(onDone, readingTime + pauseAfter)
|
||||||
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Piper TTS via API
|
||||||
|
setIsSpeaking(true)
|
||||||
|
const controller = new AbortController()
|
||||||
|
abortRef.current = controller
|
||||||
|
|
||||||
|
const playAudio = async () => {
|
||||||
|
try {
|
||||||
|
const key = await hashText(text + language)
|
||||||
|
let blobUrl = audioCache.get(key)
|
||||||
|
|
||||||
|
if (!blobUrl) {
|
||||||
|
const res = await fetch('/api/presenter/tts', {
|
||||||
|
method: 'POST',
|
||||||
|
headers: { 'Content-Type': 'application/json' },
|
||||||
|
body: JSON.stringify({ text, language }),
|
||||||
|
signal: controller.signal,
|
||||||
|
})
|
||||||
|
|
||||||
|
if (!res.ok) throw new Error(`TTS error: ${res.status}`)
|
||||||
|
|
||||||
|
const blob = await res.blob()
|
||||||
|
blobUrl = URL.createObjectURL(blob)
|
||||||
|
audioCache.set(key, blobUrl)
|
||||||
|
}
|
||||||
|
|
||||||
|
if (controller.signal.aborted) return
|
||||||
|
|
||||||
|
const audio = new Audio(blobUrl)
|
||||||
|
audioRef.current = audio
|
||||||
|
|
||||||
|
audio.onended = () => {
|
||||||
|
setIsSpeaking(false)
|
||||||
|
audioRef.current = null
|
||||||
|
if (pauseAfter > 0) {
|
||||||
|
timerRef.current = setTimeout(onDone, pauseAfter)
|
||||||
|
} else {
|
||||||
|
onDone()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
audio.onerror = () => {
|
||||||
|
console.warn('Audio playback error')
|
||||||
|
setIsSpeaking(false)
|
||||||
|
audioRef.current = null
|
||||||
|
// Fallback to timer
|
||||||
|
const wordCount = text.split(/\s+/).length
|
||||||
|
const readingTime = Math.max(wordCount * 150, 2000)
|
||||||
|
timerRef.current = setTimeout(onDone, readingTime + pauseAfter)
|
||||||
|
}
|
||||||
|
|
||||||
|
await audio.play()
|
||||||
|
} catch (err: any) {
|
||||||
|
if (err.name === 'AbortError') return
|
||||||
|
console.warn('TTS fetch error:', err)
|
||||||
|
setIsSpeaking(false)
|
||||||
|
// Fallback to timer
|
||||||
|
const wordCount = text.split(/\s+/).length
|
||||||
|
const readingTime = Math.max(wordCount * 150, 2000)
|
||||||
|
timerRef.current = setTimeout(onDone, readingTime + pauseAfter)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
playAudio()
|
||||||
}
|
}
|
||||||
}, [ttsAvailable, ttsEnabled, language, speechRate, getVoice])
|
}, [ttsAvailable, ttsEnabled, language])
|
||||||
|
|
||||||
// Update advancePresentation ref whenever dependencies change
|
// Update advancePresentation ref whenever dependencies change
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
@@ -342,18 +378,6 @@ export function usePresenterMode({
|
|||||||
}
|
}
|
||||||
}, [clearTimer, cancelSpeech])
|
}, [clearTimer, cancelSpeech])
|
||||||
|
|
||||||
// Chrome workaround: speechSynthesis pauses after ~15s without interaction
|
|
||||||
useEffect(() => {
|
|
||||||
if (state !== 'presenting' || !ttsEnabled || !ttsAvailable) return
|
|
||||||
const keepAlive = setInterval(() => {
|
|
||||||
if (typeof window !== 'undefined' && window.speechSynthesis?.speaking) {
|
|
||||||
window.speechSynthesis.pause()
|
|
||||||
window.speechSynthesis.resume()
|
|
||||||
}
|
|
||||||
}, 10000)
|
|
||||||
return () => clearInterval(keepAlive)
|
|
||||||
}, [state, ttsEnabled, ttsAvailable])
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
state,
|
state,
|
||||||
currentParagraph,
|
currentParagraph,
|
||||||
|
|||||||
Reference in New Issue
Block a user