feat(presenter): continuous speech — no gaps between paragraphs/slides

- Concatenate all paragraphs + transition hint into one TTS call per slide
  → natural prosody, zero gaps within a slide
- Pre-fetch next slide's audio during current playback → seamless transitions
- Advance slide during transition phrase ("Let us look at...")
- Pause/resume without destroying audio → instant continue
- Subtitle display synced to playback position via timeupdate

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-20 17:02:13 +01:00
parent fa4027d027
commit f126b40574

View File

@@ -1,8 +1,8 @@
'use client' 'use client'
import { useState, useCallback, useRef, useEffect } from 'react' import { useState, useCallback, useRef, useEffect } from 'react'
import { Language, SlideId } from '../types' import { Language } from '../types'
import { PresenterState } from '../presenter/types' import { PresenterState, SlideScript } from '../presenter/types'
import { PRESENTER_SCRIPT } from '../presenter/presenter-script' import { PRESENTER_SCRIPT } from '../presenter/presenter-script'
import { SLIDE_ORDER } from './useSlideNavigation' import { SLIDE_ORDER } from './useSlideNavigation'
@@ -11,7 +11,7 @@ interface UsePresenterModeConfig {
currentSlide: number currentSlide: number
totalSlides: number totalSlides: number
language: Language language: Language
ttsEnabled?: boolean // default true ttsEnabled?: boolean
} }
interface UsePresenterModeReturn { interface UsePresenterModeReturn {
@@ -31,11 +31,10 @@ interface UsePresenterModeReturn {
setTtsEnabled: (enabled: boolean) => void setTtsEnabled: (enabled: boolean) => void
} }
// Client-side audio cache: text key → blob URL // Client-side audio cache: text hash → blob URL
const audioCache = new Map<string, string>() const audioCache = new Map<string, string>()
function cacheKey(text: string, lang: string): string { function cacheKey(text: string, lang: string): string {
// Simple string hash — no crypto.subtle needed (works on HTTP too)
let hash = 0 let hash = 0
const str = text + '|' + lang const str = text + '|' + lang
for (let i = 0; i < str.length; i++) { for (let i = 0; i < str.length; i++) {
@@ -44,6 +43,77 @@ function cacheKey(text: string, lang: string): string {
return 'tts_' + (hash >>> 0).toString(36) return 'tts_' + (hash >>> 0).toString(36)
} }
// --- Slide audio plan: concatenates all paragraphs + transition into one text ---
// One subtitle segment within a slide's single concatenated TTS audio clip.
interface AudioSegment {
  text: string // subtitle text shown while this segment plays
  startRatio: number // 0..1 — where in the audio this segment starts
  isTransition: boolean // true for the trailing transition hint (used to advance the slide early)
}
// The full text sent to TTS for one slide, plus the segment map used to
// sync subtitle display (and the slide advance) to playback position.
interface SlideAudioPlan {
  fullText: string // all paragraph texts (+ optional transition hint) joined with spaces
  segments: AudioSegment[] // in playback order; startRatio is non-decreasing
}
/**
 * Builds the audio plan for one slide: every paragraph text plus the optional
 * transition hint, concatenated into a single TTS string, with each segment's
 * start position expressed as a 0..1 ratio of the total word count (used to
 * sync subtitles to audio playback position).
 *
 * @param slideIdx index into SLIDE_ORDER identifying the slide
 * @param lang     selects the de/en variant of each script text
 * @returns the plan, or null when the slide has no script or no usable text
 */
function buildSlideAudioPlan(slideIdx: number, lang: Language): SlideAudioPlan | null {
  const slideId = SLIDE_ORDER[slideIdx]
  const script = PRESENTER_SCRIPT.find(s => s.slideId === slideId)
  if (!script || script.paragraphs.length === 0) return null

  // Accurate word count: ''.split(/\s+/) yields [''] (length 1), and
  // leading/trailing whitespace yields extra empty tokens — both would
  // skew every startRatio. Trim and drop empty tokens instead.
  const countWords = (t: string): number =>
    t.trim().split(/\s+/).filter(Boolean).length

  const segments: AudioSegment[] = []
  const parts: string[] = []
  let totalWords = 0

  for (const para of script.paragraphs) {
    const text = lang === 'de' ? para.text_de : para.text_en
    if (!text || countWords(text) === 0) continue // skip blanks — no audio, no subtitle
    segments.push({ text, startRatio: totalWords, isTransition: false })
    totalWords += countWords(text)
    parts.push(text)
  }

  const hint = lang === 'de'
    ? (script.transition_hint_de || '')
    : (script.transition_hint_en || '')
  if (hint && countWords(hint) > 0) {
    segments.push({ text: hint, startRatio: totalWords, isTransition: true })
    totalWords += countWords(hint)
    parts.push(hint)
  }

  // All texts were empty/whitespace — nothing to synthesize or display.
  if (segments.length === 0 || totalWords === 0) return null

  // startRatio was accumulated as a word offset; normalize to 0..1.
  for (const s of segments) {
    s.startRatio /= totalWords
  }
  return { fullText: parts.join(' '), segments }
}
/**
 * Resolves TTS audio for the given text/language to a blob URL, memoizing
 * the result in the module-level audioCache so repeated requests for the
 * same text return instantly without hitting the network.
 *
 * @param text   full text to synthesize
 * @param lang   language code forwarded to the TTS endpoint
 * @param signal optional AbortSignal to cancel the in-flight request
 * @returns the blob URL, or null on any failure (HTTP error, network error,
 *          or abort) — callers treat null as "proceed without audio"
 */
async function fetchAudio(text: string, lang: string, signal?: AbortSignal): Promise<string | null> {
  const key = cacheKey(text, lang)
  const hit = audioCache.get(key)
  if (hit) return hit

  let response: Response
  try {
    response = await fetch('/api/presenter/tts', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ text, language: lang }),
      signal,
    })
  } catch {
    // Network failure or aborted request — best-effort, report no audio.
    return null
  }
  if (!response.ok) return null

  try {
    const audioBlob = await response.blob()
    const objectUrl = URL.createObjectURL(audioBlob)
    audioCache.set(key, objectUrl)
    return objectUrl
  } catch {
    return null
  }
}
export function usePresenterMode({ export function usePresenterMode({
goToSlide, goToSlide,
currentSlide, currentSlide,
@@ -57,23 +127,22 @@ export function usePresenterMode({
const [isSpeaking, setIsSpeaking] = useState(false) const [isSpeaking, setIsSpeaking] = useState(false)
const [ttsEnabled, setTtsEnabled] = useState(initialTtsEnabled) const [ttsEnabled, setTtsEnabled] = useState(initialTtsEnabled)
const [ttsAvailable, setTtsAvailable] = useState(false) const [ttsAvailable, setTtsAvailable] = useState(false)
const timerRef = useRef<NodeJS.Timeout | null>(null)
const slideIndexRef = useRef(currentSlide)
const paragraphIndexRef = useRef(0)
const stateRef = useRef<PresenterState>('idle') const stateRef = useRef<PresenterState>('idle')
const slideIndexRef = useRef(currentSlide)
const audioRef = useRef<HTMLAudioElement | null>(null) const audioRef = useRef<HTMLAudioElement | null>(null)
const abortRef = useRef<AbortController | null>(null) const abortRef = useRef<AbortController | null>(null)
const audioUnlockedRef = useRef(false) const audioUnlockedRef = useRef(false)
const slideAdvancedRef = useRef(false)
const timerRefs = useRef<NodeJS.Timeout[]>([])
// Refs for recursive functions to avoid circular useCallback dependencies // Ref for playSlide — avoids stale closure in audio callbacks
const advanceRef = useRef<() => void>(() => {}) const playSlideRef = useRef<(slideIdx: number) => void>(() => {})
const speakAndAdvanceRef = useRef<(text: string, pauseAfter: number, onDone: () => void) => void>(() => {})
// Unlock browser audio playback — must be called from a user gesture (click) // Unlock browser audio (must be called from user gesture)
const unlockAudio = useCallback(() => { const unlockAudio = useCallback(() => {
if (audioUnlockedRef.current) return if (audioUnlockedRef.current) return
try { try {
// Create and play a silent audio to unlock the Audio API
const ctx = new (window.AudioContext || (window as any).webkitAudioContext)() const ctx = new (window.AudioContext || (window as any).webkitAudioContext)()
const buffer = ctx.createBuffer(1, 1, 22050) const buffer = ctx.createBuffer(1, 1, 22050)
const source = ctx.createBufferSource() const source = ctx.createBufferSource()
@@ -81,13 +150,10 @@ export function usePresenterMode({
source.connect(ctx.destination) source.connect(ctx.destination)
source.start(0) source.start(0)
audioUnlockedRef.current = true audioUnlockedRef.current = true
console.log('Audio playback unlocked') } catch {}
} catch (e) {
console.warn('Audio unlock failed:', e)
}
}, []) }, [])
// Check TTS service availability on mount // Check TTS availability on mount
useEffect(() => { useEffect(() => {
fetch('/api/presenter/tts', { fetch('/api/presenter/tts', {
method: 'POST', method: 'POST',
@@ -95,27 +161,25 @@ export function usePresenterMode({
body: JSON.stringify({ text: 'Test', language: 'de' }), body: JSON.stringify({ text: 'Test', language: 'de' }),
signal: AbortSignal.timeout(5000), signal: AbortSignal.timeout(5000),
}) })
.then(res => { .then(res => setTtsAvailable(res.ok))
setTtsAvailable(res.ok) .catch(() => setTtsAvailable(false))
if (res.ok) console.log('Piper TTS available')
else console.warn('Piper TTS not available:', res.status)
})
.catch(() => {
setTtsAvailable(false)
console.warn('Piper TTS service not reachable')
})
}, []) }, [])
const cancelSpeech = useCallback(() => { // Cancel all audio and timers
const cancelAll = useCallback(() => {
if (audioRef.current) { if (audioRef.current) {
audioRef.current.pause() audioRef.current.pause()
audioRef.current.currentTime = 0 audioRef.current.onended = null
audioRef.current.ontimeupdate = null
audioRef.current.onerror = null
audioRef.current = null audioRef.current = null
} }
if (abortRef.current) { if (abortRef.current) {
abortRef.current.abort() abortRef.current.abort()
abortRef.current = null abortRef.current = null
} }
for (const t of timerRefs.current) clearTimeout(t)
timerRefs.current = []
setIsSpeaking(false) setIsSpeaking(false)
}, []) }, [])
@@ -123,259 +187,204 @@ export function usePresenterMode({
useEffect(() => { slideIndexRef.current = currentSlide }, [currentSlide]) useEffect(() => { slideIndexRef.current = currentSlide }, [currentSlide])
useEffect(() => { stateRef.current = state }, [state]) useEffect(() => { stateRef.current = state }, [state])
const clearTimer = useCallback(() => { // --- Core: play one slide's audio (all paragraphs concatenated) ---
if (timerRef.current) {
clearTimeout(timerRef.current)
timerRef.current = null
}
}, [])
const getScriptForIndex = useCallback((index: number) => {
const slideId = SLIDE_ORDER[index]
return PRESENTER_SCRIPT.find(s => s.slideId === slideId)
}, [])
const showParagraph = useCallback((slideIdx: number, paraIdx: number) => {
const script = getScriptForIndex(slideIdx)
if (!script || paraIdx >= script.paragraphs.length) return null
const para = script.paragraphs[paraIdx]
const text = language === 'de' ? para.text_de : para.text_en
setDisplayText(text)
setCurrentParagraph(paraIdx)
paragraphIndexRef.current = paraIdx
return para
}, [language, getScriptForIndex])
// Update speakAndAdvance ref whenever dependencies change
useEffect(() => { useEffect(() => {
speakAndAdvanceRef.current = (text: string, pauseAfter: number, onDone: () => void) => { playSlideRef.current = async (slideIdx: number) => {
if (!ttsAvailable || !ttsEnabled) {
// No TTS — use word-count-based timer
const wordCount = text.split(/\s+/).length
const readingTime = Math.max(wordCount * 150, 2000)
timerRef.current = setTimeout(onDone, readingTime + pauseAfter)
return
}
// Piper TTS via API
setIsSpeaking(true)
const controller = new AbortController()
abortRef.current = controller
const playAudio = async () => {
try {
const key = cacheKey(text, language)
let blobUrl = audioCache.get(key)
if (!blobUrl) {
console.log('[TTS] Fetching audio for:', text.slice(0, 50) + '...')
const res = await fetch('/api/presenter/tts', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ text, language }),
signal: controller.signal,
})
if (!res.ok) throw new Error(`TTS error: ${res.status}`)
const blob = await res.blob()
console.log('[TTS] Audio received:', blob.size, 'bytes')
blobUrl = URL.createObjectURL(blob)
audioCache.set(key, blobUrl)
} else {
console.log('[TTS] Cache hit for:', text.slice(0, 50) + '...')
}
if (controller.signal.aborted) return
const audio = new Audio(blobUrl)
audioRef.current = audio
audio.onended = () => {
console.log('[TTS] Audio playback ended')
setIsSpeaking(false)
audioRef.current = null
if (pauseAfter > 0) {
timerRef.current = setTimeout(onDone, pauseAfter)
} else {
onDone()
}
}
audio.onerror = (e) => {
console.warn('[TTS] Audio playback error:', e)
setIsSpeaking(false)
audioRef.current = null
const wordCount = text.split(/\s+/).length
const readingTime = Math.max(wordCount * 150, 2000)
timerRef.current = setTimeout(onDone, readingTime + pauseAfter)
}
await audio.play()
console.log('[TTS] Audio playing')
} catch (err: any) {
if (err.name === 'AbortError') return
console.warn('[TTS] Error:', err.name, err.message)
setIsSpeaking(false)
const wordCount = text.split(/\s+/).length
const readingTime = Math.max(wordCount * 150, 2000)
timerRef.current = setTimeout(onDone, readingTime + pauseAfter)
}
}
playAudio()
}
}, [ttsAvailable, ttsEnabled, language])
// Update advancePresentation ref whenever dependencies change
useEffect(() => {
advanceRef.current = () => {
if (stateRef.current !== 'presenting') return if (stateRef.current !== 'presenting') return
const slideIdx = slideIndexRef.current const plan = buildSlideAudioPlan(slideIdx, language)
const script = getScriptForIndex(slideIdx) if (!plan) {
// No script for this slide — skip to next
if (!script) {
if (slideIdx < totalSlides - 1) { if (slideIdx < totalSlides - 1) {
goToSlide(slideIdx + 1) goToSlide(slideIdx + 1)
paragraphIndexRef.current = 0 slideIndexRef.current = slideIdx + 1
timerRef.current = setTimeout(() => advanceRef.current(), 2000) const t = setTimeout(() => playSlideRef.current(slideIdx + 1), 300)
timerRefs.current.push(t)
} else { } else {
cancelSpeech()
setState('idle') setState('idle')
stateRef.current = 'idle'
setDisplayText('') setDisplayText('')
} }
return return
} }
const nextPara = paragraphIndexRef.current + 1 // Show first segment immediately
setDisplayText(plan.segments[0]?.text || '')
setCurrentParagraph(0)
slideAdvancedRef.current = false
if (nextPara < script.paragraphs.length) { // Pre-fetch next slide's audio in background
const para = showParagraph(slideIdx, nextPara) if (slideIdx < totalSlides - 1) {
if (para) { const nextPlan = buildSlideAudioPlan(slideIdx + 1, language)
const text = language === 'de' ? para.text_de : para.text_en if (nextPlan) fetchAudio(nextPlan.fullText, language).catch(() => {})
speakAndAdvanceRef.current(text, para.pause_after, () => advanceRef.current()) }
}
} else {
// All paragraphs done — transition hint then next slide
const hint = language === 'de'
? (script.transition_hint_de || '')
: (script.transition_hint_en || '')
const goNext = () => { // --- Non-TTS path: word-count based timing ---
if (slideIdx < totalSlides - 1) { if (!ttsAvailable || !ttsEnabled) {
timerRef.current = setTimeout(() => { const words = plan.fullText.split(/\s+/).length
if (stateRef.current !== 'presenting') return const totalTime = Math.max(words * 130, 3000)
for (const seg of plan.segments) {
const t = setTimeout(() => {
if (stateRef.current !== 'presenting') return
setDisplayText(seg.text)
if (seg.isTransition && slideIdx < totalSlides - 1 && !slideAdvancedRef.current) {
slideAdvancedRef.current = true
goToSlide(slideIdx + 1) goToSlide(slideIdx + 1)
paragraphIndexRef.current = -1 }
}, seg.startRatio * totalTime)
timerRefs.current.push(t)
}
timerRef.current = setTimeout(() => { const t = setTimeout(() => {
if (stateRef.current !== 'presenting') return if (stateRef.current !== 'presenting') return
const nextScript = getScriptForIndex(slideIdx + 1) const next = slideIdx + 1
if (nextScript && nextScript.paragraphs.length > 0) { if (next < totalSlides) {
const para = showParagraph(slideIdx + 1, 0) if (!slideAdvancedRef.current) goToSlide(next)
if (para) { slideIndexRef.current = next
const text = language === 'de' ? para.text_de : para.text_en playSlideRef.current(next)
speakAndAdvanceRef.current(text, para.pause_after, () => advanceRef.current())
}
} else {
advanceRef.current()
}
}, 1500)
}, 1000)
} else { } else {
timerRef.current = setTimeout(() => { setState('idle')
cancelSpeech() stateRef.current = 'idle'
setState('idle') setDisplayText('')
setDisplayText('') }
}, 3000) }, totalTime)
timerRefs.current.push(t)
return
}
// --- TTS path: fetch + play full-slide audio ---
setIsSpeaking(true)
const controller = new AbortController()
abortRef.current = controller
const blobUrl = await fetchAudio(plan.fullText, language, controller.signal)
if (!blobUrl || controller.signal.aborted) {
setIsSpeaking(false)
return
}
if (stateRef.current !== 'presenting') {
setIsSpeaking(false)
return
}
const audio = new Audio(blobUrl)
audioRef.current = audio
// Sync subtitle text to playback position
audio.ontimeupdate = () => {
if (!audio.duration || stateRef.current !== 'presenting') return
const ratio = audio.currentTime / audio.duration
for (let i = plan.segments.length - 1; i >= 0; i--) {
if (ratio >= plan.segments[i].startRatio) {
setDisplayText(plan.segments[i].text)
setCurrentParagraph(Math.min(i, plan.segments.length - 1))
// Advance slide when transition phrase starts playing
if (plan.segments[i].isTransition && !slideAdvancedRef.current && slideIdx < totalSlides - 1) {
slideAdvancedRef.current = true
goToSlide(slideIdx + 1)
}
break
} }
} }
}
if (hint) { // When audio finishes → immediately play next slide (pre-fetched)
setDisplayText(hint) audio.onended = () => {
speakAndAdvanceRef.current(hint, 0, () => { setIsSpeaking(false)
if (stateRef.current !== 'presenting') return audioRef.current = null
goNext() if (stateRef.current !== 'presenting') return
})
const next = slideIdx + 1
if (next < totalSlides) {
if (!slideAdvancedRef.current) goToSlide(next)
slideIndexRef.current = next
playSlideRef.current(next)
} else { } else {
goNext() setState('idle')
stateRef.current = 'idle'
setDisplayText('')
} }
} }
audio.onerror = () => {
setIsSpeaking(false)
audioRef.current = null
// Skip to next slide on error
if (stateRef.current !== 'presenting') return
const next = slideIdx + 1
if (next < totalSlides) {
goToSlide(next)
slideIndexRef.current = next
playSlideRef.current(next)
}
}
try {
await audio.play()
} catch {
setIsSpeaking(false)
}
} }
}, [language, totalSlides, goToSlide, getScriptForIndex, showParagraph, cancelSpeech]) }, [language, totalSlides, goToSlide, ttsAvailable, ttsEnabled])
const start = useCallback(() => { const start = useCallback(() => {
// Unlock audio playback immediately in user gesture context
unlockAudio() unlockAudio()
cancelAll()
setState('presenting')
stateRef.current = 'presenting'
playSlideRef.current(slideIndexRef.current)
}, [unlockAudio, cancelAll])
clearTimer() const stop = useCallback(() => {
cancelSpeech() cancelAll()
setState('idle')
stateRef.current = 'idle'
setDisplayText('')
setCurrentParagraph(0)
}, [cancelAll])
// Pause: keep audio alive, just pause playback
const pause = useCallback(() => {
if (audioRef.current) {
audioRef.current.pause()
}
for (const t of timerRefs.current) clearTimeout(t)
timerRefs.current = []
setState('paused')
stateRef.current = 'paused'
setIsSpeaking(false)
}, [])
// Resume: continue paused audio, or restart current slide
const resume = useCallback(() => {
setState('presenting') setState('presenting')
stateRef.current = 'presenting' stateRef.current = 'presenting'
const slideIdx = slideIndexRef.current if (audioRef.current && audioRef.current.paused && audioRef.current.currentTime > 0) {
const script = getScriptForIndex(slideIdx) setIsSpeaking(true)
audioRef.current.play().catch(() => {
if (script && script.paragraphs.length > 0) { setIsSpeaking(false)
const para = showParagraph(slideIdx, 0) playSlideRef.current(slideIndexRef.current)
if (para) { })
const text = language === 'de' ? para.text_de : para.text_en
speakAndAdvanceRef.current(text, para.pause_after, () => advanceRef.current())
}
} else { } else {
timerRef.current = setTimeout(() => advanceRef.current(), 1000) playSlideRef.current(slideIndexRef.current)
} }
}, [unlockAudio, clearTimer, cancelSpeech, language, getScriptForIndex, showParagraph])
const stop = useCallback(() => {
clearTimer()
cancelSpeech()
setState('idle')
setDisplayText('')
setCurrentParagraph(0)
paragraphIndexRef.current = 0
}, [clearTimer, cancelSpeech])
const pause = useCallback(() => {
clearTimer()
cancelSpeech()
setState('paused')
}, [clearTimer, cancelSpeech])
const resume = useCallback(() => {
setState('resuming')
stateRef.current = 'resuming'
timerRef.current = setTimeout(() => {
setState('presenting')
stateRef.current = 'presenting' // Sync ref immediately before calling advance
advanceRef.current()
}, 2000)
}, []) }, [])
const skipSlide = useCallback(() => { const skipSlide = useCallback(() => {
clearTimer() cancelAll()
cancelSpeech()
const nextIdx = slideIndexRef.current + 1 const nextIdx = slideIndexRef.current + 1
if (nextIdx < totalSlides) { if (nextIdx < totalSlides) {
goToSlide(nextIdx) goToSlide(nextIdx)
paragraphIndexRef.current = -1 slideIndexRef.current = nextIdx
if (stateRef.current === 'presenting') { if (stateRef.current === 'presenting') {
timerRef.current = setTimeout(() => { playSlideRef.current(nextIdx)
const script = getScriptForIndex(nextIdx)
if (script && script.paragraphs.length > 0) {
const para = showParagraph(nextIdx, 0)
if (para) {
const text = language === 'de' ? para.text_de : para.text_en
speakAndAdvanceRef.current(text, para.pause_after, () => advanceRef.current())
}
}
}, 1500)
} }
} }
}, [clearTimer, cancelSpeech, totalSlides, goToSlide, language, getScriptForIndex, showParagraph]) }, [cancelAll, totalSlides, goToSlide])
const toggle = useCallback(() => { const toggle = useCallback(() => {
unlockAudio() unlockAudio()
@@ -402,11 +411,8 @@ export function usePresenterMode({
// Cleanup on unmount // Cleanup on unmount
useEffect(() => { useEffect(() => {
return () => { return () => { cancelAll() }
clearTimer() }, [cancelAll])
cancelSpeech()
}
}, [clearTimer, cancelSpeech])
return { return {
state, state,