From 0f0bbc3dc0a7bef3ac87d7b1c04be16757af2cf9 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Sun, 26 Apr 2026 23:17:39 +0200 Subject: [PATCH] Switch AudioButton to Piper TTS (Thorsten/Lessac voices) AudioButton now tries Piper TTS via /api/vocabulary/tts endpoint first, falls back to Browser Web Speech API if unavailable. Backend: New GET /api/vocabulary/tts?text=...&lang=de endpoint. services/audio.py: Fixed presigned URL flow for MinIO download. This gives the same high-quality voice as the Investor Agent in the pitch deck (Thorsten DE / Lessac EN, MIT license). Co-Authored-By: Claude Opus 4.6 (1M context) --- backend-lehrer/services/audio.py | 26 ++++--- backend-lehrer/vocabulary/api.py | 16 +++++ studio-v2/components/learn/AudioButton.tsx | 80 +++++++++++++--------- 3 files changed, 82 insertions(+), 40 deletions(-) diff --git a/backend-lehrer/services/audio.py b/backend-lehrer/services/audio.py index d1ee9ca..c022517 100644 --- a/backend-lehrer/services/audio.py +++ b/backend-lehrer/services/audio.py @@ -74,16 +74,24 @@ async def synthesize_word( return None data = resp.json() - audio_url = data.get("audio_url") or data.get("presigned_url") + bucket = data.get("bucket") + object_key = data.get("object_key") - if audio_url: - # Download the audio file - audio_resp = await client.get(audio_url) - if audio_resp.status_code == 200: - with open(cached, "wb") as f: - f.write(audio_resp.content) - logger.info(f"TTS cached: '{text}' ({language}) → {cached}") - return cached + if bucket and object_key: + # Get presigned URL to download the audio + url_resp = await client.post( + f"{TTS_SERVICE_URL}/presigned-url", + json={"bucket": bucket, "object_key": object_key, "expires": 300}, + ) + if url_resp.status_code == 200: + audio_url = url_resp.json().get("url") + if audio_url: + audio_resp = await client.get(audio_url) + if audio_resp.status_code == 200: + with open(cached, "wb") as f: + f.write(audio_resp.content) + logger.info(f"TTS cached: '{text}' ({language}) → 
{cached}") + return cached except Exception as e: logger.warning(f"TTS service unavailable: {e}") diff --git a/backend-lehrer/vocabulary/api.py b/backend-lehrer/vocabulary/api.py index 7045dd5..cbe2eaf 100644 --- a/backend-lehrer/vocabulary/api.py +++ b/backend-lehrer/vocabulary/api.py @@ -161,6 +161,22 @@ async def api_get_syllable_audio(word_id: str, lang: str = "en"): return FastAPIResponse(content=audio_bytes, media_type="audio/mpeg") +@router.get("/tts") +async def api_tts(text: str = Query(..., min_length=1), lang: str = Query("de")): + """Text-to-Speech endpoint. Returns MP3 audio for any non-empty text. + + Uses Piper TTS (Thorsten DE / Lessac EN). Cached by text+lang. `text` is required: a missing or empty value is rejected with 422; 503 is returned when no audio could be produced. + """ + from fastapi.responses import Response as FastAPIResponse + from services.audio import get_or_generate_audio + + audio_bytes = await get_or_generate_audio(text, language=lang) + if not audio_bytes: + raise HTTPException(status_code=503, detail="TTS Service nicht verfuegbar") + + return FastAPIResponse(content=audio_bytes, media_type="audio/mpeg") + + # --------------------------------------------------------------------------- # Learning Unit Creation from Word Selection # --------------------------------------------------------------------------- diff --git a/studio-v2/components/learn/AudioButton.tsx b/studio-v2/components/learn/AudioButton.tsx index aa48013..b26c522 100644 --- a/studio-v2/components/learn/AudioButton.tsx +++ b/studio-v2/components/learn/AudioButton.tsx @@ -1,6 +1,6 @@ 'use client' -import React, { useCallback, useState } from 'react' +import React, { useCallback, useRef, useState } from 'react' interface AudioButtonProps { text: string @@ -9,47 +9,65 @@ interface AudioButtonProps { size?: 'sm' | 'md' | 'lg' } +/** + * AudioButton — plays TTS audio for a word or phrase. + * + * Priority: Piper TTS (Thorsten DE / Lessac EN) via backend API. + * Fallback: Browser Web Speech API if Piper is unavailable. 
+ */ export function AudioButton({ text, lang, isDark, size = 'md' }: AudioButtonProps) { const [isSpeaking, setIsSpeaking] = useState(false) + const audioRef = useRef(null) - const speak = useCallback(() => { - if (!('speechSynthesis' in window)) return + const speak = useCallback(async () => { + // Stop if already playing if (isSpeaking) { - window.speechSynthesis.cancel() + audioRef.current?.pause() + window.speechSynthesis?.cancel() setIsSpeaking(false) return } - const utterance = new SpeechSynthesisUtterance(text) - utterance.lang = lang === 'de' ? 'de-DE' : 'en-GB' - utterance.rate = 0.9 - utterance.pitch = 1.0 - - // Try to find a good voice - const voices = window.speechSynthesis.getVoices() - const preferred = voices.find((v) => - v.lang.startsWith(lang === 'de' ? 'de' : 'en') && v.localService - ) || voices.find((v) => v.lang.startsWith(lang === 'de' ? 'de' : 'en')) - if (preferred) utterance.voice = preferred - - utterance.onend = () => setIsSpeaking(false) - utterance.onerror = () => setIsSpeaking(false) - setIsSpeaking(true) - window.speechSynthesis.speak(utterance) + + // Try Piper TTS via backend API first + try { + const url = `/api/vocabulary/tts?text=${encodeURIComponent(text)}&lang=${lang}` + const resp = await fetch(url) + if (resp.ok && resp.headers.get('content-type')?.startsWith('audio')) { + const blob = await resp.blob() + const audioUrl = URL.createObjectURL(blob) + const audio = new Audio(audioUrl) + audioRef.current = audio + audio.onended = () => { setIsSpeaking(false); URL.revokeObjectURL(audioUrl) } + audio.onerror = () => { setIsSpeaking(false); URL.revokeObjectURL(audioUrl) } + await audio.play() + return + } + } catch { + // Piper unavailable — fall through to Web Speech API + } + + // Fallback: Browser Web Speech API + if ('speechSynthesis' in window) { + const utterance = new SpeechSynthesisUtterance(text) + utterance.lang = lang === 'de' ? 
'de-DE' : 'en-GB' + utterance.rate = 0.9 + const voices = window.speechSynthesis.getVoices() + const preferred = voices.find((v) => + v.lang.startsWith(lang === 'de' ? 'de' : 'en') && v.localService + ) || voices.find((v) => v.lang.startsWith(lang === 'de' ? 'de' : 'en')) + if (preferred) utterance.voice = preferred + utterance.onend = () => setIsSpeaking(false) + utterance.onerror = () => setIsSpeaking(false) + window.speechSynthesis.speak(utterance) + } else { + setIsSpeaking(false) + } }, [text, lang, isSpeaking]) - const sizeClasses = { - sm: 'w-7 h-7', - md: 'w-9 h-9', - lg: 'w-11 h-11', - } - - const iconSizes = { - sm: 'w-3.5 h-3.5', - md: 'w-4 h-4', - lg: 'w-5 h-5', - } + const sizeClasses = { sm: 'w-7 h-7', md: 'w-9 h-9', lg: 'w-11 h-11' } + const iconSizes = { sm: 'w-3.5 h-3.5', md: 'w-4 h-4', lg: 'w-5 h-5' } return (