feat(tts): add Edge TTS (Microsoft Neural Voices) as primary engine with Piper fallback
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 32s
CI/CD / test-python-backend-compliance (push) Successful in 31s
CI/CD / test-python-document-crawler (push) Successful in 21s
CI/CD / test-python-dsms-gateway (push) Successful in 20s
CI/CD / validate-canonical-controls (push) Successful in 11s
CI/CD / Deploy (push) Has been skipped
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 32s
CI/CD / test-python-backend-compliance (push) Successful in 31s
CI/CD / test-python-document-crawler (push) Successful in 21s
CI/CD / test-python-dsms-gateway (push) Successful in 20s
CI/CD / validate-canonical-controls (push) Successful in 11s
CI/CD / Deploy (push) Has been skipped
Edge TTS provides near-human-quality voices (de-DE-ConradNeural for German, en-US-GuyNeural for English). The service falls back to Piper TTS when Edge TTS is unavailable (e.g. no internet access).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -136,11 +136,31 @@ TTS_CACHE_DIR = "/tmp/tts-cache"
|
|||||||
os.makedirs(TTS_CACHE_DIR, exist_ok=True)
|
os.makedirs(TTS_CACHE_DIR, exist_ok=True)
|
||||||
|
|
||||||
|
|
||||||
|
EDGE_TTS_VOICES = {
|
||||||
|
"de": "de-DE-ConradNeural",
|
||||||
|
"en": "en-US-GuyNeural",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
async def _edge_tts_synthesize(text: str, language: str, output_path: str) -> bool:
    """Synthesize *text* to an MP3 file at *output_path* using Edge TTS.

    The audio is first written to a temporary file next to *output_path* and
    only moved into place once synthesis has finished. A failure part-way
    through (e.g. the connection drops) therefore never leaves a truncated
    file at *output_path*, which callers use as a cache entry.

    Args:
        text: Text to speak.
        language: Key into ``EDGE_TTS_VOICES``; unknown keys use the "de" voice.
        output_path: Destination path of the finished MP3.

    Returns:
        True if the file was written, False on any failure (the error is
        logged so the caller can fall back to Piper).
    """
    import tempfile

    tmp_path = None
    try:
        import edge_tts

        voice = EDGE_TTS_VOICES.get(language, EDGE_TTS_VOICES["de"])

        # Unique temp file in the target directory: keeps os.replace() atomic
        # (same filesystem) and avoids clashes between concurrent requests.
        fd, tmp_path = tempfile.mkstemp(
            suffix=".part", dir=os.path.dirname(output_path) or "."
        )
        os.close(fd)

        communicate = edge_tts.Communicate(text, voice)
        await communicate.save(tmp_path)

        # Treat an empty result as a failure rather than caching silence.
        if os.path.getsize(tmp_path) == 0:
            raise RuntimeError("Edge TTS produced no audio")

        os.replace(tmp_path, output_path)
        tmp_path = None  # ownership moved to output_path; nothing to clean up
        return True
    except Exception as e:
        logger.warning(f"Edge TTS failed, falling back to Piper: {e}")
        return False
    finally:
        # Also runs on task cancellation, which `except Exception` does not catch.
        if tmp_path is not None:
            try:
                os.remove(tmp_path)
            except OSError:
                pass
|
||||||
|
|
||||||
|
|
||||||
@app.post("/synthesize-direct")
|
@app.post("/synthesize-direct")
|
||||||
async def synthesize_direct(req: SynthesizeDirectRequest):
|
async def synthesize_direct(req: SynthesizeDirectRequest):
|
||||||
"""Synthesize text and return MP3 audio directly (no MinIO upload).
|
"""Synthesize text and return MP3 audio directly (no MinIO upload).
|
||||||
|
|
||||||
Used by the pitch-deck presenter for real-time TTS playback.
|
Uses Edge TTS (Microsoft Neural Voices) for high-quality speech.
|
||||||
|
Falls back to Piper TTS if Edge TTS is unavailable (e.g. no internet).
|
||||||
Includes disk caching so identical text is only synthesized once.
|
Includes disk caching so identical text is only synthesized once.
|
||||||
"""
|
"""
|
||||||
if not req.text.strip():
|
if not req.text.strip():
|
||||||
@@ -158,7 +178,19 @@ async def synthesize_direct(req: SynthesizeDirectRequest):
|
|||||||
headers={"X-TTS-Cache": "hit"},
|
headers={"X-TTS-Cache": "hit"},
|
||||||
)
|
)
|
||||||
|
|
||||||
# Select TTS engine based on language
|
# Try Edge TTS first (high quality neural voices)
|
||||||
|
success = await _edge_tts_synthesize(req.text, req.language, cache_path)
|
||||||
|
|
||||||
|
if success and os.path.exists(cache_path):
|
||||||
|
size = os.path.getsize(cache_path)
|
||||||
|
logger.info(f"Edge TTS ({req.language}): {len(req.text)} chars, {size} bytes, cached as {text_hash}")
|
||||||
|
return FileResponse(
|
||||||
|
cache_path,
|
||||||
|
media_type="audio/mpeg",
|
||||||
|
headers={"X-TTS-Cache": "miss", "X-TTS-Engine": "edge"},
|
||||||
|
)
|
||||||
|
|
||||||
|
# Fallback: Piper TTS
|
||||||
engine = tts
|
engine = tts
|
||||||
if req.language == "en" and tts_en is not None:
|
if req.language == "en" and tts_en is not None:
|
||||||
engine = tts_en
|
engine = tts_en
|
||||||
@@ -166,18 +198,17 @@ async def synthesize_direct(req: SynthesizeDirectRequest):
|
|||||||
with tempfile.TemporaryDirectory() as tmpdir:
|
with tempfile.TemporaryDirectory() as tmpdir:
|
||||||
try:
|
try:
|
||||||
mp3_path, duration = engine.synthesize_to_mp3(req.text, tmpdir)
|
mp3_path, duration = engine.synthesize_to_mp3(req.text, tmpdir)
|
||||||
# Copy to cache
|
|
||||||
import shutil
|
import shutil
|
||||||
shutil.copy2(mp3_path, cache_path)
|
shutil.copy2(mp3_path, cache_path)
|
||||||
logger.info(f"TTS synthesized ({req.language}): {len(req.text)} chars, {duration:.1f}s, cached as {text_hash}")
|
logger.info(f"Piper TTS ({req.language}): {len(req.text)} chars, {duration:.1f}s, cached as {text_hash}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Direct synthesis failed: {e}")
|
logger.error(f"TTS synthesis failed: {e}")
|
||||||
raise HTTPException(status_code=500, detail=str(e))
|
raise HTTPException(status_code=500, detail=str(e))
|
||||||
|
|
||||||
return FileResponse(
|
return FileResponse(
|
||||||
cache_path,
|
cache_path,
|
||||||
media_type="audio/mpeg",
|
media_type="audio/mpeg",
|
||||||
headers={"X-TTS-Cache": "miss", "X-TTS-Duration": str(round(duration, 2))},
|
headers={"X-TTS-Cache": "miss", "X-TTS-Engine": "piper"},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -3,3 +3,4 @@ uvicorn[standard]==0.27.1
|
|||||||
boto3==1.34.25
|
boto3==1.34.25
|
||||||
python-multipart==0.0.6
|
python-multipart==0.0.6
|
||||||
pydantic==2.6.1
|
pydantic==2.6.1
|
||||||
|
edge-tts==6.1.12
|
||||||
|
|||||||
Reference in New Issue
Block a user