feat(tts): add Edge TTS (Microsoft Neural Voices) as primary engine with Piper fallback

Edge TTS provides near-human-quality voices (de-DE-ConradNeural, en-US-GuyNeural). The service falls back to Piper TTS when Edge TTS is unavailable (e.g. no internet).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-20 16:13:10 +01:00
parent 4f6ac9b23a
commit df5b6d69ef
2 changed files with 38 additions and 6 deletions
--- a/compliance-tts-service/main.py
+++ b/compliance-tts-service/main.py
@@ -136,11 +136,31 @@ TTS_CACHE_DIR = "/tmp/tts-cache"
 os.makedirs(TTS_CACHE_DIR, exist_ok=True)


+EDGE_TTS_VOICES = {  # request language code -> Edge TTS voice name
+    "de": "de-DE-ConradNeural",  # also used when the requested language has no entry
+    "en": "en-US-GuyNeural",
+}
+
+
+async def _edge_tts_synthesize(text: str, language: str, output_path: str) -> bool:
+    """Synthesize text to output_path with Edge TTS; return True on success, or log a warning and return False on any exception."""
+    try:
+        import edge_tts  # imported inside the try so a missing package is handled like any other failure
+        voice = EDGE_TTS_VOICES.get(language, EDGE_TTS_VOICES["de"])  # languages without an entry use the "de" voice
+        communicate = edge_tts.Communicate(text, voice)
+        await communicate.save(output_path)  # NOTE(review): a failure mid-save may leave a partial file at output_path - confirm callers tolerate that
+        return True
+    except Exception as e:  # every failure is reported as False so the caller can use its fallback engine
+        logger.warning(f"Edge TTS failed, falling back to Piper: {e}")
+        return False
+
+
@app.post("/synthesize-direct")
 async def synthesize_direct(req: SynthesizeDirectRequest):
    """Synthesize text and return MP3 audio directly (no MinIO upload).

-    Used by the pitch-deck presenter for real-time TTS playback.
+    Uses Edge TTS (Microsoft Neural Voices) for high-quality speech.
+    Falls back to Piper TTS if Edge TTS is unavailable (e.g. no internet).
    Includes disk caching so identical text is only synthesized once.
    """
    if not req.text.strip():
@@ -158,7 +178,19 @@ async def synthesize_direct(req: SynthesizeDirectRequest):
            headers={"X-TTS-Cache": "hit"},
        )

-    # Select TTS engine based on language
+    # Try Edge TTS first (high quality neural voices)
+    success = await _edge_tts_synthesize(req.text, req.language, cache_path)
+
+    if success and os.path.exists(cache_path):
+        size = os.path.getsize(cache_path)
+        logger.info(f"Edge TTS ({req.language}): {len(req.text)} chars, {size} bytes, cached as {text_hash}")
+        return FileResponse(
+            cache_path,
+            media_type="audio/mpeg",
+            headers={"X-TTS-Cache": "miss", "X-TTS-Engine": "edge"},
+        )
+
+    # Fallback: Piper TTS
    engine = tts
    if req.language == "en" and tts_en is not None:
        engine = tts_en
@@ -166,18 +198,17 @@ async def synthesize_direct(req: SynthesizeDirectRequest):
    with tempfile.TemporaryDirectory() as tmpdir:
        try:
            mp3_path, duration = engine.synthesize_to_mp3(req.text, tmpdir)
-            # Copy to cache
            import shutil
            shutil.copy2(mp3_path, cache_path)
-            logger.info(f"TTS synthesized ({req.language}): {len(req.text)} chars, {duration:.1f}s, cached as {text_hash}")
+            logger.info(f"Piper TTS ({req.language}): {len(req.text)} chars, {duration:.1f}s, cached as {text_hash}")
        except Exception as e:
-            logger.error(f"Direct synthesis failed: {e}")
+            logger.error(f"TTS synthesis failed: {e}")
            raise HTTPException(status_code=500, detail=str(e))

    return FileResponse(
        cache_path,
        media_type="audio/mpeg",
-        headers={"X-TTS-Cache": "miss", "X-TTS-Duration": str(round(duration, 2))},
+        headers={"X-TTS-Cache": "miss", "X-TTS-Engine": "piper"},
    )


--- a/compliance-tts-service/requirements.txt
+++ b/compliance-tts-service/requirements.txt
@@ -3,3 +3,4 @@ uvicorn[standard]==0.27.1
 boto3==1.34.25
 python-multipart==0.0.6
 pydantic==2.6.1
+edge-tts==6.1.12