diff --git a/compliance-tts-service/main.py b/compliance-tts-service/main.py
index 8263a17..f01cde2 100644
--- a/compliance-tts-service/main.py
+++ b/compliance-tts-service/main.py
@@ -136,11 +136,53 @@ TTS_CACHE_DIR = "/tmp/tts-cache"
 os.makedirs(TTS_CACHE_DIR, exist_ok=True)
 
 
+EDGE_TTS_VOICES = {
+    "de": "de-DE-ConradNeural",
+    "en": "en-US-GuyNeural",
+}
+
+
+async def _edge_tts_synthesize(text: str, language: str, output_path: str) -> bool:
+    """Synthesize text to an MP3 file using Edge TTS (Microsoft Neural Voices).
+
+    The voice is looked up in EDGE_TTS_VOICES; a language without an entry
+    uses the German voice. Audio is first written to a temporary file next
+    to output_path and moved into place only once it is complete, so a
+    failed synthesis never leaves a truncated file at output_path.
+
+    Returns True on success, False on any failure (logged as a warning).
+    """
+    tmp_path = None
+    try:
+        import edge_tts
+        voice = EDGE_TTS_VOICES.get(language, EDGE_TTS_VOICES["de"])
+        # Same directory as the target so os.replace() never crosses filesystems.
+        fd, tmp_path = tempfile.mkstemp(
+            dir=os.path.dirname(output_path) or ".", suffix=".part"
+        )
+        os.close(fd)
+        communicate = edge_tts.Communicate(text, voice)
+        await communicate.save(tmp_path)
+        if os.path.getsize(tmp_path) == 0:
+            raise RuntimeError("Edge TTS returned no audio")
+        os.replace(tmp_path, output_path)
+        return True
+    except Exception as e:
+        logger.warning(f"Edge TTS failed, falling back to Piper: {e}")
+        return False
+    finally:
+        # After a successful os.replace() the temp file is gone; otherwise
+        # remove the partial output so it cannot accumulate in the cache dir.
+        if tmp_path is not None and os.path.exists(tmp_path):
+            os.remove(tmp_path)
+
+
 @app.post("/synthesize-direct")
 async def synthesize_direct(req: SynthesizeDirectRequest):
     """Synthesize text and return MP3 audio directly (no MinIO upload).
 
-    Used by the pitch-deck presenter for real-time TTS playback.
+    Uses Edge TTS (Microsoft Neural Voices) for high-quality speech.
+    Falls back to Piper TTS if Edge TTS is unavailable (e.g. no internet).
     Includes disk caching so identical text is only synthesized once.
     """
     if not req.text.strip():
@@ -158,7 +200,19 @@ async def synthesize_direct(req: SynthesizeDirectRequest):
             headers={"X-TTS-Cache": "hit"},
         )
 
-    # Select TTS engine based on language
+    # Try Edge TTS first (high quality neural voices)
+    success = await _edge_tts_synthesize(req.text, req.language, cache_path)
+
+    if success and os.path.exists(cache_path):
+        size = os.path.getsize(cache_path)
+        logger.info(f"Edge TTS ({req.language}): {len(req.text)} chars, {size} bytes, cached as {text_hash}")
+        return FileResponse(
+            cache_path,
+            media_type="audio/mpeg",
+            headers={"X-TTS-Cache": "miss", "X-TTS-Engine": "edge"},
+        )
+
+    # Fallback: Piper TTS
     engine = tts
     if req.language == "en" and tts_en is not None:
         engine = tts_en
@@ -166,18 +220,17 @@ async def synthesize_direct(req: SynthesizeDirectRequest):
     with tempfile.TemporaryDirectory() as tmpdir:
         try:
             mp3_path, duration = engine.synthesize_to_mp3(req.text, tmpdir)
-            # Copy to cache
             import shutil
             shutil.copy2(mp3_path, cache_path)
-            logger.info(f"TTS synthesized ({req.language}): {len(req.text)} chars, {duration:.1f}s, cached as {text_hash}")
+            logger.info(f"Piper TTS ({req.language}): {len(req.text)} chars, {duration:.1f}s, cached as {text_hash}")
         except Exception as e:
-            logger.error(f"Direct synthesis failed: {e}")
+            logger.error(f"TTS synthesis failed: {e}")
             raise HTTPException(status_code=500, detail=str(e))
 
     return FileResponse(
         cache_path,
         media_type="audio/mpeg",
-        headers={"X-TTS-Cache": "miss", "X-TTS-Duration": str(round(duration, 2))},
+        headers={"X-TTS-Cache": "miss", "X-TTS-Engine": "piper", "X-TTS-Duration": str(round(duration, 2))},
     )
 
 
diff --git a/compliance-tts-service/requirements.txt b/compliance-tts-service/requirements.txt
index cce69f0..445fec6 100644
--- a/compliance-tts-service/requirements.txt
+++ b/compliance-tts-service/requirements.txt
@@ -3,3 +3,4 @@ uvicorn[standard]==0.27.1
 boto3==1.34.25
 python-multipart==0.0.6
 pydantic==2.6.1
+edge-tts==6.1.12