diff --git a/compliance-tts-service/Dockerfile b/compliance-tts-service/Dockerfile
index 6b98fa1..7642807 100644
--- a/compliance-tts-service/Dockerfile
+++ b/compliance-tts-service/Dockerfile
@@ -15,8 +15,16 @@ WORKDIR /app
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 
-# Download Piper voice model (German, thorsten, high quality)
-RUN mkdir -p /app/models && wget -q -O /app/models/de_DE-thorsten-high.onnx "https://huggingface.co/rhasspy/piper-voices/resolve/main/de/de_DE/thorsten/high/de_DE-thorsten-high.onnx" && wget -q -O /app/models/de_DE-thorsten-high.onnx.json "https://huggingface.co/rhasspy/piper-voices/resolve/main/de/de_DE/thorsten/high/de_DE-thorsten-high.onnx.json"
+# Download Piper voice models
+RUN mkdir -p /app/models && \
+    wget -q -O /app/models/de_DE-thorsten-high.onnx \
+        "https://huggingface.co/rhasspy/piper-voices/resolve/main/de/de_DE/thorsten/high/de_DE-thorsten-high.onnx" && \
+    wget -q -O /app/models/de_DE-thorsten-high.onnx.json \
+        "https://huggingface.co/rhasspy/piper-voices/resolve/main/de/de_DE/thorsten/high/de_DE-thorsten-high.onnx.json" && \
+    wget -q -O /app/models/en_US-lessac-high.onnx \
+        "https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/lessac/high/en_US-lessac-high.onnx" && \
+    wget -q -O /app/models/en_US-lessac-high.onnx.json \
+        "https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/lessac/high/en_US-lessac-high.onnx.json"
 
 # Copy application
 COPY . .
diff --git a/compliance-tts-service/main.py b/compliance-tts-service/main.py
index da1ccd5..8263a17 100644
--- a/compliance-tts-service/main.py
+++ b/compliance-tts-service/main.py
@@ -23,6 +23,7 @@ MINIO_ACCESS_KEY = os.getenv("MINIO_ACCESS_KEY", "breakpilot")
 MINIO_SECRET_KEY = os.getenv("MINIO_SECRET_KEY", "breakpilot123")
 MINIO_SECURE = os.getenv("MINIO_SECURE", "false").lower() == "true"
 PIPER_MODEL_PATH = os.getenv("PIPER_MODEL_PATH", "/app/models/de_DE-thorsten-high.onnx")
+PIPER_MODEL_EN_PATH = os.getenv("PIPER_MODEL_EN_PATH", "/app/models/en_US-lessac-high.onnx")
 
 AUDIO_BUCKET = "compliance-training-audio"
 VIDEO_BUCKET = "compliance-training-video"
@@ -30,6 +31,7 @@ VIDEO_BUCKET = "compliance-training-video"
 # Initialize services
 storage = StorageClient(MINIO_ENDPOINT, MINIO_ACCESS_KEY, MINIO_SECRET_KEY, secure=MINIO_SECURE)
 tts = PiperTTS(PIPER_MODEL_PATH)
+tts_en = PiperTTS(PIPER_MODEL_EN_PATH) if os.path.exists(PIPER_MODEL_EN_PATH) else None
 
 
 @app.on_event("startup")
@@ -106,16 +108,22 @@ async def health():
 @app.get("/voices")
 async def list_voices():
     """List available TTS voices."""
-    return {
-        "voices": [
-            VoiceInfo(
-                id="de_DE-thorsten-high",
-                language="de",
-                name="Thorsten (High Quality)",
-                quality="high",
-            ),
-        ],
-    }
+    voices = [
+        VoiceInfo(
+            id="de_DE-thorsten-high",
+            language="de",
+            name="Thorsten (High Quality)",
+            quality="high",
+        ),
+    ]
+    if tts_en is not None:
+        voices.append(VoiceInfo(
+            id="en_US-lessac-high",
+            language="en",
+            name="Lessac (High Quality)",
+            quality="high",
+        ))
+    return {"voices": voices}
 
 
 class SynthesizeDirectRequest(BaseModel):
@@ -138,8 +146,8 @@ async def synthesize_direct(req: SynthesizeDirectRequest):
     if not req.text.strip():
         raise HTTPException(status_code=400, detail="Text is empty")
 
-    # Cache key based on text hash
-    text_hash = hashlib.sha256(req.text.encode()).hexdigest()[:16]
+    # Cache key based on text + language hash
+    text_hash = hashlib.sha256(f"{req.language}:{req.text}".encode()).hexdigest()[:16]
     cache_path = os.path.join(TTS_CACHE_DIR, f"{text_hash}.mp3")
 
     if os.path.exists(cache_path):
@@ -150,13 +158,18 @@ async def synthesize_direct(req: SynthesizeDirectRequest):
             headers={"X-TTS-Cache": "hit"},
         )
 
+    # Select TTS engine based on language
+    engine = tts
+    if req.language == "en" and tts_en is not None:
+        engine = tts_en
+
     with tempfile.TemporaryDirectory() as tmpdir:
         try:
-            mp3_path, duration = tts.synthesize_to_mp3(req.text, tmpdir)
+            mp3_path, duration = engine.synthesize_to_mp3(req.text, tmpdir)
             # Copy to cache
             import shutil
             shutil.copy2(mp3_path, cache_path)
-            logger.info(f"TTS synthesized: {len(req.text)} chars, {duration:.1f}s, cached as {text_hash}")
+            logger.info(f"TTS synthesized ({req.language}): {len(req.text)} chars, {duration:.1f}s, cached as {text_hash}")
         except Exception as e:
             logger.error(f"Direct synthesis failed: {e}")
             raise HTTPException(status_code=500, detail=str(e))