feat(tts): add English voice (lessac-high) + language-based model selection

- Download en_US-lessac-high Piper model in Dockerfile
- Select TTS engine based on request language (de/en)
- Include language in cache key to avoid collisions
- List both voices in /voices endpoint

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-20 14:07:23 +01:00
parent 5ea31a3236
commit 4f6ac9b23a
2 changed files with 37 additions and 16 deletions
@@ -15,8 +15,16 @@ WORKDIR /app
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
-# Download Piper voice model (German, thorsten, high quality)
+# Download Piper voice models
-RUN mkdir -p /app/models &&     wget -q -O /app/models/de_DE-thorsten-high.onnx     "https://huggingface.co/rhasspy/piper-voices/resolve/main/de/de_DE/thorsten/high/de_DE-thorsten-high.onnx" &&     wget -q -O /app/models/de_DE-thorsten-high.onnx.json     "https://huggingface.co/rhasspy/piper-voices/resolve/main/de/de_DE/thorsten/high/de_DE-thorsten-high.onnx.json"
+RUN mkdir -p /app/models && \
    wget -q -O /app/models/de_DE-thorsten-high.onnx \
    "https://huggingface.co/rhasspy/piper-voices/resolve/main/de/de_DE/thorsten/high/de_DE-thorsten-high.onnx" && \
    wget -q -O /app/models/de_DE-thorsten-high.onnx.json \
    "https://huggingface.co/rhasspy/piper-voices/resolve/main/de/de_DE/thorsten/high/de_DE-thorsten-high.onnx.json" && \
    wget -q -O /app/models/en_US-lessac-high.onnx \
    "https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/lessac/high/en_US-lessac-high.onnx" && \
    wget -q -O /app/models/en_US-lessac-high.onnx.json \
    "https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/lessac/high/en_US-lessac-high.onnx.json"
 # Copy application
 COPY . .
@@ -23,6 +23,7 @@ MINIO_ACCESS_KEY = os.getenv("MINIO_ACCESS_KEY", "breakpilot")
 MINIO_SECRET_KEY = os.getenv("MINIO_SECRET_KEY", "breakpilot123")
 MINIO_SECURE = os.getenv("MINIO_SECURE", "false").lower() == "true"
 PIPER_MODEL_PATH = os.getenv("PIPER_MODEL_PATH", "/app/models/de_DE-thorsten-high.onnx")
 PIPER_MODEL_EN_PATH = os.getenv("PIPER_MODEL_EN_PATH", "/app/models/en_US-lessac-high.onnx")
 AUDIO_BUCKET = "compliance-training-audio"
 VIDEO_BUCKET = "compliance-training-video"
@@ -30,6 +31,7 @@ VIDEO_BUCKET = "compliance-training-video"
 # Initialize services
 storage = StorageClient(MINIO_ENDPOINT, MINIO_ACCESS_KEY, MINIO_SECRET_KEY, secure=MINIO_SECURE)
 tts = PiperTTS(PIPER_MODEL_PATH)
 tts_en = PiperTTS(PIPER_MODEL_EN_PATH) if os.path.exists(PIPER_MODEL_EN_PATH) else None
@app.on_event("startup")
@@ -106,16 +108,22 @@ async def health():
@app.get("/voices")
 async def list_voices():
    """List available TTS voices."""
-    return {
+    voices = [
-        "voices": [
+        VoiceInfo(
-            VoiceInfo(
+            id="de_DE-thorsten-high",
-                id="de_DE-thorsten-high",
+            language="de",
-                language="de",
+            name="Thorsten (High Quality)",
-                name="Thorsten (High Quality)",
+            quality="high",
-                quality="high",
+        ),
-            ),
+    ]
-        ],
+    if tts_en is not None:
-    }
+        voices.append(VoiceInfo(
            id="en_US-lessac-high",
            language="en",
            name="Lessac (High Quality)",
            quality="high",
        ))
    return {"voices": voices}
 class SynthesizeDirectRequest(BaseModel):
@@ -138,8 +146,8 @@ async def synthesize_direct(req: SynthesizeDirectRequest):
    if not req.text.strip():
        raise HTTPException(status_code=400, detail="Text is empty")
-    # Cache key based on text hash
+    # Cache key based on text + language hash
-    text_hash = hashlib.sha256(req.text.encode()).hexdigest()[:16]
+    text_hash = hashlib.sha256(f"{req.language}:{req.text}".encode()).hexdigest()[:16]
    cache_path = os.path.join(TTS_CACHE_DIR, f"{text_hash}.mp3")
    if os.path.exists(cache_path):
@@ -150,13 +158,18 @@ async def synthesize_direct(req: SynthesizeDirectRequest):
            headers={"X-TTS-Cache": "hit"},
        )
    # Select TTS engine based on language
    engine = tts
    if req.language == "en" and tts_en is not None:
        engine = tts_en
    with tempfile.TemporaryDirectory() as tmpdir:
        try:
-            mp3_path, duration = tts.synthesize_to_mp3(req.text, tmpdir)
+            mp3_path, duration = engine.synthesize_to_mp3(req.text, tmpdir)
            # Copy to cache
            import shutil
            shutil.copy2(mp3_path, cache_path)
-            logger.info(f"TTS synthesized: {len(req.text)} chars, {duration:.1f}s, cached as {text_hash}")
+            logger.info(f"TTS synthesized ({req.language}): {len(req.text)} chars, {duration:.1f}s, cached as {text_hash}")
        except Exception as e:
            logger.error(f"Direct synthesis failed: {e}")
            raise HTTPException(status_code=500, detail=str(e))