3 Commits

Author SHA1 Message Date
Benjamin Admin
df5b6d69ef feat(tts): add Edge TTS (Microsoft Neural Voices) as primary engine with Piper fallback
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 32s
CI/CD / test-python-backend-compliance (push) Successful in 31s
CI/CD / test-python-document-crawler (push) Successful in 21s
CI/CD / test-python-dsms-gateway (push) Successful in 20s
CI/CD / validate-canonical-controls (push) Successful in 11s
CI/CD / Deploy (push) Has been skipped
Edge TTS provides near-human quality voices (de-DE-ConradNeural, en-US-GuyNeural).
Falls back to Piper TTS when Edge TTS is unavailable (e.g. no internet).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-20 16:13:10 +01:00
Benjamin Admin
4f6ac9b23a feat(tts): add English voice (lessac-high) + language-based model selection
- Download en_US-lessac-high Piper model in Dockerfile
- Select TTS engine based on request language (de/en)
- Include language in cache key to avoid collisions
- List both voices in /voices endpoint

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-20 14:07:23 +01:00
Benjamin Admin
5ea31a3236 feat(tts): add /synthesize-direct endpoint for real-time audio streaming
- Returns MP3 audio directly in response body (no MinIO upload)
- Disk cache (/tmp/tts-cache) avoids re-synthesis of identical text
- Used by pitch-deck presenter for real-time TTS playback

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-20 12:25:25 +01:00
3 changed files with 117 additions and 12 deletions

View File

@@ -15,8 +15,16 @@ WORKDIR /app
COPY requirements.txt . COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt RUN pip install --no-cache-dir -r requirements.txt
# Download Piper voice model (German, thorsten, high quality) # Download Piper voice models
RUN mkdir -p /app/models && wget -q -O /app/models/de_DE-thorsten-high.onnx "https://huggingface.co/rhasspy/piper-voices/resolve/main/de/de_DE/thorsten/high/de_DE-thorsten-high.onnx" && wget -q -O /app/models/de_DE-thorsten-high.onnx.json "https://huggingface.co/rhasspy/piper-voices/resolve/main/de/de_DE/thorsten/high/de_DE-thorsten-high.onnx.json" RUN mkdir -p /app/models && \
wget -q -O /app/models/de_DE-thorsten-high.onnx \
"https://huggingface.co/rhasspy/piper-voices/resolve/main/de/de_DE/thorsten/high/de_DE-thorsten-high.onnx" && \
wget -q -O /app/models/de_DE-thorsten-high.onnx.json \
"https://huggingface.co/rhasspy/piper-voices/resolve/main/de/de_DE/thorsten/high/de_DE-thorsten-high.onnx.json" && \
wget -q -O /app/models/en_US-lessac-high.onnx \
"https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/lessac/high/en_US-lessac-high.onnx" && \
wget -q -O /app/models/en_US-lessac-high.onnx.json \
"https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/lessac/high/en_US-lessac-high.onnx.json"
# Copy application # Copy application
COPY . . COPY . .

View File

@@ -1,10 +1,12 @@
"""Compliance TTS Service — Piper TTS + FFmpeg Audio/Video Pipeline.""" """Compliance TTS Service — Piper TTS + FFmpeg Audio/Video Pipeline."""
import hashlib
import logging import logging
import os import os
import tempfile import tempfile
import uuid import uuid
from fastapi import FastAPI, HTTPException from fastapi import FastAPI, HTTPException
from fastapi.responses import FileResponse, Response
from pydantic import BaseModel from pydantic import BaseModel
from storage import StorageClient from storage import StorageClient
@@ -21,6 +23,7 @@ MINIO_ACCESS_KEY = os.getenv("MINIO_ACCESS_KEY", "breakpilot")
MINIO_SECRET_KEY = os.getenv("MINIO_SECRET_KEY", "breakpilot123") MINIO_SECRET_KEY = os.getenv("MINIO_SECRET_KEY", "breakpilot123")
MINIO_SECURE = os.getenv("MINIO_SECURE", "false").lower() == "true" MINIO_SECURE = os.getenv("MINIO_SECURE", "false").lower() == "true"
PIPER_MODEL_PATH = os.getenv("PIPER_MODEL_PATH", "/app/models/de_DE-thorsten-high.onnx") PIPER_MODEL_PATH = os.getenv("PIPER_MODEL_PATH", "/app/models/de_DE-thorsten-high.onnx")
PIPER_MODEL_EN_PATH = os.getenv("PIPER_MODEL_EN_PATH", "/app/models/en_US-lessac-high.onnx")
AUDIO_BUCKET = "compliance-training-audio" AUDIO_BUCKET = "compliance-training-audio"
VIDEO_BUCKET = "compliance-training-video" VIDEO_BUCKET = "compliance-training-video"
@@ -28,6 +31,7 @@ VIDEO_BUCKET = "compliance-training-video"
# Initialize services # Initialize services
storage = StorageClient(MINIO_ENDPOINT, MINIO_ACCESS_KEY, MINIO_SECRET_KEY, secure=MINIO_SECURE) storage = StorageClient(MINIO_ENDPOINT, MINIO_ACCESS_KEY, MINIO_SECRET_KEY, secure=MINIO_SECURE)
tts = PiperTTS(PIPER_MODEL_PATH) tts = PiperTTS(PIPER_MODEL_PATH)
tts_en = PiperTTS(PIPER_MODEL_EN_PATH) if os.path.exists(PIPER_MODEL_EN_PATH) else None
@app.on_event("startup") @app.on_event("startup")
@@ -104,16 +108,108 @@ async def health():
@app.get("/voices") @app.get("/voices")
async def list_voices(): async def list_voices():
"""List available TTS voices.""" """List available TTS voices."""
return { voices = [
"voices": [ VoiceInfo(
VoiceInfo( id="de_DE-thorsten-high",
id="de_DE-thorsten-high", language="de",
language="de", name="Thorsten (High Quality)",
name="Thorsten (High Quality)", quality="high",
quality="high", ),
), ]
], if tts_en is not None:
} voices.append(VoiceInfo(
id="en_US-lessac-high",
language="en",
name="Lessac (High Quality)",
quality="high",
))
return {"voices": voices}
class SynthesizeDirectRequest(BaseModel):
    """Request body for ``POST /synthesize-direct``."""

    # Text to be spoken; the endpoint rejects whitespace-only values with 400.
    text: str
    # Language code used for the cache key and voice selection; German by default.
    language: str = "de"
# Disk cache for synthesized audio so identical text is not synthesized again.
# The location can be overridden through the TTS_CACHE_DIR environment
# variable (same os.getenv pattern as the other settings in this file); the
# default path is unchanged.
TTS_CACHE_DIR = os.getenv("TTS_CACHE_DIR", "/tmp/tts-cache")
os.makedirs(TTS_CACHE_DIR, exist_ok=True)

# Edge TTS voice name for each supported language code.
EDGE_TTS_VOICES = {
    "de": "de-DE-ConradNeural",
    "en": "en-US-GuyNeural",
}
async def _edge_tts_synthesize(text: str, language: str, output_path: str) -> bool:
    """Synthesize ``text`` with Edge TTS (Microsoft Neural Voices).

    The audio is first written to a temporary sibling of ``output_path`` and
    only renamed onto ``output_path`` once the download finished and produced
    data, so a failed or cancelled request never leaves a truncated file at
    the destination.

    Args:
        text: Text to speak.
        language: Key into ``EDGE_TTS_VOICES``; unknown languages use the
            German voice.
        output_path: Destination file for the synthesized audio.

    Returns:
        True if ``output_path`` now holds the synthesized audio, False on any
        failure (the error is logged and the caller is expected to fall back).
    """
    # Sibling path keeps the final rename on the same filesystem.
    partial_path = f"{output_path}.{uuid.uuid4().hex}.part"
    try:
        # Imported lazily so a missing package only disables this engine.
        import edge_tts

        voice = EDGE_TTS_VOICES.get(language, EDGE_TTS_VOICES["de"])
        communicate = edge_tts.Communicate(text, voice)
        await communicate.save(partial_path)
        if os.path.getsize(partial_path) == 0:
            raise RuntimeError("Edge TTS returned no audio data")
        os.replace(partial_path, output_path)
        return True
    except Exception as e:
        logger.warning(f"Edge TTS failed, falling back to Piper: {e}")
        return False
    finally:
        # Runs on cancellation too; a no-op after a successful rename.
        try:
            os.remove(partial_path)
        except FileNotFoundError:
            pass
@app.post("/synthesize-direct")
async def synthesize_direct(req: SynthesizeDirectRequest):
    """Synthesize text and return MP3 audio directly (no MinIO upload).

    Uses Edge TTS (Microsoft Neural Voices) for high-quality speech and falls
    back to Piper TTS if Edge TTS is unavailable (e.g. no internet). Results
    are cached on disk so identical text/language pairs are synthesized once.

    Audio is produced in a per-request staging file and renamed onto the cache
    path only when complete, so neither a concurrent request nor a later cache
    hit can be served a partially written file.

    Response headers:
        X-TTS-Cache: "hit" or "miss".
        X-TTS-Engine: "edge" or "piper" (only on a cache miss).

    Raises:
        HTTPException: 400 if the text is empty or whitespace only, 500 if the
            Piper fallback fails as well.
    """
    import shutil

    if not req.text.strip():
        raise HTTPException(status_code=400, detail="Text is empty")

    # Cache key based on text + language hash
    text_hash = hashlib.sha256(f"{req.language}:{req.text}".encode()).hexdigest()[:16]
    cache_path = os.path.join(TTS_CACHE_DIR, f"{text_hash}.mp3")

    if os.path.exists(cache_path):
        logger.info(f"TTS cache hit: {text_hash}")
        return FileResponse(
            cache_path,
            media_type="audio/mpeg",
            headers={"X-TTS-Cache": "hit"},
        )

    # Unique staging file next to the cache entry (same filesystem, so
    # os.replace is an atomic rename).
    staging_path = f"{cache_path}.{uuid.uuid4().hex}.part"
    try:
        # Try Edge TTS first (high quality neural voices)
        success = await _edge_tts_synthesize(req.text, req.language, staging_path)
        if success and os.path.exists(staging_path) and os.path.getsize(staging_path) > 0:
            size = os.path.getsize(staging_path)
            os.replace(staging_path, cache_path)
            logger.info(f"Edge TTS ({req.language}): {len(req.text)} chars, {size} bytes, cached as {text_hash}")
            return FileResponse(
                cache_path,
                media_type="audio/mpeg",
                headers={"X-TTS-Cache": "miss", "X-TTS-Engine": "edge"},
            )

        # Fallback: Piper TTS
        engine = tts
        if req.language == "en" and tts_en is not None:
            engine = tts_en

        with tempfile.TemporaryDirectory() as tmpdir:
            try:
                # NOTE(review): this call runs synchronously inside the
                # coroutine; if it is slow it stalls the event loop. Consider
                # offloading it once PiperTTS thread-safety is confirmed.
                mp3_path, duration = engine.synthesize_to_mp3(req.text, tmpdir)
                shutil.copy2(mp3_path, staging_path)
                os.replace(staging_path, cache_path)
                logger.info(f"Piper TTS ({req.language}): {len(req.text)} chars, {duration:.1f}s, cached as {text_hash}")
            except Exception as e:
                logger.error(f"TTS synthesis failed: {e}")
                raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        # Drop any leftover staging data (failed Edge download, failed copy);
        # after a successful rename the file no longer exists.
        try:
            os.remove(staging_path)
        except FileNotFoundError:
            pass

    return FileResponse(
        cache_path,
        media_type="audio/mpeg",
        headers={"X-TTS-Cache": "miss", "X-TTS-Engine": "piper"},
    )
@app.post("/presigned-url", response_model=PresignedURLResponse) @app.post("/presigned-url", response_model=PresignedURLResponse)

View File

@@ -3,3 +3,4 @@ uvicorn[standard]==0.27.1
boto3==1.34.25 boto3==1.34.25
python-multipart==0.0.6 python-multipart==0.0.6
pydantic==2.6.1 pydantic==2.6.1
edge-tts==6.1.12