"""Compliance TTS Service — Piper TTS + FFmpeg Audio/Video Pipeline.""" import hashlib import logging import os import tempfile import uuid from fastapi import FastAPI, HTTPException from fastapi.responses import FileResponse, Response from pydantic import BaseModel from storage import StorageClient from tts_engine import PiperTTS logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) app = FastAPI(title="Compliance TTS Service", version="1.0.0") # Configuration MINIO_ENDPOINT = os.getenv("MINIO_ENDPOINT", "bp-core-minio:9000") MINIO_ACCESS_KEY = os.getenv("MINIO_ACCESS_KEY", "breakpilot") MINIO_SECRET_KEY = os.getenv("MINIO_SECRET_KEY", "breakpilot123") MINIO_SECURE = os.getenv("MINIO_SECURE", "false").lower() == "true" PIPER_MODEL_PATH = os.getenv("PIPER_MODEL_PATH", "/app/models/de_DE-thorsten-high.onnx") PIPER_MODEL_EN_PATH = os.getenv("PIPER_MODEL_EN_PATH", "/app/models/en_US-lessac-high.onnx") AUDIO_BUCKET = "compliance-training-audio" VIDEO_BUCKET = "compliance-training-video" # Initialize services storage = StorageClient(MINIO_ENDPOINT, MINIO_ACCESS_KEY, MINIO_SECRET_KEY, secure=MINIO_SECURE) tts = PiperTTS(PIPER_MODEL_PATH) tts_en = PiperTTS(PIPER_MODEL_EN_PATH) if os.path.exists(PIPER_MODEL_EN_PATH) else None @app.on_event("startup") async def startup(): """Ensure buckets exist on startup.""" storage.ensure_bucket(AUDIO_BUCKET) storage.ensure_bucket(VIDEO_BUCKET) logger.info("TTS Service started") # --- Models --- class SynthesizeRequest(BaseModel): text: str language: str = "de" voice: str = "thorsten-high" module_id: str content_id: str | None = None class SynthesizeResponse(BaseModel): audio_id: str bucket: str object_key: str duration_seconds: float size_bytes: int class GenerateVideoRequest(BaseModel): script: dict audio_object_key: str module_id: str class GenerateVideoResponse(BaseModel): video_id: str bucket: str object_key: str duration_seconds: float size_bytes: int class PresignedURLRequest(BaseModel): bucket: str object_key: str expires: int = 3600 class PresignedURLResponse(BaseModel): url: str expires_in: int class VoiceInfo(BaseModel): id: str language: str name: str quality: str # --- Endpoints --- @app.get("/health") async def health(): """Health check endpoint.""" return { "status": "healthy", "piper_available": tts.is_available, "ffmpeg_available": _check_ffmpeg(), "minio_connected": storage.is_connected(), } @app.get("/voices") async def list_voices(): """List available TTS voices.""" voices = [ VoiceInfo( id="de_DE-thorsten-high", language="de", name="Thorsten (High Quality)", quality="high", ), ] if tts_en is not None: voices.append(VoiceInfo( id="en_US-lessac-high", language="en", name="Lessac (High Quality)", quality="high", )) return {"voices": voices} class SynthesizeDirectRequest(BaseModel): text: str language: str = "de" # Simple disk cache for synthesized audio (avoids re-synthesis of same text) TTS_CACHE_DIR = "/tmp/tts-cache" os.makedirs(TTS_CACHE_DIR, exist_ok=True) EDGE_TTS_VOICES = { "de": "de-DE-ConradNeural", "en": "en-US-GuyNeural", } # Matches word markers used to force English pronunciation in German text _EN_TAG_RE = re.compile(r'(.*?)', re.DOTALL) async def _edge_tts_synthesize(text: str, language: str, output_path: str) -> bool: """Synthesize using Edge TTS. Handles … mixed-language markers.""" try: import edge_tts if '' in text: return await _edge_tts_mixed(text, language, output_path) voice = EDGE_TTS_VOICES.get(language, EDGE_TTS_VOICES["de"]) communicate = edge_tts.Communicate(text, voice) await communicate.save(output_path) return True except Exception as e: logger.warning(f"Edge TTS failed, falling back to Piper: {e}") return False async def _edge_tts_mixed(text: str, base_lang: str, output_path: str) -> bool: """Split on tags, synthesise each segment with the right voice, concat.""" import edge_tts, shutil, tempfile as tf segments: list[tuple[str, str]] = [] last = 0 for m in _EN_TAG_RE.finditer(text): if m.start() > last: segments.append((base_lang, text[last:m.start()].strip())) segments.append(("en", m.group(1).strip())) last = m.end() if last < len(text): segments.append((base_lang, text[last:].strip())) segments = [(lang, t) for lang, t in segments if t] tmpdir = tf.mkdtemp() try: seg_files = [] for i, (lang, seg_text) in enumerate(segments): voice = EDGE_TTS_VOICES.get(lang, EDGE_TTS_VOICES["de"]) seg_path = os.path.join(tmpdir, f"seg_{i:04d}.mp3") await edge_tts.Communicate(seg_text, voice).save(seg_path) seg_files.append(seg_path) list_file = os.path.join(tmpdir, "list.txt") with open(list_file, "w") as f: for sf in seg_files: f.write(f"file '{sf}'\n") proc = subprocess.run( ["ffmpeg", "-y", "-f", "concat", "-safe", "0", "-i", list_file, "-c", "copy", output_path], capture_output=True, text=True, timeout=60, ) return proc.returncode == 0 except Exception as e: logger.warning(f"Mixed TTS failed: {e}") return False finally: shutil.rmtree(tmpdir, ignore_errors=True) @app.post("/synthesize-direct") async def synthesize_direct(req: SynthesizeDirectRequest): """Synthesize text and return MP3 audio directly (no MinIO upload). Uses Edge TTS (Microsoft Neural Voices) for high-quality speech. Falls back to Piper TTS if Edge TTS is unavailable (e.g. no internet). Includes disk caching so identical text is only synthesized once. """ if not req.text.strip(): raise HTTPException(status_code=400, detail="Text is empty") # Cache key based on text + language hash text_hash = hashlib.sha256(f"{req.language}:{req.text}".encode()).hexdigest()[:16] cache_path = os.path.join(TTS_CACHE_DIR, f"{text_hash}.mp3") if os.path.exists(cache_path): logger.info(f"TTS cache hit: {text_hash}") return FileResponse( cache_path, media_type="audio/mpeg", headers={"X-TTS-Cache": "hit"}, ) # Try Edge TTS first (high quality neural voices) success = await _edge_tts_synthesize(req.text, req.language, cache_path) if success and os.path.exists(cache_path): size = os.path.getsize(cache_path) logger.info(f"Edge TTS ({req.language}): {len(req.text)} chars, {size} bytes, cached as {text_hash}") return FileResponse( cache_path, media_type="audio/mpeg", headers={"X-TTS-Cache": "miss", "X-TTS-Engine": "edge"}, ) # Fallback: Piper TTS engine = tts if req.language == "en" and tts_en is not None: engine = tts_en with tempfile.TemporaryDirectory() as tmpdir: try: mp3_path, duration = engine.synthesize_to_mp3(req.text, tmpdir) import shutil shutil.copy2(mp3_path, cache_path) logger.info(f"Piper TTS ({req.language}): {len(req.text)} chars, {duration:.1f}s, cached as {text_hash}") except Exception as e: logger.error(f"TTS synthesis failed: {e}") raise HTTPException(status_code=500, detail=str(e)) return FileResponse( cache_path, media_type="audio/mpeg", headers={"X-TTS-Cache": "miss", "X-TTS-Engine": "piper"}, ) @app.post("/presigned-url", response_model=PresignedURLResponse) async def get_presigned_url(req: PresignedURLRequest): """Generate a presigned URL for accessing a stored media file.""" try: url = storage.get_presigned_url(req.bucket, req.object_key, req.expires) return PresignedURLResponse(url=url, expires_in=req.expires) except Exception as e: logger.error(f"Presigned URL generation failed: {e}") raise HTTPException(status_code=500, detail=str(e)) @app.post("/synthesize", response_model=SynthesizeResponse) async def synthesize(req: SynthesizeRequest): """Synthesize text to audio and upload to storage.""" if not req.text.strip(): raise HTTPException(status_code=400, detail="Text is empty") audio_id = str(uuid.uuid4()) content_suffix = req.content_id or "full" object_key = f"audio/{req.module_id}/{content_suffix}.mp3" with tempfile.TemporaryDirectory() as tmpdir: try: mp3_path, duration = tts.synthesize_to_mp3(req.text, tmpdir) size_bytes = storage.upload_file(AUDIO_BUCKET, object_key, mp3_path, "audio/mpeg") except Exception as e: logger.error(f"Synthesis failed: {e}") raise HTTPException(status_code=500, detail=str(e)) return SynthesizeResponse( audio_id=audio_id, bucket=AUDIO_BUCKET, object_key=object_key, duration_seconds=round(duration, 2), size_bytes=size_bytes, ) class SynthesizeSectionsRequest(BaseModel): sections: list[dict] # [{text, heading}] voice: str = "de_DE-thorsten-high" module_id: str = "" class SynthesizeSectionsResponse(BaseModel): sections: list[dict] total_duration: float class GenerateInteractiveVideoRequest(BaseModel): script: dict audio: dict # SynthesizeSectionsResponse module_id: str class GenerateInteractiveVideoResponse(BaseModel): video_id: str bucket: str object_key: str duration_seconds: float size_bytes: int @app.post("/synthesize-sections", response_model=SynthesizeSectionsResponse) async def synthesize_sections(req: SynthesizeSectionsRequest): """Synthesize audio for multiple sections, returning per-section timing.""" if not req.sections: raise HTTPException(status_code=400, detail="No sections provided") results = [] cumulative = 0.0 with tempfile.TemporaryDirectory() as tmpdir: for i, section in enumerate(req.sections): text = section.get("text", "") heading = section.get("heading", f"Section {i+1}") if not text.strip(): results.append({ "heading": heading, "audio_path": "", "audio_object_key": "", "duration": 0.0, "start_timestamp": cumulative, }) continue try: mp3_path, duration = tts.synthesize_to_mp3(text, tmpdir, suffix=f"_section_{i}") object_key = f"audio/{req.module_id}/section_{i}.mp3" storage.upload_file(AUDIO_BUCKET, object_key, mp3_path, "audio/mpeg") results.append({ "heading": heading, "audio_path": mp3_path, "audio_object_key": object_key, "duration": round(duration, 2), "start_timestamp": round(cumulative, 2), }) cumulative += duration except Exception as e: logger.error(f"Section {i} synthesis failed: {e}") raise HTTPException(status_code=500, detail=f"Section {i} synthesis failed: {e}") return SynthesizeSectionsResponse( sections=results, total_duration=round(cumulative, 2), ) @app.post("/generate-interactive-video", response_model=GenerateInteractiveVideoResponse) async def generate_interactive_video(req: GenerateInteractiveVideoRequest): """Generate an interactive presentation video with checkpoint slides.""" try: from video_generator import generate_interactive_presentation_video except ImportError: raise HTTPException(status_code=501, detail="Interactive video generation not available") video_id = str(uuid.uuid4()) object_key = f"video/{req.module_id}/interactive.mp4" with tempfile.TemporaryDirectory() as tmpdir: try: mp4_path, duration = generate_interactive_presentation_video( script=req.script, audio_sections=req.audio.get("sections", []), output_dir=tmpdir, storage=storage, audio_bucket=AUDIO_BUCKET, ) size_bytes = storage.upload_file(VIDEO_BUCKET, object_key, mp4_path, "video/mp4") except Exception as e: logger.error(f"Interactive video generation failed: {e}") raise HTTPException(status_code=500, detail=str(e)) return GenerateInteractiveVideoResponse( video_id=video_id, bucket=VIDEO_BUCKET, object_key=object_key, duration_seconds=round(duration, 2), size_bytes=size_bytes, ) @app.post("/generate-video", response_model=GenerateVideoResponse) async def generate_video(req: GenerateVideoRequest): """Generate a presentation video from slides + audio.""" try: from video_generator import generate_presentation_video except ImportError: raise HTTPException(status_code=501, detail="Video generation not available yet") video_id = str(uuid.uuid4()) object_key = f"video/{req.module_id}/presentation.mp4" with tempfile.TemporaryDirectory() as tmpdir: try: mp4_path, duration = generate_presentation_video( script=req.script, audio_object_key=req.audio_object_key, output_dir=tmpdir, storage=storage, audio_bucket=AUDIO_BUCKET, ) size_bytes = storage.upload_file(VIDEO_BUCKET, object_key, mp4_path, "video/mp4") except Exception as e: logger.error(f"Video generation failed: {e}") raise HTTPException(status_code=500, detail=str(e)) return GenerateVideoResponse( video_id=video_id, bucket=VIDEO_BUCKET, object_key=object_key, duration_seconds=round(duration, 2), size_bytes=size_bytes, ) def _check_ffmpeg() -> bool: """Check if ffmpeg is available.""" import subprocess try: subprocess.run(["ffmpeg", "-version"], capture_output=True, timeout=5) return True except Exception: return False