diff --git a/backend-lehrer/audio_service.py b/backend-lehrer/audio_service.py
new file mode 100644
index 0000000..d1ee9ca
--- /dev/null
+++ b/backend-lehrer/audio_service.py
@@ -0,0 +1,171 @@
+"""
+Audio Service — Generates TTS audio for vocabulary words.
+
+Uses the Piper TTS service (compliance-tts-service, MIT license)
+for high-quality German (Thorsten) and English (Lessac) voices.
+Returns no audio (None) if the TTS service is unavailable.
+
+Audio files are cached — generated once, served forever.
+"""
+
+import hashlib
+import logging
+import os
+import tempfile
+from typing import Optional
+
+import httpx
+
+logger = logging.getLogger(__name__)
+
+# Piper TTS service (runs in compliance stack)
+TTS_SERVICE_URL = os.getenv("TTS_SERVICE_URL", "http://bp-compliance-tts:8095")
+
+# Local cache directory for generated audio
+AUDIO_CACHE_DIR = os.path.expanduser("~/Arbeitsblaetter/audio-cache")
+
+
+def _ensure_cache_dir():
+    """Create the audio cache directory if it does not exist yet."""
+    os.makedirs(AUDIO_CACHE_DIR, exist_ok=True)
+
+
+def _cache_key(text: str, language: str) -> str:
+    """Generate a deterministic cache key for text + language."""
+    h = hashlib.sha256(f"{language}:{text}".encode()).hexdigest()[:16]
+    return f"{language}_{h}"
+
+
+def _cache_path(text: str, language: str) -> str:
+    """Full path to cached MP3 file (creates the cache directory if needed)."""
+    _ensure_cache_dir()
+    return os.path.join(AUDIO_CACHE_DIR, f"{_cache_key(text, language)}.mp3")
+
+
+def _voice_for(language: str) -> str:
+    """Piper voice name: Thorsten for "de", Lessac for everything else."""
+    return "thorsten-high" if language == "de" else "lessac-high"
+
+
+def _is_cached(path: str) -> bool:
+    """Return True only if a non-empty file exists at path."""
+    try:
+        return os.path.getsize(path) > 0
+    except OSError:
+        return False
+
+
+def _store_audio(path: str, audio: bytes) -> bool:
+    """
+    Atomically write audio bytes to the cache file at path.
+
+    Returns False without touching the cache when audio is empty, so an
+    empty upstream response is never cached as if it were a valid MP3.
+    The data goes to a temp file in the same directory and is then renamed
+    over the target, so readers never see a partially written file.
+    """
+    if not audio:
+        return False
+
+    fd, tmp_path = tempfile.mkstemp(dir=os.path.dirname(path), suffix=".tmp")
+    try:
+        with os.fdopen(fd, "wb") as f:
+            f.write(audio)
+        os.replace(tmp_path, path)
+    except OSError:
+        # Do not leave temp files behind in the cache directory
+        if os.path.exists(tmp_path):
+            os.unlink(tmp_path)
+        raise
+    return True
+
+
+async def synthesize_word(
+    text: str,
+    language: str = "de",
+    word_id: str = "",
+) -> Optional[str]:
+    """
+    Generate TTS audio for a word or short phrase.
+
+    Returns the file path to the cached MP3, or None on error.
+    Uses Piper TTS service (compliance-tts-service).
+
+    POST /synthesize is tried first: a non-200 answer returns None right
+    away; if the call raises or yields no downloadable audio, the direct
+    POST /synthesize/mp3 endpoint is tried as a fallback.
+    """
+    # Check cache first (an empty file does not count as a hit)
+    cached = _cache_path(text, language)
+    if _is_cached(cached):
+        return cached
+
+    payload = {
+        "text": text,
+        "language": language,
+        "voice": _voice_for(language),
+        "module_id": "vocabulary",
+    }
+
+    # Call Piper TTS service
+    try:
+        async with httpx.AsyncClient(timeout=30.0) as client:
+            resp = await client.post(
+                f"{TTS_SERVICE_URL}/synthesize",
+                json={
+                    **payload,
+                    "content_id": word_id or _cache_key(text, language),
+                },
+            )
+            if resp.status_code != 200:
+                logger.warning(f"TTS service returned {resp.status_code} for '{text}'")
+                return None
+
+            data = resp.json()
+            audio_url = data.get("audio_url") or data.get("presigned_url")
+
+            if audio_url:
+                # Download the audio file
+                audio_resp = await client.get(audio_url)
+                downloaded = audio_resp.status_code == 200
+                if downloaded and _store_audio(cached, audio_resp.content):
+                    logger.info(f"TTS cached: '{text}' ({language}) → {cached}")
+                    return cached
+
+    except Exception as e:
+        logger.warning(f"TTS service unavailable: {e}")
+
+    # Fallback: try direct MP3 endpoint
+    try:
+        async with httpx.AsyncClient(timeout=30.0) as client:
+            resp = await client.post(
+                f"{TTS_SERVICE_URL}/synthesize/mp3",
+                json=payload,
+            )
+            is_audio = resp.headers.get("content-type", "").startswith("audio")
+            if resp.status_code == 200 and is_audio:
+                if _store_audio(cached, resp.content):
+                    logger.info(f"TTS cached (direct): '{text}' ({language}) → {cached}")
+                    return cached
+    except Exception as e:
+        logger.debug(f"TTS direct fallback also failed: {e}")
+
+    return None
+
+
+async def get_or_generate_audio(
+    text: str, language: str = "de", word_id: str = "",
+) -> Optional[bytes]:
+    """
+    Get audio bytes for a word. Returns MP3 bytes or None.
+    Generates via TTS if not cached.
+    """
+    path = await synthesize_word(text, language, word_id)
+    if not path:
+        return None
+    try:
+        with open(path, "rb") as f:
+            return f.read()
+    except FileNotFoundError:
+        # Cache file vanished between synthesis and read
+        return None
diff --git a/backend-lehrer/vocabulary_api.py b/backend-lehrer/vocabulary_api.py
index e4a3f79..cf9358c 100644
--- a/backend-lehrer/vocabulary_api.py
+++ b/backend-lehrer/vocabulary_api.py
@@ -99,6 +99,93 @@ async def api_get_filters():
     }
 
 
+# ---------------------------------------------------------------------------
+# Audio TTS for Words
+# ---------------------------------------------------------------------------
+
+# Languages for which both a word text and a Piper voice are available
+_AUDIO_LANGUAGES = ("de", "en")
+
+
+def _require_audio_language(lang: str) -> str:
+    """Return the lower-cased language code; raise HTTP 400 if unsupported."""
+    normalized = lang.lower()
+    if normalized not in _AUDIO_LANGUAGES:
+        raise HTTPException(
+            status_code=400,
+            detail=f"Sprache '{lang}' wird nicht unterstuetzt",
+        )
+    return normalized
+
+
+@router.get("/word/{word_id}/audio/{lang}")
+async def api_get_word_audio(word_id: str, lang: str = "en"):
+    """Get or generate TTS audio for a vocabulary word.
+
+    Returns MP3 audio. Generated on first request, cached after.
+    Uses Piper TTS (MIT license) with Thorsten (DE) and Lessac (EN) voices.
+
+    Raises 400 for an unsupported language or a word without text in that
+    language, 404 for an unknown word, 503 if no audio could be produced.
+    """
+    from fastapi.responses import Response as FastAPIResponse
+
+    lang = _require_audio_language(lang)
+
+    word = await get_word(word_id)
+    if not word:
+        raise HTTPException(status_code=404, detail="Wort nicht gefunden")
+
+    text = word.english if lang == "en" else word.german
+    if not text:
+        raise HTTPException(status_code=400, detail=f"Kein Text fuer Sprache '{lang}'")
+
+    from audio_service import get_or_generate_audio
+    audio_bytes = await get_or_generate_audio(text, language=lang, word_id=word_id)
+
+    if not audio_bytes:
+        raise HTTPException(status_code=503, detail="TTS Service nicht verfuegbar")
+
+    return FastAPIResponse(content=audio_bytes, media_type="audio/mpeg")
+
+
+@router.get("/word/{word_id}/audio-syllables/{lang}")
+async def api_get_syllable_audio(word_id: str, lang: str = "en"):
+    """Get TTS audio with slow syllable pronunciation.
+
+    Generates audio like "ap ... ple" with pauses between syllables.
+    Falls back to the whole word when no syllables are stored.
+    """
+    from fastapi.responses import Response as FastAPIResponse
+
+    lang = _require_audio_language(lang)
+
+    word = await get_word(word_id)
+    if not word:
+        raise HTTPException(status_code=404, detail="Wort nicht gefunden")
+
+    syllables = word.syllables_en if lang == "en" else word.syllables_de
+    if not syllables:
+        # Fallback to full word
+        text = word.english if lang == "en" else word.german
+        if not text:
+            # Without this check " ... ".join([None]) raises TypeError
+            raise HTTPException(status_code=400, detail=f"Kein Text fuer Sprache '{lang}'")
+        syllables = [text]
+
+    # Join syllables with pauses (Piper handles "..." as pause)
+    slow_text = " ... ".join(syllables)
+
+    from audio_service import get_or_generate_audio
+    cache_key = f"{word_id}_syl_{lang}"
+    audio_bytes = await get_or_generate_audio(slow_text, language=lang, word_id=cache_key)
+
+    if not audio_bytes:
+        raise HTTPException(status_code=503, detail="TTS Service nicht verfuegbar")
+
+    return FastAPIResponse(content=audio_bytes, media_type="audio/mpeg")
+
+
 # ---------------------------------------------------------------------------
 # Learning Unit Creation from Word Selection
 # ---------------------------------------------------------------------------