Add Piper TTS audio integration for vocabulary words

audio_service.py: Connects to compliance-tts-service (Piper TTS, MIT license) for high-quality German (Thorsten) and English (Lessac) voices. Audio cached as MP3 on first request. vocabulary_api.py: New endpoints: - GET /vocabulary/word/{id}/audio/{lang} — word pronunciation - GET /vocabulary/word/{id}/audio-syllables/{lang} — slow syllable-by-syllable Anton App analysis: identified 5 features to adopt (star system, games as rewards, progress bars, listening exercises, matching exercises). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-25 15:40:01 +02:00
parent 0ff5399a62
commit dc60233262
2 changed files with 187 additions and 0 deletions
@@ -0,0 +1,125 @@
 """
 Audio Service — Generates TTS audio for vocabulary words.
 Uses the Piper TTS service (compliance-tts-service, MIT license)
 for high-quality German (Thorsten) and English (Lessac) voices.
 Falls back to the direct /synthesize/mp3 endpoint; returns None if the TTS service is unavailable.
 Audio files are cached — generated once, served forever.
 """
 import hashlib
 import logging
 import os
 from typing import Optional
 import httpx
 # Module-level logger named after this module.
 logger = logging.getLogger(__name__)
 # Base URL of the Piper TTS service (runs in the compliance stack);
 # can be overridden through the TTS_SERVICE_URL environment variable.
 TTS_SERVICE_URL = os.environ.get("TTS_SERVICE_URL", "http://bp-compliance-tts:8095")
 # Directory below the user's home where generated audio files are cached.
 AUDIO_CACHE_DIR = os.path.expanduser("~/Arbeitsblaetter/audio-cache")
 def _ensure_cache_dir():
    """Create the audio cache directory (including parents) when missing."""
    if os.path.isdir(AUDIO_CACHE_DIR):
        return
    os.makedirs(AUDIO_CACHE_DIR, exist_ok=True)
 def _cache_key(text: str, language: str) -> str:
    """Generate a deterministic cache key for text + language."""
    h = hashlib.sha256(f"{language}:{text}".encode()).hexdigest()[:16]
    return f"{language}_{h}"
 def _cache_path(text: str, language: str) -> str:
    """Return the full path of the cached MP3 for a text/language pair.

    Makes sure the cache directory exists before building the path.
    """
    _ensure_cache_dir()
    filename = _cache_key(text, language) + ".mp3"
    return os.path.join(AUDIO_CACHE_DIR, filename)
 def _voice_for(language: str) -> str:
    """Return the Piper voice name requested for *language*.

    "de" maps to the Thorsten voice; every other code maps to Lessac.
    """
    return "thorsten-high" if language == "de" else "lessac-high"

 def _has_cached_audio(path: str) -> bool:
    """Return True when *path* exists and is non-empty.

    A zero-byte file is the residue of a failed write, not playable audio,
    so it is treated as a cache miss and regenerated.
    """
    try:
        return os.path.getsize(path) > 0
    except OSError:
        return False

 def _store_audio(target: str, payload: bytes) -> bool:
    """Write *payload* to *target* atomically.

    Returns False without touching the cache when *payload* is empty.
    The data is written to a temporary sibling file first and then moved
    into place with os.replace, so a reader never sees a partial MP3.

    Raises:
        OSError: if the file cannot be written or moved into place.
    """
    if not payload:
        return False
    tmp_path = f"{target}.{os.getpid()}.tmp"
    try:
        with open(tmp_path, "wb") as handle:
            handle.write(payload)
        os.replace(tmp_path, target)
    except OSError:
        # Do not leave a stray temp file behind; the caller logs the error.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise
    return True

 async def synthesize_word(
    text: str,
    language: str = "de",
    word_id: str = "",
 ) -> Optional[str]:
    """
    Generate TTS audio for a word or short phrase.

    Looks in the local cache first. On a miss it asks the TTS service's
    /synthesize endpoint for an audio URL and downloads it; if that yields
    no audio it tries the /synthesize/mp3 endpoint, which returns the audio
    directly.

    Args:
        text: Word or phrase to speak.
        language: Language code; "de" selects the Thorsten voice, anything
            else the Lessac voice.
        word_id: Optional content id sent to the service; the cache key is
            used when it is empty.

    Returns:
        Path to the cached MP3 file, or None if no audio could be obtained.
    """
    cached = _cache_path(text, language)
    # Only non-empty files count as cached audio (see _has_cached_audio).
    if _has_cached_audio(cached):
        return cached
    voice = _voice_for(language)
    # Primary path: service returns JSON with a URL to the generated audio.
    try:
        async with httpx.AsyncClient(timeout=30.0) as client:
            resp = await client.post(
                f"{TTS_SERVICE_URL}/synthesize",
                json={
                    "text": text,
                    "language": language,
                    "voice": voice,
                    "module_id": "vocabulary",
                    "content_id": word_id or _cache_key(text, language),
                },
            )
            if resp.status_code != 200:
                logger.warning(f"TTS service returned {resp.status_code} for '{text}'")
                return None
            data = resp.json()
            audio_url = data.get("audio_url") or data.get("presigned_url")
            if audio_url:
                audio_resp = await client.get(audio_url)
                # An empty body is rejected by _store_audio so it is never cached.
                if audio_resp.status_code == 200 and _store_audio(cached, audio_resp.content):
                    logger.info(f"TTS cached: '{text}' ({language}) → {cached}")
                    return cached
    except Exception as e:
        # Best effort: any failure here falls through to the direct endpoint.
        logger.warning(f"TTS service unavailable: {e}")
    # Fallback: try direct MP3 endpoint
    try:
        async with httpx.AsyncClient(timeout=30.0) as client:
            resp = await client.post(
                f"{TTS_SERVICE_URL}/synthesize/mp3",
                json={
                    "text": text,
                    "language": language,
                    "voice": voice,
                    "module_id": "vocabulary",
                },
            )
            is_audio = resp.headers.get("content-type", "").startswith("audio")
            if resp.status_code == 200 and is_audio and _store_audio(cached, resp.content):
                logger.info(f"TTS cached (direct): '{text}' ({language}) → {cached}")
                return cached
    except Exception as e:
        logger.debug(f"TTS direct fallback also failed: {e}")
    return None
 async def get_or_generate_audio(
    text: str, language: str = "de", word_id: str = "",
 ) -> Optional[bytes]:
    """
    Return the MP3 bytes for a word, or None when no audio is available.

    Delegates to synthesize_word, which serves from the cache or generates
    the audio via TTS, then reads the resulting file.
    """
    audio_file = await synthesize_word(text, language, word_id)
    if not audio_file or not os.path.exists(audio_file):
        return None
    with open(audio_file, "rb") as handle:
        return handle.read()
@@ -99,6 +99,68 @@ async def api_get_filters():
    }
 # ---------------------------------------------------------------------------
 # Audio TTS for Words
 # ---------------------------------------------------------------------------
@router.get("/word/{word_id}/audio/{lang}")
async def api_get_word_audio(word_id: str, lang: str = "en"):
    """Get or generate TTS audio for a vocabulary word.

    Returns MP3 audio. Generated on first request, cached after.

    Args:
        word_id: Identifier of the vocabulary word.
        lang: Language code, "en" or "de" (case-insensitive).

    Raises:
        HTTPException: 400 for an unsupported language or when the word has
            no text in that language; 404 when the word is not found;
            503 when no audio could be produced.
    """
    from fastapi.responses import Response as FastAPIResponse
    from audio_service import get_or_generate_audio

    # Only English and German are handled below. Without this check any
    # other code would pair the German text with the non-German voice and
    # create a cache entry per arbitrary path value.
    language = lang.lower()
    if language not in ("en", "de"):
        raise HTTPException(status_code=400, detail=f"Sprache '{lang}' wird nicht unterstuetzt")

    word = await get_word(word_id)
    if not word:
        raise HTTPException(status_code=404, detail="Wort nicht gefunden")

    text = word.english if language == "en" else word.german
    if not text:
        raise HTTPException(status_code=400, detail=f"Kein Text fuer Sprache '{lang}'")

    audio_bytes = await get_or_generate_audio(text, language=language, word_id=word_id)
    if not audio_bytes:
        raise HTTPException(status_code=503, detail="TTS Service nicht verfuegbar")
    return FastAPIResponse(content=audio_bytes, media_type="audio/mpeg")
@router.get("/word/{word_id}/audio-syllables/{lang}")
async def api_get_syllable_audio(word_id: str, lang: str = "en"):
    """Get TTS audio with slow syllable pronunciation.

    Joins the word's syllables with " ... " (e.g. "ap ... ple") and sends
    that text to TTS. Falls back to the full word when no syllables are
    stored.

    Args:
        word_id: Identifier of the vocabulary word.
        lang: Language code, "en" or "de" (case-insensitive).

    Raises:
        HTTPException: 400 for an unsupported language or when the word has
            neither syllables nor text in that language; 404 when the word
            is not found; 503 when no audio could be produced.
    """
    from fastapi.responses import Response as FastAPIResponse
    from audio_service import get_or_generate_audio

    # Only English and German are handled below; reject anything else
    # instead of silently mixing German text with the non-German voice.
    language = lang.lower()
    if language not in ("en", "de"):
        raise HTTPException(status_code=400, detail=f"Sprache '{lang}' wird nicht unterstuetzt")

    word = await get_word(word_id)
    if not word:
        raise HTTPException(status_code=404, detail="Wort nicht gefunden")

    syllables = word.syllables_en if language == "en" else word.syllables_de
    if not syllables:
        # Fallback to full word
        text = word.english if language == "en" else word.german
        if not text:
            # Joining a missing text would raise TypeError; report it as a
            # client error, matching the plain word-audio endpoint.
            raise HTTPException(status_code=400, detail=f"Kein Text fuer Sprache '{lang}'")
        syllables = [text]

    # "..." between syllables is intended to make the TTS engine pause.
    slow_text = " ... ".join(syllables)
    cache_key = f"{word_id}_syl_{language}"
    audio_bytes = await get_or_generate_audio(slow_text, language=language, word_id=cache_key)
    if not audio_bytes:
        raise HTTPException(status_code=503, detail="TTS Service nicht verfuegbar")
    return FastAPIResponse(content=audio_bytes, media_type="audio/mpeg")
 # ---------------------------------------------------------------------------
 # Learning Unit Creation from Word Selection
 # ---------------------------------------------------------------------------