feat(tts): mixed-language synthesis via <en> tags

Parse <en>word</en> markers in the text, synthesise English segments with en-US-GuyNeural and German segments with de-DE-ConradNeural, then concatenate them with ffmpeg into a single MP3. Fall back to plain synthesis if the text contains no tags. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-16 22:24:39 +02:00
parent 1fcd8244b1
commit af08f089df
1 changed files with 45 additions and 1 deletions
@@ -141,11 +141,16 @@ EDGE_TTS_VOICES = {
    "en": "en-US-GuyNeural",
 }
 # Matches <en>word</en> markers used to force English pronunciation in German text.
 # The match is non-greedy, so each marker is matched on its own, and DOTALL lets
 # the tagged text span line breaks. Group 1 is the text between the tags.
 _EN_TAG_RE = re.compile(r'<en>(.*?)</en>', re.DOTALL)
 async def _edge_tts_synthesize(text: str, language: str, output_path: str) -> bool:
-    """Synthesize using Edge TTS (Microsoft Neural Voices). Returns True on success."""
+    """Synthesize using Edge TTS. Handles <en>…</en> mixed-language markers."""
    try:
        import edge_tts
        if '<en>' in text:
            return await _edge_tts_mixed(text, language, output_path)
        voice = EDGE_TTS_VOICES.get(language, EDGE_TTS_VOICES["de"])
        communicate = edge_tts.Communicate(text, voice)
        await communicate.save(output_path)
@@ -155,6 +160,45 @@ async def _edge_tts_synthesize(text: str, language: str, output_path: str) -> bo
        return False
 async def _edge_tts_mixed(text: str, base_lang: str, output_path: str) -> bool:
    """Synthesise text containing <en>…</en> markers into a single MP3.

    The text is split on ``_EN_TAG_RE`` matches: the inside of each tag is
    spoken with the ``"en"`` voice, everything else with the voice for
    ``base_lang``. Each non-empty segment is synthesised to its own MP3 in a
    temporary directory and the pieces are joined with ffmpeg's concat
    demuxer using stream copy.

    Args:
        text: Input text, possibly containing ``<en>…</en>`` markers.
        base_lang: Key into ``EDGE_TTS_VOICES`` for untagged text; unknown
            keys fall back to the ``"de"`` voice.
        output_path: Destination path of the combined MP3.

    Returns:
        True if ffmpeg wrote the combined file successfully, False on any
        failure (nothing to synthesise, Edge TTS error, ffmpeg error or
        timeout). Failures are logged as warnings.
    """
    import asyncio
    import functools
    import shutil
    import tempfile

    import edge_tts

    # Walk the tag matches, collecting the untagged text before each one
    # (base language) and the tag content itself (English).
    segments: list[tuple[str, str]] = []
    last = 0
    for m in _EN_TAG_RE.finditer(text):
        if m.start() > last:
            segments.append((base_lang, text[last:m.start()].strip()))
        segments.append(("en", m.group(1).strip()))
        last = m.end()
    if last < len(text):
        segments.append((base_lang, text[last:].strip()))
    # Drop pieces that are empty after stripping.
    segments = [(lang, t) for lang, t in segments if t]

    if not segments:
        # Nothing to speak (e.g. "<en></en>"): fail early instead of running
        # ffmpeg on an empty concat list.
        logger.warning("Mixed TTS failed: no text to synthesise")
        return False

    tmpdir = tempfile.mkdtemp()
    try:
        seg_files = []
        for i, (lang, seg_text) in enumerate(segments):
            voice = EDGE_TTS_VOICES.get(lang, EDGE_TTS_VOICES["de"])
            seg_path = os.path.join(tmpdir, f"seg_{i:04d}.mp3")
            await edge_tts.Communicate(seg_text, voice).save(seg_path)
            seg_files.append(seg_path)

        list_file = os.path.join(tmpdir, "list.txt")
        with open(list_file, "w", encoding="utf-8") as f:
            for sf in seg_files:
                # Inside single quotes ffmpeg takes everything literally, so
                # a quote in the path must close, escape and reopen the
                # quoted string.
                escaped = sf.replace("'", "'\\''")
                f.write(f"file '{escaped}'\n")

        # subprocess.run blocks until ffmpeg exits; run it in the default
        # executor so the event loop keeps serving other requests meanwhile.
        run_ffmpeg = functools.partial(
            subprocess.run,
            ["ffmpeg", "-y", "-f", "concat", "-safe", "0", "-i", list_file, "-c", "copy", output_path],
            capture_output=True, text=True, timeout=60,
        )
        proc = await asyncio.get_running_loop().run_in_executor(None, run_ffmpeg)
        if proc.returncode != 0:
            # Surface the tail of ffmpeg's stderr; the head is mostly banner.
            logger.warning(
                "Mixed TTS failed: ffmpeg exited with code %s: %s",
                proc.returncode,
                (proc.stderr or "").strip()[-500:],
            )
            return False
        return True
    except Exception as e:
        logger.warning("Mixed TTS failed: %s", e)
        return False
    finally:
        shutil.rmtree(tmpdir, ignore_errors=True)
@app.post("/synthesize-direct")
 async def synthesize_direct(req: SynthesizeDirectRequest):
    """Synthesize text and return MP3 audio directly (no MinIO upload).