merge: sync with origin/main, take upstream on conflicts

# Conflicts:
#	admin-compliance/lib/sdk/types.ts
#	admin-compliance/lib/sdk/vendor-compliance/types.ts
Sharang Parnerkar
2026-04-16 16:26:48 +02:00
352 changed files with 181673 additions and 2188 deletions

View File

@@ -15,8 +15,16 @@ WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
-# Download Piper voice model (German, thorsten, high quality)
-RUN mkdir -p /app/models && wget -q -O /app/models/de_DE-thorsten-high.onnx "https://huggingface.co/rhasspy/piper-voices/resolve/main/de/de_DE/thorsten/high/de_DE-thorsten-high.onnx" && wget -q -O /app/models/de_DE-thorsten-high.onnx.json "https://huggingface.co/rhasspy/piper-voices/resolve/main/de/de_DE/thorsten/high/de_DE-thorsten-high.onnx.json"
+# Download Piper voice models
+RUN mkdir -p /app/models && \
+    wget -q -O /app/models/de_DE-thorsten-high.onnx \
+    "https://huggingface.co/rhasspy/piper-voices/resolve/main/de/de_DE/thorsten/high/de_DE-thorsten-high.onnx" && \
+    wget -q -O /app/models/de_DE-thorsten-high.onnx.json \
+    "https://huggingface.co/rhasspy/piper-voices/resolve/main/de/de_DE/thorsten/high/de_DE-thorsten-high.onnx.json" && \
+    wget -q -O /app/models/en_US-lessac-high.onnx \
+    "https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/lessac/high/en_US-lessac-high.onnx" && \
+    wget -q -O /app/models/en_US-lessac-high.onnx.json \
+    "https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/lessac/high/en_US-lessac-high.onnx.json"
# Copy application
COPY . .

View File

@@ -1,10 +1,12 @@
"""Compliance TTS Service — Piper TTS + FFmpeg Audio/Video Pipeline."""
import hashlib
import logging
import os
import tempfile
import uuid
from fastapi import FastAPI, HTTPException
from fastapi.responses import FileResponse, Response
from pydantic import BaseModel
from storage import StorageClient
@@ -21,6 +23,7 @@ MINIO_ACCESS_KEY = os.getenv("MINIO_ACCESS_KEY", "breakpilot")
MINIO_SECRET_KEY = os.getenv("MINIO_SECRET_KEY", "breakpilot123")
MINIO_SECURE = os.getenv("MINIO_SECURE", "false").lower() == "true"
PIPER_MODEL_PATH = os.getenv("PIPER_MODEL_PATH", "/app/models/de_DE-thorsten-high.onnx")
PIPER_MODEL_EN_PATH = os.getenv("PIPER_MODEL_EN_PATH", "/app/models/en_US-lessac-high.onnx")
AUDIO_BUCKET = "compliance-training-audio"
VIDEO_BUCKET = "compliance-training-video"
@@ -28,6 +31,7 @@ VIDEO_BUCKET = "compliance-training-video"
# Initialize services
storage = StorageClient(MINIO_ENDPOINT, MINIO_ACCESS_KEY, MINIO_SECRET_KEY, secure=MINIO_SECURE)
tts = PiperTTS(PIPER_MODEL_PATH)
tts_en = PiperTTS(PIPER_MODEL_EN_PATH) if os.path.exists(PIPER_MODEL_EN_PATH) else None
@app.on_event("startup")
@@ -70,6 +74,17 @@ class GenerateVideoResponse(BaseModel):
    size_bytes: int

class PresignedURLRequest(BaseModel):
    bucket: str
    object_key: str
    expires: int = 3600

class PresignedURLResponse(BaseModel):
    url: str
    expires_in: int

class VoiceInfo(BaseModel):
    id: str
    language: str
@@ -93,16 +108,119 @@ async def health():
@app.get("/voices")
async def list_voices():
"""List available TTS voices."""
return {
"voices": [
VoiceInfo(
id="de_DE-thorsten-high",
language="de",
name="Thorsten (High Quality)",
quality="high",
),
],
}
voices = [
VoiceInfo(
id="de_DE-thorsten-high",
language="de",
name="Thorsten (High Quality)",
quality="high",
),
]
if tts_en is not None:
voices.append(VoiceInfo(
id="en_US-lessac-high",
language="en",
name="Lessac (High Quality)",
quality="high",
))
return {"voices": voices}
class SynthesizeDirectRequest(BaseModel):
    text: str
    language: str = "de"

# Simple disk cache for synthesized audio (avoids re-synthesis of same text)
TTS_CACHE_DIR = "/tmp/tts-cache"
os.makedirs(TTS_CACHE_DIR, exist_ok=True)

EDGE_TTS_VOICES = {
    "de": "de-DE-ConradNeural",
    "en": "en-US-GuyNeural",
}

async def _edge_tts_synthesize(text: str, language: str, output_path: str) -> bool:
    """Synthesize using Edge TTS (Microsoft Neural Voices). Returns True on success."""
    try:
        import edge_tts
        voice = EDGE_TTS_VOICES.get(language, EDGE_TTS_VOICES["de"])
        communicate = edge_tts.Communicate(text, voice)
        await communicate.save(output_path)
        return True
    except Exception as e:
        logger.warning(f"Edge TTS failed, falling back to Piper: {e}")
        return False

@app.post("/synthesize-direct")
async def synthesize_direct(req: SynthesizeDirectRequest):
    """Synthesize text and return MP3 audio directly (no MinIO upload).

    Uses Edge TTS (Microsoft Neural Voices) for high-quality speech.
    Falls back to Piper TTS if Edge TTS is unavailable (e.g. no internet).
    Includes disk caching so identical text is only synthesized once.
    """
    if not req.text.strip():
        raise HTTPException(status_code=400, detail="Text is empty")
    # Cache key based on text + language hash
    text_hash = hashlib.sha256(f"{req.language}:{req.text}".encode()).hexdigest()[:16]
    cache_path = os.path.join(TTS_CACHE_DIR, f"{text_hash}.mp3")
    if os.path.exists(cache_path):
        logger.info(f"TTS cache hit: {text_hash}")
        return FileResponse(
            cache_path,
            media_type="audio/mpeg",
            headers={"X-TTS-Cache": "hit"},
        )
    # Try Edge TTS first (high quality neural voices)
    success = await _edge_tts_synthesize(req.text, req.language, cache_path)
    if success and os.path.exists(cache_path):
        size = os.path.getsize(cache_path)
        logger.info(f"Edge TTS ({req.language}): {len(req.text)} chars, {size} bytes, cached as {text_hash}")
        return FileResponse(
            cache_path,
            media_type="audio/mpeg",
            headers={"X-TTS-Cache": "miss", "X-TTS-Engine": "edge"},
        )
    # Fallback: Piper TTS
    engine = tts
    if req.language == "en" and tts_en is not None:
        engine = tts_en
    with tempfile.TemporaryDirectory() as tmpdir:
        try:
            mp3_path, duration = engine.synthesize_to_mp3(req.text, tmpdir)
            import shutil
            shutil.copy2(mp3_path, cache_path)
            logger.info(f"Piper TTS ({req.language}): {len(req.text)} chars, {duration:.1f}s, cached as {text_hash}")
        except Exception as e:
            logger.error(f"TTS synthesis failed: {e}")
            raise HTTPException(status_code=500, detail=str(e))
    return FileResponse(
        cache_path,
        media_type="audio/mpeg",
        headers={"X-TTS-Cache": "miss", "X-TTS-Engine": "piper"},
    )
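
A minimal client sketch for /synthesize-direct, assuming the service is reachable at http://localhost:8000 (host, port, and the requests dependency are assumptions, not part of this service):

    import requests

    # First call synthesizes and caches; repeating it returns X-TTS-Cache: hit
    resp = requests.post(
        "http://localhost:8000/synthesize-direct",
        json={"text": "Willkommen zur Schulung.", "language": "de"},
    )
    resp.raise_for_status()
    print(resp.headers.get("X-TTS-Cache"), resp.headers.get("X-TTS-Engine"))
    with open("out.mp3", "wb") as f:
        f.write(resp.content)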
@app.post("/presigned-url", response_model=PresignedURLResponse)
async def get_presigned_url(req: PresignedURLRequest):
"""Generate a presigned URL for accessing a stored media file."""
try:
url = storage.get_presigned_url(req.bucket, req.object_key, req.expires)
return PresignedURLResponse(url=url, expires_in=req.expires)
except Exception as e:
logger.error(f"Presigned URL generation failed: {e}")
raise HTTPException(status_code=500, detail=str(e))
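
For reference, a request/response sketch for /presigned-url (bucket and object key are hypothetical examples):

    POST /presigned-url
    {"bucket": "compliance-training-audio", "object_key": "audio/mod-1/section_0.mp3", "expires": 3600}

    200 OK
    {"url": "<presigned MinIO URL>", "expires_in": 3600}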
@app.post("/synthesize", response_model=SynthesizeResponse)
@@ -132,6 +250,112 @@ async def synthesize(req: SynthesizeRequest):
    )

class SynthesizeSectionsRequest(BaseModel):
    sections: list[dict]  # [{text, heading}]
    voice: str = "de_DE-thorsten-high"
    module_id: str = ""

class SynthesizeSectionsResponse(BaseModel):
    sections: list[dict]
    total_duration: float

class GenerateInteractiveVideoRequest(BaseModel):
    script: dict
    audio: dict  # SynthesizeSectionsResponse
    module_id: str

class GenerateInteractiveVideoResponse(BaseModel):
    video_id: str
    bucket: str
    object_key: str
    duration_seconds: float
    size_bytes: int
@app.post("/synthesize-sections", response_model=SynthesizeSectionsResponse)
async def synthesize_sections(req: SynthesizeSectionsRequest):
"""Synthesize audio for multiple sections, returning per-section timing."""
if not req.sections:
raise HTTPException(status_code=400, detail="No sections provided")
results = []
cumulative = 0.0
with tempfile.TemporaryDirectory() as tmpdir:
for i, section in enumerate(req.sections):
text = section.get("text", "")
heading = section.get("heading", f"Section {i+1}")
if not text.strip():
results.append({
"heading": heading,
"audio_path": "",
"audio_object_key": "",
"duration": 0.0,
"start_timestamp": cumulative,
})
continue
try:
mp3_path, duration = tts.synthesize_to_mp3(text, tmpdir, suffix=f"_section_{i}")
object_key = f"audio/{req.module_id}/section_{i}.mp3"
storage.upload_file(AUDIO_BUCKET, object_key, mp3_path, "audio/mpeg")
results.append({
"heading": heading,
"audio_path": mp3_path,
"audio_object_key": object_key,
"duration": round(duration, 2),
"start_timestamp": round(cumulative, 2),
})
cumulative += duration
except Exception as e:
logger.error(f"Section {i} synthesis failed: {e}")
raise HTTPException(status_code=500, detail=f"Section {i} synthesis failed: {e}")
return SynthesizeSectionsResponse(
sections=results,
total_duration=round(cumulative, 2),
)
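
To make the timing contract concrete: two sections with (made-up) durations of 12.5s and 8.3s yield cumulative start timestamps, so a player can seek to the start of any section:

    {
      "sections": [
        {"heading": "Einleitung", "duration": 12.5, "start_timestamp": 0.0, ...},
        {"heading": "Datenschutz", "duration": 8.3, "start_timestamp": 12.5, ...}
      ],
      "total_duration": 20.8
    }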
@app.post("/generate-interactive-video", response_model=GenerateInteractiveVideoResponse)
async def generate_interactive_video(req: GenerateInteractiveVideoRequest):
"""Generate an interactive presentation video with checkpoint slides."""
try:
from video_generator import generate_interactive_presentation_video
except ImportError:
raise HTTPException(status_code=501, detail="Interactive video generation not available")
video_id = str(uuid.uuid4())
object_key = f"video/{req.module_id}/interactive.mp4"
with tempfile.TemporaryDirectory() as tmpdir:
try:
mp4_path, duration = generate_interactive_presentation_video(
script=req.script,
audio_sections=req.audio.get("sections", []),
output_dir=tmpdir,
storage=storage,
audio_bucket=AUDIO_BUCKET,
)
size_bytes = storage.upload_file(VIDEO_BUCKET, object_key, mp4_path, "video/mp4")
except Exception as e:
logger.error(f"Interactive video generation failed: {e}")
raise HTTPException(status_code=500, detail=str(e))
return GenerateInteractiveVideoResponse(
video_id=video_id,
bucket=VIDEO_BUCKET,
object_key=object_key,
duration_seconds=round(duration, 2),
size_bytes=size_bytes,
)
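
The two endpoints are meant to be chained: the /synthesize-sections response is passed verbatim as the audio field here. A sketch (base URL, module ID, section contents, and the text/narrator_text mapping are hypothetical assumptions):

    import requests

    BASE = "http://localhost:8000"  # hypothetical
    sections = [{"heading": "Einleitung", "narrator_text": "Willkommen.", "bullet_points": []}]
    script = {"title": "Datenschutz", "sections": sections}

    audio = requests.post(f"{BASE}/synthesize-sections", json={
        "sections": [{"heading": s["heading"], "text": s["narrator_text"]} for s in sections],
        "module_id": "mod-1",
    }).json()
    video = requests.post(f"{BASE}/generate-interactive-video", json={
        "script": script, "audio": audio, "module_id": "mod-1",
    }).json()
    print(video["object_key"], video["duration_seconds"])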
@app.post("/generate-video", response_model=GenerateVideoResponse)
async def generate_video(req: GenerateVideoRequest):
"""Generate a presentation video from slides + audio."""

View File

@@ -3,3 +3,4 @@ uvicorn[standard]==0.27.1
boto3==1.34.25
python-multipart==0.0.6
pydantic==2.6.1
edge-tts==7.2.7

View File

@@ -130,3 +130,97 @@ def render_title_slide(
    result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
    if result.returncode != 0:
        raise RuntimeError(f"ImageMagick title slide failed: {result.stderr}")

def render_checkpoint_slide(
    title: str,
    question_preview: str,
    question_count: int,
    output_path: str,
) -> None:
    """Render a checkpoint slide with red border and quiz preview."""
    border_width = 12
    cmd = [
        "convert",
        "-size", f"{WIDTH}x{HEIGHT}",
        "xc:white",
        # Red border (full rectangle, then white inner)
        "-fill", "#c0392b",
        "-draw", f"rectangle 0,0 {WIDTH},{HEIGHT}",
        "-fill", "white",
        "-draw", f"rectangle {border_width},{border_width} {WIDTH - border_width},{HEIGHT - border_width}",
        # Red header bar
        "-fill", "#c0392b",
        "-draw", f"rectangle {border_width},{border_width} {WIDTH - border_width},{HEADER_HEIGHT + border_width}",
        # CHECKPOINT label
        "-fill", "white",
        "-font", FONT_BOLD,
        "-pointsize", "48",
        "-gravity", "NorthWest",
        "-annotate", f"+{60 + border_width}+{(HEADER_HEIGHT - 48) // 2 + border_width}",
        f"CHECKPOINT: {title[:50]}",
    ]
    y_pos = HEADER_HEIGHT + border_width + 60
    # Instruction text
    cmd.extend([
        "-fill", "#333333",
        "-font", FONT,
        "-pointsize", "32",
        "-gravity", "NorthWest",
        "-annotate", f"+80+{y_pos}",
        "Bitte beantworten Sie die folgenden Fragen,",
    ])
    y_pos += 44
    cmd.extend([
        "-fill", "#333333",
        "-font", FONT,
        "-pointsize", "32",
        "-gravity", "NorthWest",
        "-annotate", f"+80+{y_pos}",
        "um mit der Schulung fortzufahren.",
    ])
    y_pos += 80
    # Question preview
    if question_preview:
        preview = textwrap.fill(question_preview, width=70)
        cmd.extend([
            "-fill", "#666666",
            "-font", FONT,
            "-pointsize", "26",
            "-gravity", "NorthWest",
            "-annotate", f"+80+{y_pos}",
            f"Erste Frage: {preview[:120]}...",
        ])
        y_pos += 50
    # Question count
    cmd.extend([
        "-fill", "#888888",
        "-font", FONT,
        "-pointsize", "24",
        "-gravity", "NorthWest",
        "-annotate", f"+80+{y_pos}",
        f"{question_count} Fragen in diesem Checkpoint",
    ])
    # Footer
    cmd.extend([
        "-fill", "#f0f0f0",
        "-draw", f"rectangle {border_width},{HEIGHT - FOOTER_HEIGHT - border_width} {WIDTH - border_width},{HEIGHT - border_width}",
        "-fill", "#c0392b",
        "-font", FONT_BOLD,
        "-pointsize", "22",
        "-gravity", "South",
        "-annotate", f"+0+{(FOOTER_HEIGHT - 22) // 2 + border_width}",
        "Video wird pausiert — Quiz im Player beantworten",
    ])
    cmd.append(output_path)
    result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
    if result.returncode != 0:
        raise RuntimeError(f"ImageMagick checkpoint slide failed: {result.stderr}")

View File

@@ -74,7 +74,7 @@ class PiperTTS:
        if proc.returncode != 0:
            raise RuntimeError(f"Piper failed: {proc.stderr}")

-    def synthesize_to_mp3(self, text: str, output_dir: str) -> tuple[str, float]:
+    def synthesize_to_mp3(self, text: str, output_dir: str, suffix: str = "") -> tuple[str, float]:
        """
        Synthesize text to MP3.

        Splits text into sentences, synthesizes each, concatenates, encodes to MP3.
        """
@@ -88,16 +88,16 @@ class PiperTTS:
        wav_files = []
        try:
            for i, sentence in enumerate(sentences):
-                wav_path = os.path.join(output_dir, f"seg_{i:04d}.wav")
+                wav_path = os.path.join(output_dir, f"seg{suffix}_{i:04d}.wav")
                self.synthesize_to_wav(sentence, wav_path)
                wav_files.append(wav_path)
            # Concatenate WAV files
-            combined_wav = os.path.join(output_dir, "combined.wav")
+            combined_wav = os.path.join(output_dir, f"combined{suffix}.wav")
            self._concatenate_wavs(wav_files, combined_wav)
            # Convert to MP3
-            mp3_path = os.path.join(output_dir, "output.mp3")
+            mp3_path = os.path.join(output_dir, f"output{suffix}.mp3")
            self._wav_to_mp3(combined_wav, mp3_path)
            # Get duration
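
The new suffix parameter lets several synthesize_to_mp3 calls share one temporary directory without clobbering each other's seg_*.wav, combined.wav and output.mp3 intermediates, which is exactly how /synthesize-sections uses it. A sketch (tts is the module-level PiperTTS instance from the service):

    import tempfile

    with tempfile.TemporaryDirectory() as tmpdir:
        # Without distinct suffixes the second call would overwrite output.mp3
        mp3_a, dur_a = tts.synthesize_to_mp3("Erster Abschnitt.", tmpdir, suffix="_section_0")
        mp3_b, dur_b = tts.synthesize_to_mp3("Zweiter Abschnitt.", tmpdir, suffix="_section_1")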

View File

@@ -115,6 +115,139 @@ def generate_presentation_video(
    return output_path, video_duration


def generate_interactive_presentation_video(
    script: dict,
    audio_sections: list[dict],
    output_dir: str,
    storage,
    audio_bucket: str,
) -> tuple[str, float]:
    """
    Generate an interactive presentation video from narrator script + per-section audio.

    Includes checkpoint slides (red-bordered pause markers) between sections.
    Returns (mp4_path, duration_seconds).
    """
    from slide_renderer import render_slide, render_title_slide, render_checkpoint_slide

    title = script.get("title", "Compliance Training")
    sections = script.get("sections", [])
    if not sections:
        raise ValueError("Script has no sections")
    if not audio_sections:
        raise ValueError("No audio sections provided")

    # Step 1: Download all section audio files
    audio_paths = []
    for i, sec in enumerate(audio_sections):
        obj_key = sec.get("audio_object_key", "")
        if not obj_key:
            continue
        audio_path = os.path.join(output_dir, f"section_{i}.mp3")
        storage.client.download_file(audio_bucket, obj_key, audio_path)
        audio_paths.append((i, audio_path, sec.get("duration", 0.0)))
    # Durations keyed by original section index; sections without audio are skipped
    # above, so a positional audio_paths[i] lookup would drift out of sync.
    durations_by_section = {idx: dur for idx, _, dur in audio_paths}

    # Step 2: Render slides
    slides_dir = os.path.join(output_dir, "slides")
    os.makedirs(slides_dir, exist_ok=True)
    # All slide entries: (png_path, duration)
    slide_entries = []
    # Title slide (5 seconds)
    title_path = os.path.join(slides_dir, "slide_000_title.png")
    render_title_slide(title, "Interaktive Compliance-Schulung", title_path)
    slide_entries.append((title_path, 5.0))

    total_content_slides = len(sections)  # for numbering
    slide_num = 1
    for i, section in enumerate(sections):
        heading = section.get("heading", "")
        narrator_text = section.get("narrator_text", "")
        bullet_points = section.get("bullet_points", [])
        # Content slide for this section
        slide_path = os.path.join(slides_dir, f"slide_{i+1:03d}_content.png")
        render_slide(
            heading=heading,
            text=narrator_text[:200],
            bullet_points=bullet_points,
            slide_number=slide_num + 1,
            total_slides=total_content_slides + 1,
            module_code=script.get("module_code", ""),
            output_path=slide_path,
        )
        slide_num += 1
        # Duration = matching audio section duration (5s fallback)
        section_duration = durations_by_section.get(i, 0.0) or 5.0
        slide_entries.append((slide_path, section_duration))
        # Checkpoint slide (if this section has a checkpoint)
        checkpoint = section.get("checkpoint")
        if checkpoint:
            cp_title = checkpoint.get("title", f"Checkpoint {i+1}")
            questions = checkpoint.get("questions", [])
            question_preview = questions[0].get("question", "") if questions else ""
            cp_path = os.path.join(slides_dir, f"slide_{i+1:03d}_checkpoint.png")
            render_checkpoint_slide(cp_title, question_preview, len(questions), cp_path)
            slide_entries.append((cp_path, 3.0))  # 3s still frame as pause marker

    # Step 3: Concatenate all section audio files into one
    combined_audio = os.path.join(output_dir, "combined_audio.mp3")
    if len(audio_paths) == 1:
        import shutil
        shutil.copy2(audio_paths[0][1], combined_audio)
    elif len(audio_paths) > 1:
        # Use FFmpeg to concatenate audio
        audio_list_path = os.path.join(output_dir, "audio_list.txt")
        with open(audio_list_path, "w") as f:
            for _, apath, _ in audio_paths:
                f.write(f"file '{apath}'\n")
        cmd = [
            "ffmpeg", "-y", "-f", "concat", "-safe", "0",
            "-i", audio_list_path, "-c", "copy", combined_audio,
        ]
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=300)
        if result.returncode != 0:
            raise RuntimeError(f"FFmpeg audio concat failed: {result.stderr}")
    else:
        raise ValueError("No audio files to concatenate")

    # Step 4: Create FFmpeg concat file for slides
    concat_path = os.path.join(output_dir, "concat.txt")
    with open(concat_path, "w") as f:
        for slide_path, dur in slide_entries:
            f.write(f"file '{slide_path}'\n")
            f.write(f"duration {dur:.2f}\n")
        # Repeat last slide for FFmpeg concat demuxer
        f.write(f"file '{slide_entries[-1][0]}'\n")

    # Step 5: Combine slides + audio into MP4
    output_path = os.path.join(output_dir, "interactive.mp4")
    cmd = [
        "ffmpeg", "-y",
        "-f", "concat", "-safe", "0", "-i", concat_path,
        "-i", combined_audio,
        "-c:v", "libx264", "-pix_fmt", "yuv420p",
        "-c:a", "aac", "-b:a", "128k",
        "-shortest",
        "-movflags", "+faststart",
        output_path,
    ]
    result = subprocess.run(cmd, capture_output=True, text=True, timeout=600)
    if result.returncode != 0:
        raise RuntimeError(f"FFmpeg interactive video failed: {result.stderr}")
    video_duration = _get_duration(output_path)
    return output_path, video_duration
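
For reference, the concat.txt written in Step 4 uses FFmpeg's concat demuxer syntax; with a title slide, one content slide and one checkpoint it would look like this (paths and durations are illustrative):

    file '/tmp/.../slides/slide_000_title.png'
    duration 5.00
    file '/tmp/.../slides/slide_001_content.png'
    duration 12.50
    file '/tmp/.../slides/slide_001_checkpoint.png'
    duration 3.00
    file '/tmp/.../slides/slide_001_checkpoint.png'

The trailing repeat of the last slide is deliberate: the concat demuxer does not apply the final duration directive unless the last file is listed once more.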
def _get_duration(file_path: str) -> float:
    """Get media duration using FFprobe."""
    cmd = [