feat: voice-service von lehrer nach core verschoben, Pipeline erweitert (voice, BQAS, embedding, night-scheduler)
This commit is contained in:
303
voice-service/services/audio_processor.py
Normal file
303
voice-service/services/audio_processor.py
Normal file
@@ -0,0 +1,303 @@
|
||||
"""
|
||||
Audio Processor - Mimi Codec Compatible
|
||||
Handles audio encoding/decoding for voice streaming
|
||||
|
||||
Mimi Codec specifications:
|
||||
- Sample rate: 24kHz
|
||||
- Frame size: 80ms
|
||||
- Format: Int16 PCM
|
||||
- Channels: Mono
|
||||
|
||||
IMPORTANT: Audio is NEVER persisted to disk.
|
||||
All processing happens in RAM only.
|
||||
"""
|
||||
import structlog
|
||||
import numpy as np
|
||||
from typing import Optional, Iterator, Tuple
|
||||
from dataclasses import dataclass
|
||||
|
||||
from config import settings
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
|
||||
|
||||
@dataclass
class AudioFrame:
    """One fixed-duration chunk of audio moving through the pipeline."""

    # Normalized float samples belonging to this frame.
    samples: np.ndarray
    # Position of this frame within the stream, in milliseconds.
    timestamp_ms: int
    # Frame length in milliseconds; the Mimi codec uses 80 ms frames.
    duration_ms: int = 80
|
||||
|
||||
|
||||
class AudioProcessor:
    """
    Processes audio for the Mimi codec.

    All audio processing is transient - data exists only
    in RAM and is discarded after processing.
    """

    def __init__(self):
        # Codec parameters come from service settings (per the module
        # docstring: 24 kHz sample rate, 80 ms frames).
        self.sample_rate = settings.audio_sample_rate
        self.frame_size_ms = settings.audio_frame_size_ms
        self.samples_per_frame = int(self.sample_rate * self.frame_size_ms / 1000)

    def bytes_to_samples(self, audio_bytes: bytes) -> np.ndarray:
        """
        Convert raw bytes to numpy samples.

        Args:
            audio_bytes: Int16 PCM audio data

        Returns:
            numpy array of float32 samples (-1.0 to 1.0)
        """
        samples_int16 = np.frombuffer(audio_bytes, dtype=np.int16)
        # Divide by 2**15 so the most negative int16 maps exactly to -1.0.
        return samples_int16.astype(np.float32) / 32768.0

    def samples_to_bytes(self, samples: np.ndarray) -> bytes:
        """
        Convert numpy samples to raw bytes.

        Args:
            samples: float32 samples (-1.0 to 1.0)

        Returns:
            Int16 PCM audio data
        """
        # Clip first so out-of-range floats cannot wrap around in int16.
        samples = np.clip(samples, -1.0, 1.0)
        samples_int16 = (samples * 32767).astype(np.int16)
        return samples_int16.tobytes()

    def extract_frames(
        self,
        audio_bytes: bytes,
        start_timestamp_ms: int = 0,
    ) -> Iterator[AudioFrame]:
        """
        Extract fixed-size frames from audio data.

        The final frame is zero-padded to the full frame length.

        Args:
            audio_bytes: Raw audio data (Int16 PCM)
            start_timestamp_ms: Timestamp assigned to the first frame

        Yields:
            AudioFrame objects
        """
        samples = self.bytes_to_samples(audio_bytes)
        timestamp = start_timestamp_ms

        for i in range(0, len(samples), self.samples_per_frame):
            frame_samples = samples[i:i + self.samples_per_frame]

            # Pad the last (possibly short) frame with silence.
            if len(frame_samples) < self.samples_per_frame:
                frame_samples = np.pad(
                    frame_samples,
                    (0, self.samples_per_frame - len(frame_samples)),
                )

            yield AudioFrame(
                samples=frame_samples,
                timestamp_ms=timestamp,
                duration_ms=self.frame_size_ms,
            )

            timestamp += self.frame_size_ms

    def combine_frames(self, frames: list[AudioFrame]) -> bytes:
        """
        Combine multiple frames into continuous audio.

        Frames are ordered by timestamp before concatenation.

        Args:
            frames: List of AudioFrame objects

        Returns:
            Combined audio bytes (Int16 PCM)
        """
        if not frames:
            return b""

        sorted_frames = sorted(frames, key=lambda f: f.timestamp_ms)
        all_samples = np.concatenate([f.samples for f in sorted_frames])
        return self.samples_to_bytes(all_samples)

    def detect_voice_activity(
        self,
        audio_bytes: bytes,
        threshold: float = 0.02,
        min_duration_ms: int = 100,
    ) -> Tuple[bool, float]:
        """
        Simple energy-based voice activity detection.

        Args:
            audio_bytes: Raw audio data
            threshold: RMS energy threshold for speech detection
            min_duration_ms: Minimum duration for valid speech

        Returns:
            (is_speech, energy_level)
        """
        samples = self.bytes_to_samples(audio_bytes)
        if samples.size == 0:
            # Empty input: np.mean would produce NaN with a warning.
            return False, 0.0

        # RMS energy; cast to plain Python types to honor the declared
        # (bool, float) return contract.
        energy = float(np.sqrt(np.mean(samples ** 2)))

        # Clips shorter than min_duration_ms are never counted as speech.
        duration_ms = len(samples) / self.sample_rate * 1000
        if duration_ms < min_duration_ms:
            return False, energy

        return bool(energy > threshold), energy

    def resample(
        self,
        audio_bytes: bytes,
        source_rate: int,
        target_rate: Optional[int] = None,
    ) -> bytes:
        """
        Resample audio to target sample rate.

        Args:
            audio_bytes: Raw audio data
            source_rate: Source sample rate
            target_rate: Target sample rate (default: configured rate)

        Returns:
            Resampled audio bytes
        """
        target_rate = target_rate or self.sample_rate

        if source_rate == target_rate:
            return audio_bytes

        samples = self.bytes_to_samples(audio_bytes)
        if samples.size == 0:
            # np.interp cannot handle empty reference points.
            return b""

        new_length = int(len(samples) * target_rate / source_rate)

        # Simple linear interpolation resampling
        # (In production, use scipy.signal.resample or librosa)
        x_old = np.linspace(0, 1, len(samples))
        x_new = np.linspace(0, 1, new_length)
        samples_resampled = np.interp(x_new, x_old, samples)

        return self.samples_to_bytes(samples_resampled)

    def normalize_audio(
        self,
        audio_bytes: bytes,
        target_db: float = -3.0,
    ) -> bytes:
        """
        Normalize audio to target peak dB level.

        Args:
            audio_bytes: Raw audio data
            target_db: Target peak level in dB

        Returns:
            Normalized audio bytes
        """
        samples = self.bytes_to_samples(audio_bytes)
        if samples.size == 0:
            # np.max raises on empty arrays; nothing to normalize.
            return audio_bytes

        # Near-silence is passed through to avoid amplifying noise.
        peak = np.max(np.abs(samples))
        if peak < 0.001:  # Silence
            return audio_bytes

        # Gain that moves the current peak to the target level.
        target_linear = 10 ** (target_db / 20)
        gain = target_linear / peak

        return self.samples_to_bytes(samples * gain)

    def apply_noise_gate(
        self,
        audio_bytes: bytes,
        threshold_db: float = -40.0,
        attack_ms: float = 5.0,
        release_ms: float = 50.0,
    ) -> bytes:
        """
        Apply noise gate to reduce background noise.

        Args:
            audio_bytes: Raw audio data
            threshold_db: Gate threshold in dB
            attack_ms: Attack time in ms
            release_ms: Release time in ms

        Returns:
            Gated audio bytes
        """
        samples = self.bytes_to_samples(audio_bytes)
        if samples.size == 0:
            # np.convolve rejects empty inputs; nothing to gate.
            return audio_bytes

        # Convert threshold to linear amplitude.
        threshold = 10 ** (threshold_db / 20)

        # Hard gate on the per-sample envelope.
        envelope = np.abs(samples)
        gate = np.where(envelope > threshold, 1.0, 0.0)

        # Smooth gate transitions with a single moving average sized by the
        # larger of attack/release (a simplification of a true attack/release
        # envelope follower).
        attack_samples = int(attack_ms * self.sample_rate / 1000)
        release_samples = int(release_ms * self.sample_rate / 1000)
        kernel_size = max(attack_samples, release_samples)
        if kernel_size > 1:
            kernel = np.ones(kernel_size) / kernel_size
            gate = np.convolve(gate, kernel, mode='same')

        return self.samples_to_bytes(samples * gate)

    def get_audio_stats(self, audio_bytes: bytes) -> dict:
        """
        Get statistics about audio data.

        Args:
            audio_bytes: Raw audio data

        Returns:
            Dictionary with duration_ms, sample_count, rms_db, peak_db,
            and sample_rate. Empty input reports -200.0 dB levels.
        """
        samples = self.bytes_to_samples(audio_bytes)
        if samples.size == 0:
            # np.max/np.mean raise or warn on empty arrays; report the
            # same floor levels the epsilon below would produce.
            return {
                "duration_ms": 0.0,
                "sample_count": 0,
                "rms_db": -200.0,
                "peak_db": -200.0,
                "sample_rate": self.sample_rate,
            }

        rms = np.sqrt(np.mean(samples ** 2))
        peak = np.max(np.abs(samples))
        duration_ms = len(samples) / self.sample_rate * 1000

        # Epsilon keeps log10 finite for digital silence.
        rms_db = 20 * np.log10(rms + 1e-10)
        peak_db = 20 * np.log10(peak + 1e-10)

        return {
            "duration_ms": duration_ms,
            "sample_count": len(samples),
            "rms_db": round(rms_db, 1),
            "peak_db": round(peak_db, 1),
            "sample_rate": self.sample_rate,
        }
|
||||
Reference in New Issue
Block a user