Files
breakpilot-lehrer/voice-service/services/audio_processor.py
Benjamin Boenisch 5a31f52310 Initial commit: breakpilot-lehrer - Lehrer KI Platform
Services: Admin-Lehrer, Backend-Lehrer, Studio v2, Website,
Klausur-Service, School-Service, Voice-Service, Geo-Service,
BreakPilot Drive, Agent-Core

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-11 23:47:26 +01:00

304 lines
8.2 KiB
Python

"""
Audio Processor - Mimi Codec Compatible
Handles audio encoding/decoding for voice streaming
Mimi Codec specifications:
- Sample rate: 24kHz
- Frame size: 80ms
- Format: Int16 PCM
- Channels: Mono
IMPORTANT: Audio is NEVER persisted to disk.
All processing happens in RAM only.
"""
import structlog
import numpy as np
from typing import Optional, Iterator, Tuple
from dataclasses import dataclass
from config import settings
logger = structlog.get_logger(__name__)
@dataclass
class AudioFrame:
    """A single fixed-duration chunk of decoded audio."""
    # Float32 samples in [-1.0, 1.0] (see AudioProcessor.bytes_to_samples).
    samples: np.ndarray
    # Position of this frame within the stream, in milliseconds.
    timestamp_ms: int
    # Frame length in milliseconds (Mimi codec uses 80 ms frames).
    duration_ms: int = 80


class AudioProcessor:
    """
    Processes audio for the Mimi codec.

    Interchange format is mono Int16 PCM at the configured sample rate.
    All audio processing is transient - data exists only in RAM and is
    discarded after processing; nothing is ever written to disk.
    """

    def __init__(self):
        # Codec parameters come from service configuration
        # (Mimi spec: 24 kHz sample rate, 80 ms frames).
        self.sample_rate = settings.audio_sample_rate
        self.frame_size_ms = settings.audio_frame_size_ms
        self.samples_per_frame = int(self.sample_rate * self.frame_size_ms / 1000)

    def bytes_to_samples(self, audio_bytes: bytes) -> np.ndarray:
        """
        Convert raw Int16 PCM bytes to normalized float samples.

        Args:
            audio_bytes: Int16 PCM audio data (mono).

        Returns:
            numpy array of float32 samples in [-1.0, 1.0).
        """
        samples_int16 = np.frombuffer(audio_bytes, dtype=np.int16)
        # Divide by 32768 so the most negative Int16 (-32768) maps exactly to -1.0.
        return samples_int16.astype(np.float32) / 32768.0

    def samples_to_bytes(self, samples: np.ndarray) -> bytes:
        """
        Convert float samples to raw Int16 PCM bytes.

        Args:
            samples: float samples nominally in [-1.0, 1.0]; any array-like
                is accepted and out-of-range values are clipped.

        Returns:
            Int16 PCM audio data.
        """
        # Accept any array-like input; clip to the representable range.
        samples = np.clip(np.asarray(samples, dtype=np.float32), -1.0, 1.0)
        # Scale by 32767 (not 32768) so +1.0 does not overflow Int16.
        return (samples * 32767).astype(np.int16).tobytes()

    def extract_frames(
        self,
        audio_bytes: bytes,
        start_timestamp_ms: int = 0,
    ) -> Iterator[AudioFrame]:
        """
        Split raw audio into fixed-size frames.

        The trailing partial frame (if any) is zero-padded to a full
        frame so every yielded frame has `samples_per_frame` samples.

        Args:
            audio_bytes: Raw Int16 PCM audio data.
            start_timestamp_ms: Timestamp assigned to the first frame.

        Yields:
            AudioFrame objects, one per `frame_size_ms` of input.
        """
        samples = self.bytes_to_samples(audio_bytes)
        timestamp = start_timestamp_ms
        for start in range(0, len(samples), self.samples_per_frame):
            frame_samples = samples[start:start + self.samples_per_frame]
            # Zero-pad a short final frame to the full frame size.
            shortfall = self.samples_per_frame - len(frame_samples)
            if shortfall > 0:
                frame_samples = np.pad(frame_samples, (0, shortfall))
            yield AudioFrame(
                samples=frame_samples,
                timestamp_ms=timestamp,
                duration_ms=self.frame_size_ms,
            )
            timestamp += self.frame_size_ms

    def combine_frames(self, frames: list[AudioFrame]) -> bytes:
        """
        Combine multiple frames into continuous audio.

        Frames are ordered by timestamp before concatenation, so the
        input list may be unordered.

        Args:
            frames: List of AudioFrame objects.

        Returns:
            Combined Int16 PCM bytes; b"" for an empty list.
        """
        if not frames:
            return b""
        ordered = sorted(frames, key=lambda f: f.timestamp_ms)
        return self.samples_to_bytes(np.concatenate([f.samples for f in ordered]))

    def detect_voice_activity(
        self,
        audio_bytes: bytes,
        threshold: float = 0.02,
        min_duration_ms: int = 100,
    ) -> Tuple[bool, float]:
        """
        Simple energy-based voice activity detection.

        Args:
            audio_bytes: Raw Int16 PCM audio data.
            threshold: RMS energy threshold for speech detection.
            min_duration_ms: Minimum duration for valid speech.

        Returns:
            (is_speech, energy_level). Energy is always reported, even
            when the clip is too short to qualify as speech.
        """
        samples = self.bytes_to_samples(audio_bytes)
        if samples.size == 0:
            # np.mean of an empty array yields NaN with a warning;
            # treat empty input as silence.
            return False, 0.0
        energy = float(np.sqrt(np.mean(samples ** 2)))
        duration_ms = len(samples) / self.sample_rate * 1000
        if duration_ms < min_duration_ms:
            return False, energy
        return energy > threshold, energy

    def resample(
        self,
        audio_bytes: bytes,
        source_rate: int,
        target_rate: Optional[int] = None,
    ) -> bytes:
        """
        Resample audio to a target sample rate.

        Uses simple linear interpolation; in production prefer
        scipy.signal.resample or librosa for better quality.

        Args:
            audio_bytes: Raw Int16 PCM audio data.
            source_rate: Source sample rate in Hz.
            target_rate: Target sample rate (default: codec rate, 24kHz).

        Returns:
            Resampled Int16 PCM bytes.
        """
        target_rate = target_rate or self.sample_rate
        if source_rate == target_rate:
            return audio_bytes
        samples = self.bytes_to_samples(audio_bytes)
        if samples.size == 0:
            # np.interp rejects an empty source; nothing to resample.
            return audio_bytes
        new_length = int(len(samples) * target_rate / source_rate)
        # Map both old and new sample positions onto [0, 1] and
        # linearly interpolate.
        x_old = np.linspace(0, 1, len(samples))
        x_new = np.linspace(0, 1, new_length)
        return self.samples_to_bytes(np.interp(x_new, x_old, samples))

    def normalize_audio(
        self,
        audio_bytes: bytes,
        target_db: float = -3.0,
    ) -> bytes:
        """
        Normalize audio so its peak sits at the target dB level.

        Args:
            audio_bytes: Raw Int16 PCM audio data.
            target_db: Target peak level in dBFS.

        Returns:
            Normalized Int16 PCM bytes; input returned unchanged for
            empty or (near-)silent audio.
        """
        samples = self.bytes_to_samples(audio_bytes)
        if samples.size == 0:
            # np.max raises on empty arrays; nothing to normalize.
            return audio_bytes
        peak = np.max(np.abs(samples))
        if peak < 0.001:
            # Effectively silence - amplifying would only boost noise.
            return audio_bytes
        target_linear = 10 ** (target_db / 20)
        return self.samples_to_bytes(samples * (target_linear / peak))

    def apply_noise_gate(
        self,
        audio_bytes: bytes,
        threshold_db: float = -40.0,
        attack_ms: float = 5.0,
        release_ms: float = 50.0,
    ) -> bytes:
        """
        Apply a noise gate to reduce background noise.

        Args:
            audio_bytes: Raw Int16 PCM audio data.
            threshold_db: Gate threshold in dBFS.
            attack_ms: Attack time in ms.
            release_ms: Release time in ms.

        Returns:
            Gated Int16 PCM bytes.
        """
        samples = self.bytes_to_samples(audio_bytes)
        if samples.size == 0:
            # np.convolve rejects empty arrays; nothing to gate.
            return audio_bytes
        threshold = 10 ** (threshold_db / 20)
        envelope = np.abs(samples)
        # Hard gate: pass (1.0) where the envelope exceeds the threshold.
        gate = np.where(envelope > threshold, 1.0, 0.0)
        attack_samples = int(attack_ms * self.sample_rate / 1000)
        release_samples = int(release_ms * self.sample_rate / 1000)
        # Smooth on/off transitions with a moving average sized to the
        # longer of the attack/release windows.
        kernel_size = max(attack_samples, release_samples)
        if kernel_size > 1:
            kernel = np.ones(kernel_size) / kernel_size
            gate = np.convolve(gate, kernel, mode='same')
        return self.samples_to_bytes(samples * gate)

    def get_audio_stats(self, audio_bytes: bytes) -> dict:
        """
        Get statistics about audio data.

        Args:
            audio_bytes: Raw Int16 PCM audio data.

        Returns:
            Dictionary with duration, sample count, RMS/peak levels in
            dB, and the configured sample rate. Empty input reports
            zero duration and the -200 dB silence floor.
        """
        samples = self.bytes_to_samples(audio_bytes)
        count = samples.size
        # Guard empty input: np.max raises and np.mean warns on empties.
        rms = float(np.sqrt(np.mean(samples ** 2))) if count else 0.0
        peak = float(np.max(np.abs(samples))) if count else 0.0
        duration_ms = count / self.sample_rate * 1000
        # Epsilon avoids log10(0) for silence (gives a -200 dB floor).
        rms_db = 20 * np.log10(rms + 1e-10)
        peak_db = 20 * np.log10(peak + 1e-10)
        return {
            "duration_ms": duration_ms,
            "sample_count": count,
            "rms_db": round(float(rms_db), 1),
            "peak_db": round(float(peak_db), 1),
            "sample_rate": self.sample_rate,
        }