# breakpilot-pwa/backend/transcription_worker/diarizer.py

"""
BreakPilot Speaker Diarizer

Uses pyannote.audio (MIT License) for speaker diarization.
Identifies who spoke when in an audio recording.
"""

import os
import structlog
from typing import List, Dict, Optional

log = structlog.get_logger(__name__)


class SpeakerDiarizer:
    """
    Speaker diarization using pyannote.audio.

    Identifies distinct speakers in an audio recording and provides
    timestamp information for when each speaker is talking.

    The pyannote pipeline is loaded lazily on the first call to
    ``diarize()``, so constructing this object is cheap and does not
    require pyannote.audio or torch to be importable.

    License: MIT
    Source: https://github.com/pyannote/pyannote-audio

    Note: Requires a HuggingFace token with access to pyannote models.
    Accept the conditions at:
    https://huggingface.co/pyannote/speaker-diarization-3.1
    """

    # Checkpoint passed to Pipeline.from_pretrained().
    _MODEL_ID = "pyannote/speaker-diarization-3.1"

    def __init__(
        self,
        auth_token: Optional[str] = None,
        device: str = "auto"
    ):
        """
        Initialize the diarizer.

        Args:
            auth_token: HuggingFace token with pyannote access. Falls back
                to the PYANNOTE_AUTH_TOKEN environment variable when empty.
            device: Device to run on ("cpu", "cuda", "auto")
        """
        self.auth_token = auth_token or os.getenv("PYANNOTE_AUTH_TOKEN")
        self.device = device
        self._pipeline = None

        # Only warn here; the hard failure is deferred to _load_pipeline()
        # so the object can still be constructed and queried.
        if not self.auth_token:
            log.warning(
                "pyannote_token_missing",
                message="Speaker diarization requires a HuggingFace token"
            )

    def _load_pipeline(self):
        """
        Lazy load the diarization pipeline.

        Does nothing when the pipeline is already loaded. The pipeline is
        only stored on the instance once it has been fully initialized
        (downloaded and moved to the target device).

        Raises:
            ValueError: If no HuggingFace token is configured.
            ImportError: If pyannote.audio (or torch) cannot be imported.
            RuntimeError: If the pretrained pipeline could not be obtained.
        """
        if self._pipeline is not None:
            return

        if not self.auth_token:
            raise ValueError(
                "HuggingFace token required for pyannote.audio. "
                "Set PYANNOTE_AUTH_TOKEN environment variable."
            )

        # Keep the try body limited to the imports so that unrelated
        # failures further down are not misreported as "not installed".
        try:
            from pyannote.audio import Pipeline
            import torch
        except ImportError as exc:
            log.error("pyannote_not_installed")
            raise ImportError(
                "pyannote.audio is not installed. "
                "Install with: pip install pyannote.audio"
            ) from exc

        log.info("loading_pyannote_pipeline", device=self.device)

        # Load pre-trained speaker diarization pipeline
        pipeline = Pipeline.from_pretrained(
            self._MODEL_ID,
            use_auth_token=self.auth_token
        )

        # pyannote.audio 3.x signals a gated/unauthorized download by
        # returning None instead of raising; surface that explicitly
        # rather than failing later with an AttributeError on None.
        if pipeline is None:
            log.error("pyannote_pipeline_unavailable", model=self._MODEL_ID)
            raise RuntimeError(
                f"Could not load '{self._MODEL_ID}'. Check that the "
                "HuggingFace token is valid and that the model's user "
                "conditions have been accepted."
            )

        # Move to appropriate device
        if self.device == "auto":
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        else:
            device = torch.device(self.device)

        pipeline.to(device)

        # Publish only after the device move succeeded, so a failed load
        # is retried on the next call instead of leaving a half-ready
        # pipeline behind.
        self._pipeline = pipeline

        log.info("pyannote_pipeline_loaded", device=str(device))

    def diarize(
        self,
        audio_path: str,
        num_speakers: Optional[int] = None,
        min_speakers: Optional[int] = None,
        max_speakers: Optional[int] = None
    ) -> List[Dict]:
        """
        Perform speaker diarization on an audio file.

        Args:
            audio_path: Path to audio file (WAV recommended)
            num_speakers: Exact number of speakers (if known)
            min_speakers: Minimum number of speakers
            max_speakers: Maximum number of speakers

        Returns:
            List of speaker segments. Each segment is a dict with the keys
            "speaker_id", "start_time_ms", "end_time_ms" and "duration_ms",
            where duration_ms == end_time_ms - start_time_ms.

        Raises:
            FileNotFoundError: If audio_path does not exist.
            ValueError: If no HuggingFace token is configured.
            ImportError: If pyannote.audio is not installed.
            RuntimeError: If the pretrained pipeline could not be obtained.
        """
        # Fail fast on a bad path before paying for the model load.
        if not os.path.exists(audio_path):
            raise FileNotFoundError(f"Audio file not found: {audio_path}")

        self._load_pipeline()

        log.info(
            "starting_diarization",
            audio_path=audio_path,
            num_speakers=num_speakers
        )

        # Run diarization
        diarization = self._pipeline(
            audio_path,
            num_speakers=num_speakers,
            min_speakers=min_speakers,
            max_speakers=max_speakers
        )

        # Convert to list of segments
        segments = []
        for turn, _, speaker in diarization.itertracks(yield_label=True):
            start_ms = int(turn.start * 1000)
            end_ms = int(turn.end * 1000)
            segments.append({
                "speaker_id": speaker,
                "start_time_ms": start_ms,
                "end_time_ms": end_ms,
                # Derived from the truncated timestamps so the three
                # fields always agree with each other.
                "duration_ms": end_ms - start_ms
            })

        # Get unique speakers (sorted for deterministic log output)
        unique_speakers = sorted({seg["speaker_id"] for seg in segments})

        log.info(
            "diarization_complete",
            segments_count=len(segments),
            speakers_count=len(unique_speakers),
            speakers=unique_speakers
        )

        return segments

    def get_speaker_stats(self, segments: List[Dict]) -> Dict:
        """
        Calculate speaking statistics per speaker.

        Args:
            segments: List of speaker segments from diarize(); each must
                provide "speaker_id" and "duration_ms".

        Returns:
            dict with the keys "speakers" (per-speaker total_time_ms,
            total_time_seconds and percentage), "total_speakers" and
            "total_duration_ms". The total is the sum of all segment
            durations.
        """
        speaker_times: Dict = {}
        for seg in segments:
            speaker = seg["speaker_id"]
            speaker_times[speaker] = speaker_times.get(speaker, 0) + seg["duration_ms"]

        total_time = sum(speaker_times.values())

        stats = {
            speaker: {
                "total_time_ms": time_ms,
                "total_time_seconds": round(time_ms / 1000, 1),
                # Guard against division by zero when every segment is empty.
                "percentage": round((time_ms / total_time) * 100, 1) if total_time > 0 else 0
            }
            for speaker, time_ms in speaker_times.items()
        }

        return {
            "speakers": stats,
            "total_speakers": len(stats),
            "total_duration_ms": total_time
        }

    def is_available(self) -> bool:
        """Check if diarization is available (token configured)."""
        return bool(self.auth_token)

    def get_pipeline_info(self) -> dict:
        """
        Get information about the pipeline.

        Returns:
            dict with "available" (token configured), "device" (the
            device string as requested at construction) and "loaded"
            (whether the pipeline has been loaded yet).
        """
        return {
            "available": self.is_available(),
            "device": self.device,
            "loaded": self._pipeline is not None
        }