""" BreakPilot Whisper Transcriber Uses faster-whisper (MIT License) for GPU-optimized transcription. Based on CTranslate2 for fast inference. """ import os import structlog from typing import List, Dict, Optional log = structlog.get_logger(__name__) class WhisperTranscriber: """ Whisper-based audio transcription using faster-whisper. faster-whisper is a reimplementation of OpenAI Whisper using CTranslate2, which is significantly faster than the original implementation. License: MIT Source: https://github.com/SYSTRAN/faster-whisper """ def __init__( self, model_name: str = "large-v3", device: str = "cpu", compute_type: str = "int8" ): """ Initialize the transcriber. Args: model_name: Whisper model to use (tiny, base, small, medium, large-v3) device: Device to run on ("cpu", "cuda", "auto") compute_type: Quantization type ("int8", "float16", "float32") """ self.model_name = model_name self.device = device self.compute_type = compute_type self._model = None def _load_model(self): """Lazy load the model on first use.""" if self._model is not None: return try: from faster_whisper import WhisperModel log.info( "loading_whisper_model", model=self.model_name, device=self.device, compute_type=self.compute_type ) self._model = WhisperModel( self.model_name, device=self.device, compute_type=self.compute_type ) log.info("whisper_model_loaded") except ImportError: log.error("faster_whisper_not_installed") raise ImportError( "faster-whisper is not installed. " "Install with: pip install faster-whisper" ) def transcribe( self, audio_path: str, language: str = "de", beam_size: int = 5, word_timestamps: bool = True, vad_filter: bool = True, vad_parameters: Optional[dict] = None ) -> List[Dict]: """ Transcribe an audio file. Args: audio_path: Path to audio file (WAV, MP3, etc.) language: Language code (de, en, fr, etc.) or None for auto-detection beam_size: Beam size for decoding (higher = better but slower) word_timestamps: Include word-level timestamps vad_filter: Enable Voice Activity Detection to filter silence vad_parameters: Custom VAD parameters Returns: List of segments with text, timestamps, and confidence scores """ self._load_model() if not os.path.exists(audio_path): raise FileNotFoundError(f"Audio file not found: {audio_path}") log.info( "transcribing_audio", audio_path=audio_path, language=language, beam_size=beam_size ) # Default VAD parameters for better speech detection if vad_parameters is None: vad_parameters = { "min_silence_duration_ms": 500, "speech_pad_ms": 400 } # Run transcription segments_gen, info = self._model.transcribe( audio_path, language=language, beam_size=beam_size, word_timestamps=word_timestamps, vad_filter=vad_filter, vad_parameters=vad_parameters ) log.info( "transcription_info", detected_language=info.language, language_probability=info.language_probability, duration=info.duration ) # Convert generator to list of segments segments = [] for i, segment in enumerate(segments_gen): seg_dict = { "index": i, "start_time_ms": int(segment.start * 1000), "end_time_ms": int(segment.end * 1000), "text": segment.text.strip(), "confidence": round(segment.avg_logprob, 3) if segment.avg_logprob else None, "no_speech_prob": segment.no_speech_prob } # Add word-level timestamps if available if word_timestamps and segment.words: seg_dict["words"] = [ { "word": word.word, "start": int(word.start * 1000), "end": int(word.end * 1000), "probability": round(word.probability, 3) } for word in segment.words ] segments.append(seg_dict) log.info( "transcription_complete", segments_count=len(segments), duration_seconds=info.duration ) return segments def detect_language(self, audio_path: str) -> dict: """ Detect the language of an audio file. Args: audio_path: Path to audio file Returns: dict with language code and probability """ self._load_model() if not os.path.exists(audio_path): raise FileNotFoundError(f"Audio file not found: {audio_path}") # Transcribe first 30 seconds to detect language _, info = self._model.transcribe( audio_path, language=None, # Auto-detect beam_size=1, without_timestamps=True ) return { "language": info.language, "probability": info.language_probability } @property def available_languages(self) -> List[str]: """List of supported languages.""" return [ "de", "en", "fr", "es", "it", "pt", "nl", "pl", "ru", "zh", "ja", "ko", "ar", "tr", "hi", "vi", "th", "id" ] def get_model_info(self) -> dict: """Get information about the loaded model.""" return { "model_name": self.model_name, "device": self.device, "compute_type": self.compute_type, "loaded": self._model is not None }