This repository has been archived on 2026-02-15. You can view files and clone it, but you cannot open issues, create pull requests, or push commits.
Files
breakpilot-pwa/backend/transcription_worker/transcriber.py
Benjamin Admin 21a844cb8a fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.

This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).

Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-09 09:51:32 +01:00

212 lines
6.2 KiB
Python

"""
BreakPilot Whisper Transcriber
Uses faster-whisper (MIT License) for GPU-optimized transcription.
Based on CTranslate2 for fast inference.
"""
import os
import structlog
from typing import List, Dict, Optional
log = structlog.get_logger(__name__)
class WhisperTranscriber:
    """
    Whisper-based audio transcription using faster-whisper.

    faster-whisper is a reimplementation of OpenAI Whisper using CTranslate2,
    which is significantly faster than the original implementation.

    License: MIT
    Source: https://github.com/SYSTRAN/faster-whisper
    """

    def __init__(
        self,
        model_name: str = "large-v3",
        device: str = "cpu",
        compute_type: str = "int8"
    ):
        """
        Initialize the transcriber.

        Args:
            model_name: Whisper model to use (tiny, base, small, medium, large-v3)
            device: Device to run on ("cpu", "cuda", "auto")
            compute_type: Quantization type ("int8", "float16", "float32")
        """
        self.model_name = model_name
        self.device = device
        self.compute_type = compute_type
        # Loaded lazily on first use so construction stays cheap and does
        # not require faster-whisper to be installed.
        self._model = None

    def _load_model(self) -> None:
        """Lazily load the faster-whisper model on first use (idempotent).

        Raises:
            ImportError: If the faster-whisper package is not installed.
        """
        if self._model is not None:
            return
        # Keep the try body minimal: only the import can legitimately raise
        # the "not installed" ImportError. A failure while constructing the
        # model should propagate as-is, not be relabeled as a missing package.
        try:
            from faster_whisper import WhisperModel
        except ImportError as exc:
            log.error("faster_whisper_not_installed")
            raise ImportError(
                "faster-whisper is not installed. "
                "Install with: pip install faster-whisper"
            ) from exc
        log.info(
            "loading_whisper_model",
            model=self.model_name,
            device=self.device,
            compute_type=self.compute_type
        )
        self._model = WhisperModel(
            self.model_name,
            device=self.device,
            compute_type=self.compute_type
        )
        log.info("whisper_model_loaded")

    def transcribe(
        self,
        audio_path: str,
        language: str = "de",
        beam_size: int = 5,
        word_timestamps: bool = True,
        vad_filter: bool = True,
        vad_parameters: Optional[dict] = None
    ) -> List[Dict]:
        """
        Transcribe an audio file.

        Args:
            audio_path: Path to audio file (WAV, MP3, etc.)
            language: Language code (de, en, fr, etc.) or None for auto-detection
            beam_size: Beam size for decoding (higher = better but slower)
            word_timestamps: Include word-level timestamps
            vad_filter: Enable Voice Activity Detection to filter silence
            vad_parameters: Custom VAD parameters

        Returns:
            List of segments with text, timestamps, and confidence scores.

        Raises:
            FileNotFoundError: If audio_path does not exist.
            ImportError: If faster-whisper is not installed.
        """
        self._load_model()
        if not os.path.exists(audio_path):
            raise FileNotFoundError(f"Audio file not found: {audio_path}")
        log.info(
            "transcribing_audio",
            audio_path=audio_path,
            language=language,
            beam_size=beam_size
        )
        # Default VAD parameters for better speech detection
        if vad_parameters is None:
            vad_parameters = {
                "min_silence_duration_ms": 500,
                "speech_pad_ms": 400
            }
        # Run transcription. faster-whisper returns a lazy generator of
        # segments plus an eagerly-computed TranscriptionInfo.
        segments_gen, info = self._model.transcribe(
            audio_path,
            language=language,
            beam_size=beam_size,
            word_timestamps=word_timestamps,
            vad_filter=vad_filter,
            vad_parameters=vad_parameters
        )
        log.info(
            "transcription_info",
            detected_language=info.language,
            language_probability=info.language_probability,
            duration=info.duration
        )
        # Materialize the generator into plain dicts.
        segments = []
        for i, segment in enumerate(segments_gen):
            seg_dict = {
                "index": i,
                "start_time_ms": int(segment.start * 1000),
                "end_time_ms": int(segment.end * 1000),
                "text": segment.text.strip(),
                # BUGFIX: compare against None explicitly. avg_logprob is a
                # log-probability whose best value is 0.0, which is falsy --
                # a truthiness check would wrongly report it as None.
                "confidence": (
                    round(segment.avg_logprob, 3)
                    if segment.avg_logprob is not None
                    else None
                ),
                "no_speech_prob": segment.no_speech_prob
            }
            # Add word-level timestamps if available
            if word_timestamps and segment.words:
                seg_dict["words"] = [
                    {
                        "word": word.word,
                        "start": int(word.start * 1000),
                        "end": int(word.end * 1000),
                        "probability": round(word.probability, 3)
                    }
                    for word in segment.words
                ]
            segments.append(seg_dict)
        log.info(
            "transcription_complete",
            segments_count=len(segments),
            duration_seconds=info.duration
        )
        return segments

    def detect_language(self, audio_path: str) -> dict:
        """
        Detect the language of an audio file.

        Args:
            audio_path: Path to audio file

        Returns:
            dict with "language" code and "probability" keys.

        Raises:
            FileNotFoundError: If audio_path does not exist.
            ImportError: If faster-whisper is not installed.
        """
        self._load_model()
        if not os.path.exists(audio_path):
            raise FileNotFoundError(f"Audio file not found: {audio_path}")
        # Language detection is part of TranscriptionInfo, which is computed
        # eagerly (from the first 30 seconds of audio) -- the segment
        # generator itself never needs to be consumed here.
        _, info = self._model.transcribe(
            audio_path,
            language=None,  # Auto-detect
            beam_size=1,
            without_timestamps=True
        )
        return {
            "language": info.language,
            "probability": info.language_probability
        }

    @property
    def available_languages(self) -> List[str]:
        """List of supported language codes."""
        return [
            "de", "en", "fr", "es", "it", "pt", "nl", "pl", "ru",
            "zh", "ja", "ko", "ar", "tr", "hi", "vi", "th", "id"
        ]

    def get_model_info(self) -> dict:
        """Get information about the configured (and possibly loaded) model."""
        return {
            "model_name": self.model_name,
            "device": self.device,
            "compute_type": self.compute_type,
            "loaded": self._model is not None
        }