fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.
This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).
Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
backend/transcription_worker/transcriber.py (new file, 211 lines)
@@ -0,0 +1,211 @@
|
||||
"""
|
||||
BreakPilot Whisper Transcriber
|
||||
|
||||
Uses faster-whisper (MIT License) for GPU-optimized transcription.
|
||||
Based on CTranslate2 for fast inference.
|
||||
"""
|
||||
|
||||
import os
|
||||
import structlog
|
||||
from typing import List, Dict, Optional
|
||||
|
||||
log = structlog.get_logger(__name__)
|
||||
|
||||
|
||||
class WhisperTranscriber:
    """
    Whisper-based audio transcription using faster-whisper.

    faster-whisper is a reimplementation of OpenAI Whisper using CTranslate2,
    which is significantly faster than the original implementation.

    License: MIT
    Source: https://github.com/SYSTRAN/faster-whisper
    """

    def __init__(
        self,
        model_name: str = "large-v3",
        device: str = "cpu",
        compute_type: str = "int8"
    ):
        """
        Initialize the transcriber.

        Args:
            model_name: Whisper model to use (tiny, base, small, medium, large-v3)
            device: Device to run on ("cpu", "cuda", "auto")
            compute_type: Quantization type ("int8", "float16", "float32")
        """
        self.model_name = model_name
        self.device = device
        self.compute_type = compute_type
        # Loaded lazily on first use so construction stays cheap and
        # faster-whisper is only required when transcription actually runs.
        self._model = None

    def _load_model(self):
        """
        Lazy load the model on first use.

        Raises:
            ImportError: If faster-whisper is not installed.
        """
        if self._model is not None:
            return

        try:
            from faster_whisper import WhisperModel

            log.info(
                "loading_whisper_model",
                model=self.model_name,
                device=self.device,
                compute_type=self.compute_type
            )

            self._model = WhisperModel(
                self.model_name,
                device=self.device,
                compute_type=self.compute_type
            )

            log.info("whisper_model_loaded")

        except ImportError as err:
            log.error("faster_whisper_not_installed")
            # Chain the original error so the traceback shows the root cause.
            raise ImportError(
                "faster-whisper is not installed. "
                "Install with: pip install faster-whisper"
            ) from err

    def transcribe(
        self,
        audio_path: str,
        language: str = "de",
        beam_size: int = 5,
        word_timestamps: bool = True,
        vad_filter: bool = True,
        vad_parameters: Optional[dict] = None
    ) -> List[Dict]:
        """
        Transcribe an audio file.

        Args:
            audio_path: Path to audio file (WAV, MP3, etc.)
            language: Language code (de, en, fr, etc.) or None for auto-detection
            beam_size: Beam size for decoding (higher = better but slower)
            word_timestamps: Include word-level timestamps
            vad_filter: Enable Voice Activity Detection to filter silence
            vad_parameters: Custom VAD parameters

        Returns:
            List of segments with text, timestamps, and confidence scores

        Raises:
            FileNotFoundError: If audio_path does not exist.
            ImportError: If faster-whisper is not installed.
        """
        self._load_model()

        if not os.path.exists(audio_path):
            raise FileNotFoundError(f"Audio file not found: {audio_path}")

        log.info(
            "transcribing_audio",
            audio_path=audio_path,
            language=language,
            beam_size=beam_size
        )

        # Default VAD parameters for better speech detection
        if vad_parameters is None:
            vad_parameters = {
                "min_silence_duration_ms": 500,
                "speech_pad_ms": 400
            }

        # Run transcription
        segments_gen, info = self._model.transcribe(
            audio_path,
            language=language,
            beam_size=beam_size,
            word_timestamps=word_timestamps,
            vad_filter=vad_filter,
            vad_parameters=vad_parameters
        )

        log.info(
            "transcription_info",
            detected_language=info.language,
            language_probability=info.language_probability,
            duration=info.duration
        )

        # Convert generator to list of segments
        segments = []
        for i, segment in enumerate(segments_gen):
            seg_dict = {
                "index": i,
                "start_time_ms": int(segment.start * 1000),
                "end_time_ms": int(segment.end * 1000),
                "text": segment.text.strip(),
                # avg_logprob can legitimately be 0.0, so test against None
                # rather than truthiness (a truthiness check would drop 0.0).
                "confidence": round(segment.avg_logprob, 3) if segment.avg_logprob is not None else None,
                "no_speech_prob": segment.no_speech_prob
            }

            # Add word-level timestamps if available
            if word_timestamps and segment.words:
                seg_dict["words"] = [
                    {
                        "word": word.word,
                        "start": int(word.start * 1000),
                        "end": int(word.end * 1000),
                        "probability": round(word.probability, 3)
                    }
                    for word in segment.words
                ]

            segments.append(seg_dict)

        log.info(
            "transcription_complete",
            segments_count=len(segments),
            duration_seconds=info.duration
        )

        return segments

    def detect_language(self, audio_path: str) -> dict:
        """
        Detect the language of an audio file.

        Args:
            audio_path: Path to audio file

        Returns:
            dict with language code and probability

        Raises:
            FileNotFoundError: If audio_path does not exist.
            ImportError: If faster-whisper is not installed.
        """
        self._load_model()

        if not os.path.exists(audio_path):
            raise FileNotFoundError(f"Audio file not found: {audio_path}")

        # faster-whisper detects the language from the first audio window;
        # beam_size=1 keeps this probe cheap.
        _, info = self._model.transcribe(
            audio_path,
            language=None,  # Auto-detect
            beam_size=1,
            without_timestamps=True
        )

        return {
            "language": info.language,
            "probability": info.language_probability
        }

    @property
    def available_languages(self) -> List[str]:
        """List of supported languages."""
        return [
            "de", "en", "fr", "es", "it", "pt", "nl", "pl", "ru",
            "zh", "ja", "ko", "ar", "tr", "hi", "vi", "th", "id"
        ]

    def get_model_info(self) -> dict:
        """Get information about the loaded model."""
        return {
            "model_name": self.model_name,
            "device": self.device,
            "compute_type": self.compute_type,
            "loaded": self._model is not None
        }
|
||||
Reference in New Issue
Block a user