fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.
This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).
Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
backend/transcription_worker/transcriber.py (new file, 211 lines)
@@ -0,0 +1,211 @@
|
||||
"""
|
||||
BreakPilot Whisper Transcriber
|
||||
|
||||
Uses faster-whisper (MIT License) for GPU-optimized transcription.
|
||||
Based on CTranslate2 for fast inference.
|
||||
"""
|
||||
|
||||
import os
|
||||
import structlog
|
||||
from typing import List, Dict, Optional
|
||||
|
||||
log = structlog.get_logger(__name__)
|
||||
|
||||
|
||||
class WhisperTranscriber:
    """
    Whisper-based audio transcription using faster-whisper.

    faster-whisper is a reimplementation of OpenAI Whisper using CTranslate2,
    which is significantly faster than the original implementation.

    License: MIT
    Source: https://github.com/SYSTRAN/faster-whisper
    """

    def __init__(
        self,
        model_name: str = "large-v3",
        device: str = "cpu",
        compute_type: str = "int8"
    ):
        """
        Initialize the transcriber.

        Args:
            model_name: Whisper model to use (tiny, base, small, medium, large-v3)
            device: Device to run on ("cpu", "cuda", "auto")
            compute_type: Quantization type ("int8", "float16", "float32")
        """
        self.model_name = model_name
        self.device = device
        self.compute_type = compute_type
        # Loaded lazily on first use so construction stays cheap and
        # faster-whisper is only required when transcription actually runs.
        self._model = None

    def _load_model(self):
        """
        Lazy load the model on first use.

        Raises:
            ImportError: If faster-whisper is not installed.
        """
        if self._model is not None:
            return

        try:
            from faster_whisper import WhisperModel

            log.info(
                "loading_whisper_model",
                model=self.model_name,
                device=self.device,
                compute_type=self.compute_type
            )

            self._model = WhisperModel(
                self.model_name,
                device=self.device,
                compute_type=self.compute_type
            )

            log.info("whisper_model_loaded")

        except ImportError as err:
            log.error("faster_whisper_not_installed")
            # Chain the original error so the traceback shows the root cause.
            raise ImportError(
                "faster-whisper is not installed. "
                "Install with: pip install faster-whisper"
            ) from err

    def transcribe(
        self,
        audio_path: str,
        language: str = "de",
        beam_size: int = 5,
        word_timestamps: bool = True,
        vad_filter: bool = True,
        vad_parameters: Optional[dict] = None
    ) -> List[Dict]:
        """
        Transcribe an audio file.

        Args:
            audio_path: Path to audio file (WAV, MP3, etc.)
            language: Language code (de, en, fr, etc.) or None for auto-detection
            beam_size: Beam size for decoding (higher = better but slower)
            word_timestamps: Include word-level timestamps
            vad_filter: Enable Voice Activity Detection to filter silence
            vad_parameters: Custom VAD parameters

        Returns:
            List of segments with text, timestamps, and confidence scores

        Raises:
            FileNotFoundError: If audio_path does not exist.
            ImportError: If faster-whisper is not installed.
        """
        self._load_model()

        if not os.path.exists(audio_path):
            raise FileNotFoundError(f"Audio file not found: {audio_path}")

        log.info(
            "transcribing_audio",
            audio_path=audio_path,
            language=language,
            beam_size=beam_size
        )

        # Default VAD parameters for better speech detection
        if vad_parameters is None:
            vad_parameters = {
                "min_silence_duration_ms": 500,
                "speech_pad_ms": 400
            }

        # Run transcription
        segments_gen, info = self._model.transcribe(
            audio_path,
            language=language,
            beam_size=beam_size,
            word_timestamps=word_timestamps,
            vad_filter=vad_filter,
            vad_parameters=vad_parameters
        )

        log.info(
            "transcription_info",
            detected_language=info.language,
            language_probability=info.language_probability,
            duration=info.duration
        )

        # Convert generator to list of segments
        segments = []
        for i, segment in enumerate(segments_gen):
            seg_dict = {
                "index": i,
                "start_time_ms": int(segment.start * 1000),
                "end_time_ms": int(segment.end * 1000),
                "text": segment.text.strip(),
                # avg_logprob can legitimately be 0.0, so test against None
                # rather than truthiness (a truthiness check would drop 0.0).
                "confidence": round(segment.avg_logprob, 3) if segment.avg_logprob is not None else None,
                "no_speech_prob": segment.no_speech_prob
            }

            # Add word-level timestamps if available
            if word_timestamps and segment.words:
                seg_dict["words"] = [
                    {
                        "word": word.word,
                        "start": int(word.start * 1000),
                        "end": int(word.end * 1000),
                        "probability": round(word.probability, 3)
                    }
                    for word in segment.words
                ]

            segments.append(seg_dict)

        log.info(
            "transcription_complete",
            segments_count=len(segments),
            duration_seconds=info.duration
        )

        return segments

    def detect_language(self, audio_path: str) -> dict:
        """
        Detect the language of an audio file.

        Args:
            audio_path: Path to audio file

        Returns:
            dict with language code and probability

        Raises:
            FileNotFoundError: If audio_path does not exist.
            ImportError: If faster-whisper is not installed.
        """
        self._load_model()

        if not os.path.exists(audio_path):
            raise FileNotFoundError(f"Audio file not found: {audio_path}")

        # faster-whisper detects the language from the first audio window;
        # beam_size=1 keeps this probe cheap.
        _, info = self._model.transcribe(
            audio_path,
            language=None,  # Auto-detect
            beam_size=1,
            without_timestamps=True
        )

        return {
            "language": info.language,
            "probability": info.language_probability
        }

    @property
    def available_languages(self) -> List[str]:
        """List of supported languages."""
        return [
            "de", "en", "fr", "es", "it", "pt", "nl", "pl", "ru",
            "zh", "ja", "ko", "ar", "tr", "hi", "vi", "th", "id"
        ]

    def get_model_info(self) -> dict:
        """Get information about the loaded model."""
        return {
            "model_name": self.model_name,
            "device": self.device,
            "compute_type": self.compute_type,
            "loaded": self._model is not None
        }
|
||||
Reference in New Issue
Block a user