fix: Restore all files lost during destructive rebase

A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.

This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).

Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-02-09 09:51:32 +01:00
parent f7487ee240
commit 21a844cb8a
1986 changed files with 744143 additions and 1731 deletions

View File

@@ -0,0 +1,24 @@
"""
BreakPilot Transcription Worker
Asynchronous processing of meeting recordings using:
- faster-whisper for transcription (MIT License)
- pyannote.audio for speaker diarization (MIT License)
All components are open source and commercially usable.
"""
__version__ = "1.0.0"
__author__ = "BreakPilot Team"
from .transcriber import WhisperTranscriber
from .diarizer import SpeakerDiarizer
from .aligner import TranscriptAligner
from .storage import MinIOStorage
__all__ = [
"WhisperTranscriber",
"SpeakerDiarizer",
"TranscriptAligner",
"MinIOStorage"
]

View File

@@ -0,0 +1,202 @@
"""
BreakPilot Transcript Aligner
Aligns Whisper transcription segments with pyannote speaker diarization.
Assigns speaker IDs to each transcribed segment.
"""
import structlog
from typing import List, Dict, Optional
from collections import defaultdict
log = structlog.get_logger(__name__)
class TranscriptAligner:
    """
    Aligns transcription segments with speaker diarization results.

    Uses overlap-based matching to assign speaker IDs to each
    transcribed segment. Handles cases where speakers change
    mid-sentence.
    """
    def __init__(self):
        """Initialize the aligner with no speakers seen yet."""
        self._speaker_count = 0
        self._speaker_map = {}  # Maps pyannote IDs to friendly names
    def align(
        self,
        transcription_segments: List[Dict],
        diarization_segments: List[Dict],
        min_overlap_ratio: float = 0.3
    ) -> List[Dict]:
        """
        Align transcription with speaker diarization.
        Args:
            transcription_segments: List of segments from Whisper
                (each with "start_time_ms" / "end_time_ms")
            diarization_segments: List of segments from pyannote
                (each with "speaker_id" and the same ms keys)
            min_overlap_ratio: Minimum overlap ratio to assign speaker
        Returns:
            Copies of the transcription segments with "speaker_id" added
            (None when no speaker reached the overlap threshold)
        """
        if not diarization_segments:
            log.warning("no_diarization_segments", message="Returning transcription without speakers")
            return transcription_segments
        log.info(
            "aligning_transcription",
            transcription_count=len(transcription_segments),
            diarization_count=len(diarization_segments)
        )
        # Rebuild the speaker mapping from scratch so repeated align()
        # calls on the same instance do not keep stale entries from a
        # previously processed recording.
        self._speaker_map = {}
        unique_speakers = {s["speaker_id"] for s in diarization_segments}
        self._speaker_count = len(unique_speakers)
        for i, speaker in enumerate(sorted(unique_speakers)):
            self._speaker_map[speaker] = f"SPEAKER_{i:02d}"
        # Assign the best-overlapping speaker to each transcription segment.
        aligned_segments = []
        for trans_seg in transcription_segments:
            speaker_id = self._find_speaker_for_segment(
                trans_seg,
                diarization_segments,
                min_overlap_ratio
            )
            aligned_seg = trans_seg.copy()
            aligned_seg["speaker_id"] = speaker_id
            aligned_segments.append(aligned_seg)
        # Log per-speaker segment counts for observability.
        speaker_counts = defaultdict(int)
        for seg in aligned_segments:
            speaker_counts[seg.get("speaker_id", "UNKNOWN")] += 1
        log.info(
            "alignment_complete",
            speakers=dict(speaker_counts),
            total_speakers=self._speaker_count
        )
        return aligned_segments
    def _find_speaker_for_segment(
        self,
        trans_seg: Dict,
        diarization_segments: List[Dict],
        min_overlap_ratio: float
    ) -> Optional[str]:
        """
        Find the best matching speaker for a transcription segment.

        Picks the speaker whose diarization turn has the largest absolute
        overlap with the segment; returns None for zero-length segments,
        no overlap at all, or an overlap ratio below the threshold.
        """
        trans_start = trans_seg["start_time_ms"]
        trans_end = trans_seg["end_time_ms"]
        trans_duration = trans_end - trans_start
        if trans_duration <= 0:
            return None
        # Collect every diarization turn that intersects this segment.
        overlaps = []
        for diar_seg in diarization_segments:
            diar_start = diar_seg["start_time_ms"]
            diar_end = diar_seg["end_time_ms"]
            overlap_start = max(trans_start, diar_start)
            overlap_end = min(trans_end, diar_end)
            overlap_duration = max(0, overlap_end - overlap_start)
            if overlap_duration > 0:
                overlaps.append({
                    "speaker_id": diar_seg["speaker_id"],
                    "overlap_duration": overlap_duration,
                    "overlap_ratio": overlap_duration / trans_duration
                })
        if not overlaps:
            return None
        # Best match = largest absolute overlap; ratio gates acceptance.
        best_match = max(overlaps, key=lambda x: x["overlap_duration"])
        if best_match["overlap_ratio"] >= min_overlap_ratio:
            original_id = best_match["speaker_id"]
            # Fall back to the raw pyannote ID if not in the friendly map
            # (e.g. when called before align() built the mapping).
            return self._speaker_map.get(original_id, original_id)
        return None
    def get_speaker_count(self) -> int:
        """Get the number of unique speakers detected by the last align()."""
        return self._speaker_count
    def get_speaker_mapping(self) -> Dict[str, str]:
        """Get a copy of the mapping from pyannote IDs to friendly names."""
        return self._speaker_map.copy()
    def merge_consecutive_segments(
        self,
        segments: List[Dict],
        max_gap_ms: int = 1000,
        same_speaker_only: bool = True
    ) -> List[Dict]:
        """
        Merge consecutive segments that are close together.
        Useful for creating cleaner subtitle output.
        Args:
            segments: List of aligned segments
            max_gap_ms: Maximum gap between segments to merge
            same_speaker_only: Only merge if same speaker
        Returns:
            List of merged segments (input segments are not mutated)
        """
        if not segments:
            return []
        merged = []
        current = segments[0].copy()
        for next_seg in segments[1:]:
            gap = next_seg["start_time_ms"] - current["end_time_ms"]
            same_speaker = (
                not same_speaker_only or
                current.get("speaker_id") == next_seg.get("speaker_id")
            )
            if gap <= max_gap_ms and same_speaker:
                # Extend the running segment.
                current["end_time_ms"] = next_seg["end_time_ms"]
                current["text"] = current["text"] + " " + next_seg["text"]
                if "words" in current and "words" in next_seg:
                    # Build a NEW list instead of extending in place:
                    # `current` starts as a *shallow* copy, so .extend()
                    # would mutate the caller's original word list.
                    current["words"] = current["words"] + next_seg["words"]
            else:
                # Gap too large (or speaker changed): close out and restart.
                merged.append(current)
                current = next_seg.copy()
        # Don't forget the last segment
        merged.append(current)
        log.info(
            "segments_merged",
            original_count=len(segments),
            merged_count=len(merged)
        )
        return merged

View File

@@ -0,0 +1,197 @@
"""
BreakPilot Speaker Diarizer
Uses pyannote.audio (MIT License) for speaker diarization.
Identifies who spoke when in an audio recording.
"""
import os
import structlog
from typing import List, Dict, Optional
log = structlog.get_logger(__name__)
class SpeakerDiarizer:
    """
    Speaker diarization using pyannote.audio.

    Detects the distinct speakers in a recording and reports, per
    speaker turn, when that speaker is talking.

    License: MIT
    Source: https://github.com/pyannote/pyannote-audio

    Note: Requires a HuggingFace token with access to pyannote models.
    Accept the conditions at: https://huggingface.co/pyannote/speaker-diarization
    """
    def __init__(
        self,
        auth_token: Optional[str] = None,
        device: str = "auto"
    ):
        """
        Initialize the diarizer (the pipeline itself loads lazily).

        Args:
            auth_token: HuggingFace token with pyannote access; falls back
                to the PYANNOTE_AUTH_TOKEN environment variable
            device: Device to run on ("cpu", "cuda", "auto")
        """
        self.device = device
        self.auth_token = auth_token or os.getenv("PYANNOTE_AUTH_TOKEN")
        self._pipeline = None  # created on first diarize() call
        if not self.auth_token:
            log.warning(
                "pyannote_token_missing",
                message="Speaker diarization requires a HuggingFace token"
            )
    def _load_pipeline(self):
        """Load the diarization pipeline once, on first use."""
        if self._pipeline is not None:
            return
        if not self.auth_token:
            raise ValueError(
                "HuggingFace token required for pyannote.audio. "
                "Set PYANNOTE_AUTH_TOKEN environment variable."
            )
        try:
            import torch
            from pyannote.audio import Pipeline

            log.info("loading_pyannote_pipeline", device=self.device)
            # Pre-trained speaker diarization pipeline from the hub.
            self._pipeline = Pipeline.from_pretrained(
                "pyannote/speaker-diarization-3.1",
                use_auth_token=self.auth_token
            )
            # Resolve "auto" to CUDA when available, otherwise CPU.
            target = (
                torch.device("cuda" if torch.cuda.is_available() else "cpu")
                if self.device == "auto"
                else torch.device(self.device)
            )
            self._pipeline.to(target)
            log.info("pyannote_pipeline_loaded", device=str(target))
        except ImportError:
            log.error("pyannote_not_installed")
            raise ImportError(
                "pyannote.audio is not installed. "
                "Install with: pip install pyannote.audio"
            )
    def diarize(
        self,
        audio_path: str,
        num_speakers: Optional[int] = None,
        min_speakers: Optional[int] = None,
        max_speakers: Optional[int] = None
    ) -> List[Dict]:
        """
        Perform speaker diarization on an audio file.

        Args:
            audio_path: Path to audio file (WAV recommended)
            num_speakers: Exact number of speakers (if known)
            min_speakers: Minimum number of speakers
            max_speakers: Maximum number of speakers

        Returns:
            List of speaker segments with speaker ID and ms timestamps

        Raises:
            FileNotFoundError: if audio_path does not exist
            ValueError: if no HuggingFace token is configured
        """
        self._load_pipeline()
        if not os.path.exists(audio_path):
            raise FileNotFoundError(f"Audio file not found: {audio_path}")
        log.info(
            "starting_diarization",
            audio_path=audio_path,
            num_speakers=num_speakers
        )
        annotation = self._pipeline(
            audio_path,
            num_speakers=num_speakers,
            min_speakers=min_speakers,
            max_speakers=max_speakers
        )
        # Flatten the annotation into plain dicts with millisecond times.
        segments = [
            {
                "speaker_id": label,
                "start_time_ms": int(turn.start * 1000),
                "end_time_ms": int(turn.end * 1000),
                "duration_ms": int((turn.end - turn.start) * 1000)
            }
            for turn, _, label in annotation.itertracks(yield_label=True)
        ]
        found_speakers = set(s["speaker_id"] for s in segments)
        log.info(
            "diarization_complete",
            segments_count=len(segments),
            speakers_count=len(found_speakers),
            speakers=list(found_speakers)
        )
        return segments
    def get_speaker_stats(self, segments: List[Dict]) -> Dict:
        """
        Calculate speaking statistics per speaker.

        Args:
            segments: List of speaker segments from diarize()

        Returns:
            dict with speaking time and percentage per speaker
        """
        totals = {}
        for seg in segments:
            key = seg["speaker_id"]
            totals[key] = totals.get(key, 0) + seg["duration_ms"]
        grand_total = sum(totals.values())
        per_speaker = {
            speaker: {
                "total_time_ms": time_ms,
                "total_time_seconds": round(time_ms / 1000, 1),
                "percentage": round((time_ms / grand_total) * 100, 1) if grand_total > 0 else 0
            }
            for speaker, time_ms in totals.items()
        }
        return {
            "speakers": per_speaker,
            "total_speakers": len(per_speaker),
            "total_duration_ms": grand_total
        }
    def is_available(self) -> bool:
        """Check if diarization is available (token configured)."""
        return bool(self.auth_token)
    def get_pipeline_info(self) -> dict:
        """Get configuration and load state of the pipeline."""
        return {
            "available": self.is_available(),
            "device": self.device,
            "loaded": self._pipeline is not None
        }

View File

@@ -0,0 +1,291 @@
"""
BreakPilot Transcript Export
Functions to export transcription segments to various formats:
- WebVTT (for HTML5 video captions)
- SRT (universal subtitle format)
- JSON (full data with speakers and timestamps)
"""
import json
from typing import List, Dict, Any
from datetime import datetime
def ms_to_vtt_timestamp(ms: int) -> str:
    """
    Convert milliseconds to WebVTT timestamp format.

    Args:
        ms: Milliseconds

    Returns:
        Timestamp string (HH:MM:SS.mmm)
    """
    total_seconds, millis = divmod(ms, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{millis:03d}"
def ms_to_srt_timestamp(ms: int) -> str:
    """
    Convert milliseconds to SRT timestamp format.

    Args:
        ms: Milliseconds

    Returns:
        Timestamp string (HH:MM:SS,mmm)
    """
    total_seconds, millis = divmod(ms, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    # SRT uses comma as decimal separator
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{millis:03d}"
def export_to_vtt(
    segments: List[Dict],
    include_speakers: bool = True,
    header: str = "WEBVTT\nKind: captions\nLanguage: de\n"
) -> str:
    """
    Export segments to WebVTT format.

    Args:
        segments: List of transcription segments
        include_speakers: Include speaker labels as <v> voice tags
        header: VTT header text

    Returns:
        WebVTT formatted string
    """
    lines = [header]
    for cue_number, seg in enumerate(segments, start=1):
        cue_text = seg["text"]
        # Voice tag carries the speaker label when requested and present.
        if include_speakers and seg.get("speaker_id"):
            cue_text = f"<v {seg['speaker_id']}>{cue_text}"
        lines.append(f"\n{cue_number}")
        lines.append(
            f"{ms_to_vtt_timestamp(seg['start_time_ms'])} --> "
            f"{ms_to_vtt_timestamp(seg['end_time_ms'])}"
        )
        lines.append(cue_text)
    return "\n".join(lines) + "\n"
def export_to_srt(
    segments: List[Dict],
    include_speakers: bool = True
) -> str:
    """
    Export segments to SRT format.

    Args:
        segments: List of transcription segments
        include_speakers: Include speaker labels in text

    Returns:
        SRT formatted string
    """
    entries = []
    for sequence, seg in enumerate(segments, start=1):
        body = seg["text"]
        if include_speakers and seg.get("speaker_id"):
            body = f"[{seg['speaker_id']}] {body}"
        # Each entry: number, time range, text, then a blank separator line.
        entries.extend([
            str(sequence),
            f"{ms_to_srt_timestamp(seg['start_time_ms'])} --> "
            f"{ms_to_srt_timestamp(seg['end_time_ms'])}",
            body,
            "",
        ])
    return "\n".join(entries)
def export_to_json(
    segments: List[Dict],
    metadata: Dict[str, Any] = None
) -> str:
    """
    Export segments to JSON format with full metadata.

    Args:
        segments: List of transcription segments (each with
            "start_time_ms", "end_time_ms", "text", and optionally
            "speaker_id", "confidence", "words")
        metadata: Additional metadata to include (optional)

    Returns:
        JSON formatted string (2-space indent, non-ASCII preserved)
    """
    # Local import keeps the module-level import block unchanged.
    from datetime import timezone
    # Prepare export data
    export_data = {
        "version": "1.0",
        "format": "breakpilot-transcript",
        # datetime.utcnow() is deprecated (naive, Python 3.12+); use an
        # aware UTC timestamp and normalize the offset to the "Z" suffix.
        "generated_at": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
        "metadata": metadata or {},
        "segments": []
    }
    # Flatten each segment into a stable, explicit schema.
    for seg in segments:
        export_seg = {
            "index": seg.get("index", 0),
            "start_ms": seg["start_time_ms"],
            "end_ms": seg["end_time_ms"],
            "duration_ms": seg["end_time_ms"] - seg["start_time_ms"],
            "text": seg["text"],
            "speaker_id": seg.get("speaker_id"),
            "confidence": seg.get("confidence")
        }
        # Include word-level timestamps if available
        if "words" in seg:
            export_seg["words"] = seg["words"]
        export_data["segments"].append(export_seg)
    # Summary statistics over the exported segments.
    total_duration_ms = sum(s["duration_ms"] for s in export_data["segments"])
    total_words = sum(len(s["text"].split()) for s in export_data["segments"])
    unique_speakers = set(s["speaker_id"] for s in export_data["segments"] if s["speaker_id"])
    export_data["statistics"] = {
        "total_segments": len(export_data["segments"]),
        "total_duration_ms": total_duration_ms,
        "total_duration_seconds": round(total_duration_ms / 1000, 1),
        "total_words": total_words,
        "unique_speakers": len(unique_speakers),
        "speakers": list(unique_speakers)
    }
    return json.dumps(export_data, indent=2, ensure_ascii=False)
def export_to_txt(
    segments: List[Dict],
    include_timestamps: bool = False,
    include_speakers: bool = True,
    paragraph_gap_ms: int = 3000
) -> str:
    """
    Export segments to plain text format.

    Args:
        segments: List of transcription segments
        include_timestamps: Add timestamps
        include_speakers: Add speaker labels
        paragraph_gap_ms: Gap threshold for new paragraph

    Returns:
        Plain text formatted string
    """
    out_lines = []
    prev_end_ms = 0
    active_speaker = None
    for seg in segments:
        # A long silence starts a new paragraph (blank line).
        if out_lines and seg["start_time_ms"] - prev_end_ms > paragraph_gap_ms:
            out_lines.append("")
        pieces = []
        if include_timestamps:
            pieces.append(f"[{ms_to_vtt_timestamp(seg['start_time_ms'])}]")
        speaker = seg.get("speaker_id")
        # Label only when the speaker actually changes.
        if include_speakers and speaker and speaker != active_speaker:
            pieces.append(f"\n{speaker}:")
            active_speaker = speaker
        pieces.append(seg["text"])
        out_lines.append(" ".join(pieces))
        prev_end_ms = seg["end_time_ms"]
    return "\n".join(out_lines)
def create_chapters(
    segments: List[Dict],
    min_chapter_duration_ms: int = 60000,
    speaker_change_as_chapter: bool = True
) -> List[Dict]:
    """
    Create chapter markers from segments.

    Useful for video navigation and table of contents.

    Args:
        segments: List of transcription segments
        min_chapter_duration_ms: Minimum chapter duration
        speaker_change_as_chapter: Create chapter on speaker change

    Returns:
        List of chapter markers
    """
    if not segments:
        return []

    def _close_chapter(start_ms, words, speaker):
        # Chapter title is the first 5 words of the chapter text.
        return {
            "start_ms": start_ms,
            "title": " ".join(words[:5]) + "...",
            "speaker": speaker
        }

    chapters = []
    chapter_start = segments[0]["start_time_ms"]
    chapter_words = []
    active_speaker = segments[0].get("speaker_id")
    for seg in segments:
        seg_speaker = seg.get("speaker_id")
        # A chapter boundary is either enough elapsed time or (optionally)
        # a change of labelled speaker.
        boundary = (
            seg["start_time_ms"] - chapter_start >= min_chapter_duration_ms
            or (
                speaker_change_as_chapter
                and bool(seg_speaker)
                and seg_speaker != active_speaker
            )
        )
        if boundary:
            if chapter_words:
                chapters.append(_close_chapter(chapter_start, chapter_words, active_speaker))
            chapter_start = seg["start_time_ms"]
            chapter_words = []
            active_speaker = seg_speaker
        chapter_words.extend(seg["text"].split())
    # Flush the trailing chapter.
    if chapter_words:
        chapters.append(_close_chapter(chapter_start, chapter_words, active_speaker))
    return chapters

View File

@@ -0,0 +1,359 @@
"""
BreakPilot MinIO Storage Helper
Provides file upload/download operations for MinIO object storage.
"""
import os
import io
import structlog
from typing import Optional, BinaryIO
log = structlog.get_logger(__name__)
class MinIOStorage:
    """
    MinIO storage client for recordings and transcriptions.

    Provides upload, download, and management helpers for files in
    MinIO object storage (S3-compatible). The underlying client is
    created lazily, so constructing this class performs no I/O and
    does not require the ``minio`` package to be installed.
    """
    def __init__(
        self,
        endpoint: str = "minio:9000",
        access_key: str = "breakpilot",
        secret_key: str = "breakpilot123",
        bucket: str = "breakpilot-recordings",
        secure: bool = False
    ):
        """
        Store MinIO connection settings (no connection is made yet).

        Args:
            endpoint: MinIO server endpoint (host:port)
            access_key: Access key (username)
            secret_key: Secret key (password)
            bucket: Default bucket name
            secure: Use HTTPS

        SECURITY NOTE: the default credentials are development values;
        production deployments must pass real credentials (the calling
        code typically sources them from environment variables).
        """
        self.endpoint = endpoint
        self.access_key = access_key
        self.secret_key = secret_key
        self.bucket = bucket
        self.secure = secure
        self._client = None  # lazily created Minio client
    def _get_client(self):
        """Return the Minio client, creating it on first call."""
        if self._client is not None:
            return self._client
        try:
            from minio import Minio
        except ImportError:
            log.error("minio_not_installed")
            raise ImportError(
                "minio is not installed. "
                "Install with: pip install minio"
            )
        self._client = Minio(
            self.endpoint,
            access_key=self.access_key,
            secret_key=self.secret_key,
            secure=self.secure
        )
        log.info(
            "minio_client_initialized",
            endpoint=self.endpoint,
            bucket=self.bucket
        )
        return self._client
    def ensure_bucket(self) -> bool:
        """
        Ensure the default bucket exists, creating it if needed.

        Returns:
            True (the bucket exists after this call)
        """
        client = self._get_client()
        if not client.bucket_exists(self.bucket):
            client.make_bucket(self.bucket)
            log.info("bucket_created", bucket=self.bucket)
        return True
    def download_file(
        self,
        object_name: str,
        local_path: str,
        bucket: Optional[str] = None
    ) -> str:
        """
        Download a file from MinIO.

        Args:
            object_name: Path in MinIO bucket
            local_path: Local destination path
            bucket: Optional bucket override

        Returns:
            Local file path
        """
        client = self._get_client()
        bucket = bucket or self.bucket
        log.info(
            "downloading_file",
            bucket=bucket,
            object_name=object_name,
            local_path=local_path
        )
        # Only create the parent directory when the path has one:
        # os.makedirs("") raises FileNotFoundError for bare filenames.
        parent_dir = os.path.dirname(local_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        client.fget_object(bucket, object_name, local_path)
        log.info(
            "file_downloaded",
            object_name=object_name,
            local_path=local_path,
            size=os.path.getsize(local_path)
        )
        return local_path
    def upload_file(
        self,
        local_path: str,
        object_name: str,
        content_type: Optional[str] = None,
        bucket: Optional[str] = None
    ) -> str:
        """
        Upload a file to MinIO.

        Args:
            local_path: Local file path
            object_name: Destination path in MinIO
            content_type: MIME type (None leaves it to the client default)
            bucket: Optional bucket override

        Returns:
            Object name in MinIO
        """
        client = self._get_client()
        bucket = bucket or self.bucket
        # NOTE(review): ensure_bucket() only checks the *default* bucket;
        # an override bucket is assumed to already exist.
        self.ensure_bucket()
        log.info(
            "uploading_file",
            local_path=local_path,
            bucket=bucket,
            object_name=object_name
        )
        result = client.fput_object(
            bucket,
            object_name,
            local_path,
            content_type=content_type
        )
        log.info(
            "file_uploaded",
            object_name=object_name,
            etag=result.etag
        )
        return object_name
    def upload_content(
        self,
        content: str,
        object_name: str,
        content_type: str = "text/plain",
        bucket: Optional[str] = None
    ) -> str:
        """
        Upload string content directly to MinIO.

        Args:
            content: String content to upload (stored as UTF-8)
            object_name: Destination path in MinIO
            content_type: MIME type
            bucket: Optional bucket override

        Returns:
            Object name in MinIO
        """
        client = self._get_client()
        bucket = bucket or self.bucket
        # NOTE(review): as in upload_file, only the default bucket is
        # auto-created.
        self.ensure_bucket()
        # Encode once and stream from memory.
        data = content.encode("utf-8")
        data_stream = io.BytesIO(data)
        log.info(
            "uploading_content",
            bucket=bucket,
            object_name=object_name,
            size=len(data)
        )
        result = client.put_object(
            bucket,
            object_name,
            data_stream,
            length=len(data),
            content_type=content_type
        )
        log.info(
            "content_uploaded",
            object_name=object_name,
            etag=result.etag
        )
        return object_name
    def get_content(
        self,
        object_name: str,
        bucket: Optional[str] = None
    ) -> str:
        """
        Get string content from MinIO.

        Args:
            object_name: Path in MinIO bucket
            bucket: Optional bucket override

        Returns:
            File content decoded as UTF-8
        """
        client = self._get_client()
        bucket = bucket or self.bucket
        response = client.get_object(bucket, object_name)
        try:
            return response.read().decode("utf-8")
        finally:
            # Release the HTTP connection back to the pool even if the
            # read or decode fails.
            response.close()
            response.release_conn()
    def delete_file(
        self,
        object_name: str,
        bucket: Optional[str] = None
    ) -> bool:
        """
        Delete a file from MinIO.

        Args:
            object_name: Path in MinIO bucket
            bucket: Optional bucket override

        Returns:
            True if the delete request was issued
        """
        client = self._get_client()
        bucket = bucket or self.bucket
        client.remove_object(bucket, object_name)
        log.info("file_deleted", object_name=object_name)
        return True
    def file_exists(
        self,
        object_name: str,
        bucket: Optional[str] = None
    ) -> bool:
        """
        Check if a file exists in MinIO.

        Args:
            object_name: Path in MinIO bucket
            bucket: Optional bucket override

        Returns:
            True if file exists
        """
        client = self._get_client()
        bucket = bucket or self.bucket
        try:
            client.stat_object(bucket, object_name)
            return True
        except Exception:
            # Deliberately broad: any stat failure (missing key, access
            # error) is treated as "not found" for this best-effort check.
            return False
    def get_presigned_url(
        self,
        object_name: str,
        expires_hours: int = 24,
        bucket: Optional[str] = None
    ) -> str:
        """
        Get a presigned URL for temporary file access.

        Args:
            object_name: Path in MinIO bucket
            expires_hours: URL validity in hours
            bucket: Optional bucket override

        Returns:
            Presigned URL
        """
        from datetime import timedelta
        client = self._get_client()
        bucket = bucket or self.bucket
        return client.presigned_get_object(
            bucket,
            object_name,
            expires=timedelta(hours=expires_hours)
        )
    def list_files(
        self,
        prefix: str = "",
        bucket: Optional[str] = None
    ) -> list:
        """
        List files with a given prefix.

        Args:
            prefix: Path prefix to filter
            bucket: Optional bucket override

        Returns:
            List of object names (recursive)
        """
        client = self._get_client()
        bucket = bucket or self.bucket
        objects = client.list_objects(bucket, prefix=prefix, recursive=True)
        return [obj.object_name for obj in objects]

View File

@@ -0,0 +1,230 @@
"""
BreakPilot Transcription Tasks
RQ task definitions for transcription processing.
"""
import os
import time
import tempfile
import structlog
from typing import Optional
from datetime import datetime
from .transcriber import WhisperTranscriber
from .diarizer import SpeakerDiarizer
from .aligner import TranscriptAligner
from .storage import MinIOStorage
from .export import export_to_vtt, export_to_srt, export_to_json
log = structlog.get_logger(__name__)
# Configuration
# Whisper settings — passed straight into WhisperTranscriber below.
WHISPER_MODEL = os.getenv("WHISPER_MODEL", "large-v3")
WHISPER_DEVICE = os.getenv("WHISPER_DEVICE", "cpu")
WHISPER_COMPUTE_TYPE = os.getenv("WHISPER_COMPUTE_TYPE", "int8")
# HuggingFace token for pyannote; diarization is skipped when unset
# (see transcribe_recording).
PYANNOTE_AUTH_TOKEN = os.getenv("PYANNOTE_AUTH_TOKEN")
# Scratch directory for downloaded audio files.
TEMP_DIR = os.getenv("TEMP_DIR", "/tmp/transcriptions")
# MinIO Configuration
# NOTE(review): the hardcoded fallbacks are development credentials —
# production must set the MINIO_* environment variables.
MINIO_ENDPOINT = os.getenv("MINIO_ENDPOINT", "minio:9000")
MINIO_ACCESS_KEY = os.getenv("MINIO_ACCESS_KEY", "breakpilot")
MINIO_SECRET_KEY = os.getenv("MINIO_SECRET_KEY", "breakpilot123")
MINIO_BUCKET = os.getenv("MINIO_BUCKET", "breakpilot-recordings")
MINIO_SECURE = os.getenv("MINIO_SECURE", "false").lower() == "true"
# Database URL for status updates
# NOTE(review): not referenced anywhere in this module as shown —
# update_transcription_status is still a stub (see its TODO).
DATABASE_URL = os.getenv("DATABASE_URL")
def update_transcription_status(
    transcription_id: str,
    status: str,
    error_message: Optional[str] = None,
    **kwargs
):
    """
    Record a transcription status change.

    Currently a stub: it only emits a structured log event; the actual
    database write is not implemented yet (see TODO below).

    Args:
        transcription_id: UUID of the transcription record
        status: New status value ("processing", "completed", "failed")
        error_message: Optional error description for failed jobs
        **kwargs: Additional fields to record (paths, timestamps, counts)
    """
    # TODO: Implement database update
    log.info(
        "status_update",
        transcription_id=transcription_id,
        status=status,
        error=error_message,
        **kwargs
    )
def transcribe_recording(
    transcription_id: str,
    recording_id: str,
    audio_path: str,
    language: str = "de",
    enable_diarization: bool = True
) -> dict:
    """
    Main transcription task.

    Downloads audio from MinIO, transcribes with Whisper, optionally
    performs speaker diarization, exports VTT/SRT/JSON artifacts back
    to MinIO, and reports progress via update_transcription_status.

    Args:
        transcription_id: UUID of the transcription record
        recording_id: UUID of the source recording
        audio_path: Path to audio file in MinIO bucket
        language: Language code (de, en, etc.)
        enable_diarization: Whether to perform speaker diarization
            (also requires PYANNOTE_AUTH_TOKEN to be set)

    Returns:
        dict with transcription results and artifact paths

    Raises:
        Exception: re-raises any processing error after marking the
            transcription as failed (so RQ records the job failure).
    """
    start_time = time.time()
    log.info(
        "transcription_started",
        transcription_id=transcription_id,
        recording_id=recording_id,
        audio_path=audio_path,
        language=language
    )
    update_transcription_status(
        transcription_id,
        status="processing",
        # NOTE(review): datetime.utcnow() is deprecated since Python 3.12;
        # switch to datetime.now(timezone.utc) once consumers of this
        # timestamp tolerate an explicit offset.
        processing_started_at=datetime.utcnow().isoformat()
    )
    # Local scratch path for the downloaded audio; removed in `finally`.
    local_audio_path = os.path.join(TEMP_DIR, f"{transcription_id}_audio.wav")
    try:
        storage = MinIOStorage(
            endpoint=MINIO_ENDPOINT,
            access_key=MINIO_ACCESS_KEY,
            secret_key=MINIO_SECRET_KEY,
            bucket=MINIO_BUCKET,
            secure=MINIO_SECURE
        )
        os.makedirs(TEMP_DIR, exist_ok=True)
        # Fetch the recording to local disk for the ML pipelines.
        storage.download_file(audio_path, local_audio_path)
        log.info("audio_downloaded", path=local_audio_path)
        transcriber = WhisperTranscriber(
            model_name=WHISPER_MODEL,
            device=WHISPER_DEVICE,
            compute_type=WHISPER_COMPUTE_TYPE
        )
        log.info("transcription_starting", model=WHISPER_MODEL)
        segments = transcriber.transcribe(
            audio_path=local_audio_path,
            language=language
        )
        log.info("transcription_complete", segments_count=len(segments))
        # Speaker diarization — only when enabled AND a token is configured.
        if enable_diarization and PYANNOTE_AUTH_TOKEN:
            log.info("diarization_starting")
            diarizer = SpeakerDiarizer(auth_token=PYANNOTE_AUTH_TOKEN)
            speaker_segments = diarizer.diarize(local_audio_path)
            aligner = TranscriptAligner()
            segments = aligner.align(segments, speaker_segments)
            log.info("diarization_complete", speakers=aligner.get_speaker_count())
        else:
            log.info("diarization_skipped", reason="disabled or no token")
        # Aggregate statistics. Segments may carry confidence=None (the
        # key exists), so dict.get's default doesn't apply — coalesce
        # explicitly to avoid summing None.
        full_text = " ".join(s["text"] for s in segments)
        word_count = len(full_text.split())
        avg_confidence = (
            sum(s.get("confidence") or 0 for s in segments) / len(segments)
            if segments else 0
        )
        # Artifacts live next to the source audio: recordings/{recording_name}/
        base_path = audio_path.rsplit("/", 1)[0]
        # WebVTT (HTML5 captions)
        vtt_content = export_to_vtt(segments)
        vtt_path = f"{base_path}/transcript.vtt"
        storage.upload_content(vtt_content, vtt_path, content_type="text/vtt")
        # SRT (universal subtitles)
        srt_content = export_to_srt(segments)
        srt_path = f"{base_path}/transcript.srt"
        storage.upload_content(srt_content, srt_path, content_type="text/plain")
        # JSON (full data with speakers)
        json_content = export_to_json(segments, {
            "transcription_id": transcription_id,
            "recording_id": recording_id,
            "language": language,
            "model": WHISPER_MODEL,
            "word_count": word_count,
            "confidence": avg_confidence
        })
        json_path = f"{base_path}/transcript.json"
        storage.upload_content(json_content, json_path, content_type="application/json")
        processing_duration = int(time.time() - start_time)
        result = {
            "transcription_id": transcription_id,
            "recording_id": recording_id,
            "status": "completed",
            "full_text": full_text,
            "word_count": word_count,
            "confidence_score": round(avg_confidence, 3),
            "segments_count": len(segments),
            "vtt_path": vtt_path,
            "srt_path": srt_path,
            "json_path": json_path,
            "processing_duration_seconds": processing_duration
        }
        update_transcription_status(
            transcription_id,
            status="completed",
            full_text=full_text,
            word_count=word_count,
            confidence_score=avg_confidence,
            vtt_path=vtt_path,
            srt_path=srt_path,
            json_path=json_path,
            processing_duration_seconds=processing_duration,
            processing_completed_at=datetime.utcnow().isoformat()
        )
        log.info(
            "transcription_completed",
            transcription_id=transcription_id,
            word_count=word_count,
            duration_seconds=processing_duration
        )
        return result
    except Exception as e:
        log.error(
            "transcription_failed",
            transcription_id=transcription_id,
            error=str(e)
        )
        update_transcription_status(
            transcription_id,
            status="failed",
            error_message=str(e)
        )
        raise
    finally:
        # Always remove the downloaded audio: the previous version cleaned
        # up only on the success path, leaking a temp file per failed job.
        if os.path.exists(local_audio_path):
            os.remove(local_audio_path)

View File

@@ -0,0 +1,211 @@
"""
BreakPilot Whisper Transcriber
Uses faster-whisper (MIT License) for GPU-optimized transcription.
Based on CTranslate2 for fast inference.
"""
import os
import structlog
from typing import List, Dict, Optional
log = structlog.get_logger(__name__)
class WhisperTranscriber:
    """
    Whisper-based audio transcription using faster-whisper.

    faster-whisper is a reimplementation of OpenAI Whisper using CTranslate2,
    which is significantly faster than the original implementation.

    License: MIT
    Source: https://github.com/SYSTRAN/faster-whisper
    """
    def __init__(
        self,
        model_name: str = "large-v3",
        device: str = "cpu",
        compute_type: str = "int8"
    ):
        """
        Initialize the transcriber (the model itself loads lazily).

        Args:
            model_name: Whisper model to use (tiny, base, small, medium, large-v3)
            device: Device to run on ("cpu", "cuda", "auto")
            compute_type: Quantization type ("int8", "float16", "float32")
        """
        self.model_name = model_name
        self.device = device
        self.compute_type = compute_type
        self._model = None  # loaded on first transcribe()/detect_language()
    def _load_model(self):
        """Lazy load the model on first use."""
        if self._model is not None:
            return
        try:
            from faster_whisper import WhisperModel
            log.info(
                "loading_whisper_model",
                model=self.model_name,
                device=self.device,
                compute_type=self.compute_type
            )
            self._model = WhisperModel(
                self.model_name,
                device=self.device,
                compute_type=self.compute_type
            )
            log.info("whisper_model_loaded")
        except ImportError:
            log.error("faster_whisper_not_installed")
            raise ImportError(
                "faster-whisper is not installed. "
                "Install with: pip install faster-whisper"
            )
    def transcribe(
        self,
        audio_path: str,
        language: str = "de",
        beam_size: int = 5,
        word_timestamps: bool = True,
        vad_filter: bool = True,
        vad_parameters: Optional[dict] = None
    ) -> List[Dict]:
        """
        Transcribe an audio file.

        Args:
            audio_path: Path to audio file (WAV, MP3, etc.)
            language: Language code (de, en, fr, etc.) or None for auto-detection
            beam_size: Beam size for decoding (higher = better but slower)
            word_timestamps: Include word-level timestamps
            vad_filter: Enable Voice Activity Detection to filter silence
            vad_parameters: Custom VAD parameters

        Returns:
            List of segments with text, millisecond timestamps, and
            confidence scores (avg log-probabilities, typically negative)

        Raises:
            FileNotFoundError: if audio_path does not exist
            ImportError: if faster-whisper is not installed
        """
        self._load_model()
        if not os.path.exists(audio_path):
            raise FileNotFoundError(f"Audio file not found: {audio_path}")
        log.info(
            "transcribing_audio",
            audio_path=audio_path,
            language=language,
            beam_size=beam_size
        )
        # Default VAD parameters for better speech detection
        if vad_parameters is None:
            vad_parameters = {
                "min_silence_duration_ms": 500,
                "speech_pad_ms": 400
            }
        # Run transcription (returns a lazy generator plus eager info).
        segments_gen, info = self._model.transcribe(
            audio_path,
            language=language,
            beam_size=beam_size,
            word_timestamps=word_timestamps,
            vad_filter=vad_filter,
            vad_parameters=vad_parameters
        )
        log.info(
            "transcription_info",
            detected_language=info.language,
            language_probability=info.language_probability,
            duration=info.duration
        )
        # Materialize the generator into plain dicts.
        segments = []
        for i, segment in enumerate(segments_gen):
            seg_dict = {
                "index": i,
                "start_time_ms": int(segment.start * 1000),
                "end_time_ms": int(segment.end * 1000),
                "text": segment.text.strip(),
                # Explicit None check: the old truthiness test wrongly
                # discarded a legitimate avg_logprob of exactly 0.0.
                "confidence": round(segment.avg_logprob, 3) if segment.avg_logprob is not None else None,
                "no_speech_prob": segment.no_speech_prob
            }
            # Add word-level timestamps if available
            if word_timestamps and segment.words:
                seg_dict["words"] = [
                    {
                        "word": word.word,
                        "start": int(word.start * 1000),
                        "end": int(word.end * 1000),
                        "probability": round(word.probability, 3)
                    }
                    for word in segment.words
                ]
            segments.append(seg_dict)
        log.info(
            "transcription_complete",
            segments_count=len(segments),
            duration_seconds=info.duration
        )
        return segments
    def detect_language(self, audio_path: str) -> dict:
        """
        Detect the language of an audio file.

        Args:
            audio_path: Path to audio file

        Returns:
            dict with language code and probability

        Raises:
            FileNotFoundError: if audio_path does not exist
        """
        self._load_model()
        if not os.path.exists(audio_path):
            raise FileNotFoundError(f"Audio file not found: {audio_path}")
        # Language info is computed eagerly from the audio prefix; the
        # segment generator is intentionally never consumed, so no full
        # transcription work is performed here.
        _, info = self._model.transcribe(
            audio_path,
            language=None,  # Auto-detect
            beam_size=1,
            without_timestamps=True
        )
        return {
            "language": info.language,
            "probability": info.language_probability
        }
    @property
    def available_languages(self) -> List[str]:
        """List of supported languages."""
        return [
            "de", "en", "fr", "es", "it", "pt", "nl", "pl", "ru",
            "zh", "ja", "ko", "ar", "tr", "hi", "vi", "th", "id"
        ]
    def get_model_info(self) -> dict:
        """Get configuration and load state of the model."""
        return {
            "model_name": self.model_name,
            "device": self.device,
            "compute_type": self.compute_type,
            "loaded": self._model is not None
        }

View File

@@ -0,0 +1,129 @@
"""
BreakPilot Transcription Worker - Main Entry Point
Runs as an RQ worker, processing transcription jobs from the queue.
"""
import os
import sys
import signal
import structlog
from redis import Redis
from rq import Worker, Queue, Connection
# Configure structlog for JSON output: every entry carries the logger
# name, level, ISO timestamp, and rendered exception/stack info, so
# worker logs stay machine-parseable for log aggregation.
structlog.configure(
    processors=[
        structlog.stdlib.filter_by_level,
        structlog.stdlib.add_logger_name,
        structlog.stdlib.add_log_level,
        structlog.stdlib.PositionalArgumentsFormatter(),
        structlog.processors.TimeStamper(fmt="iso"),
        structlog.processors.StackInfoRenderer(),
        structlog.processors.format_exc_info,
        structlog.processors.UnicodeDecoder(),
        structlog.processors.JSONRenderer()
    ],
    wrapper_class=structlog.stdlib.BoundLogger,
    context_class=dict,
    logger_factory=structlog.stdlib.LoggerFactory(),
    cache_logger_on_first_use=True,  # build the bound logger once, not per call
)
log = structlog.get_logger(__name__)
# Worker configuration, overridable via environment variables.
REDIS_URL = os.getenv("REDIS_URL", "redis://localhost:6379/1")
QUEUE_NAME = os.getenv("QUEUE_NAME", "transcription")
# PID suffix keeps worker names unique when several run on one host.
WORKER_NAME = os.getenv("WORKER_NAME", f"transcription-worker-{os.getpid()}")
def setup_signal_handlers(worker: Worker):
    """Install SIGINT/SIGTERM handlers that stop the worker gracefully."""
    def _on_signal(signum, frame):
        log.info("shutdown_signal_received", signal=signum)
        worker.request_stop(signum, frame)

    # Interactive interrupts and orchestrator terminations both take the
    # same graceful-stop path.
    for sig in (signal.SIGINT, signal.SIGTERM):
        signal.signal(sig, _on_signal)
def preload_models():
    """
    Warm up the ML models so the first queued job does not pay the
    model-download/load latency.

    Configuration is read from the environment: WHISPER_MODEL,
    WHISPER_DEVICE and WHISPER_COMPUTE_TYPE for the transcriber;
    PYANNOTE_AUTH_TOKEN for speaker diarization (skipped when absent).

    Failures are logged but never raised: the worker still starts and
    models are loaded lazily on the first job instead.
    """
    log.info("preloading_models")
    try:
        from .transcriber import WhisperTranscriber
        from .diarizer import SpeakerDiarizer

        whisper_model = os.getenv("WHISPER_MODEL", "large-v3")
        device = os.getenv("WHISPER_DEVICE", "cpu")
        compute_type = os.getenv("WHISPER_COMPUTE_TYPE", "int8")
        transcriber = WhisperTranscriber(
            model_name=whisper_model,
            device=device,
            compute_type=compute_type
        )
        # NOTE(review): WhisperTranscriber loads its model lazily
        # (_load_model is invoked on first transcribe), so construction
        # alone does not warm it up — force the load here. Confirm
        # __init__ doesn't already load, in which case this is a no-op.
        transcriber._load_model()
        log.info("whisper_model_loaded", model=whisper_model, device=device)

        # Diarization is optional: without a HuggingFace token the
        # pyannote pipeline cannot be fetched.
        pyannote_token = os.getenv("PYANNOTE_AUTH_TOKEN")
        if pyannote_token:
            # Constructed purely for the side effect of fetching the
            # pipeline; the instance is discarded.
            SpeakerDiarizer(auth_token=pyannote_token)
            log.info("pyannote_model_loaded")
        else:
            log.warning("pyannote_token_missing", message="Speaker diarization disabled")
    except Exception as e:
        # Deliberate best-effort: worker startup must not fail because
        # of a model download problem.
        log.error("model_preload_failed", error=str(e))
def main():
    """
    Entry point: connect to Redis, warm up models, and run the RQ worker.

    Exits with status 1 if Redis is unreachable; otherwise blocks in
    worker.work() until a shutdown signal arrives.
    """
    log.info(
        "worker_starting",
        redis_url=REDIS_URL,
        queue=QUEUE_NAME,
        worker_name=WORKER_NAME
    )

    redis_conn = Redis.from_url(REDIS_URL)

    # Fail fast if the broker is unreachable rather than looping on
    # fetch errors once jobs start arriving.
    try:
        redis_conn.ping()
        log.info("redis_connected")
    except Exception as e:
        log.error("redis_connection_failed", error=str(e))
        sys.exit(1)

    # Best effort; the worker still starts if preloading fails and
    # models are then loaded lazily on the first job.
    preload_models()

    queue = Queue(QUEUE_NAME, connection=redis_conn)
    worker = Worker(
        queues=[queue],
        connection=redis_conn,
        name=WORKER_NAME
    )
    setup_signal_handlers(worker)

    log.info("worker_ready", queues=[QUEUE_NAME])

    # The worker already carries its connection explicitly, so the
    # legacy `with Connection(...)` context manager (deprecated in RQ
    # 1.x, removed in RQ 2.0) is unnecessary; call work() directly.
    # with_scheduler=True also processes scheduled/delayed jobs.
    worker.work(with_scheduler=True)


if __name__ == "__main__":
    main()