fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.
This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).
Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
24
backend/transcription_worker/__init__.py
Normal file
24
backend/transcription_worker/__init__.py
Normal file
@@ -0,0 +1,24 @@
|
||||
"""
BreakPilot Transcription Worker

Asynchronous processing of meeting recordings using:
- faster-whisper for transcription (MIT License)
- pyannote.audio for speaker diarization (MIT License)

All components are open source and commercially usable.
"""

__version__ = "1.0.0"
__author__ = "BreakPilot Team"

# Re-export the pipeline building blocks so callers can import them
# directly from the package root, e.g.
# `from transcription_worker import WhisperTranscriber`.
from .transcriber import WhisperTranscriber
from .diarizer import SpeakerDiarizer
from .aligner import TranscriptAligner
from .storage import MinIOStorage

# Explicit public API of this package.
__all__ = [
    "WhisperTranscriber",
    "SpeakerDiarizer",
    "TranscriptAligner",
    "MinIOStorage"
]
|
||||
202
backend/transcription_worker/aligner.py
Normal file
202
backend/transcription_worker/aligner.py
Normal file
@@ -0,0 +1,202 @@
|
||||
"""
|
||||
BreakPilot Transcript Aligner
|
||||
|
||||
Aligns Whisper transcription segments with pyannote speaker diarization.
|
||||
Assigns speaker IDs to each transcribed segment.
|
||||
"""
|
||||
|
||||
import structlog
|
||||
from typing import List, Dict, Optional
|
||||
from collections import defaultdict
|
||||
|
||||
log = structlog.get_logger(__name__)
|
||||
|
||||
|
||||
class TranscriptAligner:
    """
    Aligns transcription segments with speaker diarization results.

    Uses overlap-based matching to assign a speaker ID to each
    transcribed segment; the speaker with the largest absolute overlap
    wins, which also handles speakers changing mid-sentence.

    Segments are plain dicts carrying at least ``start_time_ms`` and
    ``end_time_ms``; diarization segments additionally carry
    ``speaker_id``.
    """

    def __init__(self):
        """Initialize the aligner with empty speaker bookkeeping."""
        # Number of unique speakers seen in the most recent align() call.
        self._speaker_count = 0
        # Maps raw pyannote speaker IDs to friendly "SPEAKER_NN" names.
        self._speaker_map = {}

    def align(
        self,
        transcription_segments: List[Dict],
        diarization_segments: List[Dict],
        min_overlap_ratio: float = 0.3
    ) -> List[Dict]:
        """
        Align transcription with speaker diarization.

        Args:
            transcription_segments: List of segments from Whisper
            diarization_segments: List of segments from pyannote
            min_overlap_ratio: Minimum overlap ratio to assign speaker

        Returns:
            Copies of the transcription segments with ``speaker_id``
            added (``None`` when no speaker matched). When there are no
            diarization segments, the input list is returned unchanged.
        """
        if not diarization_segments:
            log.warning("no_diarization_segments", message="Returning transcription without speakers")
            return transcription_segments

        log.info(
            "aligning_transcription",
            transcription_count=len(transcription_segments),
            diarization_count=len(diarization_segments)
        )

        # Reset per-call state so repeated align() calls on the same
        # instance do not leak stale speaker mappings from earlier runs.
        self._speaker_map = {}

        # Build speaker mapping; sorting makes the numbering deterministic.
        unique_speakers = set(s["speaker_id"] for s in diarization_segments)
        self._speaker_count = len(unique_speakers)

        for i, speaker in enumerate(sorted(unique_speakers)):
            self._speaker_map[speaker] = f"SPEAKER_{i:02d}"

        # Align each transcription segment (inputs are not mutated).
        aligned_segments = []
        for trans_seg in transcription_segments:
            speaker_id = self._find_speaker_for_segment(
                trans_seg,
                diarization_segments,
                min_overlap_ratio
            )

            aligned_seg = trans_seg.copy()
            aligned_seg["speaker_id"] = speaker_id

            aligned_segments.append(aligned_seg)

        # Log per-speaker segment counts for observability.
        speaker_counts = defaultdict(int)
        for seg in aligned_segments:
            speaker_counts[seg.get("speaker_id", "UNKNOWN")] += 1

        log.info(
            "alignment_complete",
            speakers=dict(speaker_counts),
            total_speakers=self._speaker_count
        )

        return aligned_segments

    def _find_speaker_for_segment(
        self,
        trans_seg: Dict,
        diarization_segments: List[Dict],
        min_overlap_ratio: float
    ) -> Optional[str]:
        """
        Find the best matching speaker for a transcription segment.

        Picks the speaker whose diarization turns overlap the segment for
        the longest absolute duration; returns ``None`` when no turn
        covers at least ``min_overlap_ratio`` of the segment, or when the
        segment has zero/negative duration.
        """
        trans_start = trans_seg["start_time_ms"]
        trans_end = trans_seg["end_time_ms"]
        trans_duration = trans_end - trans_start

        # Degenerate segments cannot be meaningfully matched (and would
        # divide by zero below).
        if trans_duration <= 0:
            return None

        # Collect every diarization turn that overlaps this segment.
        overlaps = []
        for diar_seg in diarization_segments:
            diar_start = diar_seg["start_time_ms"]
            diar_end = diar_seg["end_time_ms"]

            # Intersection of the two [start, end) intervals.
            overlap_start = max(trans_start, diar_start)
            overlap_end = min(trans_end, diar_end)
            overlap_duration = max(0, overlap_end - overlap_start)

            if overlap_duration > 0:
                overlaps.append({
                    "speaker_id": diar_seg["speaker_id"],
                    "overlap_duration": overlap_duration,
                    "overlap_ratio": overlap_duration / trans_duration
                })

        if not overlaps:
            return None

        # Longest absolute overlap wins; the ratio threshold then filters
        # out segments that lie mostly outside any speaker turn.
        best_match = max(overlaps, key=lambda x: x["overlap_duration"])

        if best_match["overlap_ratio"] >= min_overlap_ratio:
            original_id = best_match["speaker_id"]
            # Fall back to the raw ID if no friendly mapping exists
            # (e.g. this method was called before align() built the map).
            return self._speaker_map.get(original_id, original_id)

        return None

    def get_speaker_count(self) -> int:
        """Get the number of unique speakers detected in the last align()."""
        return self._speaker_count

    def get_speaker_mapping(self) -> Dict[str, str]:
        """Get a copy of the mapping from pyannote IDs to friendly names."""
        return self._speaker_map.copy()

    def merge_consecutive_segments(
        self,
        segments: List[Dict],
        max_gap_ms: int = 1000,
        same_speaker_only: bool = True
    ) -> List[Dict]:
        """
        Merge consecutive segments that are close together.

        Useful for creating cleaner subtitle output.

        Args:
            segments: List of aligned segments
            max_gap_ms: Maximum gap between segments to merge
            same_speaker_only: Only merge if same speaker

        Returns:
            List of merged segments. The input segments (including their
            nested ``words`` lists) are never mutated.
        """
        if not segments:
            return []

        merged = []
        current = segments[0].copy()
        # dict.copy() is shallow: detach the nested word list as well so
        # that extending it below cannot mutate the caller's segment.
        if "words" in current:
            current["words"] = list(current["words"])

        for next_seg in segments[1:]:
            gap = next_seg["start_time_ms"] - current["end_time_ms"]
            same_speaker = (
                not same_speaker_only or
                current.get("speaker_id") == next_seg.get("speaker_id")
            )

            if gap <= max_gap_ms and same_speaker:
                # Merge into the running segment.
                current["end_time_ms"] = next_seg["end_time_ms"]
                current["text"] = current["text"] + " " + next_seg["text"]

                # Merge word timestamps if both sides carry them.
                if "words" in current and "words" in next_seg:
                    current["words"].extend(next_seg["words"])
            else:
                # Close the running segment and start a new one.
                merged.append(current)
                current = next_seg.copy()
                if "words" in current:
                    current["words"] = list(current["words"])

        # Don't forget the last segment
        merged.append(current)

        log.info(
            "segments_merged",
            original_count=len(segments),
            merged_count=len(merged)
        )

        return merged
|
||||
197
backend/transcription_worker/diarizer.py
Normal file
197
backend/transcription_worker/diarizer.py
Normal file
@@ -0,0 +1,197 @@
|
||||
"""
|
||||
BreakPilot Speaker Diarizer
|
||||
|
||||
Uses pyannote.audio (MIT License) for speaker diarization.
|
||||
Identifies who spoke when in an audio recording.
|
||||
"""
|
||||
|
||||
import os
|
||||
import structlog
|
||||
from typing import List, Dict, Optional
|
||||
|
||||
log = structlog.get_logger(__name__)
|
||||
|
||||
|
||||
class SpeakerDiarizer:
    """
    Speaker diarization using pyannote.audio.

    Identifies distinct speakers in an audio recording and provides
    timestamp information for when each speaker is talking.

    License: MIT
    Source: https://github.com/pyannote/pyannote-audio

    Note: Requires a HuggingFace token with access to pyannote models.
    Accept the conditions at: https://huggingface.co/pyannote/speaker-diarization
    """

    def __init__(
        self,
        auth_token: Optional[str] = None,
        device: str = "auto"
    ):
        """
        Initialize the diarizer.

        Args:
            auth_token: HuggingFace token with pyannote access
            device: Device to run on ("cpu", "cuda", "auto")
        """
        # Fall back to the environment when no token is passed explicitly.
        self.auth_token = auth_token or os.getenv("PYANNOTE_AUTH_TOKEN")
        self.device = device
        # The heavy pyannote pipeline is loaded lazily on first use.
        self._pipeline = None

        if not self.auth_token:
            # Non-fatal here: _load_pipeline() raises if diarization is
            # actually attempted without a token.
            log.warning(
                "pyannote_token_missing",
                message="Speaker diarization requires a HuggingFace token"
            )

    def _load_pipeline(self):
        """Lazy load the diarization pipeline.

        Raises:
            ValueError: If no HuggingFace token is configured.
            ImportError: If pyannote.audio is not installed.
        """
        # Idempotent: a second call is a no-op once the pipeline exists.
        if self._pipeline is not None:
            return

        if not self.auth_token:
            raise ValueError(
                "HuggingFace token required for pyannote.audio. "
                "Set PYANNOTE_AUTH_TOKEN environment variable."
            )

        try:
            # Imported lazily so the module can be imported without the
            # (large) pyannote/torch dependencies installed.
            from pyannote.audio import Pipeline
            import torch

            log.info("loading_pyannote_pipeline", device=self.device)

            # Load pre-trained speaker diarization pipeline
            self._pipeline = Pipeline.from_pretrained(
                "pyannote/speaker-diarization-3.1",
                use_auth_token=self.auth_token
            )

            # Move to appropriate device; "auto" prefers CUDA if present.
            if self.device == "auto":
                device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            else:
                device = torch.device(self.device)

            self._pipeline.to(device)

            log.info("pyannote_pipeline_loaded", device=str(device))

        except ImportError:
            log.error("pyannote_not_installed")
            raise ImportError(
                "pyannote.audio is not installed. "
                "Install with: pip install pyannote.audio"
            )

    def diarize(
        self,
        audio_path: str,
        num_speakers: Optional[int] = None,
        min_speakers: Optional[int] = None,
        max_speakers: Optional[int] = None
    ) -> List[Dict]:
        """
        Perform speaker diarization on an audio file.

        Args:
            audio_path: Path to audio file (WAV recommended)
            num_speakers: Exact number of speakers (if known)
            min_speakers: Minimum number of speakers
            max_speakers: Maximum number of speakers

        Returns:
            List of speaker segments with speaker ID and timestamps
            (``speaker_id``, ``start_time_ms``, ``end_time_ms``,
            ``duration_ms``).

        Raises:
            FileNotFoundError: If ``audio_path`` does not exist.
            ValueError / ImportError: Propagated from _load_pipeline().
        """
        self._load_pipeline()

        if not os.path.exists(audio_path):
            raise FileNotFoundError(f"Audio file not found: {audio_path}")

        log.info(
            "starting_diarization",
            audio_path=audio_path,
            num_speakers=num_speakers
        )

        # Run diarization
        diarization = self._pipeline(
            audio_path,
            num_speakers=num_speakers,
            min_speakers=min_speakers,
            max_speakers=max_speakers
        )

        # Convert pyannote's annotation object to a list of plain dicts;
        # turn.start / turn.end are seconds, converted here to int ms.
        segments = []
        for turn, _, speaker in diarization.itertracks(yield_label=True):
            segments.append({
                "speaker_id": speaker,
                "start_time_ms": int(turn.start * 1000),
                "end_time_ms": int(turn.end * 1000),
                "duration_ms": int((turn.end - turn.start) * 1000)
            })

        # Get unique speakers
        unique_speakers = set(s["speaker_id"] for s in segments)

        log.info(
            "diarization_complete",
            segments_count=len(segments),
            speakers_count=len(unique_speakers),
            speakers=list(unique_speakers)
        )

        return segments

    def get_speaker_stats(self, segments: List[Dict]) -> Dict:
        """
        Calculate speaking statistics per speaker.

        Args:
            segments: List of speaker segments from diarize()

        Returns:
            dict with speaking time and percentage per speaker
        """
        # Accumulate total speaking time (ms) per speaker.
        speaker_times = {}

        for seg in segments:
            speaker = seg["speaker_id"]
            duration = seg["duration_ms"]

            if speaker not in speaker_times:
                speaker_times[speaker] = 0
            speaker_times[speaker] += duration

        total_time = sum(speaker_times.values())

        stats = {}
        for speaker, time_ms in speaker_times.items():
            stats[speaker] = {
                "total_time_ms": time_ms,
                "total_time_seconds": round(time_ms / 1000, 1),
                # Guard against division by zero for empty input.
                "percentage": round((time_ms / total_time) * 100, 1) if total_time > 0 else 0
            }

        return {
            "speakers": stats,
            "total_speakers": len(stats),
            "total_duration_ms": total_time
        }

    def is_available(self) -> bool:
        """Check if diarization is available (token configured)."""
        return bool(self.auth_token)

    def get_pipeline_info(self) -> dict:
        """Get information about the pipeline (availability, device, load state)."""
        return {
            "available": self.is_available(),
            "device": self.device,
            "loaded": self._pipeline is not None
        }
|
||||
291
backend/transcription_worker/export.py
Normal file
291
backend/transcription_worker/export.py
Normal file
@@ -0,0 +1,291 @@
|
||||
"""
|
||||
BreakPilot Transcript Export
|
||||
|
||||
Functions to export transcription segments to various formats:
|
||||
- WebVTT (for HTML5 video captions)
|
||||
- SRT (universal subtitle format)
|
||||
- JSON (full data with speakers and timestamps)
|
||||
"""
|
||||
|
||||
import json
|
||||
from typing import List, Dict, Any
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
def ms_to_vtt_timestamp(ms: int) -> str:
    """
    Convert milliseconds to WebVTT timestamp format.

    Args:
        ms: Milliseconds (non-negative)

    Returns:
        Timestamp string (HH:MM:SS.mmm)
    """
    # Peel off each unit with divmod instead of repeated modulo math.
    total_seconds, millis = divmod(ms, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)

    # WebVTT uses a dot before the millisecond field.
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{millis:03d}"
|
||||
|
||||
|
||||
def ms_to_srt_timestamp(ms: int) -> str:
    """
    Convert milliseconds to SRT timestamp format.

    Args:
        ms: Milliseconds (non-negative)

    Returns:
        Timestamp string (HH:MM:SS,mmm)
    """
    # Peel off each unit with divmod instead of repeated modulo math.
    total_seconds, millis = divmod(ms, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)

    # SRT uses comma as decimal separator
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{millis:03d}"
|
||||
|
||||
|
||||
def export_to_vtt(
    segments: List[Dict],
    include_speakers: bool = True,
    header: str = "WEBVTT\nKind: captions\nLanguage: de\n"
) -> str:
    """
    Export segments to WebVTT format.

    Args:
        segments: List of transcription segments
        include_speakers: Include speaker labels
        header: VTT header text

    Returns:
        WebVTT formatted string
    """
    parts = [header]

    for cue_number, seg in enumerate(segments, start=1):
        # Cue identifier (preceded by a blank line separating cues).
        parts.append(f"\n{cue_number}")

        # Timestamp range for this cue.
        parts.append(
            f"{ms_to_vtt_timestamp(seg['start_time_ms'])} --> "
            f"{ms_to_vtt_timestamp(seg['end_time_ms'])}"
        )

        # Cue payload, optionally tagged with the WebVTT voice span.
        cue_text = seg["text"]
        if include_speakers and seg.get("speaker_id"):
            cue_text = f"<v {seg['speaker_id']}>{cue_text}"
        parts.append(cue_text)

    return "\n".join(parts) + "\n"
|
||||
|
||||
|
||||
def export_to_srt(
    segments: List[Dict],
    include_speakers: bool = True
) -> str:
    """
    Export segments to SRT format.

    Args:
        segments: List of transcription segments
        include_speakers: Include speaker labels in text

    Returns:
        SRT formatted string
    """
    entries = []

    for seq, seg in enumerate(segments, start=1):
        # Sequence number line.
        entries.append(str(seq))

        # Timestamp range line.
        entries.append(
            f"{ms_to_srt_timestamp(seg['start_time_ms'])} --> "
            f"{ms_to_srt_timestamp(seg['end_time_ms'])}"
        )

        # Subtitle text, optionally prefixed with the speaker label.
        subtitle = seg["text"]
        if include_speakers and seg.get("speaker_id"):
            subtitle = f"[{seg['speaker_id']}] {subtitle}"
        entries.append(subtitle)

        # Blank separator line between entries.
        entries.append("")

    return "\n".join(entries)
|
||||
|
||||
|
||||
def export_to_json(
    segments: List[Dict],
    metadata: "Dict[str, Any] | None" = None
) -> str:
    """
    Export segments to JSON format with full metadata.

    Args:
        segments: List of transcription segments
        metadata: Additional metadata to include (optional)

    Returns:
        JSON formatted string with ``segments`` (normalized key names)
        and a ``statistics`` summary.
    """
    # Local import keeps the module-level import block untouched.
    from datetime import timezone

    # datetime.utcnow() is deprecated (Python 3.12+); an aware UTC
    # timestamp with "+00:00" rewritten to "Z" yields the same string.
    generated_at = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")

    # Prepare export data
    export_data = {
        "version": "1.0",
        "format": "breakpilot-transcript",
        "generated_at": generated_at,
        "metadata": metadata or {},
        "segments": []
    }

    # Add segments (normalized key names: *_ms instead of *_time_ms).
    for seg in segments:
        export_seg = {
            "index": seg.get("index", 0),
            "start_ms": seg["start_time_ms"],
            "end_ms": seg["end_time_ms"],
            "duration_ms": seg["end_time_ms"] - seg["start_time_ms"],
            "text": seg["text"],
            "speaker_id": seg.get("speaker_id"),
            "confidence": seg.get("confidence")
        }

        # Include word-level timestamps if available
        if "words" in seg:
            export_seg["words"] = seg["words"]

        export_data["segments"].append(export_seg)

    # Calculate statistics over the exported (normalized) segments.
    total_duration_ms = sum(s["duration_ms"] for s in export_data["segments"])
    total_words = sum(len(s["text"].split()) for s in export_data["segments"])
    unique_speakers = set(s["speaker_id"] for s in export_data["segments"] if s["speaker_id"])

    export_data["statistics"] = {
        "total_segments": len(export_data["segments"]),
        "total_duration_ms": total_duration_ms,
        "total_duration_seconds": round(total_duration_ms / 1000, 1),
        "total_words": total_words,
        "unique_speakers": len(unique_speakers),
        "speakers": list(unique_speakers)
    }

    # ensure_ascii=False keeps non-ASCII (e.g. German umlauts) readable.
    return json.dumps(export_data, indent=2, ensure_ascii=False)
|
||||
|
||||
|
||||
def export_to_txt(
    segments: List[Dict],
    include_timestamps: bool = False,
    include_speakers: bool = True,
    paragraph_gap_ms: int = 3000
) -> str:
    """
    Export segments to plain text format.

    Args:
        segments: List of transcription segments
        include_timestamps: Add timestamps
        include_speakers: Add speaker labels
        paragraph_gap_ms: Gap threshold for new paragraph

    Returns:
        Plain text formatted string
    """
    out_lines = []
    prev_end_ms = 0
    active_speaker = None

    for seg in segments:
        # A long silence starts a new paragraph (blank line).
        if seg["start_time_ms"] - prev_end_ms > paragraph_gap_ms and out_lines:
            out_lines.append("")

        pieces = []

        if include_timestamps:
            pieces.append(f"[{ms_to_vtt_timestamp(seg['start_time_ms'])}]")

        # Emit the speaker label only when the speaker changes.
        speaker = seg.get("speaker_id")
        if include_speakers and speaker and speaker != active_speaker:
            pieces.append(f"\n{speaker}:")
            active_speaker = speaker

        pieces.append(seg["text"])

        out_lines.append(" ".join(pieces))
        prev_end_ms = seg["end_time_ms"]

    return "\n".join(out_lines)
|
||||
|
||||
|
||||
def create_chapters(
    segments: List[Dict],
    min_chapter_duration_ms: int = 60000,
    speaker_change_as_chapter: bool = True
) -> List[Dict]:
    """
    Create chapter markers from segments.

    Useful for video navigation and table of contents.

    Args:
        segments: List of transcription segments
        min_chapter_duration_ms: Minimum chapter duration
        speaker_change_as_chapter: Create chapter on speaker change

    Returns:
        List of chapter markers (``start_ms``, ``title``, ``speaker``);
        titles are the chapter's first five words plus an ellipsis.
    """
    if not segments:
        return []

    chapters = []
    chunk_start = segments[0]["start_time_ms"]
    chunk_words = []
    chunk_speaker = segments[0].get("speaker_id")

    for seg in segments:
        seg_speaker = seg.get("speaker_id")

        # A chapter boundary opens when the running chapter is long
        # enough, or (optionally) when a new labelled speaker takes over.
        boundary = (
            seg["start_time_ms"] - chunk_start >= min_chapter_duration_ms
            or (
                speaker_change_as_chapter
                and bool(seg_speaker)
                and seg_speaker != chunk_speaker
            )
        )

        if boundary:
            # Flush the running chapter (if it accumulated any words).
            if chunk_words:
                chapters.append({
                    "start_ms": chunk_start,
                    "title": " ".join(chunk_words[:5]) + "...",
                    "speaker": chunk_speaker
                })

            chunk_start = seg["start_time_ms"]
            chunk_words = []
            chunk_speaker = seg_speaker

        chunk_words.extend(seg["text"].split())

    # Flush the trailing chapter.
    if chunk_words:
        chapters.append({
            "start_ms": chunk_start,
            "title": " ".join(chunk_words[:5]) + "...",
            "speaker": chunk_speaker
        })

    return chapters
|
||||
359
backend/transcription_worker/storage.py
Normal file
359
backend/transcription_worker/storage.py
Normal file
@@ -0,0 +1,359 @@
|
||||
"""
|
||||
BreakPilot MinIO Storage Helper
|
||||
|
||||
Provides file upload/download operations for MinIO object storage.
|
||||
"""
|
||||
|
||||
import os
|
||||
import io
|
||||
import structlog
|
||||
from typing import Optional, BinaryIO
|
||||
|
||||
log = structlog.get_logger(__name__)
|
||||
|
||||
|
||||
class MinIOStorage:
    """
    MinIO storage client for recordings and transcriptions.

    Provides methods to upload, download, and manage files
    in MinIO object storage (S3-compatible). The underlying client is
    created lazily on first use.
    """

    def __init__(
        self,
        endpoint: str = "minio:9000",
        access_key: str = "breakpilot",
        secret_key: str = "breakpilot123",
        bucket: str = "breakpilot-recordings",
        secure: bool = False
    ):
        """
        Initialize MinIO client.

        Args:
            endpoint: MinIO server endpoint (host:port)
            access_key: Access key (username)
            secret_key: Secret key (password)
            bucket: Default bucket name
            secure: Use HTTPS

        NOTE(review): the hardcoded default credentials are only suitable
        for local development; production deployments must pass real
        credentials (e.g. from environment/config), never rely on these.
        """
        self.endpoint = endpoint
        self.access_key = access_key
        self.secret_key = secret_key
        self.bucket = bucket
        self.secure = secure
        # Client is created lazily by _get_client().
        self._client = None

    def _get_client(self):
        """Lazy initialize MinIO client.

        Raises:
            ImportError: If the `minio` package is not installed.
        """
        if self._client is not None:
            return self._client

        try:
            # Imported lazily so the module can be imported without the
            # minio dependency installed.
            from minio import Minio

            self._client = Minio(
                self.endpoint,
                access_key=self.access_key,
                secret_key=self.secret_key,
                secure=self.secure
            )

            log.info(
                "minio_client_initialized",
                endpoint=self.endpoint,
                bucket=self.bucket
            )

            return self._client

        except ImportError:
            log.error("minio_not_installed")
            raise ImportError(
                "minio is not installed. "
                "Install with: pip install minio"
            )

    def ensure_bucket(self) -> bool:
        """
        Ensure the bucket exists, create if needed.

        Returns:
            True if bucket exists or was created
        """
        client = self._get_client()

        if not client.bucket_exists(self.bucket):
            client.make_bucket(self.bucket)
            log.info("bucket_created", bucket=self.bucket)
            return True

        return True

    def download_file(
        self,
        object_name: str,
        local_path: str,
        bucket: Optional[str] = None
    ) -> str:
        """
        Download a file from MinIO.

        Args:
            object_name: Path in MinIO bucket
            local_path: Local destination path
            bucket: Optional bucket override

        Returns:
            Local file path
        """
        client = self._get_client()
        bucket = bucket or self.bucket

        log.info(
            "downloading_file",
            bucket=bucket,
            object_name=object_name,
            local_path=local_path
        )

        # Ensure directory exists
        os.makedirs(os.path.dirname(local_path), exist_ok=True)

        # Download
        client.fget_object(bucket, object_name, local_path)

        log.info(
            "file_downloaded",
            object_name=object_name,
            local_path=local_path,
            size=os.path.getsize(local_path)
        )

        return local_path

    def upload_file(
        self,
        local_path: str,
        object_name: str,
        content_type: Optional[str] = None,
        bucket: Optional[str] = None
    ) -> str:
        """
        Upload a file to MinIO.

        Args:
            local_path: Local file path
            object_name: Destination path in MinIO
            content_type: MIME type
            bucket: Optional bucket override

        Returns:
            Object name in MinIO
        """
        client = self._get_client()
        bucket = bucket or self.bucket

        # Ensure bucket exists.
        # NOTE(review): ensure_bucket() always targets self.bucket, even
        # when a different `bucket` override is passed here — confirm
        # whether overridden buckets are pre-provisioned elsewhere.
        self.ensure_bucket()

        log.info(
            "uploading_file",
            local_path=local_path,
            bucket=bucket,
            object_name=object_name
        )

        # Upload
        result = client.fput_object(
            bucket,
            object_name,
            local_path,
            content_type=content_type
        )

        log.info(
            "file_uploaded",
            object_name=object_name,
            etag=result.etag
        )

        return object_name

    def upload_content(
        self,
        content: str,
        object_name: str,
        content_type: str = "text/plain",
        bucket: Optional[str] = None
    ) -> str:
        """
        Upload string content directly to MinIO.

        Args:
            content: String content to upload
            object_name: Destination path in MinIO
            content_type: MIME type
            bucket: Optional bucket override

        Returns:
            Object name in MinIO
        """
        client = self._get_client()
        bucket = bucket or self.bucket

        # Ensure bucket exists (same self.bucket caveat as upload_file).
        self.ensure_bucket()

        # Convert to bytes; put_object requires a stream plus its length.
        data = content.encode("utf-8")
        data_stream = io.BytesIO(data)

        log.info(
            "uploading_content",
            bucket=bucket,
            object_name=object_name,
            size=len(data)
        )

        # Upload
        result = client.put_object(
            bucket,
            object_name,
            data_stream,
            length=len(data),
            content_type=content_type
        )

        log.info(
            "content_uploaded",
            object_name=object_name,
            etag=result.etag
        )

        return object_name

    def get_content(
        self,
        object_name: str,
        bucket: Optional[str] = None
    ) -> str:
        """
        Get string content from MinIO.

        Args:
            object_name: Path in MinIO bucket
            bucket: Optional bucket override

        Returns:
            File content as string (decoded as UTF-8)
        """
        client = self._get_client()
        bucket = bucket or self.bucket

        response = client.get_object(bucket, object_name)
        content = response.read().decode("utf-8")
        # Release the HTTP connection back to the pool.
        response.close()
        response.release_conn()

        return content

    def delete_file(
        self,
        object_name: str,
        bucket: Optional[str] = None
    ) -> bool:
        """
        Delete a file from MinIO.

        Args:
            object_name: Path in MinIO bucket
            bucket: Optional bucket override

        Returns:
            True if deleted
        """
        client = self._get_client()
        bucket = bucket or self.bucket

        client.remove_object(bucket, object_name)

        log.info("file_deleted", object_name=object_name)
        return True

    def file_exists(
        self,
        object_name: str,
        bucket: Optional[str] = None
    ) -> bool:
        """
        Check if a file exists in MinIO.

        Args:
            object_name: Path in MinIO bucket
            bucket: Optional bucket override

        Returns:
            True if file exists
        """
        client = self._get_client()
        bucket = bucket or self.bucket

        try:
            client.stat_object(bucket, object_name)
            return True
        # NOTE(review): the broad catch also treats network/auth errors
        # as "file missing" — consider narrowing to minio's S3Error.
        except Exception:
            return False

    def get_presigned_url(
        self,
        object_name: str,
        expires_hours: int = 24,
        bucket: Optional[str] = None
    ) -> str:
        """
        Get a presigned URL for temporary file access.

        Args:
            object_name: Path in MinIO bucket
            expires_hours: URL validity in hours
            bucket: Optional bucket override

        Returns:
            Presigned URL
        """
        from datetime import timedelta

        client = self._get_client()
        bucket = bucket or self.bucket

        url = client.presigned_get_object(
            bucket,
            object_name,
            expires=timedelta(hours=expires_hours)
        )

        return url

    def list_files(
        self,
        prefix: str = "",
        bucket: Optional[str] = None
    ) -> list:
        """
        List files with a given prefix.

        Args:
            prefix: Path prefix to filter
            bucket: Optional bucket override

        Returns:
            List of object names (recursive listing)
        """
        client = self._get_client()
        bucket = bucket or self.bucket

        objects = client.list_objects(bucket, prefix=prefix, recursive=True)

        return [obj.object_name for obj in objects]
|
||||
230
backend/transcription_worker/tasks.py
Normal file
230
backend/transcription_worker/tasks.py
Normal file
@@ -0,0 +1,230 @@
|
||||
"""
|
||||
BreakPilot Transcription Tasks
|
||||
|
||||
RQ task definitions for transcription processing.
|
||||
"""
|
||||
|
||||
import os
|
||||
import time
|
||||
import tempfile
|
||||
import structlog
|
||||
from typing import Optional
|
||||
from datetime import datetime
|
||||
|
||||
from .transcriber import WhisperTranscriber
|
||||
from .diarizer import SpeakerDiarizer
|
||||
from .aligner import TranscriptAligner
|
||||
from .storage import MinIOStorage
|
||||
from .export import export_to_vtt, export_to_srt, export_to_json
|
||||
|
||||
log = structlog.get_logger(__name__)
|
||||
|
||||
# Configuration
|
||||
WHISPER_MODEL = os.getenv("WHISPER_MODEL", "large-v3")
|
||||
WHISPER_DEVICE = os.getenv("WHISPER_DEVICE", "cpu")
|
||||
WHISPER_COMPUTE_TYPE = os.getenv("WHISPER_COMPUTE_TYPE", "int8")
|
||||
PYANNOTE_AUTH_TOKEN = os.getenv("PYANNOTE_AUTH_TOKEN")
|
||||
TEMP_DIR = os.getenv("TEMP_DIR", "/tmp/transcriptions")
|
||||
|
||||
# MinIO Configuration
|
||||
MINIO_ENDPOINT = os.getenv("MINIO_ENDPOINT", "minio:9000")
|
||||
MINIO_ACCESS_KEY = os.getenv("MINIO_ACCESS_KEY", "breakpilot")
|
||||
MINIO_SECRET_KEY = os.getenv("MINIO_SECRET_KEY", "breakpilot123")
|
||||
MINIO_BUCKET = os.getenv("MINIO_BUCKET", "breakpilot-recordings")
|
||||
MINIO_SECURE = os.getenv("MINIO_SECURE", "false").lower() == "true"
|
||||
|
||||
# Database URL for status updates
|
||||
DATABASE_URL = os.getenv("DATABASE_URL")
|
||||
|
||||
|
||||
def update_transcription_status(
    transcription_id: str,
    status: str,
    error_message: Optional[str] = None,
    **kwargs
):
    """Record a transcription status transition.

    Currently this only emits a structured log event; persisting the
    status change to the database is still outstanding.

    Args:
        transcription_id: UUID of the transcription record
        status: New status value (e.g. "processing", "completed", "failed")
        error_message: Optional failure description
        **kwargs: Additional fields to attach to the status event
    """
    # TODO: Implement database update
    log.info(
        "status_update",
        transcription_id=transcription_id,
        status=status,
        error=error_message,
        **kwargs
    )
def transcribe_recording(
    transcription_id: str,
    recording_id: str,
    audio_path: str,
    language: str = "de",
    enable_diarization: bool = True
) -> dict:
    """
    Main transcription task.

    Downloads audio from MinIO, transcribes with Whisper,
    optionally performs speaker diarization, and uploads
    VTT/SRT/JSON artifacts back to MinIO.

    Args:
        transcription_id: UUID of the transcription record
        recording_id: UUID of the source recording
        audio_path: Path to audio file in MinIO bucket
        language: Language code (de, en, etc.)
        enable_diarization: Whether to perform speaker diarization

    Returns:
        dict with transcription results and artifact paths

    Raises:
        Exception: Any processing error is re-raised after the
            transcription has been marked as failed.
    """
    from datetime import timezone  # datetime.utcnow() is deprecated; use aware UTC

    start_time = time.time()

    log.info(
        "transcription_started",
        transcription_id=transcription_id,
        recording_id=recording_id,
        audio_path=audio_path,
        language=language
    )

    # Update status to processing
    update_transcription_status(
        transcription_id,
        status="processing",
        processing_started_at=datetime.now(timezone.utc).isoformat()
    )

    # Computed up front so the finally-block can always clean it up,
    # even when processing fails part-way through.
    local_audio_path = os.path.join(TEMP_DIR, f"{transcription_id}_audio.wav")

    try:
        # Initialize storage
        storage = MinIOStorage(
            endpoint=MINIO_ENDPOINT,
            access_key=MINIO_ACCESS_KEY,
            secret_key=MINIO_SECRET_KEY,
            bucket=MINIO_BUCKET,
            secure=MINIO_SECURE
        )

        # Create temp directory and download the audio file
        os.makedirs(TEMP_DIR, exist_ok=True)
        storage.download_file(audio_path, local_audio_path)
        log.info("audio_downloaded", path=local_audio_path)

        # Initialize transcriber
        transcriber = WhisperTranscriber(
            model_name=WHISPER_MODEL,
            device=WHISPER_DEVICE,
            compute_type=WHISPER_COMPUTE_TYPE
        )

        # Transcribe audio
        log.info("transcription_starting", model=WHISPER_MODEL)
        segments = transcriber.transcribe(
            audio_path=local_audio_path,
            language=language
        )
        log.info("transcription_complete", segments_count=len(segments))

        # Speaker diarization (if enabled and token available)
        if enable_diarization and PYANNOTE_AUTH_TOKEN:
            log.info("diarization_starting")
            diarizer = SpeakerDiarizer(auth_token=PYANNOTE_AUTH_TOKEN)
            speaker_segments = diarizer.diarize(local_audio_path)

            # Align transcription with speakers
            aligner = TranscriptAligner()
            segments = aligner.align(segments, speaker_segments)
            log.info("diarization_complete", speakers=aligner.get_speaker_count())
        else:
            log.info("diarization_skipped", reason="disabled or no token")

        # Calculate statistics. A segment's "confidence" may be None
        # (WhisperTranscriber stores None when no avg_logprob is reported),
        # so coerce missing values to 0 instead of summing None.
        full_text = " ".join(s["text"] for s in segments)
        word_count = len(full_text.split())
        if segments:
            avg_confidence = sum((s.get("confidence") or 0) for s in segments) / len(segments)
        else:
            avg_confidence = 0

        # Export to different formats
        base_path = audio_path.rsplit("/", 1)[0]  # recordings/{recording_name}

        # WebVTT
        vtt_content = export_to_vtt(segments)
        vtt_path = f"{base_path}/transcript.vtt"
        storage.upload_content(vtt_content, vtt_path, content_type="text/vtt")

        # SRT
        srt_content = export_to_srt(segments)
        srt_path = f"{base_path}/transcript.srt"
        storage.upload_content(srt_content, srt_path, content_type="text/plain")

        # JSON (full data with speakers)
        json_content = export_to_json(segments, {
            "transcription_id": transcription_id,
            "recording_id": recording_id,
            "language": language,
            "model": WHISPER_MODEL,
            "word_count": word_count,
            "confidence": avg_confidence
        })
        json_path = f"{base_path}/transcript.json"
        storage.upload_content(json_content, json_path, content_type="application/json")

        # Calculate processing time
        processing_duration = int(time.time() - start_time)

        # Update status to completed
        result = {
            "transcription_id": transcription_id,
            "recording_id": recording_id,
            "status": "completed",
            "full_text": full_text,
            "word_count": word_count,
            "confidence_score": round(avg_confidence, 3),
            "segments_count": len(segments),
            "vtt_path": vtt_path,
            "srt_path": srt_path,
            "json_path": json_path,
            "processing_duration_seconds": processing_duration
        }

        update_transcription_status(
            transcription_id,
            status="completed",
            full_text=full_text,
            word_count=word_count,
            confidence_score=avg_confidence,
            vtt_path=vtt_path,
            srt_path=srt_path,
            json_path=json_path,
            processing_duration_seconds=processing_duration,
            processing_completed_at=datetime.now(timezone.utc).isoformat()
        )

        log.info(
            "transcription_completed",
            transcription_id=transcription_id,
            word_count=word_count,
            duration_seconds=processing_duration
        )

        return result

    except Exception as e:
        log.error(
            "transcription_failed",
            transcription_id=transcription_id,
            error=str(e)
        )

        update_transcription_status(
            transcription_id,
            status="failed",
            error_message=str(e)
        )

        raise

    finally:
        # Always remove the downloaded temp audio file. Previously this
        # only ran on success, leaking files on every failed job.
        if os.path.exists(local_audio_path):
            os.remove(local_audio_path)
211
backend/transcription_worker/transcriber.py
Normal file
211
backend/transcription_worker/transcriber.py
Normal file
@@ -0,0 +1,211 @@
|
||||
"""
|
||||
BreakPilot Whisper Transcriber
|
||||
|
||||
Uses faster-whisper (MIT License) for GPU-optimized transcription.
|
||||
Based on CTranslate2 for fast inference.
|
||||
"""
|
||||
|
||||
import os
|
||||
import structlog
|
||||
from typing import List, Dict, Optional
|
||||
|
||||
log = structlog.get_logger(__name__)
|
||||
|
||||
|
||||
class WhisperTranscriber:
    """
    Whisper-based audio transcription using faster-whisper.

    faster-whisper is a reimplementation of OpenAI Whisper using CTranslate2,
    which is significantly faster than the original implementation.

    License: MIT
    Source: https://github.com/SYSTRAN/faster-whisper
    """

    def __init__(
        self,
        model_name: str = "large-v3",
        device: str = "cpu",
        compute_type: str = "int8"
    ):
        """
        Initialize the transcriber.

        The model itself is loaded lazily on first use (see _load_model),
        so construction is cheap and performs no downloads.

        Args:
            model_name: Whisper model to use (tiny, base, small, medium, large-v3)
            device: Device to run on ("cpu", "cuda", "auto")
            compute_type: Quantization type ("int8", "float16", "float32")
        """
        self.model_name = model_name
        self.device = device
        self.compute_type = compute_type
        self._model = None  # populated by _load_model() on first use

    def _load_model(self):
        """Lazy load the model on first use."""
        if self._model is not None:
            return

        try:
            from faster_whisper import WhisperModel

            log.info(
                "loading_whisper_model",
                model=self.model_name,
                device=self.device,
                compute_type=self.compute_type
            )

            self._model = WhisperModel(
                self.model_name,
                device=self.device,
                compute_type=self.compute_type
            )

            log.info("whisper_model_loaded")

        except ImportError:
            log.error("faster_whisper_not_installed")
            raise ImportError(
                "faster-whisper is not installed. "
                "Install with: pip install faster-whisper"
            )

    def transcribe(
        self,
        audio_path: str,
        language: str = "de",
        beam_size: int = 5,
        word_timestamps: bool = True,
        vad_filter: bool = True,
        vad_parameters: Optional[dict] = None
    ) -> List[Dict]:
        """
        Transcribe an audio file.

        Args:
            audio_path: Path to audio file (WAV, MP3, etc.)
            language: Language code (de, en, fr, etc.) or None for auto-detection
            beam_size: Beam size for decoding (higher = better but slower)
            word_timestamps: Include word-level timestamps
            vad_filter: Enable Voice Activity Detection to filter silence
            vad_parameters: Custom VAD parameters

        Returns:
            List of segments with text, timestamps (milliseconds), and
            confidence scores. "confidence" is the segment avg_logprob
            rounded to 3 decimals, or None when unavailable.

        Raises:
            FileNotFoundError: If audio_path does not exist.
        """
        self._load_model()

        if not os.path.exists(audio_path):
            raise FileNotFoundError(f"Audio file not found: {audio_path}")

        log.info(
            "transcribing_audio",
            audio_path=audio_path,
            language=language,
            beam_size=beam_size
        )

        # Default VAD parameters for better speech detection
        if vad_parameters is None:
            vad_parameters = {
                "min_silence_duration_ms": 500,
                "speech_pad_ms": 400
            }

        # Run transcription
        segments_gen, info = self._model.transcribe(
            audio_path,
            language=language,
            beam_size=beam_size,
            word_timestamps=word_timestamps,
            vad_filter=vad_filter,
            vad_parameters=vad_parameters
        )

        log.info(
            "transcription_info",
            detected_language=info.language,
            language_probability=info.language_probability,
            duration=info.duration
        )

        # Convert generator to list of segments
        segments = []
        for i, segment in enumerate(segments_gen):
            seg_dict = {
                "index": i,
                "start_time_ms": int(segment.start * 1000),
                "end_time_ms": int(segment.end * 1000),
                "text": segment.text.strip(),
                # Compare against None explicitly: avg_logprob is a log
                # probability and a value of exactly 0.0 is legitimate,
                # but the previous truthiness check mapped it to None.
                "confidence": round(segment.avg_logprob, 3) if segment.avg_logprob is not None else None,
                "no_speech_prob": segment.no_speech_prob
            }

            # Add word-level timestamps if available
            if word_timestamps and segment.words:
                seg_dict["words"] = [
                    {
                        "word": word.word,
                        "start": int(word.start * 1000),
                        "end": int(word.end * 1000),
                        "probability": round(word.probability, 3)
                    }
                    for word in segment.words
                ]

            segments.append(seg_dict)

        log.info(
            "transcription_complete",
            segments_count=len(segments),
            duration_seconds=info.duration
        )

        return segments

    def detect_language(self, audio_path: str) -> dict:
        """
        Detect the language of an audio file.

        The segment generator returned by faster-whisper is never
        consumed here, so only the language-detection pass runs (which
        inspects the beginning of the audio), not a full transcription.

        Args:
            audio_path: Path to audio file

        Returns:
            dict with language code and probability

        Raises:
            FileNotFoundError: If audio_path does not exist.
        """
        self._load_model()

        if not os.path.exists(audio_path):
            raise FileNotFoundError(f"Audio file not found: {audio_path}")

        _, info = self._model.transcribe(
            audio_path,
            language=None,  # Auto-detect
            beam_size=1,
            without_timestamps=True
        )

        return {
            "language": info.language,
            "probability": info.language_probability
        }

    @property
    def available_languages(self) -> List[str]:
        """List of supported languages."""
        return [
            "de", "en", "fr", "es", "it", "pt", "nl", "pl", "ru",
            "zh", "ja", "ko", "ar", "tr", "hi", "vi", "th", "id"
        ]

    def get_model_info(self) -> dict:
        """Get information about the loaded model (without loading it)."""
        return {
            "model_name": self.model_name,
            "device": self.device,
            "compute_type": self.compute_type,
            "loaded": self._model is not None
        }
129
backend/transcription_worker/worker.py
Normal file
129
backend/transcription_worker/worker.py
Normal file
@@ -0,0 +1,129 @@
|
||||
"""
|
||||
BreakPilot Transcription Worker - Main Entry Point
|
||||
|
||||
Runs as an RQ worker, processing transcription jobs from the queue.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import signal
|
||||
import structlog
|
||||
from redis import Redis
|
||||
from rq import Worker, Queue, Connection
|
||||
|
||||
# Configure logging
|
||||
structlog.configure(
|
||||
processors=[
|
||||
structlog.stdlib.filter_by_level,
|
||||
structlog.stdlib.add_logger_name,
|
||||
structlog.stdlib.add_log_level,
|
||||
structlog.stdlib.PositionalArgumentsFormatter(),
|
||||
structlog.processors.TimeStamper(fmt="iso"),
|
||||
structlog.processors.StackInfoRenderer(),
|
||||
structlog.processors.format_exc_info,
|
||||
structlog.processors.UnicodeDecoder(),
|
||||
structlog.processors.JSONRenderer()
|
||||
],
|
||||
wrapper_class=structlog.stdlib.BoundLogger,
|
||||
context_class=dict,
|
||||
logger_factory=structlog.stdlib.LoggerFactory(),
|
||||
cache_logger_on_first_use=True,
|
||||
)
|
||||
|
||||
log = structlog.get_logger(__name__)
|
||||
|
||||
# Configuration
|
||||
REDIS_URL = os.getenv("REDIS_URL", "redis://localhost:6379/1")
|
||||
QUEUE_NAME = os.getenv("QUEUE_NAME", "transcription")
|
||||
WORKER_NAME = os.getenv("WORKER_NAME", f"transcription-worker-{os.getpid()}")
|
||||
|
||||
|
||||
def setup_signal_handlers(worker: Worker):
    """Register SIGINT/SIGTERM handlers that stop the worker gracefully.

    Args:
        worker: The RQ worker instance to stop when a shutdown
            signal is received.
    """

    def _on_shutdown(signum, frame):
        log.info("shutdown_signal_received", signal=signum)
        worker.request_stop(signum, frame)

    for sig in (signal.SIGINT, signal.SIGTERM):
        signal.signal(sig, _on_shutdown)
def preload_models():
    """Preload ML models to reduce first-job latency.

    Eagerly loads the Whisper model (WhisperTranscriber defers loading
    until first use, so instantiation alone does not preload anything)
    and instantiates the diarizer when a pyannote token is configured.
    Failures are logged but non-fatal: models simply fall back to lazy
    loading on the first job.
    """
    log.info("preloading_models")

    try:
        from .transcriber import WhisperTranscriber
        from .diarizer import SpeakerDiarizer

        # Initialize transcriber (downloads model if needed)
        whisper_model = os.getenv("WHISPER_MODEL", "large-v3")
        device = os.getenv("WHISPER_DEVICE", "cpu")
        compute_type = os.getenv("WHISPER_COMPUTE_TYPE", "int8")

        transcriber = WhisperTranscriber(
            model_name=whisper_model,
            device=device,
            compute_type=compute_type
        )
        # WhisperTranscriber loads lazily; trigger the load now so the
        # model is actually downloaded/initialized before the first job.
        transcriber._load_model()
        log.info("whisper_model_loaded", model=whisper_model, device=device)

        # Initialize diarizer (downloads model if needed)
        pyannote_token = os.getenv("PYANNOTE_AUTH_TOKEN")
        if pyannote_token:
            # NOTE(review): assumes SpeakerDiarizer loads its pipeline in
            # __init__ -- confirm, or trigger its lazy load here as well.
            SpeakerDiarizer(auth_token=pyannote_token)
            log.info("pyannote_model_loaded")
        else:
            log.warning("pyannote_token_missing", message="Speaker diarization disabled")

    except Exception as e:
        log.error("model_preload_failed", error=str(e))
        # Don't fail startup, models will be loaded on first job
def main():
    """Main entry point for the worker.

    Connects to Redis (exiting with status 1 if unreachable), preloads
    ML models, then processes jobs from the transcription queue until a
    shutdown signal arrives.
    """
    log.info(
        "worker_starting",
        redis_url=REDIS_URL,
        queue=QUEUE_NAME,
        worker_name=WORKER_NAME
    )

    # Connect to Redis
    redis_conn = Redis.from_url(REDIS_URL)

    # Fail fast: the worker is useless without its queue backend.
    try:
        redis_conn.ping()
        log.info("redis_connected")
    except Exception as e:
        log.error("redis_connection_failed", error=str(e))
        sys.exit(1)

    # Preload models (non-fatal on failure)
    preload_models()

    # Create queue
    queue = Queue(QUEUE_NAME, connection=redis_conn)

    # Create worker
    worker = Worker(
        queues=[queue],
        connection=redis_conn,
        name=WORKER_NAME
    )

    # Setup signal handlers for graceful shutdown
    setup_signal_handlers(worker)

    log.info("worker_ready", queues=[QUEUE_NAME])

    # Start processing. The worker already carries its connection
    # explicitly, so the deprecated rq.Connection context manager
    # (removed in RQ 2.0) is unnecessary and has been dropped.
    worker.work(with_scheduler=True)


if __name__ == "__main__":
    main()
Reference in New Issue
Block a user