breakpilot-pwa/backend/transcription_worker/tasks.py

"""
BreakPilot Transcription Tasks

RQ task definitions for transcription processing.
"""

import os
import time
import tempfile
import structlog
from typing import Optional
from datetime import datetime

from .transcriber import WhisperTranscriber
from .diarizer import SpeakerDiarizer
from .aligner import TranscriptAligner
from .storage import MinIOStorage
from .export import export_to_vtt, export_to_srt, export_to_json

log = structlog.get_logger(__name__)

# Configuration
WHISPER_MODEL = os.getenv("WHISPER_MODEL", "large-v3")
WHISPER_DEVICE = os.getenv("WHISPER_DEVICE", "cpu")
WHISPER_COMPUTE_TYPE = os.getenv("WHISPER_COMPUTE_TYPE", "int8")
PYANNOTE_AUTH_TOKEN = os.getenv("PYANNOTE_AUTH_TOKEN")
TEMP_DIR = os.getenv("TEMP_DIR", "/tmp/transcriptions")

# MinIO Configuration
MINIO_ENDPOINT = os.getenv("MINIO_ENDPOINT", "minio:9000")
MINIO_ACCESS_KEY = os.getenv("MINIO_ACCESS_KEY", "breakpilot")
MINIO_SECRET_KEY = os.getenv("MINIO_SECRET_KEY", "breakpilot123")
MINIO_BUCKET = os.getenv("MINIO_BUCKET", "breakpilot-recordings")
MINIO_SECURE = os.getenv("MINIO_SECURE", "false").lower() == "true"

# Database URL for status updates
DATABASE_URL = os.getenv("DATABASE_URL")


def update_transcription_status(
    transcription_id: str,
    status: str,
    error_message: Optional[str] = None,
    **kwargs
):
    """Update transcription status in database."""
    # TODO: Implement database update
    log.info(
        "status_update",
        transcription_id=transcription_id,
        status=status,
        error=error_message,
        **kwargs
    )


def transcribe_recording(
    transcription_id: str,
    recording_id: str,
    audio_path: str,
    language: str = "de",
    enable_diarization: bool = True
) -> dict:
    """
    Main transcription task.

    Downloads audio from MinIO, transcribes with Whisper,
    optionally performs speaker diarization, and uploads results.

    Args:
        transcription_id: UUID of the transcription record
        recording_id: UUID of the source recording
        audio_path: Path to audio file in MinIO bucket
        language: Language code (de, en, etc.)
        enable_diarization: Whether to perform speaker diarization

    Returns:
        dict with transcription results and paths
    """
    start_time = time.time()

    log.info(
        "transcription_started",
        transcription_id=transcription_id,
        recording_id=recording_id,
        audio_path=audio_path,
        language=language
    )

    # Update status to processing
    update_transcription_status(
        transcription_id,
        status="processing",
        processing_started_at=datetime.utcnow().isoformat()
    )

    try:
        # Initialize storage
        storage = MinIOStorage(
            endpoint=MINIO_ENDPOINT,
            access_key=MINIO_ACCESS_KEY,
            secret_key=MINIO_SECRET_KEY,
            bucket=MINIO_BUCKET,
            secure=MINIO_SECURE
        )

        # Create temp directory
        os.makedirs(TEMP_DIR, exist_ok=True)

        # Download audio file
        local_audio_path = os.path.join(TEMP_DIR, f"{transcription_id}_audio.wav")
        storage.download_file(audio_path, local_audio_path)
        log.info("audio_downloaded", path=local_audio_path)

        # Initialize transcriber
        transcriber = WhisperTranscriber(
            model_name=WHISPER_MODEL,
            device=WHISPER_DEVICE,
            compute_type=WHISPER_COMPUTE_TYPE
        )

        # Transcribe audio
        log.info("transcription_starting", model=WHISPER_MODEL)
        segments = transcriber.transcribe(
            audio_path=local_audio_path,
            language=language
        )
        log.info("transcription_complete", segments_count=len(segments))

        # Speaker diarization (if enabled and token available)
        if enable_diarization and PYANNOTE_AUTH_TOKEN:
            log.info("diarization_starting")
            diarizer = SpeakerDiarizer(auth_token=PYANNOTE_AUTH_TOKEN)
            speaker_segments = diarizer.diarize(local_audio_path)

            # Align transcription with speakers
            aligner = TranscriptAligner()
            segments = aligner.align(segments, speaker_segments)
            log.info("diarization_complete", speakers=aligner.get_speaker_count())
        else:
            log.info("diarization_skipped", reason="disabled or no token")

        # Calculate statistics
        full_text = " ".join(s["text"] for s in segments)
        word_count = len(full_text.split())
        avg_confidence = sum(s.get("confidence", 0) for s in segments) / len(segments) if segments else 0

        # Export to different formats
        base_path = audio_path.rsplit("/", 1)[0]  # recordings/{recording_name}

        # WebVTT
        vtt_content = export_to_vtt(segments)
        vtt_path = f"{base_path}/transcript.vtt"
        storage.upload_content(vtt_content, vtt_path, content_type="text/vtt")

        # SRT
        srt_content = export_to_srt(segments)
        srt_path = f"{base_path}/transcript.srt"
        storage.upload_content(srt_content, srt_path, content_type="text/plain")

        # JSON (full data with speakers)
        json_content = export_to_json(segments, {
            "transcription_id": transcription_id,
            "recording_id": recording_id,
            "language": language,
            "model": WHISPER_MODEL,
            "word_count": word_count,
            "confidence": avg_confidence
        })
        json_path = f"{base_path}/transcript.json"
        storage.upload_content(json_content, json_path, content_type="application/json")

        # Cleanup temp file
        if os.path.exists(local_audio_path):
            os.remove(local_audio_path)

        # Calculate processing time
        processing_duration = int(time.time() - start_time)

        # Update status to completed
        result = {
            "transcription_id": transcription_id,
            "recording_id": recording_id,
            "status": "completed",
            "full_text": full_text,
            "word_count": word_count,
            "confidence_score": round(avg_confidence, 3),
            "segments_count": len(segments),
            "vtt_path": vtt_path,
            "srt_path": srt_path,
            "json_path": json_path,
            "processing_duration_seconds": processing_duration
        }

        update_transcription_status(
            transcription_id,
            status="completed",
            full_text=full_text,
            word_count=word_count,
            confidence_score=avg_confidence,
            vtt_path=vtt_path,
            srt_path=srt_path,
            json_path=json_path,
            processing_duration_seconds=processing_duration,
            processing_completed_at=datetime.utcnow().isoformat()
        )

        log.info(
            "transcription_completed",
            transcription_id=transcription_id,
            word_count=word_count,
            duration_seconds=processing_duration
        )

        return result

    except Exception as e:
        log.error(
            "transcription_failed",
            transcription_id=transcription_id,
            error=str(e)
        )

        update_transcription_status(
            transcription_id,
            status="failed",
            error_message=str(e)
        )

        raise