backend-lehrer (11 files): - llm_gateway/routes/schools.py (867 → 5), recording_api.py (848 → 6) - messenger_api.py (840 → 5), print_generator.py (824 → 5) - unit_analytics_api.py (751 → 5), classroom/routes/context.py (726 → 4) - llm_gateway/routes/edu_search_seeds.py (710 → 4) klausur-service (12 files): - ocr_labeling_api.py (845 → 4), metrics_db.py (833 → 4) - legal_corpus_api.py (790 → 4), page_crop.py (758 → 3) - mail/ai_service.py (747 → 4), github_crawler.py (767 → 3) - trocr_service.py (730 → 4), full_compliance_pipeline.py (723 → 4) - dsfa_rag_api.py (715 → 4), ocr_pipeline_auto.py (705 → 4) website (6 pages): - audit-checklist (867 → 8), content (806 → 6) - screen-flow (790 → 4), scraper (789 → 5) - zeugnisse (776 → 5), modules (745 → 4) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
251 lines
7.8 KiB
Python
251 lines
7.8 KiB
Python
"""
Recording API - Transcription Routes.

Start transcription, get status, download VTT/SRT subtitle files.
"""
|
|
|
|
import uuid
|
|
from datetime import datetime
|
|
from typing import Optional
|
|
|
|
from fastapi import APIRouter, HTTPException
|
|
from fastapi.responses import PlainTextResponse
|
|
|
|
from recording_models import (
|
|
TranscriptionRequest,
|
|
TranscriptionStatusResponse,
|
|
)
|
|
from recording_helpers import (
|
|
_recordings_store,
|
|
_transcriptions_store,
|
|
log_audit,
|
|
format_vtt_time,
|
|
format_srt_time,
|
|
)
|
|
|
|
# Router that the transcription endpoints in this module register on;
# every route added to it carries the "Recordings" tag.
router = APIRouter(tags=["Recordings"])
|
|
|
|
|
|
# ==========================================
|
|
# TRANSCRIPTION ENDPOINTS
|
|
# ==========================================
|
|
|
|
@router.post("/{recording_id}/transcribe", response_model=TranscriptionStatusResponse)
async def start_transcription(recording_id: str, request: TranscriptionRequest):
    """
    Create a pending transcription entry for a recording.

    The recording is looked up in the recordings store. A new entry is
    written to the transcriptions store, the recording is flagged as
    "processing", and an audit record is emitted. Dispatching the job to a
    worker queue is not implemented yet (see the TODO below).

    Raises:
        HTTPException: 404 if the recording is unknown, 400 if it is marked
            as deleted, 409 if a transcription for it already exists whose
            status is anything other than "failed".
    """
    recording = _recordings_store.get(recording_id)
    if not recording:
        raise HTTPException(status_code=404, detail="Recording not found")
    if recording["status"] == "deleted":
        raise HTTPException(status_code=400, detail="Cannot transcribe deleted recording")

    # Only a failed earlier attempt may be retried; any other state blocks a new run.
    for candidate in _transcriptions_store.values():
        if candidate["recording_id"] == recording_id and candidate["status"] != "failed":
            raise HTTPException(
                status_code=409,
                detail=f"Transcription already exists with status: {candidate['status']}"
            )

    transcription_id = str(uuid.uuid4())
    created = datetime.utcnow()
    stamp = created.isoformat()
    language = request.language
    model = request.model

    # Result fields start out empty and are expected to be filled in later.
    unset_fields = (
        "full_text",
        "word_count",
        "confidence_score",
        "vtt_path",
        "srt_path",
        "json_path",
        "error_message",
        "processing_started_at",
        "processing_completed_at",
        "processing_duration_seconds",
    )
    entry = {
        "id": transcription_id,
        "recording_id": recording_id,
        "language": language,
        "model": model,
        "status": "pending",
        **dict.fromkeys(unset_fields),
        "created_at": stamp,
        "updated_at": stamp,
    }
    _transcriptions_store[transcription_id] = entry

    # Reflect the new job on the recording itself.
    recording["status"] = "processing"
    recording["updated_at"] = stamp

    log_audit(
        action="transcription_started",
        recording_id=recording_id,
        transcription_id=transcription_id,
        metadata={"language": language, "model": model}
    )

    # TODO: Queue job to Redis/Valkey for transcription worker

    return TranscriptionStatusResponse(
        id=transcription_id,
        recording_id=recording_id,
        status="pending",
        language=language,
        model=model,
        word_count=None,
        confidence_score=None,
        processing_duration_seconds=None,
        error_message=None,
        created_at=created,
        completed_at=None
    )
|
|
|
|
|
|
@router.get("/{recording_id}/transcription", response_model=TranscriptionStatusResponse)
async def get_transcription_status(recording_id: str):
    """
    Get transcription status for a recording.

    A recording may have several entries in the transcriptions store,
    because a new transcription can be started once an earlier one has
    failed. The most recently created entry is reported, so a retry is not
    hidden behind the older failed attempt.

    Raises:
        HTTPException: 404 if no transcription exists for the recording.
    """
    # "created_at" holds ISO-8601 strings (parsed with fromisoformat below),
    # which compare chronologically as plain strings.
    transcription = max(
        (t for t in _transcriptions_store.values() if t["recording_id"] == recording_id),
        key=lambda t: t["created_at"],
        default=None,
    )
    if not transcription:
        raise HTTPException(status_code=404, detail="No transcription found for this recording")

    completed_raw = transcription.get("processing_completed_at")

    return TranscriptionStatusResponse(
        id=transcription["id"],
        recording_id=transcription["recording_id"],
        status=transcription["status"],
        language=transcription["language"],
        model=transcription["model"],
        word_count=transcription.get("word_count"),
        confidence_score=transcription.get("confidence_score"),
        processing_duration_seconds=transcription.get("processing_duration_seconds"),
        error_message=transcription.get("error_message"),
        created_at=datetime.fromisoformat(transcription["created_at"]),
        # The completion timestamp stays None until one has been recorded.
        completed_at=datetime.fromisoformat(completed_raw) if completed_raw else None,
    )
|
|
|
|
|
|
@router.get("/{recording_id}/transcription/text")
async def get_transcription_text(recording_id: str):
    """
    Get the full transcription text.

    When a recording has several transcription entries (a new one can be
    started after an earlier one failed), the most recently created entry
    is used.

    Returns:
        A dict with "transcription_id", "recording_id", "language", "text"
        and "word_count". "text" falls back to "" and "word_count" to 0 when
        the stored value is missing or None.

    Raises:
        HTTPException: 404 if no transcription exists for the recording,
            400 if the transcription status is not "completed".
    """
    # ISO-8601 "created_at" strings compare chronologically as plain strings.
    transcription = max(
        (t for t in _transcriptions_store.values() if t["recording_id"] == recording_id),
        key=lambda t: t["created_at"],
        default=None,
    )
    if not transcription:
        raise HTTPException(status_code=404, detail="No transcription found for this recording")

    if transcription["status"] != "completed":
        raise HTTPException(
            status_code=400,
            detail=f"Transcription not ready. Status: {transcription['status']}"
        )

    return {
        "transcription_id": transcription["id"],
        "recording_id": recording_id,
        "language": transcription["language"],
        # The keys can be present with value None, in which case a .get()
        # default would never apply; coalesce explicitly instead.
        "text": transcription.get("full_text") or "",
        "word_count": transcription.get("word_count") or 0,
    }
|
|
|
|
|
|
@router.get("/{recording_id}/transcription/vtt")
async def get_transcription_vtt(recording_id: str):
    """
    Download transcription as WebVTT subtitle file.

    When a recording has several transcription entries (a new one can be
    started after an earlier one failed), the most recently created entry
    is used.

    The cues are synthetic: the full text is split after every "." and at
    existing newlines, and each non-empty piece gets its own consecutive,
    fixed-length cue. No real timing information is used.

    Raises:
        HTTPException: 404 if no transcription exists for the recording,
            400 if the transcription status is not "completed".
    """
    # ISO-8601 "created_at" strings compare chronologically as plain strings.
    transcription = max(
        (t for t in _transcriptions_store.values() if t["recording_id"] == recording_id),
        key=lambda t: t["created_at"],
        default=None,
    )
    if not transcription:
        raise HTTPException(status_code=404, detail="No transcription found for this recording")

    if transcription["status"] != "completed":
        raise HTTPException(
            status_code=400,
            detail=f"Transcription not ready. Status: {transcription['status']}"
        )

    # "full_text" may be stored as None; treat that like an empty transcript.
    text = transcription.get("full_text") or ""

    # Each cue advances the offset by 3000; presumably milliseconds --
    # confirm against format_vtt_time.
    cue_length = 3000
    pieces = (piece.strip() for piece in text.replace(".", ".\n").split("\n"))
    cues = [
        f"{format_vtt_time(position * cue_length)} --> "
        f"{format_vtt_time((position + 1) * cue_length)}\n"
        f"{sentence}\n\n"
        for position, sentence in enumerate(piece for piece in pieces if piece)
    ]
    vtt_content = "WEBVTT\n\n" + "".join(cues)

    return PlainTextResponse(
        content=vtt_content,
        media_type="text/vtt",
        headers={"Content-Disposition": f"attachment; filename=transcript_{recording_id}.vtt"}
    )
|
|
|
|
|
|
@router.get("/{recording_id}/transcription/srt")
async def get_transcription_srt(recording_id: str):
    """
    Download transcription as SRT subtitle file.

    When a recording has several transcription entries (a new one can be
    started after an earlier one failed), the most recently created entry
    is used.

    The subtitle blocks are synthetic: the full text is split after every
    "." and at existing newlines, and each non-empty piece becomes one
    numbered block (starting at 1) with a consecutive, fixed-length time
    range. No real timing information is used.

    Raises:
        HTTPException: 404 if no transcription exists for the recording,
            400 if the transcription status is not "completed".
    """
    # ISO-8601 "created_at" strings compare chronologically as plain strings.
    transcription = max(
        (t for t in _transcriptions_store.values() if t["recording_id"] == recording_id),
        key=lambda t: t["created_at"],
        default=None,
    )
    if not transcription:
        raise HTTPException(status_code=404, detail="No transcription found for this recording")

    if transcription["status"] != "completed":
        raise HTTPException(
            status_code=400,
            detail=f"Transcription not ready. Status: {transcription['status']}"
        )

    # "full_text" may be stored as None; treat that like an empty transcript.
    text = transcription.get("full_text") or ""

    # Each block advances the offset by 3000; presumably milliseconds --
    # confirm against format_srt_time.
    cue_length = 3000
    pieces = (piece.strip() for piece in text.replace(".", ".\n").split("\n"))
    blocks = [
        f"{number}\n"
        f"{format_srt_time((number - 1) * cue_length)} --> "
        f"{format_srt_time(number * cue_length)}\n"
        f"{sentence}\n\n"
        for number, sentence in enumerate((piece for piece in pieces if piece), start=1)
    ]
    srt_content = "".join(blocks)

    return PlainTextResponse(
        content=srt_content,
        media_type="text/plain",
        headers={"Content-Disposition": f"attachment; filename=transcript_{recording_id}.srt"}
    )
|