[split-required] Split 700-870 LOC files across all services
backend-lehrer (11 files): - llm_gateway/routes/schools.py (867 → 5), recording_api.py (848 → 6) - messenger_api.py (840 → 5), print_generator.py (824 → 5) - unit_analytics_api.py (751 → 5), classroom/routes/context.py (726 → 4) - llm_gateway/routes/edu_search_seeds.py (710 → 4) klausur-service (12 files): - ocr_labeling_api.py (845 → 4), metrics_db.py (833 → 4) - legal_corpus_api.py (790 → 4), page_crop.py (758 → 3) - mail/ai_service.py (747 → 4), github_crawler.py (767 → 3) - trocr_service.py (730 → 4), full_compliance_pipeline.py (723 → 4) - dsfa_rag_api.py (715 → 4), ocr_pipeline_auto.py (705 → 4) website (6 pages): - audit-checklist (867 → 8), content (806 → 6) - screen-flow (790 → 4), scraper (789 → 5) - zeugnisse (776 → 5), modules (745 → 4) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
250
backend-lehrer/recording_transcription.py
Normal file
250
backend-lehrer/recording_transcription.py
Normal file
@@ -0,0 +1,250 @@
|
||||
"""
|
||||
Recording API - Transcription Routes.
|
||||
|
||||
Start transcription, get status, download VTT/SRT subtitle files.
|
||||
"""
|
||||
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import APIRouter, HTTPException
|
||||
from fastapi.responses import PlainTextResponse
|
||||
|
||||
from recording_models import (
|
||||
TranscriptionRequest,
|
||||
TranscriptionStatusResponse,
|
||||
)
|
||||
from recording_helpers import (
|
||||
_recordings_store,
|
||||
_transcriptions_store,
|
||||
log_audit,
|
||||
format_vtt_time,
|
||||
format_srt_time,
|
||||
)
|
||||
|
||||
# Router for the transcription endpoints defined in this module; routes are tagged "Recordings".
router = APIRouter(tags=["Recordings"])
|
||||
|
||||
|
||||
# ==========================================
|
||||
# TRANSCRIPTION ENDPOINTS
|
||||
# ==========================================
|
||||
|
||||
@router.post("/{recording_id}/transcribe", response_model=TranscriptionStatusResponse)
async def start_transcription(recording_id: str, request: TranscriptionRequest):
    """
    Register a new pending transcription for a recording.

    The recording is switched to status "processing" and an audit entry is
    written. Handing the job to a transcription worker is not implemented
    yet (see the TODO below); only the bookkeeping entry is created.

    Raises:
        HTTPException: 404 if the recording is unknown, 400 if it is marked
            as deleted, 409 if a transcription whose status is not "failed"
            already exists for it.
    """
    recording = _recordings_store.get(recording_id)
    if not recording:
        raise HTTPException(status_code=404, detail="Recording not found")
    if recording["status"] == "deleted":
        raise HTTPException(status_code=400, detail="Cannot transcribe deleted recording")

    # Only failed attempts may be retried; any other existing attempt blocks a new one.
    for candidate in _transcriptions_store.values():
        if candidate["recording_id"] == recording_id and candidate["status"] != "failed":
            raise HTTPException(
                status_code=409,
                detail=f"Transcription already exists with status: {candidate['status']}"
            )

    transcription_id = str(uuid.uuid4())
    created = datetime.utcnow()
    timestamp = created.isoformat()
    language = request.language
    model = request.model

    # Identity and request parameters first, then every result field as None
    # until a worker fills it in, then the bookkeeping timestamps.
    entry = {
        "id": transcription_id,
        "recording_id": recording_id,
        "language": language,
        "model": model,
        "status": "pending",
    }
    entry.update(dict.fromkeys((
        "full_text",
        "word_count",
        "confidence_score",
        "vtt_path",
        "srt_path",
        "json_path",
        "error_message",
        "processing_started_at",
        "processing_completed_at",
        "processing_duration_seconds",
    )))
    entry["created_at"] = timestamp
    entry["updated_at"] = timestamp
    _transcriptions_store[transcription_id] = entry

    # Reflect the running transcription on the recording itself.
    recording["status"] = "processing"
    recording["updated_at"] = timestamp

    log_audit(
        action="transcription_started",
        recording_id=recording_id,
        transcription_id=transcription_id,
        metadata={"language": language, "model": model}
    )

    # TODO: Queue job to Redis/Valkey for transcription worker

    return TranscriptionStatusResponse(
        id=transcription_id,
        recording_id=recording_id,
        status="pending",
        language=language,
        model=model,
        word_count=None,
        confidence_score=None,
        processing_duration_seconds=None,
        error_message=None,
        created_at=created,
        completed_at=None
    )
|
||||
|
||||
|
||||
@router.get("/{recording_id}/transcription", response_model=TranscriptionStatusResponse)
async def get_transcription_status(recording_id: str):
    """
    Get the status of the most recent transcription for a recording.

    A recording can accumulate several transcription entries, because a new
    attempt may be started once earlier ones have failed. The newest entry
    is reported so that a retry is not hidden behind a stale failed attempt.

    Raises:
        HTTPException: 404 if no transcription exists for the recording.
    """
    # The last match is the newest attempt — assumes _transcriptions_store is
    # an insertion-ordered dict (entries are added on creation); confirm in
    # recording_helpers.
    matches = [
        t for t in _transcriptions_store.values()
        if t["recording_id"] == recording_id
    ]
    if not matches:
        raise HTTPException(status_code=404, detail="No transcription found for this recording")
    transcription = matches[-1]

    # Timestamps are stored as ISO strings; convert back for the response model.
    completed_raw = transcription.get("processing_completed_at")
    completed_at = datetime.fromisoformat(completed_raw) if completed_raw else None

    return TranscriptionStatusResponse(
        id=transcription["id"],
        recording_id=transcription["recording_id"],
        status=transcription["status"],
        language=transcription["language"],
        model=transcription["model"],
        word_count=transcription.get("word_count"),
        confidence_score=transcription.get("confidence_score"),
        processing_duration_seconds=transcription.get("processing_duration_seconds"),
        error_message=transcription.get("error_message"),
        created_at=datetime.fromisoformat(transcription["created_at"]),
        completed_at=completed_at
    )
|
||||
|
||||
|
||||
@router.get("/{recording_id}/transcription/text")
async def get_transcription_text(recording_id: str):
    """
    Get the full text of the most recent transcription for a recording.

    Returns:
        A dict with the transcription id, recording id, language, text and
        word count. Text and word count fall back to "" and 0 when the stored
        values are missing or None.

    Raises:
        HTTPException: 404 if no transcription exists for the recording,
            400 if the newest transcription is not in status "completed".
    """
    # Use the newest attempt so a successful retry is not masked by an older
    # failed entry — assumes _transcriptions_store is an insertion-ordered
    # dict; confirm in recording_helpers.
    matches = [
        t for t in _transcriptions_store.values()
        if t["recording_id"] == recording_id
    ]
    if not matches:
        raise HTTPException(status_code=404, detail="No transcription found for this recording")
    transcription = matches[-1]

    if transcription["status"] != "completed":
        raise HTTPException(
            status_code=400,
            detail=f"Transcription not ready. Status: {transcription['status']}"
        )

    # Entries are created with these keys set to None, so a .get() default
    # would never apply; `or` covers both a missing key and a stored None.
    return {
        "transcription_id": transcription["id"],
        "recording_id": recording_id,
        "language": transcription["language"],
        "text": transcription.get("full_text") or "",
        "word_count": transcription.get("word_count") or 0
    }
|
||||
|
||||
|
||||
@router.get("/{recording_id}/transcription/vtt")
async def get_transcription_vtt(recording_id: str):
    """
    Download the most recent transcription of a recording as a WebVTT file.

    Cue timings are synthetic: the text is split after every "." and each
    non-empty piece gets a fixed-length slot, one after another. They are
    not derived from the audio.

    Raises:
        HTTPException: 404 if no transcription exists for the recording,
            400 if the newest transcription is not in status "completed".
    """
    # Use the newest attempt so a successful retry is not masked by an older
    # failed entry — assumes _transcriptions_store is an insertion-ordered
    # dict; confirm in recording_helpers.
    matches = [
        t for t in _transcriptions_store.values()
        if t["recording_id"] == recording_id
    ]
    if not matches:
        raise HTTPException(status_code=404, detail="No transcription found for this recording")
    transcription = matches[-1]

    if transcription["status"] != "completed":
        raise HTTPException(
            status_code=400,
            detail=f"Transcription not ready. Status: {transcription['status']}"
        )

    # Collect the pieces and join once instead of growing a string in the loop.
    parts = ["WEBVTT\n\n"]
    text = transcription.get("full_text") or ""

    time_offset = 0
    for sentence in text.replace(".", ".\n").split("\n"):
        sentence = sentence.strip()
        if not sentence:
            continue
        start = format_vtt_time(time_offset)
        # Fixed slot per cue; presumably milliseconds (3 s) — confirm against format_vtt_time.
        time_offset += 3000
        end = format_vtt_time(time_offset)
        parts.append(f"{start} --> {end}\n{sentence}\n\n")

    return PlainTextResponse(
        content="".join(parts),
        media_type="text/vtt",
        headers={"Content-Disposition": f"attachment; filename=transcript_{recording_id}.vtt"}
    )
|
||||
|
||||
|
||||
@router.get("/{recording_id}/transcription/srt")
async def get_transcription_srt(recording_id: str):
    """
    Download the most recent transcription of a recording as an SRT file.

    Subtitle timings are synthetic: the text is split after every "." and
    each non-empty piece gets a fixed-length slot, numbered from 1. They are
    not derived from the audio.

    Raises:
        HTTPException: 404 if no transcription exists for the recording,
            400 if the newest transcription is not in status "completed".
    """
    # Use the newest attempt so a successful retry is not masked by an older
    # failed entry — assumes _transcriptions_store is an insertion-ordered
    # dict; confirm in recording_helpers.
    matches = [
        t for t in _transcriptions_store.values()
        if t["recording_id"] == recording_id
    ]
    if not matches:
        raise HTTPException(status_code=404, detail="No transcription found for this recording")
    transcription = matches[-1]

    if transcription["status"] != "completed":
        raise HTTPException(
            status_code=400,
            detail=f"Transcription not ready. Status: {transcription['status']}"
        )

    # Collect the entries and join once instead of growing a string in the loop.
    blocks = []
    text = transcription.get("full_text") or ""

    time_offset = 0
    for sentence in text.replace(".", ".\n").split("\n"):
        sentence = sentence.strip()
        if not sentence:
            continue
        start = format_srt_time(time_offset)
        # Fixed slot per entry; presumably milliseconds (3 s) — confirm against format_srt_time.
        time_offset += 3000
        end = format_srt_time(time_offset)
        # Only non-empty sentences are stored, so the next index is len + 1.
        index = len(blocks) + 1
        blocks.append(f"{index}\n{start} --> {end}\n{sentence}\n\n")

    return PlainTextResponse(
        content="".join(blocks),
        media_type="text/plain",
        headers={"Content-Disposition": f"attachment; filename=transcript_{recording_id}.srt"}
    )
|
||||
Reference in New Issue
Block a user