feat: voice-service von lehrer nach core verschoben, Pipeline erweitert (voice, BQAS, embedding, night-scheduler)
This commit is contained in:
325
voice-service/api/streaming.py
Normal file
325
voice-service/api/streaming.py
Normal file
@@ -0,0 +1,325 @@
|
||||
"""
|
||||
WebSocket Streaming API
|
||||
Handles real-time audio streaming for voice interface
|
||||
|
||||
WebSocket Protocol:
|
||||
- Binary frames: Int16 PCM Audio (24kHz, 80ms frames)
|
||||
- JSON frames: {"type": "config|end_turn|interrupt"}
|
||||
|
||||
Server -> Client:
|
||||
- Binary: Audio Response (base64)
|
||||
- JSON: {"type": "transcript|intent|status|error"}
|
||||
"""
|
||||
import structlog
|
||||
import asyncio
|
||||
import json
|
||||
import base64
|
||||
from fastapi import APIRouter, WebSocket, WebSocketDisconnect, Query
|
||||
from typing import Optional
|
||||
from datetime import datetime
|
||||
|
||||
from config import settings
|
||||
from models.session import SessionStatus, TranscriptMessage, AudioChunk
|
||||
from models.task import TaskCreate, TaskType
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
# Active WebSocket connections (transient)
|
||||
active_connections: dict[str, WebSocket] = {}
|
||||
|
||||
|
||||
@router.websocket("/ws/voice")
async def voice_websocket(
    websocket: WebSocket,
    session_id: str = Query(..., description="Session ID from /api/v1/sessions"),
    namespace: Optional[str] = Query(None, description="Namespace ID"),
    key_hash: Optional[str] = Query(None, description="Encryption key hash"),
):
    """
    WebSocket endpoint for voice streaming.

    Protocol:
        1. Client connects with a session_id previously created via the
           sessions API; unknown sessions are rejected with close code 4004.
        2. Client sends binary audio frames (Int16 PCM, 24kHz) and/or JSON
           control messages ({"type": "config|end_turn|interrupt|ping"}).
        3. Server responds with transcript/intent/status JSON frames and,
           when synthesis is available, binary audio.

    Audio Processing:
        - Frames are accumulated in a RAM-only buffer and processed in
          ~500ms batches.
        - No audio is ever persisted.
        - Transcripts are encrypted before any storage.
    """
    # Look up the in-memory session created by the sessions API.
    from api.sessions import _sessions

    session = _sessions.get(session_id)
    if not session:
        # 4004: application-defined "not found" close code.
        await websocket.close(code=4004, reason="Session not found")
        return

    await websocket.accept()

    logger.info(
        "WebSocket connected",
        session_id=session_id[:8],
        namespace_id=session.namespace_id[:8],
    )

    session.status = SessionStatus.CONNECTED
    active_connections[session_id] = websocket

    # Transient audio buffer — accumulated frames are processed in batches
    # and never written to disk.
    audio_buffer = bytearray()

    # Loop-invariant batch size: process once ~500ms of audio is buffered.
    samples_needed = settings.audio_sample_rate // 2  # 500ms of samples
    bytes_needed = samples_needed * 2  # 16-bit PCM = 2 bytes per sample

    try:
        # Tell the client the expected audio format up front.
        await websocket.send_json({
            "type": "status",
            "status": "connected",
            "session_id": session_id,
            "audio_config": {
                "sample_rate": settings.audio_sample_rate,
                "frame_size_ms": settings.audio_frame_size_ms,
                "encoding": "pcm_s16le",
            },
        })

        while True:
            # Receive message (binary or text).
            message = await websocket.receive()

            if "bytes" in message:
                # Binary audio frame: buffer it (RAM only, never persisted).
                audio_data = message["bytes"]
                session.audio_chunks_received += 1
                audio_buffer.extend(audio_data)

                if len(audio_buffer) >= bytes_needed:
                    session.status = SessionStatus.PROCESSING

                    await process_audio_chunk(
                        websocket,
                        session,
                        bytes(audio_buffer[:bytes_needed]),
                    )

                    # Keep any remainder for the next batch.
                    audio_buffer = audio_buffer[bytes_needed:]
                    session.audio_chunks_processed += 1

            elif "text" in message:
                # JSON control message.
                try:
                    data = json.loads(message["text"])
                    msg_type = data.get("type")

                    if msg_type == "config":
                        # Client configuration — currently only logged.
                        logger.debug("Received config", config=data)

                    elif msg_type == "end_turn":
                        # User finished speaking: flush whatever is buffered.
                        session.status = SessionStatus.PROCESSING

                        if audio_buffer:
                            await process_audio_chunk(
                                websocket,
                                session,
                                bytes(audio_buffer),
                            )
                            audio_buffer.clear()

                        # Signal end of user turn.
                        await websocket.send_json({
                            "type": "status",
                            "status": "processing",
                        })

                    elif msg_type == "interrupt":
                        # User interrupted the response; resume listening.
                        session.status = SessionStatus.LISTENING
                        await websocket.send_json({
                            "type": "status",
                            "status": "interrupted",
                        })

                    elif msg_type == "ping":
                        # Keep-alive ping.
                        await websocket.send_json({"type": "pong"})

                except json.JSONDecodeError:
                    logger.warning("Invalid JSON message", message=message["text"][:100])

            # Refresh the session's last-activity timestamp.
            session.update_activity()

    except WebSocketDisconnect:
        logger.info("WebSocket disconnected", session_id=session_id[:8])
    except Exception as e:
        logger.error("WebSocket error", session_id=session_id[:8], error=str(e))
        session.status = SessionStatus.ERROR
    finally:
        # Preserve ERROR status for diagnostics; previously this block
        # unconditionally overwrote it with CLOSED.
        if session.status != SessionStatus.ERROR:
            session.status = SessionStatus.CLOSED
        active_connections.pop(session_id, None)
|
||||
|
||||
|
||||
async def process_audio_chunk(
    websocket: WebSocket,
    session,
    audio_data: bytes,
):
    """
    Process one buffered audio chunk through the voice pipeline.

    Pipeline stages:
        1. Transcription (PersonaPlex, or the fallback LLM client).
        2. Intent detection.
        3. Task creation when the intent is actionable.
        4. Text response generation.
        5. Audio synthesis (PersonaPlex only), streamed as binary frames.

    Args:
        websocket: Open client connection used for all responses.
        session: Mutable session object; its messages and status are
            updated in place.
        audio_data: Raw Int16 PCM bytes (transient, never persisted).
    """
    from services.task_orchestrator import TaskOrchestrator
    from services.intent_router import IntentRouter

    orchestrator = TaskOrchestrator()
    intent_router = IntentRouter()

    try:
        # 1. Transcribe audio.
        if settings.use_personaplex:
            from services.personaplex_client import PersonaPlexClient

            transcript = await PersonaPlexClient().transcribe(audio_data)
        else:
            # Ollama fallback (text-only; requires a separate ASR).
            # MVP placeholder — integrate Whisper or similar in production.
            from services.fallback_llm_client import FallbackLLMClient

            transcript = await FallbackLLMClient().process_audio_description(audio_data)

        # Nothing intelligible in this chunk; wait for more audio.
        if not transcript or not transcript.strip():
            return

        # NOTE(review): this confidence is a hardcoded placeholder, not a
        # real ASR score — replace once the transcriber reports one.
        confidence = 0.95

        # 2. Send transcript to the client and record it in the history.
        await websocket.send_json({
            "type": "transcript",
            "text": transcript,
            "final": True,
            "confidence": confidence,
        })
        session.messages.append(TranscriptMessage(
            role="user",
            content=transcript,
            confidence=confidence,
        ))

        # 3. Detect intent; create a task if it is actionable.
        intent = await intent_router.detect_intent(transcript, session.messages)

        if intent:
            await websocket.send_json({
                "type": "intent",
                "intent": intent.type.value,
                "confidence": intent.confidence,
                "parameters": intent.parameters,
            })

            if intent.is_actionable:
                task = await orchestrator.create_task_from_intent(
                    session_id=session.id,
                    namespace_id=session.namespace_id,
                    intent=intent,
                    transcript=transcript,
                )
                await websocket.send_json({
                    "type": "task_created",
                    "task_id": task.id,
                    "task_type": task.type.value,
                    "state": task.state.value,
                })

        # 4. Generate and send the text response (intent may be None).
        response_text = await orchestrator.generate_response(
            session_messages=session.messages,
            intent=intent,
            namespace_id=session.namespace_id,
        )
        await websocket.send_json({
            "type": "response",
            "text": response_text,
        })
        session.messages.append(TranscriptMessage(
            role="assistant",
            content=response_text,
        ))

        # 5. Synthesize and stream audio if PersonaPlex is available.
        if settings.use_personaplex:
            from services.personaplex_client import PersonaPlexClient

            audio_response = await PersonaPlexClient().synthesize(response_text)
            if audio_response:
                # Stream the audio back in frame-sized binary chunks.
                chunk_size = settings.audio_frame_samples * 2  # 16-bit
                for offset in range(0, len(audio_response), chunk_size):
                    await websocket.send_bytes(audio_response[offset:offset + chunk_size])

        # Done — return to listening.
        session.status = SessionStatus.LISTENING
        await websocket.send_json({
            "type": "status",
            "status": "listening",
        })

    except Exception as e:
        logger.error("Audio processing error", error=str(e))
        # Best-effort error report: the most common failure cause is a
        # closed socket, in which case send_json itself would raise and
        # previously propagated out of this handler.
        try:
            await websocket.send_json({
                "type": "error",
                "message": "Failed to process audio",
                "code": "processing_error",
            })
        except Exception:
            logger.debug("Could not deliver error to client")
|
||||
|
||||
|
||||
@router.get("/ws/stats")
async def get_websocket_stats():
    """Return statistics about the currently active WebSocket connections."""
    # Snapshot the keys so count and id list reflect the same moment.
    session_ids = list(active_connections)
    return {
        "active_connections": len(session_ids),
        "connection_ids": [sid[:8] for sid in session_ids],
    }
|
||||
Reference in New Issue
Block a user