# A previous `git pull --rebase origin main` dropped 177 local commits,
# losing 3400+ files across admin-v2, backend, studio-v2, website,
# klausur-service, and many other services. The partial restore attempt
# (660295e2) only recovered some files.
# This commit restores all missing files from pre-rebase ref 98933f5e
# while preserving post-rebase additions (night-scheduler, night-mode UI,
# NightModeWidget dashboard integration).
# Restored features include:
# - AI Module Sidebar (FAB), OCR Labeling, OCR Compare
# - GPU Dashboard, RAG Pipeline, Magic Help
# - Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
# - Companion, Zeugnisse-Crawler, Screen Flow
# - Full backend, studio-v2, website, klausur-service
# - All compliance SDKs, agent-core, voice-service
# - CI/CD configs, documentation, scripts
# Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
# 326 lines | 10 KiB | Python
"""
|
|
WebSocket Streaming API
|
|
Handles real-time audio streaming for voice interface
|
|
|
|
WebSocket Protocol:
|
|
- Binary frames: Int16 PCM Audio (24kHz, 80ms frames)
|
|
- JSON frames: {"type": "config|end_turn|interrupt"}
|
|
|
|
Server -> Client:
|
|
- Binary: Audio Response (base64)
|
|
- JSON: {"type": "transcript|intent|status|error"}
|
|
"""
|
|
import asyncio
import base64
import json
import time
from datetime import datetime
from typing import Optional

import structlog
from fastapi import APIRouter, WebSocket, WebSocketDisconnect, Query

from config import settings
from models.session import SessionStatus, TranscriptMessage, AudioChunk
from models.task import TaskCreate, TaskType
|
|
|
|
# Module-level logger bound to this module's name.
logger = structlog.get_logger(__name__)

router = APIRouter()

# Active WebSocket connections keyed by session_id (transient: in-memory
# only; entries are added on accept and removed in the handler's cleanup).
active_connections: dict[str, WebSocket] = {}
|
|
|
|
|
|
@router.websocket("/ws/voice")
async def voice_websocket(
    websocket: WebSocket,
    session_id: str = Query(..., description="Session ID from /api/v1/sessions"),
    namespace: Optional[str] = Query(None, description="Namespace ID"),
    key_hash: Optional[str] = Query(None, description="Encryption key hash"),
):
    """
    WebSocket endpoint for voice streaming.

    Protocol:
    1. Client connects with session_id
    2. Client sends binary audio frames (Int16 PCM, 24kHz)
    3. Server responds with transcripts, intents, and audio

    Audio Processing:
    - Chunks are processed in RAM only
    - No audio is ever persisted
    - Transcripts are encrypted before any storage
    """
    # Late import: avoids a circular import with api.sessions at module load.
    from api.sessions import _sessions

    session = _sessions.get(session_id)
    if not session:
        # 4004 is an application-defined close code for "session not found".
        await websocket.close(code=4004, reason="Session not found")
        return

    await websocket.accept()

    logger.info(
        "WebSocket connected",
        session_id=session_id[:8],
        namespace_id=session.namespace_id[:8],
    )

    session.status = SessionStatus.CONNECTED
    active_connections[session_id] = websocket

    # Transient buffer accumulating raw PCM until a processable batch exists.
    audio_buffer = bytearray()
    chunk_sequence = 0

    try:
        # Tell the client which audio format the server expects.
        await websocket.send_json({
            "type": "status",
            "status": "connected",
            "session_id": session_id,
            "audio_config": {
                "sample_rate": settings.audio_sample_rate,
                "frame_size_ms": settings.audio_frame_size_ms,
                "encoding": "pcm_s16le",
            },
        })

        while True:
            # Receive message (binary or text).
            message = await websocket.receive()

            # FIX: raw receive() does not raise on disconnect; it returns a
            # {"type": "websocket.disconnect"} message. Without this check
            # the message falls through both branches and the next
            # receive() raises a RuntimeError instead of the clean
            # WebSocketDisconnect path below.
            if message.get("type") == "websocket.disconnect":
                raise WebSocketDisconnect(code=message.get("code", 1000))

            if message.get("bytes") is not None:
                # Binary audio data.
                audio_data = message["bytes"]
                session.audio_chunks_received += 1

                # Create audio chunk (transient - never persisted).
                # timestamp_ms is milliseconds since UTC midnight.
                # FIX: was datetime.utcnow().timestamp(), which interprets
                # the naive datetime as *local* time and yields an epoch
                # offset by the host's UTC offset; time.time() is already
                # a true epoch timestamp.
                chunk = AudioChunk(
                    sequence=chunk_sequence,
                    timestamp_ms=int((time.time() * 1000) % (24 * 60 * 60 * 1000)),
                    data=audio_data,
                )
                chunk_sequence += 1

                # Accumulate in buffer.
                audio_buffer.extend(audio_data)

                # Process once we hold ~500ms of 16-bit mono samples.
                samples_needed = settings.audio_sample_rate // 2  # 500ms
                bytes_needed = samples_needed * 2  # 16-bit = 2 bytes

                if len(audio_buffer) >= bytes_needed:
                    session.status = SessionStatus.PROCESSING

                    await process_audio_chunk(
                        websocket,
                        session,
                        bytes(audio_buffer[:bytes_needed]),
                    )

                    # Drop the processed prefix, keep any remainder.
                    audio_buffer = audio_buffer[bytes_needed:]
                    session.audio_chunks_processed += 1

            elif message.get("text") is not None:
                # JSON control message.
                try:
                    data = json.loads(message["text"])
                    msg_type = data.get("type")

                    if msg_type == "config":
                        # Client configuration (currently only logged).
                        logger.debug("Received config", config=data)

                    elif msg_type == "end_turn":
                        # User finished speaking: flush remaining audio.
                        session.status = SessionStatus.PROCESSING

                        if audio_buffer:
                            await process_audio_chunk(
                                websocket,
                                session,
                                bytes(audio_buffer),
                            )
                            audio_buffer.clear()

                        await websocket.send_json({
                            "type": "status",
                            "status": "processing",
                        })

                    elif msg_type == "interrupt":
                        # User interrupted the response mid-playback.
                        session.status = SessionStatus.LISTENING
                        await websocket.send_json({
                            "type": "status",
                            "status": "interrupted",
                        })

                    elif msg_type == "ping":
                        # Keep-alive ping.
                        await websocket.send_json({"type": "pong"})

                except json.JSONDecodeError:
                    logger.warning("Invalid JSON message", message=message["text"][:100])

            # Refresh session activity timestamp on every inbound frame.
            session.update_activity()

    except WebSocketDisconnect:
        logger.info("WebSocket disconnected", session_id=session_id[:8])
    except Exception as e:
        logger.error("WebSocket error", session_id=session_id[:8], error=str(e))
        session.status = SessionStatus.ERROR
    finally:
        # Cleanup regardless of how the loop exited.
        session.status = SessionStatus.CLOSED
        active_connections.pop(session_id, None)
|
|
|
|
|
|
async def process_audio_chunk(
    websocket: WebSocket,
    session,
    audio_data: bytes,
):
    """
    Process an audio chunk through the voice pipeline.

    1. PersonaPlex/Ollama for transcription + understanding
    2. Intent detection
    3. Task creation if needed
    4. Response generation
    5. Audio synthesis (if PersonaPlex)

    Args:
        websocket: Open client connection; transcripts/intents/responses
            are pushed over it as JSON, synthesized audio as binary frames.
        session: Voice session object; messages are appended and status
            updated in place.
        audio_data: Raw Int16 PCM bytes for this chunk (RAM only, never
            persisted).
    """
    # Late imports: avoid circular dependencies at module load time.
    from services.task_orchestrator import TaskOrchestrator
    from services.intent_router import IntentRouter

    orchestrator = TaskOrchestrator()
    intent_router = IntentRouter()

    try:
        # 1. Transcribe audio.
        if settings.use_personaplex:
            from services.personaplex_client import PersonaPlexClient
            client = PersonaPlexClient()
            transcript = await client.transcribe(audio_data)
        else:
            # Ollama fallback (text-only, requires separate ASR).
            # For MVP this is a placeholder; in production integrate
            # with Whisper or similar.
            from services.fallback_llm_client import FallbackLLMClient
            llm_client = FallbackLLMClient()
            transcript = await llm_client.process_audio_description(audio_data)

        if not transcript or not transcript.strip():
            return  # nothing recognizable in this chunk

        # Send transcript to client.
        # NOTE(review): confidence is a hard-coded placeholder, not a real
        # ASR score — surface the model's value once available.
        await websocket.send_json({
            "type": "transcript",
            "text": transcript,
            "final": True,
            "confidence": 0.95,
        })

        # Record the user turn in the session history.
        user_message = TranscriptMessage(
            role="user",
            content=transcript,
            confidence=0.95,
        )
        session.messages.append(user_message)

        # 2. Detect intent from the transcript plus conversation history.
        intent = await intent_router.detect_intent(transcript, session.messages)

        if intent:
            await websocket.send_json({
                "type": "intent",
                "intent": intent.type.value,
                "confidence": intent.confidence,
                "parameters": intent.parameters,
            })

            # 3. Create a task if the intent is actionable.
            if intent.is_actionable:
                task = await orchestrator.create_task_from_intent(
                    session_id=session.id,
                    namespace_id=session.namespace_id,
                    intent=intent,
                    transcript=transcript,
                )

                await websocket.send_json({
                    "type": "task_created",
                    "task_id": task.id,
                    "task_type": task.type.value,
                    "state": task.state.value,
                })

        # 4. Generate the assistant's text response.
        response_text = await orchestrator.generate_response(
            session_messages=session.messages,
            intent=intent,
            namespace_id=session.namespace_id,
        )

        await websocket.send_json({
            "type": "response",
            "text": response_text,
        })

        # Record the assistant turn in the session history.
        assistant_message = TranscriptMessage(
            role="assistant",
            content=response_text,
        )
        session.messages.append(assistant_message)

        # 5. Synthesize an audio response if PersonaPlex is available.
        if settings.use_personaplex:
            from services.personaplex_client import PersonaPlexClient
            client = PersonaPlexClient()
            audio_response = await client.synthesize(response_text)

            if audio_response:
                # Stream the audio back in frame-sized binary chunks.
                chunk_size = settings.audio_frame_samples * 2  # 16-bit
                for i in range(0, len(audio_response), chunk_size):
                    chunk = audio_response[i:i + chunk_size]
                    await websocket.send_bytes(chunk)

        # Back to listening for the next user turn.
        session.status = SessionStatus.LISTENING

        await websocket.send_json({
            "type": "status",
            "status": "listening",
        })

    except Exception as e:
        logger.error("Audio processing error", error=str(e))
        # FIX: the failed operation may be the socket itself; if this
        # error report also raises (e.g. connection already closed) it
        # would escape the handler and mask the original failure.
        try:
            await websocket.send_json({
                "type": "error",
                "message": "Failed to process audio",
                "code": "processing_error",
            })
        except Exception:
            logger.warning("Could not deliver error frame to client")
|
|
|
|
|
|
@router.get("/ws/stats")
async def get_websocket_stats():
    """Report how many voice WebSocket connections are currently open.

    Session IDs are truncated to their first 8 characters so full IDs
    never leave the service.
    """
    truncated_ids = [conn_id[:8] for conn_id in active_connections]
    return {
        "active_connections": len(active_connections),
        "connection_ids": truncated_ids,
    }
|