fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.
This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).
Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
36
agent-core/orchestrator/__init__.py
Normal file
36
agent-core/orchestrator/__init__.py
Normal file
@@ -0,0 +1,36 @@
|
||||
"""
|
||||
Orchestration Layer for Breakpilot Agents
|
||||
|
||||
Provides:
|
||||
- MessageBus: Inter-agent communication via Redis Pub/Sub
|
||||
- AgentSupervisor: Agent lifecycle and health management
|
||||
- TaskRouter: Intent-based task routing
|
||||
"""
|
||||
|
||||
from agent_core.orchestrator.message_bus import (
|
||||
MessageBus,
|
||||
AgentMessage,
|
||||
MessagePriority,
|
||||
)
|
||||
from agent_core.orchestrator.supervisor import (
|
||||
AgentSupervisor,
|
||||
AgentInfo,
|
||||
AgentStatus,
|
||||
)
|
||||
from agent_core.orchestrator.task_router import (
|
||||
TaskRouter,
|
||||
RoutingResult,
|
||||
RoutingStrategy,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"MessageBus",
|
||||
"AgentMessage",
|
||||
"MessagePriority",
|
||||
"AgentSupervisor",
|
||||
"AgentInfo",
|
||||
"AgentStatus",
|
||||
"TaskRouter",
|
||||
"RoutingResult",
|
||||
"RoutingStrategy",
|
||||
]
|
||||
479
agent-core/orchestrator/message_bus.py
Normal file
479
agent-core/orchestrator/message_bus.py
Normal file
@@ -0,0 +1,479 @@
|
||||
"""
|
||||
Message Bus for Inter-Agent Communication
|
||||
|
||||
Provides:
|
||||
- Pub/Sub messaging via Redis/Valkey
|
||||
- Request-Response pattern with timeouts
|
||||
- Priority-based message handling
|
||||
- Message persistence for audit
|
||||
"""
|
||||
|
||||
from typing import Callable, Dict, Any, List, Optional, Awaitable
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timezone
|
||||
from enum import Enum
|
||||
import asyncio
|
||||
import uuid
|
||||
import json
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MessagePriority(Enum):
|
||||
"""Message priority levels"""
|
||||
LOW = 0
|
||||
NORMAL = 1
|
||||
HIGH = 2
|
||||
CRITICAL = 3
|
||||
|
||||
|
||||
class MessageType(Enum):
|
||||
"""Standard message types"""
|
||||
REQUEST = "request"
|
||||
RESPONSE = "response"
|
||||
EVENT = "event"
|
||||
BROADCAST = "broadcast"
|
||||
HEARTBEAT = "heartbeat"
|
||||
|
||||
|
||||
@dataclass
|
||||
class AgentMessage:
|
||||
"""Represents a message between agents"""
|
||||
sender: str
|
||||
receiver: str
|
||||
message_type: str
|
||||
payload: Dict[str, Any]
|
||||
priority: MessagePriority = MessagePriority.NORMAL
|
||||
correlation_id: str = field(default_factory=lambda: str(uuid.uuid4()))
|
||||
timestamp: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
|
||||
reply_to: Optional[str] = None
|
||||
expires_at: Optional[datetime] = None
|
||||
metadata: Dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
def to_dict(self) -> Dict[str, Any]:
|
||||
return {
|
||||
"sender": self.sender,
|
||||
"receiver": self.receiver,
|
||||
"message_type": self.message_type,
|
||||
"payload": self.payload,
|
||||
"priority": self.priority.value,
|
||||
"correlation_id": self.correlation_id,
|
||||
"timestamp": self.timestamp.isoformat(),
|
||||
"reply_to": self.reply_to,
|
||||
"expires_at": self.expires_at.isoformat() if self.expires_at else None,
|
||||
"metadata": self.metadata
|
||||
}
|
||||
|
||||
@classmethod
|
||||
def from_dict(cls, data: Dict[str, Any]) -> "AgentMessage":
|
||||
return cls(
|
||||
sender=data["sender"],
|
||||
receiver=data["receiver"],
|
||||
message_type=data["message_type"],
|
||||
payload=data["payload"],
|
||||
priority=MessagePriority(data.get("priority", 1)),
|
||||
correlation_id=data.get("correlation_id", str(uuid.uuid4())),
|
||||
timestamp=datetime.fromisoformat(data["timestamp"]) if "timestamp" in data else datetime.now(timezone.utc),
|
||||
reply_to=data.get("reply_to"),
|
||||
expires_at=datetime.fromisoformat(data["expires_at"]) if data.get("expires_at") else None,
|
||||
metadata=data.get("metadata", {})
|
||||
)
|
||||
|
||||
|
||||
MessageHandler = Callable[[AgentMessage], Awaitable[Optional[Dict[str, Any]]]]
|
||||
|
||||
|
||||
class MessageBus:
|
||||
"""
|
||||
Inter-agent communication via Redis Pub/Sub.
|
||||
|
||||
Features:
|
||||
- Publish/Subscribe pattern
|
||||
- Request/Response with timeout
|
||||
- Priority queues
|
||||
- Message persistence for audit
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
redis_client=None,
|
||||
db_pool=None,
|
||||
namespace: str = "breakpilot",
|
||||
persist_messages: bool = True
|
||||
):
|
||||
"""
|
||||
Initialize the message bus.
|
||||
|
||||
Args:
|
||||
redis_client: Async Redis/Valkey client
|
||||
db_pool: Async PostgreSQL pool for persistence
|
||||
namespace: Channel namespace
|
||||
persist_messages: Whether to persist messages for audit
|
||||
"""
|
||||
self.redis = redis_client
|
||||
self.db_pool = db_pool
|
||||
self.namespace = namespace
|
||||
self.persist_messages = persist_messages
|
||||
|
||||
self._handlers: Dict[str, List[MessageHandler]] = {}
|
||||
self._pending_responses: Dict[str, asyncio.Future] = {}
|
||||
self._subscriptions: Dict[str, asyncio.Task] = {}
|
||||
self._running = False
|
||||
|
||||
def _channel(self, agent_id: str) -> str:
|
||||
"""Generate channel name for agent"""
|
||||
return f"{self.namespace}:agent:{agent_id}"
|
||||
|
||||
def _broadcast_channel(self) -> str:
|
||||
"""Generate broadcast channel name"""
|
||||
return f"{self.namespace}:broadcast"
|
||||
|
||||
async def start(self) -> None:
|
||||
"""Starts the message bus"""
|
||||
self._running = True
|
||||
logger.info("Message bus started")
|
||||
|
||||
async def stop(self) -> None:
|
||||
"""Stops the message bus and all subscriptions"""
|
||||
self._running = False
|
||||
|
||||
# Cancel all subscription tasks
|
||||
for task in self._subscriptions.values():
|
||||
task.cancel()
|
||||
|
||||
# Wait for cancellation
|
||||
if self._subscriptions:
|
||||
await asyncio.gather(
|
||||
*self._subscriptions.values(),
|
||||
return_exceptions=True
|
||||
)
|
||||
|
||||
self._subscriptions.clear()
|
||||
logger.info("Message bus stopped")
|
||||
|
||||
async def subscribe(
|
||||
self,
|
||||
agent_id: str,
|
||||
handler: MessageHandler
|
||||
) -> None:
|
||||
"""
|
||||
Subscribe an agent to receive messages.
|
||||
|
||||
Args:
|
||||
agent_id: The agent ID to subscribe
|
||||
handler: Async function to handle incoming messages
|
||||
"""
|
||||
if agent_id in self._subscriptions:
|
||||
logger.warning(f"Agent {agent_id} already subscribed")
|
||||
return
|
||||
|
||||
if agent_id not in self._handlers:
|
||||
self._handlers[agent_id] = []
|
||||
self._handlers[agent_id].append(handler)
|
||||
|
||||
if self.redis:
|
||||
# Start Redis subscription
|
||||
task = asyncio.create_task(
|
||||
self._subscription_loop(agent_id)
|
||||
)
|
||||
self._subscriptions[agent_id] = task
|
||||
|
||||
logger.info(f"Agent {agent_id} subscribed to message bus")
|
||||
|
||||
async def unsubscribe(self, agent_id: str) -> None:
|
||||
"""
|
||||
Unsubscribe an agent from messages.
|
||||
|
||||
Args:
|
||||
agent_id: The agent ID to unsubscribe
|
||||
"""
|
||||
if agent_id in self._subscriptions:
|
||||
self._subscriptions[agent_id].cancel()
|
||||
try:
|
||||
await self._subscriptions[agent_id]
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
del self._subscriptions[agent_id]
|
||||
|
||||
self._handlers.pop(agent_id, None)
|
||||
logger.info(f"Agent {agent_id} unsubscribed from message bus")
|
||||
|
||||
async def _subscription_loop(self, agent_id: str) -> None:
|
||||
"""Main subscription loop for an agent"""
|
||||
if not self.redis:
|
||||
return
|
||||
|
||||
channel = self._channel(agent_id)
|
||||
broadcast = self._broadcast_channel()
|
||||
|
||||
pubsub = self.redis.pubsub()
|
||||
await pubsub.subscribe(channel, broadcast)
|
||||
|
||||
try:
|
||||
while self._running:
|
||||
message = await pubsub.get_message(
|
||||
ignore_subscribe_messages=True,
|
||||
timeout=1.0
|
||||
)
|
||||
|
||||
if message and message["type"] == "message":
|
||||
await self._handle_incoming_message(
|
||||
agent_id,
|
||||
message["data"]
|
||||
)
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
finally:
|
||||
await pubsub.unsubscribe(channel, broadcast)
|
||||
await pubsub.close()
|
||||
|
||||
async def _handle_incoming_message(
|
||||
self,
|
||||
agent_id: str,
|
||||
raw_data: bytes
|
||||
) -> None:
|
||||
"""Process an incoming message"""
|
||||
try:
|
||||
data = json.loads(raw_data)
|
||||
message = AgentMessage.from_dict(data)
|
||||
|
||||
# Check if this is a response to a pending request
|
||||
if message.correlation_id in self._pending_responses:
|
||||
future = self._pending_responses[message.correlation_id]
|
||||
if not future.done():
|
||||
future.set_result(message.payload)
|
||||
return
|
||||
|
||||
# Call handlers
|
||||
handlers = self._handlers.get(agent_id, [])
|
||||
for handler in handlers:
|
||||
try:
|
||||
response = await handler(message)
|
||||
|
||||
# If handler returns data and there's a reply_to, send response
|
||||
if response and message.reply_to:
|
||||
await self.publish(AgentMessage(
|
||||
sender=agent_id,
|
||||
receiver=message.sender,
|
||||
message_type="response",
|
||||
payload=response,
|
||||
correlation_id=message.correlation_id
|
||||
))
|
||||
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"Error in message handler for {agent_id}: {e}"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing message: {e}")
|
||||
|
||||
async def publish(self, message: AgentMessage) -> None:
|
||||
"""
|
||||
Publishes a message to an agent.
|
||||
|
||||
Args:
|
||||
message: The message to publish
|
||||
"""
|
||||
# Persist message if enabled
|
||||
if self.persist_messages:
|
||||
await self._persist_message(message)
|
||||
|
||||
if self.redis:
|
||||
channel = self._channel(message.receiver)
|
||||
await self.redis.publish(
|
||||
channel,
|
||||
json.dumps(message.to_dict())
|
||||
)
|
||||
else:
|
||||
# Local delivery for testing
|
||||
await self._local_deliver(message)
|
||||
|
||||
logger.debug(
|
||||
f"Published message from {message.sender} to {message.receiver}: "
|
||||
f"{message.message_type}"
|
||||
)
|
||||
|
||||
async def broadcast(self, message: AgentMessage) -> None:
|
||||
"""
|
||||
Broadcasts a message to all agents.
|
||||
|
||||
Args:
|
||||
message: The message to broadcast
|
||||
"""
|
||||
message.receiver = "*" # Indicate broadcast
|
||||
|
||||
if self.persist_messages:
|
||||
await self._persist_message(message)
|
||||
|
||||
if self.redis:
|
||||
await self.redis.publish(
|
||||
self._broadcast_channel(),
|
||||
json.dumps(message.to_dict())
|
||||
)
|
||||
else:
|
||||
# Local broadcast
|
||||
for agent_id in self._handlers:
|
||||
await self._local_deliver(message, agent_id)
|
||||
|
||||
logger.debug(f"Broadcast message from {message.sender}: {message.message_type}")
|
||||
|
||||
async def request(
|
||||
self,
|
||||
message: AgentMessage,
|
||||
timeout: float = 30.0
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Sends a request and waits for a response.
|
||||
|
||||
Args:
|
||||
message: The request message
|
||||
timeout: Timeout in seconds
|
||||
|
||||
Returns:
|
||||
Response payload
|
||||
|
||||
Raises:
|
||||
TimeoutError: If no response within timeout
|
||||
"""
|
||||
# Mark this as a request that needs a response
|
||||
message.reply_to = message.sender
|
||||
|
||||
# Create future for response
|
||||
future: asyncio.Future = asyncio.Future()
|
||||
self._pending_responses[message.correlation_id] = future
|
||||
|
||||
try:
|
||||
# Publish the request
|
||||
await self.publish(message)
|
||||
|
||||
# Wait for response
|
||||
return await asyncio.wait_for(future, timeout)
|
||||
|
||||
except asyncio.TimeoutError:
|
||||
logger.warning(
|
||||
f"Request timeout: {message.sender} -> {message.receiver} "
|
||||
f"({message.message_type})"
|
||||
)
|
||||
raise
|
||||
|
||||
finally:
|
||||
# Clean up
|
||||
self._pending_responses.pop(message.correlation_id, None)
|
||||
|
||||
async def _local_deliver(
|
||||
self,
|
||||
message: AgentMessage,
|
||||
target_agent: Optional[str] = None
|
||||
) -> None:
|
||||
"""Local message delivery for testing without Redis"""
|
||||
agent_id = target_agent or message.receiver
|
||||
handlers = self._handlers.get(agent_id, [])
|
||||
|
||||
for handler in handlers:
|
||||
try:
|
||||
response = await handler(message)
|
||||
if response and message.reply_to:
|
||||
if message.correlation_id in self._pending_responses:
|
||||
future = self._pending_responses[message.correlation_id]
|
||||
if not future.done():
|
||||
future.set_result(response)
|
||||
except Exception as e:
|
||||
logger.error(f"Error in local handler: {e}")
|
||||
|
||||
async def _persist_message(self, message: AgentMessage) -> None:
|
||||
"""Persist message to PostgreSQL for audit"""
|
||||
if not self.db_pool:
|
||||
return
|
||||
|
||||
try:
|
||||
async with self.db_pool.acquire() as conn:
|
||||
await conn.execute(
|
||||
"""
|
||||
INSERT INTO agent_messages
|
||||
(id, sender, receiver, message_type, payload,
|
||||
priority, correlation_id, created_at)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
|
||||
""",
|
||||
str(uuid.uuid4()),
|
||||
message.sender,
|
||||
message.receiver,
|
||||
message.message_type,
|
||||
json.dumps(message.payload),
|
||||
message.priority.value,
|
||||
message.correlation_id,
|
||||
message.timestamp
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to persist message: {e}")
|
||||
|
||||
def on_message(self, message_type: str):
|
||||
"""
|
||||
Decorator for message handlers.
|
||||
|
||||
Usage:
|
||||
@bus.on_message("grade_request")
|
||||
async def handle_grade(message):
|
||||
return {"score": 85}
|
||||
"""
|
||||
def decorator(func: MessageHandler):
|
||||
async def wrapper(message: AgentMessage):
|
||||
if message.message_type == message_type:
|
||||
return await func(message)
|
||||
return None
|
||||
return wrapper
|
||||
return decorator
|
||||
|
||||
async def get_message_history(
|
||||
self,
|
||||
agent_id: Optional[str] = None,
|
||||
message_type: Optional[str] = None,
|
||||
limit: int = 100
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Gets message history from persistence.
|
||||
|
||||
Args:
|
||||
agent_id: Filter by sender or receiver
|
||||
message_type: Filter by message type
|
||||
limit: Maximum results
|
||||
|
||||
Returns:
|
||||
List of message dicts
|
||||
"""
|
||||
if not self.db_pool:
|
||||
return []
|
||||
|
||||
query = """
|
||||
SELECT sender, receiver, message_type, payload, priority,
|
||||
correlation_id, created_at
|
||||
FROM agent_messages
|
||||
WHERE 1=1
|
||||
"""
|
||||
params = []
|
||||
|
||||
if agent_id:
|
||||
query += " AND (sender = $1 OR receiver = $1)"
|
||||
params.append(agent_id)
|
||||
|
||||
if message_type:
|
||||
param_num = len(params) + 1
|
||||
query += f" AND message_type = ${param_num}"
|
||||
params.append(message_type)
|
||||
|
||||
query += f" ORDER BY created_at DESC LIMIT {limit}"
|
||||
|
||||
async with self.db_pool.acquire() as conn:
|
||||
rows = await conn.fetch(query, *params)
|
||||
return [dict(row) for row in rows]
|
||||
|
||||
@property
|
||||
def connected(self) -> bool:
|
||||
"""Returns whether the bus is connected to Redis"""
|
||||
return self.redis is not None and self._running
|
||||
|
||||
@property
|
||||
def subscriber_count(self) -> int:
|
||||
"""Returns number of subscribed agents"""
|
||||
return len(self._subscriptions)
|
||||
553
agent-core/orchestrator/supervisor.py
Normal file
553
agent-core/orchestrator/supervisor.py
Normal file
@@ -0,0 +1,553 @@
|
||||
"""
|
||||
Agent Supervisor for Breakpilot
|
||||
|
||||
Provides:
|
||||
- Agent lifecycle management
|
||||
- Health monitoring
|
||||
- Restart policies
|
||||
- Load balancing
|
||||
"""
|
||||
|
||||
from typing import Dict, Optional, Callable, Awaitable, List, Any
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timezone, timedelta
|
||||
from enum import Enum
|
||||
import asyncio
|
||||
import logging
|
||||
|
||||
from agent_core.sessions.heartbeat import HeartbeatMonitor
|
||||
from agent_core.orchestrator.message_bus import (
|
||||
MessageBus,
|
||||
AgentMessage,
|
||||
MessagePriority,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class AgentStatus(Enum):
|
||||
"""Agent lifecycle states"""
|
||||
INITIALIZING = "initializing"
|
||||
STARTING = "starting"
|
||||
RUNNING = "running"
|
||||
PAUSED = "paused"
|
||||
STOPPING = "stopping"
|
||||
STOPPED = "stopped"
|
||||
ERROR = "error"
|
||||
RESTARTING = "restarting"
|
||||
|
||||
|
||||
class RestartPolicy(Enum):
|
||||
"""Agent restart policies"""
|
||||
NEVER = "never"
|
||||
ON_FAILURE = "on_failure"
|
||||
ALWAYS = "always"
|
||||
|
||||
|
||||
@dataclass
|
||||
class AgentInfo:
|
||||
"""Information about a registered agent"""
|
||||
agent_id: str
|
||||
agent_type: str
|
||||
status: AgentStatus = AgentStatus.INITIALIZING
|
||||
current_task: Optional[str] = None
|
||||
started_at: Optional[datetime] = None
|
||||
last_activity: Optional[datetime] = None
|
||||
error_count: int = 0
|
||||
restart_count: int = 0
|
||||
max_restarts: int = 3
|
||||
restart_policy: RestartPolicy = RestartPolicy.ON_FAILURE
|
||||
metadata: Dict[str, Any] = field(default_factory=dict)
|
||||
capacity: int = 10 # Max concurrent tasks
|
||||
current_load: int = 0
|
||||
|
||||
def is_healthy(self) -> bool:
|
||||
"""Check if agent is healthy"""
|
||||
return self.status == AgentStatus.RUNNING and self.error_count < 3
|
||||
|
||||
def is_available(self) -> bool:
|
||||
"""Check if agent can accept new tasks"""
|
||||
return (
|
||||
self.status == AgentStatus.RUNNING and
|
||||
self.current_load < self.capacity
|
||||
)
|
||||
|
||||
def utilization(self) -> float:
|
||||
"""Returns agent utilization (0-1)"""
|
||||
return self.current_load / max(self.capacity, 1)
|
||||
|
||||
|
||||
AgentFactory = Callable[[str], Awaitable[Any]]
|
||||
|
||||
|
||||
class AgentSupervisor:
|
||||
"""
|
||||
Supervises and coordinates all agents.
|
||||
|
||||
Responsibilities:
|
||||
- Agent registration and lifecycle
|
||||
- Health monitoring via heartbeat
|
||||
- Restart policies
|
||||
- Load balancing
|
||||
- Alert escalation
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
message_bus: MessageBus,
|
||||
heartbeat_monitor: Optional[HeartbeatMonitor] = None,
|
||||
check_interval_seconds: int = 10
|
||||
):
|
||||
"""
|
||||
Initialize the supervisor.
|
||||
|
||||
Args:
|
||||
message_bus: Message bus for inter-agent communication
|
||||
heartbeat_monitor: Heartbeat monitor for liveness checks
|
||||
check_interval_seconds: How often to run health checks
|
||||
"""
|
||||
self.bus = message_bus
|
||||
self.heartbeat = heartbeat_monitor or HeartbeatMonitor()
|
||||
self.check_interval = check_interval_seconds
|
||||
|
||||
self.agents: Dict[str, AgentInfo] = {}
|
||||
self._factories: Dict[str, AgentFactory] = {}
|
||||
self._running = False
|
||||
self._health_task: Optional[asyncio.Task] = None
|
||||
|
||||
# Set up heartbeat timeout handler
|
||||
self.heartbeat.on_timeout = self._handle_agent_timeout
|
||||
|
||||
async def start(self) -> None:
|
||||
"""Starts the supervisor"""
|
||||
self._running = True
|
||||
await self.heartbeat.start_monitoring()
|
||||
|
||||
# Start health check loop
|
||||
self._health_task = asyncio.create_task(self._health_check_loop())
|
||||
|
||||
logger.info("Agent supervisor started")
|
||||
|
||||
async def stop(self) -> None:
|
||||
"""Stops the supervisor and all agents"""
|
||||
self._running = False
|
||||
|
||||
# Stop health check
|
||||
if self._health_task:
|
||||
self._health_task.cancel()
|
||||
try:
|
||||
await self._health_task
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
|
||||
# Stop heartbeat monitor
|
||||
await self.heartbeat.stop_monitoring()
|
||||
|
||||
# Stop all agents
|
||||
for agent_id in list(self.agents.keys()):
|
||||
await self.stop_agent(agent_id)
|
||||
|
||||
logger.info("Agent supervisor stopped")
|
||||
|
||||
def register_factory(
|
||||
self,
|
||||
agent_type: str,
|
||||
factory: AgentFactory
|
||||
) -> None:
|
||||
"""
|
||||
Registers a factory function for creating agents.
|
||||
|
||||
Args:
|
||||
agent_type: Type of agent this factory creates
|
||||
factory: Async function that creates agent instances
|
||||
"""
|
||||
self._factories[agent_type] = factory
|
||||
logger.debug(f"Registered factory for agent type: {agent_type}")
|
||||
|
||||
async def register_agent(
|
||||
self,
|
||||
agent_id: str,
|
||||
agent_type: str,
|
||||
restart_policy: RestartPolicy = RestartPolicy.ON_FAILURE,
|
||||
max_restarts: int = 3,
|
||||
capacity: int = 10,
|
||||
metadata: Optional[Dict[str, Any]] = None
|
||||
) -> AgentInfo:
|
||||
"""
|
||||
Registers a new agent with the supervisor.
|
||||
|
||||
Args:
|
||||
agent_id: Unique agent identifier
|
||||
agent_type: Type of agent
|
||||
restart_policy: When to restart the agent
|
||||
max_restarts: Maximum restart attempts
|
||||
capacity: Max concurrent tasks
|
||||
metadata: Additional agent metadata
|
||||
|
||||
Returns:
|
||||
AgentInfo for the registered agent
|
||||
"""
|
||||
if agent_id in self.agents:
|
||||
logger.warning(f"Agent {agent_id} already registered")
|
||||
return self.agents[agent_id]
|
||||
|
||||
info = AgentInfo(
|
||||
agent_id=agent_id,
|
||||
agent_type=agent_type,
|
||||
restart_policy=restart_policy,
|
||||
max_restarts=max_restarts,
|
||||
capacity=capacity,
|
||||
metadata=metadata or {}
|
||||
)
|
||||
|
||||
self.agents[agent_id] = info
|
||||
self.heartbeat.register(agent_id, agent_type)
|
||||
|
||||
logger.info(f"Registered agent: {agent_id} ({agent_type})")
|
||||
|
||||
return info
|
||||
|
||||
async def start_agent(self, agent_id: str) -> bool:
|
||||
"""
|
||||
Starts a registered agent.
|
||||
|
||||
Args:
|
||||
agent_id: The agent to start
|
||||
|
||||
Returns:
|
||||
True if started successfully
|
||||
"""
|
||||
if agent_id not in self.agents:
|
||||
logger.error(f"Cannot start unregistered agent: {agent_id}")
|
||||
return False
|
||||
|
||||
info = self.agents[agent_id]
|
||||
|
||||
if info.status == AgentStatus.RUNNING:
|
||||
logger.warning(f"Agent {agent_id} is already running")
|
||||
return True
|
||||
|
||||
info.status = AgentStatus.STARTING
|
||||
|
||||
try:
|
||||
# If we have a factory, create the agent
|
||||
if info.agent_type in self._factories:
|
||||
factory = self._factories[info.agent_type]
|
||||
await factory(agent_id)
|
||||
|
||||
info.status = AgentStatus.RUNNING
|
||||
info.started_at = datetime.now(timezone.utc)
|
||||
info.last_activity = info.started_at
|
||||
|
||||
# Subscribe to message bus
|
||||
await self.bus.subscribe(
|
||||
agent_id,
|
||||
self._create_message_handler(agent_id)
|
||||
)
|
||||
|
||||
logger.info(f"Started agent: {agent_id}")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
info.status = AgentStatus.ERROR
|
||||
info.error_count += 1
|
||||
logger.error(f"Failed to start agent {agent_id}: {e}")
|
||||
return False
|
||||
|
||||
async def stop_agent(self, agent_id: str) -> bool:
|
||||
"""
|
||||
Stops a running agent.
|
||||
|
||||
Args:
|
||||
agent_id: The agent to stop
|
||||
|
||||
Returns:
|
||||
True if stopped successfully
|
||||
"""
|
||||
if agent_id not in self.agents:
|
||||
return False
|
||||
|
||||
info = self.agents[agent_id]
|
||||
info.status = AgentStatus.STOPPING
|
||||
|
||||
try:
|
||||
# Unsubscribe from message bus
|
||||
await self.bus.unsubscribe(agent_id)
|
||||
|
||||
# Unregister from heartbeat
|
||||
self.heartbeat.unregister(agent_id)
|
||||
|
||||
info.status = AgentStatus.STOPPED
|
||||
logger.info(f"Stopped agent: {agent_id}")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
info.status = AgentStatus.ERROR
|
||||
logger.error(f"Error stopping agent {agent_id}: {e}")
|
||||
return False
|
||||
|
||||
async def restart_agent(self, agent_id: str) -> bool:
|
||||
"""
|
||||
Restarts an agent.
|
||||
|
||||
Args:
|
||||
agent_id: The agent to restart
|
||||
|
||||
Returns:
|
||||
True if restarted successfully
|
||||
"""
|
||||
if agent_id not in self.agents:
|
||||
return False
|
||||
|
||||
info = self.agents[agent_id]
|
||||
|
||||
# Check restart count
|
||||
if info.restart_count >= info.max_restarts:
|
||||
logger.error(
|
||||
f"Agent {agent_id} exceeded max restarts "
|
||||
f"({info.restart_count}/{info.max_restarts})"
|
||||
)
|
||||
await self._escalate_agent_failure(agent_id)
|
||||
return False
|
||||
|
||||
info.status = AgentStatus.RESTARTING
|
||||
info.restart_count += 1
|
||||
|
||||
logger.info(
|
||||
f"Restarting agent {agent_id} "
|
||||
f"(attempt {info.restart_count}/{info.max_restarts})"
|
||||
)
|
||||
|
||||
# Stop and start
|
||||
await self.stop_agent(agent_id)
|
||||
await asyncio.sleep(1) # Brief pause
|
||||
return await self.start_agent(agent_id)
|
||||
|
||||
async def _handle_agent_timeout(
|
||||
self,
|
||||
session_id: str,
|
||||
agent_type: str
|
||||
) -> None:
|
||||
"""Handles agent heartbeat timeout"""
|
||||
# Find the agent by session/ID
|
||||
agent_id = session_id # In our case, session_id == agent_id
|
||||
|
||||
if agent_id not in self.agents:
|
||||
return
|
||||
|
||||
info = self.agents[agent_id]
|
||||
info.status = AgentStatus.ERROR
|
||||
info.error_count += 1
|
||||
|
||||
logger.warning(f"Agent {agent_id} timed out (heartbeat)")
|
||||
|
||||
# Apply restart policy
|
||||
if info.restart_policy == RestartPolicy.NEVER:
|
||||
await self._escalate_agent_failure(agent_id)
|
||||
elif info.restart_policy == RestartPolicy.ON_FAILURE:
|
||||
if info.restart_count < info.max_restarts:
|
||||
await self.restart_agent(agent_id)
|
||||
else:
|
||||
await self._escalate_agent_failure(agent_id)
|
||||
elif info.restart_policy == RestartPolicy.ALWAYS:
|
||||
await self.restart_agent(agent_id)
|
||||
|
||||
async def _escalate_agent_failure(self, agent_id: str) -> None:
|
||||
"""Escalates an agent failure to the alert system"""
|
||||
info = self.agents.get(agent_id)
|
||||
if not info:
|
||||
return
|
||||
|
||||
await self.bus.publish(AgentMessage(
|
||||
sender="supervisor",
|
||||
receiver="alert-agent",
|
||||
message_type="agent_failure",
|
||||
payload={
|
||||
"agent_id": agent_id,
|
||||
"agent_type": info.agent_type,
|
||||
"error_count": info.error_count,
|
||||
"restart_count": info.restart_count,
|
||||
"last_activity": info.last_activity.isoformat() if info.last_activity else None
|
||||
},
|
||||
priority=MessagePriority.CRITICAL
|
||||
))
|
||||
|
||||
logger.error(f"Escalated agent failure: {agent_id}")
|
||||
|
||||
def _create_message_handler(self, agent_id: str):
|
||||
"""Creates a message handler that updates agent activity"""
|
||||
async def handler(message: AgentMessage):
|
||||
if agent_id in self.agents:
|
||||
self.agents[agent_id].last_activity = datetime.now(timezone.utc)
|
||||
# Heartbeat on activity
|
||||
self.heartbeat.beat(agent_id)
|
||||
return None
|
||||
return handler
|
||||
|
||||
async def _health_check_loop(self) -> None:
|
||||
"""Periodic health check loop"""
|
||||
while self._running:
|
||||
try:
|
||||
await asyncio.sleep(self.check_interval)
|
||||
await self._run_health_checks()
|
||||
except asyncio.CancelledError:
|
||||
break
|
||||
except Exception as e:
|
||||
logger.error(f"Error in health check loop: {e}")
|
||||
|
||||
async def _run_health_checks(self) -> None:
|
||||
"""Runs health checks on all agents"""
|
||||
now = datetime.now(timezone.utc)
|
||||
|
||||
for agent_id, info in list(self.agents.items()):
|
||||
if info.status != AgentStatus.RUNNING:
|
||||
continue
|
||||
|
||||
# Check for stale agents (no activity for 5 minutes)
|
||||
if info.last_activity:
|
||||
idle_time = now - info.last_activity
|
||||
if idle_time > timedelta(minutes=5):
|
||||
logger.warning(
|
||||
f"Agent {agent_id} has been idle for "
|
||||
f"{idle_time.total_seconds():.0f}s"
|
||||
)
|
||||
|
||||
# Load Balancing
|
||||
|
||||
def get_available_agent(
|
||||
self,
|
||||
agent_type: str,
|
||||
strategy: str = "least_loaded"
|
||||
) -> Optional[str]:
|
||||
"""
|
||||
Gets an available agent of the specified type.
|
||||
|
||||
Args:
|
||||
agent_type: Type of agent needed
|
||||
strategy: Load balancing strategy
|
||||
|
||||
Returns:
|
||||
Agent ID or None if none available
|
||||
"""
|
||||
candidates = [
|
||||
info for info in self.agents.values()
|
||||
if info.agent_type == agent_type and info.is_available()
|
||||
]
|
||||
|
||||
if not candidates:
|
||||
return None
|
||||
|
||||
if strategy == "least_loaded":
|
||||
# Pick agent with lowest load
|
||||
best = min(candidates, key=lambda a: a.utilization())
|
||||
elif strategy == "round_robin":
|
||||
# Simple round-robin (just pick first available)
|
||||
best = candidates[0]
|
||||
else:
|
||||
best = candidates[0]
|
||||
|
||||
return best.agent_id
|
||||
|
||||
def acquire_capacity(self, agent_id: str) -> bool:
|
||||
"""
|
||||
Acquires capacity from an agent.
|
||||
|
||||
Args:
|
||||
agent_id: The agent to acquire from
|
||||
|
||||
Returns:
|
||||
True if capacity was acquired
|
||||
"""
|
||||
if agent_id not in self.agents:
|
||||
return False
|
||||
|
||||
info = self.agents[agent_id]
|
||||
if not info.is_available():
|
||||
return False
|
||||
|
||||
info.current_load += 1
|
||||
return True
|
||||
|
||||
def release_capacity(self, agent_id: str) -> None:
|
||||
"""
|
||||
Releases capacity back to an agent.
|
||||
|
||||
Args:
|
||||
agent_id: The agent to release to
|
||||
"""
|
||||
if agent_id in self.agents:
|
||||
info = self.agents[agent_id]
|
||||
info.current_load = max(0, info.current_load - 1)
|
||||
|
||||
# Status and Metrics
|
||||
|
||||
def get_agent_status(self, agent_id: str) -> Optional[Dict[str, Any]]:
|
||||
"""Gets status information for an agent"""
|
||||
if agent_id not in self.agents:
|
||||
return None
|
||||
|
||||
info = self.agents[agent_id]
|
||||
return {
|
||||
"agent_id": info.agent_id,
|
||||
"agent_type": info.agent_type,
|
||||
"status": info.status.value,
|
||||
"current_task": info.current_task,
|
||||
"started_at": info.started_at.isoformat() if info.started_at else None,
|
||||
"last_activity": info.last_activity.isoformat() if info.last_activity else None,
|
||||
"error_count": info.error_count,
|
||||
"restart_count": info.restart_count,
|
||||
"utilization": info.utilization(),
|
||||
"is_healthy": info.is_healthy(),
|
||||
"is_available": info.is_available()
|
||||
}
|
||||
|
||||
def get_all_status(self) -> Dict[str, Dict[str, Any]]:
|
||||
"""Gets status for all agents"""
|
||||
return {
|
||||
agent_id: self.get_agent_status(agent_id)
|
||||
for agent_id in self.agents
|
||||
}
|
||||
|
||||
def get_metrics(self) -> Dict[str, Any]:
|
||||
"""Gets aggregate metrics"""
|
||||
total = len(self.agents)
|
||||
running = sum(
|
||||
1 for a in self.agents.values()
|
||||
if a.status == AgentStatus.RUNNING
|
||||
)
|
||||
healthy = sum(1 for a in self.agents.values() if a.is_healthy())
|
||||
available = sum(1 for a in self.agents.values() if a.is_available())
|
||||
|
||||
total_capacity = sum(a.capacity for a in self.agents.values())
|
||||
total_load = sum(a.current_load for a in self.agents.values())
|
||||
|
||||
return {
|
||||
"total_agents": total,
|
||||
"running_agents": running,
|
||||
"healthy_agents": healthy,
|
||||
"available_agents": available,
|
||||
"total_capacity": total_capacity,
|
||||
"total_load": total_load,
|
||||
"overall_utilization": total_load / max(total_capacity, 1),
|
||||
"by_type": self._metrics_by_type()
|
||||
}
|
||||
|
||||
def _metrics_by_type(self) -> Dict[str, Dict[str, int]]:
|
||||
"""Gets metrics grouped by agent type"""
|
||||
by_type: Dict[str, Dict[str, int]] = {}
|
||||
|
||||
for info in self.agents.values():
|
||||
if info.agent_type not in by_type:
|
||||
by_type[info.agent_type] = {
|
||||
"total": 0,
|
||||
"running": 0,
|
||||
"healthy": 0
|
||||
}
|
||||
|
||||
by_type[info.agent_type]["total"] += 1
|
||||
if info.status == AgentStatus.RUNNING:
|
||||
by_type[info.agent_type]["running"] += 1
|
||||
if info.is_healthy():
|
||||
by_type[info.agent_type]["healthy"] += 1
|
||||
|
||||
return by_type
|
||||
436
agent-core/orchestrator/task_router.py
Normal file
436
agent-core/orchestrator/task_router.py
Normal file
@@ -0,0 +1,436 @@
|
||||
"""
|
||||
Task Router for Intent-Based Routing
|
||||
|
||||
Provides:
|
||||
- Intent classification
|
||||
- Agent selection
|
||||
- Fallback handling
|
||||
- Routing metrics
|
||||
"""
|
||||
|
||||
from typing import Dict, Optional, List, Any, Callable, Awaitable
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timezone
|
||||
from enum import Enum
|
||||
import logging
|
||||
import re
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class RoutingStrategy(Enum):
|
||||
"""Routing strategies"""
|
||||
DIRECT = "direct" # Route to specific agent
|
||||
ROUND_ROBIN = "round_robin" # Distribute evenly
|
||||
LEAST_LOADED = "least_loaded" # Route to least loaded
|
||||
PRIORITY = "priority" # Route based on priority
|
||||
|
||||
|
||||
@dataclass
|
||||
class RoutingRule:
|
||||
"""A rule for routing tasks to agents"""
|
||||
intent_pattern: str
|
||||
agent_type: str
|
||||
priority: int = 0
|
||||
conditions: Dict[str, Any] = field(default_factory=dict)
|
||||
fallback_agent: Optional[str] = None
|
||||
|
||||
def matches(self, intent: str, context: Dict[str, Any]) -> bool:
|
||||
"""Check if this rule matches the intent and context"""
|
||||
# Check intent pattern (supports wildcards)
|
||||
pattern = self.intent_pattern.replace("*", ".*")
|
||||
if not re.match(f"^{pattern}$", intent, re.IGNORECASE):
|
||||
return False
|
||||
|
||||
# Check conditions
|
||||
for key, value in self.conditions.items():
|
||||
if context.get(key) != value:
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
|
||||
@dataclass
|
||||
class RoutingResult:
|
||||
"""Result of a routing decision"""
|
||||
success: bool
|
||||
agent_id: Optional[str] = None
|
||||
agent_type: Optional[str] = None
|
||||
is_fallback: bool = False
|
||||
reason: str = ""
|
||||
routing_time_ms: float = 0
|
||||
|
||||
|
||||
class TaskRouter:
|
||||
"""
|
||||
Routes tasks to appropriate agents based on intent.
|
||||
|
||||
Features:
|
||||
- Pattern-based routing rules
|
||||
- Priority ordering
|
||||
- Fallback chains
|
||||
- Routing metrics
|
||||
"""
|
||||
|
||||
def __init__(self, supervisor=None):
|
||||
"""
|
||||
Initialize the task router.
|
||||
|
||||
Args:
|
||||
supervisor: AgentSupervisor for agent availability
|
||||
"""
|
||||
self.supervisor = supervisor
|
||||
self.rules: List[RoutingRule] = []
|
||||
self._default_routes: Dict[str, str] = {}
|
||||
self._routing_history: List[Dict[str, Any]] = []
|
||||
self._max_history = 1000
|
||||
|
||||
# Initialize default rules
|
||||
self._setup_default_rules()
|
||||
|
||||
def _setup_default_rules(self) -> None:
|
||||
"""Sets up default routing rules"""
|
||||
default_rules = [
|
||||
# Learning support
|
||||
RoutingRule(
|
||||
intent_pattern="learning_*",
|
||||
agent_type="tutor-agent",
|
||||
priority=10,
|
||||
fallback_agent="orchestrator"
|
||||
),
|
||||
RoutingRule(
|
||||
intent_pattern="help_*",
|
||||
agent_type="tutor-agent",
|
||||
priority=5
|
||||
),
|
||||
RoutingRule(
|
||||
intent_pattern="explain_*",
|
||||
agent_type="tutor-agent",
|
||||
priority=5
|
||||
),
|
||||
|
||||
# Grading
|
||||
RoutingRule(
|
||||
intent_pattern="grade_*",
|
||||
agent_type="grader-agent",
|
||||
priority=10,
|
||||
fallback_agent="quality-judge"
|
||||
),
|
||||
RoutingRule(
|
||||
intent_pattern="evaluate_*",
|
||||
agent_type="grader-agent",
|
||||
priority=5
|
||||
),
|
||||
RoutingRule(
|
||||
intent_pattern="correct_*",
|
||||
agent_type="grader-agent",
|
||||
priority=5
|
||||
),
|
||||
|
||||
# Quality checks
|
||||
RoutingRule(
|
||||
intent_pattern="quality_*",
|
||||
agent_type="quality-judge",
|
||||
priority=10
|
||||
),
|
||||
RoutingRule(
|
||||
intent_pattern="review_*",
|
||||
agent_type="quality-judge",
|
||||
priority=5
|
||||
),
|
||||
|
||||
# Alerts
|
||||
RoutingRule(
|
||||
intent_pattern="alert_*",
|
||||
agent_type="alert-agent",
|
||||
priority=10
|
||||
),
|
||||
RoutingRule(
|
||||
intent_pattern="notify_*",
|
||||
agent_type="alert-agent",
|
||||
priority=5
|
||||
),
|
||||
|
||||
# System/Admin
|
||||
RoutingRule(
|
||||
intent_pattern="system_*",
|
||||
agent_type="orchestrator",
|
||||
priority=10
|
||||
),
|
||||
RoutingRule(
|
||||
intent_pattern="admin_*",
|
||||
agent_type="orchestrator",
|
||||
priority=10
|
||||
),
|
||||
]
|
||||
|
||||
for rule in default_rules:
|
||||
self.add_rule(rule)
|
||||
|
||||
def add_rule(self, rule: RoutingRule) -> None:
|
||||
"""
|
||||
Adds a routing rule.
|
||||
|
||||
Args:
|
||||
rule: The routing rule to add
|
||||
"""
|
||||
self.rules.append(rule)
|
||||
# Sort by priority (higher first)
|
||||
self.rules.sort(key=lambda r: r.priority, reverse=True)
|
||||
|
||||
def remove_rule(self, intent_pattern: str) -> bool:
|
||||
"""
|
||||
Removes a routing rule by pattern.
|
||||
|
||||
Args:
|
||||
intent_pattern: The pattern to remove
|
||||
|
||||
Returns:
|
||||
True if a rule was removed
|
||||
"""
|
||||
original_len = len(self.rules)
|
||||
self.rules = [r for r in self.rules if r.intent_pattern != intent_pattern]
|
||||
return len(self.rules) < original_len
|
||||
|
||||
def set_default_route(self, agent_type: str, agent_id: str) -> None:
|
||||
"""
|
||||
Sets a default agent for a type.
|
||||
|
||||
Args:
|
||||
agent_type: The agent type
|
||||
agent_id: The default agent ID
|
||||
"""
|
||||
self._default_routes[agent_type] = agent_id
|
||||
|
||||
async def route(
|
||||
self,
|
||||
intent: str,
|
||||
context: Optional[Dict[str, Any]] = None,
|
||||
strategy: RoutingStrategy = RoutingStrategy.LEAST_LOADED
|
||||
) -> RoutingResult:
|
||||
"""
|
||||
Routes a task based on intent.
|
||||
|
||||
Args:
|
||||
intent: The task intent
|
||||
context: Additional context for routing
|
||||
strategy: Load balancing strategy
|
||||
|
||||
Returns:
|
||||
RoutingResult with agent assignment
|
||||
"""
|
||||
start_time = datetime.now(timezone.utc)
|
||||
context = context or {}
|
||||
|
||||
# Find matching rule
|
||||
matching_rule = None
|
||||
for rule in self.rules:
|
||||
if rule.matches(intent, context):
|
||||
matching_rule = rule
|
||||
break
|
||||
|
||||
if not matching_rule:
|
||||
result = RoutingResult(
|
||||
success=False,
|
||||
reason=f"No routing rule matches intent: {intent}"
|
||||
)
|
||||
self._record_routing(intent, result)
|
||||
return result
|
||||
|
||||
# Get available agent
|
||||
agent_id = await self._get_agent(
|
||||
matching_rule.agent_type,
|
||||
strategy
|
||||
)
|
||||
|
||||
if agent_id:
|
||||
result = RoutingResult(
|
||||
success=True,
|
||||
agent_id=agent_id,
|
||||
agent_type=matching_rule.agent_type,
|
||||
is_fallback=False,
|
||||
reason="Primary agent selected"
|
||||
)
|
||||
elif matching_rule.fallback_agent:
|
||||
# Try fallback
|
||||
agent_id = await self._get_agent(
|
||||
matching_rule.fallback_agent,
|
||||
strategy
|
||||
)
|
||||
if agent_id:
|
||||
result = RoutingResult(
|
||||
success=True,
|
||||
agent_id=agent_id,
|
||||
agent_type=matching_rule.fallback_agent,
|
||||
is_fallback=True,
|
||||
reason="Fallback agent selected"
|
||||
)
|
||||
else:
|
||||
result = RoutingResult(
|
||||
success=False,
|
||||
reason="No agents available (primary or fallback)"
|
||||
)
|
||||
else:
|
||||
result = RoutingResult(
|
||||
success=False,
|
||||
reason=f"No {matching_rule.agent_type} agents available"
|
||||
)
|
||||
|
||||
# Calculate routing time
|
||||
end_time = datetime.now(timezone.utc)
|
||||
result.routing_time_ms = (end_time - start_time).total_seconds() * 1000
|
||||
|
||||
self._record_routing(intent, result)
|
||||
|
||||
return result
|
||||
|
||||
async def _get_agent(
|
||||
self,
|
||||
agent_type: str,
|
||||
strategy: RoutingStrategy
|
||||
) -> Optional[str]:
|
||||
"""Gets an available agent of the specified type"""
|
||||
# Check default route first
|
||||
if agent_type in self._default_routes:
|
||||
agent_id = self._default_routes[agent_type]
|
||||
if self.supervisor and self.supervisor.agents.get(agent_id):
|
||||
info = self.supervisor.agents[agent_id]
|
||||
if info.is_available():
|
||||
return agent_id
|
||||
|
||||
# Use supervisor for load balancing
|
||||
if self.supervisor:
|
||||
strategy_str = "least_loaded"
|
||||
if strategy == RoutingStrategy.ROUND_ROBIN:
|
||||
strategy_str = "round_robin"
|
||||
|
||||
return self.supervisor.get_available_agent(
|
||||
agent_type,
|
||||
strategy_str
|
||||
)
|
||||
|
||||
return None
|
||||
|
||||
def _record_routing(
|
||||
self,
|
||||
intent: str,
|
||||
result: RoutingResult
|
||||
) -> None:
|
||||
"""Records routing decision for analytics"""
|
||||
record = {
|
||||
"timestamp": datetime.now(timezone.utc).isoformat(),
|
||||
"intent": intent,
|
||||
"success": result.success,
|
||||
"agent_id": result.agent_id,
|
||||
"agent_type": result.agent_type,
|
||||
"is_fallback": result.is_fallback,
|
||||
"routing_time_ms": result.routing_time_ms,
|
||||
"reason": result.reason
|
||||
}
|
||||
|
||||
self._routing_history.append(record)
|
||||
|
||||
# Trim history
|
||||
if len(self._routing_history) > self._max_history:
|
||||
self._routing_history = self._routing_history[-self._max_history:]
|
||||
|
||||
# Log
|
||||
if result.success:
|
||||
logger.debug(
|
||||
f"Routed '{intent}' to {result.agent_id} "
|
||||
f"({'fallback' if result.is_fallback else 'primary'})"
|
||||
)
|
||||
else:
|
||||
logger.warning(f"Failed to route '{intent}': {result.reason}")
|
||||
|
||||
# Analytics
|
||||
|
||||
def get_routing_stats(self) -> Dict[str, Any]:
|
||||
"""Gets routing statistics"""
|
||||
if not self._routing_history:
|
||||
return {
|
||||
"total_routes": 0,
|
||||
"success_rate": 0,
|
||||
"fallback_rate": 0,
|
||||
"avg_routing_time_ms": 0
|
||||
}
|
||||
|
||||
total = len(self._routing_history)
|
||||
successful = sum(1 for r in self._routing_history if r["success"])
|
||||
fallbacks = sum(1 for r in self._routing_history if r["is_fallback"])
|
||||
avg_time = sum(
|
||||
r["routing_time_ms"] for r in self._routing_history
|
||||
) / total
|
||||
|
||||
return {
|
||||
"total_routes": total,
|
||||
"successful_routes": successful,
|
||||
"success_rate": successful / total,
|
||||
"fallback_routes": fallbacks,
|
||||
"fallback_rate": fallbacks / max(successful, 1),
|
||||
"avg_routing_time_ms": avg_time
|
||||
}
|
||||
|
||||
def get_intent_distribution(self) -> Dict[str, int]:
|
||||
"""Gets distribution of routed intents"""
|
||||
distribution: Dict[str, int] = {}
|
||||
for record in self._routing_history:
|
||||
intent = record["intent"]
|
||||
# Extract base intent (before _)
|
||||
base = intent.split("_")[0] if "_" in intent else intent
|
||||
distribution[base] = distribution.get(base, 0) + 1
|
||||
return distribution
|
||||
|
||||
def get_agent_distribution(self) -> Dict[str, int]:
|
||||
"""Gets distribution of routes by agent type"""
|
||||
distribution: Dict[str, int] = {}
|
||||
for record in self._routing_history:
|
||||
agent_type = record.get("agent_type", "unknown")
|
||||
if agent_type:
|
||||
distribution[agent_type] = distribution.get(agent_type, 0) + 1
|
||||
return distribution
|
||||
|
||||
def get_failure_reasons(self) -> Dict[str, int]:
|
||||
"""Gets distribution of routing failure reasons"""
|
||||
reasons: Dict[str, int] = {}
|
||||
for record in self._routing_history:
|
||||
if not record["success"]:
|
||||
reason = record["reason"]
|
||||
reasons[reason] = reasons.get(reason, 0) + 1
|
||||
return reasons
|
||||
|
||||
def clear_history(self) -> None:
|
||||
"""Clears routing history"""
|
||||
self._routing_history.clear()
|
||||
|
||||
# Rule inspection
|
||||
|
||||
def get_rules(self) -> List[Dict[str, Any]]:
|
||||
"""Gets all routing rules as dicts"""
|
||||
return [
|
||||
{
|
||||
"intent_pattern": r.intent_pattern,
|
||||
"agent_type": r.agent_type,
|
||||
"priority": r.priority,
|
||||
"conditions": r.conditions,
|
||||
"fallback_agent": r.fallback_agent
|
||||
}
|
||||
for r in self.rules
|
||||
]
|
||||
|
||||
def find_matching_rules(
|
||||
self,
|
||||
intent: str,
|
||||
context: Optional[Dict[str, Any]] = None
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""Finds all rules that match an intent"""
|
||||
context = context or {}
|
||||
return [
|
||||
{
|
||||
"intent_pattern": r.intent_pattern,
|
||||
"agent_type": r.agent_type,
|
||||
"priority": r.priority
|
||||
}
|
||||
for r in self.rules
|
||||
if r.matches(intent, context)
|
||||
]
|
||||
Reference in New Issue
Block a user