fix: Restore all files lost during destructive rebase

A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.

This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).

Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-02-09 09:51:32 +01:00
parent f7487ee240
commit bfdaf63ba9
2009 changed files with 749983 additions and 1731 deletions

View File

@@ -0,0 +1,36 @@
"""
Orchestration Layer for Breakpilot Agents
Provides:
- MessageBus: Inter-agent communication via Redis Pub/Sub
- AgentSupervisor: Agent lifecycle and health management
- TaskRouter: Intent-based task routing
"""
from agent_core.orchestrator.message_bus import (
MessageBus,
AgentMessage,
MessagePriority,
)
from agent_core.orchestrator.supervisor import (
AgentSupervisor,
AgentInfo,
AgentStatus,
)
from agent_core.orchestrator.task_router import (
TaskRouter,
RoutingResult,
RoutingStrategy,
)
# Public API of the orchestrator package: exactly the names re-exported from
# the message_bus, supervisor and task_router submodules imported above.
__all__ = [
    "MessageBus",
    "AgentMessage",
    "MessagePriority",
    "AgentSupervisor",
    "AgentInfo",
    "AgentStatus",
    "TaskRouter",
    "RoutingResult",
    "RoutingStrategy",
]

View File

@@ -0,0 +1,479 @@
"""
Message Bus for Inter-Agent Communication
Provides:
- Pub/Sub messaging via Redis/Valkey
- Request-Response pattern with timeouts
- Priority-based message handling
- Message persistence for audit
"""
from typing import Callable, Dict, Any, List, Optional, Awaitable
from dataclasses import dataclass, field
from datetime import datetime, timezone
from enum import Enum
import asyncio
import uuid
import json
import logging
logger = logging.getLogger(__name__)
class MessagePriority(Enum):
    """Message priority levels.

    Members carry increasing integer values, from LOW (0) up to CRITICAL (3).
    AgentMessage.to_dict() serializes a priority as this integer and
    AgentMessage.from_dict() rebuilds the member from it.
    """
    LOW = 0
    NORMAL = 1  # default priority of an AgentMessage
    HIGH = 2
    CRITICAL = 3
class MessageType(Enum):
    """Standard message types, as string constants.

    NOTE(review): AgentMessage.message_type is declared as a plain ``str`` and
    this enum is not referenced elsewhere in the visible module; callers
    presumably pass ``MessageType.X.value`` or free-form strings -- confirm.
    """
    REQUEST = "request"
    RESPONSE = "response"
    EVENT = "event"
    BROADCAST = "broadcast"
    HEARTBEAT = "heartbeat"
@dataclass
class AgentMessage:
    """A single message exchanged between two agents.

    ``correlation_id`` defaults to a fresh UUID4 string and ``timestamp`` to
    the current UTC time. ``to_dict`` / ``from_dict`` convert to and from a
    JSON-friendly dictionary (datetimes as ISO-8601 strings, priority as its
    integer value).
    """

    sender: str
    receiver: str
    message_type: str
    payload: Dict[str, Any]
    priority: MessagePriority = MessagePriority.NORMAL
    correlation_id: str = field(default_factory=lambda: str(uuid.uuid4()))
    timestamp: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
    reply_to: Optional[str] = None
    expires_at: Optional[datetime] = None
    metadata: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Return the message as a dictionary of JSON-serializable values."""
        expiry = self.expires_at.isoformat() if self.expires_at else None
        return dict(
            sender=self.sender,
            receiver=self.receiver,
            message_type=self.message_type,
            payload=self.payload,
            priority=self.priority.value,
            correlation_id=self.correlation_id,
            timestamp=self.timestamp.isoformat(),
            reply_to=self.reply_to,
            expires_at=expiry,
            metadata=self.metadata,
        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "AgentMessage":
        """Build a message from a dictionary shaped like ``to_dict()`` output.

        ``sender``, ``receiver``, ``message_type`` and ``payload`` are
        required (KeyError when absent); every other key falls back to the
        same default a freshly constructed message would get.
        """
        # Values are extracted in field order so that the first failing
        # lookup/conversion is the one reported.
        sender = data["sender"]
        receiver = data["receiver"]
        kind = data["message_type"]
        body = data["payload"]
        level = MessagePriority(data.get("priority", 1))
        correlation = data.get("correlation_id", str(uuid.uuid4()))

        if "timestamp" in data:
            sent_at = datetime.fromisoformat(data["timestamp"])
        else:
            sent_at = datetime.now(timezone.utc)

        reply_target = data.get("reply_to")

        if data.get("expires_at"):
            expiry = datetime.fromisoformat(data["expires_at"])
        else:
            expiry = None

        extra = data.get("metadata", {})

        return cls(
            sender=sender,
            receiver=receiver,
            message_type=kind,
            payload=body,
            priority=level,
            correlation_id=correlation,
            timestamp=sent_at,
            reply_to=reply_target,
            expires_at=expiry,
            metadata=extra,
        )
# Signature of a subscriber callback: an async callable that receives an
# AgentMessage and returns either a response payload dict or None.
MessageHandler = Callable[[AgentMessage], Awaitable[Optional[Dict[str, Any]]]]
class MessageBus:
    """
    Inter-agent communication via Redis Pub/Sub.

    Features:
    - Publish/Subscribe pattern
    - Request/Response with timeout
    - Message persistence for audit

    When no Redis client is supplied, messages are delivered in-process to
    the registered handlers (intended for tests). Message priority is carried
    on each message and persisted; this class does not reorder delivery by
    priority.
    """

    def __init__(
        self,
        redis_client=None,
        db_pool=None,
        namespace: str = "breakpilot",
        persist_messages: bool = True
    ):
        """
        Initialize the message bus.

        Args:
            redis_client: Async Redis/Valkey client (None = local delivery)
            db_pool: Async PostgreSQL pool for persistence (None = no persistence)
            namespace: Channel namespace
            persist_messages: Whether to persist messages for audit
        """
        self.redis = redis_client
        self.db_pool = db_pool
        self.namespace = namespace
        self.persist_messages = persist_messages
        # agent_id -> handlers invoked for messages addressed to that agent
        self._handlers: Dict[str, List[MessageHandler]] = {}
        # correlation_id -> future awaited by request()
        self._pending_responses: Dict[str, asyncio.Future] = {}
        # agent_id -> background Redis subscription task
        self._subscriptions: Dict[str, asyncio.Task] = {}
        self._running = False

    def _channel(self, agent_id: str) -> str:
        """Generate channel name for agent"""
        return f"{self.namespace}:agent:{agent_id}"

    def _broadcast_channel(self) -> str:
        """Generate broadcast channel name"""
        return f"{self.namespace}:broadcast"

    async def start(self) -> None:
        """Starts the message bus.

        Subscription loops only poll while the bus is running, so call this
        before subscribe() when a Redis client is configured.
        """
        self._running = True
        logger.info("Message bus started")

    async def stop(self) -> None:
        """Stops the message bus and all subscriptions"""
        self._running = False

        # Cancel all subscription tasks, then wait for them to finish
        for task in self._subscriptions.values():
            task.cancel()
        if self._subscriptions:
            await asyncio.gather(
                *self._subscriptions.values(),
                return_exceptions=True
            )
        self._subscriptions.clear()
        logger.info("Message bus stopped")

    async def subscribe(
        self,
        agent_id: str,
        handler: MessageHandler
    ) -> None:
        """
        Subscribe an agent to receive messages.

        With Redis configured, a second subscribe() for the same agent is
        ignored with a warning.

        Args:
            agent_id: The agent ID to subscribe
            handler: Async function to handle incoming messages
        """
        if agent_id in self._subscriptions:
            logger.warning(f"Agent {agent_id} already subscribed")
            return

        self._handlers.setdefault(agent_id, []).append(handler)

        if self.redis:
            # Start Redis subscription
            self._subscriptions[agent_id] = asyncio.create_task(
                self._subscription_loop(agent_id)
            )
        logger.info(f"Agent {agent_id} subscribed to message bus")

    async def unsubscribe(self, agent_id: str) -> None:
        """
        Unsubscribe an agent from messages.

        Args:
            agent_id: The agent ID to unsubscribe
        """
        task = self._subscriptions.pop(agent_id, None)
        if task is not None:
            task.cancel()
            try:
                await task
            except asyncio.CancelledError:
                pass

        self._handlers.pop(agent_id, None)
        logger.info(f"Agent {agent_id} unsubscribed from message bus")

    async def _subscription_loop(self, agent_id: str) -> None:
        """Main subscription loop for an agent.

        Listens on the agent's own channel and on the broadcast channel until
        the bus stops or the task is cancelled.
        """
        if not self.redis:
            return

        channel = self._channel(agent_id)
        broadcast = self._broadcast_channel()
        pubsub = self.redis.pubsub()
        await pubsub.subscribe(channel, broadcast)

        try:
            while self._running:
                message = await pubsub.get_message(
                    ignore_subscribe_messages=True,
                    timeout=1.0
                )
                if message and message["type"] == "message":
                    await self._handle_incoming_message(
                        agent_id,
                        message["data"]
                    )
        except asyncio.CancelledError:
            pass
        finally:
            # Always release the connection, even if unsubscribe fails.
            try:
                await pubsub.unsubscribe(channel, broadcast)
            finally:
                await pubsub.close()

    async def _handle_incoming_message(
        self,
        agent_id: str,
        raw_data: bytes
    ) -> None:
        """Process an incoming message.

        A reply to a pending request() resolves that request's future; any
        other message is passed to the agent's handlers. Errors are logged,
        never raised, so one bad message cannot kill the subscription loop.
        """
        try:
            message = AgentMessage.from_dict(json.loads(raw_data))

            # request() registers the correlation id *before* publishing, so
            # when requester and receiver share this bus the request itself
            # arrives here with a pending correlation id. A message that still
            # carries reply_to (and is not typed as a response) is the
            # request, not its answer, and must go to the handlers.
            pending = self._pending_responses.get(message.correlation_id)
            is_reply = (
                message.message_type == MessageType.RESPONSE.value
                or not message.reply_to
            )
            if pending is not None and is_reply:
                if not pending.done():
                    pending.set_result(message.payload)
                return

            for handler in self._handlers.get(agent_id, []):
                try:
                    response = await handler(message)
                    # A handler result of None means "no reply"; an empty
                    # dict is still a valid reply payload.
                    if response is not None and message.reply_to:
                        await self.publish(AgentMessage(
                            sender=agent_id,
                            receiver=message.sender,
                            message_type=MessageType.RESPONSE.value,
                            payload=response,
                            correlation_id=message.correlation_id
                        ))
                except Exception as e:
                    logger.error(
                        f"Error in message handler for {agent_id}: {e}"
                    )
        except Exception as e:
            logger.error(f"Error processing message: {e}")

    async def publish(self, message: AgentMessage) -> None:
        """
        Publishes a message to an agent.

        Args:
            message: The message to publish
        """
        # Persist message if enabled
        if self.persist_messages:
            await self._persist_message(message)

        if self.redis:
            await self.redis.publish(
                self._channel(message.receiver),
                json.dumps(message.to_dict())
            )
        else:
            # Local delivery for testing
            await self._local_deliver(message)

        logger.debug(
            f"Published message from {message.sender} to {message.receiver}: "
            f"{message.message_type}"
        )

    async def broadcast(self, message: AgentMessage) -> None:
        """
        Broadcasts a message to all agents.

        Note: the message's ``receiver`` is overwritten with "*".

        Args:
            message: The message to broadcast
        """
        message.receiver = "*"  # Indicate broadcast

        if self.persist_messages:
            await self._persist_message(message)

        if self.redis:
            await self.redis.publish(
                self._broadcast_channel(),
                json.dumps(message.to_dict())
            )
        else:
            # Local broadcast; iterate a snapshot so a handler may
            # subscribe/unsubscribe while being called.
            for agent_id in list(self._handlers):
                await self._local_deliver(message, agent_id)

        logger.debug(f"Broadcast message from {message.sender}: {message.message_type}")

    async def request(
        self,
        message: AgentMessage,
        timeout: float = 30.0
    ) -> Dict[str, Any]:
        """
        Sends a request and waits for a response.

        The message's ``reply_to`` is set to its sender.

        Args:
            message: The request message
            timeout: Timeout in seconds

        Returns:
            Response payload

        Raises:
            TimeoutError: If no response within timeout
        """
        # Mark this as a request that needs a response
        message.reply_to = message.sender

        # Create future for response
        future: asyncio.Future = asyncio.Future()
        self._pending_responses[message.correlation_id] = future

        try:
            await self.publish(message)
            return await asyncio.wait_for(future, timeout)
        except asyncio.TimeoutError:
            logger.warning(
                f"Request timeout: {message.sender} -> {message.receiver} "
                f"({message.message_type})"
            )
            raise
        finally:
            # Clean up
            self._pending_responses.pop(message.correlation_id, None)

    async def _local_deliver(
        self,
        message: AgentMessage,
        target_agent: Optional[str] = None
    ) -> None:
        """Local message delivery for testing without Redis.

        Calls the target agent's handlers directly; the first non-None
        handler result resolves a matching pending request().
        """
        agent_id = target_agent or message.receiver

        for handler in self._handlers.get(agent_id, []):
            try:
                response = await handler(message)
                if response is None or not message.reply_to:
                    continue
                future = self._pending_responses.get(message.correlation_id)
                if future is not None and not future.done():
                    future.set_result(response)
            except Exception as e:
                logger.error(f"Error in local handler: {e}")

    async def _persist_message(self, message: AgentMessage) -> None:
        """Persist message to PostgreSQL for audit.

        Best effort: a failure is logged as a warning and never blocks
        delivery. Does nothing when no db_pool is configured.
        """
        if not self.db_pool:
            return

        try:
            async with self.db_pool.acquire() as conn:
                await conn.execute(
                    """
                    INSERT INTO agent_messages
                    (id, sender, receiver, message_type, payload,
                    priority, correlation_id, created_at)
                    VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
                    """,
                    str(uuid.uuid4()),
                    message.sender,
                    message.receiver,
                    message.message_type,
                    json.dumps(message.payload),
                    message.priority.value,
                    message.correlation_id,
                    message.timestamp
                )
        except Exception as e:
            logger.warning(f"Failed to persist message: {e}")

    def on_message(self, message_type: str):
        """
        Decorator for message handlers.

        The decorated handler only runs for messages whose ``message_type``
        equals the given one; for all others it returns None.

        Usage:
            @bus.on_message("grade_request")
            async def handle_grade(message):
                return {"score": 85}
        """
        from functools import wraps  # local import keeps file-level imports untouched

        def decorator(func: MessageHandler):
            @wraps(func)  # preserve the handler's name/docstring
            async def wrapper(message: AgentMessage):
                if message.message_type == message_type:
                    return await func(message)
                return None
            return wrapper
        return decorator

    async def get_message_history(
        self,
        agent_id: Optional[str] = None,
        message_type: Optional[str] = None,
        limit: int = 100
    ) -> List[Dict[str, Any]]:
        """
        Gets message history from persistence, newest first.

        Args:
            agent_id: Filter by sender or receiver
            message_type: Filter by message type
            limit: Maximum results (negative values are treated as 0)

        Returns:
            List of message dicts (empty when no db_pool is configured)
        """
        if not self.db_pool:
            return []

        # Every value, including LIMIT, is passed as a bound parameter so no
        # caller-supplied text is ever spliced into the SQL string.
        conditions: List[str] = []
        params: List[Any] = []

        if agent_id:
            params.append(agent_id)
            slot = len(params)
            conditions.append(f"(sender = ${slot} OR receiver = ${slot})")

        if message_type:
            params.append(message_type)
            conditions.append(f"message_type = ${len(params)}")

        params.append(max(int(limit), 0))

        query = """
        SELECT sender, receiver, message_type, payload, priority,
        correlation_id, created_at
        FROM agent_messages
        """
        if conditions:
            query += " WHERE " + " AND ".join(conditions)
        query += f" ORDER BY created_at DESC LIMIT ${len(params)}"

        async with self.db_pool.acquire() as conn:
            rows = await conn.fetch(query, *params)

        return [dict(row) for row in rows]

    @property
    def connected(self) -> bool:
        """Returns whether the bus has a Redis client and is running"""
        return self.redis is not None and self._running

    @property
    def subscriber_count(self) -> int:
        """Returns number of agents with an active Redis subscription task"""
        return len(self._subscriptions)

View File

@@ -0,0 +1,553 @@
"""
Agent Supervisor for Breakpilot
Provides:
- Agent lifecycle management
- Health monitoring
- Restart policies
- Load balancing
"""
from typing import Dict, Optional, Callable, Awaitable, List, Any
from dataclasses import dataclass, field
from datetime import datetime, timezone, timedelta
from enum import Enum
import asyncio
import logging
from agent_core.sessions.heartbeat import HeartbeatMonitor
from agent_core.orchestrator.message_bus import (
MessageBus,
AgentMessage,
MessagePriority,
)
logger = logging.getLogger(__name__)
class AgentStatus(Enum):
    """Agent lifecycle states.

    AgentInfo starts in INITIALIZING; only RUNNING agents count as healthy
    or available (see AgentInfo.is_healthy / AgentInfo.is_available).
    """
    INITIALIZING = "initializing"
    STARTING = "starting"
    RUNNING = "running"
    PAUSED = "paused"
    STOPPING = "stopping"
    STOPPED = "stopped"
    ERROR = "error"
    RESTARTING = "restarting"
class RestartPolicy(Enum):
    """Agent restart policies, applied when an agent's heartbeat times out."""
    NEVER = "never"            # never restart; the failure is escalated
    ON_FAILURE = "on_failure"  # restart until max_restarts is reached, then escalate
    ALWAYS = "always"          # always attempt a restart
@dataclass
class AgentInfo:
    """Bookkeeping record the supervisor keeps for one registered agent."""

    agent_id: str
    agent_type: str
    status: AgentStatus = AgentStatus.INITIALIZING
    current_task: Optional[str] = None
    started_at: Optional[datetime] = None
    last_activity: Optional[datetime] = None
    error_count: int = 0
    restart_count: int = 0
    max_restarts: int = 3
    restart_policy: RestartPolicy = RestartPolicy.ON_FAILURE
    metadata: Dict[str, Any] = field(default_factory=dict)
    capacity: int = 10  # Max concurrent tasks
    current_load: int = 0

    def is_healthy(self) -> bool:
        """Return True for a running agent with fewer than three errors."""
        if self.status != AgentStatus.RUNNING:
            return False
        return self.error_count < 3

    def is_available(self) -> bool:
        """Return True when the agent is running and has spare capacity."""
        if self.status != AgentStatus.RUNNING:
            return False
        return self.current_load < self.capacity

    def utilization(self) -> float:
        """Return current load as a fraction of capacity.

        A capacity below 1 is treated as 1 to avoid division by zero.
        """
        effective_capacity = max(self.capacity, 1)
        return self.current_load / effective_capacity
# Async factory invoked by AgentSupervisor.start_agent() with the agent ID;
# its return value is not used by the supervisor.
AgentFactory = Callable[[str], Awaitable[Any]]
class AgentSupervisor:
    """
    Supervises and coordinates all agents.

    Responsibilities:
    - Agent registration and lifecycle
    - Health monitoring via heartbeat
    - Restart policies
    - Load balancing
    - Alert escalation
    """

    def __init__(
        self,
        message_bus: MessageBus,
        heartbeat_monitor: Optional[HeartbeatMonitor] = None,
        check_interval_seconds: int = 10
    ):
        """
        Initialize the supervisor.

        Args:
            message_bus: Message bus for inter-agent communication
            heartbeat_monitor: Heartbeat monitor for liveness checks
                (a default HeartbeatMonitor is created when omitted)
            check_interval_seconds: How often to run health checks
        """
        self.bus = message_bus
        self.heartbeat = heartbeat_monitor or HeartbeatMonitor()
        self.check_interval = check_interval_seconds
        self.agents: Dict[str, AgentInfo] = {}
        self._factories: Dict[str, AgentFactory] = {}
        self._running = False
        self._health_task: Optional[asyncio.Task] = None

        # Set up heartbeat timeout handler
        self.heartbeat.on_timeout = self._handle_agent_timeout

    async def start(self) -> None:
        """Starts the supervisor (heartbeat monitoring + health check loop)"""
        self._running = True
        await self.heartbeat.start_monitoring()

        # Start health check loop
        self._health_task = asyncio.create_task(self._health_check_loop())
        logger.info("Agent supervisor started")

    async def stop(self) -> None:
        """Stops the supervisor and all agents"""
        self._running = False

        # Stop health check
        if self._health_task:
            self._health_task.cancel()
            try:
                await self._health_task
            except asyncio.CancelledError:
                pass

        # Stop heartbeat monitor
        await self.heartbeat.stop_monitoring()

        # Stop all agents (snapshot of the keys: stop_agent mutates agent state)
        for agent_id in list(self.agents.keys()):
            await self.stop_agent(agent_id)
        logger.info("Agent supervisor stopped")

    def register_factory(
        self,
        agent_type: str,
        factory: AgentFactory
    ) -> None:
        """
        Registers a factory function for creating agents.

        Args:
            agent_type: Type of agent this factory creates
            factory: Async function that creates agent instances
        """
        self._factories[agent_type] = factory
        logger.debug(f"Registered factory for agent type: {agent_type}")

    async def register_agent(
        self,
        agent_id: str,
        agent_type: str,
        restart_policy: RestartPolicy = RestartPolicy.ON_FAILURE,
        max_restarts: int = 3,
        capacity: int = 10,
        metadata: Optional[Dict[str, Any]] = None
    ) -> AgentInfo:
        """
        Registers a new agent with the supervisor and the heartbeat monitor.

        Registering an already-known agent ID logs a warning and returns the
        existing record unchanged.

        Args:
            agent_id: Unique agent identifier
            agent_type: Type of agent
            restart_policy: When to restart the agent
            max_restarts: Maximum restart attempts
            capacity: Max concurrent tasks
            metadata: Additional agent metadata

        Returns:
            AgentInfo for the registered agent
        """
        if agent_id in self.agents:
            logger.warning(f"Agent {agent_id} already registered")
            return self.agents[agent_id]

        info = AgentInfo(
            agent_id=agent_id,
            agent_type=agent_type,
            restart_policy=restart_policy,
            max_restarts=max_restarts,
            capacity=capacity,
            metadata=metadata or {}
        )
        self.agents[agent_id] = info
        self.heartbeat.register(agent_id, agent_type)
        logger.info(f"Registered agent: {agent_id} ({agent_type})")
        return info

    async def start_agent(self, agent_id: str) -> bool:
        """
        Starts a registered agent.

        Runs the factory for the agent's type (if one is registered), marks
        the agent RUNNING and subscribes it to the message bus. An agent
        that was previously stopped is re-registered with the heartbeat
        monitor first.

        Args:
            agent_id: The agent to start

        Returns:
            True if started successfully (or already running)
        """
        info = self.agents.get(agent_id)
        if info is None:
            logger.error(f"Cannot start unregistered agent: {agent_id}")
            return False

        if info.status == AgentStatus.RUNNING:
            logger.warning(f"Agent {agent_id} is already running")
            return True

        # stop_agent() sets STOPPED only after it has unregistered the agent
        # from the heartbeat monitor, so a STOPPED agent must be registered
        # again or it would run without liveness monitoring after a restart.
        needs_heartbeat = info.status == AgentStatus.STOPPED

        info.status = AgentStatus.STARTING
        try:
            if needs_heartbeat:
                self.heartbeat.register(agent_id, info.agent_type)

            # If we have a factory, create the agent
            factory = self._factories.get(info.agent_type)
            if factory is not None:
                await factory(agent_id)

            info.status = AgentStatus.RUNNING
            info.started_at = datetime.now(timezone.utc)
            info.last_activity = info.started_at

            # Subscribe to message bus
            await self.bus.subscribe(
                agent_id,
                self._create_message_handler(agent_id)
            )
            logger.info(f"Started agent: {agent_id}")
            return True
        except Exception as e:
            info.status = AgentStatus.ERROR
            info.error_count += 1
            logger.error(f"Failed to start agent {agent_id}: {e}")
            return False

    async def stop_agent(self, agent_id: str) -> bool:
        """
        Stops a running agent.

        Unsubscribes the agent from the message bus and unregisters it from
        the heartbeat monitor; the agent record itself is kept.

        Args:
            agent_id: The agent to stop

        Returns:
            True if stopped successfully
        """
        info = self.agents.get(agent_id)
        if info is None:
            return False

        info.status = AgentStatus.STOPPING
        try:
            # Unsubscribe from message bus
            await self.bus.unsubscribe(agent_id)
            # Unregister from heartbeat
            self.heartbeat.unregister(agent_id)
            info.status = AgentStatus.STOPPED
            logger.info(f"Stopped agent: {agent_id}")
            return True
        except Exception as e:
            info.status = AgentStatus.ERROR
            logger.error(f"Error stopping agent {agent_id}: {e}")
            return False

    async def restart_agent(self, agent_id: str) -> bool:
        """
        Restarts an agent.

        Once ``restart_count`` has reached ``max_restarts`` no restart is
        attempted and the failure is escalated instead.

        Args:
            agent_id: The agent to restart

        Returns:
            True if restarted successfully
        """
        info = self.agents.get(agent_id)
        if info is None:
            return False

        # Check restart count
        if info.restart_count >= info.max_restarts:
            logger.error(
                f"Agent {agent_id} exceeded max restarts "
                f"({info.restart_count}/{info.max_restarts})"
            )
            await self._escalate_agent_failure(agent_id)
            return False

        info.status = AgentStatus.RESTARTING
        info.restart_count += 1
        logger.info(
            f"Restarting agent {agent_id} "
            f"(attempt {info.restart_count}/{info.max_restarts})"
        )

        # Stop and start
        await self.stop_agent(agent_id)
        await asyncio.sleep(1)  # Brief pause
        return await self.start_agent(agent_id)

    async def _handle_agent_timeout(
        self,
        session_id: str,
        agent_type: str
    ) -> None:
        """Handles agent heartbeat timeout by applying the restart policy"""
        # Find the agent by session/ID
        agent_id = session_id  # In our case, session_id == agent_id
        info = self.agents.get(agent_id)
        if info is None:
            return

        info.status = AgentStatus.ERROR
        info.error_count += 1
        logger.warning(f"Agent {agent_id} timed out (heartbeat)")

        # Apply restart policy
        policy = info.restart_policy
        if policy == RestartPolicy.NEVER:
            await self._escalate_agent_failure(agent_id)
        elif policy == RestartPolicy.ON_FAILURE:
            if info.restart_count < info.max_restarts:
                await self.restart_agent(agent_id)
            else:
                await self._escalate_agent_failure(agent_id)
        elif policy == RestartPolicy.ALWAYS:
            await self.restart_agent(agent_id)

    async def _escalate_agent_failure(self, agent_id: str) -> None:
        """Escalates an agent failure to the alert system.

        Publishes a CRITICAL "agent_failure" message to "alert-agent".
        """
        info = self.agents.get(agent_id)
        if not info:
            return

        last_seen = info.last_activity.isoformat() if info.last_activity else None
        await self.bus.publish(AgentMessage(
            sender="supervisor",
            receiver="alert-agent",
            message_type="agent_failure",
            payload={
                "agent_id": agent_id,
                "agent_type": info.agent_type,
                "error_count": info.error_count,
                "restart_count": info.restart_count,
                "last_activity": last_seen
            },
            priority=MessagePriority.CRITICAL
        ))
        logger.error(f"Escalated agent failure: {agent_id}")

    def _create_message_handler(self, agent_id: str):
        """Creates a message handler that updates agent activity.

        The handler never produces a reply (always returns None).
        """
        async def handler(message: AgentMessage):
            if agent_id in self.agents:
                self.agents[agent_id].last_activity = datetime.now(timezone.utc)
                # Heartbeat on activity
                self.heartbeat.beat(agent_id)
            return None
        return handler

    async def _health_check_loop(self) -> None:
        """Periodic health check loop; errors are logged and the loop continues"""
        while self._running:
            try:
                await asyncio.sleep(self.check_interval)
                await self._run_health_checks()
            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error(f"Error in health check loop: {e}")

    async def _run_health_checks(self) -> None:
        """Runs health checks on all agents.

        Currently this only logs a warning for running agents without
        activity for more than five minutes; it takes no corrective action.
        """
        now = datetime.now(timezone.utc)
        stale_after = timedelta(minutes=5)

        for agent_id, info in list(self.agents.items()):
            if info.status != AgentStatus.RUNNING or not info.last_activity:
                continue

            idle_time = now - info.last_activity
            if idle_time > stale_after:
                logger.warning(
                    f"Agent {agent_id} has been idle for "
                    f"{idle_time.total_seconds():.0f}s"
                )

    # Load Balancing
    def get_available_agent(
        self,
        agent_type: str,
        strategy: str = "least_loaded"
    ) -> Optional[str]:
        """
        Gets an available agent of the specified type.

        Args:
            agent_type: Type of agent needed
            strategy: Load balancing strategy. "least_loaded" picks the agent
                with the lowest utilization; every other value (including
                "round_robin") picks the first available agent.

        Returns:
            Agent ID or None if none available
        """
        candidates = [
            info for info in self.agents.values()
            if info.agent_type == agent_type and info.is_available()
        ]
        if not candidates:
            return None

        if strategy == "least_loaded":
            # Pick agent with lowest load
            best = min(candidates, key=lambda a: a.utilization())
        else:
            # NOTE: "round_robin" is not a true rotation; it takes the first
            # available agent in registration order.
            best = candidates[0]
        return best.agent_id

    def acquire_capacity(self, agent_id: str) -> bool:
        """
        Acquires capacity from an agent.

        Args:
            agent_id: The agent to acquire from

        Returns:
            True if capacity was acquired
        """
        info = self.agents.get(agent_id)
        if info is None or not info.is_available():
            return False
        info.current_load += 1
        return True

    def release_capacity(self, agent_id: str) -> None:
        """
        Releases capacity back to an agent (load never drops below zero).

        Args:
            agent_id: The agent to release to
        """
        info = self.agents.get(agent_id)
        if info is not None:
            info.current_load = max(0, info.current_load - 1)

    # Status and Metrics
    def get_agent_status(self, agent_id: str) -> Optional[Dict[str, Any]]:
        """Gets status information for an agent (None if unknown)"""
        info = self.agents.get(agent_id)
        if info is None:
            return None

        return {
            "agent_id": info.agent_id,
            "agent_type": info.agent_type,
            "status": info.status.value,
            "current_task": info.current_task,
            "started_at": info.started_at.isoformat() if info.started_at else None,
            "last_activity": info.last_activity.isoformat() if info.last_activity else None,
            "error_count": info.error_count,
            "restart_count": info.restart_count,
            "utilization": info.utilization(),
            "is_healthy": info.is_healthy(),
            "is_available": info.is_available()
        }

    def get_all_status(self) -> Dict[str, Dict[str, Any]]:
        """Gets status for all agents"""
        return {
            agent_id: self.get_agent_status(agent_id)
            for agent_id in self.agents
        }

    def get_metrics(self) -> Dict[str, Any]:
        """Gets aggregate metrics"""
        infos = list(self.agents.values())
        total_capacity = sum(a.capacity for a in infos)
        total_load = sum(a.current_load for a in infos)

        return {
            "total_agents": len(infos),
            "running_agents": sum(
                1 for a in infos if a.status == AgentStatus.RUNNING
            ),
            "healthy_agents": sum(1 for a in infos if a.is_healthy()),
            "available_agents": sum(1 for a in infos if a.is_available()),
            "total_capacity": total_capacity,
            "total_load": total_load,
            "overall_utilization": total_load / max(total_capacity, 1),
            "by_type": self._metrics_by_type()
        }

    def _metrics_by_type(self) -> Dict[str, Dict[str, int]]:
        """Gets metrics grouped by agent type"""
        by_type: Dict[str, Dict[str, int]] = {}
        for info in self.agents.values():
            counts = by_type.setdefault(
                info.agent_type,
                {"total": 0, "running": 0, "healthy": 0}
            )
            counts["total"] += 1
            if info.status == AgentStatus.RUNNING:
                counts["running"] += 1
            if info.is_healthy():
                counts["healthy"] += 1
        return by_type

View File

@@ -0,0 +1,436 @@
"""
Task Router for Intent-Based Routing
Provides:
- Intent classification
- Agent selection
- Fallback handling
- Routing metrics
"""
from typing import Dict, Optional, List, Any, Callable, Awaitable
from dataclasses import dataclass, field
from datetime import datetime, timezone
from enum import Enum
import logging
import re
logger = logging.getLogger(__name__)
class RoutingStrategy(Enum):
    """Routing strategies.

    NOTE(review): in the visible TaskRouter._get_agent only ROUND_ROBIN is
    mapped to its own supervisor strategy; every other member is handled as
    "least_loaded" -- confirm whether DIRECT and PRIORITY are meant to differ.
    """
    DIRECT = "direct"  # Route to specific agent
    ROUND_ROBIN = "round_robin"  # Distribute evenly
    LEAST_LOADED = "least_loaded"  # Route to least loaded
    PRIORITY = "priority"  # Route based on priority
@dataclass
class RoutingRule:
    """A rule for routing tasks to agents.

    ``intent_pattern`` is a literal intent name in which ``*`` matches any
    run of characters (e.g. ``"grade_*"``); matching is case-insensitive.
    ``conditions`` maps context keys to the exact values they must have.
    """
    intent_pattern: str
    agent_type: str
    priority: int = 0
    conditions: Dict[str, Any] = field(default_factory=dict)
    fallback_agent: Optional[str] = None

    def matches(self, intent: str, context: Dict[str, Any]) -> bool:
        """Check if this rule matches the intent and context.

        Args:
            intent: The intent string to test against ``intent_pattern``
            context: Routing context; every key in ``conditions`` must map to
                an equal value (a missing key is compared as None)

        Returns:
            True when both the pattern and all conditions match
        """
        # Only "*" is a wildcard. Every other character is escaped so that
        # regex metacharacters in a pattern (".", "+", "(", ...) are matched
        # literally instead of being interpreted or raising re.error.
        regex = ".*".join(
            re.escape(literal) for literal in self.intent_pattern.split("*")
        )
        if re.fullmatch(regex, intent, re.IGNORECASE) is None:
            return False

        # Check conditions
        return all(
            context.get(key) == expected
            for key, expected in self.conditions.items()
        )
@dataclass
class RoutingResult:
    """Result of a routing decision"""
    success: bool  # whether an agent was assigned
    agent_id: Optional[str] = None  # chosen agent, None on failure
    agent_type: Optional[str] = None  # type of the chosen agent, None on failure
    is_fallback: bool = False  # True when the rule's fallback agent type was used
    reason: str = ""  # human-readable explanation of the outcome
    routing_time_ms: float = 0  # filled in by TaskRouter.route()
class TaskRouter:
    """
    Routes tasks to appropriate agents based on intent.

    Features:
    - Pattern-based routing rules
    - Priority ordering
    - Fallback chains
    - Routing metrics
    """

    def __init__(self, supervisor=None):
        """
        Initialize the task router.

        Args:
            supervisor: AgentSupervisor for agent availability. Without one,
                no agent can be resolved and every route() call fails.
        """
        self.supervisor = supervisor
        self.rules: List[RoutingRule] = []
        self._default_routes: Dict[str, str] = {}
        self._routing_history: List[Dict[str, Any]] = []
        self._max_history = 1000

        # Initialize default rules
        self._setup_default_rules()

    def _setup_default_rules(self) -> None:
        """Sets up default routing rules"""
        # (intent_pattern, agent_type, priority, fallback_agent)
        defaults = (
            # Learning support
            ("learning_*", "tutor-agent", 10, "orchestrator"),
            ("help_*", "tutor-agent", 5, None),
            ("explain_*", "tutor-agent", 5, None),
            # Grading
            ("grade_*", "grader-agent", 10, "quality-judge"),
            ("evaluate_*", "grader-agent", 5, None),
            ("correct_*", "grader-agent", 5, None),
            # Quality checks
            ("quality_*", "quality-judge", 10, None),
            ("review_*", "quality-judge", 5, None),
            # Alerts
            ("alert_*", "alert-agent", 10, None),
            ("notify_*", "alert-agent", 5, None),
            # System/Admin
            ("system_*", "orchestrator", 10, None),
            ("admin_*", "orchestrator", 10, None),
        )
        for pattern, agent_type, priority, fallback in defaults:
            self.add_rule(RoutingRule(
                intent_pattern=pattern,
                agent_type=agent_type,
                priority=priority,
                fallback_agent=fallback
            ))

    def add_rule(self, rule: RoutingRule) -> None:
        """
        Adds a routing rule.

        Rules are kept sorted by descending priority; rules of equal
        priority keep their insertion order.

        Args:
            rule: The routing rule to add
        """
        self.rules.append(rule)
        # Sort by priority (higher first); list.sort is stable
        self.rules.sort(key=lambda r: r.priority, reverse=True)

    def remove_rule(self, intent_pattern: str) -> bool:
        """
        Removes every routing rule with the given pattern.

        Args:
            intent_pattern: The pattern to remove

        Returns:
            True if a rule was removed
        """
        original_len = len(self.rules)
        self.rules = [r for r in self.rules if r.intent_pattern != intent_pattern]
        return len(self.rules) < original_len

    def set_default_route(self, agent_type: str, agent_id: str) -> None:
        """
        Sets a default agent for a type.

        The default agent is preferred whenever it is available.

        Args:
            agent_type: The agent type
            agent_id: The default agent ID
        """
        self._default_routes[agent_type] = agent_id

    async def route(
        self,
        intent: str,
        context: Optional[Dict[str, Any]] = None,
        strategy: RoutingStrategy = RoutingStrategy.LEAST_LOADED
    ) -> RoutingResult:
        """
        Routes a task based on intent.

        The first (highest-priority) matching rule decides the agent type;
        its fallback agent type is tried when no primary agent is available.
        Every decision, successful or not, is recorded in the history.

        Args:
            intent: The task intent
            context: Additional context for routing
            strategy: Load balancing strategy

        Returns:
            RoutingResult with agent assignment
        """
        start_time = datetime.now(timezone.utc)
        context = context or {}

        # Find matching rule
        matching_rule = next(
            (rule for rule in self.rules if rule.matches(intent, context)),
            None
        )
        if matching_rule is None:
            result = RoutingResult(
                success=False,
                reason=f"No routing rule matches intent: {intent}"
            )
        else:
            result = await self._resolve_rule(matching_rule, strategy)

        # Record the routing time on every path, including "no rule matched"
        elapsed = datetime.now(timezone.utc) - start_time
        result.routing_time_ms = elapsed.total_seconds() * 1000
        self._record_routing(intent, result)
        return result

    async def _resolve_rule(
        self,
        rule: RoutingRule,
        strategy: RoutingStrategy
    ) -> RoutingResult:
        """Picks the primary agent for a rule, or its fallback, or fails"""
        agent_id = await self._get_agent(rule.agent_type, strategy)
        if agent_id:
            return RoutingResult(
                success=True,
                agent_id=agent_id,
                agent_type=rule.agent_type,
                is_fallback=False,
                reason="Primary agent selected"
            )

        if not rule.fallback_agent:
            return RoutingResult(
                success=False,
                reason=f"No {rule.agent_type} agents available"
            )

        # Try fallback
        agent_id = await self._get_agent(rule.fallback_agent, strategy)
        if agent_id:
            return RoutingResult(
                success=True,
                agent_id=agent_id,
                agent_type=rule.fallback_agent,
                is_fallback=True,
                reason="Fallback agent selected"
            )
        return RoutingResult(
            success=False,
            reason="No agents available (primary or fallback)"
        )

    async def _get_agent(
        self,
        agent_type: str,
        strategy: RoutingStrategy
    ) -> Optional[str]:
        """Gets an available agent of the specified type"""
        if not self.supervisor:
            return None

        # Check default route first
        default_id = self._default_routes.get(agent_type)
        if default_id is not None:
            info = self.supervisor.agents.get(default_id)
            if info and info.is_available():
                return default_id

        # Use supervisor for load balancing; only ROUND_ROBIN has its own
        # supervisor strategy, all other strategies use "least_loaded".
        strategy_str = (
            "round_robin"
            if strategy == RoutingStrategy.ROUND_ROBIN
            else "least_loaded"
        )
        return self.supervisor.get_available_agent(agent_type, strategy_str)

    def _record_routing(
        self,
        intent: str,
        result: RoutingResult
    ) -> None:
        """Records routing decision for analytics"""
        self._routing_history.append({
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "intent": intent,
            "success": result.success,
            "agent_id": result.agent_id,
            "agent_type": result.agent_type,
            "is_fallback": result.is_fallback,
            "routing_time_ms": result.routing_time_ms,
            "reason": result.reason
        })

        # Trim history to the most recent _max_history records
        if len(self._routing_history) > self._max_history:
            self._routing_history = self._routing_history[-self._max_history:]

        # Log
        if result.success:
            logger.debug(
                f"Routed '{intent}' to {result.agent_id} "
                f"({'fallback' if result.is_fallback else 'primary'})"
            )
        else:
            logger.warning(f"Failed to route '{intent}': {result.reason}")

    # Analytics
    def get_routing_stats(self) -> Dict[str, Any]:
        """Gets routing statistics.

        The returned dict always has the same keys; with an empty history
        every value is 0. ``fallback_rate`` is relative to successful routes.
        """
        total = len(self._routing_history)
        if not total:
            return {
                "total_routes": 0,
                "successful_routes": 0,
                "success_rate": 0,
                "fallback_routes": 0,
                "fallback_rate": 0,
                "avg_routing_time_ms": 0
            }

        successful = sum(1 for r in self._routing_history if r["success"])
        fallbacks = sum(1 for r in self._routing_history if r["is_fallback"])
        avg_time = sum(
            r["routing_time_ms"] for r in self._routing_history
        ) / total

        return {
            "total_routes": total,
            "successful_routes": successful,
            "success_rate": successful / total,
            "fallback_routes": fallbacks,
            "fallback_rate": fallbacks / max(successful, 1),
            "avg_routing_time_ms": avg_time
        }

    def get_intent_distribution(self) -> Dict[str, int]:
        """Gets distribution of routed intents, grouped by base intent"""
        distribution: Dict[str, int] = {}
        for record in self._routing_history:
            # Base intent = text before the first "_" (whole intent if none)
            base = record["intent"].split("_")[0]
            distribution[base] = distribution.get(base, 0) + 1
        return distribution

    def get_agent_distribution(self) -> Dict[str, int]:
        """Gets distribution of routes by agent type.

        Records without an agent type (failed routes) are not counted.
        """
        distribution: Dict[str, int] = {}
        for record in self._routing_history:
            agent_type = record.get("agent_type", "unknown")
            if agent_type:
                distribution[agent_type] = distribution.get(agent_type, 0) + 1
        return distribution

    def get_failure_reasons(self) -> Dict[str, int]:
        """Gets distribution of routing failure reasons"""
        reasons: Dict[str, int] = {}
        for record in self._routing_history:
            if record["success"]:
                continue
            reason = record["reason"]
            reasons[reason] = reasons.get(reason, 0) + 1
        return reasons

    def clear_history(self) -> None:
        """Clears routing history"""
        self._routing_history.clear()

    # Rule inspection
    def get_rules(self) -> List[Dict[str, Any]]:
        """Gets all routing rules as dicts, in evaluation order"""
        return [
            {
                "intent_pattern": r.intent_pattern,
                "agent_type": r.agent_type,
                "priority": r.priority,
                "conditions": r.conditions,
                "fallback_agent": r.fallback_agent
            }
            for r in self.rules
        ]

    def find_matching_rules(
        self,
        intent: str,
        context: Optional[Dict[str, Any]] = None
    ) -> List[Dict[str, Any]]:
        """Finds all rules that match an intent, in evaluation order"""
        context = context or {}
        return [
            {
                "intent_pattern": r.intent_pattern,
                "agent_type": r.agent_type,
                "priority": r.priority
            }
            for r in self.rules
            if r.matches(intent, context)
        ]